From fea0f430d41789c732e3c2f684e385a337d20cbf Mon Sep 17 00:00:00 2001 From: Planet-Lab Support Date: Fri, 21 Jan 2005 03:34:26 +0000 Subject: [PATCH] This commit was manufactured by cvs2svn to create tag 'after-bindmountpatch-merge'. --- Documentation/ckrm/block_io | 154 -- Documentation/ckrm/ckrm_basics | 66 - Documentation/ckrm/core_usage | 72 - Documentation/ckrm/crbce | 33 - Documentation/ckrm/installation | 70 - Documentation/ckrm/mem_rc.design | 134 -- Documentation/ckrm/mem_rc.usage | 72 - Documentation/ckrm/rbce_basics | 67 - Documentation/ckrm/rbce_usage | 98 - Makefile | 2 +- arch/i386/kernel/entry.S | 2 - arch/ppc/kernel/misc.S | 2 - ...kernel-2.6.8-i686-planetlab-desktop.config | 1750 ----------------- configs/kernel-2.6.8-i686-planetlab.config | 33 +- drivers/block/Makefile | 3 +- drivers/block/cfq-iosched.c | 1061 ++-------- drivers/block/ckrm-io.c | 163 +- drivers/block/ckrm-iostub.c | 4 +- drivers/block/elevator.c | 10 +- drivers/block/ll_rw_blk.c | 45 +- drivers/char/hangcheck-timer.c | 2 +- fs/exec.c | 13 - fs/ext2/acl.c | 4 - fs/ext2/inode.c | 2 +- fs/ext2/ioctl.c | 8 +- fs/ext3/acl.c | 4 - fs/ext3/inode.c | 2 +- fs/ext3/ioctl.c | 8 +- fs/ioctl.c | 13 - fs/namei.c | 29 +- fs/rcfs/dir.c | 2 +- fs/rcfs/magic.c | 67 +- fs/rcfs/rootdir.c | 2 +- fs/rcfs/socket_fs.c | 6 - fs/rcfs/super.c | 2 +- fs/rcfs/tc_magic.c | 13 +- fs/reiserfs/xattr.c | 4 - include/asm-i386/unistd.h | 4 +- include/asm-ppc/unistd.h | 4 +- include/asm-x86_64/unistd.h | 6 +- include/linux/ckrm-io.h | 7 +- include/linux/ckrm.h | 11 +- include/linux/ckrm_ce.h | 13 +- include/linux/ckrm_classqueue.h | 5 +- include/linux/ckrm_mem.h | 23 +- include/linux/ckrm_mem_inline.h | 110 +- include/linux/ckrm_rc.h | 8 +- include/linux/ckrm_sched.h | 482 +---- include/linux/crbce.h | 175 -- include/linux/elevator.h | 5 - include/linux/ext2_fs.h | 5 - include/linux/ext3_fs.h | 5 - include/linux/fs.h | 14 +- include/linux/init_task.h | 1 - include/linux/mm.h | 3 - include/linux/mm_inline.h | 7 - .../linux/netfilter_ipv4/ip_conntrack_pptp.h | 310 --- .../netfilter_ipv4/ip_conntrack_proto_gre.h | 123 -- include/linux/netfilter_ipv4/ip_nat_pptp.h | 11 - include/linux/page-flags.h | 1 - include/linux/rbce.h | 127 -- include/linux/rcfs.h | 1 - include/linux/sched.h | 152 +- include/linux/socket.h | 3 - include/linux/taskdelays.h | 4 +- include/linux/tcp.h | 1 + include/linux/vserver/inode.h | 7 - include/net/sock.h | 4 +- init/Kconfig | 74 +- init/main.c | 4 - kernel/Makefile | 5 +- kernel/ckrm/Makefile | 10 +- kernel/ckrm/ckrm.c | 43 +- kernel/ckrm/ckrm_cpu_class.c | 145 +- kernel/ckrm/ckrm_cpu_monitor.c | 828 ++------ kernel/ckrm/ckrm_laq.c | 495 ----- kernel/ckrm/ckrm_mem.c | 204 +- kernel/ckrm/ckrm_sockc.c | 14 +- kernel/ckrm/{ckrm_numtasks.c => ckrm_tasks.c} | 49 +- ...ckrm_numtasks_stub.c => ckrm_tasks_stub.c} | 0 kernel/ckrm/ckrm_tc.c | 97 +- kernel/ckrm/ckrmutils.c | 19 + kernel/ckrm/rbce/bitvector.h | 6 +- kernel/ckrm/rbce/info.h | 6 + kernel/ckrm/rbce/rbce_fs.c | 51 +- kernel/ckrm/rbce/rbcemod.c | 168 +- kernel/ckrm/rbce/rbcemod_ext.c | 35 +- kernel/ckrm/rbce/token.c | 25 +- kernel/ckrm_classqueue.c | 49 +- kernel/ckrm_sched.c | 213 +- kernel/exit.c | 10 +- kernel/exit.c.orig | 1192 +++++++++++ kernel/fork.c | 21 - kernel/itimer.c | 4 +- kernel/panic.c | 4 +- kernel/sched.c | 954 ++++----- kernel/signal.c | 22 +- kernel/vserver/inode.c | 31 - kernel/vserver/sysctl.c | 2 - mm/Makefile | 4 +- mm/memory.c | 5 +- mm/oom_panic.c | 51 - mm/page_alloc.c | 7 - mm/vmscan.c | 173 +- net/core/sock.c | 12 - 
net/ipv4/netfilter/ip_conntrack_core.c | 2 - net/ipv4/netfilter/ip_conntrack_pptp.c | 712 ------- net/ipv4/netfilter/ip_conntrack_pptp_priv.h | 24 - net/ipv4/netfilter/ip_conntrack_proto_gre.c | 349 ---- net/ipv4/netfilter/ip_conntrack_standalone.c | 4 +- net/ipv4/netfilter/ip_nat_pptp.c | 477 ----- net/ipv4/netfilter/ip_nat_proto_gre.c | 210 -- net/ipv4/tcp_ipv4.c | 2 +- net/packet/af_packet.c | 2 +- scripts/kernel-2.6-planetlab.spec | 33 +- 115 files changed, 2706 insertions(+), 9795 deletions(-) delete mode 100644 Documentation/ckrm/block_io delete mode 100644 Documentation/ckrm/ckrm_basics delete mode 100644 Documentation/ckrm/core_usage delete mode 100644 Documentation/ckrm/crbce delete mode 100644 Documentation/ckrm/installation delete mode 100644 Documentation/ckrm/mem_rc.design delete mode 100644 Documentation/ckrm/mem_rc.usage delete mode 100644 Documentation/ckrm/rbce_basics delete mode 100644 Documentation/ckrm/rbce_usage delete mode 100644 configs/kernel-2.6.8-i686-planetlab-desktop.config delete mode 100644 include/linux/crbce.h delete mode 100644 include/linux/netfilter_ipv4/ip_conntrack_pptp.h delete mode 100644 include/linux/netfilter_ipv4/ip_conntrack_proto_gre.h delete mode 100644 include/linux/netfilter_ipv4/ip_nat_pptp.h delete mode 100644 include/linux/rbce.h delete mode 100644 kernel/ckrm/ckrm_laq.c rename kernel/ckrm/{ckrm_numtasks.c => ckrm_tasks.c} (90%) rename kernel/ckrm/{ckrm_numtasks_stub.c => ckrm_tasks_stub.c} (100%) create mode 100644 kernel/exit.c.orig delete mode 100644 mm/oom_panic.c delete mode 100644 net/ipv4/netfilter/ip_conntrack_pptp.c delete mode 100644 net/ipv4/netfilter/ip_conntrack_pptp_priv.h delete mode 100644 net/ipv4/netfilter/ip_conntrack_proto_gre.c delete mode 100644 net/ipv4/netfilter/ip_nat_pptp.c delete mode 100644 net/ipv4/netfilter/ip_nat_proto_gre.c diff --git a/Documentation/ckrm/block_io b/Documentation/ckrm/block_io deleted file mode 100644 index e4a0b8b95..000000000 --- a/Documentation/ckrm/block_io +++ /dev/null @@ -1,154 +0,0 @@ -CKRM I/O controller - -Last updated: Sep 21, 2004 - - -Intro ------ - -CKRM's I/O scheduler is developed as a delta over a modified version of -the Complete Fair Queuing scheduler (CFQ) that implements I/O priorities. -The latter's original posting can be found at: - http://www.ussg.iu.edu/hypermail/linux/kernel/0311.1/0019.html - -Please note that this is not the CFQ version currently in the linus kernel -(2.6.8.1 at time of writing) which provides equal, not prioritized, -bandwidth allocation amongst processes. Since the CFQ in the kernel is likely -to eventually move towards I/O priority implementation, CKRM has not renamed -the underlying I/O scheduler and simply replaces drivers/block/cfq-iosched.c -with the modified version. - -Installation ------------- - -1. Configure "Disk I/O Resource Controller" under CKRM (see -Documentation/ckrm/installation) - -2. After booting into the new kernel, load ckrm-io - # modprobe ckrm-io - -3. Verify that reading /rcfs/taskclass/shares displays values for the -I/O controller (res=cki). - -4. Mount sysfs for monitoring bandwidth received (temporary solution till -a userlevel tool is developed) - # mount -t sysfs none /sys - - -Usage ------ - -For brevity, we assume we are in the /rcfs/taskclass directory for all the -code snippets below. - -Initially, the systemwide default class gets 100% of the I/O bandwidth. - - $ cat stats - - - 20 total ioprio - 20 unused/default ioprio - -The first value is the share of a class, as a parent. 
The second is the share -of its default subclass. Initially the two are equal. As named subclasses get -created and assigned shares, the default subclass' share (which equals the -"unused" portion of the parent's allocation) dwindles. - - -CFQ assigns one of 20 I/O priorities to all I/O requests. Each priority level -gets a fixed proportion of the total bandwidth in increments of 5%. e.g. - ioprio=1 gets 5%, - ioprio=2 gets 10%..... - all the way through ioprio=19 getting 95% - -ioprio=0 gets bandwidth only if no other priority level submits I/O i.e. it can -get starved. -ioprio=20 is considered realtime I/O and always gets priority. - -CKRM's I/O scheduler distributes these 20 priority levels amongst the hierarchy -of classes according to the relative share of each class. Thus, root starts out -with the total allocation of 20 initially. As children get created and shares -assigned to them, root's allocation reduces. At any time, the sum of absolute -share values of all classes equals 20. - - - -Class creation --------------- - - $ mkdir a - -Its initial share is zero. The parent's share values will be unchanged. Note -that even classes with zero share get unused bandwidth under CFQ. - -Setting a new class share -------------------------- - - $ echo "res=cki,guarantee=20" > /rcfs/taskclass/a/shares - Set cki shares to 20 -1 -1 -1 - - $ echo a/shares - - res=cki,guarantee=20,limit=100,total_guarantee=100,max_limit=100 - -The limit and max_limit fields can be ignored as they are not implemented. -The absolute share of a is 20% of parent's absolute total (20) and can be seen -through - $ echo a/stats - - - 4 total ioprio - 4 unused/default ioprio - -Since a gets 4, parent's default's share diminishes accordingly. Thus - - $ echo stats - - - 20 total ioprio - 16 unused/default ioprio - - -Monitoring ----------- - -Each priority level's request service rate can be viewed through sysfs (mounted -during installation). To view the servicing of priority 4's requests, - - $ while : ; echo /sys/block//queue/iosched/p4 ; sleep 1 ; done - rq (10,15) sec (20,30) q (40,50) - - - -where - rq = cumulative I/O requests received (10) and serviced (15) - sec = cumulative sectors requested (20) and served (30) - q = cumulative number of times the queue was created(40)/destroyed (50) - -The rate at which requests or sectors are serviced should differ for different -priority levels. The difference in received and serviced values indicates queue -depth - with insufficient depth, differentiation between I/O priority levels -will not be observed. - -The rate of q creation is not significant for CKRM. - - -Caveats -------- - -CFQ's I/O differentiation is still being worked upon so its better to choose -widely separated share values to observe differences in delivered I/O -bandwidth. - -CFQ, and consequently CKRM, does not provide limits yet. So it is not possible -to completely limit an I/O hog process by putting it in a class with a low I/O -share. Only if the competing classes maintain sufficient queue depth (i.e a -high I/O issue rate) will they get preferential treatment. However, they may -still see latency degradation due to seeks caused by servicing of the low -priority class. - -When limits are implemented, this behaviour will be rectified. - -Please post questions on the CKRM I/O scheduler on ckrm-tech@lists.sf.net. 
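To recap the share arithmetic used in the stats examples above: a class's absolute ioprio allocation is its parent's absolute total scaled by guarantee/total_guarantee. A minimal sketch with the same numbers as the example (the shell variables are illustrative only and are not part of the rcfs interface):

    # Root starts with 20 absolute ioprio units; a child with
    # guarantee=20 out of total_guarantee=100 therefore receives
    # 20 * 20 / 100 = 4 units, and the parent's unused/default
    # share drops from 20 to 16, as shown in the stats output.
    parent_total=20
    guarantee=20
    total_guarantee=100
    child=$(( parent_total * guarantee / total_guarantee ))
    echo "child=$child default=$(( parent_total - child ))"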
-
-
diff --git a/Documentation/ckrm/ckrm_basics b/Documentation/ckrm/ckrm_basics
deleted file mode 100644
index cfd9a9256..000000000
--- a/Documentation/ckrm/ckrm_basics
+++ /dev/null
@@ -1,66 +0,0 @@
-CKRM Basics
-------------
-A brief review of CKRM concepts and terminology will help make installation
-and testing easier. For more details, please visit http://ckrm.sf.net.
-
-Currently there are two class types, taskclass and socketclass, for grouping,
-regulating and monitoring tasks and sockets respectively.
-
-To avoid repeating instructions for each classtype, this document assumes a
-task to be the kernel object being grouped. By and large, one can replace task
-with socket and taskclass with socketclass.
-
-RCFS depicts a CKRM class as a directory. A hierarchy of classes can be
-created in which the children of a class share the resources allotted to
-the parent. Tasks can be classified to any class at any level.
-There is no correlation between the parent-child relationship of tasks and
-the parent-child relationship of the classes they belong to.
-
-Without a Classification Engine, a task inherits its class. A privileged
-user can reassign a task to a class as described below, after which all
-the child tasks under that task will be assigned to that class, unless the
-user reassigns any of them.
-
-A Classification Engine (CE), if one exists, will be used by CKRM to
-classify a task to a class. The rule-based classification engine uses some
-of the attributes of the task to classify it. When a CE is present,
-a class is not inherited by a task.
-
-Characteristics of a class can be accessed/changed through the following magic
-files under the directory representing the class:
-
-shares: allows changing the shares of the different resources managed by the
-        class
-stats: allows seeing the statistics associated with each resource managed
-        by the class
-target: allows assigning a task to a class. If a CE is present, assigning
-        a task to a class through this interface will prevent the CE from
-        reassigning the task to any class during reclassification.
-members: allows seeing which tasks have been assigned to a class
-config: allows viewing and modifying configuration information of the
-        different resources in a class.
-
-Resource allocations for a class are controlled by the parameters:
-
-guarantee: specifies how much of a resource is guaranteed to a class. The
-        special value DONT_CARE (-2) means that no specific guarantee is
-        specified; this class may not get any resources if the system is
-        running short of resources.
-limit: specifies the maximum amount of a resource that is allowed to be
-        allocated by a class. The special value DONT_CARE (-2) means that
-        no specific limit is specified; this class can get all the
-        resources available.
-total_guarantee: the total guarantee allowed among the children of this
-        class. In other words, the sum of the "guarantee"s of all children
-        of this class cannot exceed this number.
-max_limit: the maximum "limit" allowed for any of this class's children. In
-        other words, the "limit" of any child of this class cannot exceed
-        this value.
-
-None of these parameters are absolute or have any units associated with
-them. They are just numbers (relative to the parent's) that are used to
-calculate the absolute amount of a resource available to a specific
-class.
-
-Note: The root class has an absolute number of resource units associated with it.
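To make the relative nature of these numbers concrete, here is a minimal sketch of the conversion and of the children-sum constraint, consistent with the share arithmetic shown in the block_io and memory controller documents; the 200-unit figure and the class names are purely hypothetical:

    # absolute share of a child = parent's absolute units * guarantee / total_guarantee
    parent_units=200          # hypothetical absolute units owned by the parent class
    total_guarantee=100       # parent's total_guarantee
    c1_guarantee=60
    c2_guarantee=25
    echo "c1 gets $(( parent_units * c1_guarantee / total_guarantee )) units"   # 120
    echo "c2 gets $(( parent_units * c2_guarantee / total_guarantee )) units"   # 50
    # The sum of the children's guarantees (60 + 25 = 85) must stay <= the
    # parent's total_guarantee (100); the remainder stays with the parent's
    # default subclass.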
- diff --git a/Documentation/ckrm/core_usage b/Documentation/ckrm/core_usage deleted file mode 100644 index 6b5d808c3..000000000 --- a/Documentation/ckrm/core_usage +++ /dev/null @@ -1,72 +0,0 @@ -Usage of CKRM without a classification engine ------------------------------------------------ - -1. Create a class - - # mkdir /rcfs/taskclass/c1 - creates a taskclass named c1 , while - # mkdir /rcfs/socket_class/s1 - creates a socketclass named s1 - -The newly created class directory is automatically populated by magic files -shares, stats, members, target and config. - -2. View default shares - - # cat /rcfs/taskclass/c1/shares - - "guarantee=-2,limit=-2,total_guarantee=100,max_limit=100" is the default - value set for resources that have controllers registered with CKRM. - -3. change shares of a - - One or more of the following fields can/must be specified - res= #mandatory - guarantee= - limit= - total_guarantee= - max_limit= - e.g. - # echo "res=numtasks,limit=20" > /rcfs/taskclass/c1 - - If any of these parameters are not specified, the current value will be - retained. - -4. Reclassify a task (listening socket) - - write the pid of the process to the destination class' target file - # echo 1004 > /rcfs/taskclass/c1/target - - write the "\" string to the destination class' target file - # echo "0.0.0.0\32770" > /rcfs/taskclass/c1/target - -5. Get a list of tasks (sockets) assigned to a taskclass (socketclass) - - # cat /rcfs/taskclass/c1/members - lists pids of tasks belonging to c1 - - # cat /rcfs/socket_class/s1/members - lists the ipaddress\port of all listening sockets in s1 - -6. Get the statictics of different resources of a class - - # cat /rcfs/tasksclass/c1/stats - shows c1's statistics for each resource with a registered resource - controller. - - # cat /rcfs/socket_class/s1/stats - show's s1's stats for the listenaq controller. - -7. View the configuration values of the resources associated with a class - - # cat /rcfs/taskclass/c1/config - shows per-controller config values for c1. - -8. Change the configuration values of resources associated with a class - Configuration values are different for different resources. the comman - field "res=" must always be specified. - - # echo "res=numtasks,parameter=value" > /rcfs/taskclass/c1/config - to change (without any effect), the value associated with . - - diff --git a/Documentation/ckrm/crbce b/Documentation/ckrm/crbce deleted file mode 100644 index dfb4b1e96..000000000 --- a/Documentation/ckrm/crbce +++ /dev/null @@ -1,33 +0,0 @@ -CRBCE ----------- - -crbce is a superset of rbce. In addition to providing automatic -classification, the crbce module -- monitors per-process delay data that is collected by the delay -accounting patch -- collects data on significant kernel events where reclassification -could occur e.g. fork/exec/setuid/setgid etc., and -- uses relayfs to supply both these datapoints to userspace - -To illustrate the utility of the data gathered by crbce, we provide a -userspace daemon called crbcedmn that prints the header info received -from the records sent by the crbce module. - -0. Ensure that a CKRM-enabled kernel with following options configured - has been compiled. At a minimum, core, rcfs, atleast one classtype, - delay-accounting patch and relayfs. For testing, it is recommended - all classtypes and resource controllers be compiled as modules. - -1. Ensure that the Makefile's BUILD_CRBCE=1 and KDIR points to the - kernel of step 1 and call make. - This also builds the userspace daemon, crbcedmn. 
- -2..9 Same as rbce installation and testing instructions, - except replacing rbce.ko with crbce.ko - -10. Read the pseudo daemon help file - # ./crbcedmn -h - -11. Run the crbcedmn to display all records being processed - # ./crbcedmn - diff --git a/Documentation/ckrm/installation b/Documentation/ckrm/installation deleted file mode 100644 index 0c9033891..000000000 --- a/Documentation/ckrm/installation +++ /dev/null @@ -1,70 +0,0 @@ -Kernel installation ------------------------------- - - = version of mainline Linux kernel - = version of CKRM - -Note: It is expected that CKRM versions will change fairly rapidly. Hence once -a CKRM version has been released for some , it will only be made -available for future 's until the next CKRM version is released. - -1. Patch - - Apply ckrm/kernel//ckrm-.patch to a mainline kernel - tree with version . - - If CRBCE will be used, additionally apply the following patches, in order: - delayacctg-.patch - relayfs-.patch - - -2. Configure - -Select appropriate configuration options: - -a. for taskclasses - - General Setup-->Class Based Kernel Resource Management - - [*] Class Based Kernel Resource Management - Resource Class File System (User API) - [*] Class Manager for Task Groups - Number of Tasks Resource Manager - -b. To test socket_classes and multiple accept queue controller - - General Setup-->Class Based Kernel Resource Management - [*] Class Based Kernel Resource Management - Resource Class File System (User API) - [*] Class Manager for socket groups - Multiple Accept Queues Resource Manager - - Device Drivers-->Networking Support-->Networking options--> - [*] Network packet filtering (replaces ipchains) - [*] IP: TCP Multiple accept queues support - -c. To test CRBCE later (requires 2a.) - - File Systems-->Pseudo filesystems--> - Relayfs filesystem support - (enable all sub fields) - - General Setup--> - [*] Enable delay accounting - - -3. Build, boot into kernel - -4. Enable rcfs - - # insmod /fs/rcfs/rcfs.ko - # mount -t rcfs rcfs /rcfs - - This will create the directories /rcfs/taskclass and - /rcfs/socketclass which are the "roots" of subtrees for creating - taskclasses and socketclasses respectively. - -5. Load numtasks and listenaq controllers - - # insmod /kernel/ckrm/ckrm_tasks.ko - # insmod /kernel/ckrm/ckrm_listenaq.ko diff --git a/Documentation/ckrm/mem_rc.design b/Documentation/ckrm/mem_rc.design deleted file mode 100644 index bc565c6a0..000000000 --- a/Documentation/ckrm/mem_rc.design +++ /dev/null @@ -1,134 +0,0 @@ -0. Lifecycle of a LRU Page: ----------------------------- -These are the events in a page's lifecycle: - - allocation of the page - there are multiple high level page alloc functions; __alloc_pages() - is the lowest level function that does the real allocation. - - get into LRU list (active list or inactive list) - - get out of LRU list - - freeing the page - there are multiple high level page free functions; free_pages_bulk() - is the lowest level function that does the real free. - -When the memory subsystem runs low on LRU pages, pages are reclaimed by - - moving pages from active list to inactive list (refill_inactive_zone()) - - freeing pages from the inactive list (shrink_zone) -depending on the recent usage of the page(approximately). - -1. Introduction ---------------- -Memory resource controller controls the number of lru physical pages -(active and inactive list) a class uses. 
It does not restrict any -other physical pages (slabs etc.,) - -For simplicity, this document will always refer lru physical pages as -physical pages or simply pages. - -There are two parameters(that are set by the user) that affect the number -of pages a class is allowed to have in active/inactive list. -They are - - guarantee - specifies the number of pages a class is - guaranteed to get. In other words, if a class is using less than - 'guarantee' number of pages, its pages will not be freed when the - memory subsystem tries to free some pages. - - limit - specifies the maximum number of pages a class can get; - 'limit' in essence can be considered as the 'hard limit' - -Rest of this document details how these two parameters are used in the -memory allocation logic. - -Note that the numbers that are specified in the shares file, doesn't -directly correspond to the number of pages. But, the user can make -it so by making the total_guarantee and max_limit of the default class -(/rcfs/taskclass) to be the total number of pages(given in config file) -available in the system. - - for example: - # cd /rcfs/taskclass - # cat config - res=mem;tot_pages=239778,active=60473,inactive=135285,free=44555 - # cat shares - res=mem,guarantee=-2,limit=-2,total_guarantee=100,max_limit=100 - - "tot_pages=239778" above mean there are 239778 lru pages in - the system. - - By making total_guarantee and max_limit to be same as this number at - this level (/rcfs/taskclass), one can make guarantee and limit in all - classes refer to the number of pages. - - # echo 'res=mem,total_guarantee=239778,max_limit=239778' > shares - # cat shares - res=mem,guarantee=-2,limit=-2,total_guarantee=239778,max_limit=239778 - - -The number of pages a class can use be anywhere between its guarantee and -limit. CKRM memory controller springs into action when the system needs -to choose a victim page to swap out. While the number of pages a class can -have allocated may be anywhere between its guarantee and limit, victim -pages will be choosen from classes that are above their guarantee. - -Pages will be freed from classes that are close to their "limit" before -freeing pages from the classes that are close to their guarantee. Pages -belonging to classes that are below their guarantee will not be chosen as -a victim. - -2. Core Design --------------------------- - -CKRM memory resource controller taps at appropriate low level memory -management functions to associate a page with a class and to charge -a class that brings the page to the LRU list. - -2.1 Changes in page allocation function(__alloc_pages()) --------------------------------------------------------- -- If the class that the current task belong to is over 110% of its 'limit', - allocation of page(s) fail. -- After succesful allocation of a page, the page is attached with the class - to which the current task belongs to. -- Note that the class is _not_ charged for the page(s) here. - -2.2 Changes in page free(free_pages_bulk()) -------------------------------------------- -- page is freed from the class it belongs to. - -2.3 Adding/Deleting page to active/inactive list -------------------------------------------------- -When a page is added to the active or inactive list, the class that the -page belongs to is charged for the page usage. - -When a page is deleted from the active or inactive list, the class that the -page belongs to is credited back. 
- -If a class uses upto its limit, attempt is made to shrink the class's usage -to 90% of its limit, in order to help the class stay within its limit. -But, if the class is aggressive, and keep getting over the class's limit -often(more than 10 shrink events in 10 seconds), then the memory resource -controller gives up on the class and doesn't try to shrink the class, which -will eventually lead the class to reach its 110% of its limit and then the -page allocations will start failing. - -2.4 Chages in the page reclaimation path (refill_inactive_zone and shrink_zone) -------------------------------------------------------------------------------- -Pages will be moved from active to inactive list(refill_inactive_zone) and -pages from inactive list will be freed in the following order: -(range is calculated by subtracting 'guarantee' from 'limit') - - Classes that are over 110% of their range - - Classes that are over 100% of their range - - Classes that are over 75% of their range - - Classes that are over 50% of their range - - Classes that are over 25% of their range - - Classes whose parent is over 110% of its range - - Classes that are over their guarantee - -2.5 Handling of Shared pages ----------------------------- -Even if a mm is shared by tasks, the pages that belong to the mm will be -charged against the individual tasks that bring the page into LRU. - -But, when any task that is using a mm moves to a different class or exits, -then all pages that belong to the mm will be charged against the richest -class among the tasks that are using the mm. - -Note: Shared page handling need to be improved with a better policy. - diff --git a/Documentation/ckrm/mem_rc.usage b/Documentation/ckrm/mem_rc.usage deleted file mode 100644 index faddbf84e..000000000 --- a/Documentation/ckrm/mem_rc.usage +++ /dev/null @@ -1,72 +0,0 @@ -Installation ------------- - -1. Configure "Class based physical memory controller" under CKRM (see - Documentation/ckrm/installation) - -2. Reboot the system with the new kernel. - -3. Verify that the memory controller is present by reading the file - /rcfs/taskclass/config (should show a line with res=mem) - -Usage ------ - -For brevity, unless otherwise specified all the following commands are -executed in the default class (/rcfs/taskclass). - -Initially, the systemwide default class gets 100% of the LRU pages, and the -config file displays the total number of physical pages. - - # cd /rcfs/taskclass - # cat config - res=mem;tot_pages=239778,active=60473,inactive=135285,free=44555 - # cat shares - res=mem,guarantee=-2,limit=-2,total_guarantee=100,max_limit=100 - - tot_pages - total number of pages - active - number of pages in the active list ( sum of all zones) - inactive - number of pages in the inactive list ( sum of all zones ) - free - number of free pages (sum of all pages) - - By making total_guarantee and max_limit to be same as tot_pages, one make - make the numbers in shares file be same as the number of pages for a - class. - - # echo 'res=mem,total_guarantee=239778,max_limit=239778' > shares - # cat shares - res=mem,guarantee=-2,limit=-2,total_guarantee=239778,max_limit=239778 - - -Class creation --------------- - - # mkdir c1 - -Its initial share is don't care. The parent's share values will be unchanged. 
- -Setting a new class share -------------------------- - - # echo 'res=mem,guarantee=25000,limit=50000' > c1/shares - - # cat c1/shares - res=mem,guarantee=25000,limit=50000,total_guarantee=100,max_limit=100 - - 'guarantee' specifies the number of pages this class entitled to get - 'limit' is the maximum number of pages this class can get. - -Monitoring ----------- - -stats file shows statistics of the page usage of a class - # cat stats - ----------- Memory Resource stats start ----------- - Number of pages used(including pages lent to children): 196654 - Number of pages guaranteed: 239778 - Maximum limit of pages: 239778 - Total number of pages available(after serving guarantees to children): 214778 - Number of pages lent to children: 0 - Number of pages borrowed from the parent: 0 - ----------- Memory Resource stats end ----------- - diff --git a/Documentation/ckrm/rbce_basics b/Documentation/ckrm/rbce_basics deleted file mode 100644 index fd66ef2fb..000000000 --- a/Documentation/ckrm/rbce_basics +++ /dev/null @@ -1,67 +0,0 @@ -Rule-based Classification Engine (RBCE) -------------------------------------------- - -The ckrm/rbce directory contains the sources for two classification engines -called rbce and crbce. Both are optional, built as kernel modules and share much -of their codebase. Only one classification engine (CE) can be loaded at a time -in CKRM. - - -With RBCE, user can specify rules for how tasks are classified to a -class. Rules are specified by one or more attribute-value pairs and -an associated class. The tasks that match all the attr-value pairs -will get classified to the class attached with the rule. - -The file rbce_info under /rcfs/ce directory details the functionality -of different files available under the directory and also details -about attributes that can are used to define rules. - -order: When multiple rules are defined the rules are executed - according to the order of a rule. Order can be specified - while defining a rule. If order is not specified, the - highest order will be assigned to the rule(i.e, the new - rule will be executed after all the previously defined - evaluate false). So, order of rules is important as that - will decide, which class a task will get assigned to. For - example, if we have the two following rules: r1: - uid=1004,order=10,class=/rcfs/taskclass/c1 r2: - uid=1004,cmd=grep,order=20,class=/rcfs/taskclass/c2 then, - the task "grep" executed by user 1004 will always be - assigned to class /rcfs/taskclass/c1, as rule r1 will be - executed before r2 and the task successfully matched the - rule's attr-value pairs. Rule r2 will never be consulted - for the command. Note: The order in which the rules are - displayed(by ls) has no correlation with the order of the - rule. - -dependency: Rules can be defined to be depend on another rule. i.e a - rule can be dependent on one rule and has its own - additional attr-value pairs. the dependent rule will - evaluate true only if all the attr-value pairs of both - rules are satisfied. ex: r1: gid=502,class=/rcfs/taskclass - r2: depend=r1,cmd=grep,class=rcfstaskclass/c1 r2 is a - dependent rule that depends on r1, a task will be assigned - to /rcfs/taskclass/c1 if its gid is 502 and the executable - command name is "grep". If a task's gid is 502 but the - command name is _not_ "grep" then it will be assigned to - /rcfs/taskclass - - Note: The order of dependent rule must be _lesser_ than the - rule it depends on, so that it is evaluated _before the - base rule is evaluated. 
Otherwise the base rule will - evaluate true and the task will be assigned to the class of - that rule without the dependent rule ever getting - evaluated. In the example above, order of r2 must be lesser - than order of r1. - -app_tag: a task can be attached with a tag(ascii string), that becomes - an attribute of that task and rules can be defined with the - tag value. - -state: states are at two levels in RBCE. The entire RBCE can be - enabled or disabled which writing 1 or 0 to the file - rbce_state under /rcfs/ce. Disabling RBCE, would mean that - the rules defined in RBCE will not be utilized for - classifying a task to a class. A specific rule can be - enabled/disabled by changing the state of that rule. Once - it is disabled, the rule will not be evaluated. diff --git a/Documentation/ckrm/rbce_usage b/Documentation/ckrm/rbce_usage deleted file mode 100644 index 6d1592646..000000000 --- a/Documentation/ckrm/rbce_usage +++ /dev/null @@ -1,98 +0,0 @@ -Usage of CKRM with RBCE --------------------------- - -0. Ensure that a CKRM-enabled kernel with following options configured - has been compiled. At a minimum, core, rcfs and atleast one - classtype. For testing, it is recommended all classtypes and - resource controllers be compiled as modules. - -1. Change ckrm/rbce/Makefile's KDIR to point to this compiled kernel's source - tree and call make - -2. Load rbce module. - # insmod ckrm/rbce/rbce.ko - Note that /rcfs has to be mounted before this. - Note: this command should populate the directory /rcfs/ce with files - rbce_reclassify, rbce_tag, rbce_info, rbce_state and a directory - rules. - - Note2: If these are not created automatically, just create them by - using the commands touch and mkdir.(bug that needs to be fixed) - -3. Defining a rule - Rules are defined by creating(by writing) to a file under the - /rcfs/ce/rules directory by concatinating multiple attribute value - pairs. - - Note that the classes must be defined before defining rules that - uses the classes. eg: the command # echo - "uid=1004,class=/rcfs/taskclass/c1" > /rcfs/ce/rules/r1 will define - a rule r1 that classifies all tasks belong to user id 1004 to class - /rcfs/taskclass/c1 - -4. Viewing a rule - read the corresponding file. - to read rule r1, issue the command: - # cat /rcfs/ce/rules/r1 - -5. Changing a rule - - Changing a rule is done the same way as defining a rule, the new - rule will include the old set of attr-value pairs slapped with new - attr-value pairs. eg: if the current r2 is - uid=1004,depend=r1,class=/rcfs/taskclass/c1 - (r1 as defined in step 3) - - the command: - # echo gid=502 > /rcfs/ce/rules/r1 - will change the rule to - r1: uid=1004,gid=502,depend=r1,class=/rcfs/taskclass/c1 - - the command: - # echo uid=1005 > /rcfs/ce/rules/r1 - will change the rule to - r1: uid=1005,class=/rcfs/taskclass/c1 - - the command: - # echo class=/rcfs/taskclass/c2 > /rcfs/ce/rules/r1 - will change the rule to - r1: uid=1004,depend=r1,class=/rcfs/taskclass/c2 - - the command: - # echo depend=r4 > /rcfs/ce/rules/r1 - will change the rule to - r1: uid=1004,depend=r4,class=/rcfs/taskclass/c2 - - the command: - # echo +depend=r4 > /rcfs/ce/rules/r1 - will change the rule to - r1: uid=1004,depend=r1,depend=r4,class=/rcfs/taskclass/c2 - - the command: - # echo -depend=r1 > /rcfs/ce/rules/r1 - will change the rule to - r1: uid=1004,class=/rcfs/taskclass/c2 - -6. 
Checking the state of RBCE - State(enabled/disabled) of RBCE can be checked by reading the file - /rcfs/ce/rbce_state, it will show 1(enabled) or 0(disabled). - By default, RBCE is enabled(1). - ex: # cat /rcfs/ce/rbce_state - -7. Changing the state of RBCE - State of RBCE can be changed by writing 1(enable) or 0(disable). - ex: # echo 1 > cat /rcfs/ce/rbce_state - -8. Checking the state of a rule - State of a rule is displayed in the rule. Rule can be viewed by - reading the rule file. ex: # cat /rcfs/ce/rules/r1 - -9. Changing the state of a rule - - State of a rule can be changed by writing "state=1"(enable) or - "state=0"(disable) to the corresponding rule file. By defeault, the - rule is enabled when defined. ex: to disable an existing rule r1, - issue the command - # echo "state=0" > /rcfs/ce/rules/r1 - - diff --git a/Makefile b/Makefile index 4d94580e0..c23dcfbea 100644 --- a/Makefile +++ b/Makefile @@ -1,7 +1,7 @@ VERSION = 2 PATCHLEVEL = 6 SUBLEVEL = 8 -EXTRAVERSION = -1.521.2.5.planetlab +EXTRAVERSION = -1.planetlab NAME=Zonked Quokka # *DOCUMENTATION* diff --git a/arch/i386/kernel/entry.S b/arch/i386/kernel/entry.S index 3ac74183c..bb91de327 100644 --- a/arch/i386/kernel/entry.S +++ b/arch/i386/kernel/entry.S @@ -1030,7 +1030,5 @@ ENTRY(sys_call_table) .long sys_mq_notify .long sys_mq_getsetattr .long sys_ni_syscall /* reserved for kexec */ - .long sys_ioprio_set - .long sys_ioprio_get /* 285 */ syscall_table_size=(.-sys_call_table) diff --git a/arch/ppc/kernel/misc.S b/arch/ppc/kernel/misc.S index 81a72414a..017da4476 100644 --- a/arch/ppc/kernel/misc.S +++ b/arch/ppc/kernel/misc.S @@ -1450,5 +1450,3 @@ _GLOBAL(sys_call_table) .long sys_mq_notify .long sys_mq_getsetattr .long sys_ni_syscall /* 268 reserved for sys_kexec_load */ - .long sys_ioprio_set - .long sys_ioprio_get diff --git a/configs/kernel-2.6.8-i686-planetlab-desktop.config b/configs/kernel-2.6.8-i686-planetlab-desktop.config deleted file mode 100644 index 9426fb0c2..000000000 --- a/configs/kernel-2.6.8-i686-planetlab-desktop.config +++ /dev/null @@ -1,1750 +0,0 @@ -# -# Automatically generated make config: don't edit -# -CONFIG_X86=y -CONFIG_MMU=y -CONFIG_UID16=y -CONFIG_GENERIC_ISA_DMA=y - -# -# Code maturity level options -# -CONFIG_EXPERIMENTAL=y -CONFIG_CLEAN_COMPILE=y -CONFIG_BROKEN_ON_SMP=y - -# -# General setup -# -CONFIG_SWAP=y -CONFIG_SYSVIPC=y -CONFIG_POSIX_MQUEUE=y -CONFIG_BSD_PROCESS_ACCT=y -# CONFIG_BSD_PROCESS_ACCT_V3 is not set - -# -# Class Based Kernel Resource Management -# -CONFIG_CKRM=y -CONFIG_RCFS_FS=y -CONFIG_CKRM_TYPE_TASKCLASS=y -CONFIG_CKRM_RES_NUMTASKS=y -CONFIG_CKRM_CPU_SCHEDULE=y -CONFIG_CKRM_RES_BLKIO=y -CONFIG_CKRM_RES_MEM=y -# CONFIG_CKRM_MEM_LRUORDER_CHANGE is not set -# CONFIG_CKRM_TYPE_SOCKETCLASS is not set -CONFIG_CKRM_RBCE=y -CONFIG_SYSCTL=y -# CONFIG_AUDIT is not set -CONFIG_LOG_BUF_SHIFT=17 -# CONFIG_HOTPLUG is not set -CONFIG_IKCONFIG=y -CONFIG_IKCONFIG_PROC=y -CONFIG_OOM_PANIC=y -# CONFIG_EMBEDDED is not set -# CONFIG_DELAY_ACCT is not set -CONFIG_KALLSYMS=y -# CONFIG_KALLSYMS_ALL is not set -CONFIG_KALLSYMS_EXTRA_PASS=y -CONFIG_FUTEX=y -CONFIG_EPOLL=y -CONFIG_IOSCHED_NOOP=y -CONFIG_IOSCHED_AS=y -CONFIG_IOSCHED_DEADLINE=y -CONFIG_IOSCHED_CFQ=y -CONFIG_CC_OPTIMIZE_FOR_SIZE=y - -# -# Loadable module support -# -CONFIG_MODULES=y -CONFIG_MODULE_UNLOAD=y -# CONFIG_MODULE_FORCE_UNLOAD is not set -CONFIG_OBSOLETE_MODPARM=y -# CONFIG_MODVERSIONS is not set -# CONFIG_MODULE_SIG is not set -CONFIG_KMOD=y - -# -# Processor type and features -# -CONFIG_X86_PC=y -# CONFIG_X86_ELAN is 
not set -# CONFIG_X86_VOYAGER is not set -# CONFIG_X86_NUMAQ is not set -# CONFIG_X86_SUMMIT is not set -# CONFIG_X86_BIGSMP is not set -# CONFIG_X86_VISWS is not set -# CONFIG_X86_GENERICARCH is not set -# CONFIG_X86_ES7000 is not set -# CONFIG_M386 is not set -# CONFIG_M486 is not set -# CONFIG_M586 is not set -# CONFIG_M586TSC is not set -# CONFIG_M586MMX is not set -# CONFIG_M686 is not set -# CONFIG_MPENTIUMII is not set -CONFIG_MPENTIUMIII=y -# CONFIG_MPENTIUMM is not set -# CONFIG_MPENTIUM4 is not set -# CONFIG_MK6 is not set -# CONFIG_MK7 is not set -# CONFIG_MK8 is not set -# CONFIG_MCRUSOE is not set -# CONFIG_MWINCHIPC6 is not set -# CONFIG_MWINCHIP2 is not set -# CONFIG_MWINCHIP3D is not set -# CONFIG_MCYRIXIII is not set -# CONFIG_MVIAC3_2 is not set -CONFIG_X86_GENERIC=y -CONFIG_X86_CMPXCHG=y -CONFIG_X86_XADD=y -CONFIG_X86_L1_CACHE_SHIFT=7 -CONFIG_RWSEM_XCHGADD_ALGORITHM=y -CONFIG_X86_WP_WORKS_OK=y -CONFIG_X86_INVLPG=y -CONFIG_X86_BSWAP=y -CONFIG_X86_POPAD_OK=y -CONFIG_X86_GOOD_APIC=y -CONFIG_X86_INTEL_USERCOPY=y -CONFIG_X86_USE_PPRO_CHECKSUM=y -CONFIG_X86_4G=y -CONFIG_X86_SWITCH_PAGETABLES=y -CONFIG_X86_4G_VM_LAYOUT=y -CONFIG_X86_UACCESS_INDIRECT=y -CONFIG_X86_HIGH_ENTRY=y -CONFIG_HPET_TIMER=y -CONFIG_HPET_EMULATE_RTC=y -# CONFIG_SMP is not set -# CONFIG_PREEMPT is not set -# CONFIG_PREEMPT_VOLUNTARY is not set -# CONFIG_X86_UP_APIC is not set -CONFIG_X86_TSC=y -CONFIG_X86_MCE=y -# CONFIG_X86_MCE_NONFATAL is not set -CONFIG_TOSHIBA=m -CONFIG_I8K=m -CONFIG_MICROCODE=m -CONFIG_X86_MSR=m -CONFIG_X86_CPUID=m - -# -# Firmware Drivers -# -CONFIG_EDD=m -# CONFIG_NOHIGHMEM is not set -CONFIG_HIGHMEM4G=y -# CONFIG_HIGHMEM64G is not set -CONFIG_HIGHMEM=y -CONFIG_HIGHPTE=y -# CONFIG_MATH_EMULATION is not set -CONFIG_MTRR=y -# CONFIG_EFI is not set -CONFIG_REGPARM=y - -# -# Power management options (ACPI, APM) -# -CONFIG_PM=y -# CONFIG_SOFTWARE_SUSPEND is not set -# CONFIG_PM_DISK is not set - -# -# ACPI (Advanced Configuration and Power Interface) Support -# -CONFIG_ACPI=y -CONFIG_ACPI_BOOT=y -CONFIG_ACPI_INTERPRETER=y -CONFIG_ACPI_SLEEP=y -CONFIG_ACPI_SLEEP_PROC_FS=y -CONFIG_ACPI_AC=m -CONFIG_ACPI_BATTERY=m -CONFIG_ACPI_BUTTON=m -CONFIG_ACPI_FAN=y -CONFIG_ACPI_PROCESSOR=y -CONFIG_ACPI_THERMAL=y -CONFIG_ACPI_ASUS=m -CONFIG_ACPI_TOSHIBA=m -# CONFIG_ACPI_DEBUG is not set -CONFIG_ACPI_BUS=y -CONFIG_ACPI_EC=y -CONFIG_ACPI_POWER=y -CONFIG_ACPI_PCI=y -CONFIG_ACPI_SYSTEM=y -CONFIG_X86_PM_TIMER=y - -# -# APM (Advanced Power Management) BIOS Support -# -CONFIG_APM=m -# CONFIG_APM_IGNORE_USER_SUSPEND is not set -# CONFIG_APM_DO_ENABLE is not set -CONFIG_APM_CPU_IDLE=y -# CONFIG_APM_DISPLAY_BLANK is not set -CONFIG_APM_RTC_IS_GMT=y -# CONFIG_APM_ALLOW_INTS is not set -# CONFIG_APM_REAL_MODE_POWER_OFF is not set - -# -# CPU Frequency scaling -# -CONFIG_CPU_FREQ=y -# CONFIG_CPU_FREQ_PROC_INTF is not set -# CONFIG_CPU_FREQ_DEFAULT_GOV_PERFORMANCE is not set -CONFIG_CPU_FREQ_DEFAULT_GOV_USERSPACE=y -CONFIG_CPU_FREQ_GOV_PERFORMANCE=y -CONFIG_CPU_FREQ_GOV_POWERSAVE=m -CONFIG_CPU_FREQ_GOV_USERSPACE=y -# CONFIG_CPU_FREQ_24_API is not set -CONFIG_CPU_FREQ_TABLE=y - -# -# CPUFreq processor drivers -# -CONFIG_X86_ACPI_CPUFREQ=m -# CONFIG_X86_ACPI_CPUFREQ_PROC_INTF is not set -CONFIG_X86_POWERNOW_K6=m -CONFIG_X86_POWERNOW_K7=m -CONFIG_X86_POWERNOW_K8=m -# CONFIG_X86_GX_SUSPMOD is not set -CONFIG_X86_SPEEDSTEP_CENTRINO=m -CONFIG_X86_SPEEDSTEP_CENTRINO_TABLE=y -CONFIG_X86_SPEEDSTEP_CENTRINO_ACPI=y -CONFIG_X86_SPEEDSTEP_ICH=m -CONFIG_X86_SPEEDSTEP_SMI=m -CONFIG_X86_P4_CLOCKMOD=m -CONFIG_X86_SPEEDSTEP_LIB=m -# 
CONFIG_X86_SPEEDSTEP_RELAXED_CAP_CHECK is not set -CONFIG_X86_LONGRUN=m -CONFIG_X86_LONGHAUL=m - -# -# Bus options (PCI, PCMCIA, EISA, MCA, ISA) -# -CONFIG_PCI=y -# CONFIG_PCI_GOBIOS is not set -# CONFIG_PCI_GOMMCONFIG is not set -# CONFIG_PCI_GODIRECT is not set -CONFIG_PCI_GOANY=y -CONFIG_PCI_BIOS=y -CONFIG_PCI_DIRECT=y -CONFIG_PCI_MMCONFIG=y -CONFIG_PCI_LEGACY_PROC=y -# CONFIG_PCI_NAMES is not set -CONFIG_ISA=y -# CONFIG_EISA is not set -# CONFIG_MCA is not set -# CONFIG_SCx200 is not set - -# -# Executable file formats -# -CONFIG_BINFMT_ELF=y -# CONFIG_BINFMT_AOUT is not set -CONFIG_BINFMT_MISC=y - -# -# Device Drivers -# - -# -# Generic Driver Options -# -CONFIG_STANDALONE=y -CONFIG_PREVENT_FIRMWARE_BUILD=y -# CONFIG_DEBUG_DRIVER is not set - -# -# Memory Technology Devices (MTD) -# -CONFIG_MTD=m -# CONFIG_MTD_DEBUG is not set -CONFIG_MTD_PARTITIONS=y -CONFIG_MTD_CONCAT=m -CONFIG_MTD_REDBOOT_PARTS=m -# CONFIG_MTD_REDBOOT_PARTS_UNALLOCATED is not set -# CONFIG_MTD_REDBOOT_PARTS_READONLY is not set -CONFIG_MTD_CMDLINE_PARTS=y - -# -# User Modules And Translation Layers -# -CONFIG_MTD_CHAR=m -CONFIG_MTD_BLOCK=m -CONFIG_MTD_BLOCK_RO=m -CONFIG_FTL=m -CONFIG_NFTL=m -CONFIG_NFTL_RW=y -CONFIG_INFTL=m - -# -# RAM/ROM/Flash chip drivers -# -CONFIG_MTD_CFI=m -CONFIG_MTD_JEDECPROBE=m -CONFIG_MTD_GEN_PROBE=m -# CONFIG_MTD_CFI_ADV_OPTIONS is not set -CONFIG_MTD_MAP_BANK_WIDTH_1=y -CONFIG_MTD_MAP_BANK_WIDTH_2=y -CONFIG_MTD_MAP_BANK_WIDTH_4=y -# CONFIG_MTD_MAP_BANK_WIDTH_8 is not set -# CONFIG_MTD_MAP_BANK_WIDTH_16 is not set -# CONFIG_MTD_MAP_BANK_WIDTH_32 is not set -CONFIG_MTD_CFI_I1=y -CONFIG_MTD_CFI_I2=y -# CONFIG_MTD_CFI_I4 is not set -# CONFIG_MTD_CFI_I8 is not set -CONFIG_MTD_CFI_INTELEXT=m -CONFIG_MTD_CFI_AMDSTD=m -CONFIG_MTD_CFI_AMDSTD_RETRY=3 -CONFIG_MTD_CFI_STAA=m -CONFIG_MTD_CFI_UTIL=m -CONFIG_MTD_RAM=m -CONFIG_MTD_ROM=m -CONFIG_MTD_ABSENT=m - -# -# Mapping drivers for chip access -# -CONFIG_MTD_COMPLEX_MAPPINGS=y -# CONFIG_MTD_PHYSMAP is not set -# CONFIG_MTD_PNC2000 is not set -CONFIG_MTD_SC520CDP=m -CONFIG_MTD_NETSC520=m -CONFIG_MTD_SBC_GXX=m -CONFIG_MTD_ELAN_104NC=m -CONFIG_MTD_SCx200_DOCFLASH=m -# CONFIG_MTD_AMD76XROM is not set -# CONFIG_MTD_ICHXROM is not set -CONFIG_MTD_SCB2_FLASH=m -# CONFIG_MTD_NETtel is not set -# CONFIG_MTD_DILNETPC is not set -# CONFIG_MTD_L440GX is not set -CONFIG_MTD_PCI=m - -# -# Self-contained MTD device drivers -# -CONFIG_MTD_PMC551=m -# CONFIG_MTD_PMC551_BUGFIX is not set -# CONFIG_MTD_PMC551_DEBUG is not set -# CONFIG_MTD_SLRAM is not set -# CONFIG_MTD_PHRAM is not set -CONFIG_MTD_MTDRAM=m -CONFIG_MTDRAM_TOTAL_SIZE=4096 -CONFIG_MTDRAM_ERASE_SIZE=128 -# CONFIG_MTD_BLKMTD is not set - -# -# Disk-On-Chip Device Drivers -# -CONFIG_MTD_DOC2000=m -# CONFIG_MTD_DOC2001 is not set -CONFIG_MTD_DOC2001PLUS=m -CONFIG_MTD_DOCPROBE=m -CONFIG_MTD_DOCECC=m -# CONFIG_MTD_DOCPROBE_ADVANCED is not set -CONFIG_MTD_DOCPROBE_ADDRESS=0 - -# -# NAND Flash Device Drivers -# -CONFIG_MTD_NAND=m -# CONFIG_MTD_NAND_VERIFY_WRITE is not set -CONFIG_MTD_NAND_IDS=m -# CONFIG_MTD_NAND_DISKONCHIP is not set - -# -# Parallel port support -# -# CONFIG_PARPORT is not set - -# -# Plug and Play support -# -CONFIG_PNP=y -# CONFIG_PNP_DEBUG is not set - -# -# Protocols -# -CONFIG_ISAPNP=y -# CONFIG_PNPBIOS is not set - -# -# Block devices -# -CONFIG_BLK_DEV_FD=m -# CONFIG_BLK_DEV_XD is not set -CONFIG_BLK_CPQ_DA=m -CONFIG_BLK_CPQ_CISS_DA=m -CONFIG_CISS_SCSI_TAPE=y -CONFIG_BLK_DEV_DAC960=m -CONFIG_BLK_DEV_UMEM=m -CONFIG_BLK_DEV_LOOP=m -# CONFIG_BLK_DEV_CRYPTOLOOP is not set 
-CONFIG_BLK_DEV_NBD=m -CONFIG_BLK_DEV_SX8=m -CONFIG_BLK_DEV_RAM=y -CONFIG_BLK_DEV_RAM_SIZE=16384 -CONFIG_BLK_DEV_INITRD=y -CONFIG_LBD=y - -# -# ATA/ATAPI/MFM/RLL support -# -CONFIG_IDE=y -CONFIG_BLK_DEV_IDE=y - -# -# Please see Documentation/ide.txt for help/info on IDE drives -# -# CONFIG_BLK_DEV_IDE_SATA is not set -# CONFIG_BLK_DEV_HD_IDE is not set -CONFIG_BLK_DEV_IDEDISK=y -CONFIG_IDEDISK_MULTI_MODE=y -CONFIG_BLK_DEV_IDECD=y -# CONFIG_BLK_DEV_IDETAPE is not set -CONFIG_BLK_DEV_IDEFLOPPY=y -CONFIG_BLK_DEV_IDESCSI=m -# CONFIG_IDE_TASK_IOCTL is not set -# CONFIG_IDE_TASKFILE_IO is not set - -# -# IDE chipset support/bugfixes -# -CONFIG_IDE_GENERIC=y -# CONFIG_BLK_DEV_CMD640 is not set -CONFIG_BLK_DEV_IDEPNP=y -CONFIG_BLK_DEV_IDEPCI=y -CONFIG_IDEPCI_SHARE_IRQ=y -# CONFIG_BLK_DEV_OFFBOARD is not set -CONFIG_BLK_DEV_GENERIC=y -# CONFIG_BLK_DEV_OPTI621 is not set -CONFIG_BLK_DEV_RZ1000=y -CONFIG_BLK_DEV_IDEDMA_PCI=y -# CONFIG_BLK_DEV_IDEDMA_FORCED is not set -CONFIG_IDEDMA_PCI_AUTO=y -# CONFIG_IDEDMA_ONLYDISK is not set -CONFIG_BLK_DEV_ADMA=y -CONFIG_BLK_DEV_AEC62XX=y -CONFIG_BLK_DEV_ALI15X3=y -# CONFIG_WDC_ALI15X3 is not set -CONFIG_BLK_DEV_AMD74XX=y -CONFIG_BLK_DEV_ATIIXP=y -CONFIG_BLK_DEV_CMD64X=y -CONFIG_BLK_DEV_TRIFLEX=y -CONFIG_BLK_DEV_CY82C693=y -CONFIG_BLK_DEV_CS5520=y -CONFIG_BLK_DEV_CS5530=y -CONFIG_BLK_DEV_HPT34X=y -# CONFIG_HPT34X_AUTODMA is not set -CONFIG_BLK_DEV_HPT366=y -# CONFIG_BLK_DEV_SC1200 is not set -CONFIG_BLK_DEV_PIIX=y -# CONFIG_BLK_DEV_NS87415 is not set -CONFIG_BLK_DEV_PDC202XX_OLD=y -# CONFIG_PDC202XX_BURST is not set -CONFIG_BLK_DEV_PDC202XX_NEW=y -CONFIG_PDC202XX_FORCE=y -CONFIG_BLK_DEV_SVWKS=y -CONFIG_BLK_DEV_SIIMAGE=y -CONFIG_BLK_DEV_SIS5513=y -CONFIG_BLK_DEV_SLC90E66=y -# CONFIG_BLK_DEV_TRM290 is not set -CONFIG_BLK_DEV_VIA82CXXX=y -# CONFIG_IDE_ARM is not set -# CONFIG_IDE_CHIPSETS is not set -CONFIG_BLK_DEV_IDEDMA=y -# CONFIG_IDEDMA_IVB is not set -CONFIG_IDEDMA_AUTO=y -# CONFIG_BLK_DEV_HD is not set - -# -# SCSI device support -# -CONFIG_SCSI=m -CONFIG_SCSI_PROC_FS=y - -# -# SCSI support type (disk, tape, CD-ROM) -# -CONFIG_BLK_DEV_SD=m -CONFIG_CHR_DEV_ST=m -CONFIG_CHR_DEV_OSST=m -CONFIG_BLK_DEV_SR=m -CONFIG_BLK_DEV_SR_VENDOR=y -CONFIG_CHR_DEV_SG=m - -# -# Some SCSI devices (e.g. 
CD jukebox) support multiple LUNs -# -# CONFIG_SCSI_MULTI_LUN is not set -CONFIG_SCSI_CONSTANTS=y -CONFIG_SCSI_LOGGING=y - -# -# SCSI Transport Attributes -# -CONFIG_SCSI_SPI_ATTRS=m -CONFIG_SCSI_FC_ATTRS=m - -# -# SCSI low-level drivers -# -CONFIG_BLK_DEV_3W_XXXX_RAID=m -CONFIG_SCSI_3W_9XXX=m -# CONFIG_SCSI_7000FASST is not set -CONFIG_SCSI_ACARD=m -CONFIG_SCSI_AHA152X=m -CONFIG_SCSI_AHA1542=m -CONFIG_SCSI_AACRAID=m -CONFIG_SCSI_AIC7XXX=m -CONFIG_AIC7XXX_CMDS_PER_DEVICE=4 -CONFIG_AIC7XXX_RESET_DELAY_MS=15000 -# CONFIG_AIC7XXX_BUILD_FIRMWARE is not set -# CONFIG_AIC7XXX_DEBUG_ENABLE is not set -CONFIG_AIC7XXX_DEBUG_MASK=0 -# CONFIG_AIC7XXX_REG_PRETTY_PRINT is not set -CONFIG_SCSI_AIC7XXX_OLD=m -CONFIG_SCSI_AIC79XX=m -CONFIG_AIC79XX_CMDS_PER_DEVICE=4 -CONFIG_AIC79XX_RESET_DELAY_MS=15000 -# CONFIG_AIC79XX_BUILD_FIRMWARE is not set -# CONFIG_AIC79XX_ENABLE_RD_STRM is not set -# CONFIG_AIC79XX_DEBUG_ENABLE is not set -CONFIG_AIC79XX_DEBUG_MASK=0 -# CONFIG_AIC79XX_REG_PRETTY_PRINT is not set -# CONFIG_SCSI_DPT_I2O is not set -CONFIG_SCSI_IN2000=m -CONFIG_SCSI_MEGARAID=m -CONFIG_SCSI_SATA=y -CONFIG_SCSI_SATA_SVW=m -CONFIG_SCSI_ATA_PIIX=m -CONFIG_SCSI_SATA_NV=m -CONFIG_SCSI_SATA_PROMISE=m -CONFIG_SCSI_SATA_SX4=m -CONFIG_SCSI_SATA_SIL=m -CONFIG_SCSI_SATA_SIS=m -CONFIG_SCSI_SATA_VIA=m -CONFIG_SCSI_SATA_VITESSE=m -CONFIG_SCSI_BUSLOGIC=m -# CONFIG_SCSI_OMIT_FLASHPOINT is not set -# CONFIG_SCSI_DMX3191D is not set -# CONFIG_SCSI_DTC3280 is not set -# CONFIG_SCSI_EATA is not set -# CONFIG_SCSI_EATA_PIO is not set -CONFIG_SCSI_FUTURE_DOMAIN=m -CONFIG_SCSI_GDTH=m -# CONFIG_SCSI_GENERIC_NCR5380 is not set -# CONFIG_SCSI_GENERIC_NCR5380_MMIO is not set -CONFIG_SCSI_IPS=m -CONFIG_SCSI_INIA100=m -# CONFIG_SCSI_NCR53C406A is not set -CONFIG_SCSI_SYM53C8XX_2=m -CONFIG_SCSI_SYM53C8XX_DMA_ADDRESSING_MODE=1 -CONFIG_SCSI_SYM53C8XX_DEFAULT_TAGS=16 -CONFIG_SCSI_SYM53C8XX_MAX_TAGS=64 -# CONFIG_SCSI_SYM53C8XX_IOMAPPED is not set -# CONFIG_SCSI_IPR is not set -# CONFIG_SCSI_PAS16 is not set -# CONFIG_SCSI_PSI240I is not set -CONFIG_SCSI_QLOGIC_FAS=m -CONFIG_SCSI_QLOGIC_ISP=m -# CONFIG_SCSI_QLOGIC_FC is not set -CONFIG_SCSI_QLOGIC_1280=m -CONFIG_SCSI_QLA2XXX=m -CONFIG_SCSI_QLA21XX=m -CONFIG_SCSI_QLA22XX=m -CONFIG_SCSI_QLA2300=m -CONFIG_SCSI_QLA2322=m -CONFIG_SCSI_QLA6312=m -CONFIG_SCSI_QLA6322=m -# CONFIG_SCSI_SYM53C416 is not set -# CONFIG_SCSI_DC395x is not set -CONFIG_SCSI_DC390T=m -# CONFIG_SCSI_T128 is not set -# CONFIG_SCSI_U14_34F is not set -# CONFIG_SCSI_ULTRASTOR is not set -# CONFIG_SCSI_NSP32 is not set -# CONFIG_SCSI_DEBUG is not set - -# -# Old CD-ROM drivers (not SCSI, not IDE) -# -# CONFIG_CD_NO_IDESCSI is not set - -# -# Multi-device support (RAID and LVM) -# -CONFIG_MD=y -CONFIG_BLK_DEV_MD=y -CONFIG_MD_LINEAR=m -CONFIG_MD_RAID0=m -CONFIG_MD_RAID1=m -CONFIG_MD_RAID5=m -CONFIG_MD_RAID6=m -CONFIG_MD_MULTIPATH=m -CONFIG_BLK_DEV_DM=m -# CONFIG_DM_CRYPT is not set -CONFIG_DM_SNAPSHOT=m -CONFIG_DM_MIRROR=m -CONFIG_DM_ZERO=m - -# -# Fusion MPT device support -# -CONFIG_FUSION=m -CONFIG_FUSION_MAX_SGE=40 -# CONFIG_FUSION_ISENSE is not set -CONFIG_FUSION_CTL=m - -# -# IEEE 1394 (FireWire) support -# -CONFIG_IEEE1394=m - -# -# Subsystem Options -# -# CONFIG_IEEE1394_VERBOSEDEBUG is not set -CONFIG_IEEE1394_OUI_DB=y -# CONFIG_IEEE1394_EXTRA_CONFIG_ROMS is not set - -# -# Device Drivers -# -# CONFIG_IEEE1394_PCILYNX is not set -CONFIG_IEEE1394_OHCI1394=m - -# -# Protocol Drivers -# -# CONFIG_IEEE1394_VIDEO1394 is not set -CONFIG_IEEE1394_SBP2=m -# CONFIG_IEEE1394_SBP2_PHYS_DMA is not set -# CONFIG_IEEE1394_ETH1394 
is not set -CONFIG_IEEE1394_DV1394=m -CONFIG_IEEE1394_RAWIO=m -CONFIG_IEEE1394_CMP=m -CONFIG_IEEE1394_AMDTP=m - -# -# I2O device support -# -CONFIG_I2O=m -CONFIG_I2O_CONFIG=m -CONFIG_I2O_BLOCK=m -CONFIG_I2O_SCSI=m -CONFIG_I2O_PROC=m - -# -# Networking support -# -CONFIG_NET=y - -# -# Networking options -# -CONFIG_PACKET=y -CONFIG_PACKET_MMAP=y -# CONFIG_NETLINK_DEV is not set -CONFIG_UNIX=y -# CONFIG_NET_KEY is not set -CONFIG_INET=y -# CONFIG_IP_MULTICAST is not set -# CONFIG_IP_ADVANCED_ROUTER is not set -# CONFIG_IP_PNP is not set -# CONFIG_NET_IPIP is not set -# CONFIG_NET_IPGRE is not set -# CONFIG_ARPD is not set -# CONFIG_SYN_COOKIES is not set -# CONFIG_INET_AH is not set -# CONFIG_INET_ESP is not set -# CONFIG_INET_IPCOMP is not set -# CONFIG_ACCEPT_QUEUES is not set - -# -# IP: Virtual Server Configuration -# -# CONFIG_IP_VS is not set -CONFIG_ICMP_IPOD=y -# CONFIG_IPV6 is not set -CONFIG_NETFILTER=y -# CONFIG_NETFILTER_DEBUG is not set - -# -# IP: Netfilter Configuration -# -CONFIG_IP_NF_CONNTRACK=m -CONFIG_IP_NF_FTP=m -CONFIG_IP_NF_IRC=m -CONFIG_IP_NF_TFTP=m -CONFIG_IP_NF_AMANDA=m -CONFIG_IP_NF_QUEUE=m -CONFIG_IP_NF_IPTABLES=m -CONFIG_IP_NF_MATCH_LIMIT=m -CONFIG_IP_NF_MATCH_IPRANGE=m -CONFIG_IP_NF_MATCH_MAC=m -CONFIG_IP_NF_MATCH_PKTTYPE=m -CONFIG_IP_NF_MATCH_MARK=m -CONFIG_IP_NF_MATCH_MULTIPORT=m -CONFIG_IP_NF_MATCH_TOS=m -CONFIG_IP_NF_MATCH_RECENT=m -CONFIG_IP_NF_MATCH_ECN=m -CONFIG_IP_NF_MATCH_DSCP=m -CONFIG_IP_NF_MATCH_AH_ESP=m -CONFIG_IP_NF_MATCH_LENGTH=m -CONFIG_IP_NF_MATCH_TTL=m -CONFIG_IP_NF_MATCH_TCPMSS=m -CONFIG_IP_NF_MATCH_HELPER=m -CONFIG_IP_NF_MATCH_STATE=m -CONFIG_IP_NF_MATCH_CONNTRACK=m -CONFIG_IP_NF_MATCH_OWNER=m -CONFIG_IP_NF_FILTER=m -CONFIG_IP_NF_TARGET_REJECT=m -CONFIG_IP_NF_NAT=m -CONFIG_IP_NF_NAT_NEEDED=y -CONFIG_IP_NF_TARGET_MASQUERADE=m -CONFIG_IP_NF_TARGET_REDIRECT=m -CONFIG_IP_NF_TARGET_NETMAP=m -CONFIG_IP_NF_TARGET_SAME=m -CONFIG_IP_NF_NAT_LOCAL=y -CONFIG_IP_NF_NAT_SNMP_BASIC=m -CONFIG_IP_NF_NAT_IRC=m -CONFIG_IP_NF_NAT_FTP=m -CONFIG_IP_NF_NAT_TFTP=m -CONFIG_IP_NF_NAT_AMANDA=m -CONFIG_IP_NF_MANGLE=m -CONFIG_IP_NF_TARGET_TOS=m -CONFIG_IP_NF_TARGET_ECN=m -CONFIG_IP_NF_TARGET_DSCP=m -CONFIG_IP_NF_TARGET_MARK=m -CONFIG_IP_NF_TARGET_CLASSIFY=m -CONFIG_IP_NF_TARGET_LOG=m -CONFIG_IP_NF_TARGET_ULOG=m -CONFIG_IP_NF_TARGET_TCPMSS=m -CONFIG_IP_NF_ARPTABLES=m -CONFIG_IP_NF_ARPFILTER=m -CONFIG_IP_NF_ARP_MANGLE=m -# CONFIG_IP_NF_COMPAT_IPCHAINS is not set -# CONFIG_IP_NF_COMPAT_IPFWADM is not set -CONFIG_IP_NF_TARGET_NOTRACK=m -CONFIG_IP_NF_RAW=m -CONFIG_IP_NF_MATCH_ADDRTYPE=m -CONFIG_IP_NF_MATCH_REALM=m - -# -# SCTP Configuration (EXPERIMENTAL) -# -# CONFIG_IP_SCTP is not set -# CONFIG_ATM is not set -# CONFIG_BRIDGE is not set -# CONFIG_VLAN_8021Q is not set -# CONFIG_DECNET is not set -# CONFIG_LLC2 is not set -# CONFIG_IPX is not set -# CONFIG_ATALK is not set -# CONFIG_X25 is not set -# CONFIG_LAPB is not set -# CONFIG_NET_DIVERT is not set -# CONFIG_ECONET is not set -# CONFIG_WAN_ROUTER is not set -# CONFIG_NET_HW_FLOWCONTROL is not set - -# -# QoS and/or fair queueing -# -CONFIG_NET_SCHED=y -CONFIG_NET_SCH_CLK_JIFFIES=y -# CONFIG_NET_SCH_CLK_GETTIMEOFDAY is not set -# CONFIG_NET_SCH_CLK_CPU is not set -# CONFIG_NET_SCH_CBQ is not set -CONFIG_NET_SCH_HTB=m -# CONFIG_NET_SCH_HFSC is not set -# CONFIG_NET_SCH_PRIO is not set -# CONFIG_NET_SCH_RED is not set -# CONFIG_NET_SCH_SFQ is not set -# CONFIG_NET_SCH_TEQL is not set -# CONFIG_NET_SCH_TBF is not set -# CONFIG_NET_SCH_GRED is not set -# CONFIG_NET_SCH_DSMARK is not set -# CONFIG_NET_SCH_NETEM is not set -# 
CONFIG_NET_SCH_INGRESS is not set -# CONFIG_NET_QOS is not set -CONFIG_NET_CLS=y -# CONFIG_NET_CLS_TCINDEX is not set -# CONFIG_NET_CLS_ROUTE4 is not set -CONFIG_NET_CLS_ROUTE=y -CONFIG_NET_CLS_FW=m -# CONFIG_NET_CLS_U32 is not set -# CONFIG_NET_CLS_IND is not set - -# -# Network testing -# -# CONFIG_NET_PKTGEN is not set -# CONFIG_NETPOLL is not set -# CONFIG_NET_POLL_CONTROLLER is not set -# CONFIG_HAMRADIO is not set -# CONFIG_IRDA is not set -# CONFIG_BT is not set -# CONFIG_TUX is not set -CONFIG_NETDEVICES=y -CONFIG_DUMMY=m -# CONFIG_BONDING is not set -# CONFIG_EQUALIZER is not set -CONFIG_TUN=m -# CONFIG_NET_SB1000 is not set - -# -# ARCnet devices -# -# CONFIG_ARCNET is not set - -# -# Ethernet (10 or 100Mbit) -# -CONFIG_NET_ETHERNET=y -CONFIG_MII=m -CONFIG_HAPPYMEAL=m -CONFIG_SUNGEM=m -CONFIG_NET_VENDOR_3COM=y -CONFIG_EL1=m -CONFIG_EL2=m -CONFIG_ELPLUS=m -CONFIG_EL16=m -CONFIG_EL3=m -CONFIG_3C515=m -CONFIG_VORTEX=m -CONFIG_TYPHOON=m -CONFIG_LANCE=m -CONFIG_NET_VENDOR_SMC=y -CONFIG_WD80x3=m -CONFIG_ULTRA=m -CONFIG_SMC9194=m -CONFIG_NET_VENDOR_RACAL=y -# CONFIG_NI5010 is not set -CONFIG_NI52=m -CONFIG_NI65=m - -# -# Tulip family network device support -# -CONFIG_NET_TULIP=y -CONFIG_DE2104X=m -CONFIG_TULIP=m -# CONFIG_TULIP_MWI is not set -CONFIG_TULIP_MMIO=y -# CONFIG_TULIP_NAPI is not set -CONFIG_DE4X5=m -CONFIG_WINBOND_840=m -CONFIG_DM9102=m -# CONFIG_AT1700 is not set -CONFIG_DEPCA=m -CONFIG_HP100=m -# CONFIG_NET_ISA is not set -CONFIG_NE2000=m -CONFIG_NET_PCI=y -CONFIG_PCNET32=m -CONFIG_AMD8111_ETH=m -CONFIG_AMD8111E_NAPI=y -CONFIG_ADAPTEC_STARFIRE=m -CONFIG_ADAPTEC_STARFIRE_NAPI=y -CONFIG_AC3200=m -CONFIG_APRICOT=m -CONFIG_B44=m -CONFIG_FORCEDETH=m -CONFIG_CS89x0=m -CONFIG_DGRS=m -CONFIG_EEPRO100=m -# CONFIG_EEPRO100_PIO is not set -CONFIG_E100=m -CONFIG_E100_NAPI=y -CONFIG_FEALNX=m -CONFIG_NATSEMI=m -CONFIG_NE2K_PCI=m -CONFIG_8139CP=m -CONFIG_8139TOO=m -CONFIG_8139TOO_PIO=y -# CONFIG_8139TOO_TUNE_TWISTER is not set -CONFIG_8139TOO_8129=y -# CONFIG_8139_OLD_RX_RESET is not set -CONFIG_SIS900=m -CONFIG_EPIC100=m -CONFIG_SUNDANCE=m -# CONFIG_SUNDANCE_MMIO is not set -CONFIG_TLAN=m -CONFIG_VIA_RHINE=m -CONFIG_VIA_RHINE_MMIO=y -CONFIG_VIA_VELOCITY=m -CONFIG_NET_POCKET=y -CONFIG_ATP=m -CONFIG_DE600=m -CONFIG_DE620=m - -# -# Ethernet (1000 Mbit) -# -CONFIG_ACENIC=m -# CONFIG_ACENIC_OMIT_TIGON_I is not set -CONFIG_DL2K=m -CONFIG_E1000=m -CONFIG_E1000_NAPI=y -CONFIG_NS83820=m -CONFIG_HAMACHI=m -CONFIG_YELLOWFIN=m -CONFIG_R8169=m -CONFIG_SK98LIN=m -CONFIG_TIGON3=m - -# -# Ethernet (10000 Mbit) -# -CONFIG_IXGB=m -CONFIG_IXGB_NAPI=y -CONFIG_S2IO=m -CONFIG_S2IO_NAPI=y - -# -# Token Ring devices -# -# CONFIG_TR is not set - -# -# Wireless LAN (non-hamradio) -# -# CONFIG_NET_RADIO is not set - -# -# Wan interfaces -# -# CONFIG_WAN is not set -# CONFIG_FDDI is not set -# CONFIG_HIPPI is not set -# CONFIG_PPP is not set -# CONFIG_SLIP is not set -# CONFIG_NET_FC is not set -# CONFIG_SHAPER is not set -# CONFIG_NETCONSOLE is not set - -# -# ISDN subsystem -# -# CONFIG_ISDN is not set - -# -# Telephony Support -# -# CONFIG_PHONE is not set - -# -# Input device support -# -CONFIG_INPUT=y - -# -# Userland interfaces -# -CONFIG_INPUT_MOUSEDEV=y -# CONFIG_INPUT_MOUSEDEV_PSAUX is not set -CONFIG_INPUT_MOUSEDEV_SCREEN_X=1024 -CONFIG_INPUT_MOUSEDEV_SCREEN_Y=768 -# CONFIG_INPUT_JOYDEV is not set -# CONFIG_INPUT_TSDEV is not set -# CONFIG_INPUT_EVDEV is not set -# CONFIG_INPUT_EVBUG is not set - -# -# Input I/O drivers -# -# CONFIG_GAMEPORT is not set -CONFIG_SOUND_GAMEPORT=y -CONFIG_SERIO=y 
-CONFIG_SERIO_I8042=y -# CONFIG_SERIO_SERPORT is not set -# CONFIG_SERIO_CT82C710 is not set -# CONFIG_SERIO_PCIPS2 is not set - -# -# Input Device Drivers -# -CONFIG_INPUT_KEYBOARD=y -CONFIG_KEYBOARD_ATKBD=y -# CONFIG_KEYBOARD_SUNKBD is not set -# CONFIG_KEYBOARD_LKKBD is not set -# CONFIG_KEYBOARD_XTKBD is not set -# CONFIG_KEYBOARD_NEWTON is not set -CONFIG_INPUT_MOUSE=y -CONFIG_MOUSE_PS2=y -# CONFIG_MOUSE_SERIAL is not set -# CONFIG_MOUSE_INPORT is not set -# CONFIG_MOUSE_LOGIBM is not set -# CONFIG_MOUSE_PC110PAD is not set -# CONFIG_MOUSE_VSXXXAA is not set -# CONFIG_INPUT_JOYSTICK is not set -# CONFIG_INPUT_TOUCHSCREEN is not set -# CONFIG_INPUT_MISC is not set - -# -# Character devices -# -CONFIG_VT=y -CONFIG_VT_CONSOLE=y -CONFIG_HW_CONSOLE=y -# CONFIG_SERIAL_NONSTANDARD is not set - -# -# Serial drivers -# -CONFIG_SERIAL_8250=y -CONFIG_SERIAL_8250_CONSOLE=y -# CONFIG_SERIAL_8250_ACPI is not set -CONFIG_SERIAL_8250_NR_UARTS=4 -CONFIG_SERIAL_8250_EXTENDED=y -# CONFIG_SERIAL_8250_MANY_PORTS is not set -CONFIG_SERIAL_8250_SHARE_IRQ=y -CONFIG_SERIAL_8250_DETECT_IRQ=y -CONFIG_SERIAL_8250_MULTIPORT=y -CONFIG_SERIAL_8250_RSA=y - -# -# Non-8250 serial port support -# -CONFIG_SERIAL_CORE=y -CONFIG_SERIAL_CORE_CONSOLE=y -CONFIG_UNIX98_PTYS=y -CONFIG_LEGACY_PTYS=y -# CONFIG_CRASH is not set -CONFIG_LEGACY_PTY_COUNT=256 -# CONFIG_QIC02_TAPE is not set - -# -# IPMI -# -CONFIG_IPMI_HANDLER=m -# CONFIG_IPMI_PANIC_EVENT is not set -CONFIG_IPMI_DEVICE_INTERFACE=m -CONFIG_IPMI_SI=m -CONFIG_IPMI_WATCHDOG=m - -# -# Watchdog Cards -# -# CONFIG_WATCHDOG is not set -CONFIG_HW_RANDOM=m -CONFIG_NVRAM=m -CONFIG_RTC=y -# CONFIG_DTLK is not set -# CONFIG_R3964 is not set -# CONFIG_APPLICOM is not set -# CONFIG_SONYPI is not set - -# -# Ftape, the floppy tape device driver -# -# CONFIG_FTAPE is not set -CONFIG_AGP=m -CONFIG_AGP_ALI=m -CONFIG_AGP_ATI=m -CONFIG_AGP_AMD=m -CONFIG_AGP_AMD64=m -CONFIG_AGP_INTEL=m -CONFIG_AGP_INTEL_MCH=m -CONFIG_AGP_NVIDIA=m -CONFIG_AGP_SIS=m -CONFIG_AGP_SWORKS=m -CONFIG_AGP_VIA=m -CONFIG_AGP_EFFICEON=m -CONFIG_DRM=y -CONFIG_DRM_TDFX=m -CONFIG_DRM_GAMMA=m -CONFIG_DRM_R128=m -CONFIG_DRM_RADEON=m -CONFIG_DRM_I810=m -CONFIG_DRM_I830=m -CONFIG_DRM_MGA=m -CONFIG_DRM_SIS=m -CONFIG_MWAVE=m -# CONFIG_RAW_DRIVER is not set -# CONFIG_HPET is not set -CONFIG_HANGCHECK_TIMER=m - -# -# I2C support -# -CONFIG_I2C=m -CONFIG_I2C_CHARDEV=m - -# -# I2C Algorithms -# -CONFIG_I2C_ALGOBIT=m -CONFIG_I2C_ALGOPCF=m - -# -# I2C Hardware Bus support -# -CONFIG_I2C_ALI1535=m -CONFIG_I2C_ALI1563=m -CONFIG_I2C_ALI15X3=m -CONFIG_I2C_AMD756=m -CONFIG_I2C_AMD8111=m -# CONFIG_I2C_ELEKTOR is not set -# CONFIG_I2C_I801 is not set -CONFIG_I2C_I810=m -CONFIG_I2C_ISA=m -CONFIG_I2C_NFORCE2=m -# CONFIG_I2C_PARPORT_LIGHT is not set -CONFIG_I2C_PIIX4=m -CONFIG_I2C_PROSAVAGE=m -CONFIG_I2C_SAVAGE4=m -# CONFIG_SCx200_ACB is not set -CONFIG_I2C_SIS5595=m -CONFIG_I2C_SIS630=m -CONFIG_I2C_SIS96X=m -CONFIG_I2C_VIA=m -CONFIG_I2C_VIAPRO=m -CONFIG_I2C_VOODOO3=m - -# -# Hardware Sensors Chip support -# -CONFIG_I2C_SENSOR=m -CONFIG_SENSORS_ADM1021=m -CONFIG_SENSORS_ADM1025=m -CONFIG_SENSORS_ADM1031=m -CONFIG_SENSORS_ASB100=m -CONFIG_SENSORS_DS1621=m -CONFIG_SENSORS_FSCHER=m -CONFIG_SENSORS_GL518SM=m -CONFIG_SENSORS_IT87=m -CONFIG_SENSORS_LM75=m -CONFIG_SENSORS_LM77=m -CONFIG_SENSORS_LM78=m -CONFIG_SENSORS_LM80=m -CONFIG_SENSORS_LM83=m -CONFIG_SENSORS_LM85=m -CONFIG_SENSORS_LM90=m -CONFIG_SENSORS_MAX1619=m -CONFIG_SENSORS_VIA686A=m -CONFIG_SENSORS_W83781D=m -CONFIG_SENSORS_W83L785TS=m -CONFIG_SENSORS_W83627HF=m - -# -# Other I2C Chip 
support -# -CONFIG_SENSORS_EEPROM=m -CONFIG_SENSORS_PCF8574=m -CONFIG_SENSORS_PCF8591=m -CONFIG_SENSORS_RTC8564=m -# CONFIG_I2C_DEBUG_CORE is not set -# CONFIG_I2C_DEBUG_ALGO is not set -# CONFIG_I2C_DEBUG_BUS is not set -# CONFIG_I2C_DEBUG_CHIP is not set - -# -# Dallas's 1-wire bus -# -# CONFIG_W1 is not set - -# -# Misc devices -# -CONFIG_IBM_ASM=m - -# -# Multimedia devices -# -CONFIG_VIDEO_DEV=m - -# -# Video For Linux -# - -# -# Video Adapters -# -# CONFIG_VIDEO_BT848 is not set -CONFIG_VIDEO_PMS=m -CONFIG_VIDEO_CPIA=m -# CONFIG_VIDEO_CPIA_USB is not set -CONFIG_VIDEO_SAA5246A=m -CONFIG_VIDEO_SAA5249=m -CONFIG_TUNER_3036=m -CONFIG_VIDEO_STRADIS=m -CONFIG_VIDEO_ZORAN=m -CONFIG_VIDEO_ZORAN_BUZ=m -CONFIG_VIDEO_ZORAN_DC10=m -CONFIG_VIDEO_ZORAN_DC30=m -CONFIG_VIDEO_ZORAN_LML33=m -CONFIG_VIDEO_ZORAN_LML33R10=m -CONFIG_VIDEO_SAA7134=m -CONFIG_VIDEO_MXB=m -CONFIG_VIDEO_DPC=m -CONFIG_VIDEO_HEXIUM_ORION=m -CONFIG_VIDEO_HEXIUM_GEMINI=m -CONFIG_VIDEO_CX88=m -CONFIG_VIDEO_OVCAMCHIP=m - -# -# Radio Adapters -# -CONFIG_RADIO_CADET=m -CONFIG_RADIO_RTRACK=m -CONFIG_RADIO_RTRACK2=m -CONFIG_RADIO_AZTECH=m -CONFIG_RADIO_GEMTEK=m -CONFIG_RADIO_GEMTEK_PCI=m -CONFIG_RADIO_MAXIRADIO=m -CONFIG_RADIO_MAESTRO=m -CONFIG_RADIO_SF16FMI=m -CONFIG_RADIO_SF16FMR2=m -CONFIG_RADIO_TERRATEC=m -CONFIG_RADIO_TRUST=m -CONFIG_RADIO_TYPHOON=m -CONFIG_RADIO_TYPHOON_PROC_FS=y -CONFIG_RADIO_ZOLTRIX=m - -# -# Digital Video Broadcasting Devices -# -# CONFIG_DVB is not set -CONFIG_VIDEO_SAA7146=m -CONFIG_VIDEO_SAA7146_VV=m -CONFIG_VIDEO_VIDEOBUF=m -CONFIG_VIDEO_TUNER=m -CONFIG_VIDEO_BUF=m -CONFIG_VIDEO_BTCX=m -CONFIG_VIDEO_IR=m - -# -# Graphics support -# -CONFIG_FB=y -CONFIG_FB_CIRRUS=m -# CONFIG_FB_PM2 is not set -# CONFIG_FB_CYBER2000 is not set -# CONFIG_FB_ASILIANT is not set -# CONFIG_FB_IMSTT is not set -CONFIG_FB_VGA16=m -CONFIG_FB_VESA=y -CONFIG_VIDEO_SELECT=y -CONFIG_FB_HGA=m -CONFIG_FB_HGA_ACCEL=y -CONFIG_FB_RIVA=m -# CONFIG_FB_RIVA_I2C is not set -# CONFIG_FB_RIVA_DEBUG is not set -CONFIG_FB_I810=m -CONFIG_FB_I810_GTF=y -CONFIG_FB_MATROX=m -CONFIG_FB_MATROX_MILLENIUM=y -CONFIG_FB_MATROX_MYSTIQUE=y -CONFIG_FB_MATROX_G450=y -CONFIG_FB_MATROX_G100=y -CONFIG_FB_MATROX_I2C=m -CONFIG_FB_MATROX_MAVEN=m -CONFIG_FB_MATROX_MULTIHEAD=y -# CONFIG_FB_RADEON_OLD is not set -CONFIG_FB_RADEON=m -CONFIG_FB_RADEON_I2C=y -# CONFIG_FB_RADEON_DEBUG is not set -CONFIG_FB_ATY128=m -CONFIG_FB_ATY=m -CONFIG_FB_ATY_CT=y -CONFIG_FB_ATY_GX=y -# CONFIG_FB_ATY_XL_INIT is not set -# CONFIG_FB_SIS is not set -CONFIG_FB_NEOMAGIC=m -CONFIG_FB_KYRO=m -CONFIG_FB_3DFX=m -CONFIG_FB_3DFX_ACCEL=y -CONFIG_FB_VOODOO1=m -CONFIG_FB_TRIDENT=m -CONFIG_FB_TRIDENT_ACCEL=y -# CONFIG_FB_VIRTUAL is not set - -# -# Console display driver support -# -CONFIG_VGA_CONSOLE=y -CONFIG_MDA_CONSOLE=m -CONFIG_DUMMY_CONSOLE=y -CONFIG_FRAMEBUFFER_CONSOLE=y -# CONFIG_FONTS is not set -CONFIG_FONT_8x8=y -CONFIG_FONT_8x16=y - -# -# Logo configuration -# -# CONFIG_LOGO is not set - -# -# Sound -# -CONFIG_SOUND=m - -# -# Advanced Linux Sound Architecture -# -CONFIG_SND=m -CONFIG_SND_TIMER=m -CONFIG_SND_PCM=m -CONFIG_SND_HWDEP=m -CONFIG_SND_RAWMIDI=m -CONFIG_SND_SEQUENCER=m -CONFIG_SND_SEQ_DUMMY=m -CONFIG_SND_OSSEMUL=y -CONFIG_SND_MIXER_OSS=m -CONFIG_SND_PCM_OSS=m -CONFIG_SND_SEQUENCER_OSS=y -CONFIG_SND_RTCTIMER=m -# CONFIG_SND_VERBOSE_PRINTK is not set -# CONFIG_SND_DEBUG is not set - -# -# Generic devices -# -CONFIG_SND_MPU401_UART=m -CONFIG_SND_OPL3_LIB=m -CONFIG_SND_OPL4_LIB=m -CONFIG_SND_VX_LIB=m -CONFIG_SND_DUMMY=m -CONFIG_SND_VIRMIDI=m -CONFIG_SND_MTPAV=m -# 
CONFIG_SND_SERIAL_U16550 is not set -CONFIG_SND_MPU401=m - -# -# ISA devices -# -CONFIG_SND_AD1816A=m -CONFIG_SND_AD1848=m -CONFIG_SND_CS4231=m -CONFIG_SND_CS4232=m -CONFIG_SND_CS4236=m -CONFIG_SND_ES968=m -CONFIG_SND_ES1688=m -CONFIG_SND_ES18XX=m -CONFIG_SND_GUSCLASSIC=m -CONFIG_SND_GUSEXTREME=m -CONFIG_SND_GUSMAX=m -CONFIG_SND_INTERWAVE=m -CONFIG_SND_INTERWAVE_STB=m -CONFIG_SND_OPTI92X_AD1848=m -CONFIG_SND_OPTI92X_CS4231=m -CONFIG_SND_OPTI93X=m -CONFIG_SND_SB8=m -CONFIG_SND_SB16=m -CONFIG_SND_SBAWE=m -CONFIG_SND_SB16_CSP=y -# CONFIG_SND_WAVEFRONT is not set -CONFIG_SND_ALS100=m -CONFIG_SND_AZT2320=m -CONFIG_SND_CMI8330=m -CONFIG_SND_DT019X=m -CONFIG_SND_OPL3SA2=m -CONFIG_SND_SGALAXY=m -CONFIG_SND_SSCAPE=m - -# -# PCI devices -# -CONFIG_SND_AC97_CODEC=m -CONFIG_SND_ALI5451=m -CONFIG_SND_ATIIXP=m -CONFIG_SND_AU8810=m -CONFIG_SND_AU8820=m -CONFIG_SND_AU8830=m -CONFIG_SND_AZT3328=m -CONFIG_SND_BT87X=m -CONFIG_SND_CS46XX=m -CONFIG_SND_CS46XX_NEW_DSP=y -CONFIG_SND_CS4281=m -CONFIG_SND_EMU10K1=m -CONFIG_SND_KORG1212=m -CONFIG_SND_MIXART=m -CONFIG_SND_NM256=m -CONFIG_SND_RME32=m -CONFIG_SND_RME96=m -CONFIG_SND_RME9652=m -CONFIG_SND_HDSP=m -CONFIG_SND_TRIDENT=m -CONFIG_SND_YMFPCI=m -CONFIG_SND_ALS4000=m -CONFIG_SND_CMIPCI=m -CONFIG_SND_ENS1370=m -CONFIG_SND_ENS1371=m -CONFIG_SND_ES1938=m -CONFIG_SND_ES1968=m -CONFIG_SND_MAESTRO3=m -CONFIG_SND_FM801=m -CONFIG_SND_FM801_TEA575X=m -CONFIG_SND_ICE1712=m -CONFIG_SND_ICE1724=m -CONFIG_SND_INTEL8X0=m -CONFIG_SND_INTEL8X0M=m -CONFIG_SND_SONICVIBES=m -CONFIG_SND_VIA82XX=m -CONFIG_SND_VX222=m - -# -# ALSA USB devices -# -# CONFIG_SND_USB_AUDIO is not set - -# -# Open Sound System -# -# CONFIG_SOUND_PRIME is not set - -# -# USB support -# -CONFIG_USB=m -# CONFIG_USB_DEBUG is not set - -# -# Miscellaneous USB options -# -CONFIG_USB_DEVICEFS=y -# CONFIG_USB_BANDWIDTH is not set -# CONFIG_USB_DYNAMIC_MINORS is not set - -# -# USB Host Controller Drivers -# -CONFIG_USB_EHCI_HCD=m -CONFIG_USB_EHCI_SPLIT_ISO=y -CONFIG_USB_EHCI_ROOT_HUB_TT=y -CONFIG_USB_OHCI_HCD=m -CONFIG_USB_UHCI_HCD=m - -# -# USB Device Class drivers -# -# CONFIG_USB_AUDIO is not set -# CONFIG_USB_BLUETOOTH_TTY is not set -# CONFIG_USB_MIDI is not set -# CONFIG_USB_ACM is not set -# CONFIG_USB_PRINTER is not set -CONFIG_USB_STORAGE=m -# CONFIG_USB_STORAGE_DEBUG is not set -CONFIG_USB_STORAGE_RW_DETECT=y -CONFIG_USB_STORAGE_DATAFAB=y -CONFIG_USB_STORAGE_FREECOM=y -CONFIG_USB_STORAGE_ISD200=y -CONFIG_USB_STORAGE_DPCM=y -CONFIG_USB_STORAGE_HP8200e=y -CONFIG_USB_STORAGE_SDDR09=y -CONFIG_USB_STORAGE_SDDR55=y -CONFIG_USB_STORAGE_JUMPSHOT=y - -# -# USB Human Interface Devices (HID) -# -# CONFIG_USB_HID is not set - -# -# USB HID Boot Protocol drivers -# -# CONFIG_USB_KBD is not set -# CONFIG_USB_MOUSE is not set -# CONFIG_USB_AIPTEK is not set -# CONFIG_USB_WACOM is not set -# CONFIG_USB_KBTAB is not set -# CONFIG_USB_POWERMATE is not set -# CONFIG_USB_MTOUCH is not set -# CONFIG_USB_EGALAX is not set -# CONFIG_USB_XPAD is not set -# CONFIG_USB_ATI_REMOTE is not set - -# -# USB Imaging devices -# -# CONFIG_USB_MDC800 is not set -# CONFIG_USB_MICROTEK is not set -# CONFIG_USB_HPUSBSCSI is not set - -# -# USB Multimedia devices -# -# CONFIG_USB_DABUSB is not set -# CONFIG_USB_VICAM is not set -# CONFIG_USB_DSBR is not set -# CONFIG_USB_IBMCAM is not set -# CONFIG_USB_KONICAWC is not set -# CONFIG_USB_OV511 is not set -# CONFIG_USB_PWC is not set -# CONFIG_USB_SE401 is not set -# CONFIG_USB_SN9C102 is not set -# CONFIG_USB_STV680 is not set -# CONFIG_USB_W9968CF is not set - -# -# USB Network adaptors -# 
-CONFIG_USB_CATC=m -CONFIG_USB_KAWETH=m -CONFIG_USB_PEGASUS=m -CONFIG_USB_RTL8150=m -CONFIG_USB_USBNET=m - -# -# USB Host-to-Host Cables -# -CONFIG_USB_ALI_M5632=y -CONFIG_USB_AN2720=y -CONFIG_USB_BELKIN=y -CONFIG_USB_GENESYS=y -CONFIG_USB_NET1080=y -CONFIG_USB_PL2301=y - -# -# Intelligent USB Devices/Gadgets -# -CONFIG_USB_ARMLINUX=y -CONFIG_USB_EPSON2888=y -CONFIG_USB_ZAURUS=y -CONFIG_USB_CDCETHER=y - -# -# USB Network Adapters -# -CONFIG_USB_AX8817X=y - -# -# USB port drivers -# - -# -# USB Serial Converter support -# -# CONFIG_USB_SERIAL is not set - -# -# USB Miscellaneous drivers -# -# CONFIG_USB_EMI62 is not set -# CONFIG_USB_EMI26 is not set -# CONFIG_USB_TIGL is not set -# CONFIG_USB_AUERSWALD is not set -# CONFIG_USB_RIO500 is not set -# CONFIG_USB_LEGOTOWER is not set -# CONFIG_USB_LCD is not set -# CONFIG_USB_LED is not set -# CONFIG_USB_CYTHERM is not set -# CONFIG_USB_PHIDGETSERVO is not set -# CONFIG_USB_TEST is not set - -# -# USB Gadget Support -# -# CONFIG_USB_GADGET is not set - -# -# File systems -# -CONFIG_EXT2_FS=y -CONFIG_EXT2_FS_XATTR=y -CONFIG_EXT2_FS_POSIX_ACL=y -CONFIG_EXT2_FS_SECURITY=y -CONFIG_EXT3_FS=y -CONFIG_EXT3_FS_XATTR=y -CONFIG_EXT3_FS_POSIX_ACL=y -CONFIG_EXT3_FS_SECURITY=y -CONFIG_JBD=y -# CONFIG_JBD_DEBUG is not set -CONFIG_FS_MBCACHE=y -# CONFIG_REISERFS_FS is not set -# CONFIG_JFS_FS is not set -CONFIG_FS_POSIX_ACL=y -# CONFIG_XFS_FS is not set -# CONFIG_MINIX_FS is not set -# CONFIG_ROMFS_FS is not set -CONFIG_QUOTA=y -# CONFIG_QFMT_V1 is not set -CONFIG_QFMT_V2=y -CONFIG_QUOTACTL=y -CONFIG_AUTOFS_FS=m -CONFIG_AUTOFS4_FS=m - -# -# CD-ROM/DVD Filesystems -# -CONFIG_ISO9660_FS=y -CONFIG_JOLIET=y -CONFIG_ZISOFS=y -CONFIG_ZISOFS_FS=y -CONFIG_UDF_FS=m -CONFIG_UDF_NLS=y - -# -# DOS/FAT/NT Filesystems -# -CONFIG_FAT_FS=m -CONFIG_MSDOS_FS=m -CONFIG_VFAT_FS=m -CONFIG_FAT_DEFAULT_CODEPAGE=437 -CONFIG_FAT_DEFAULT_IOCHARSET="ascii" -# CONFIG_NTFS_FS is not set - -# -# Pseudo filesystems -# -CONFIG_PROC_FS=y -CONFIG_PROC_KCORE=y -CONFIG_SYSFS=y -# CONFIG_DEVFS_FS is not set -CONFIG_DEVPTS_FS_XATTR=y -CONFIG_DEVPTS_FS_SECURITY=y -CONFIG_TMPFS=y -CONFIG_HUGETLBFS=y -CONFIG_HUGETLB_PAGE=y -CONFIG_RAMFS=y -# CONFIG_RELAYFS_FS is not set - -# -# Miscellaneous filesystems -# -# CONFIG_ADFS_FS is not set -# CONFIG_AFFS_FS is not set -# CONFIG_HFS_FS is not set -# CONFIG_HFSPLUS_FS is not set -# CONFIG_BEFS_FS is not set -# CONFIG_BFS_FS is not set -# CONFIG_EFS_FS is not set -# CONFIG_JFFS_FS is not set -# CONFIG_JFFS2_FS is not set -# CONFIG_CRAMFS is not set -# CONFIG_VXFS_FS is not set -# CONFIG_HPFS_FS is not set -# CONFIG_QNX4FS_FS is not set -# CONFIG_SYSV_FS is not set -# CONFIG_UFS_FS is not set - -# -# Network File Systems -# -# CONFIG_NFS_FS is not set -# CONFIG_NFSD is not set -# CONFIG_EXPORTFS is not set -# CONFIG_SMB_FS is not set -# CONFIG_CIFS is not set -# CONFIG_NCP_FS is not set -# CONFIG_CODA_FS is not set -# CONFIG_AFS_FS is not set - -# -# Partition Types -# -# CONFIG_PARTITION_ADVANCED is not set -CONFIG_MSDOS_PARTITION=y - -# -# Native Language Support -# -CONFIG_NLS=y -CONFIG_NLS_DEFAULT="utf8" -CONFIG_NLS_CODEPAGE_437=m -# CONFIG_NLS_CODEPAGE_737 is not set -# CONFIG_NLS_CODEPAGE_775 is not set -# CONFIG_NLS_CODEPAGE_850 is not set -# CONFIG_NLS_CODEPAGE_852 is not set -# CONFIG_NLS_CODEPAGE_855 is not set -# CONFIG_NLS_CODEPAGE_857 is not set -# CONFIG_NLS_CODEPAGE_860 is not set -# CONFIG_NLS_CODEPAGE_861 is not set -# CONFIG_NLS_CODEPAGE_862 is not set -# CONFIG_NLS_CODEPAGE_863 is not set -# CONFIG_NLS_CODEPAGE_864 is not set -# 
CONFIG_NLS_CODEPAGE_865 is not set -# CONFIG_NLS_CODEPAGE_866 is not set -# CONFIG_NLS_CODEPAGE_869 is not set -# CONFIG_NLS_CODEPAGE_936 is not set -# CONFIG_NLS_CODEPAGE_950 is not set -# CONFIG_NLS_CODEPAGE_932 is not set -# CONFIG_NLS_CODEPAGE_949 is not set -# CONFIG_NLS_CODEPAGE_874 is not set -# CONFIG_NLS_ISO8859_8 is not set -# CONFIG_NLS_CODEPAGE_1250 is not set -# CONFIG_NLS_CODEPAGE_1251 is not set -# CONFIG_NLS_ASCII is not set -CONFIG_NLS_ISO8859_1=m -# CONFIG_NLS_ISO8859_2 is not set -# CONFIG_NLS_ISO8859_3 is not set -# CONFIG_NLS_ISO8859_4 is not set -# CONFIG_NLS_ISO8859_5 is not set -# CONFIG_NLS_ISO8859_6 is not set -# CONFIG_NLS_ISO8859_7 is not set -# CONFIG_NLS_ISO8859_9 is not set -# CONFIG_NLS_ISO8859_13 is not set -# CONFIG_NLS_ISO8859_14 is not set -# CONFIG_NLS_ISO8859_15 is not set -# CONFIG_NLS_KOI8_R is not set -# CONFIG_NLS_KOI8_U is not set -CONFIG_NLS_UTF8=m - -# -# Profiling support -# -# CONFIG_PROFILING is not set - -# -# Kernel hacking -# -CONFIG_CRASH_DUMP=y -CONFIG_CRASH_DUMP_BLOCKDEV=y -# CONFIG_CRASH_DUMP_NETDEV is not set -# CONFIG_CRASH_DUMP_MEMDEV is not set -# CONFIG_CRASH_DUMP_COMPRESS_RLE is not set -# CONFIG_CRASH_DUMP_COMPRESS_GZIP is not set -CONFIG_DEBUG_KERNEL=y -CONFIG_EARLY_PRINTK=y -CONFIG_DEBUG_STACKOVERFLOW=y -# CONFIG_DEBUG_STACK_USAGE is not set -CONFIG_DEBUG_SLAB=y -CONFIG_MAGIC_SYSRQ=y -CONFIG_DEBUG_SPINLOCK=y -# CONFIG_DEBUG_PAGEALLOC is not set -CONFIG_DEBUG_HIGHMEM=y -CONFIG_DEBUG_INFO=y -CONFIG_DEBUG_SPINLOCK_SLEEP=y -# CONFIG_FRAME_POINTER is not set - -# -# Linux VServer -# -CONFIG_VSERVER_LEGACY=y -# CONFIG_VSERVER_PROC_SECURE is not set -# CONFIG_VSERVER_HARDCPU is not set -# CONFIG_INOXID_NONE is not set -# CONFIG_INOXID_UID16 is not set -# CONFIG_INOXID_GID16 is not set -CONFIG_INOXID_UGID24=y -# CONFIG_INOXID_INTERN is not set -# CONFIG_INOXID_RUNTIME is not set -CONFIG_VSERVER_DEBUG=y - -# -# Security options -# -# CONFIG_SECURITY is not set - -# -# Cryptographic options -# -# CONFIG_CRYPTO is not set - -# -# Library routines -# -CONFIG_CRC_CCITT=m -CONFIG_CRC32=y -CONFIG_LIBCRC32C=m -CONFIG_ZLIB_INFLATE=y -CONFIG_X86_BIOS_REBOOT=y -CONFIG_PC=y diff --git a/configs/kernel-2.6.8-i686-planetlab.config b/configs/kernel-2.6.8-i686-planetlab.config index ea66387e5..7d9936db6 100644 --- a/configs/kernel-2.6.8-i686-planetlab.config +++ b/configs/kernel-2.6.8-i686-planetlab.config @@ -20,27 +20,18 @@ CONFIG_SWAP=y CONFIG_SYSVIPC=y CONFIG_POSIX_MQUEUE=y CONFIG_BSD_PROCESS_ACCT=y -# CONFIG_BSD_PROCESS_ACCT_V3 is not set # # Class Based Kernel Resource Management # -CONFIG_CKRM=y -CONFIG_RCFS_FS=y -CONFIG_CKRM_TYPE_TASKCLASS=y -CONFIG_CKRM_RES_NUMTASKS=y -CONFIG_CKRM_CPU_SCHEDULE=y -CONFIG_CKRM_RES_BLKIO=y -# CONFIG_CKRM_RES_MEM is not set -# CONFIG_CKRM_TYPE_SOCKETCLASS is not set -CONFIG_CKRM_RBCE=y +# CONFIG_CKRM is not set +# CONFIG_BSD_PROCESS_ACCT_V3 is not set CONFIG_SYSCTL=y # CONFIG_AUDIT is not set CONFIG_LOG_BUF_SHIFT=17 # CONFIG_HOTPLUG is not set CONFIG_IKCONFIG=y CONFIG_IKCONFIG_PROC=y -CONFIG_OOM_PANIC=y # CONFIG_EMBEDDED is not set # CONFIG_DELAY_ACCT is not set CONFIG_KALLSYMS=y @@ -579,13 +570,8 @@ CONFIG_NET_SCH_HTB=m # CONFIG_NET_SCH_NETEM is not set # CONFIG_NET_SCH_INGRESS is not set # CONFIG_NET_QOS is not set -CONFIG_NET_CLS=y -# CONFIG_NET_CLS_TCINDEX is not set -# CONFIG_NET_CLS_ROUTE4 is not set +# CONFIG_NET_CLS is not set CONFIG_NET_CLS_ROUTE=y -CONFIG_NET_CLS_FW=m -# CONFIG_NET_CLS_U32 is not set -# CONFIG_NET_CLS_IND is not set # # Network testing @@ -849,7 +835,7 @@ CONFIG_UNIX98_PTYS=y # 
CONFIG_DRM is not set # CONFIG_MWAVE is not set # CONFIG_RAW_DRIVER is not set -CONFIG_HANGCHECK_TIMER=y +CONFIG_HANGCHECK_TIMER=m # # I2C support @@ -1053,7 +1039,12 @@ CONFIG_NLS_UTF8=m # # Kernel hacking # -# CONFIG_CRASH_DUMP is not set +CONFIG_CRASH_DUMP=y +CONFIG_CRASH_DUMP_BLOCKDEV=y +# CONFIG_CRASH_DUMP_NETDEV is not set +# CONFIG_CRASH_DUMP_MEMDEV is not set +# CONFIG_CRASH_DUMP_COMPRESS_RLE is not set +# CONFIG_CRASH_DUMP_COMPRESS_GZIP is not set CONFIG_DEBUG_KERNEL=y CONFIG_EARLY_PRINTK=y CONFIG_DEBUG_STACKOVERFLOW=y @@ -1072,14 +1063,14 @@ CONFIG_DEBUG_SPINLOCK_SLEEP=y # CONFIG_VSERVER_LEGACY=y # CONFIG_VSERVER_PROC_SECURE is not set -# CONFIG_VSERVER_HARDCPU is not set +CONFIG_VSERVER_HARDCPU=y # CONFIG_INOXID_NONE is not set # CONFIG_INOXID_UID16 is not set # CONFIG_INOXID_GID16 is not set CONFIG_INOXID_UGID24=y # CONFIG_INOXID_INTERN is not set # CONFIG_INOXID_RUNTIME is not set -# CONFIG_VSERVER_DEBUG is not set +CONFIG_VSERVER_DEBUG=y # # Security options diff --git a/drivers/block/Makefile b/drivers/block/Makefile index c66498bad..2654b5b76 100644 --- a/drivers/block/Makefile +++ b/drivers/block/Makefile @@ -13,13 +13,12 @@ # kblockd threads # -obj-y := elevator.o ll_rw_blk.o ioctl.o genhd.o scsi_ioctl.o ckrm-iostub.o +obj-y := elevator.o ll_rw_blk.o ioctl.o genhd.o scsi_ioctl.o obj-$(CONFIG_IOSCHED_NOOP) += noop-iosched.o obj-$(CONFIG_IOSCHED_AS) += as-iosched.o obj-$(CONFIG_IOSCHED_DEADLINE) += deadline-iosched.o obj-$(CONFIG_IOSCHED_CFQ) += cfq-iosched.o -obj-$(CONFIG_CKRM_RES_BLKIO) += ckrm-io.o obj-$(CONFIG_MAC_FLOPPY) += swim3.o obj-$(CONFIG_BLK_DEV_FD) += floppy.o obj-$(CONFIG_BLK_DEV_FD98) += floppy98.o diff --git a/drivers/block/cfq-iosched.c b/drivers/block/cfq-iosched.c index 7b45a805d..068f4eae0 100644 --- a/drivers/block/cfq-iosched.c +++ b/drivers/block/cfq-iosched.c @@ -6,18 +6,6 @@ * Based on ideas from a previously unfinished io * scheduler (round robin per-process disk scheduling) and Andrea Arcangeli. * - * IO priorities are supported, from 0% to 100% in 5% increments. Both of - * those values have special meaning - 0% class is allowed to do io if - * noone else wants to use the disk. 100% is considered real-time io, and - * always get priority. Default process io rate is 95%. In absence of other - * io, a class may consume 100% disk bandwidth regardless. Withing a class, - * bandwidth is distributed equally among the citizens. 
- * - * TODO: - * - cfq_select_requests() needs some work for 5-95% io - * - barriers not supported - * - export grace periods in ms, not jiffies - * * Copyright (C) 2003 Jens Axboe */ #include @@ -33,186 +21,78 @@ #include #include #include -#include - -#if IOPRIO_NR > BITS_PER_LONG -#error Cannot support this many io priority levels -#endif - -#define LIMIT_DEBUG 1 /* * tunables */ -static int cfq_quantum = 6; -static int cfq_quantum_io = 256; -static int cfq_idle_quantum = 1; -static int cfq_idle_quantum_io = 64; -static int cfq_queued = 4; -static int cfq_grace_rt = HZ / 100 ?: 1; -static int cfq_grace_idle = HZ / 10; +static int cfq_quantum = 4; +static int cfq_queued = 8; #define CFQ_QHASH_SHIFT 6 #define CFQ_QHASH_ENTRIES (1 << CFQ_QHASH_SHIFT) -#define list_entry_qhash(entry) hlist_entry((entry), struct cfq_queue, cfq_hash) +#define list_entry_qhash(entry) list_entry((entry), struct cfq_queue, cfq_hash) #define CFQ_MHASH_SHIFT 8 #define CFQ_MHASH_BLOCK(sec) ((sec) >> 3) #define CFQ_MHASH_ENTRIES (1 << CFQ_MHASH_SHIFT) #define CFQ_MHASH_FN(sec) (hash_long(CFQ_MHASH_BLOCK((sec)),CFQ_MHASH_SHIFT)) +#define ON_MHASH(crq) !list_empty(&(crq)->hash) #define rq_hash_key(rq) ((rq)->sector + (rq)->nr_sectors) -#define list_entry_hash(ptr) hlist_entry((ptr), struct cfq_rq, hash) +#define list_entry_hash(ptr) list_entry((ptr), struct cfq_rq, hash) #define list_entry_cfqq(ptr) list_entry((ptr), struct cfq_queue, cfq_list) -#define list_entry_prio(ptr) list_entry((ptr), struct cfq_rq, prio_list) - -#define cfq_account_io(crq) \ - ((crq)->ioprio != IOPRIO_IDLE && (crq)->ioprio != IOPRIO_RT) - -/* define to be 50 ms for now; make tunable later */ -#define CFQ_EPOCH 50000 -/* Needs to be made tunable right away, in MiB/s */ -#define CFQ_DISKBW 10 -/* Temporary global limit, as percent of available b/w, for each "class" */ -#define CFQ_TEMPLIM 10 - -/* - * defines how we distribute bandwidth (can be tgid, uid, etc) - */ - -/* FIXME: change hash_key to be sizeof(void *) rather than sizeof(int) - * otherwise the cast of cki_tsk_icls will not work reliably on 64-bit arches. - * OR, change cki_tsk_icls to return ints (will need another id space to be - * managed) - */ - -#if defined(CONFIG_CKRM_RES_BLKIO) || defined(CONFIG_CKRM_RES_BLKIO_MODULE) -extern inline void *cki_hash_key(struct task_struct *tsk); -extern inline int cki_ioprio(struct task_struct *tsk); -#define cfq_hash_key(current) ((int)cki_hash_key((current))) -#define cfq_ioprio(current) (cki_ioprio((current))) - -#else -#define cfq_hash_key(current) ((current)->tgid) - -/* - * move to io_context - */ -#define cfq_ioprio(current) ((current)->ioprio) -#endif -#define CFQ_WAIT_RT 0 -#define CFQ_WAIT_NORM 1 +#define RQ_DATA(rq) ((struct cfq_rq *) (rq)->elevator_private) static kmem_cache_t *crq_pool; static kmem_cache_t *cfq_pool; static mempool_t *cfq_mpool; -/* - * defines an io priority level - */ -struct io_prio_data { - struct list_head rr_list; - int busy_queues; - int busy_rq; - unsigned long busy_sectors; - - /* requests, sectors and queues - * added(in),dispatched/deleted(out) - * at this priority level. 
- */ - atomic_t cum_rq_in,cum_rq_out; - atomic_t cum_sectors_in,cum_sectors_out; - atomic_t cum_queues_in,cum_queues_out; - -#ifdef LIMIT_DEBUG - int nskip; - unsigned long navsec; - unsigned long csectorate; - unsigned long lsectorate; -#endif - - struct list_head prio_list; - int last_rq; - int last_sectors; -}; - -/* - * per-request queue structure - */ struct cfq_data { struct list_head rr_list; struct list_head *dispatch; - struct hlist_head *cfq_hash; - struct hlist_head *crq_hash; - mempool_t *crq_pool; + struct list_head *cfq_hash; - struct io_prio_data cid[IOPRIO_NR]; + struct list_head *crq_hash; - /* - * total number of busy queues and requests - */ - int busy_rq; - int busy_queues; - unsigned long busy_sectors; + unsigned int busy_queues; + unsigned int max_queued; + mempool_t *crq_pool; request_queue_t *queue; - unsigned long rq_starved_mask; - - /* - * grace period handling - */ - struct timer_list timer; - unsigned long wait_end; - unsigned long flags; - struct work_struct work; /* * tunables */ unsigned int cfq_quantum; - unsigned int cfq_quantum_io; - unsigned int cfq_idle_quantum; - unsigned int cfq_idle_quantum_io; unsigned int cfq_queued; - unsigned int cfq_grace_rt; - unsigned int cfq_grace_idle; - - unsigned long cfq_epoch; /* duration for limit enforcement */ - unsigned long cfq_epochsectors; /* max sectors dispatchable/epoch */ }; -/* - * per-class structure - */ struct cfq_queue { + struct list_head cfq_hash; struct list_head cfq_list; - struct hlist_node cfq_hash; - int hash_key; struct rb_root sort_list; + int pid; int queued[2]; - int ioprio; - - unsigned long avsec; /* avg sectors dispatched/epoch */ - unsigned long long lastime; /* timestamp of last request served */ - unsigned long sectorate; /* limit for sectors served/epoch */ - int skipped; /* queue skipped at last dispatch ? */ +#if 0 + /* + * with a simple addition like this, we can do io priorities. almost. + * does need a split request free list, too. + */ + int io_prio +#endif }; -/* - * per-request structure - */ struct cfq_rq { - struct cfq_queue *cfq_queue; struct rb_node rb_node; - struct hlist_node hash; sector_t rb_key; struct request *request; - struct list_head prio_list; - unsigned long nr_sectors; - int ioprio; + + struct cfq_queue *cfq_queue; + + struct list_head hash; }; static void cfq_put_queue(struct cfq_data *cfqd, struct cfq_queue *cfqq); @@ -223,13 +103,18 @@ static void cfq_dispatch_sort(struct cfq_data *cfqd, struct cfq_queue *cfqq, /* * lots of deadline iosched dupes, can be abstracted later... 
*/ +static inline void __cfq_del_crq_hash(struct cfq_rq *crq) +{ + list_del_init(&crq->hash); +} + static inline void cfq_del_crq_hash(struct cfq_rq *crq) { - hlist_del_init(&crq->hash); + if (ON_MHASH(crq)) + __cfq_del_crq_hash(crq); } -static inline void -cfq_remove_merge_hints(request_queue_t *q, struct cfq_rq *crq) +static void cfq_remove_merge_hints(request_queue_t *q, struct cfq_rq *crq) { cfq_del_crq_hash(crq); @@ -240,26 +125,27 @@ cfq_remove_merge_hints(request_queue_t *q, struct cfq_rq *crq) static inline void cfq_add_crq_hash(struct cfq_data *cfqd, struct cfq_rq *crq) { struct request *rq = crq->request; - const int hash_idx = CFQ_MHASH_FN(rq_hash_key(rq)); - BUG_ON(!hlist_unhashed(&crq->hash)); - - hlist_add_head(&crq->hash, &cfqd->crq_hash[hash_idx]); + BUG_ON(ON_MHASH(crq)); + + list_add(&crq->hash, &cfqd->crq_hash[CFQ_MHASH_FN(rq_hash_key(rq))]); } static struct request *cfq_find_rq_hash(struct cfq_data *cfqd, sector_t offset) { - struct hlist_head *hash_list = &cfqd->crq_hash[CFQ_MHASH_FN(offset)]; - struct hlist_node *entry, *next; + struct list_head *hash_list = &cfqd->crq_hash[CFQ_MHASH_FN(offset)]; + struct list_head *entry, *next = hash_list->next; - hlist_for_each_safe(entry, next, hash_list) { + while ((entry = next) != hash_list) { struct cfq_rq *crq = list_entry_hash(entry); struct request *__rq = crq->request; - BUG_ON(hlist_unhashed(&crq->hash)); + next = entry->next; + + BUG_ON(!ON_MHASH(crq)); if (!rq_mergeable(__rq)) { - cfq_del_crq_hash(crq); + __cfq_del_crq_hash(crq); continue; } @@ -273,27 +159,20 @@ static struct request *cfq_find_rq_hash(struct cfq_data *cfqd, sector_t offset) /* * rb tree support functions */ -#define RB_EMPTY(node) ((node)->rb_node == NULL) +#define RB_NONE (2) +#define RB_EMPTY(node) ((node)->rb_node == NULL) +#define RB_CLEAR(node) ((node)->rb_color = RB_NONE) +#define RB_CLEAR_ROOT(root) ((root)->rb_node = NULL) +#define ON_RB(node) ((node)->rb_color != RB_NONE) #define rb_entry_crq(node) rb_entry((node), struct cfq_rq, rb_node) #define rq_rb_key(rq) (rq)->sector -static void -cfq_del_crq_rb(struct cfq_data *cfqd, struct cfq_queue *cfqq,struct cfq_rq *crq) +static inline void cfq_del_crq_rb(struct cfq_queue *cfqq, struct cfq_rq *crq) { - if (crq->cfq_queue) { - crq->cfq_queue = NULL; - - if (cfq_account_io(crq)) { - cfqd->busy_rq--; - cfqd->busy_sectors -= crq->nr_sectors; - cfqd->cid[crq->ioprio].busy_rq--; - cfqd->cid[crq->ioprio].busy_sectors -= crq->nr_sectors; - } - atomic_inc(&(cfqd->cid[crq->ioprio].cum_rq_out)); - atomic_add(crq->nr_sectors, - &(cfqd->cid[crq->ioprio].cum_sectors_out)); + if (ON_RB(&crq->rb_node)) { cfqq->queued[rq_data_dir(crq->request)]--; rb_erase(&crq->rb_node, &cfqq->sort_list); + crq->cfq_queue = NULL; } } @@ -326,22 +205,12 @@ cfq_add_crq_rb(struct cfq_data *cfqd, struct cfq_queue *cfqq,struct cfq_rq *crq) struct request *rq = crq->request; struct cfq_rq *__alias; - + crq->rb_key = rq_rb_key(rq); cfqq->queued[rq_data_dir(rq)]++; - if (cfq_account_io(crq)) { - cfqd->busy_rq++; - cfqd->busy_sectors += crq->nr_sectors; - cfqd->cid[crq->ioprio].busy_rq++; - cfqd->cid[crq->ioprio].busy_sectors += crq->nr_sectors; - } - atomic_inc(&(cfqd->cid[crq->ioprio].cum_rq_in)); - atomic_add(crq->nr_sectors, - &(cfqd->cid[crq->ioprio].cum_sectors_in)); retry: __alias = __cfq_add_crq_rb(cfqq, crq); if (!__alias) { rb_insert_color(&crq->rb_node, &cfqq->sort_list); - crq->rb_key = rq_rb_key(rq); crq->cfq_queue = cfqq; return; } @@ -353,7 +222,7 @@ retry: static struct request * cfq_find_rq_rb(struct cfq_data *cfqd, 
sector_t sector) { - struct cfq_queue *cfqq = cfq_find_cfq_hash(cfqd, cfq_hash_key(current)); + struct cfq_queue *cfqq = cfq_find_cfq_hash(cfqd, current->tgid); struct rb_node *n; if (!cfqq) @@ -378,31 +247,16 @@ out: static void cfq_remove_request(request_queue_t *q, struct request *rq) { struct cfq_data *cfqd = q->elevator.elevator_data; - struct cfq_rq *crq = RQ_ELV_DATA(rq); + struct cfq_rq *crq = RQ_DATA(rq); if (crq) { + struct cfq_queue *cfqq = crq->cfq_queue; cfq_remove_merge_hints(q, crq); - list_del_init(&crq->prio_list); list_del_init(&rq->queuelist); - /* - * set a grace period timer to allow realtime io to make real - * progress, if we release an rt request. for normal request, - * set timer so idle io doesn't interfere with other io - */ - if (crq->ioprio == IOPRIO_RT) { - set_bit(CFQ_WAIT_RT, &cfqd->flags); - cfqd->wait_end = jiffies + cfqd->cfq_grace_rt; - } else if (crq->ioprio != IOPRIO_IDLE) { - set_bit(CFQ_WAIT_NORM, &cfqd->flags); - cfqd->wait_end = jiffies + cfqd->cfq_grace_idle; - } - - if (crq->cfq_queue) { - struct cfq_queue *cfqq = crq->cfq_queue; - - cfq_del_crq_rb(cfqd, cfqq, crq); + if (cfqq) { + cfq_del_crq_rb(cfqq, crq); if (RB_EMPTY(&cfqq->sort_list)) cfq_put_queue(cfqd, cfqq); @@ -452,26 +306,18 @@ out_insert: static void cfq_merged_request(request_queue_t *q, struct request *req) { struct cfq_data *cfqd = q->elevator.elevator_data; - struct cfq_rq *crq = RQ_ELV_DATA(req); - int tmp; + struct cfq_rq *crq = RQ_DATA(req); cfq_del_crq_hash(crq); cfq_add_crq_hash(cfqd, crq); - if (crq->cfq_queue && (rq_rb_key(req) != crq->rb_key)) { + if (ON_RB(&crq->rb_node) && (rq_rb_key(req) != crq->rb_key)) { struct cfq_queue *cfqq = crq->cfq_queue; - cfq_del_crq_rb(cfqd, cfqq, crq); + cfq_del_crq_rb(cfqq, crq); cfq_add_crq_rb(cfqd, cfqq, crq); } - tmp = req->hard_nr_sectors - crq->nr_sectors; - cfqd->busy_sectors += tmp; - cfqd->cid[crq->ioprio].busy_sectors += tmp; - atomic_add(tmp,&(cfqd->cid[crq->ioprio].cum_sectors_in)); - - crq->nr_sectors = req->hard_nr_sectors; - q->last_merge = req; } @@ -483,9 +329,6 @@ cfq_merged_requests(request_queue_t *q, struct request *req, cfq_remove_request(q, next); } -/* - * sort into dispatch list, in optimal ascending order - */ static void cfq_dispatch_sort(struct cfq_data *cfqd, struct cfq_queue *cfqq, struct cfq_rq *crq) @@ -493,7 +336,7 @@ cfq_dispatch_sort(struct cfq_data *cfqd, struct cfq_queue *cfqq, struct list_head *head = cfqd->dispatch, *entry = head; struct request *__rq; - cfq_del_crq_rb(cfqd, cfqq, crq); + cfq_del_crq_rb(cfqq, crq); cfq_remove_merge_hints(cfqd->queue, crq); if (!list_empty(head)) { @@ -516,219 +359,47 @@ link: list_add_tail(&crq->request->queuelist, entry); } -/* - * remove from io scheduler core and put on dispatch list for service - */ -static inline int +static inline void __cfq_dispatch_requests(request_queue_t *q, struct cfq_data *cfqd, struct cfq_queue *cfqq) { - struct cfq_rq *crq; - unsigned long long ts, gap; - unsigned long newavsec; - - crq = rb_entry_crq(rb_first(&cfqq->sort_list)); - -#if 1 - /* Determine if queue should be skipped for being overshare */ - ts = sched_clock(); - gap = ts - cfqq->lastime; -#ifdef LIMIT_DEBUG - cfqq->sectorate = (cfqd->cfq_epochsectors - * CFQ_TEMPLIM)/100; - -#endif - if ((gap >= cfqd->cfq_epoch) || (gap < 0)) { - cfqq->avsec = crq->nr_sectors ; - cfqq->lastime = ts; - } else { - u64 tmp; - /* Age old average and accumalate request to be served */ - -// tmp = (u64) (cfqq->avsec * gap) ; -// do_div(tmp, cfqd->cfq_epoch); - newavsec = (unsigned long)(cfqq->avsec 
>> 1) + crq->nr_sectors; -// if (crq->ioprio >= 0 && crq->ioprio <= 20) -// cfqd->cid[crq->ioprio].lsectorate = newavsec; -// atomic_set(&(cfqd->cid[crq->ioprio].lsectorate), -// newavsec); - - if ((newavsec < cfqq->sectorate) || cfqq->skipped) { - cfqq->avsec = newavsec ; - cfqq->lastime = ts; - cfqq->skipped = 0; - } else { - /* queue over share ; skip once */ - cfqq->skipped = 1; -#ifdef LIMIT_DEBUG -// atomic_inc(&(cfqd->cid[crq->ioprio].nskip)); -// if (crq->ioprio >= 0 && crq->ioprio <= 20) -// cfqd->cid[crq->ioprio].nskip++; -#endif - return 0; - } - } -#endif - -#ifdef LIMIT_DEBUG -// if (crq->ioprio >= 0 && crq->ioprio <= 20) { -// cfqd->cid[crq->ioprio].navsec = cfqq->avsec; -// cfqd->cid[crq->ioprio].csectorate = cfqq->sectorate; -// } + struct cfq_rq *crq = rb_entry_crq(rb_first(&cfqq->sort_list)); -// atomic_set(&(cfqd->cid[crq->ioprio].navsec),cfqq->avsec); -// atomic_set(&(cfqd->cid[crq->ioprio].csectorate),cfqq->sectorate); -#endif cfq_dispatch_sort(cfqd, cfqq, crq); - - /* - * technically, for IOPRIO_RT we don't need to add it to the list. - */ - list_add_tail(&crq->prio_list, &cfqd->cid[cfqq->ioprio].prio_list); - return crq->nr_sectors; } -static int -cfq_dispatch_requests(request_queue_t *q, int prio, int max_rq, int max_sectors) +static int cfq_dispatch_requests(request_queue_t *q, struct cfq_data *cfqd) { - struct cfq_data *cfqd = q->elevator.elevator_data; - struct list_head *plist = &cfqd->cid[prio].rr_list; - struct list_head *entry, *nxt; - int q_rq, q_io; - int ret ; + struct cfq_queue *cfqq; + struct list_head *entry, *tmp; + int ret, queued, good_queues; - /* - * for each queue at this prio level, dispatch a request - */ - q_rq = q_io = 0; - list_for_each_safe(entry, nxt, plist) { - struct cfq_queue *cfqq = list_entry_cfqq(entry); + if (list_empty(&cfqd->rr_list)) + return 0; + + queued = ret = 0; +restart: + good_queues = 0; + list_for_each_safe(entry, tmp, &cfqd->rr_list) { + cfqq = list_entry_cfqq(cfqd->rr_list.next); BUG_ON(RB_EMPTY(&cfqq->sort_list)); - ret = __cfq_dispatch_requests(q, cfqd, cfqq); - if (ret <= 0) { - continue; /* skip queue */ - /* can optimize more by moving q to end of plist ? */ - } - q_io += ret ; - q_rq++ ; + __cfq_dispatch_requests(q, cfqd, cfqq); if (RB_EMPTY(&cfqq->sort_list)) cfq_put_queue(cfqd, cfqq); - /* - * if we hit the queue limit, put the string of serviced - * queues at the back of the pending list - */ - if (q_io >= max_sectors || q_rq >= max_rq) { - struct list_head *prv = nxt->prev; - - if (prv != plist) { - list_del(plist); - list_add(plist, prv); - } - break; - } - } - - cfqd->cid[prio].last_rq = q_rq; - cfqd->cid[prio].last_sectors = q_io; - return q_rq; -} - -/* - * try to move some requests to the dispatch list. return 0 on success - */ -static int cfq_select_requests(request_queue_t *q, struct cfq_data *cfqd) -{ - int queued, busy_rq, busy_sectors, i; - - /* - * if there's any realtime io, only schedule that - */ - if (cfq_dispatch_requests(q, IOPRIO_RT, cfqd->cfq_quantum, cfqd->cfq_quantum_io)) - return 1; - - /* - * if RT io was last serviced and grace time hasn't expired, - * arm the timer to restart queueing if no other RT io has been - * submitted in the mean time - */ - if (test_bit(CFQ_WAIT_RT, &cfqd->flags)) { - if (time_before(jiffies, cfqd->wait_end)) { - mod_timer(&cfqd->timer, cfqd->wait_end); - return 0; - } - clear_bit(CFQ_WAIT_RT, &cfqd->flags); - } - - /* - * for each priority level, calculate number of requests we - * are allowed to put into service. 
- */ - queued = 0; - busy_rq = cfqd->busy_rq; - busy_sectors = cfqd->busy_sectors; - for (i = IOPRIO_RT - 1; i > IOPRIO_IDLE; i--) { - const int o_rq = busy_rq - cfqd->cid[i].busy_rq; - const int o_sectors = busy_sectors - cfqd->cid[i].busy_sectors; - int q_rq = cfqd->cfq_quantum * (i + 1) / IOPRIO_NR; - int q_io = cfqd->cfq_quantum_io * (i + 1) / IOPRIO_NR; - - /* - * no need to keep iterating the list, if there are no - * requests pending anymore - */ - if (!cfqd->busy_rq) - break; - - /* - * find out how many requests and sectors we are allowed to - * service - */ - if (o_rq) - q_rq = o_sectors * (i + 1) / IOPRIO_NR; - if (q_rq > cfqd->cfq_quantum) - q_rq = cfqd->cfq_quantum; - - if (o_sectors) - q_io = o_sectors * (i + 1) / IOPRIO_NR; - if (q_io > cfqd->cfq_quantum_io) - q_io = cfqd->cfq_quantum_io; - - /* - * average with last dispatched for fairness - */ - if (cfqd->cid[i].last_rq != -1) - q_rq = (cfqd->cid[i].last_rq + q_rq) / 2; - if (cfqd->cid[i].last_sectors != -1) - q_io = (cfqd->cid[i].last_sectors + q_io) / 2; - - queued += cfq_dispatch_requests(q, i, q_rq, q_io); - } - - if (queued) - return 1; + else + good_queues++; - /* - * only allow dispatch of idle io, if the queue has been idle from - * servicing RT or normal io for the grace period - */ - if (test_bit(CFQ_WAIT_NORM, &cfqd->flags)) { - if (time_before(jiffies, cfqd->wait_end)) { - mod_timer(&cfqd->timer, cfqd->wait_end); - return 0; - } - clear_bit(CFQ_WAIT_NORM, &cfqd->flags); + queued++; + ret = 1; } - /* - * if we found nothing to do, allow idle io to be serviced - */ - if (cfq_dispatch_requests(q, IOPRIO_IDLE, cfqd->cfq_idle_quantum, cfqd->cfq_idle_quantum_io)) - return 1; + if ((queued < cfqd->cfq_quantum) && good_queues) + goto restart; - return 0; + return ret; } static struct request *cfq_next_request(request_queue_t *q) @@ -739,82 +410,61 @@ static struct request *cfq_next_request(request_queue_t *q) if (!list_empty(cfqd->dispatch)) { struct cfq_rq *crq; dispatch: - /* - * end grace period, we are servicing a request - */ - del_timer(&cfqd->timer); - clear_bit(CFQ_WAIT_RT, &cfqd->flags); - clear_bit(CFQ_WAIT_NORM, &cfqd->flags); - - BUG_ON(list_empty(cfqd->dispatch)); rq = list_entry_rq(cfqd->dispatch->next); - BUG_ON(q->last_merge == rq); - crq = RQ_ELV_DATA(rq); - if (crq) { - BUG_ON(!hlist_unhashed(&crq->hash)); - list_del_init(&crq->prio_list); - } + crq = RQ_DATA(rq); + if (crq) + cfq_remove_merge_hints(q, crq); return rq; } - /* - * we moved requests to dispatch list, go back end serve one - */ - if (cfq_select_requests(q, cfqd)) + if (cfq_dispatch_requests(q, cfqd)) goto dispatch; return NULL; } static inline struct cfq_queue * -__cfq_find_cfq_hash(struct cfq_data *cfqd, int hashkey, const int hashval) +__cfq_find_cfq_hash(struct cfq_data *cfqd, int pid, const int hashval) { - struct hlist_head *hash_list = &cfqd->cfq_hash[hashval]; - struct hlist_node *entry; + struct list_head *hash_list = &cfqd->cfq_hash[hashval]; + struct list_head *entry; - hlist_for_each(entry, hash_list) { + list_for_each(entry, hash_list) { struct cfq_queue *__cfqq = list_entry_qhash(entry); - if (__cfqq->hash_key == hashkey) + if (__cfqq->pid == pid) return __cfqq; } return NULL; } - -static struct cfq_queue *cfq_find_cfq_hash(struct cfq_data *cfqd, int hashkey) +static struct cfq_queue *cfq_find_cfq_hash(struct cfq_data *cfqd, int pid) { - const int hashval = hash_long(hashkey, CFQ_QHASH_SHIFT); + const int hashval = hash_long(current->tgid, CFQ_QHASH_SHIFT); - return __cfq_find_cfq_hash(cfqd, hashkey, hashval); + return 
__cfq_find_cfq_hash(cfqd, pid, hashval); } static void cfq_put_queue(struct cfq_data *cfqd, struct cfq_queue *cfqq) { cfqd->busy_queues--; - WARN_ON(cfqd->busy_queues < 0); - - cfqd->cid[cfqq->ioprio].busy_queues--; - WARN_ON(cfqd->cid[cfqq->ioprio].busy_queues < 0); - atomic_inc(&(cfqd->cid[cfqq->ioprio].cum_queues_out)); - list_del(&cfqq->cfq_list); - hlist_del(&cfqq->cfq_hash); + list_del(&cfqq->cfq_hash); mempool_free(cfqq, cfq_mpool); } -static struct cfq_queue *__cfq_get_queue(struct cfq_data *cfqd, int hashkey, +static struct cfq_queue *__cfq_get_queue(struct cfq_data *cfqd, int pid, int gfp_mask) { - const int hashval = hash_long(hashkey, CFQ_QHASH_SHIFT); + const int hashval = hash_long(current->tgid, CFQ_QHASH_SHIFT); struct cfq_queue *cfqq, *new_cfqq = NULL; request_queue_t *q = cfqd->queue; retry: - cfqq = __cfq_find_cfq_hash(cfqd, hashkey, hashval); + cfqq = __cfq_find_cfq_hash(cfqd, pid, hashval); if (!cfqq) { if (new_cfqq) { @@ -828,15 +478,13 @@ retry: } else return NULL; - memset(cfqq, 0, sizeof(*cfqq)); - INIT_HLIST_NODE(&cfqq->cfq_hash); + INIT_LIST_HEAD(&cfqq->cfq_hash); INIT_LIST_HEAD(&cfqq->cfq_list); - cfqq->hash_key = cfq_hash_key(current); - cfqq->ioprio = cfq_ioprio(current); - cfqq->avsec = 0 ; - cfqq->lastime = sched_clock(); - cfqq->sectorate = (cfqd->cfq_epochsectors * CFQ_TEMPLIM)/100; - hlist_add_head(&cfqq->cfq_hash, &cfqd->cfq_hash[hashval]); + RB_CLEAR_ROOT(&cfqq->sort_list); + + cfqq->pid = pid; + cfqq->queued[0] = cfqq->queued[1] = 0; + list_add(&cfqq->cfq_hash, &cfqd->cfq_hash[hashval]); } if (new_cfqq) @@ -845,63 +493,31 @@ retry: return cfqq; } -static struct cfq_queue *cfq_get_queue(struct cfq_data *cfqd, int hashkey, +static struct cfq_queue *cfq_get_queue(struct cfq_data *cfqd, int pid, int gfp_mask) { request_queue_t *q = cfqd->queue; struct cfq_queue *cfqq; spin_lock_irq(q->queue_lock); - cfqq = __cfq_get_queue(cfqd, hashkey, gfp_mask); + cfqq = __cfq_get_queue(cfqd, pid, gfp_mask); spin_unlock_irq(q->queue_lock); return cfqq; } -static void -__cfq_enqueue(request_queue_t *q, struct cfq_data *cfqd, struct cfq_rq *crq) +static void cfq_enqueue(struct cfq_data *cfqd, struct cfq_rq *crq) { - const int prio = crq->ioprio; struct cfq_queue *cfqq; - cfqq = __cfq_get_queue(cfqd, cfq_hash_key(current), GFP_ATOMIC); + cfqq = __cfq_get_queue(cfqd, current->tgid, GFP_ATOMIC); if (cfqq) { - - /* - * not too good... 
- */ - if (prio > cfqq->ioprio) { - printk("prio hash collision %d %d\n", - prio, cfqq->ioprio); - if (!list_empty(&cfqq->cfq_list)) { - cfqd->cid[cfqq->ioprio].busy_queues--; - WARN_ON(cfqd->cid[cfqq->ioprio].busy_queues<0); - atomic_inc(&(cfqd->cid[cfqq->ioprio].cum_queues_out)); - cfqd->cid[prio].busy_queues++; - atomic_inc(&(cfqd->cid[prio].cum_queues_in)); - list_move_tail(&cfqq->cfq_list, - &cfqd->cid[prio].rr_list); - } - cfqq->ioprio = prio; - } - cfq_add_crq_rb(cfqd, cfqq, crq); if (list_empty(&cfqq->cfq_list)) { - list_add_tail(&cfqq->cfq_list, - &cfqd->cid[prio].rr_list); - cfqd->cid[prio].busy_queues++; - atomic_inc(&(cfqd->cid[prio].cum_queues_in)); + list_add(&cfqq->cfq_list, &cfqd->rr_list); cfqd->busy_queues++; } - - if (rq_mergeable(crq->request)) { - cfq_add_crq_hash(cfqd, crq); - - if (!q->last_merge) - q->last_merge = crq->request; - } - } else { /* * should can only happen if the request wasn't allocated @@ -912,57 +528,16 @@ __cfq_enqueue(request_queue_t *q, struct cfq_data *cfqd, struct cfq_rq *crq) } } -static void cfq_reenqueue(request_queue_t *q, struct cfq_data *cfqd, int prio) -{ - struct list_head *prio_list = &cfqd->cid[prio].prio_list; - struct list_head *entry, *tmp; - - list_for_each_safe(entry, tmp, prio_list) { - struct cfq_rq *crq = list_entry_prio(entry); - - list_del_init(entry); - list_del_init(&crq->request->queuelist); - __cfq_enqueue(q, cfqd, crq); - } -} - -static void -cfq_enqueue(request_queue_t *q, struct cfq_data *cfqd, struct cfq_rq *crq) -{ - const int prio = cfq_ioprio(current); - - crq->ioprio = prio; - crq->nr_sectors = crq->request->hard_nr_sectors; - __cfq_enqueue(q, cfqd, crq); - - if (prio == IOPRIO_RT) { - int i; - - /* - * realtime io gets priority, move all other io back - */ - for (i = IOPRIO_IDLE; i < IOPRIO_RT; i++) - cfq_reenqueue(q, cfqd, i); - } else if (prio != IOPRIO_IDLE) { - /* - * check if we need to move idle io back into queue - */ - cfq_reenqueue(q, cfqd, IOPRIO_IDLE); - } -} - static void cfq_insert_request(request_queue_t *q, struct request *rq, int where) { struct cfq_data *cfqd = q->elevator.elevator_data; - struct cfq_rq *crq = RQ_ELV_DATA(rq); + struct cfq_rq *crq = RQ_DATA(rq); switch (where) { case ELEVATOR_INSERT_BACK: -#if 0 while (cfq_dispatch_requests(q, cfqd)) ; -#endif list_add_tail(&rq->queuelist, cfqd->dispatch); break; case ELEVATOR_INSERT_FRONT: @@ -970,20 +545,26 @@ cfq_insert_request(request_queue_t *q, struct request *rq, int where) break; case ELEVATOR_INSERT_SORT: BUG_ON(!blk_fs_request(rq)); - cfq_enqueue(q, cfqd, crq); + cfq_enqueue(cfqd, crq); break; default: - printk("%s: bad insert point %d\n", - __FUNCTION__,where); + printk("%s: bad insert point %d\n", __FUNCTION__,where); return; } + + if (rq_mergeable(rq)) { + cfq_add_crq_hash(cfqd, crq); + + if (!q->last_merge) + q->last_merge = rq; + } } static int cfq_queue_empty(request_queue_t *q) { struct cfq_data *cfqd = q->elevator.elevator_data; - if (list_empty(cfqd->dispatch) && !cfqd->busy_queues) + if (list_empty(cfqd->dispatch) && list_empty(&cfqd->rr_list)) return 1; return 0; @@ -992,7 +573,7 @@ static int cfq_queue_empty(request_queue_t *q) static struct request * cfq_former_request(request_queue_t *q, struct request *rq) { - struct cfq_rq *crq = RQ_ELV_DATA(rq); + struct cfq_rq *crq = RQ_DATA(rq); struct rb_node *rbprev = rb_prev(&crq->rb_node); if (rbprev) @@ -1004,7 +585,7 @@ cfq_former_request(request_queue_t *q, struct request *rq) static struct request * cfq_latter_request(request_queue_t *q, struct request *rq) { - struct cfq_rq 
*crq = RQ_ELV_DATA(rq); + struct cfq_rq *crq = RQ_DATA(rq); struct rb_node *rbnext = rb_next(&crq->rb_node); if (rbnext) @@ -1013,46 +594,27 @@ cfq_latter_request(request_queue_t *q, struct request *rq) return NULL; } -static void cfq_queue_congested(request_queue_t *q) -{ - struct cfq_data *cfqd = q->elevator.elevator_data; - - set_bit(cfq_ioprio(current), &cfqd->rq_starved_mask); -} - static int cfq_may_queue(request_queue_t *q, int rw) { struct cfq_data *cfqd = q->elevator.elevator_data; struct cfq_queue *cfqq; - const int prio = cfq_ioprio(current); - int limit, ret = 1; + int ret = 1; if (!cfqd->busy_queues) goto out; - cfqq = cfq_find_cfq_hash(cfqd, cfq_hash_key(current)); - if (!cfqq) - goto out; - - cfqq = cfq_find_cfq_hash(cfqd, cfq_hash_key(current)); - if (!cfqq) - goto out; - - /* - * if higher or equal prio io is sleeping waiting for a request, don't - * allow this one to allocate one. as long as ll_rw_blk does fifo - * waitqueue wakeups this should work... - */ - if (cfqd->rq_starved_mask & ~((1 << prio) - 1)) - goto out; + cfqq = cfq_find_cfq_hash(cfqd, current->tgid); + if (cfqq) { + int limit = (q->nr_requests - cfqd->cfq_queued) / cfqd->busy_queues; - if (cfqq->queued[rw] < cfqd->cfq_queued || !cfqd->cid[prio].busy_queues) - goto out; + if (limit < 3) + limit = 3; + else if (limit > cfqd->max_queued) + limit = cfqd->max_queued; - limit = q->nr_requests * (prio + 1) / IOPRIO_NR; - limit /= cfqd->cid[prio].busy_queues; - if (cfqq->queued[rw] > limit) - ret = 0; + if (cfqq->queued[rw] > limit) + ret = 0; + } out: return ret; } @@ -1060,13 +622,13 @@ out: static void cfq_put_request(request_queue_t *q, struct request *rq) { struct cfq_data *cfqd = q->elevator.elevator_data; - struct cfq_rq *crq = RQ_ELV_DATA(rq); + struct cfq_rq *crq = RQ_DATA(rq); struct request_list *rl; int other_rw; if (crq) { BUG_ON(q->last_merge == rq); - BUG_ON(!hlist_unhashed(&crq->hash)); + BUG_ON(ON_MHASH(crq)); mempool_free(crq, cfqd->crq_pool); rq->elevator_private = NULL; @@ -1099,21 +661,17 @@ static int cfq_set_request(request_queue_t *q, struct request *rq, int gfp_mask) /* * prepare a queue up front, so cfq_enqueue() doesn't have to */ - cfqq = cfq_get_queue(cfqd, cfq_hash_key(current), gfp_mask); + cfqq = cfq_get_queue(cfqd, current->tgid, gfp_mask); if (!cfqq) return 1; crq = mempool_alloc(cfqd->crq_pool, gfp_mask); if (crq) { - /* - * process now has one request - */ - clear_bit(cfq_ioprio(current), &cfqd->rq_starved_mask); - memset(crq, 0, sizeof(*crq)); + RB_CLEAR(&crq->rb_node); crq->request = rq; - INIT_HLIST_NODE(&crq->hash); - INIT_LIST_HEAD(&crq->prio_list); + crq->cfq_queue = NULL; + INIT_LIST_HEAD(&crq->hash); rq->elevator_private = crq; return 0; } @@ -1132,26 +690,6 @@ static void cfq_exit(request_queue_t *q, elevator_t *e) kfree(cfqd); } -static void cfq_timer(unsigned long data) -{ - struct cfq_data *cfqd = (struct cfq_data *) data; - - clear_bit(CFQ_WAIT_RT, &cfqd->flags); - clear_bit(CFQ_WAIT_NORM, &cfqd->flags); - kblockd_schedule_work(&cfqd->work); -} - -static void cfq_work(void *data) -{ - request_queue_t *q = data; - unsigned long flags; - - spin_lock_irqsave(q->queue_lock, flags); - if (cfq_next_request(q)) - q->request_fn(q); - spin_unlock_irqrestore(q->queue_lock, flags); -} - static int cfq_init(request_queue_t *q, elevator_t *e) { struct cfq_data *cfqd; @@ -1162,75 +700,38 @@ static int cfq_init(request_queue_t *q, elevator_t *e) return -ENOMEM; memset(cfqd, 0, sizeof(*cfqd)); - init_timer(&cfqd->timer); - cfqd->timer.function = cfq_timer; - cfqd->timer.data = 
(unsigned long) cfqd; - - INIT_WORK(&cfqd->work, cfq_work, q); - - for (i = 0; i < IOPRIO_NR; i++) { - struct io_prio_data *cid = &cfqd->cid[i]; - - INIT_LIST_HEAD(&cid->rr_list); - INIT_LIST_HEAD(&cid->prio_list); - cid->last_rq = -1; - cid->last_sectors = -1; - - atomic_set(&cid->cum_rq_in,0); - atomic_set(&cid->cum_rq_out,0); - atomic_set(&cid->cum_sectors_in,0); - atomic_set(&cid->cum_sectors_out,0); - atomic_set(&cid->cum_queues_in,0); - atomic_set(&cid->cum_queues_out,0); -#if 0 - atomic_set(&cid->nskip,0); - atomic_set(&cid->navsec,0); - atomic_set(&cid->csectorate,0); - atomic_set(&cid->lsectorate,0); -#endif - } + INIT_LIST_HEAD(&cfqd->rr_list); - cfqd->crq_hash = kmalloc(sizeof(struct hlist_head) * CFQ_MHASH_ENTRIES, - GFP_KERNEL); + cfqd->crq_hash = kmalloc(sizeof(struct list_head) * CFQ_MHASH_ENTRIES, GFP_KERNEL); if (!cfqd->crq_hash) goto out_crqhash; - cfqd->cfq_hash = kmalloc(sizeof(struct hlist_head) * CFQ_QHASH_ENTRIES, - GFP_KERNEL); + cfqd->cfq_hash = kmalloc(sizeof(struct list_head) * CFQ_QHASH_ENTRIES, GFP_KERNEL); if (!cfqd->cfq_hash) goto out_cfqhash; - cfqd->crq_pool = mempool_create(BLKDEV_MIN_RQ, mempool_alloc_slab, - mempool_free_slab, crq_pool); + cfqd->crq_pool = mempool_create(BLKDEV_MIN_RQ, mempool_alloc_slab, mempool_free_slab, crq_pool); if (!cfqd->crq_pool) goto out_crqpool; for (i = 0; i < CFQ_MHASH_ENTRIES; i++) - INIT_HLIST_HEAD(&cfqd->crq_hash[i]); + INIT_LIST_HEAD(&cfqd->crq_hash[i]); for (i = 0; i < CFQ_QHASH_ENTRIES; i++) - INIT_HLIST_HEAD(&cfqd->cfq_hash[i]); - - cfqd->cfq_queued = cfq_queued; - cfqd->cfq_quantum = cfq_quantum; - cfqd->cfq_quantum_io = cfq_quantum_io; - cfqd->cfq_idle_quantum = cfq_idle_quantum; - cfqd->cfq_idle_quantum_io = cfq_idle_quantum_io; - cfqd->cfq_grace_rt = cfq_grace_rt; - cfqd->cfq_grace_idle = cfq_grace_idle; - - q->nr_requests <<= 2; + INIT_LIST_HEAD(&cfqd->cfq_hash[i]); cfqd->dispatch = &q->queue_head; e->elevator_data = cfqd; cfqd->queue = q; - cfqd->cfq_epoch = CFQ_EPOCH; - if (q->hardsect_size) - cfqd->cfq_epochsectors = ((CFQ_DISKBW * 1000000)/ - q->hardsect_size)* (1000000 / CFQ_EPOCH); - else - cfqd->cfq_epochsectors = ((CFQ_DISKBW * 1000000)/512) - * (1000000 / CFQ_EPOCH) ; + /* + * just set it to some high value, we want anyone to be able to queue + * some requests. 
fairness is handled differently + */ + cfqd->max_queued = q->nr_requests; + q->nr_requests = 8192; + + cfqd->cfq_queued = cfq_queued; + cfqd->cfq_quantum = cfq_quantum; return 0; out_crqpool: @@ -1296,12 +797,7 @@ static ssize_t __FUNC(struct cfq_data *cfqd, char *page) \ return cfq_var_show(__VAR, (page)); \ } SHOW_FUNCTION(cfq_quantum_show, cfqd->cfq_quantum); -SHOW_FUNCTION(cfq_quantum_io_show, cfqd->cfq_quantum_io); -SHOW_FUNCTION(cfq_idle_quantum_show, cfqd->cfq_idle_quantum); -SHOW_FUNCTION(cfq_idle_quantum_io_show, cfqd->cfq_idle_quantum_io); SHOW_FUNCTION(cfq_queued_show, cfqd->cfq_queued); -SHOW_FUNCTION(cfq_grace_rt_show, cfqd->cfq_grace_rt); -SHOW_FUNCTION(cfq_grace_idle_show, cfqd->cfq_grace_idle); #undef SHOW_FUNCTION #define STORE_FUNCTION(__FUNC, __PTR, MIN, MAX) \ @@ -1315,271 +811,23 @@ static ssize_t __FUNC(struct cfq_data *cfqd, const char *page, size_t count) \ return ret; \ } STORE_FUNCTION(cfq_quantum_store, &cfqd->cfq_quantum, 1, INT_MAX); -STORE_FUNCTION(cfq_quantum_io_store, &cfqd->cfq_quantum_io, 4, INT_MAX); -STORE_FUNCTION(cfq_idle_quantum_store, &cfqd->cfq_idle_quantum, 1, INT_MAX); -STORE_FUNCTION(cfq_idle_quantum_io_store, &cfqd->cfq_idle_quantum_io, 4, INT_MAX); STORE_FUNCTION(cfq_queued_store, &cfqd->cfq_queued, 1, INT_MAX); -STORE_FUNCTION(cfq_grace_rt_store, &cfqd->cfq_grace_rt, 0, INT_MAX); -STORE_FUNCTION(cfq_grace_idle_store, &cfqd->cfq_grace_idle, 0, INT_MAX); #undef STORE_FUNCTION - -static ssize_t cfq_epoch_show(struct cfq_data *cfqd, char *page) -{ - return sprintf(page, "%lu\n", cfqd->cfq_epoch); -} - -static ssize_t cfq_epoch_store(struct cfq_data *cfqd, const char *page, size_t count) -{ - char *p = (char *) page; - cfqd->cfq_epoch = simple_strtoul(p, &p, 10); - return count; -} - -static ssize_t cfq_epochsectors_show(struct cfq_data *cfqd, char *page) -{ - return sprintf(page, "%lu\n", cfqd->cfq_epochsectors); -} - -static ssize_t -cfq_epochsectors_store(struct cfq_data *cfqd, const char *page, size_t count) -{ - char *p = (char *) page; - cfqd->cfq_epochsectors = simple_strtoul(p, &p, 10); - return count; -} - -/* Additional entries to get priority level data */ -static ssize_t -cfq_prio_show(struct cfq_data *cfqd, char *page, unsigned int priolvl) -{ - int r1,r2,s1,s2,q1,q2; - - if (!(priolvl >= IOPRIO_IDLE && priolvl <= IOPRIO_RT)) - return 0; - - r1 = (int)atomic_read(&(cfqd->cid[priolvl].cum_rq_in)); - r2 = (int)atomic_read(&(cfqd->cid[priolvl].cum_rq_out)); - s1 = (int)atomic_read(&(cfqd->cid[priolvl].cum_sectors_in)); - s2 = (int)atomic_read(&(cfqd->cid[priolvl].cum_sectors_out)); - q1 = (int)atomic_read(&(cfqd->cid[priolvl].cum_queues_in)); - q2 = (int)atomic_read(&(cfqd->cid[priolvl].cum_queues_out)); - - return sprintf(page,"skip %d avsec %lu rate %lu new %lu" - "rq (%d,%d) sec (%d,%d) q (%d,%d)\n", - cfqd->cid[priolvl].nskip, - cfqd->cid[priolvl].navsec, - cfqd->cid[priolvl].csectorate, - cfqd->cid[priolvl].lsectorate, -// atomic_read(&cfqd->cid[priolvl].nskip), -// atomic_read(&cfqd->cid[priolvl].navsec), -// atomic_read(&cfqd->cid[priolvl].csectorate), -// atomic_read(&cfqd->cid[priolvl].lsectorate), - r1,r2, - s1,s2, - q1,q2); -} - -#define SHOW_PRIO_DATA(__PRIOLVL) \ -static ssize_t cfq_prio_##__PRIOLVL##_show(struct cfq_data *cfqd, char *page) \ -{ \ - return cfq_prio_show(cfqd,page,__PRIOLVL); \ -} -SHOW_PRIO_DATA(0); -SHOW_PRIO_DATA(1); -SHOW_PRIO_DATA(2); -SHOW_PRIO_DATA(3); -SHOW_PRIO_DATA(4); -SHOW_PRIO_DATA(5); -SHOW_PRIO_DATA(6); -SHOW_PRIO_DATA(7); -SHOW_PRIO_DATA(8); -SHOW_PRIO_DATA(9); -SHOW_PRIO_DATA(10); 
-SHOW_PRIO_DATA(11); -SHOW_PRIO_DATA(12); -SHOW_PRIO_DATA(13); -SHOW_PRIO_DATA(14); -SHOW_PRIO_DATA(15); -SHOW_PRIO_DATA(16); -SHOW_PRIO_DATA(17); -SHOW_PRIO_DATA(18); -SHOW_PRIO_DATA(19); -SHOW_PRIO_DATA(20); -#undef SHOW_PRIO_DATA - - -static ssize_t cfq_prio_store(struct cfq_data *cfqd, const char *page, size_t count, int priolvl) -{ - atomic_set(&(cfqd->cid[priolvl].cum_rq_in),0); - atomic_set(&(cfqd->cid[priolvl].cum_rq_out),0); - atomic_set(&(cfqd->cid[priolvl].cum_sectors_in),0); - atomic_set(&(cfqd->cid[priolvl].cum_sectors_out),0); - atomic_set(&(cfqd->cid[priolvl].cum_queues_in),0); - atomic_set(&(cfqd->cid[priolvl].cum_queues_out),0); - - return count; -} - - -#define STORE_PRIO_DATA(__PRIOLVL) \ -static ssize_t cfq_prio_##__PRIOLVL##_store(struct cfq_data *cfqd, const char *page, size_t count) \ -{ \ - return cfq_prio_store(cfqd,page,count,__PRIOLVL); \ -} -STORE_PRIO_DATA(0); -STORE_PRIO_DATA(1); -STORE_PRIO_DATA(2); -STORE_PRIO_DATA(3); -STORE_PRIO_DATA(4); -STORE_PRIO_DATA(5); -STORE_PRIO_DATA(6); -STORE_PRIO_DATA(7); -STORE_PRIO_DATA(8); -STORE_PRIO_DATA(9); -STORE_PRIO_DATA(10); -STORE_PRIO_DATA(11); -STORE_PRIO_DATA(12); -STORE_PRIO_DATA(13); -STORE_PRIO_DATA(14); -STORE_PRIO_DATA(15); -STORE_PRIO_DATA(16); -STORE_PRIO_DATA(17); -STORE_PRIO_DATA(18); -STORE_PRIO_DATA(19); -STORE_PRIO_DATA(20); -#undef STORE_PRIO_DATA - - static struct cfq_fs_entry cfq_quantum_entry = { .attr = {.name = "quantum", .mode = S_IRUGO | S_IWUSR }, .show = cfq_quantum_show, .store = cfq_quantum_store, }; -static struct cfq_fs_entry cfq_quantum_io_entry = { - .attr = {.name = "quantum_io", .mode = S_IRUGO | S_IWUSR }, - .show = cfq_quantum_io_show, - .store = cfq_quantum_io_store, -}; -static struct cfq_fs_entry cfq_idle_quantum_entry = { - .attr = {.name = "idle_quantum", .mode = S_IRUGO | S_IWUSR }, - .show = cfq_idle_quantum_show, - .store = cfq_idle_quantum_store, -}; -static struct cfq_fs_entry cfq_idle_quantum_io_entry = { - .attr = {.name = "idle_quantum_io", .mode = S_IRUGO | S_IWUSR }, - .show = cfq_idle_quantum_io_show, - .store = cfq_idle_quantum_io_store, -}; static struct cfq_fs_entry cfq_queued_entry = { .attr = {.name = "queued", .mode = S_IRUGO | S_IWUSR }, .show = cfq_queued_show, .store = cfq_queued_store, }; -static struct cfq_fs_entry cfq_grace_rt_entry = { - .attr = {.name = "grace_rt", .mode = S_IRUGO | S_IWUSR }, - .show = cfq_grace_rt_show, - .store = cfq_grace_rt_store, -}; -static struct cfq_fs_entry cfq_grace_idle_entry = { - .attr = {.name = "grace_idle", .mode = S_IRUGO | S_IWUSR }, - .show = cfq_grace_idle_show, - .store = cfq_grace_idle_store, -}; -static struct cfq_fs_entry cfq_epoch_entry = { - .attr = {.name = "epoch", .mode = S_IRUGO | S_IWUSR }, - .show = cfq_epoch_show, - .store = cfq_epoch_store, -}; -static struct cfq_fs_entry cfq_epochsectors_entry = { - .attr = {.name = "epochsectors", .mode = S_IRUGO | S_IWUSR }, - .show = cfq_epochsectors_show, - .store = cfq_epochsectors_store, -}; - -#define P_0_STR "p0" -#define P_1_STR "p1" -#define P_2_STR "p2" -#define P_3_STR "p3" -#define P_4_STR "p4" -#define P_5_STR "p5" -#define P_6_STR "p6" -#define P_7_STR "p7" -#define P_8_STR "p8" -#define P_9_STR "p9" -#define P_10_STR "p10" -#define P_11_STR "p11" -#define P_12_STR "p12" -#define P_13_STR "p13" -#define P_14_STR "p14" -#define P_15_STR "p15" -#define P_16_STR "p16" -#define P_17_STR "p17" -#define P_18_STR "p18" -#define P_19_STR "p19" -#define P_20_STR "p20" - - -#define CFQ_PRIO_SYSFS_ENTRY(__PRIOLVL) \ -static struct cfq_fs_entry 
cfq_prio_##__PRIOLVL##_entry = { \ - .attr = {.name = P_##__PRIOLVL##_STR, .mode = S_IRUGO | S_IWUSR }, \ - .show = cfq_prio_##__PRIOLVL##_show, \ - .store = cfq_prio_##__PRIOLVL##_store, \ -}; -CFQ_PRIO_SYSFS_ENTRY(0); -CFQ_PRIO_SYSFS_ENTRY(1); -CFQ_PRIO_SYSFS_ENTRY(2); -CFQ_PRIO_SYSFS_ENTRY(3); -CFQ_PRIO_SYSFS_ENTRY(4); -CFQ_PRIO_SYSFS_ENTRY(5); -CFQ_PRIO_SYSFS_ENTRY(6); -CFQ_PRIO_SYSFS_ENTRY(7); -CFQ_PRIO_SYSFS_ENTRY(8); -CFQ_PRIO_SYSFS_ENTRY(9); -CFQ_PRIO_SYSFS_ENTRY(10); -CFQ_PRIO_SYSFS_ENTRY(11); -CFQ_PRIO_SYSFS_ENTRY(12); -CFQ_PRIO_SYSFS_ENTRY(13); -CFQ_PRIO_SYSFS_ENTRY(14); -CFQ_PRIO_SYSFS_ENTRY(15); -CFQ_PRIO_SYSFS_ENTRY(16); -CFQ_PRIO_SYSFS_ENTRY(17); -CFQ_PRIO_SYSFS_ENTRY(18); -CFQ_PRIO_SYSFS_ENTRY(19); -CFQ_PRIO_SYSFS_ENTRY(20); -#undef CFQ_PRIO_SYSFS_ENTRY static struct attribute *default_attrs[] = { &cfq_quantum_entry.attr, - &cfq_quantum_io_entry.attr, - &cfq_idle_quantum_entry.attr, - &cfq_idle_quantum_io_entry.attr, &cfq_queued_entry.attr, - &cfq_grace_rt_entry.attr, - &cfq_grace_idle_entry.attr, - &cfq_epoch_entry.attr, - &cfq_epochsectors_entry.attr, - &cfq_prio_0_entry.attr, - &cfq_prio_1_entry.attr, - &cfq_prio_2_entry.attr, - &cfq_prio_3_entry.attr, - &cfq_prio_4_entry.attr, - &cfq_prio_5_entry.attr, - &cfq_prio_6_entry.attr, - &cfq_prio_7_entry.attr, - &cfq_prio_8_entry.attr, - &cfq_prio_9_entry.attr, - &cfq_prio_10_entry.attr, - &cfq_prio_11_entry.attr, - &cfq_prio_12_entry.attr, - &cfq_prio_13_entry.attr, - &cfq_prio_14_entry.attr, - &cfq_prio_15_entry.attr, - &cfq_prio_16_entry.attr, - &cfq_prio_17_entry.attr, - &cfq_prio_18_entry.attr, - &cfq_prio_19_entry.attr, - &cfq_prio_20_entry.attr, NULL, }; @@ -1635,7 +883,6 @@ elevator_t iosched_cfq = { .elevator_set_req_fn = cfq_set_request, .elevator_put_req_fn = cfq_put_request, .elevator_may_queue_fn = cfq_may_queue, - .elevator_set_congested_fn = cfq_queue_congested, .elevator_init_fn = cfq_init, .elevator_exit_fn = cfq_exit, }; diff --git a/drivers/block/ckrm-io.c b/drivers/block/ckrm-io.c index 7edfce727..ce166e855 100644 --- a/drivers/block/ckrm-io.c +++ b/drivers/block/ckrm-io.c @@ -74,10 +74,10 @@ typedef struct ckrm_io_class { /* Absolute shares of this class * in local units. */ - - int cnt_guarantee; /* Allocation as parent */ - int cnt_unused; /* Allocation to default subclass */ - + + int ioprio; + int unused; + /* Statistics, for class and default subclass */ cki_stats_t stats; cki_stats_t mystats; @@ -90,12 +90,13 @@ typedef struct ckrm_io_class { static inline void cki_reset_stats(cki_stats_t *usg); static inline void init_icls_one(cki_icls_t *icls); static inline int cki_div(int *a, int b, int c); -//static inline int cki_recalc(cki_icls_t *icls, int rel2abs); -static void cki_recalc_propagate(cki_icls_t *res, cki_icls_t *parres); +static inline int cki_recalc(cki_icls_t *icls, int rel2abs); +#ifdef DOES_NOT_WORK_AND_NOT_NEEDED /* External functions e.g. 
interface to ioscheduler */ -void *cki_tsk_icls (struct task_struct *tsk); -int cki_tsk_ioprio (struct task_struct *tsk); +inline void *cki_tsk_icls(struct task_struct *tsk); +inline int cki_tsk_ioprio(struct task_struct *tsk); +#endif extern void cki_cfq_set(icls_tsk_t tskicls, icls_ioprio_t tskioprio); @@ -139,13 +140,9 @@ static inline void init_icls_stats(cki_icls_t *icls) static inline void init_icls_one(cki_icls_t *icls) { - // Assign zero as initial guarantee otherwise creations - // could fail due to inadequate share - - //icls->shares.my_guarantee = - // (CKI_IOPRIO_MIN * CKRM_SHARE_DFLT_TOTAL_GUARANTEE) / - // CKI_IOPRIO_DIV ; - icls->shares.my_guarantee = 0; + icls->shares.my_guarantee = + (CKI_IOPRIO_MIN * CKRM_SHARE_DFLT_TOTAL_GUARANTEE) / + CKI_IOPRIO_DIV ; icls->shares.my_limit = CKRM_SHARE_DFLT_TOTAL_GUARANTEE; icls->shares.total_guarantee = CKRM_SHARE_DFLT_TOTAL_GUARANTEE; icls->shares.max_limit = CKRM_SHARE_DFLT_TOTAL_GUARANTEE; @@ -155,11 +152,8 @@ static inline void init_icls_one(cki_icls_t *icls) icls->shares.cur_max_limit = CKRM_SHARE_DFLT_TOTAL_GUARANTEE; - icls->cnt_guarantee = icls->cnt_unused = IOPRIO_IDLE; - - //Same rationale icls->ioprio = CKI_IOPRIO_MIN; - //IOPRIO_IDLE equivalence to zero my_guarantee (set above) relies - //on former being zero. + icls->ioprio = CKI_IOPRIO_MIN; + icls->unused = 0 ; init_icls_stats(icls); } @@ -180,55 +174,6 @@ static inline int cki_div(int *a, int b, int c) * Caller should have a lock on icls */ -static void cki_recalc_propagate(cki_icls_t *res, cki_icls_t *parres) -{ - - ckrm_core_class_t *child = NULL; - cki_icls_t *childres; - int resid = cki_rcbs.resid; - - if (parres) { - struct ckrm_shares *par = &parres->shares; - struct ckrm_shares *self = &res->shares; - - - - if (parres->cnt_guarantee == CKRM_SHARE_DONTCARE) { - res->cnt_guarantee = CKRM_SHARE_DONTCARE; - } else if (par->total_guarantee) { - u64 temp = (u64) self->my_guarantee * - parres->cnt_guarantee; - do_div(temp, par->total_guarantee); - res->cnt_guarantee = (int) temp; - } else { - res->cnt_guarantee = 0; - } - - if (res->cnt_guarantee == CKRM_SHARE_DONTCARE) { - res->cnt_unused = CKRM_SHARE_DONTCARE; - } else if (self->total_guarantee) { - u64 temp = (u64) self->unused_guarantee * - res->cnt_guarantee; - do_div(temp, self->total_guarantee); - res->cnt_unused = (int) temp; - } else { - res->cnt_unused = 0; - } - } - // propagate to children - ckrm_lock_hier(res->core); - while ((child = ckrm_get_next_child(res->core,child)) != NULL){ - childres = ckrm_get_res_class(child, resid, - cki_icls_t); - - spin_lock(&childres->shares_lock); - cki_recalc_propagate(childres, res); - spin_unlock(&childres->shares_lock); - } - ckrm_unlock_hier(res->core); -} - -#if 0 static inline int cki_recalc(cki_icls_t *icls, int rel2abs) { u64 temp; @@ -239,10 +184,8 @@ static inline int cki_recalc(cki_icls_t *icls, int rel2abs) temp = icls->shares.my_guarantee * (IOPRIO_NR-1); do_div(temp, icls->shares.total_guarantee); - icls->total = IOPRIO_NR-1; icls->ioprio = temp ; - icls->unused = icls->total - icls->ioprio; -// icls->unused = (IOPRIO_NR-1)-icls->ioprio; + icls->unused = (IOPRIO_NR-1)-icls->ioprio; } else { cki_icls_t *parres; @@ -257,9 +200,9 @@ static inline int cki_recalc(cki_icls_t *icls, int rel2abs) return -EINVAL; } + partot = parres->ioprio + parres->unused; - temp = (icls->shares.my_guarantee * - parres->total); + temp = (icls->shares.my_guarantee * (parres->ioprio + parres->unused)); do_div(temp, parres->shares.total_guarantee); icls->ioprio = temp; @@ -270,19 +213,19 
@@ static inline int cki_recalc(cki_icls_t *icls, int rel2abs) return 0; } -#endif -void *cki_tsk_icls(struct task_struct *tsk) + +inline void *cki_icls_tsk(struct task_struct *tsk) { return (void *) ckrm_get_res_class(class_core(tsk->taskclass), cki_rcbs.resid, cki_icls_t); } -int cki_tsk_ioprio(struct task_struct *tsk) +inline int cki_icls_ioprio(struct task_struct *tsk) { cki_icls_t *icls = ckrm_get_res_class(class_core(tsk->taskclass), cki_rcbs.resid, cki_icls_t); - return icls->cnt_unused; + return icls->ioprio; } static void *cki_alloc(struct ckrm_core_class *core, @@ -302,13 +245,15 @@ static void *cki_alloc(struct ckrm_core_class *core, icls->shares_lock = SPIN_LOCK_UNLOCKED; if (parent == NULL) { + u64 temp; /* Root class gets same as "normal" CFQ priorities to * retain compatibility of behaviour in the absence of * other classes */ - icls->cnt_guarantee = icls->cnt_unused = IOPRIO_NR-1; + icls->ioprio = IOPRIO_NORM; + icls->unused = (IOPRIO_NR-1)-IOPRIO_NORM; /* Default gets normal, not minimum */ //icls->unused = IOPRIO_NORM; @@ -317,27 +262,24 @@ static void *cki_alloc(struct ckrm_core_class *core, /* Compute shares in abstract units */ icls->shares.total_guarantee = CKRM_SHARE_DFLT_TOTAL_GUARANTEE; + temp = (u64) icls->ioprio * icls->shares.total_guarantee; + do_div(temp, CKI_IOPRIO_DIV); + icls->shares.my_guarantee = (int) temp; - // my_guarantee for root is meaningless. Set to default - icls->shares.my_guarantee = CKRM_SHARE_DFLT_TOTAL_GUARANTEE; - - icls->shares.unused_guarantee = - CKRM_SHARE_DFLT_TOTAL_GUARANTEE; - - //temp = (u64) icls->cnt_unused * icls->shares.total_guarantee; - //do_div(temp, CKI_IOPRIO_DIV); - // temp now has root's default's share - //icls->shares.unused_guarantee = - // icls->shares.total_guarantee - temp; - + //icls->shares.my_limit = CKRM_SHARE_DFLT_MAX_LIMIT; + //icls->shares.max_limit = CKRM_SHARE_DFLT_MAX_LIMIT; icls->shares.my_limit = CKRM_SHARE_DFLT_TOTAL_GUARANTEE; icls->shares.max_limit = CKRM_SHARE_DFLT_TOTAL_GUARANTEE; + + + icls->shares.unused_guarantee = + icls->shares.total_guarantee - + icls->shares.my_guarantee; + //icls->shares.cur_max_limit = CKRM_SHARE_DFLT_MAX_LIMIT; icls->shares.cur_max_limit = CKRM_SHARE_DFLT_TOTAL_GUARANTEE; } else { init_icls_one(icls); - /* No propagation to parent needed if icls' - initial share is zero */ } try_module_get(THIS_MODULE); return icls; @@ -373,7 +315,7 @@ static void cki_free(void *res) /* Update parent's shares */ spin_lock(&parres->shares_lock); child_guarantee_changed(&parres->shares, icls->shares.my_guarantee, 0); - parres->cnt_unused += icls->cnt_guarantee; + parres->unused += icls->ioprio; spin_unlock(&parres->shares_lock); kfree(res); @@ -398,7 +340,9 @@ static int cki_setshare(void *res, struct ckrm_shares *new) /* limits not supported */ if ((new->max_limit != CKRM_SHARE_UNCHANGED) || (new->my_limit != CKRM_SHARE_UNCHANGED)) { - printk(KERN_ERR "limits not supported\n"); + printk(KERN_ERR "limits changed max_limit %d my_limit %d\n", + new->max_limit, new->my_limit); + return -EINVAL; } @@ -420,32 +364,17 @@ static int cki_setshare(void *res, struct ckrm_shares *new) } rc = set_shares(new, cur, par); + printk(KERN_ERR "rc from set_shares %d\n", rc); - if ((!rc) && parres) { - - if (parres->cnt_guarantee == CKRM_SHARE_DONTCARE) { - parres->cnt_unused = CKRM_SHARE_DONTCARE; - } else if (par->total_guarantee) { - u64 temp = (u64) par->unused_guarantee * - parres->cnt_guarantee; - do_div(temp, par->total_guarantee); - parres->cnt_unused = (int) temp; - } else { - parres->cnt_unused = 0; - 
} - cki_recalc_propagate(res, parres); - -#if 0 + if (!rc) { int old = icls->ioprio; - rc = cki_recalc(icls,0); if (!rc && parres) { int raise_tot = icls->ioprio - old ; - parres->unused -= raise_tot ; + parres->unused += raise_tot ; } -#endif } spin_unlock(&icls->shares_lock); if (icls->parent) { @@ -478,8 +407,8 @@ static int cki_getstats(void *res, struct seq_file *sfile) seq_printf(sfile, "%d total_write\n",atomic_read(&icls->stats.blkwr)); */ - seq_printf(sfile, "%d total ioprio\n",icls->cnt_guarantee); - seq_printf(sfile, "%d unused/default ioprio\n",icls->cnt_unused); + seq_printf(sfile, "%d ioprio\n",icls->ioprio); + seq_printf(sfile, "%d unused\n",icls->unused); return 0; } @@ -523,7 +452,7 @@ static void cki_chgcls(void *tsk, void *oldres, void *newres) struct ckrm_res_ctlr cki_rcbs = { - .res_name = "io", + .res_name = "cki", .res_hdepth = 1, .resid = -1, .res_alloc = cki_alloc, @@ -554,7 +483,7 @@ int __init cki_init(void) resid = ckrm_register_res_ctlr(clstype, &cki_rcbs); if (resid != -1) { cki_rcbs.classtype = clstype; - cki_cfq_set(cki_tsk_icls,cki_tsk_ioprio); + cki_cfq_set(cki_icls_tsk,cki_icls_ioprio); } } diff --git a/drivers/block/ckrm-iostub.c b/drivers/block/ckrm-iostub.c index c325d8e8d..63beff3e3 100644 --- a/drivers/block/ckrm-iostub.c +++ b/drivers/block/ckrm-iostub.c @@ -35,7 +35,7 @@ void cki_cfq_set(icls_tsk_t tskicls, icls_ioprio_t tskioprio) spin_unlock(&stub_lock); } -void *cki_hash_key(struct task_struct *tsk) +inline void *cki_hash_key(struct task_struct *tsk) { void *ret; spin_lock(&stub_lock); @@ -47,7 +47,7 @@ void *cki_hash_key(struct task_struct *tsk) return ret; } -int cki_ioprio(struct task_struct *tsk) +inline int cki_ioprio(struct task_struct *tsk) { int ret; spin_lock(&stub_lock); diff --git a/drivers/block/elevator.c b/drivers/block/elevator.c index 950eb9923..35c9385ac 100644 --- a/drivers/block/elevator.c +++ b/drivers/block/elevator.c @@ -339,14 +339,6 @@ void elv_put_request(request_queue_t *q, struct request *rq) e->elevator_put_req_fn(q, rq); } -void elv_set_congested(request_queue_t *q) -{ - elevator_t *e = &q->elevator; - - if (e->elevator_set_congested_fn) - e->elevator_set_congested_fn(q); -} - int elv_may_queue(request_queue_t *q, int rw) { elevator_t *e = &q->elevator; @@ -354,7 +346,7 @@ int elv_may_queue(request_queue_t *q, int rw) if (e->elevator_may_queue_fn) return e->elevator_may_queue_fn(q, rw); - return 1; + return 0; } void elv_completed_request(request_queue_t *q, struct request *rq) diff --git a/drivers/block/ll_rw_blk.c b/drivers/block/ll_rw_blk.c index 49ff5e0b7..5a570baa6 100644 --- a/drivers/block/ll_rw_blk.c +++ b/drivers/block/ll_rw_blk.c @@ -1594,10 +1594,6 @@ static struct request *get_request(request_queue_t *q, int rw, int gfp_mask) struct io_context *ioc = get_io_context(gfp_mask); spin_lock_irq(q->queue_lock); - - if (!elv_may_queue(q, rw)) - goto out_lock; - if (rl->count[rw]+1 >= q->nr_requests) { /* * The queue will fill after this allocation, so set it as @@ -1611,12 +1607,15 @@ static struct request *get_request(request_queue_t *q, int rw, int gfp_mask) } } - /* - * The queue is full and the allocating process is not a - * "batcher", and not exempted by the IO scheduler - */ - if (blk_queue_full(q, rw) && !ioc_batching(ioc)) - goto out_lock; + if (blk_queue_full(q, rw) + && !ioc_batching(ioc) && !elv_may_queue(q, rw)) { + /* + * The queue is full and the allocating process is not a + * "batcher", and not exempted by the IO scheduler + */ + spin_unlock_irq(q->queue_lock); + goto out; + } rl->count[rw]++; 
if (rl->count[rw] >= queue_congestion_on_threshold(q)) @@ -1634,7 +1633,8 @@ static struct request *get_request(request_queue_t *q, int rw, int gfp_mask) */ spin_lock_irq(q->queue_lock); freed_request(q, rw); - goto out_lock; + spin_unlock_irq(q->queue_lock); + goto out; } if (ioc_batching(ioc)) @@ -1664,11 +1664,6 @@ static struct request *get_request(request_queue_t *q, int rw, int gfp_mask) out: put_io_context(ioc); return rq; -out_lock: - if (!rq) - elv_set_congested(q); - spin_unlock_irq(q->queue_lock); - goto out; } /* @@ -3173,21 +3168,3 @@ void blk_unregister_queue(struct gendisk *disk) kobject_put(&disk->kobj); } } - -asmlinkage int sys_ioprio_set(int ioprio) -{ - if (ioprio < IOPRIO_IDLE || ioprio > IOPRIO_RT) - return -EINVAL; - if (ioprio == IOPRIO_RT && !capable(CAP_SYS_ADMIN)) - return -EACCES; - - printk("%s: set ioprio %d\n", current->comm, ioprio); - current->ioprio = ioprio; - return 0; -} - -asmlinkage int sys_ioprio_get(void) -{ - return current->ioprio; -} - diff --git a/drivers/char/hangcheck-timer.c b/drivers/char/hangcheck-timer.c index 764c6538e..83d6b37b3 100644 --- a/drivers/char/hangcheck-timer.c +++ b/drivers/char/hangcheck-timer.c @@ -55,7 +55,7 @@ static int hangcheck_tick = DEFAULT_IOFENCE_TICK; static int hangcheck_margin = DEFAULT_IOFENCE_MARGIN; -static int hangcheck_reboot = 1; /* Defaults to reboot */ +static int hangcheck_reboot; /* Defaults to not reboot */ /* Driver options */ module_param(hangcheck_tick, int, 0); diff --git a/fs/exec.c b/fs/exec.c index 90580ec70..bca37d6c0 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -48,7 +48,6 @@ #include #include #include -#include #include #include @@ -559,18 +558,6 @@ static int exec_mmap(struct mm_struct *mm) activate_mm(active_mm, mm); task_unlock(tsk); arch_pick_mmap_layout(mm); -#ifdef CONFIG_CKRM_RES_MEM - if (old_mm) { - spin_lock(&old_mm->peertask_lock); - list_del(&tsk->mm_peers); - ckrm_mem_evaluate_mm(old_mm); - spin_unlock(&old_mm->peertask_lock); - } - spin_lock(&mm->peertask_lock); - list_add_tail(&tsk->mm_peers, &mm->tasklist); - ckrm_mem_evaluate_mm(mm); - spin_unlock(&mm->peertask_lock); -#endif if (old_mm) { if (active_mm != old_mm) BUG(); mmput(old_mm); diff --git a/fs/ext2/acl.c b/fs/ext2/acl.c index d232026b4..74acc7846 100644 --- a/fs/ext2/acl.c +++ b/fs/ext2/acl.c @@ -9,7 +9,6 @@ #include #include #include -#include #include "ext2.h" #include "xattr.h" #include "acl.h" @@ -292,9 +291,6 @@ ext2_permission(struct inode *inode, int mask, struct nameidata *nd) { int mode = inode->i_mode; - /* Prevent vservers from escaping chroot() barriers */ - if (IS_BARRIER(inode) && !vx_check(0, VX_ADMIN)) - return -EACCES; /* Nobody gets write access to a read-only fs */ if ((mask & MAY_WRITE) && (IS_RDONLY(inode) || (nd && MNT_IS_RDONLY(nd->mnt))) && diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c index fe9c6a13b..1ef02bccb 100644 --- a/fs/ext2/inode.c +++ b/fs/ext2/inode.c @@ -1030,7 +1030,7 @@ void ext2_set_inode_flags(struct inode *inode) { unsigned int flags = EXT2_I(inode)->i_flags; - inode->i_flags &= ~(S_SYNC|S_APPEND|S_IMMUTABLE|S_IUNLINK|S_BARRIER|S_NOATIME|S_DIRSYNC); + inode->i_flags &= ~(S_SYNC|S_APPEND|S_IMMUTABLE|S_NOATIME|S_DIRSYNC); if (flags & EXT2_SYNC_FL) inode->i_flags |= S_SYNC; if (flags & EXT2_APPEND_FL) diff --git a/fs/ext2/ioctl.c b/fs/ext2/ioctl.c index 594c16c80..f6043a6e2 100644 --- a/fs/ext2/ioctl.c +++ b/fs/ext2/ioctl.c @@ -50,11 +50,11 @@ int ext2_ioctl (struct inode * inode, struct file * filp, unsigned int cmd, * * This test looks nicer. 
Thanks to Pauline Middelink */ - if (((oldflags & EXT2_IMMUTABLE_FL) || + if ((oldflags & EXT2_IMMUTABLE_FL) || ((flags ^ oldflags) & - (EXT2_APPEND_FL | EXT2_IMMUTABLE_FL | EXT2_IUNLINK_FL))) - && !capable(CAP_LINUX_IMMUTABLE)) { - return -EPERM; + (EXT2_APPEND_FL | EXT2_IMMUTABLE_FL))) { + if (!capable(CAP_LINUX_IMMUTABLE)) + return -EPERM; } flags = flags & EXT2_FL_USER_MODIFIABLE; diff --git a/fs/ext3/acl.c b/fs/ext3/acl.c index e89cb306c..cc26948d5 100644 --- a/fs/ext3/acl.c +++ b/fs/ext3/acl.c @@ -11,7 +11,6 @@ #include #include #include -#include #include "xattr.h" #include "acl.h" @@ -297,9 +296,6 @@ ext3_permission(struct inode *inode, int mask, struct nameidata *nd) { int mode = inode->i_mode; - /* Prevent vservers from escaping chroot() barriers */ - if (IS_BARRIER(inode) && !vx_check(0, VX_ADMIN)) - return -EACCES; /* Nobody gets write access to a read-only fs */ if ((mask & MAY_WRITE) && (IS_RDONLY(inode) || (nd && nd->mnt && MNT_IS_RDONLY(nd->mnt))) && diff --git a/fs/ext3/inode.c b/fs/ext3/inode.c index 7bc33d5f5..962aef215 100644 --- a/fs/ext3/inode.c +++ b/fs/ext3/inode.c @@ -2474,7 +2474,7 @@ void ext3_set_inode_flags(struct inode *inode) { unsigned int flags = EXT3_I(inode)->i_flags; - inode->i_flags &= ~(S_SYNC|S_APPEND|S_IMMUTABLE|S_IUNLINK|S_BARRIER|S_NOATIME|S_DIRSYNC); + inode->i_flags &= ~(S_SYNC|S_APPEND|S_IMMUTABLE|S_NOATIME|S_DIRSYNC); if (flags & EXT3_SYNC_FL) inode->i_flags |= S_SYNC; if (flags & EXT3_APPEND_FL) diff --git a/fs/ext3/ioctl.c b/fs/ext3/ioctl.c index f58d49736..37bd4509d 100644 --- a/fs/ext3/ioctl.c +++ b/fs/ext3/ioctl.c @@ -59,11 +59,11 @@ int ext3_ioctl (struct inode * inode, struct file * filp, unsigned int cmd, * * This test looks nicer. Thanks to Pauline Middelink */ - if (((oldflags & EXT3_IMMUTABLE_FL) || + if ((oldflags & EXT3_IMMUTABLE_FL) || ((flags ^ oldflags) & - (EXT3_APPEND_FL | EXT3_IMMUTABLE_FL | EXT3_IUNLINK_FL))) - && !capable(CAP_LINUX_IMMUTABLE)) { - return -EPERM; + (EXT3_APPEND_FL | EXT3_IMMUTABLE_FL))) { + if (!capable(CAP_LINUX_IMMUTABLE)) + return -EPERM; } /* diff --git a/fs/ioctl.c b/fs/ioctl.c index 6404b0c10..96a1b601e 100644 --- a/fs/ioctl.c +++ b/fs/ioctl.c @@ -173,19 +173,6 @@ asmlinkage long sys_ioctl(unsigned int fd, unsigned int cmd, unsigned long arg) error = vx_proc_ioctl(filp->f_dentry->d_inode, filp, cmd, arg); break; #endif - case FIOC_SETIATTR: - case FIOC_GETIATTR: - /* - * Verify that this filp is a file object, - * not (say) a socket. - */ - error = -ENOTTY; - if (S_ISREG(filp->f_dentry->d_inode->i_mode) || - S_ISDIR(filp->f_dentry->d_inode->i_mode)) - error = vc_iattr_ioctl(filp->f_dentry, - cmd, arg); - break; - default: error = -ENOTTY; if (S_ISREG(filp->f_dentry->d_inode->i_mode)) diff --git a/fs/namei.c b/fs/namei.c index 656430d6b..34da5b453 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -165,10 +165,6 @@ int vfs_permission(struct inode * inode, int mask) { umode_t mode = inode->i_mode; - /* Prevent vservers from escaping chroot() barriers */ - if (IS_BARRIER(inode) && !vx_check(0, VX_ADMIN)) - return -EACCES; - if (mask & MAY_WRITE) { /* * Nobody gets write access to a read-only fs. 
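(A minimal sketch, not part of the patch: the ext2 and ext3 ioctl hunks above both converge on the same flag-change gate once EXT2_IUNLINK_FL is dropped, namely that touching an already-immutable inode, or toggling the APPEND/IMMUTABLE bits, requires CAP_LINUX_IMMUTABLE. The flag values below are the conventional ext2 ones and has_linux_immutable stands in for capable(CAP_LINUX_IMMUTABLE); both are assumptions of the sketch, not code from the patch.)

#include <stdbool.h>

/* Conventional ext2 flag bits, assumed here for illustration only. */
#define EXT2_IMMUTABLE_FL 0x00000010
#define EXT2_APPEND_FL    0x00000020

/*
 * Mirror of the restructured ioctl check: a flag change is permitted
 * unless the inode is already immutable or the change toggles the
 * APPEND/IMMUTABLE bits, in which case CAP_LINUX_IMMUTABLE is needed.
 */
bool flag_change_permitted(unsigned int oldflags, unsigned int newflags,
                           bool has_linux_immutable)
{
        if ((oldflags & EXT2_IMMUTABLE_FL) ||
            ((newflags ^ oldflags) & (EXT2_APPEND_FL | EXT2_IMMUTABLE_FL)))
                return has_linux_immutable;
        return true;
}

(For example, setting EXT2_APPEND_FL without the capability is refused, while changes that leave both special bits untouched pass regardless.)
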
@@ -214,6 +210,20 @@ int vfs_permission(struct inode * inode, int mask) return -EACCES; } +static inline int xid_permission(struct inode *inode, int mask, struct nameidata *nd) +{ + if (inode->i_xid == 0) + return 0; + if (vx_check(inode->i_xid, VX_ADMIN|VX_WATCH|VX_IDENT)) + return 0; +/* + printk("VSW: xid=%d denied access to %p[#%d,%lu] »%*s«.\n", + vx_current_xid(), inode, inode->i_xid, inode->i_ino, + nd->dentry->d_name.len, nd->dentry->d_name.name); +*/ + return -EACCES; +} + int permission(struct inode * inode,int mask, struct nameidata *nd) { int retval; @@ -227,6 +237,8 @@ int permission(struct inode * inode,int mask, struct nameidata *nd) (S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode))) return -EROFS; + if ((retval = xid_permission(inode, mask, nd))) + return retval; if (inode->i_op && inode->i_op->permission) retval = inode->i_op->permission(inode, submask, nd); else @@ -2013,13 +2025,8 @@ asmlinkage long sys_link(const char __user * oldname, const char __user * newnam error = path_lookup(to, LOOKUP_PARENT, &nd); if (error) goto out; - /* - * We allow hard-links to be created to a bind-mount as long - * as the bind-mount is not read-only. Checking for cross-dev - * links is subsumed by the superblock check in vfs_link(). - */ - error = -EROFS; - if (MNT_IS_RDONLY(old_nd.mnt)) + error = -EXDEV; + if (old_nd.mnt != nd.mnt) goto out_release; new_dentry = lookup_create(&nd, 0); error = PTR_ERR(new_dentry); diff --git a/fs/rcfs/dir.c b/fs/rcfs/dir.c index 545500e6d..a72c75448 100644 --- a/fs/rcfs/dir.c +++ b/fs/rcfs/dir.c @@ -162,7 +162,7 @@ int rcfs_mkdir(struct inode *dir, struct dentry *dentry, int mode) // create the default set of magic files clstype = (RCFS_I(dentry->d_inode))->core->classtype; rcfs_create_magic(dentry, &(((struct rcfs_magf *)clstype->mfdesc)[1]), - clstype->mfcount - 3); + clstype->mfcount - 2); return retval; diff --git a/fs/rcfs/magic.c b/fs/rcfs/magic.c index 1cada33e5..043df6e2d 100644 --- a/fs/rcfs/magic.c +++ b/fs/rcfs/magic.c @@ -100,7 +100,7 @@ FUNC ## _parse(char *options, char **resstr, char **otherstr) \ *resstr = NULL; \ \ if (!options) \ - return 0; \ + return -EINVAL; \ \ while ((p = strsep(&options, ",")) != NULL) { \ substring_t args[MAX_OPT_ARGS]; \ @@ -113,28 +113,17 @@ FUNC ## _parse(char *options, char **resstr, char **otherstr) \ switch (token) { \ case FUNC ## _res_type: \ *resstr = match_strdup(args); \ - if (!strcmp(#FUNC, "config")) { \ - char *str = p + strlen(p) + 1; \ - *otherstr = kmalloc(strlen(str) + 1, \ - GFP_KERNEL); \ - if (*otherstr == NULL) { \ - kfree(*resstr); \ - *resstr = NULL; \ - return 0; \ - } else { \ - strcpy(*otherstr, str); \ - return 1; \ - } \ - } \ break; \ case FUNC ## _str: \ *otherstr = match_strdup(args); \ break; \ default: \ - return 0; \ + return -EINVAL; \ } \ } \ - return (*resstr != NULL); \ + if (*resstr) \ + return 0; \ + return -EINVAL; \ } #define MAGIC_WRITE(FUNC,CLSTYPEFUN) \ @@ -210,16 +199,17 @@ struct file_operations FUNC ## _fileops = { \ EXPORT_SYMBOL(FUNC ## _fileops); /****************************************************************************** - * Shared function used by Target / Reclassify + * Target * + * pseudo file for manually reclassifying members to a class * *****************************************************************************/ #define TARGET_MAX_INPUT_SIZE 100 static ssize_t -target_reclassify_write(struct file *file, const char __user * buf, - size_t count, loff_t * ppos, int manual) +target_write(struct file *file, const char __user * buf, + size_t count, loff_t 
* ppos) { struct rcfs_inode_info *ri = RCFS_I(file->f_dentry->d_inode); char *optbuf; @@ -241,7 +231,7 @@ target_reclassify_write(struct file *file, const char __user * buf, clstype = ri->core->classtype; if (clstype->forced_reclassify) - rc = (*clstype->forced_reclassify) (manual ? ri->core: NULL, optbuf); + rc = (*clstype->forced_reclassify) (ri->core, optbuf); up(&(ri->vfs_inode.i_sem)); kfree(optbuf); @@ -249,46 +239,12 @@ target_reclassify_write(struct file *file, const char __user * buf, } -/****************************************************************************** - * Target - * - * pseudo file for manually reclassifying members to a class - * - *****************************************************************************/ - -static ssize_t -target_write(struct file *file, const char __user * buf, - size_t count, loff_t * ppos) -{ - return target_reclassify_write(file,buf,count,ppos,1); -} - struct file_operations target_fileops = { .write = target_write, }; EXPORT_SYMBOL(target_fileops); -/****************************************************************************** - * Reclassify - * - * pseudo file for reclassification of an object through CE - * - *****************************************************************************/ - -static ssize_t -reclassify_write(struct file *file, const char __user * buf, - size_t count, loff_t * ppos) -{ - return target_reclassify_write(file,buf,count,ppos,0); -} - -struct file_operations reclassify_fileops = { - .write = reclassify_write, -}; - -EXPORT_SYMBOL(reclassify_fileops); - /****************************************************************************** * Config * @@ -308,6 +264,7 @@ enum config_token_t { static match_table_t config_tokens = { {config_res_type, "res=%s"}, + {config_str, "config=%s"}, {config_err, NULL}, }; @@ -504,7 +461,7 @@ shares_write(struct file *file, const char __user * buf, } } - printk(KERN_DEBUG "Set %s shares to %d %d %d %d\n", + printk(KERN_ERR "Set %s shares to %d %d %d %d\n", resname, newshares.my_guarantee, newshares.my_limit, diff --git a/fs/rcfs/rootdir.c b/fs/rcfs/rootdir.c index d827db662..6da575ed6 100644 --- a/fs/rcfs/rootdir.c +++ b/fs/rcfs/rootdir.c @@ -91,7 +91,7 @@ int rcfs_mkroot(struct rcfs_magf *mfdesc, int mfcount, struct dentry **rootde) return -EINVAL; rootdesc = &mfdesc[0]; - printk(KERN_DEBUG "allocating classtype root <%s>\n", rootdesc->name); + printk("allocating classtype root <%s>\n", rootdesc->name); dentry = rcfs_create_internal(rcfs_rootde, rootdesc, 0); if (!dentry) { diff --git a/fs/rcfs/socket_fs.c b/fs/rcfs/socket_fs.c index f1c089921..9d9ba5241 100644 --- a/fs/rcfs/socket_fs.c +++ b/fs/rcfs/socket_fs.c @@ -113,12 +113,6 @@ struct rcfs_magf sock_rootdesc[] = { .i_op = &my_iops, .i_fop = &target_fileops, }, - { - .name = "reclassify", - .mode = RCFS_DEFAULT_FILE_MODE, - .i_op = &my_iops, - .i_fop = &reclassify_fileops, - }, }; struct rcfs_magf sock_magf[] = { diff --git a/fs/rcfs/super.c b/fs/rcfs/super.c index f013df226..871b7fb17 100644 --- a/fs/rcfs/super.c +++ b/fs/rcfs/super.c @@ -164,7 +164,7 @@ static int rcfs_fill_super(struct super_block *sb, void *data, int silent) clstype = ckrm_classtypes[i]; if (clstype == NULL) continue; - printk(KERN_DEBUG "A non null classtype\n"); + printk("A non null classtype\n"); if ((rc = rcfs_register_classtype(clstype))) continue; // could return with an error too diff --git a/fs/rcfs/tc_magic.c b/fs/rcfs/tc_magic.c index 9ef6d4d18..1a9f69729 100644 --- a/fs/rcfs/tc_magic.c +++ b/fs/rcfs/tc_magic.c @@ -43,7 +43,7 @@ #define 
TC_FILE_MODE (S_IFREG | S_IRUGO | S_IWUSR) -#define NR_TCROOTMF 7 +#define NR_TCROOTMF 6 struct rcfs_magf tc_rootdesc[NR_TCROOTMF] = { /* First entry must be root */ { @@ -77,15 +77,8 @@ struct rcfs_magf tc_rootdesc[NR_TCROOTMF] = { .i_fop = &shares_fileops, .i_op = &rcfs_file_inode_operations, }, - // Reclassify and Config should be made available only at the - // root level. Make sure they are the last two entries, as - // rcfs_mkdir depends on it - { - .name = "reclassify", - .mode = TC_FILE_MODE, - .i_fop = &reclassify_fileops, - .i_op = &rcfs_file_inode_operations, - }, + // Config should be made available only at the root level + // Make sure this is the last entry, as rcfs_mkdir depends on it { .name = "config", .mode = TC_FILE_MODE, diff --git a/fs/reiserfs/xattr.c b/fs/reiserfs/xattr.c index a70801f35..f8babe603 100644 --- a/fs/reiserfs/xattr.c +++ b/fs/reiserfs/xattr.c @@ -1338,10 +1338,6 @@ __reiserfs_permission (struct inode *inode, int mask, struct nameidata *nd, { umode_t mode = inode->i_mode; - /* Prevent vservers from escaping chroot() barriers */ - if (IS_BARRIER(inode) && !vx_check(0, VX_ADMIN)) - return -EACCES; - if (mask & MAY_WRITE) { /* * Nobody gets write access to a read-only fs. diff --git a/include/asm-i386/unistd.h b/include/asm-i386/unistd.h index d27db1931..72b388bf5 100644 --- a/include/asm-i386/unistd.h +++ b/include/asm-i386/unistd.h @@ -289,10 +289,8 @@ #define __NR_mq_notify (__NR_mq_open+4) #define __NR_mq_getsetattr (__NR_mq_open+5) #define __NR_sys_kexec_load 283 -#define __NR_ioprio_set 284 -#define __NR_ioprio_get 285 -#define NR_syscalls 286 +#define NR_syscalls 284 #ifndef __KERNEL_SYSCALLS_NO_ERRNO__ /* user-visible error numbers are in the range -1 - -124: see */ diff --git a/include/asm-ppc/unistd.h b/include/asm-ppc/unistd.h index 64e443d47..21774ed93 100644 --- a/include/asm-ppc/unistd.h +++ b/include/asm-ppc/unistd.h @@ -273,10 +273,8 @@ #define __NR_mq_notify 266 #define __NR_mq_getsetattr 267 #define __NR_kexec_load 268 -#define __NR_ioprio_set 269 -#define __NR_ioprio_get 270 -#define __NR_syscalls 271 +#define __NR_syscalls 269 #define __NR(n) #n diff --git a/include/asm-x86_64/unistd.h b/include/asm-x86_64/unistd.h index 81e4e85ba..311e25a4f 100644 --- a/include/asm-x86_64/unistd.h +++ b/include/asm-x86_64/unistd.h @@ -552,12 +552,8 @@ __SYSCALL(__NR_mq_notify, sys_mq_notify) __SYSCALL(__NR_mq_getsetattr, sys_mq_getsetattr) #define __NR_kexec_load 246 __SYSCALL(__NR_kexec_load, sys_ni_syscall) -#define __NR_ioprio_set 247 -__SYSCALL(__NR_ioprio_set, sys_ioprio_set); -#define __NR_ioprio_get 248 -__SYSCALL(__NR_ioprio_get, sys_ioprio_get); -#define __NR_syscall_max __NR_ioprio_get +#define __NR_syscall_max __NR_kexec_load #ifndef __NO_STUBS /* user-visible error numbers are in the range -1 - -4095 */ diff --git a/include/linux/ckrm-io.h b/include/linux/ckrm-io.h index 36040b930..6d6e12749 100644 --- a/include/linux/ckrm-io.h +++ b/include/linux/ckrm-io.h @@ -30,10 +30,13 @@ typedef void *(*icls_tsk_t) (struct task_struct *tsk); typedef int (*icls_ioprio_t) (struct task_struct *tsk); + #ifdef CONFIG_CKRM_RES_BLKIO -extern void *cki_tsk_icls (struct task_struct *tsk); -extern int cki_tsk_ioprio (struct task_struct *tsk); +#ifdef DOES_NOT_WORK_AND_NOT_NEEDED +extern inline icls_tsk_t cki_tsk_icls; +extern inline icls_ioprio_t cki_tsk_ioprio; +#endif #endif /* CONFIG_CKRM_RES_BLKIO */ diff --git a/include/linux/ckrm.h b/include/linux/ckrm.h index a29bf282a..04f4ec00f 100644 --- a/include/linux/ckrm.h +++ b/include/linux/ckrm.h @@ -9,13 +9,10 
@@ * * Latest version, more details at http://ckrm.sf.net * - * This program is free software; you can redistribute it and/or modify it - * under the terms of version 2.1 of the GNU Lesser General Public License - * as published by the Free Software Foundation. - * - * This program is distributed in the hope that it would be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. * */ diff --git a/include/linux/ckrm_ce.h b/include/linux/ckrm_ce.h index f4e91e91d..f3cbd9132 100644 --- a/include/linux/ckrm_ce.h +++ b/include/linux/ckrm_ce.h @@ -9,13 +9,10 @@ * * Latest version, more details at http://ckrm.sf.net * - * This program is free software; you can redistribute it and/or modify it - * under the terms of version 2.1 of the GNU Lesser General Public License - * as published by the Free Software Foundation. - * - * This program is distributed in the hope that it would be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. * */ @@ -32,7 +29,7 @@ #ifdef CONFIG_CKRM -#include // getting the event names +#include "ckrm.h" // getting the event names /* Action parameters identifying the cause of a task<->class notify callback * these can perculate up to user daemon consuming records send by the diff --git a/include/linux/ckrm_classqueue.h b/include/linux/ckrm_classqueue.h index 3041c8179..1bdf9b775 100644 --- a/include/linux/ckrm_classqueue.h +++ b/include/linux/ckrm_classqueue.h @@ -28,8 +28,7 @@ #include -#define CLASSQUEUE_SIZE 1024 // acb: changed from 128 -//#define CLASSQUEUE_SIZE 128 +#define CLASSQUEUE_SIZE 128 #define CQ_BITMAP_SIZE ((((CLASSQUEUE_SIZE+1+7)/8)+sizeof(long)-1)/sizeof(long)) /** @@ -117,7 +116,7 @@ void classqueue_update_prio(struct classqueue_struct *cq, cq_node_t * node, int cq_node_t *classqueue_get_head(struct classqueue_struct *cq); /*update the base priority of the classqueue*/ -void classqueue_update_base(struct classqueue_struct *cq); +void classqueue_update_base(struct classqueue_struct *cq, int new_base); /** * class_compare_prio: compare the priority of this two nodes diff --git a/include/linux/ckrm_mem.h b/include/linux/ckrm_mem.h index 4efebb993..52dc949ec 100644 --- a/include/linux/ckrm_mem.h +++ b/include/linux/ckrm_mem.h @@ -49,7 +49,6 @@ typedef struct ckrm_mem_res { // more than this is needed. 
int nr_active[MAX_NR_ZONES]; int nr_inactive[MAX_NR_ZONES]; - int tmp_cnt; int shrink_count; unsigned long last_shrink; int over_limit_failures; @@ -67,19 +66,17 @@ extern struct ckrm_res_ctlr mem_rcbs; // used to fill reclaim_flags, used only when memory is low in the system #define CLS_CLEAR (0) // class under its guarantee #define CLS_OVER_GUAR (1 << 0) // class is over its guarantee -#define CLS_PARENT_OVER (1 << 1) // parent is over 110% mark over limit -#define CLS_OVER_25 (1 << 2) // class over 25% mark bet guar(0) & limit(100) -#define CLS_OVER_50 (1 << 3) // class over 50% mark bet guar(0) & limit(100) -#define CLS_OVER_75 (1 << 4) // class over 75% mark bet guar(0) & limit(100) -#define CLS_OVER_100 (1 << 5) // class over its limit -#define CLS_OVER_110 (1 << 6) // class over 110% mark over limit -#define CLS_FLAGS_ALL ( CLS_OVER_GUAR | CLS_PARENT_OVER | CLS_OVER_25 | \ - CLS_OVER_50 | CLS_OVER_75 | CLS_OVER_100 | CLS_OVER_110 ) +#define CLS_PARENT_OVER (1 << 1) // parent is over 120% mark over limit +#define CLS_OVER_75 (1 << 2) // class over 75% mark bet guar(0) & limit(100) +#define CLS_OVER_100 (1 << 3) // class over its limit +#define CLS_OVER_110 (1 << 4) // class over 110% mark over limit +#define CLS_FLAGS_ALL ( CLS_OVER_GUAR | CLS_PARENT_OVER | CLS_OVER_75 | \ + CLS_OVER_100 | CLS_OVER_110 ) #define CLS_SHRINK_BIT (31) // used to both lock and set the bit #define CLS_SHRINK (1 << CLS_SHRINK_BIT) // shrink the given class // used in flags. set when a class is more than 90% of its maxlimit -#define MEM_AT_LIMIT 1 +#define MEM_NEAR_LIMIT 1 extern void ckrm_set_aggressive(ckrm_mem_res_t *); extern unsigned int ckrm_setup_reclamation(void); @@ -87,14 +84,16 @@ extern void ckrm_teardown_reclamation(void); extern void ckrm_get_reclaim_bits(unsigned int *, unsigned int *); extern void ckrm_init_mm_to_task(struct mm_struct *, struct task_struct *); extern void ckrm_mem_evaluate_mm(struct mm_struct *); -extern void ckrm_at_limit(ckrm_mem_res_t *); -extern int ckrm_memclass_valid(ckrm_mem_res_t *); +extern void ckrm_mem_evaluate_page_byadd(struct page *, struct mm_struct *); +extern void ckrm_near_limit(ckrm_mem_res_t *); #define ckrm_get_reclaim_flags(cls) ((cls)->reclaim_flags) #else #define ckrm_init_mm_to_current(a) do {} while (0) #define ckrm_mem_evaluate_mm(a) do {} while (0) +#define ckrm_mem_evaluate_page_byadd(a,b) do {} while (0) +#define page_class(page) (NULL) #define ckrm_get_reclaim_flags(a) (0) #define ckrm_setup_reclamation() (0) #define ckrm_teardown_reclamation() do {} while (0) diff --git a/include/linux/ckrm_mem_inline.h b/include/linux/ckrm_mem_inline.h index 221f93601..0eb4e49c0 100644 --- a/include/linux/ckrm_mem_inline.h +++ b/include/linux/ckrm_mem_inline.h @@ -56,10 +56,6 @@ ckrm_mem_share_compare(ckrm_mem_res_t *a, ckrm_mem_res_t *b) return -(b != NULL) ; if (b == NULL) return 0; - if (a->pg_guar == CKRM_SHARE_DONTCARE) - return 1; - if (b->pg_guar == CKRM_SHARE_DONTCARE) - return -1; return (a->pg_unused - b->pg_unused); } @@ -73,45 +69,34 @@ mem_class_get(ckrm_mem_res_t *cls) static inline void mem_class_put(ckrm_mem_res_t *cls) { - const char *name; - if (cls && atomic_dec_and_test(&(cls->nr_users)) ) { - if (cls->core == NULL) { - name = "unknown"; - } else { - name = cls->core->name; - } - printk(KERN_DEBUG "freeing memclass %p of \n", cls, name); - - // BUG_ON(ckrm_memclass_valid(cls)); - // kfree(cls); + printk("freeing memclass %p of \n", cls, cls->core->name); + //kfree(cls); } } -static inline void +static inline int 
incr_use_count(ckrm_mem_res_t *cls, int borrow) { + int over_limit; + atomic_inc(&cls->pg_total); + over_limit = (atomic_read(&cls->pg_total) > ((9 * cls->pg_limit) / 10)); if (borrow) cls->pg_lent++; - if ((cls->pg_guar == CKRM_SHARE_DONTCARE) || + if ((cls->pg_guar != CKRM_SHARE_DONTCARE) && (atomic_read(&cls->pg_total) > cls->pg_unused)) { ckrm_mem_res_t *parcls = ckrm_get_res_class(cls->parent, mem_rcbs.resid, ckrm_mem_res_t); if (parcls) { - incr_use_count(parcls, 1); + over_limit |= incr_use_count(parcls, 1); cls->pg_borrowed++; + return over_limit; } - } else { - atomic_inc(&ckrm_mem_real_count); } - if ((cls->pg_limit != CKRM_SHARE_DONTCARE) && - (atomic_read(&cls->pg_total) >= cls->pg_limit) && - ((cls->flags & MEM_AT_LIMIT) != MEM_AT_LIMIT)) { - ckrm_at_limit(cls); - } - return; + atomic_inc(&ckrm_mem_real_count); + return over_limit; } static inline void @@ -174,26 +159,10 @@ ckrm_clear_pages_class(struct page *pages, int numpages) } static inline void -ckrm_change_page_class(struct page *page, ckrm_mem_res_t *newcls) +ckrm_change_page_class(struct page *page, ckrm_mem_res_t *cls) { - ckrm_mem_res_t *oldcls = page_class(page); - - if (!newcls || oldcls == newcls) - return; - ckrm_clear_page_class(page); - ckrm_set_page_class(page, newcls); - if (test_bit(PG_ckrm_account, &page->flags)) { - decr_use_count(oldcls, 0); - incr_use_count(newcls, 0); - if (PageActive(page)) { - oldcls->nr_active[page_zonenum(page)]--; - newcls->nr_active[page_zonenum(page)]++; - } else { - oldcls->nr_inactive[page_zonenum(page)]--; - newcls->nr_inactive[page_zonenum(page)]++; - } - } + ckrm_set_page_class(page, cls); } static inline void @@ -209,61 +178,42 @@ ckrm_change_pages_class(struct page *pages, int numpages, static inline void ckrm_mem_inc_active(struct page *page) { - ckrm_mem_res_t *cls = page_class(page), *curcls; - if (unlikely(!cls)) { - return; - } - BUG_ON(test_bit(PG_ckrm_account, &page->flags)); - if (unlikely(cls != (curcls = GET_MEM_CLASS(current)))) { - cls = curcls; - ckrm_change_page_class(page, cls); - } + ckrm_mem_res_t *cls = page_class(page); + BUG_ON(cls == NULL); cls->nr_active[page_zonenum(page)]++; - incr_use_count(cls, 0); - set_bit(PG_ckrm_account, &page->flags); + if (incr_use_count(cls, 0)) { + ckrm_near_limit(cls); + } } static inline void ckrm_mem_dec_active(struct page *page) { ckrm_mem_res_t *cls = page_class(page); - if (unlikely(!cls)) { - return; - } - BUG_ON(!test_bit(PG_ckrm_account, &page->flags)); + BUG_ON(cls == NULL); cls->nr_active[page_zonenum(page)]--; decr_use_count(cls, 0); - clear_bit(PG_ckrm_account, &page->flags); } static inline void ckrm_mem_inc_inactive(struct page *page) { - ckrm_mem_res_t *cls = page_class(page), *curcls; - if (unlikely(!cls)) { - return; - } - BUG_ON(test_bit(PG_ckrm_account, &page->flags)); - if (unlikely(cls != (curcls = GET_MEM_CLASS(current)))) { - cls = curcls; - ckrm_change_page_class(page, cls); - } + ckrm_mem_res_t *cls = page_class(page); + BUG_ON(cls == NULL); cls->nr_inactive[page_zonenum(page)]++; - incr_use_count(cls, 0); - set_bit(PG_ckrm_account, &page->flags); + if (incr_use_count(cls, 0) && + ((cls->flags & MEM_NEAR_LIMIT) != MEM_NEAR_LIMIT)) { + ckrm_near_limit(cls); + } } static inline void ckrm_mem_dec_inactive(struct page *page) { ckrm_mem_res_t *cls = page_class(page); - if (unlikely(!cls)) { - return; - } - BUG_ON(!test_bit(PG_ckrm_account, &page->flags)); + BUG_ON(cls == NULL); cls->nr_inactive[page_zonenum(page)]--; decr_use_count(cls, 0); - clear_bit(PG_ckrm_account, &page->flags); } static inline 
int @@ -282,13 +232,7 @@ ckrm_class_limit_ok(ckrm_mem_res_t *cls) if ((mem_rcbs.resid == -1) || !cls) { return 1; } - if (cls->pg_limit == CKRM_SHARE_DONTCARE) { - ckrm_mem_res_t *parcls = ckrm_get_res_class(cls->parent, - mem_rcbs.resid, ckrm_mem_res_t); - return (!parcls ?: ckrm_class_limit_ok(parcls)); - } else { - return (atomic_read(&cls->pg_total) <= (11 * cls->pg_limit) / 10); - } + return (atomic_read(&cls->pg_total) <= (11 * cls->pg_limit) / 10); } #else // !CONFIG_CKRM_RES_MEM diff --git a/include/linux/ckrm_rc.h b/include/linux/ckrm_rc.h index 1bf2d07b5..b46cfd9f3 100644 --- a/include/linux/ckrm_rc.h +++ b/include/linux/ckrm_rc.h @@ -132,7 +132,7 @@ typedef struct ckrm_classtype { int num_classes; /* state about my ce interaction */ - atomic_t ce_regd; // if CE registered + int ce_regd; // if CE registered int ce_cb_active; // if Callbacks active atomic_t ce_nr_users; // number of active transient calls struct ckrm_eng_callback ce_callbacks; // callback engine @@ -223,11 +223,7 @@ typedef struct ckrm_core_class { * OTHER ******************************************************************************/ -#define ckrm_get_res_class(rescls, resid, type) \ - ((type*) (((resid != -1) && ((rescls) != NULL) \ - && ((rescls) != (void *)-1)) ? \ - ((struct ckrm_core_class *)(rescls))->res_class[resid] : NULL)) - +#define ckrm_get_res_class(rescls,resid,type) ((type*)((rescls)->res_class[resid])) extern int ckrm_register_res_ctlr(struct ckrm_classtype *, ckrm_res_ctlr_t *); extern int ckrm_unregister_res_ctlr(ckrm_res_ctlr_t *); diff --git a/include/linux/ckrm_sched.h b/include/linux/ckrm_sched.h index 3611c2d3e..9d82214fb 100644 --- a/include/linux/ckrm_sched.h +++ b/include/linux/ckrm_sched.h @@ -15,34 +15,30 @@ #ifndef _CKRM_SCHED_H #define _CKRM_SCHED_H +#define CC_BUG_ON_DO(cond,action) do { if (cond) action; BUG_ON(cond); } while(0) +#define CC_BUG_ON(cond) BUG_ON(cond) + #include #include #include -#define BITMAP_SIZE ((((MAX_PRIO+1+7)/8)+sizeof(long)-1)/sizeof(long)) +//update every second +#define CVT_UPDATE_TICK (1*HZ/1 ?: 1) +#define CLASS_BONUS_RATE 22 // shift from ns to increase class bonus +#define PRIORITY_BONUS_RATE 0 // ?? 
Hubertus +#define BITMAP_SIZE ((((MAX_PRIO+1+7)/8)+sizeof(long)-1)/sizeof(long)) struct prio_array { - unsigned int nr_active; + int nr_active; unsigned long bitmap[BITMAP_SIZE]; struct list_head queue[MAX_PRIO]; }; -#ifdef CONFIG_CKRM_CPU_SCHEDULE -#define rq_active(p,rq) (get_task_lrq(p)->active) -#define rq_expired(p,rq) (get_task_lrq(p)->expired) -int __init init_ckrm_sched_res(void); -#else -#define rq_active(p,rq) (rq->active) -#define rq_expired(p,rq) (rq->expired) -static inline void init_ckrm_sched_res(void) {} -static inline int ckrm_cpu_monitor_init(void) {return 0;} -#endif //CONFIG_CKRM_CPU_SCHEDULE - -#ifdef CONFIG_CKRM_CPU_SCHEDULE -struct ckrm_runqueue { +struct ckrm_local_runqueue { cq_node_t classqueue_linkobj; /*links in classqueue */ struct ckrm_cpu_class *cpu_class; // class it belongs to struct classqueue_struct *classqueue; // classqueue it belongs tow + CVT_t uncounted_cvt; unsigned long long uncounted_ns; prio_array_t *active, *expired, arrays[2]; @@ -59,25 +55,19 @@ struct ckrm_runqueue { * updated on enqueue, dequeue */ int top_priority; - CVT_t local_cvt; - - unsigned long lrq_load; - int local_weight; - - - /* - * unused CPU time accumulated while thoe class - * is inactive goes to savings - * - * initialized to be 0 - * a class can't accumulate more than SAVING_THRESHOLD of savings - */ - unsigned long long savings; - + CVT_t local_cvt; // snapshot of local_cvt, update on every loadbalance unsigned long magic; //for debugging }; -typedef struct ckrm_runqueue ckrm_lrq_t; +/** + * @last_sleep: the last time it sleeps, last_sleep = 0 when not sleeping + */ +struct ckrm_cpu_class_local_stat { + unsigned long long run; + unsigned long long total; + unsigned long long last_sleep; + unsigned long cpu_demand; /*estimated cpu demand */ +}; /** * ckrm_cpu_class_stat - cpu usage statistics maintained for each class @@ -88,35 +78,22 @@ struct ckrm_cpu_class_stat { unsigned long long total_ns; /*how much nano-secs it has consumed */ - struct ckrm_cpu_demand_stat local_stats[NR_CPUS]; - - /* - * - */ - unsigned long max_demand; /* the maximun a class can consume */ - int egrt,megrt; /*effective guarantee*/ - int ehl,mehl; /*effective hard limit, my effective hard limit*/ + struct ckrm_cpu_class_local_stat local_stats[NR_CPUS]; + unsigned long cpu_demand; + /*temp stat used by cpu monitor */ + int effective_guarantee; + int effective_limit; + int glut; //true or false /* - * eshare: for both default class and its children - * meshare: just for the default class + * effective_share: for both default class and its children + * self_effective_share: just for the default class */ - int eshare; - int meshare; + int effective_share; + int self_effective_share; }; -#define CKRM_CPU_CLASS_MAGIC 0x7af2abe3 - -#define USAGE_SAMPLE_FREQ HZ //sample every 1 seconds -#define NS_PER_SAMPLE (USAGE_SAMPLE_FREQ*(NSEC_PER_SEC/HZ)) -#define USAGE_WINDOW_SIZE 60 //keep the last 60 sample - -struct ckrm_usage { - unsigned long samples[USAGE_WINDOW_SIZE]; //record usages - unsigned long sample_pointer; //pointer for the sliding window - unsigned long long last_ns; //ns for last sample - long long last_sample_jiffies; //in number of jiffies -}; +typedef struct ckrm_cpu_class_stat ckrm_stat_t; /* * manages the class status @@ -127,224 +104,72 @@ struct ckrm_cpu_class { struct ckrm_core_class *parent; struct ckrm_shares shares; spinlock_t cnt_lock; // always grab parent's lock first and then child's + CVT_t global_cvt; // total cummulative virtual time struct ckrm_cpu_class_stat stat; struct 
list_head links; // for linking up in cpu classes - ckrm_lrq_t local_queues[NR_CPUS]; // runqueues - struct ckrm_usage usage; - unsigned long magic; //for debugging + struct ckrm_local_runqueue local_queues[NR_CPUS]; // runqueues }; -#define cpu_class_weight(cls) (cls->stat.meshare) -#define local_class_weight(lrq) (lrq->local_weight) - -static inline int valid_cpu_class(struct ckrm_cpu_class * cls) -{ - return (cls && cls->magic == CKRM_CPU_CLASS_MAGIC); -} - -struct classqueue_struct *get_cpu_classqueue(int cpu); -struct ckrm_cpu_class * get_default_cpu_class(void); - - -static inline void ckrm_usage_init(struct ckrm_usage* usage) -{ - int i; - - for (i=0; i < USAGE_WINDOW_SIZE; i++) - usage->samples[i] = 0; - usage->sample_pointer = 0; - usage->last_ns = 0; - usage->last_sample_jiffies = 0; -} - -/* - * this function can be called at any frequency - * it's self-contained - */ -static inline void ckrm_sample_usage(struct ckrm_cpu_class* clsptr) -{ - struct ckrm_usage* usage = &clsptr->usage; - unsigned long long cur_sample; - int duration = jiffies - usage->last_sample_jiffies; - - //jiffies wasn't start from 0 - //so it need to be properly handled - if (unlikely(!usage->last_sample_jiffies)) - usage->last_sample_jiffies = jiffies; - - //called too frequenctly - if (duration < USAGE_SAMPLE_FREQ) - return; - - usage->last_sample_jiffies = jiffies; - - cur_sample = clsptr->stat.total_ns - usage->last_ns; - usage->last_ns = clsptr->stat.total_ns; +#if CONFIG_CKRM_CPU_SCHEDULE +#define rq_active(p,rq) (get_task_class_queue(p)->active) +#define rq_expired(p,rq) (get_task_class_queue(p)->expired) +#else +#define rq_active(p,rq) (rq->active) +#define rq_expired(p,rq) (rq->expired) +#endif - //scale it based on the sample duration - cur_sample *= ((USAGE_SAMPLE_FREQ<< 15)/duration); - cur_sample >>= 15; - usage->samples[usage->sample_pointer] = cur_sample; - // printk("sample = %llu jiffies=%lu \n",cur_sample, jiffies); +//#define cpu_class_weight(cls) (cls->shares.my_guarantee) +#define cpu_class_weight(cls) (cls->stat.self_effective_share) - usage->sample_pointer ++; - if (usage->sample_pointer >= USAGE_WINDOW_SIZE) - usage->sample_pointer = 0; -} +#define bpt_queue(cpu) (& (cpu_rq(cpu)->classqueue) ) +CVT_t get_min_cvt(int cpu); -//duration is specified in number of jiffies -//return the usage in percentage -static inline int get_ckrm_usage(struct ckrm_cpu_class* clsptr, int duration) -{ - int nr_samples = duration/USAGE_SAMPLE_FREQ?:1; - struct ckrm_usage* usage = &clsptr->usage; - unsigned long long total = 0; - int i, idx; - - if (nr_samples > USAGE_WINDOW_SIZE) - nr_samples = USAGE_WINDOW_SIZE; - - idx = usage->sample_pointer; - for (i = 0; i< nr_samples; i++) { - if (! 
idx) - idx = USAGE_WINDOW_SIZE; - idx --; - total += usage->samples[idx]; - } - total *= 100; - do_div(total,nr_samples); - do_div(total,NS_PER_SAMPLE); - do_div(total,cpus_weight(cpu_online_map)); - return total; -} +struct classqueue_struct *get_cpu_classqueue(int cpu); +extern struct ckrm_cpu_class default_cpu_class_obj; +#define default_cpu_class (&default_cpu_class_obj) -#define lrq_nr_running(lrq) \ - (lrq->active->nr_active + lrq->expired->nr_active) +#define local_queue_nr_running(local_queue) \ + (local_queue->active->nr_active + local_queue->expired->nr_active) -static inline ckrm_lrq_t * -get_ckrm_lrq(struct ckrm_cpu_class*cls, int cpu) +static inline struct ckrm_local_runqueue * +get_ckrm_local_runqueue(struct ckrm_cpu_class*cls, int cpu) { return &(cls->local_queues[cpu]); } -static inline ckrm_lrq_t *get_task_lrq(struct task_struct *p) +static inline struct ckrm_local_runqueue *get_task_class_queue(struct task_struct *p) { return &(p->cpu_class->local_queues[task_cpu(p)]); } #define task_list_entry(list) list_entry(list,struct task_struct,run_list) -#define class_list_entry(list) list_entry(list,struct ckrm_runqueue,classqueue_linkobj) +#define class_list_entry(list) list_entry(list,struct ckrm_local_runqueue,classqueue_linkobj) /* some additional interfaces exported from sched.c */ struct runqueue; +void dequeue_task(struct task_struct *p, prio_array_t * array); +void enqueue_task(struct task_struct *p, prio_array_t * array); +struct runqueue *task_rq_lock(task_t * p, unsigned long *flags); +void task_rq_unlock(struct runqueue *rq, unsigned long *flags); +extern spinlock_t cvt_lock; extern rwlock_t class_list_lock; extern struct list_head active_cpu_classes; -unsigned int task_timeslice(task_t *p); -void _ckrm_cpu_change_class(task_t *task, struct ckrm_cpu_class *newcls); +/*functions exported by ckrm_cpu_class.c*/ +int __init init_ckrm_sched_res(void); void init_cpu_classes(void); -void init_cpu_class(struct ckrm_cpu_class *cls,ckrm_shares_t* shares); -void ckrm_cpu_change_class(void *task, void *old, void *new); - +/*functions exported by ckrm_cpu_monitor.c*/ +void ckrm_cpu_monitor(void); +void ckrm_cpu_stat_init(struct ckrm_cpu_class_stat *stat); #define CPU_DEMAND_ENQUEUE 0 #define CPU_DEMAND_DEQUEUE 1 #define CPU_DEMAND_DESCHEDULE 2 -#define CPU_DEMAND_INIT 3 - -/*functions exported by ckrm_cpu_monitor.c*/ -void ckrm_cpu_monitor(int check_min); -int ckrm_cpu_monitor_init(void); -void ckrm_cpu_stat_init(struct ckrm_cpu_class_stat *stat); -void cpu_demand_event(struct ckrm_cpu_demand_stat* local_stat, int event, unsigned long long len); -void adjust_local_weight(void); - -#define get_task_lrq_stat(p) (&(p)->cpu_class->stat.local_stats[task_cpu(p)]) -#define get_cls_local_stat(cls,cpu) (&(cls)->stat.local_stats[cpu]) -#define get_rq_local_stat(lrq,cpu) (get_cls_local_stat((lrq)->cpu_class,cpu)) - -/******************************************************************** - * Parameters that determine how quickly CVT's progress and how - * priority can impact a LRQ's runqueue position. See also - * get_effective_prio(). These parameters need to adjusted - * in accordance to the following example and understanding. - * - * CLASS_QUANTIZER: - * - * A class with 50% share, can execute 500 ms / per sec ~ 2^29 ns. - * It's share will be set to 512 = 2^9. The globl CLASSQUEUE_SIZE is set to 2^7. - * With CLASS_QUANTIZER=16, the local_cvt of this class will increase - * by 2^29/2^9 = 2^20 = 1024K. - * Setting CLASS_QUANTIZER to 16, 2^(20-16) = 16 slots / per second. 
- * Do the same math, a class with any share value, will cover 16 slots / per second. - * So 2^8 total slots is good track for 8 seconds of system execution - * - * PRIORITY_QUANTIZER: - * - * How much can top priorities of class impact slot bonus. - * There are 40 nice priorities, range from -20 to 19, with default nice = 0 - * "2" will allow upto 5 slots improvement - * when certain task within the class has a nice value of -20 - * in the RQ thus for 50% class it can perform ~300 msec starvation. - * - *******************************************************************/ - -#define CLASS_QUANTIZER 16 //shift from ns to increase class bonus -#define PRIORITY_QUANTIZER 2 //controls how much a high prio task can borrow - -#define CKRM_SHARE_ACCURACY 13 -#define NSEC_PER_MS 1000000 -#define NSEC_PER_JIFFIES (NSEC_PER_SEC/HZ) - - -#define MAX_SAVINGS_ABSOLUTE (10LLU*NSEC_PER_SEC) // 10 seconds - -#define CVT_UPDATE_TICK ((HZ/2)?:1) - -// ABSOLUTE_CKRM_TUNING determines whether classes can make up -// lost time in absolute time or in relative values - -#define ABSOLUTE_CKRM_TUNING // preferred due to more predictable behavior - -#ifdef ABSOLUTE_CKRM_TUNING - -#define MAX_SAVINGS MAX_SAVINGS_ABSOLUTE -//an absolute bonus of 200ms for classes when reactivated -#define INTERACTIVE_BONUS(lrq) ((200*NSEC_PER_MS)/local_class_weight(lrq)) -#define SAVINGS_LEAK_SPEED (CVT_UPDATE_TICK/10*NSEC_PER_JIFFIES) - -#define scale_cvt(val,lrq) ((val)*local_class_weight(lrq)) -#define unscale_cvt(val,lrq) (do_div(val,local_class_weight(lrq))) - -#else - -#define MAX_SAVINGS (MAX_SAVINGS_ABSOLUTE >> CKRM_SHARE_ACCURACY) -/* - * to improve system responsiveness - * an inactive class is put a little bit ahead of the current class when it wakes up - * the amount is set in normalized term to simplify the calculation - * for class with 100% share, it can be 2s ahead - * while for class with 10% share, it can be 200ms ahead - */ -#define INTERACTIVE_BONUS(lrq) (2*NSEC_PER_MS) - -/* - * normalized savings can't be more than MAX_NORMALIZED_SAVINGS - * based on the current configuration - * this means that a class with share 100% will accumulate 10s at most - * while a class with 1% of the share can only accumulate 100ms - */ - -//a class with share 100% can get 100ms every 500ms -//while a class with share 10% can only get 10ms every 500ms -#define SAVINGS_LEAK_SPEED ((CVT_UPDATE_TICK/5*NSEC_PER_JIFFIES) >> CKRM_SHARE_ACCURACY) - -#define scale_cvt(val,lrq) (val) -#define unscale_cvt(val,lrq) (val) - -#endif +void cpu_demand_event(struct ckrm_cpu_class_local_stat* local_stat, int event, unsigned long long len); +#define get_task_local_stat(p) (&(p)->cpu_class->stat.local_stats[task_cpu(p)]) +#define get_rq_local_stat(lrq,cpu) (&(lrq)->cpu_class->stat.local_stats[cpu]) /** * get_effective_prio: return the effective priority of a class local queue @@ -356,22 +181,18 @@ void adjust_local_weight(void); * currently, prio increases by 1 if either: top_priority increase by one * or, local_cvt increases by 4ms */ -static inline int get_effective_prio(ckrm_lrq_t * lrq) +static inline int get_effective_prio(struct ckrm_local_runqueue * lcq) { int prio; - prio = lrq->local_cvt >> CLASS_QUANTIZER; // cumulative usage -#ifndef URGENCY_SUPPORT -#warning "ACB removing urgency calculation from get_effective_prio" -#else - prio += lrq->top_priority >> PRIORITY_QUANTIZER; // queue urgency -#endif + // cumulative usage + prio = lcq->local_cvt >> CLASS_BONUS_RATE; + // queue urgency + prio += lcq->top_priority >> PRIORITY_BONUS_RATE; return 
prio; } -CVT_t get_local_cur_cvt(int cpu); - /** * update_class_priority: * @@ -385,8 +206,9 @@ CVT_t get_local_cur_cvt(int cpu); * -- rq_get_next_task (queue switch) * -- update_local_cvt * -- schedule + * -- update_global_cvt */ -static inline void update_class_priority(ckrm_lrq_t *local_rq) +static inline void update_class_priority(struct ckrm_local_runqueue *local_rq) { int effective_prio = get_effective_prio(local_rq); classqueue_update_prio(local_rq->classqueue, @@ -398,80 +220,42 @@ static inline void update_class_priority(ckrm_lrq_t *local_rq) * set the new top priority and reposition the queue * called when: task enqueue/dequeue and queue switch */ -static inline void set_top_priority(ckrm_lrq_t *lrq, +static inline void set_top_priority(struct ckrm_local_runqueue *class_queue, int new_priority) { - lrq->top_priority = new_priority; - update_class_priority(lrq); -} - -/* - * task_load: how much load this task counts - */ -static inline unsigned long task_load(struct task_struct* p) -{ - return (task_timeslice(p) * p->demand_stat.cpu_demand); -} - -/* - * runqueue load is the local_weight of all the classes on this cpu - * must be called with class_list_lock held - */ -static inline unsigned long ckrm_cpu_load(int cpu) -{ - struct ckrm_cpu_class *clsptr; - ckrm_lrq_t* lrq; - struct ckrm_cpu_demand_stat* l_stat; - int total_load = 0; - int load; - - list_for_each_entry(clsptr,&active_cpu_classes,links) { - lrq = get_ckrm_lrq(clsptr,cpu); - l_stat = get_cls_local_stat(clsptr,cpu); - load = lrq->local_weight; - if (l_stat->cpu_demand < load) - load = l_stat->cpu_demand; - total_load += load; - } - return total_load; + class_queue->top_priority = new_priority; + update_class_priority(class_queue); } static inline void class_enqueue_task(struct task_struct *p, prio_array_t * array) { - ckrm_lrq_t *lrq; + struct ckrm_local_runqueue *queue; int effective_prio; - lrq = get_task_lrq(p); - - cpu_demand_event(&p->demand_stat,CPU_DEMAND_ENQUEUE,0); - lrq->lrq_load += task_load(p); + queue = get_task_class_queue(p); - if ((p->prio < lrq->top_priority) && (array == lrq->active)) - set_top_priority(lrq, p->prio); - - if (! cls_in_classqueue(&lrq->classqueue_linkobj)) { - cpu_demand_event(get_task_lrq_stat(p),CPU_DEMAND_ENQUEUE,0); - effective_prio = get_effective_prio(lrq); - classqueue_enqueue(lrq->classqueue, &lrq->classqueue_linkobj, effective_prio); + if (! 
cls_in_classqueue(&queue->classqueue_linkobj)) { + cpu_demand_event(get_task_local_stat(p),CPU_DEMAND_ENQUEUE,0); + /*make sure the cvt of this class is up to date*/ + queue->local_cvt = get_min_cvt(task_cpu(p)); + effective_prio = get_effective_prio(queue); + classqueue_enqueue(queue->classqueue, &queue->classqueue_linkobj, effective_prio); } + + if ((p->prio < queue->top_priority) && (array == queue->active)) + set_top_priority(queue, p->prio); } static inline void class_dequeue_task(struct task_struct *p, prio_array_t * array) { - ckrm_lrq_t *lrq = get_task_lrq(p); - unsigned long load = task_load(p); + struct ckrm_local_runqueue *queue = get_task_class_queue(p); - BUG_ON(lrq->lrq_load < load); - lrq->lrq_load -= load; - - cpu_demand_event(&p->demand_stat,CPU_DEMAND_DEQUEUE,0); - - if ((array == lrq->active) && (p->prio == lrq->top_priority) + if ((array == queue->active) && (p->prio == queue->top_priority) && list_empty(&(array->queue[p->prio]))) - set_top_priority(lrq, + set_top_priority(queue, find_next_bit(array->bitmap, MAX_PRIO, p->prio)); } @@ -482,82 +266,32 @@ static inline void class_dequeue_task(struct task_struct *p, */ static inline void update_local_cvt(struct task_struct *p, unsigned long nsec) { - ckrm_lrq_t * lrq = get_task_lrq(p); - - unsigned long cvt_inc = nsec / local_class_weight(lrq); - - lrq->local_cvt += cvt_inc; - lrq->uncounted_ns += nsec; + struct ckrm_local_runqueue *class_queue = get_task_class_queue(p); + struct ckrm_cpu_class *cls = class_queue->cpu_class; - update_class_priority(lrq); -} + unsigned long cvt_inc = nsec / cpu_class_weight(cls); -static inline int class_preempts_curr(struct task_struct * p, struct task_struct* curr) -{ - struct cq_node_struct* node1 = &(get_task_lrq(p)->classqueue_linkobj); - struct cq_node_struct* node2 = &(get_task_lrq(curr)->classqueue_linkobj); + class_queue->local_cvt += cvt_inc; + class_queue->uncounted_cvt += cvt_inc; - return (class_compare_prio(node1,node2) < 0); + class_queue->uncounted_ns += nsec; + update_class_priority(class_queue); } /* - * return a random value with range [0, (val-1)] + * called during loadbalancing + * to charge the class with locally accumulated cvt */ -static inline int get_ckrm_rand(unsigned long val) -{ - int rand; - static int last_rand[NR_CPUS]; - int cpu = smp_processor_id(); - - rand = last_rand[cpu]; - rand ++; - if (rand >= val) - rand = 0; - - last_rand[cpu] = rand; - return rand; -} - -void update_class_cputime(int this_cpu); +void update_global_cvts(int this_cpu); -/**********************************************/ -/* PID_LOAD_BALANCING */ -/**********************************************/ -struct ckrm_load_struct { - unsigned long load_p; /*propotional*/ - unsigned long load_i; /*integral */ - long load_d; /*derivative */ -}; - -typedef struct ckrm_load_struct ckrm_load_t; - -static inline void ckrm_load_init(ckrm_load_t* ckrm_load) { - ckrm_load->load_p = 0; - ckrm_load->load_i = 0; - ckrm_load->load_d = 0; -} - -void ckrm_load_sample(ckrm_load_t* ckrm_load,int cpu); -long pid_get_pressure(ckrm_load_t* ckrm_load, int local_group); -#define rq_ckrm_load(rq) (&((rq)->ckrm_load)) - -static inline void ckrm_sched_tick(unsigned long j,int this_cpu,struct ckrm_load_struct* ckrm_load) +/** + * + */ +static inline int class_preempts_curr(struct task_struct * p, struct task_struct* curr) { - read_lock(&class_list_lock); - -#ifdef CONFIG_SMP - ckrm_load_sample(ckrm_load,this_cpu); -#endif - - if (! 
(j % CVT_UPDATE_TICK)) { - // printk("ckrm_sched j=%lu\n",j); - classqueue_update_base(get_cpu_classqueue(this_cpu)); - update_class_cputime(this_cpu); - } + struct cq_node_struct* node1 = &(get_task_class_queue(p)->classqueue_linkobj); + struct cq_node_struct* node2 = &(get_task_class_queue(curr)->classqueue_linkobj); - read_unlock(&class_list_lock); + return (class_compare_prio(node1,node2) < 0); } - -#endif //CONFIG_CKRM_CPU_SCHEDULE - #endif diff --git a/include/linux/crbce.h b/include/linux/crbce.h deleted file mode 100644 index 6a2190dd8..000000000 --- a/include/linux/crbce.h +++ /dev/null @@ -1,175 +0,0 @@ -/* - * crbce.h - * - * Copyright (C) Hubertus Franke, IBM Corp. 2003 - * - * This files contains the type definition of the record - * created by the CRBCE CKRM classification engine - * - * - * Latest version, more details at http://ckrm.sf.net - * - * This program is free software; you can redistribute it and/or modify it - * under the terms of version 2.1 of the GNU Lesser General Public License - * as published by the Free Software Foundation. - * - * This program is distributed in the hope that it would be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. - * - * - */ - - -/* - * Changes - * - * 2003-11-11 Created by H.Franke - * 2003-12-01 Sanitized for Delivery by H.Franke - * - */ - -#ifndef CRBCE_RECORDS_H -#define CRBCE_RECORDS_H - -#ifdef __KERNEL__ -#include -#else -#define CONFIG_CKRM -#define CONFIG_CRBCE -#define CONFIG_DELAY_ACCT -#endif - -#include -#include -#include - -#define CRBCE_UKCC_NAME "crbce_ukcc" -#define CRBCE_UKCC_PATH "/mnt/relayfs" - -#define CRBCE_UKCC_PATH_NAME CRBCE_UKCC_PATH"/"CRBCE_UKCC_NAME - -#define CRBCE_MAX_CLASS_NAME_LEN 256 - -/**************************************************************** - * - * CRBCE EVENT SET is and extension to the standard CKRM_EVENTS - * - ****************************************************************/ -enum { - - /* we use the standard CKRM_EVENT_<..> - * to identify reclassification cause actions - * and extend by additional ones we need - */ - - /* up event flow */ - - CRBCE_REC_EXIT = CKRM_NUM_EVENTS, - CRBCE_REC_DATA_DELIMITER, - CRBCE_REC_SAMPLE, - CRBCE_REC_TASKINFO, - CRBCE_REC_SYS_INFO, - CRBCE_REC_CLASS_INFO, - CRBCE_REC_KERNEL_CMD_DONE, - CRBCE_REC_UKCC_FULL, - - /* down command issueance */ - CRBCE_REC_KERNEL_CMD, - - CRBCE_NUM_EVENTS -}; - -struct task_sample_info { - uint32_t cpu_running; - uint32_t cpu_waiting; - uint32_t io_delayed; - uint32_t memio_delayed; -}; - -/********************************************* - * KERNEL -> USER records * - *********************************************/ - -/* we have records with either a time stamp or not */ -struct crbce_hdr { - int type; - pid_t pid; -}; - -struct crbce_hdr_ts { - int type; - pid_t pid; - uint32_t jiffies; - uint64_t cls; -}; - -/* individual records */ - -struct crbce_rec_fork { - struct crbce_hdr_ts hdr; - pid_t ppid; -}; - -struct crbce_rec_data_delim { - struct crbce_hdr_ts hdr; - int is_stop; /* 0 start, 1 stop */ -}; - -struct crbce_rec_task_data { - struct crbce_hdr_ts hdr; - struct task_sample_info sample; - struct task_delay_info delay; -}; - -struct crbce_ukcc_full { - struct crbce_hdr_ts hdr; -}; - -struct crbce_class_info { - struct crbce_hdr_ts hdr; - int action; - int namelen; - char name[CRBCE_MAX_CLASS_NAME_LEN]; -}; - -/********************************************* - * USER -> KERNEL records * - *********************************************/ 
- -enum crbce_kernel_cmd { - CRBCE_CMD_START, - CRBCE_CMD_STOP, - CRBCE_CMD_SET_TIMER, - CRBCE_CMD_SEND_DATA, -}; - -struct crbce_command { - int type; /* we need this for the K->U reflection */ - int cmd; - uint32_t len; /* added in the kernel for reflection */ -}; - -#define set_cmd_hdr(rec,tok) \ -((rec).hdr.type=CRBCE_REC_KERNEL_CMD,(rec).hdr.cmd=(tok)) - -struct crbce_cmd_done { - struct crbce_command hdr; - int rc; -}; - -struct crbce_cmd { - struct crbce_command hdr; -}; - -struct crbce_cmd_send_data { - struct crbce_command hdr; - int delta_mode; -}; - -struct crbce_cmd_settimer { - struct crbce_command hdr; - uint32_t interval; /* in msec .. 0 means stop */ -}; - -#endif diff --git a/include/linux/elevator.h b/include/linux/elevator.h index b42a9c4e2..27e8183f4 100644 --- a/include/linux/elevator.h +++ b/include/linux/elevator.h @@ -17,7 +17,6 @@ typedef void (elevator_requeue_req_fn) (request_queue_t *, struct request *); typedef struct request *(elevator_request_list_fn) (request_queue_t *, struct request *); typedef void (elevator_completed_req_fn) (request_queue_t *, struct request *); typedef int (elevator_may_queue_fn) (request_queue_t *, int); -typedef void (elevator_set_congested_fn) (request_queue_t *); typedef int (elevator_set_req_fn) (request_queue_t *, struct request *, int); typedef void (elevator_put_req_fn) (request_queue_t *, struct request *); @@ -46,7 +45,6 @@ struct elevator_s elevator_put_req_fn *elevator_put_req_fn; elevator_may_queue_fn *elevator_may_queue_fn; - elevator_set_congested_fn *elevator_set_congested_fn; elevator_init_fn *elevator_init_fn; elevator_exit_fn *elevator_exit_fn; @@ -76,7 +74,6 @@ extern struct request *elv_latter_request(request_queue_t *, struct request *); extern int elv_register_queue(request_queue_t *q); extern void elv_unregister_queue(request_queue_t *q); extern int elv_may_queue(request_queue_t *, int); -extern void elv_set_congested(request_queue_t *); extern void elv_completed_request(request_queue_t *, struct request *); extern int elv_set_request(request_queue_t *, struct request *, int); extern void elv_put_request(request_queue_t *, struct request *); @@ -122,6 +119,4 @@ extern int elv_try_last_merge(request_queue_t *, struct bio *); #define ELEVATOR_INSERT_BACK 2 #define ELEVATOR_INSERT_SORT 3 -#define RQ_ELV_DATA(rq) (rq)->elevator_private - #endif diff --git a/include/linux/ext2_fs.h b/include/linux/ext2_fs.h index cd252c8eb..7c6f650c9 100644 --- a/include/linux/ext2_fs.h +++ b/include/linux/ext2_fs.h @@ -196,13 +196,8 @@ struct ext2_group_desc #define EXT2_IUNLINK_FL 0x08000000 /* Immutable unlink */ #define EXT2_RESERVED_FL 0x80000000 /* reserved for ext2 lib */ -#ifdef CONFIG_VSERVER_LEGACY -#define EXT2_FL_USER_VISIBLE 0x0C03DFFF /* User visible flags */ -#define EXT2_FL_USER_MODIFIABLE 0x0C0380FF /* User modifiable flags */ -#else #define EXT2_FL_USER_VISIBLE 0x0003DFFF /* User visible flags */ #define EXT2_FL_USER_MODIFIABLE 0x000380FF /* User modifiable flags */ -#endif /* * ioctl commands diff --git a/include/linux/ext3_fs.h b/include/linux/ext3_fs.h index 7fe32d0be..100fba908 100644 --- a/include/linux/ext3_fs.h +++ b/include/linux/ext3_fs.h @@ -189,13 +189,8 @@ struct ext3_group_desc #define EXT3_IUNLINK_FL 0x08000000 /* Immutable unlink */ #define EXT3_RESERVED_FL 0x80000000 /* reserved for ext3 lib */ -#ifdef CONFIG_VSERVER_LEGACY -#define EXT3_FL_USER_VISIBLE 0x0C03DFFF /* User visible flags */ -#define EXT3_FL_USER_MODIFIABLE 0x0C0380FF /* User modifiable flags */ -#else #define EXT3_FL_USER_VISIBLE 
0x0003DFFF /* User visible flags */ #define EXT3_FL_USER_MODIFIABLE 0x000380FF /* User modifiable flags */ -#endif /* * Inode dynamic state flags diff --git a/include/linux/fs.h b/include/linux/fs.h index ece31a727..e83d8e4dd 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -42,7 +42,7 @@ struct vfsmount; /* Fixed constants first: */ #undef NR_OPEN #define NR_OPEN (1024*1024) /* Absolute upper limit on fd num */ -#define INR_OPEN 4096 /* Initial setting for nfile rlimits */ +#define INR_OPEN 1024 /* Initial setting for nfile rlimits */ #define BLOCK_SIZE_BITS 10 #define BLOCK_SIZE (1< static inline void add_page_to_active_list(struct zone *zone, struct page *page) { list_add(&page->lru, &zone->active_list); zone->nr_active++; - ckrm_mem_inc_active(page); } static inline void @@ -13,7 +11,6 @@ add_page_to_inactive_list(struct zone *zone, struct page *page) { list_add(&page->lru, &zone->inactive_list); zone->nr_inactive++; - ckrm_mem_inc_inactive(page); } static inline void @@ -21,7 +18,6 @@ del_page_from_active_list(struct zone *zone, struct page *page) { list_del(&page->lru); zone->nr_active--; - ckrm_mem_dec_active(page); } static inline void @@ -29,7 +25,6 @@ del_page_from_inactive_list(struct zone *zone, struct page *page) { list_del(&page->lru); zone->nr_inactive--; - ckrm_mem_dec_inactive(page); } static inline void @@ -39,9 +34,7 @@ del_page_from_lru(struct zone *zone, struct page *page) if (PageActive(page)) { ClearPageActive(page); zone->nr_active--; - ckrm_mem_dec_active(page); } else { zone->nr_inactive--; - ckrm_mem_dec_inactive(page); } } diff --git a/include/linux/netfilter_ipv4/ip_conntrack_pptp.h b/include/linux/netfilter_ipv4/ip_conntrack_pptp.h deleted file mode 100644 index 0fbec884a..000000000 --- a/include/linux/netfilter_ipv4/ip_conntrack_pptp.h +++ /dev/null @@ -1,310 +0,0 @@ -/* PPTP constants and structs */ -#ifndef _CONNTRACK_PPTP_H -#define _CONNTRACK_PPTP_H - -/* state of the control session */ -enum pptp_ctrlsess_state { - PPTP_SESSION_NONE, /* no session present */ - PPTP_SESSION_ERROR, /* some session error */ - PPTP_SESSION_STOPREQ, /* stop_sess request seen */ - PPTP_SESSION_REQUESTED, /* start_sess request seen */ - PPTP_SESSION_CONFIRMED, /* session established */ -}; - -/* state of the call inside the control session */ -enum pptp_ctrlcall_state { - PPTP_CALL_NONE, - PPTP_CALL_ERROR, - PPTP_CALL_OUT_REQ, - PPTP_CALL_OUT_CONF, - PPTP_CALL_IN_REQ, - PPTP_CALL_IN_REP, - PPTP_CALL_IN_CONF, - PPTP_CALL_CLEAR_REQ, -}; - - -/* conntrack private data */ -struct ip_ct_pptp_master { - enum pptp_ctrlsess_state sstate; /* session state */ - - /* everything below is going to be per-expectation in newnat, - * since there could be more than one call within one session */ - enum pptp_ctrlcall_state cstate; /* call state */ - u_int16_t pac_call_id; /* call id of PAC, host byte order */ - u_int16_t pns_call_id; /* call id of PNS, host byte order */ -}; - -/* conntrack_expect private member */ -struct ip_ct_pptp_expect { - enum pptp_ctrlcall_state cstate; /* call state */ - u_int16_t pac_call_id; /* call id of PAC */ - u_int16_t pns_call_id; /* call id of PNS */ -}; - - -#ifdef __KERNEL__ - -#include -DECLARE_LOCK_EXTERN(ip_pptp_lock); - -#define IP_CONNTR_PPTP PPTP_CONTROL_PORT - -#define PPTP_CONTROL_PORT 1723 - -#define PPTP_PACKET_CONTROL 1 -#define PPTP_PACKET_MGMT 2 - -#define PPTP_MAGIC_COOKIE 0x1a2b3c4d - -struct pptp_pkt_hdr { - __u16 packetLength; - __u16 packetType; - __u32 magicCookie; -}; - -/* PptpControlMessageType values */ -#define 
PPTP_START_SESSION_REQUEST 1 -#define PPTP_START_SESSION_REPLY 2 -#define PPTP_STOP_SESSION_REQUEST 3 -#define PPTP_STOP_SESSION_REPLY 4 -#define PPTP_ECHO_REQUEST 5 -#define PPTP_ECHO_REPLY 6 -#define PPTP_OUT_CALL_REQUEST 7 -#define PPTP_OUT_CALL_REPLY 8 -#define PPTP_IN_CALL_REQUEST 9 -#define PPTP_IN_CALL_REPLY 10 -#define PPTP_IN_CALL_CONNECT 11 -#define PPTP_CALL_CLEAR_REQUEST 12 -#define PPTP_CALL_DISCONNECT_NOTIFY 13 -#define PPTP_WAN_ERROR_NOTIFY 14 -#define PPTP_SET_LINK_INFO 15 - -#define PPTP_MSG_MAX 15 - -/* PptpGeneralError values */ -#define PPTP_ERROR_CODE_NONE 0 -#define PPTP_NOT_CONNECTED 1 -#define PPTP_BAD_FORMAT 2 -#define PPTP_BAD_VALUE 3 -#define PPTP_NO_RESOURCE 4 -#define PPTP_BAD_CALLID 5 -#define PPTP_REMOVE_DEVICE_ERROR 6 - -struct PptpControlHeader { - __u16 messageType; - __u16 reserved; -}; - -/* FramingCapability Bitmap Values */ -#define PPTP_FRAME_CAP_ASYNC 0x1 -#define PPTP_FRAME_CAP_SYNC 0x2 - -/* BearerCapability Bitmap Values */ -#define PPTP_BEARER_CAP_ANALOG 0x1 -#define PPTP_BEARER_CAP_DIGITAL 0x2 - -struct PptpStartSessionRequest { - __u16 protocolVersion; - __u8 reserved1; - __u8 reserved2; - __u32 framingCapability; - __u32 bearerCapability; - __u16 maxChannels; - __u16 firmwareRevision; - __u8 hostName[64]; - __u8 vendorString[64]; -}; - -/* PptpStartSessionResultCode Values */ -#define PPTP_START_OK 1 -#define PPTP_START_GENERAL_ERROR 2 -#define PPTP_START_ALREADY_CONNECTED 3 -#define PPTP_START_NOT_AUTHORIZED 4 -#define PPTP_START_UNKNOWN_PROTOCOL 5 - -struct PptpStartSessionReply { - __u16 protocolVersion; - __u8 resultCode; - __u8 generalErrorCode; - __u32 framingCapability; - __u32 bearerCapability; - __u16 maxChannels; - __u16 firmwareRevision; - __u8 hostName[64]; - __u8 vendorString[64]; -}; - -/* PptpStopReasons */ -#define PPTP_STOP_NONE 1 -#define PPTP_STOP_PROTOCOL 2 -#define PPTP_STOP_LOCAL_SHUTDOWN 3 - -struct PptpStopSessionRequest { - __u8 reason; -}; - -/* PptpStopSessionResultCode */ -#define PPTP_STOP_OK 1 -#define PPTP_STOP_GENERAL_ERROR 2 - -struct PptpStopSessionReply { - __u8 resultCode; - __u8 generalErrorCode; -}; - -struct PptpEchoRequest { - __u32 identNumber; -}; - -/* PptpEchoReplyResultCode */ -#define PPTP_ECHO_OK 1 -#define PPTP_ECHO_GENERAL_ERROR 2 - -struct PptpEchoReply { - __u32 identNumber; - __u8 resultCode; - __u8 generalErrorCode; - __u16 reserved; -}; - -/* PptpFramingType */ -#define PPTP_ASYNC_FRAMING 1 -#define PPTP_SYNC_FRAMING 2 -#define PPTP_DONT_CARE_FRAMING 3 - -/* PptpCallBearerType */ -#define PPTP_ANALOG_TYPE 1 -#define PPTP_DIGITAL_TYPE 2 -#define PPTP_DONT_CARE_BEARER_TYPE 3 - -struct PptpOutCallRequest { - __u16 callID; - __u16 callSerialNumber; - __u32 minBPS; - __u32 maxBPS; - __u32 bearerType; - __u32 framingType; - __u16 packetWindow; - __u16 packetProcDelay; - __u16 reserved1; - __u16 phoneNumberLength; - __u16 reserved2; - __u8 phoneNumber[64]; - __u8 subAddress[64]; -}; - -/* PptpCallResultCode */ -#define PPTP_OUTCALL_CONNECT 1 -#define PPTP_OUTCALL_GENERAL_ERROR 2 -#define PPTP_OUTCALL_NO_CARRIER 3 -#define PPTP_OUTCALL_BUSY 4 -#define PPTP_OUTCALL_NO_DIAL_TONE 5 -#define PPTP_OUTCALL_TIMEOUT 6 -#define PPTP_OUTCALL_DONT_ACCEPT 7 - -struct PptpOutCallReply { - __u16 callID; - __u16 peersCallID; - __u8 resultCode; - __u8 generalErrorCode; - __u16 causeCode; - __u32 connectSpeed; - __u16 packetWindow; - __u16 packetProcDelay; - __u32 physChannelID; -}; - -struct PptpInCallRequest { - __u16 callID; - __u16 callSerialNumber; - __u32 callBearerType; - __u32 physChannelID; - __u16 
dialedNumberLength; - __u16 dialingNumberLength; - __u8 dialedNumber[64]; - __u8 dialingNumber[64]; - __u8 subAddress[64]; -}; - -/* PptpInCallResultCode */ -#define PPTP_INCALL_ACCEPT 1 -#define PPTP_INCALL_GENERAL_ERROR 2 -#define PPTP_INCALL_DONT_ACCEPT 3 - -struct PptpInCallReply { - __u16 callID; - __u16 peersCallID; - __u8 resultCode; - __u8 generalErrorCode; - __u16 packetWindow; - __u16 packetProcDelay; - __u16 reserved; -}; - -struct PptpInCallConnected { - __u16 peersCallID; - __u16 reserved; - __u32 connectSpeed; - __u16 packetWindow; - __u16 packetProcDelay; - __u32 callFramingType; -}; - -struct PptpClearCallRequest { - __u16 callID; - __u16 reserved; -}; - -struct PptpCallDisconnectNotify { - __u16 callID; - __u8 resultCode; - __u8 generalErrorCode; - __u16 causeCode; - __u16 reserved; - __u8 callStatistics[128]; -}; - -struct PptpWanErrorNotify { - __u16 peersCallID; - __u16 reserved; - __u32 crcErrors; - __u32 framingErrors; - __u32 hardwareOverRuns; - __u32 bufferOverRuns; - __u32 timeoutErrors; - __u32 alignmentErrors; -}; - -struct PptpSetLinkInfo { - __u16 peersCallID; - __u16 reserved; - __u32 sendAccm; - __u32 recvAccm; -}; - - -struct pptp_priv_data { - __u16 call_id; - __u16 mcall_id; - __u16 pcall_id; -}; - -union pptp_ctrl_union { - struct PptpStartSessionRequest sreq; - struct PptpStartSessionReply srep; - struct PptpStopSessionRequest streq; - struct PptpStopSessionReply strep; - struct PptpOutCallRequest ocreq; - struct PptpOutCallReply ocack; - struct PptpInCallRequest icreq; - struct PptpInCallReply icack; - struct PptpInCallConnected iccon; - struct PptpClearCallRequest clrreq; - struct PptpCallDisconnectNotify disc; - struct PptpWanErrorNotify wanerr; - struct PptpSetLinkInfo setlink; -}; - -#endif /* __KERNEL__ */ -#endif /* _CONNTRACK_PPTP_H */ diff --git a/include/linux/netfilter_ipv4/ip_conntrack_proto_gre.h b/include/linux/netfilter_ipv4/ip_conntrack_proto_gre.h deleted file mode 100644 index 07646857c..000000000 --- a/include/linux/netfilter_ipv4/ip_conntrack_proto_gre.h +++ /dev/null @@ -1,123 +0,0 @@ -#ifndef _CONNTRACK_PROTO_GRE_H -#define _CONNTRACK_PROTO_GRE_H -#include - -/* GRE PROTOCOL HEADER */ - -/* GRE Version field */ -#define GRE_VERSION_1701 0x0 -#define GRE_VERSION_PPTP 0x1 - -/* GRE Protocol field */ -#define GRE_PROTOCOL_PPTP 0x880B - -/* GRE Flags */ -#define GRE_FLAG_C 0x80 -#define GRE_FLAG_R 0x40 -#define GRE_FLAG_K 0x20 -#define GRE_FLAG_S 0x10 -#define GRE_FLAG_A 0x80 - -#define GRE_IS_C(f) ((f)&GRE_FLAG_C) -#define GRE_IS_R(f) ((f)&GRE_FLAG_R) -#define GRE_IS_K(f) ((f)&GRE_FLAG_K) -#define GRE_IS_S(f) ((f)&GRE_FLAG_S) -#define GRE_IS_A(f) ((f)&GRE_FLAG_A) - -/* GRE is a mess: Four different standards */ -struct gre_hdr { -#if defined(__LITTLE_ENDIAN_BITFIELD) - __u16 rec:3, - srr:1, - seq:1, - key:1, - routing:1, - csum:1, - version:3, - reserved:4, - ack:1; -#elif defined(__BIG_ENDIAN_BITFIELD) - __u16 csum:1, - routing:1, - key:1, - seq:1, - srr:1, - rec:3, - ack:1, - reserved:4, - version:3; -#else -#error "Adjust your defines" -#endif - __u16 protocol; -}; - -/* modified GRE header for PPTP */ -struct gre_hdr_pptp { - __u8 flags; /* bitfield */ - __u8 version; /* should be GRE_VERSION_PPTP */ - __u16 protocol; /* should be GRE_PROTOCOL_PPTP */ - __u16 payload_len; /* size of ppp payload, not inc. gre header */ - __u16 call_id; /* peer's call_id for this session */ - __u32 seq; /* sequence number. 
Present if S==1 */ - __u32 ack; /* seq number of highest packet recieved by */ - /* sender in this session */ -}; - - -/* this is part of ip_conntrack */ -struct ip_ct_gre { - unsigned int stream_timeout; - unsigned int timeout; -}; - -/* this is part of ip_conntrack_expect */ -struct ip_ct_gre_expect { - struct ip_ct_gre_keymap *keymap_orig, *keymap_reply; -}; - -#ifdef __KERNEL__ -struct ip_conntrack_expect; - -/* structure for original <-> reply keymap */ -struct ip_ct_gre_keymap { - struct list_head list; - - struct ip_conntrack_tuple tuple; -}; - - -/* add new tuple->key_reply pair to keymap */ -int ip_ct_gre_keymap_add(struct ip_conntrack_expect *exp, - struct ip_conntrack_tuple *t, - int reply); - -/* change an existing keymap entry */ -void ip_ct_gre_keymap_change(struct ip_ct_gre_keymap *km, - struct ip_conntrack_tuple *t); - -/* delete keymap entries */ -void ip_ct_gre_keymap_destroy(struct ip_conntrack_expect *exp); - - -/* get pointer to gre key, if present */ -static inline u_int32_t *gre_key(struct gre_hdr *greh) -{ - if (!greh->key) - return NULL; - if (greh->csum || greh->routing) - return (u_int32_t *) (greh+sizeof(*greh)+4); - return (u_int32_t *) (greh+sizeof(*greh)); -} - -/* get pointer ot gre csum, if present */ -static inline u_int16_t *gre_csum(struct gre_hdr *greh) -{ - if (!greh->csum) - return NULL; - return (u_int16_t *) (greh+sizeof(*greh)); -} - -#endif /* __KERNEL__ */ - -#endif /* _CONNTRACK_PROTO_GRE_H */ diff --git a/include/linux/netfilter_ipv4/ip_nat_pptp.h b/include/linux/netfilter_ipv4/ip_nat_pptp.h deleted file mode 100644 index eaf66c2e8..000000000 --- a/include/linux/netfilter_ipv4/ip_nat_pptp.h +++ /dev/null @@ -1,11 +0,0 @@ -/* PPTP constants and structs */ -#ifndef _NAT_PPTP_H -#define _NAT_PPTP_H - -/* conntrack private data */ -struct ip_nat_pptp { - u_int16_t pns_call_id; /* NAT'ed PNS call id */ - u_int16_t pac_call_id; /* NAT'ed PAC call id */ -}; - -#endif /* _NAT_PPTP_H */ diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index c70f46a4e..c6f5063f0 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -77,7 +77,6 @@ #define PG_compound 19 /* Part of a compound page */ #define PG_anon 20 /* Anonymous: anon_vma in mapping */ -#define PG_ckrm_account 21 /* This page is accounted by CKRM */ /* diff --git a/include/linux/rbce.h b/include/linux/rbce.h deleted file mode 100644 index 91afba9ba..000000000 --- a/include/linux/rbce.h +++ /dev/null @@ -1,127 +0,0 @@ -/* Rule-based Classification Engine (RBCE) module - * - * Copyright (C) Hubertus Franke, IBM Corp. 2003 - * (C) Chandra Seetharaman, IBM Corp. 2003 - * - * Module for loading of classification policies and providing - * a user API for Class-based Kernel Resource Management (CKRM) - * - * Latest version, more details at http://ckrm.sf.net - * - * This program is free software; you can redistribute it and/or modify it - * under the terms of version 2.1 of the GNU Lesser General Public License - * as published by the Free Software Foundation. - * - * This program is distributed in the hope that it would be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
- * - * - */ - -/* Changes - * - * 25 Mar 2004 - * Integrate RBCE and CRBE into a single module - * - */ - -#ifndef RBCE_H -#define RBCE_H - -// data types defined in main rbcemod.c -struct rbce_private_data; -struct rbce_class; -struct ckrm_core_class; - -#ifndef RBCE_EXTENSION - -/**************************************************************************** - * - * RBCE STANDALONE VERSION, NO CHOICE FOR DATA COLLECTION - * - ****************************************************************************/ - -#ifdef RBCE_SHOW_INCL -#warning " ... RBCE .." -#endif - -#define RBCE_MOD_DESCR "Rule Based Classification Engine Module for CKRM" -#define RBCE_MOD_NAME "rbce" - -/* extension to private data: NONE */ -struct rbce_ext_private_data { - /* empty data */ -}; -static inline void init_ext_private_data(struct rbce_private_data *dst) -{ -} - -/* sending notification to user: NONE */ - -static void notify_class_action(struct rbce_class *cls, int action) -{ -} -static inline void send_fork_notification(struct task_struct *tsk, - struct ckrm_core_class *cls) -{ -} -static inline void send_exit_notification(struct task_struct *tsk) -{ -} -static inline void send_manual_notification(struct task_struct *tsk) -{ -} - -/* extension initialization and destruction at module init and exit */ -static inline int init_rbce_ext_pre(void) -{ - return 0; -} -static inline int init_rbce_ext_post(void) -{ - return 0; -} -static inline void exit_rbce_ext(void) -{ -} - -#else - -/*************************************************************************** - * - * RBCE with User Level Notification - * - ***************************************************************************/ - -#ifdef RBCE_SHOW_INCL -#warning " ... CRBCE .." -#ifdef RBCE_DO_SAMPLE -#warning " ... CRBCE doing sampling ..." -#endif -#ifdef RBCE_DO_DELAY -#warning " ... CRBCE doing delay ..." 
-#endif -#endif - -#define RBCE_MOD_DESCR "Rule Based Classification Engine Module" \ - "with Data Sampling/Delivery for CKRM" -#define RBCE_MOD_NAME "crbce" - -#include - -struct rbce_ext_private_data { - struct task_sample_info sample; -}; - -static void notify_class_action(struct rbce_class *cls, int action); -#if 0 -static void send_fork_notification(struct task_struct *tsk, - struct ckrm_core_class *cls); -static void send_exit_notification(struct task_struct *tsk); -static void send_manual_notification(struct task_struct *tsk); -#endif - -#endif - -#endif // RBCE_H diff --git a/include/linux/rcfs.h b/include/linux/rcfs.h index 13aa5a7d2..232d58ef1 100644 --- a/include/linux/rcfs.h +++ b/include/linux/rcfs.h @@ -71,7 +71,6 @@ extern struct file_operations shares_fileops; extern struct file_operations stats_fileops; extern struct file_operations config_fileops; extern struct file_operations members_fileops; -extern struct file_operations reclassify_fileops; extern struct file_operations rcfs_file_operations; // Callbacks into rcfs from ckrm diff --git a/include/linux/sched.h b/include/linux/sched.h index dd5005295..93f3c3230 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -94,7 +94,7 @@ extern unsigned long avenrun[]; /* Load averages */ extern int nr_threads; extern int last_pid; DECLARE_PER_CPU(unsigned long, process_counts); -// DECLARE_PER_CPU(struct runqueue, runqueues); -- removed after ckrm cpu v7 merge +DECLARE_PER_CPU(struct runqueue, runqueues); extern int nr_processes(void); extern unsigned long nr_running(void); extern unsigned long nr_uninterruptible(void); @@ -264,11 +264,6 @@ struct mm_struct { struct kioctx *ioctx_list; struct kioctx default_kioctx; -#ifdef CONFIG_CKRM_RES_MEM - struct ckrm_mem_res *memclass; - struct list_head tasklist; /* list of all tasks sharing this address space */ - spinlock_t peertask_lock; /* protect above tasklist */ -#endif }; extern int mmlist_nr; @@ -429,25 +424,6 @@ int set_current_groups(struct group_info *group_info); struct audit_context; /* See audit.c */ struct mempolicy; -#ifdef CONFIG_CKRM_CPU_SCHEDULE -/** - * ckrm_cpu_demand_stat - used to track the cpu demand of a task/class - * @run: how much time it has been running since the counter started - * @total: total time since the counter started - * @last_sleep: the last time it sleeps, last_sleep = 0 when not sleeping - * @recalc_interval: how often do we recalculate the cpu_demand - * @cpu_demand: moving average of run/total - */ -struct ckrm_cpu_demand_stat { - unsigned long long run; - unsigned long long total; - unsigned long long last_sleep; - unsigned long long recalc_interval; - unsigned long cpu_demand; /*estimated cpu demand */ -}; -#endif - - struct task_struct { volatile long state; /* -1 unrunnable, 0 runnable, >0 stopped */ struct thread_info *thread_info; @@ -547,6 +523,7 @@ struct task_struct { /* signal handlers */ struct signal_struct *signal; struct sighand_struct *sighand; + sigset_t blocked, real_blocked; struct sigpending pending; @@ -593,8 +570,6 @@ struct task_struct { struct io_context *io_context; - int ioprio; - unsigned long ptrace_message; siginfo_t *last_siginfo; /* For ptrace use. 
*/ @@ -612,14 +587,10 @@ struct task_struct { struct list_head taskclass_link; #ifdef CONFIG_CKRM_CPU_SCHEDULE struct ckrm_cpu_class *cpu_class; - //track cpu demand of this task - struct ckrm_cpu_demand_stat demand_stat; -#endif //CONFIG_CKRM_CPU_SCHEDULE +#endif #endif // CONFIG_CKRM_TYPE_TASKCLASS -#ifdef CONFIG_CKRM_RES_MEM - struct list_head mm_peers; // list of tasks using same mm_struct -#endif // CONFIG_CKRM_RES_MEM #endif // CONFIG_CKRM + struct task_delay_info delays; }; @@ -801,6 +772,83 @@ extern int idle_cpu(int cpu); void yield(void); +/* + * These are the runqueue data structures: + */ +typedef struct runqueue runqueue_t; + +#ifdef CONFIG_CKRM_CPU_SCHEDULE +#include +#endif + +#ifdef CONFIG_CKRM_CPU_SCHEDULE + +/** + * if belong to different class, compare class priority + * otherwise compare task priority + */ +#define TASK_PREEMPTS_CURR(p, rq) \ + (((p)->cpu_class != (rq)->curr->cpu_class) && ((rq)->curr != (rq)->idle))? class_preempts_curr((p),(rq)->curr) : ((p)->prio < (rq)->curr->prio) +#else +#define BITMAP_SIZE ((((MAX_PRIO+1+7)/8)+sizeof(long)-1)/sizeof(long)) +struct prio_array { + unsigned int nr_active; + unsigned long bitmap[BITMAP_SIZE]; + struct list_head queue[MAX_PRIO]; +}; +#define rq_active(p,rq) (rq->active) +#define rq_expired(p,rq) (rq->expired) +#define ckrm_rebalance_tick(j,this_cpu) do {} while (0) +#define TASK_PREEMPTS_CURR(p, rq) \ + ((p)->prio < (rq)->curr->prio) +#endif + +/* + * This is the main, per-CPU runqueue data structure. + * + * Locking rule: those places that want to lock multiple runqueues + * (such as the load balancing or the thread migration code), lock + * acquire operations must be ordered by ascending &runqueue. + */ +struct runqueue { + spinlock_t lock; + + /* + * nr_running and cpu_load should be in the same cacheline because + * remote CPUs use both these fields when doing load calculation. + */ + unsigned long nr_running; +#if defined(CONFIG_SMP) + unsigned long cpu_load; +#endif + unsigned long long nr_switches, nr_preempt; + unsigned long expired_timestamp, nr_uninterruptible; + unsigned long long timestamp_last_tick; + task_t *curr, *idle; + struct mm_struct *prev_mm; +#ifdef CONFIG_CKRM_CPU_SCHEDULE + unsigned long ckrm_cpu_load; + struct classqueue_struct classqueue; +#else + prio_array_t *active, *expired, arrays[2]; +#endif + int best_expired_prio; + atomic_t nr_iowait; + +#ifdef CONFIG_SMP + struct sched_domain *sd; + + /* For active balancing */ + int active_balance; + int push_cpu; + + task_t *migration_thread; + struct list_head migration_queue; +#endif + struct list_head hold_queue; + int idle_tokens; +}; + /* * The default (Linux) execution domain. 
*/ @@ -837,7 +885,6 @@ static inline struct user_struct *get_uid(struct user_struct *u) atomic_inc(&u->__count); return u; } - extern void free_uid(struct user_struct *); extern void switch_uid(struct user_struct *); @@ -943,7 +990,6 @@ static inline int capable(int cap) } #endif - /* * Routines for handling mm_structs */ @@ -1077,7 +1123,7 @@ static inline struct mm_struct * get_task_mm(struct task_struct * task) return mm; } - + /* set thread flags in other task's structures * - see asm/thread_info.h for TIF_xxxx flags available */ @@ -1201,43 +1247,19 @@ static inline void set_task_cpu(struct task_struct *p, unsigned int cpu) #define def_delay_var(var) unsigned long long var #define get_delay(tsk,field) ((tsk)->delays.field) +#define delay_value(x) (((unsigned long)(x))/1000) #define start_delay(var) ((var) = sched_clock()) #define start_delay_set(var,flg) (set_delay_flag(current,flg),(var) = sched_clock()) #define inc_delay(tsk,field) (((tsk)->delays.field)++) +#define add_delay_ts(tsk,field,start_ts,end_ts) ((tsk)->delays.field += delay_value((end_ts)-(start_ts))) +#define add_delay_clear(tsk,field,start_ts,flg) (add_delay_ts(tsk,field,start_ts,sched_clock()),clear_delay_flag(tsk,flg)) -/* because of hardware timer drifts in SMPs and task continue on different cpu - * then where the start_ts was taken there is a possibility that - * end_ts < start_ts by some usecs. In this case we ignore the diff - * and add nothing to the total. - */ -#ifdef CONFIG_SMP -#define test_ts_integrity(start_ts,end_ts) (likely((end_ts) > (start_ts))) -#else -#define test_ts_integrity(start_ts,end_ts) (1) -#endif - -#define add_delay_ts(tsk,field,start_ts,end_ts) \ - do { if (test_ts_integrity(start_ts,end_ts)) (tsk)->delays.field += ((end_ts)-(start_ts)); } while (0) - -#define add_delay_clear(tsk,field,start_ts,flg) \ - do { \ - unsigned long long now = sched_clock();\ - add_delay_ts(tsk,field,start_ts,now); \ - clear_delay_flag(tsk,flg); \ - } while (0) - -static inline void add_io_delay(unsigned long long dstart) +static inline void add_io_delay(unsigned long dstart) { struct task_struct * tsk = current; - unsigned long long now = sched_clock(); - unsigned long long val; - - if (test_ts_integrity(dstart,now)) - val = now - dstart; - else - val = 0; + unsigned long val = delay_value(sched_clock()-dstart); if (test_delay_flag(tsk,PF_MEMIO)) { tsk->delays.mem_iowait_total += val; tsk->delays.num_memwaits++; diff --git a/include/linux/socket.h b/include/linux/socket.h index 602d03b5d..4cd4850d7 100644 --- a/include/linux/socket.h +++ b/include/linux/socket.h @@ -269,9 +269,6 @@ struct ucred { #define SOL_NETBEUI 267 #define SOL_LLC 268 -/* PlanetLab PL2525: reset the context ID of an existing socket */ -#define SO_SETXID SO_PEERCRED - /* IPX options */ #define IPX_TYPE 1 diff --git a/include/linux/taskdelays.h b/include/linux/taskdelays.h index e5682d805..eafb1e77f 100644 --- a/include/linux/taskdelays.h +++ b/include/linux/taskdelays.h @@ -5,7 +5,7 @@ #include struct task_delay_info { -#if defined CONFIG_DELAY_ACCT +#ifdef CONFIG_DELAY_ACCT /* delay statistics in usecs */ uint64_t waitcpu_total; uint64_t runcpu_total; @@ -14,7 +14,7 @@ struct task_delay_info { uint32_t runs; uint32_t num_iowaits; uint32_t num_memwaits; -#endif +#endif }; #endif // _LINUX_TASKDELAYS_H diff --git a/include/linux/tcp.h b/include/linux/tcp.h index 9ed5fac6c..9cdf6963e 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -387,6 +387,7 @@ struct tcp_opt { #ifndef CONFIG_ACCEPT_QUEUES struct open_request 
*accept_queue_tail; #endif + unsigned int keepalive_time; /* time before keep alive takes place */ unsigned int keepalive_intvl; /* time interval between keep alive probes */ int linger2; diff --git a/include/linux/vserver/inode.h b/include/linux/vserver/inode.h index e19632d08..fc49aba6d 100644 --- a/include/linux/vserver/inode.h +++ b/include/linux/vserver/inode.h @@ -57,10 +57,6 @@ extern int vc_set_iattr_v0(uint32_t, void __user *); extern int vc_get_iattr(uint32_t, void __user *); extern int vc_set_iattr(uint32_t, void __user *); -extern int vc_iattr_ioctl(struct dentry *de, - unsigned int cmd, - unsigned long arg); - #endif /* __KERNEL__ */ /* inode ioctls */ @@ -68,7 +64,4 @@ extern int vc_iattr_ioctl(struct dentry *de, #define FIOC_GETXFLG _IOR('x', 5, long) #define FIOC_SETXFLG _IOW('x', 6, long) -#define FIOC_GETIATTR _IOR('x', 7, long) -#define FIOC_SETIATTR _IOR('x', 8, long) - #endif /* _VX_INODE_H */ diff --git a/include/net/sock.h b/include/net/sock.h index a487663e0..a2aba080f 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -1086,10 +1086,8 @@ static inline int sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) * packet. */ if (inet_stream_ops.bind != inet_bind && - (int) sk->sk_xid > 0 && sk->sk_xid != skb->xid) { - err = -EPERM; + (int) sk->sk_xid >= 0 && sk->sk_xid != skb->xid) goto out; - } /* Cast skb->rcvbuf to unsigned... It's pointless, but reduces number of warnings when compiling with -W --ANK diff --git a/init/Kconfig b/init/Kconfig index 64ca2fcb7..89ec58c3f 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -115,18 +115,6 @@ config BSD_PROCESS_ACCT up to the user level program to do useful things with this information. This is generally a good idea, so say Y. -config BSD_PROCESS_ACCT_V3 - bool "BSD Process Accounting version 3 file format" - depends on BSD_PROCESS_ACCT - default n - help - If you say Y here, the process accounting information is written - in a new file format that also logs the process IDs of each - process and it's parent. Note that this file format is incompatible - with previous v0/v1/v2 file formats, so you will need updated tools - for processing it. A preliminary version of these tools is available - at . - menu "Class Based Kernel Resource Management" config CKRM @@ -175,45 +163,21 @@ config CKRM_RES_NUMTASKS config CKRM_CPU_SCHEDULE bool "CKRM CPU scheduler" depends on CKRM_TYPE_TASKCLASS - default y + default m help Use CKRM CPU scheduler instead of Linux Scheduler Say N if unsure, Y to use the feature. -config CKRM_RES_BLKIO - tristate " Disk I/O Resource Controller" - depends on CKRM_TYPE_TASKCLASS && IOSCHED_CFQ +config CKRM_CPU_MONITOR + bool "CKRM CPU Resoure Monitor" + depends on CKRM_CPU_SCHEDULE default m help - Provides a resource controller for best-effort block I/O - bandwidth control. The controller attempts this by proportional - servicing of requests in the I/O scheduler. However, seek - optimizations and reordering by device drivers/disk controllers may - alter the actual bandwidth delivered to a class. + Monitor CPU Resource Usage of the classes Say N if unsure, Y to use the feature. -config CKRM_RES_MEM - bool "Class based physical memory controller" - default y - depends on CKRM - help - Provide the basic support for collecting physical memory usage information - among classes. Say Y if you want to know the memory usage of each class. 
- -config CKRM_MEM_LRUORDER_CHANGE - bool "Change the LRU ordering of scanned pages" - default n - depends on CKRM_RES_MEM - help - While trying to free pages, by default(n), scanned pages are left were they - are found if they belong to relatively under-used class. In this case the - LRU ordering of the memory subsystemis left intact. If this option is chosen, - then the scanned pages are moved to the tail of the list(active or inactive). - Changing this to yes reduces the checking overhead but violates the approximate - LRU order that is maintained by the paging subsystem. - config CKRM_TYPE_SOCKETCLASS bool "Class Manager for socket groups" depends on CKRM @@ -262,6 +226,18 @@ config CKRM_CRBCE endmenu +config BSD_PROCESS_ACCT_V3 + bool "BSD Process Accounting version 3 file format" + depends on BSD_PROCESS_ACCT + default n + help + If you say Y here, the process accounting information is written + in a new file format that also logs the process IDs of each + process and it's parent. Note that this file format is incompatible + with previous v0/v1/v2 file formats, so you will need updated tools + for processing it. A preliminary version of these tools is available + at . + config SYSCTL bool "Sysctl support" ---help--- @@ -353,22 +329,6 @@ config IKCONFIG_PROC This option enables access to the kernel configuration file through /proc/config.gz. -config OOM_PANIC - bool "OOM Panic" - default y - ---help--- - This option enables panic() to be called when a system is out of - memory. This feature along with /proc/sys/kernel/panic allows a - different behavior on out-of-memory conditions when the standard - behavior (killing processes in an attempt to recover) does not - make sense. - - If unsure, say N. - -config OOM_KILL - bool - depends on !OOM_PANIC - default y menuconfig EMBEDDED bool "Configure standard kernel features (for small systems)" diff --git a/init/main.c b/init/main.c index 6416eab8d..e93d25685 100644 --- a/init/main.c +++ b/init/main.c @@ -55,7 +55,6 @@ int __init init_ckrm_sched_res(void); #else #define init_ckrm_sched_res() ((void)0) #endif -//#include /* * This is one of the first .c files built. Error out early @@ -477,7 +476,6 @@ asmlinkage void __init start_kernel(void) * printk() and can access its per-cpu storage. */ smp_prepare_boot_cpu(); - /* * Set up the scheduler prior starting any interrupts (such as the * timer interrupt). Full topology setup happens at smp_init() @@ -697,9 +695,7 @@ static int init(void * unused) * firmware files. 
*/ populate_rootfs(); - do_basic_setup(); - init_ckrm_sched_res(); sched_init_smp(); diff --git a/kernel/Makefile b/kernel/Makefile index ec5001052..905f3c59d 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -27,9 +27,12 @@ obj-$(CONFIG_COMPAT) += compat.o obj-$(CONFIG_IKCONFIG) += configs.o obj-$(CONFIG_IKCONFIG_PROC) += configs.o obj-$(CONFIG_STOP_MACHINE) += stop_machine.o -obj-$(CONFIG_CKRM_CPU_SCHEDULE) += ckrm_classqueue.o ckrm_sched.o +obj-$(CONFIG_CKRM_CPU_SCHEDULE) += ckrm_classqueue.o +obj-$(CONFIG_CKRM_CPU_SCHEDULE) += ckrm_sched.o obj-$(CONFIG_AUDIT) += audit.o obj-$(CONFIG_AUDITSYSCALL) += auditsc.o +obj-$(CONFIG_KGDB) += kgdbstub.o + ifneq ($(CONFIG_IA64),y) # According to Alan Modra , the -fno-omit-frame-pointer is diff --git a/kernel/ckrm/Makefile b/kernel/ckrm/Makefile index b32530977..3da88775d 100644 --- a/kernel/ckrm/Makefile +++ b/kernel/ckrm/Makefile @@ -3,11 +3,11 @@ # ifeq ($(CONFIG_CKRM),y) - obj-y = ckrm.o ckrmutils.o ckrm_numtasks_stub.o rbce/ + obj-y = ckrm.o ckrmutils.o ckrm_tasks_stub.o rbce/ endif obj-$(CONFIG_CKRM_TYPE_TASKCLASS) += ckrm_tc.o - obj-$(CONFIG_CKRM_RES_NUMTASKS) += ckrm_numtasks.o + obj-$(CONFIG_CKRM_RES_NUMTASKS) += ckrm_tasks.o obj-$(CONFIG_CKRM_TYPE_SOCKETCLASS) += ckrm_sockc.o - obj-$(CONFIG_CKRM_RES_LISTENAQ) += ckrm_laq.o - obj-$(CONFIG_CKRM_CPU_SCHEDULE) += ckrm_cpu_class.o ckrm_cpu_monitor.o - obj-$(CONFIG_CKRM_RES_MEM) += ckrm_mem.o + obj-$(CONFIG_CKRM_RES_LISTENAQ) += ckrm_listenaq.o + obj-$(CONFIG_CKRM_CPU_SCHEDULE) += ckrm_cpu_class.o + obj-$(CONFIG_CKRM_CPU_MONITOR) += ckrm_cpu_monitor.o diff --git a/kernel/ckrm/ckrm.c b/kernel/ckrm/ckrm.c index f1cfb268c..5217ea003 100644 --- a/kernel/ckrm/ckrm.c +++ b/kernel/ckrm/ckrm.c @@ -142,7 +142,7 @@ EXPORT_SYMBOL(ckrm_classobj); static inline void set_callbacks_active(struct ckrm_classtype *ctype) { - ctype->ce_cb_active = ((atomic_read(&ctype->ce_regd) > 0) && + ctype->ce_cb_active = ((atomic_read(&ctype->ce_nr_users) > 0) && (ctype->ce_callbacks.always_callback || (ctype->num_classes > 1))); } @@ -176,11 +176,10 @@ int ckrm_register_engine(const char *typename, ckrm_eng_callback_t * ecbs) if (ctype == NULL) return (-ENOENT); - atomic_inc(&ctype->ce_regd); - - /* another engine registered or trying to register ? */ - if (atomic_read(&ctype->ce_regd) != 1) { - atomic_dec(&ctype->ce_regd); + ce_protect(ctype); + if (atomic_read(&ctype->ce_nr_users) != 1) { + // Some engine is acive, deregister it first. + ce_release(ctype); return (-EBUSY); } @@ -193,10 +192,17 @@ int ckrm_register_engine(const char *typename, ckrm_eng_callback_t * ecbs) if (!(((ecbs->classify) && (ecbs->class_delete)) || (ecbs->notify)) || (ecbs->c_interest && ecbs->classify == NULL) || (ecbs->n_interest && ecbs->notify == NULL)) { - atomic_dec(&ctype->ce_regd); + ce_release(ctype); return (-EINVAL); } + /* Is any other engine registered for this classtype ? */ + if (ctype->ce_regd) { + ce_release(ctype); + return (-EINVAL); + } + + ctype->ce_regd = 1; ctype->ce_callbacks = *ecbs; set_callbacks_active(ctype); @@ -229,12 +235,13 @@ int ckrm_unregister_engine(const char *typename) ctype->ce_cb_active = 0; - if (atomic_read(&ctype->ce_nr_users) > 1) { + if (atomic_dec_and_test(&ctype->ce_nr_users) != 1) { // Somebody is currently using the engine, cannot deregister. 
- return (-EAGAIN); + atomic_inc(&ctype->ce_nr_users); + return (-EBUSY); } - atomic_set(&ctype->ce_regd, 0); + ctype->ce_regd = 0; memset(&ctype->ce_callbacks, 0, sizeof(ckrm_eng_callback_t)); return 0; } @@ -444,7 +451,7 @@ ckrm_init_core_class(struct ckrm_classtype *clstype, CLS_DEBUG("name %s => %p\n", name ? name : "default", dcore); if ((dcore != clstype->default_class) && (!ckrm_is_core_valid(parent))){ - printk(KERN_DEBUG "error not a valid parent %p\n", parent); + printk("error not a valid parent %p\n", parent); return -EINVAL; } #if 0 @@ -456,7 +463,7 @@ ckrm_init_core_class(struct ckrm_classtype *clstype, (void **)kmalloc(clstype->max_resid * sizeof(void *), GFP_KERNEL); if (dcore->res_class == NULL) { - printk(KERN_DEBUG "error no mem\n"); + printk("error no mem\n"); return -ENOMEM; } } @@ -532,10 +539,10 @@ void ckrm_free_core_class(struct ckrm_core_class *core) parent->name); if (core->delayed) { /* this core was marked as late */ - printk(KERN_DEBUG "class <%s> finally deleted %lu\n", core->name, jiffies); + printk("class <%s> finally deleted %lu\n", core->name, jiffies); } if (ckrm_remove_child(core) == 0) { - printk(KERN_DEBUG "Core class removal failed. Chilren present\n"); + printk("Core class removal failed. Chilren present\n"); } for (i = 0; i < clstype->max_resid; i++) { @@ -656,7 +663,7 @@ ckrm_register_res_ctlr(struct ckrm_classtype *clstype, ckrm_res_ctlr_t * rcbs) */ read_lock(&ckrm_class_lock); list_for_each_entry(core, &clstype->classes, clslist) { - printk(KERN_INFO "CKRM .. create res clsobj for resouce <%s>" + printk("CKRM .. create res clsobj for resouce <%s>" "class <%s> par=%p\n", rcbs->res_name, core->name, core->hnode.parent); ckrm_alloc_res_class(core, core->hnode.parent, resid); @@ -833,7 +840,7 @@ int ckrm_unregister_event_set(struct ckrm_event_spec especs[]) } #define ECC_PRINTK(fmt, args...) 
\ -// printk(KERN_DEBUG "%s: " fmt, __FUNCTION__ , ## args) +// printk("%s: " fmt, __FUNCTION__ , ## args) void ckrm_invoke_event_cb_chain(enum ckrm_event ev, void *arg) { @@ -978,7 +985,7 @@ void ckrm_cb_exit(struct task_struct *tsk) void __init ckrm_init(void) { - printk(KERN_DEBUG "CKRM Initialization\n"); + printk("CKRM Initialization\n"); // register/initialize the Metatypes @@ -996,7 +1003,7 @@ void __init ckrm_init(void) #endif // prepare init_task and then rely on inheritance of properties ckrm_cb_newtask(&init_task); - printk(KERN_DEBUG "CKRM Initialization done\n"); + printk("CKRM Initialization done\n"); } EXPORT_SYMBOL(ckrm_register_engine); diff --git a/kernel/ckrm/ckrm_cpu_class.c b/kernel/ckrm/ckrm_cpu_class.c index 917875b18..0ded7f3c6 100644 --- a/kernel/ckrm/ckrm_cpu_class.c +++ b/kernel/ckrm/ckrm_cpu_class.c @@ -23,32 +23,17 @@ #include #include -struct ckrm_res_ctlr cpu_rcbs; -/** - * insert_cpu_class - insert a class to active_cpu_class list - * - * insert the class in decreasing order of class weight - */ -static inline void insert_cpu_class(struct ckrm_cpu_class *cls) -{ - list_add(&cls->links,&active_cpu_classes); -} +struct ckrm_res_ctlr cpu_rcbs; /* * initialize a class object and its local queues */ -void init_cpu_class(struct ckrm_cpu_class *cls,ckrm_shares_t* shares) + static void init_cpu_class(struct ckrm_cpu_class *cls,ckrm_shares_t* shares) { int i,j,k; prio_array_t *array; - ckrm_lrq_t* queue; - - cls->shares = *shares; - cls->cnt_lock = SPIN_LOCK_UNLOCKED; - ckrm_cpu_stat_init(&cls->stat); - ckrm_usage_init(&cls->usage); - cls->magic = CKRM_CPU_CLASS_MAGIC; + struct ckrm_local_runqueue* queue; for (i = 0 ; i < NR_CPUS ; i++) { queue = &cls->local_queues[i]; @@ -73,37 +58,34 @@ void init_cpu_class(struct ckrm_cpu_class *cls,ckrm_shares_t* shares) queue->top_priority = MAX_PRIO; cq_node_init(&queue->classqueue_linkobj); queue->local_cvt = 0; - queue->lrq_load = 0; - queue->local_weight = cpu_class_weight(cls); + queue->uncounted_cvt = 0; queue->uncounted_ns = 0; - queue->savings = 0; queue->magic = 0x43FF43D7; } + cls->shares = *shares; + cls->global_cvt = 0; + cls->cnt_lock = SPIN_LOCK_UNLOCKED; + ckrm_cpu_stat_init(&cls->stat); + // add to class list write_lock(&class_list_lock); - insert_cpu_class(cls); + list_add(&cls->links,&active_cpu_classes); write_unlock(&class_list_lock); } static inline void set_default_share(ckrm_shares_t *shares) { shares->my_guarantee = 0; - shares->total_guarantee = CKRM_SHARE_DFLT_TOTAL_GUARANTEE; - shares->unused_guarantee = CKRM_SHARE_DFLT_TOTAL_GUARANTEE; shares->my_limit = CKRM_SHARE_DFLT_MAX_LIMIT; + shares->total_guarantee = CKRM_SHARE_DFLT_TOTAL_GUARANTEE; shares->max_limit = CKRM_SHARE_DFLT_MAX_LIMIT; - shares->cur_max_limit = 0; + shares->unused_guarantee = CKRM_SHARE_DFLT_TOTAL_GUARANTEE; + shares->cur_max_limit = CKRM_SHARE_DFLT_MAX_LIMIT; } -struct ckrm_cpu_class * ckrm_get_cpu_class(struct ckrm_core_class *core) -{ - struct ckrm_cpu_class * cls; - cls = ckrm_get_res_class(core, cpu_rcbs.resid, struct ckrm_cpu_class); - if (valid_cpu_class(cls)) - return cls; - else - return NULL; +struct ckrm_cpu_class * ckrm_get_cpu_class(struct ckrm_core_class *core) { + return ckrm_get_res_class(core, cpu_rcbs.resid, struct ckrm_cpu_class); } @@ -112,7 +94,7 @@ void* ckrm_alloc_cpu_class(struct ckrm_core_class *core, struct ckrm_core_class struct ckrm_cpu_class *cls; if (! 
parent) /*root class*/ - cls = get_default_cpu_class(); + cls = default_cpu_class; else cls = (struct ckrm_cpu_class *) kmalloc(sizeof(struct ckrm_cpu_class),GFP_ATOMIC); @@ -131,7 +113,7 @@ void* ckrm_alloc_cpu_class(struct ckrm_core_class *core, struct ckrm_core_class cls->parent = parent; } } else - printk(KERN_ERR"alloc_cpu_class failed\n"); + printk("alloc_cpu_class failed GFP_ATOMIC\n"); return cls; } @@ -150,7 +132,7 @@ static void ckrm_free_cpu_class(void *my_res) return; /*the default class can't be freed*/ - if (cls == get_default_cpu_class()) + if (cls == default_cpu_class) return; // Assuming there will be no children when this function is called @@ -180,9 +162,6 @@ static void ckrm_free_cpu_class(void *my_res) write_unlock(&class_list_lock); kfree(cls); - - //call ckrm_cpu_monitor after class removed - ckrm_cpu_monitor(0); } /* @@ -208,28 +187,18 @@ int ckrm_cpu_set_share(void *my_res, struct ckrm_shares *new_share) parres = NULL; } - /* - * hzheng: CKRM_SHARE_DONTCARE should be handled - */ - if (new_share->my_guarantee == CKRM_SHARE_DONTCARE) - new_share->my_guarantee = 0; - rc = set_shares(new_share, cur, par); - if (cur->my_limit == CKRM_SHARE_DONTCARE) - cur->my_limit = cur->max_limit; - spin_unlock(&cls->cnt_lock); if (cls->parent) { spin_unlock(&parres->cnt_lock); } - - //call ckrm_cpu_monitor after changes are changed - ckrm_cpu_monitor(0); - return rc; } +/* + * translate the global_CVT to ticks + */ static int ckrm_cpu_get_share(void *my_res, struct ckrm_shares *shares) { @@ -244,59 +213,64 @@ static int ckrm_cpu_get_share(void *my_res, int ckrm_cpu_get_stats(void *my_res, struct seq_file * sfile) { struct ckrm_cpu_class *cls = my_res; - struct ckrm_cpu_class_stat* stat = &cls->stat; - ckrm_lrq_t* lrq; - int i; if (!cls) return -EINVAL; seq_printf(sfile, "-------- CPU Class Status Start---------\n"); - seq_printf(sfile, "Share:\n\tgrt= %d limit= %d total_grt= %d max_limit= %d\n", + seq_printf(sfile, " gua= %d limit= %d\n", cls->shares.my_guarantee, - cls->shares.my_limit, + cls->shares.my_limit); + seq_printf(sfile, " total_gua= %d limit= %d\n", cls->shares.total_guarantee, cls->shares.max_limit); - seq_printf(sfile, "\tunused_grt= %d cur_max_limit= %d\n", + seq_printf(sfile, " used_gua= %d cur_limit= %d\n", cls->shares.unused_guarantee, cls->shares.cur_max_limit); - seq_printf(sfile, "Effective:\n\tegrt= %d\n",stat->egrt); - seq_printf(sfile, "\tmegrt= %d\n",stat->megrt); - seq_printf(sfile, "\tehl= %d\n",stat->ehl); - seq_printf(sfile, "\tmehl= %d\n",stat->mehl); - seq_printf(sfile, "\teshare= %d\n",stat->eshare); - seq_printf(sfile, "\tmeshare= %d\n",cpu_class_weight(cls)); - seq_printf(sfile, "\tmax_demand= %lu\n",stat->max_demand); - seq_printf(sfile, "\ttotal_ns= %llu\n",stat->total_ns); - seq_printf(sfile, "\tusage(2,10,60)= %d %d %d\n", - get_ckrm_usage(cls,2*HZ), - get_ckrm_usage(cls,10*HZ), - get_ckrm_usage(cls,60*HZ) - ); - for_each_online_cpu(i) { - lrq = get_ckrm_lrq(cls,i); - seq_printf(sfile, "\tlrq %d demand= %lu weight= %d lrq_load= %lu cvt= %llu sav= %llu\n",i,stat->local_stats[i].cpu_demand,local_class_weight(lrq),lrq->lrq_load,lrq->local_cvt,lrq->savings); - } - + seq_printf(sfile, " Share= %d\n",cpu_class_weight(cls)); + seq_printf(sfile, " cvt= %llu\n",cls->local_queues[0].local_cvt); + seq_printf(sfile, " total_ns= %llu\n",cls->stat.total_ns); + seq_printf(sfile, " prio= %d\n",cls->local_queues[0].classqueue_linkobj.prio); + seq_printf(sfile, " index= %d\n",cls->local_queues[0].classqueue_linkobj.index); + seq_printf(sfile, " run= 
%llu\n",cls->stat.local_stats[0].run); + seq_printf(sfile, " total= %llu\n",cls->stat.local_stats[0].total); + seq_printf(sfile, " cpu_demand= %lu\n",cls->stat.cpu_demand); + + seq_printf(sfile, " effective_guarantee= %d\n",cls->stat.effective_guarantee); + seq_printf(sfile, " effective_limit= %d\n",cls->stat.effective_limit); + seq_printf(sfile, " effective_share= %d\n",cls->stat.effective_share); seq_printf(sfile, "-------- CPU Class Status END ---------\n"); + return 0; } /* * task will remain in the same cpu but on a different local runqueue */ -void ckrm_cpu_change_class(void *task, void *old, void *new) +static void ckrm_cpu_change_class(void *task, void *old, void *new) { struct task_struct *tsk = task; struct ckrm_cpu_class *newcls = new; + unsigned long flags; + struct runqueue *rq; + prio_array_t *array; /*sanity checking*/ if (!task || ! old || !new) return; - _ckrm_cpu_change_class(tsk,newcls); + rq = task_rq_lock(tsk,&flags); + array = tsk->array; + if (array) { + dequeue_task(tsk,array); + tsk->cpu_class = newcls; + enqueue_task(tsk,rq_active(tsk,rq)); + } else { + tsk->cpu_class = newcls; + } + task_rq_unlock(rq,&flags); } /*dummy function, not used*/ @@ -318,12 +292,12 @@ static int ckrm_cpu_set_config(void *my_res, const char *cfgstr) if (!cls) return -EINVAL; - printk(KERN_DEBUG "ckrm_cpu config='%s'\n",cfgstr); + printk("ckrm_cpu config='%s'\n",cfgstr); return 0; } struct ckrm_res_ctlr cpu_rcbs = { - .res_name = "cpu", + .res_name = "CKRM CPU Class", .res_hdepth = 1, .resid = -1, .res_alloc = ckrm_alloc_cpu_class, @@ -349,7 +323,7 @@ int __init init_ckrm_sched_res(void) if (resid == -1) { /*not registered */ resid = ckrm_register_res_ctlr(clstype,&cpu_rcbs); - printk(KERN_DEBUG "........init_ckrm_sched_res , resid= %d\n",resid); + printk("........init_ckrm_sched_res , resid= %d\n",resid); } return 0; } @@ -365,11 +339,10 @@ void init_cpu_classes(void) //init classqueues for each processor for (i=0; i < NR_CPUS; i++) classqueue_init(get_cpu_classqueue(i)); - - /* - * hzheng: initialize the default cpu class - * required for E14/E15 since ckrm_init is called after sched_init - */ +/* + * hzheng: initialize the default cpu class + * required for E14 since ckrm_init is called after sched_init + */ ckrm_alloc_cpu_class(NULL,NULL); } diff --git a/kernel/ckrm/ckrm_cpu_monitor.c b/kernel/ckrm/ckrm_cpu_monitor.c index d8c199a20..674ee6e50 100644 --- a/kernel/ckrm/ckrm_cpu_monitor.c +++ b/kernel/ckrm/ckrm_cpu_monitor.c @@ -28,84 +28,36 @@ #include #include -#define CPU_MONITOR_INTERVAL (HZ) /*how often do we adjust the shares*/ +#define CPU_MONITOR_INTERVAL (4*HZ) /*how often do we adjust the shares*/ +#define CKRM_SHARE_ACCURACY 7 #define CKRM_SHARE_MAX (1<shares.my_limit; -} - -static inline int get_mysoft_limit(struct ckrm_cpu_class *cls) -{ - return cls->shares.total_guarantee; -} - -static inline int get_hard_limit(struct ckrm_cpu_class *cls) -{ - return cls->shares.total_guarantee; -} - -static inline int get_myhard_limit(struct ckrm_cpu_class *cls) -{ - return cls->shares.total_guarantee; -} - - -static inline void cpu_demand_stat_init(struct ckrm_cpu_demand_stat* local_stat, int type) -{ - unsigned long long now = sched_clock(); - - local_stat->run = 0; - local_stat->total = 0; - local_stat->last_sleep = now; - switch (type) { - case CPU_DEMAND_TP_CLASS: - local_stat->recalc_interval = CPU_DEMAND_CLASS_RECALC; - local_stat->cpu_demand = 0; - break; - case CPU_DEMAND_TP_TASK: - local_stat->recalc_interval = CPU_DEMAND_TASK_RECALC; - //for task, the init cpu_demand is copied 
from its parent - break; - default: - BUG(); - } -} void ckrm_cpu_stat_init(struct ckrm_cpu_class_stat *stat) { int i; + struct ckrm_cpu_class_local_stat* local_stat; + unsigned long long now = sched_clock(); stat->stat_lock = SPIN_LOCK_UNLOCKED; stat->total_ns = 0; - stat->max_demand = 0; + stat->cpu_demand = 0; for (i=0; i< NR_CPUS; i++) { - cpu_demand_stat_init(&stat->local_stats[i],CPU_DEMAND_TP_CLASS); + local_stat = &stat->local_stats[i]; + local_stat->run = 0; + local_stat->total = 0; + local_stat->last_sleep = now; + local_stat->cpu_demand = 0; } - stat->egrt = 0; - stat->megrt = 0; - stat->ehl = CKRM_SHARE_MAX; /*default: no limit*/ - stat->mehl = CKRM_SHARE_MAX; /*default: no limit */ - - stat->eshare = CKRM_SHARE_MAX; - stat->meshare = CKRM_SHARE_MAX; + stat->effective_guarantee = 0; + stat->effective_limit = 0; + stat->glut = 0; + stat->effective_share = 100; + stat->self_effective_share = 100; } - /**********************************************/ /* cpu demand */ /**********************************************/ @@ -125,42 +77,52 @@ void ckrm_cpu_stat_init(struct ckrm_cpu_class_stat *stat) */ /** - * update_cpu_demand_stat - + * update_cpu_demand - update a state change * - * should be called whenever the state of a task/task local queue changes + * should be called whenever the state of a local queue changes * -- when deschedule : report how much run * -- when enqueue: report how much sleep * - * how often should we recalculate the cpu demand - * the number is in ns + * to deal with excessive long run/sleep state + * -- whenever the the ckrm_cpu_monitor is called, check if the class is in sleep state, if yes, then update sleep record */ -static inline void update_cpu_demand_stat(struct ckrm_cpu_demand_stat* local_stat,int state, unsigned long long len) +#define CKRM_CPU_DEMAND_RUN 0 +#define CKRM_CPU_DEMAND_SLEEP 1 +//how often should we recalculate the cpu demand, in ns +#define CPU_DEMAND_CAL_THRESHOLD (1000000000LL) +static inline void update_local_cpu_demand(struct ckrm_cpu_class_local_stat* local_stat,int state, unsigned long long len) { local_stat->total += len; if (state == CKRM_CPU_DEMAND_RUN) local_stat->run += len; - if (local_stat->total >= local_stat->recalc_interval) { + if (local_stat->total >= CPU_DEMAND_CAL_THRESHOLD) { local_stat->total >>= CKRM_SHARE_ACCURACY; - if (unlikely(local_stat->run > 0xFFFFFFFF)) - local_stat->run = 0xFFFFFFFF; - - if (local_stat->total > 0xFFFFFFFF) + if (local_stat->total > 0xFFFFFFFF) local_stat->total = 0xFFFFFFFF; - - do_div(local_stat->run,(unsigned long)local_stat->total); - if (local_stat->total > 0xFFFFFFFF) //happens after very long sleep - local_stat->cpu_demand = local_stat->run; - else { - local_stat->cpu_demand += local_stat->run; - local_stat->cpu_demand >>= 1; - } + do_div(local_stat->run,(unsigned long)local_stat->total); + local_stat->cpu_demand +=local_stat->run; + local_stat->cpu_demand >>= 1; local_stat->total = 0; local_stat->run = 0; } } +static inline void cpu_demand_update_run(struct ckrm_cpu_class_local_stat* local_stat, unsigned long long len) +{ + update_local_cpu_demand(local_stat,CKRM_CPU_DEMAND_RUN,len); +} + +static inline void cpu_demand_update_sleep(struct ckrm_cpu_class_local_stat* local_stat, unsigned long long len) +{ + update_local_cpu_demand(local_stat,CKRM_CPU_DEMAND_SLEEP,len); +} + +#define CPU_DEMAND_ENQUEUE 0 +#define CPU_DEMAND_DEQUEUE 1 +#define CPU_DEMAND_DESCHEDULE 2 + /** * cpu_demand_event - and cpu_demand event occured * @event: one of the following three events: @@ -169,24 +131,19 @@ 
static inline void update_cpu_demand_stat(struct ckrm_cpu_demand_stat* local_sta * CPU_DEMAND_DESCHEDULE: one task belong a certain local class deschedule * @len: valid only for CPU_DEMAND_DESCHEDULE, how long the task has been run */ -void cpu_demand_event(struct ckrm_cpu_demand_stat* local_stat, int event, unsigned long long len) +void cpu_demand_event(struct ckrm_cpu_class_local_stat* local_stat, int event, unsigned long long len) { switch (event) { case CPU_DEMAND_ENQUEUE: len = sched_clock() - local_stat->last_sleep; local_stat->last_sleep = 0; - update_cpu_demand_stat(local_stat,CKRM_CPU_DEMAND_SLEEP,len); + cpu_demand_update_sleep(local_stat,len); break; case CPU_DEMAND_DEQUEUE: - if (! local_stat->last_sleep) { - local_stat->last_sleep = sched_clock(); - } + local_stat->last_sleep = sched_clock(); break; case CPU_DEMAND_DESCHEDULE: - update_cpu_demand_stat(local_stat,CKRM_CPU_DEMAND_RUN,len); - break; - case CPU_DEMAND_INIT: //for task init only - cpu_demand_stat_init(local_stat,CPU_DEMAND_TP_TASK); + cpu_demand_update_run(local_stat,len); break; default: BUG(); @@ -195,19 +152,18 @@ void cpu_demand_event(struct ckrm_cpu_demand_stat* local_stat, int event, unsign /** * check all the class local queue - * - * to deal with excessive long run/sleep state - * -- whenever the the ckrm_cpu_monitor is called, check if the class is in sleep state, if yes, then update sleep record + * if local queueu is not in runqueue, then it's in sleep state + * if compare to last sleep, */ static inline void cpu_demand_check_sleep(struct ckrm_cpu_class_stat *stat, int cpu) { - struct ckrm_cpu_demand_stat * local_stat = &stat->local_stats[cpu]; + struct ckrm_cpu_class_local_stat * local_stat = &stat->local_stats[cpu]; unsigned long long sleep,now; if (local_stat->last_sleep) { now = sched_clock(); sleep = now - local_stat->last_sleep; local_stat->last_sleep = now; - update_cpu_demand_stat(local_stat,CKRM_CPU_DEMAND_SLEEP,sleep); + cpu_demand_update_sleep(local_stat,sleep); } } @@ -216,72 +172,51 @@ static inline void cpu_demand_check_sleep(struct ckrm_cpu_class_stat *stat, int * * self_cpu_demand = sum(cpu demand of all local queues) */ -static inline unsigned long get_self_cpu_demand(struct ckrm_cpu_class_stat *stat) +static unsigned long get_self_cpu_demand(struct ckrm_cpu_class_stat + *stat) { int cpu_demand = 0; int i; - int cpuonline = 0; for_each_online_cpu(i) { cpu_demand_check_sleep(stat,i); cpu_demand += stat->local_stats[i].cpu_demand; - cpuonline ++; } - return (cpu_demand/cpuonline); + if (cpu_demand > CKRM_SHARE_MAX) + cpu_demand = CKRM_SHARE_MAX; + return cpu_demand; } /* - * my max demand = min(cpu_demand, my effective hard limit) + * update effective cpu demand for each class + * assume the root_core->parent == NULL */ -static inline unsigned long get_mmax_demand(struct ckrm_cpu_class_stat* stat) -{ - unsigned long mmax_demand = get_self_cpu_demand(stat); - if (mmax_demand > stat->mehl) - mmax_demand = stat->mehl; - - return mmax_demand; -} - -/** - * update_max_demand: update effective cpu demand for each class - * return -1 on error - * - * Assume: the root_core->parent == NULL - */ -static int update_max_demand(struct ckrm_core_class *root_core) +static void update_cpu_demand(struct ckrm_core_class *root_core) { struct ckrm_core_class *cur_core, *child_core; - struct ckrm_cpu_class *cls,*c_cls; - int ret = -1; + struct ckrm_cpu_class *cls; cur_core = root_core; child_core = NULL; - - repeat: - if (!cur_core) { //normal exit - ret = 0; - goto out; - } + /* + * iterate the tree + * 
update cpu_demand of each node + */ + repeat: + if (!cur_core) + return; cls = ckrm_get_cpu_class(cur_core); - if (! cls) //invalid c_cls, abort - goto out; - if (!child_core) //first child - cls->stat.max_demand = get_mmax_demand(&cls->stat); + cls->stat.cpu_demand = get_self_cpu_demand(&cls->stat); else { - c_cls = ckrm_get_cpu_class(child_core); - if (c_cls) - cls->stat.max_demand += c_cls->stat.max_demand; - else //invalid c_cls, abort - goto out; + cls->stat.cpu_demand += + ckrm_get_cpu_class(child_core)->stat.cpu_demand; + if (cls->stat.cpu_demand > CKRM_SHARE_MAX) + cls->stat.cpu_demand = CKRM_SHARE_MAX; } - //check class hard limit - if (cls->stat.max_demand > cls->stat.ehl) - cls->stat.max_demand = cls->stat.ehl; - //next child child_core = ckrm_get_next_child(cur_core, child_core); if (child_core) { @@ -294,123 +229,78 @@ static int update_max_demand(struct ckrm_core_class *root_core) cur_core = child_core->hnode.parent; } goto repeat; - out: - return ret; } /**********************************************/ /* effective guarantee & limit */ /**********************************************/ -static inline void set_eshare(struct ckrm_cpu_class_stat *stat, +static inline void set_effective_share(struct ckrm_cpu_class_stat *stat, int new_share) { if (!new_share) new_share = 1; - - BUG_ON(new_share < 0); - stat->eshare = new_share; + stat->effective_share = new_share; } -static inline void set_meshare(struct ckrm_cpu_class_stat *stat, +static inline void set_self_effective_share(struct ckrm_cpu_class_stat *stat, int new_share) { if (!new_share) new_share = 1; - - BUG_ON(new_share < 0); - stat->meshare = new_share; + stat->self_effective_share = new_share; } -/** - *update_child_effective - update egrt, ehl, mehl for all children of parent - *@parent: the parent node - *return -1 if anything wrong - * - */ -static int update_child_effective(struct ckrm_core_class *parent) +static inline void update_child_effective(struct ckrm_core_class *parent) { struct ckrm_cpu_class *p_cls = ckrm_get_cpu_class(parent); - struct ckrm_core_class *child_core; - int ret = -1; - - if (! p_cls) - return ret; + struct ckrm_core_class *child_core = ckrm_get_next_child(parent, NULL); - child_core = ckrm_get_next_child(parent, NULL); while (child_core) { struct ckrm_cpu_class *c_cls = ckrm_get_cpu_class(child_core); - if (! 
c_cls) - return ret; - c_cls->stat.egrt = - p_cls->stat.egrt * + c_cls->stat.effective_guarantee = + p_cls->stat.effective_guarantee * c_cls->shares.my_guarantee / p_cls->shares.total_guarantee; - - c_cls->stat.megrt = c_cls->stat.egrt * c_cls->shares.unused_guarantee - / c_cls->shares.total_guarantee; - - c_cls->stat.ehl = - p_cls->stat.ehl * - get_hard_limit(c_cls) / p_cls->shares.total_guarantee; - - c_cls->stat.mehl = - c_cls->stat.ehl * - get_myhard_limit(c_cls) / c_cls->shares.total_guarantee; - - set_eshare(&c_cls->stat,c_cls->stat.egrt); - set_meshare(&c_cls->stat,c_cls->stat.megrt); - + c_cls->stat.effective_limit = + p_cls->stat.effective_guarantee * c_cls->shares.my_limit / + p_cls->shares.total_guarantee; child_core = ckrm_get_next_child(parent, child_core); }; - return 0; + } -/** - * update_effectives: update egrt, ehl, mehl for the whole tree +/* + * update effective guarantee and effective limit + * -- effective share = parent->effective->share * share/parent->total_share + * -- effective limit = parent->effective->share * limit/parent->total_share * should be called only when class structure changed - * - * return -1 if anything wrong happened (eg: the structure changed during the process) */ -static int update_effectives(struct ckrm_core_class *root_core) +static void update_effective_guarantee_limit(struct ckrm_core_class *root_core) { - struct ckrm_core_class *cur_core, *child_core; + struct ckrm_core_class *cur_core, *child_core = NULL; struct ckrm_cpu_class *cls; - int ret = -1; cur_core = root_core; - child_core = NULL; cls = ckrm_get_cpu_class(cur_core); + cls->stat.effective_guarantee = CKRM_SHARE_MAX; + cls->stat.effective_limit = cls->stat.effective_guarantee; - //initialize the effectives for root - cls->stat.egrt = CKRM_SHARE_MAX; /*egrt of the root is always 100% */ - cls->stat.megrt = cls->stat.egrt * cls->shares.unused_guarantee - / cls->shares.total_guarantee; - cls->stat.ehl = CKRM_SHARE_MAX * get_hard_limit(cls) - / cls->shares.total_guarantee; - cls->stat.mehl = cls->stat.ehl * get_myhard_limit(cls) - / cls->shares.total_guarantee; - set_eshare(&cls->stat,cls->stat.egrt); - set_meshare(&cls->stat,cls->stat.megrt); - - repeat: + repeat: //check exit if (!cur_core) - return 0; + return; - //visit this node only once - if (! 
child_core) - if (update_child_effective(cur_core) < 0) - return ret; //invalid cur_core node - + //visit this node + update_child_effective(cur_core); //next child child_core = ckrm_get_next_child(cur_core, child_core); - if (child_core) { - //go down to the next hier + //go down cur_core = child_core; child_core = NULL; - } else { //no more child, go back + goto repeat; + } else { //no more child, go back child_core = cur_core; cur_core = child_core->hnode.parent; } @@ -422,12 +312,12 @@ static int update_effectives(struct ckrm_core_class *root_core) /**********************************************/ /* - * surplus = egrt - demand + * surplus = my_effective_share - demand * if surplus < 0, surplus = 0 */ static inline int get_node_surplus(struct ckrm_cpu_class *cls) { - int surplus = cls->stat.egrt - cls->stat.max_demand; + int surplus = cls->stat.effective_guarantee - cls->stat.cpu_demand; if (surplus < 0) surplus = 0; @@ -435,254 +325,122 @@ static inline int get_node_surplus(struct ckrm_cpu_class *cls) return surplus; } -static inline int get_my_node_surplus(struct ckrm_cpu_class *cls) -{ - int surplus = cls->stat.megrt - get_mmax_demand(&cls->stat); - - if (surplus < 0) - surplus = 0; - - return surplus; -} - -/** - * consume_surplus: decides how much surplus a node can consume - * @ckeck_sl: if check_sl is set, then check soft_limitx +/* + * consume the surplus * return how much consumed - * - * implements all the CKRM Scheduling Requirement - * assume c_cls is valid + * set glut when necessary */ -static inline int consume_surplus(int surplus, - struct ckrm_cpu_class *c_cls, - struct ckrm_cpu_class *p_cls, - int check_sl - ) +static inline int node_surplus_consume(int old_surplus, + struct ckrm_core_class *child_core, + struct ckrm_cpu_class *p_cls) { int consumed = 0; int inc_limit; - int total_grt = p_cls->shares.total_guarantee; - BUG_ON(surplus < 0); + struct ckrm_cpu_class *c_cls = ckrm_get_cpu_class(child_core); - /*can't consume more than demand or hard limit*/ - if (c_cls->stat.eshare >= c_cls->stat.max_demand) + if (c_cls->stat.glut) goto out; - //the surplus allocation is propotional to grt - consumed = - surplus * c_cls->shares.my_guarantee / total_grt; - - if (! consumed) //no more share + //check demand + if (c_cls->stat.effective_share >= c_cls->stat.cpu_demand) { + c_cls->stat.glut = 1; goto out; - - //hard limit and demand limit - inc_limit = c_cls->stat.max_demand - c_cls->stat.eshare; - - if (check_sl) { - int esl = p_cls->stat.eshare * get_soft_limit(c_cls) - /total_grt; - if (esl < c_cls->stat.max_demand) - inc_limit = esl - c_cls->stat.eshare; } - if (consumed > inc_limit) - consumed = inc_limit; - - BUG_ON(consumed < 0); - out: - return consumed; -} - -/* - * how much a node can consume for itself? - */ -static inline int consume_self_surplus(int surplus, - struct ckrm_cpu_class *p_cls, - int check_sl - ) -{ - int consumed = 0; - int inc_limit; - int total_grt = p_cls->shares.total_guarantee; - int max_demand = get_mmax_demand(&p_cls->stat); - - BUG_ON(surplus < 0); - - /*can't consume more than demand or hard limit*/ - if (p_cls->stat.meshare >= max_demand) - goto out; - - //the surplus allocation is propotional to grt consumed = - surplus * p_cls->shares.unused_guarantee / total_grt; - - if (! 
consumed) //no more share - goto out; - - //hard limit and demand limit - inc_limit = max_demand - p_cls->stat.meshare; + old_surplus * c_cls->shares.my_guarantee / + p_cls->shares.total_guarantee; - if (check_sl) { - int mesl = p_cls->stat.eshare * get_mysoft_limit(p_cls) - /total_grt; - if (mesl < max_demand) - inc_limit = mesl - p_cls->stat.meshare; - } - - if (consumed > inc_limit) + //check limit + inc_limit = c_cls->stat.effective_limit - c_cls->stat.effective_share; + if (inc_limit <= consumed) { + c_cls->stat.glut = 1; consumed = inc_limit; + } - BUG_ON(consumed < 0); - out: + c_cls->stat.effective_share += consumed; + out: return consumed; } - /* - * allocate surplus to all its children and also its default class - */ -static int alloc_surplus_single_round( - int surplus, - struct ckrm_core_class *parent, - struct ckrm_cpu_class *p_cls, - int check_sl) -{ - struct ckrm_cpu_class *c_cls; - struct ckrm_core_class *child_core = NULL; - int total_consumed = 0,consumed; - - //first allocate to the default class - consumed = - consume_self_surplus(surplus,p_cls,check_sl); - - if (consumed > 0) { - set_meshare(&p_cls->stat,p_cls->stat.meshare + consumed); - total_consumed += consumed; - } - - do { - child_core = ckrm_get_next_child(parent, child_core); - if (child_core) { - c_cls = ckrm_get_cpu_class(child_core); - if (! c_cls) - return -1; - - consumed = - consume_surplus(surplus, c_cls, - p_cls,check_sl); - if (consumed > 0) { - set_eshare(&c_cls->stat,c_cls->stat.eshare + consumed); - total_consumed += consumed; - } - } - } while (child_core); - - return total_consumed; -} - -/** - * alloc_surplus_node: re-allocate the shares for children under parent - * @parent: parent node - * return the remaining surplus - * + * re-allocate the shares for all the childs under this node * task: * 1. get total surplus * 2. allocate surplus * 3. set the effective_share of each node */ -static int alloc_surplus_node(struct ckrm_core_class *parent) +static void alloc_surplus_node(struct ckrm_core_class *parent) { - struct ckrm_cpu_class *p_cls,*c_cls; - int total_surplus,consumed; - int check_sl; - int ret = -1; + int total_surplus = 0, old_surplus = 0; + struct ckrm_cpu_class *p_cls = ckrm_get_cpu_class(parent); struct ckrm_core_class *child_core = NULL; - - p_cls = ckrm_get_cpu_class(parent); - if (! p_cls) - goto realloc_out; + int self_share; /* - * get total surplus + * calculate surplus + * total_surplus = sum(child_surplus) + * reset glut flag + * initialize effective_share */ - total_surplus = p_cls->stat.eshare - p_cls->stat.egrt; - BUG_ON(total_surplus < 0); - total_surplus += get_my_node_surplus(p_cls); - do { child_core = ckrm_get_next_child(parent, child_core); if (child_core) { - c_cls = ckrm_get_cpu_class(child_core); - if (! c_cls) - goto realloc_out; + struct ckrm_cpu_class *c_cls = + ckrm_get_cpu_class(child_core); + ckrm_stat_t *stat = &c_cls->stat; total_surplus += get_node_surplus(c_cls); + stat->glut = 0; + set_effective_share(stat, stat->effective_guarantee); } } while (child_core); - - if (! 
total_surplus) { - ret = 0; - goto realloc_out; - } - - /* - * distributing the surplus - * first with the check_sl enabled - * once all the tasks has research the soft limit, disable check_sl and try again - */ - - check_sl = 1; + /*distribute the surplus */ + child_core = NULL; do { - consumed = alloc_surplus_single_round(total_surplus,parent,p_cls,check_sl); - if (consumed < 0) //something is wrong - goto realloc_out; + if (!child_core) //keep the surplus of last round + old_surplus = total_surplus; - if (! consumed) - check_sl = 0; - else - total_surplus -= consumed; + child_core = ckrm_get_next_child(parent, child_core); + if (child_core) { + total_surplus -= + node_surplus_consume(old_surplus, child_core, + p_cls); + } + //start a new round if something is allocated in the last round + } while (child_core || (total_surplus != old_surplus)); - } while ((total_surplus > 0) && (consumed || check_sl) ); + //any remaining surplus goes to the default class + self_share = p_cls->stat.effective_share * + p_cls->shares.unused_guarantee / p_cls->shares.total_guarantee; + self_share += total_surplus; - ret = 0; - - realloc_out: - return ret; + set_self_effective_share(&p_cls->stat, self_share); } /** * alloc_surplus - reallocate unused shares * * class A's usused share should be allocated to its siblings - * the re-allocation goes downward from the top */ -static int alloc_surplus(struct ckrm_core_class *root_core) +static void alloc_surplus(struct ckrm_core_class *root_core) { - struct ckrm_core_class *cur_core, *child_core; - // struct ckrm_cpu_class *cls; - int ret = -1; + struct ckrm_core_class *cur_core, *child_core = NULL; + struct ckrm_cpu_class *cls; - /*initialize*/ cur_core = root_core; - child_core = NULL; - // cls = ckrm_get_cpu_class(cur_core); - - /*the ckrm idle tasks get all what's remaining*/ - /*hzheng: uncomment the following like for hard limit support */ - // update_ckrm_idle(CKRM_SHARE_MAX - cls->stat.max_demand); - - repeat: + cls = ckrm_get_cpu_class(cur_core); + cls->stat.glut = 0; + set_effective_share(&cls->stat, cls->stat.effective_guarantee); + repeat: //check exit if (!cur_core) - return 0; - - //visit this node only once - if (! 
child_core) - if ( alloc_surplus_node(cur_core) < 0 ) - return ret; + return; + //visit this node + alloc_surplus_node(cur_core); //next child child_core = ckrm_get_next_child(cur_core, child_core); if (child_core) { @@ -697,250 +455,22 @@ static int alloc_surplus(struct ckrm_core_class *root_core) goto repeat; } -/**********************************************/ -/* CKRM Idle Tasks */ -/**********************************************/ -struct ckrm_cpu_class ckrm_idle_class_obj, *ckrm_idle_class; -struct task_struct* ckrm_idle_tasks[NR_CPUS]; - -/*how many ckrm idle tasks should I wakeup*/ -static inline int get_nr_idle(unsigned long surplus) -{ - int cpu_online = cpus_weight(cpu_online_map); - int nr_idle = 0; - - nr_idle = surplus * cpu_online; - nr_idle >>= CKRM_SHARE_ACCURACY; - - if (surplus) - nr_idle ++; - - if (nr_idle > cpu_online) - nr_idle = cpu_online; - - return nr_idle; -} - -/** - * update_ckrm_idle: update the status of the idle class according to the new surplus - * surplus: new system surplus - * - * Task: - * -- update share of the idle class - * -- wakeup idle tasks according to surplus - */ -void update_ckrm_idle(unsigned long surplus) -{ - int nr_idle = get_nr_idle(surplus); - int i; - struct task_struct* idle_task; - - set_eshare(&ckrm_idle_class->stat,surplus); - set_meshare(&ckrm_idle_class->stat,surplus); - /*wake up nr_idle idle tasks*/ - for_each_online_cpu(i) { - idle_task = ckrm_idle_tasks[i]; - if (unlikely(idle_task->cpu_class != ckrm_idle_class)) { - ckrm_cpu_change_class(idle_task, - idle_task->cpu_class, - ckrm_idle_class); - } - if (! idle_task) - continue; - if (i < nr_idle) { - //activate it - wake_up_process(idle_task); - } else { - //deactivate it - idle_task->state = TASK_INTERRUPTIBLE; - set_tsk_need_resched(idle_task); - } - } -} - -static int ckrm_cpu_idled(void *nothing) -{ - set_user_nice(current,19); - daemonize("ckrm_idle_task"); - - //deactivate it, it will be awakened by ckrm_cpu_monitor - current->state = TASK_INTERRUPTIBLE; - schedule(); - - /*similar to cpu_idle */ - while (1) { - while (!need_resched()) { - ckrm_cpu_monitor(1); - if (current_cpu_data.hlt_works_ok) { - local_irq_disable(); - if (!need_resched()) { - set_tsk_need_resched(current); - safe_halt(); - } else - local_irq_enable(); - } - } - schedule(); - } - return 0; -} - -/** - * ckrm_start_ckrm_idle: - * create the ckrm_idle_class and starts the idle tasks - * - */ -void ckrm_start_ckrm_idle(void) -{ - int i; - int ret; - ckrm_shares_t shares; - - ckrm_idle_class = &ckrm_idle_class_obj; - memset(ckrm_idle_class,0,sizeof(shares)); - /*don't care about the shares */ - init_cpu_class(ckrm_idle_class,&shares); - printk(KERN_INFO"ckrm idle class %x created\n",(int)ckrm_idle_class); - - for_each_online_cpu(i) { - ret = kernel_thread(ckrm_cpu_idled, 0, CLONE_KERNEL); - - /*warn on error, but the system should still work without it*/ - if (ret < 0) - printk(KERN_ERR"Warn: can't start ckrm idle tasks\n"); - else { - ckrm_idle_tasks[i] = find_task_by_pid(ret); - if (!ckrm_idle_tasks[i]) - printk(KERN_ERR"Warn: can't find ckrm idle tasks %d\n",ret); - } - } -} - -/**********************************************/ -/* Local Weight */ -/**********************************************/ -/** - * adjust_class_local_weight: adjust the local weight for each cpu - * - * lrq->weight = lpr->pressure * class->weight / total_pressure - */ -static void adjust_lrq_weight(struct ckrm_cpu_class *clsptr, int cpu_online) -{ - unsigned long total_pressure = 0; - ckrm_lrq_t* lrq; - int i; - unsigned long 
class_weight; - unsigned long long lw; - - //get total pressure - for_each_online_cpu(i) { - lrq = get_ckrm_lrq(clsptr,i); - total_pressure += lrq->lrq_load; - } - - if (! total_pressure) - return; - - class_weight = cpu_class_weight(clsptr) * cpu_online; - - /* - * update weight for each cpu, minimun is 1 - */ - for_each_online_cpu(i) { - lrq = get_ckrm_lrq(clsptr,i); - if (! lrq->lrq_load) - /*give idle class a high share to boost interactiveness */ - lw = cpu_class_weight(clsptr); - else { - lw = lrq->lrq_load * class_weight; - do_div(lw,total_pressure); - if (!lw) - lw = 1; - else if (lw > CKRM_SHARE_MAX) - lw = CKRM_SHARE_MAX; - } - - lrq->local_weight = lw; - } -} - -/* - * assume called with class_list_lock read lock held - */ -void adjust_local_weight(void) -{ - static spinlock_t lock = SPIN_LOCK_UNLOCKED; - struct ckrm_cpu_class *clsptr; - int cpu_online; - - //do nothing if someone already holding the lock - if (! spin_trylock(&lock)) - return; - - cpu_online = cpus_weight(cpu_online_map); - - //class status: demand, share,total_ns prio, index - list_for_each_entry(clsptr,&active_cpu_classes,links) { - adjust_lrq_weight(clsptr,cpu_online); - } - - spin_unlock(&lock); -} - -/**********************************************/ -/* Main */ -/**********************************************/ /** *ckrm_cpu_monitor - adjust relative shares of the classes based on their progress - *@check_min: if check_min is set, the call can't be within 100ms of last call * * this function is called every CPU_MONITOR_INTERVAL * it computes the cpu demand of each class * and re-allocate the un-used shares to other classes */ -void ckrm_cpu_monitor(int check_min) +void ckrm_cpu_monitor(void) { - static spinlock_t lock = SPIN_LOCK_UNLOCKED; - static unsigned long long last_check = 0; - struct ckrm_core_class *root_core = get_default_cpu_class()->core; - unsigned long long now; -#define MIN_CPU_MONITOR_INTERVAL 100000000UL - + struct ckrm_core_class *root_core = default_cpu_class->core; if (!root_core) return; - //do nothing if someone already holding the lock - if (! 
spin_trylock(&lock)) - return; - - read_lock(&class_list_lock); - - now = sched_clock(); - - //consecutive check should be at least 100ms apart - if (check_min && ((now - last_check) < MIN_CPU_MONITOR_INTERVAL)) - goto outunlock; - - last_check = now; - - if (update_effectives(root_core) != 0) - goto outunlock; - - if (update_max_demand(root_core) != 0) - goto outunlock; - -#ifndef ALLOC_SURPLUS_SUPPORT -#warning "MEF taking out alloc_surplus" -#else - if (alloc_surplus(root_core) != 0) - goto outunlock; -#endif - - adjust_local_weight(); - - outunlock: - read_unlock(&class_list_lock); - spin_unlock(&lock); + update_effective_guarantee_limit(root_core); + update_cpu_demand(root_core); + alloc_surplus(root_core); } /*****************************************************/ @@ -951,19 +481,22 @@ static int thread_exit = 0; static int ckrm_cpu_monitord(void *nothing) { + wait_queue_head_t wait; + + init_waitqueue_head(&wait); + daemonize("ckrm_cpu_ctrld"); for (;;) { /*sleep for sometime before next try*/ - set_current_state(TASK_INTERRUPTIBLE); - schedule_timeout(CPU_MONITOR_INTERVAL); - ckrm_cpu_monitor(1); + interruptible_sleep_on_timeout(&wait, CPU_MONITOR_INTERVAL); + ckrm_cpu_monitor(); if (thread_exit) { break; } } cpu_monitor_pid = -1; thread_exit = 2; - printk(KERN_DEBUG "cpu_monitord exit\n"); + printk("cpu_monitord exit\n"); return 0; } @@ -971,18 +504,21 @@ void ckrm_start_monitor(void) { cpu_monitor_pid = kernel_thread(ckrm_cpu_monitord, 0, CLONE_KERNEL); if (cpu_monitor_pid < 0) { - printk(KERN_DEBUG "ckrm_cpu_monitord for failed\n"); + printk("ckrm_cpu_monitord for failed\n"); } } void ckrm_kill_monitor(void) { - printk(KERN_DEBUG "killing process %d\n", cpu_monitor_pid); + wait_queue_head_t wait; + int interval = HZ; + init_waitqueue_head(&wait); + + printk("killing process %d\n", cpu_monitor_pid); if (cpu_monitor_pid > 0) { thread_exit = 1; while (thread_exit != 2) { - set_current_state(TASK_INTERRUPTIBLE); - schedule_timeout(CPU_MONITOR_INTERVAL); + interruptible_sleep_on_timeout(&wait, interval); } } } @@ -990,8 +526,6 @@ void ckrm_kill_monitor(void) int ckrm_cpu_monitor_init(void) { ckrm_start_monitor(); - /*hzheng: uncomment the following like for hard limit support */ - // ckrm_start_ckrm_idle(); return 0; } diff --git a/kernel/ckrm/ckrm_laq.c b/kernel/ckrm/ckrm_laq.c deleted file mode 100644 index b64205a06..000000000 --- a/kernel/ckrm/ckrm_laq.c +++ /dev/null @@ -1,495 +0,0 @@ -/* ckrm_socketaq.c - accept queue resource controller - * - * Copyright (C) Vivek Kashyap, IBM Corp. 2004 - * - * Latest version, more details at http://ckrm.sf.net - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - */ - -/* Changes - * Initial version - */ - -/* Code Description: TBD - * - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include - -#define hnode_2_core(ptr) \ - ((ptr) ? 
container_of(ptr, struct ckrm_core_class, hnode) : NULL) - -#define CKRM_SAQ_MAX_DEPTH 3 // 0 => /rcfs - // 1 => socket_aq - // 2 => socket_aq/listen_class - // 3 => socket_aq/listen_class/accept_queues - // 4 => Not allowed - -typedef struct ckrm_laq_res { - spinlock_t reslock; - atomic_t refcnt; - struct ckrm_shares shares; - struct ckrm_core_class *core; - struct ckrm_core_class *pcore; - int my_depth; - int my_id; - unsigned int min_ratio; -} ckrm_laq_res_t; - -static int my_resid = -1; - -extern struct ckrm_core_class *rcfs_create_under_netroot(char *, int, int); -extern struct ckrm_core_class *rcfs_make_core(struct dentry *, - struct ckrm_core_class *); - -void laq_res_hold(struct ckrm_laq_res *res) -{ - atomic_inc(&res->refcnt); - return; -} - -void laq_res_put(struct ckrm_laq_res *res) -{ - if (atomic_dec_and_test(&res->refcnt)) - kfree(res); - return; -} - -/* Initialize rescls values - */ -static void laq_res_initcls(void *my_res) -{ - ckrm_laq_res_t *res = my_res; - - res->shares.my_guarantee = CKRM_SHARE_DONTCARE; - res->shares.my_limit = CKRM_SHARE_DONTCARE; - res->shares.total_guarantee = CKRM_SHARE_DFLT_TOTAL_GUARANTEE; - res->shares.max_limit = CKRM_SHARE_DFLT_MAX_LIMIT; - res->shares.unused_guarantee = CKRM_SHARE_DFLT_TOTAL_GUARANTEE; - res->shares.cur_max_limit = 0; -} - -static int atoi(char *s) -{ - int k = 0; - while (*s) - k = *s++ - '0' + (k * 10); - return k; -} - -static char *laq_get_name(struct ckrm_core_class *c) -{ - char *p = (char *)c->name; - - while (*p) - p++; - while (*p != '/' && p != c->name) - p--; - - return ++p; -} - -static void *laq_res_alloc(struct ckrm_core_class *core, - struct ckrm_core_class *parent) -{ - ckrm_laq_res_t *res, *pres; - int pdepth; - - if (parent) - pres = ckrm_get_res_class(parent, my_resid, ckrm_laq_res_t); - else - pres = NULL; - - if (core == core->classtype->default_class) - pdepth = 1; - else { - if (!parent) - return NULL; - pdepth = 1 + pres->my_depth; - } - - res = kmalloc(sizeof(ckrm_laq_res_t), GFP_ATOMIC); - if (res) { - memset(res, 0, sizeof(res)); - spin_lock_init(&res->reslock); - laq_res_hold(res); - res->my_depth = pdepth; - if (pdepth == 2) // listen class - res->my_id = 0; - else if (pdepth == 3) - res->my_id = atoi(laq_get_name(core)); - res->core = core; - res->pcore = parent; - - // rescls in place, now initialize contents other than - // hierarchy pointers - laq_res_initcls(res); // acts as initialising value - } - - return res; -} - -static void laq_res_free(void *my_res) -{ - ckrm_laq_res_t *res = (ckrm_laq_res_t *) my_res; - ckrm_laq_res_t *parent; - - if (!res) - return; - - if (res->my_depth != 3) { - kfree(res); - return; - } - - parent = ckrm_get_res_class(res->pcore, my_resid, ckrm_laq_res_t); - if (!parent) // Should never happen - return; - - spin_lock(&parent->reslock); - spin_lock(&res->reslock); - - // return child's guarantee to parent node - // Limits have no meaning for accept queue control - child_guarantee_changed(&parent->shares, res->shares.my_guarantee, 0); - - spin_unlock(&res->reslock); - laq_res_put(res); - spin_unlock(&parent->reslock); - return; -} - -/************************************************************************** - * SHARES *** - **************************************************************************/ - -void laq_set_aq_value(struct ckrm_net_struct *ns, unsigned int *aq_ratio) -{ - int i; - struct tcp_opt *tp; - - tp = tcp_sk(ns->ns_sk); - for (i = 0; i < NUM_ACCEPT_QUEUES; i++) - tp->acceptq[i].aq_ratio = aq_ratio[i]; - return; -} -void 
laq_set_aq_values(ckrm_laq_res_t * parent, unsigned int *aq_ratio) -{ - - struct ckrm_net_struct *ns; - struct ckrm_core_class *core = parent->core; - - class_lock(core); - list_for_each_entry(ns, &core->objlist, ckrm_link) { - laq_set_aq_value(ns, aq_ratio); - } - class_unlock(core); - return; -} - -static void calculate_aq_ratios(ckrm_laq_res_t * res, unsigned int *aq_ratio) -{ - struct ckrm_hnode *chnode; - ckrm_laq_res_t *child; - unsigned int min; - int i; - - min = aq_ratio[0] = (unsigned int)res->shares.unused_guarantee; - - list_for_each_entry(chnode, &res->core->hnode.children, siblings) { - child = hnode_2_core(chnode)->res_class[my_resid]; - - aq_ratio[child->my_id] = - (unsigned int)child->shares.my_guarantee; - if (aq_ratio[child->my_id] == CKRM_SHARE_DONTCARE) - aq_ratio[child->my_id] = 0; - if (aq_ratio[child->my_id] && - ((unsigned int)aq_ratio[child->my_id] < min)) - min = (unsigned int)child->shares.my_guarantee; - } - - if (min == 0) { - min = 1; - // default takes all if nothing specified - aq_ratio[0] = 1; - } - res->min_ratio = min; - - for (i = 0; i < NUM_ACCEPT_QUEUES; i++) - aq_ratio[i] = aq_ratio[i] / min; -} - -static int laq_set_share_values(void *my_res, struct ckrm_shares *shares) -{ - ckrm_laq_res_t *res = my_res; - ckrm_laq_res_t *parent; - unsigned int aq_ratio[NUM_ACCEPT_QUEUES]; - int rc = 0; - - if (!res) - return -EINVAL; - - if (!res->pcore) { - // something is badly wrong - printk(KERN_ERR "socketaq internal inconsistency\n"); - return -EBADF; - } - - parent = ckrm_get_res_class(res->pcore, my_resid, ckrm_laq_res_t); - if (!parent) // socketclass does not have a share interface - return -EINVAL; - - // Ensure that we ignore limit values - shares->my_limit = CKRM_SHARE_DONTCARE; - shares->max_limit = CKRM_SHARE_UNCHANGED; - - if (res->my_depth == 0) { - printk(KERN_ERR "socketaq bad entry\n"); - return -EBADF; - } else if (res->my_depth == 1) { - // can't be written to. This is an internal default. - return -EINVAL; - } else if (res->my_depth == 2) { - //nothin to inherit - if (!shares->total_guarantee) { - return -EINVAL; - } - parent = res; - shares->my_guarantee = CKRM_SHARE_DONTCARE; - } else if (res->my_depth == 3) { - // accept queue itself. - shares->total_guarantee = CKRM_SHARE_UNCHANGED; - } - - ckrm_lock_hier(parent->pcore); - spin_lock(&parent->reslock); - rc = set_shares(shares, &res->shares, - (parent == res) ? 
NULL : &parent->shares); - if (rc) { - spin_unlock(&res->reslock); - ckrm_unlock_hier(res->pcore); - return rc; - } - calculate_aq_ratios(parent, aq_ratio); - laq_set_aq_values(parent, aq_ratio); - spin_unlock(&parent->reslock); - ckrm_unlock_hier(parent->pcore); - - return rc; -} - -static int laq_get_share_values(void *my_res, struct ckrm_shares *shares) -{ - ckrm_laq_res_t *res = my_res; - - if (!res) - return -EINVAL; - *shares = res->shares; - return 0; -} - -/************************************************************************** - * STATS *** - **************************************************************************/ - -void -laq_print_aq_stats(struct seq_file *sfile, struct tcp_acceptq_info *taq, int i) -{ - seq_printf(sfile, "Class %d connections:\n\taccepted: %u\n\t" - "queued: %u\n\twait_time: %u\n", - i, taq->acceptq_count, taq->acceptq_qcount, - jiffies_to_msecs(taq->acceptq_wait_time)); - - if (i) - return; - - for (i = 1; i < NUM_ACCEPT_QUEUES; i++) { - taq[0].acceptq_wait_time += taq[i].acceptq_wait_time; - taq[0].acceptq_qcount += taq[i].acceptq_qcount; - taq[0].acceptq_count += taq[i].acceptq_count; - } - - seq_printf(sfile, "Totals :\n\taccepted: %u\n\t" - "queued: %u\n\twait_time: %u\n", - taq->acceptq_count, taq->acceptq_qcount, - jiffies_to_msecs(taq->acceptq_wait_time)); - - return; -} - -void -laq_get_aq_stats(ckrm_laq_res_t * pres, ckrm_laq_res_t * mres, - struct tcp_acceptq_info *taq) -{ - struct ckrm_net_struct *ns; - struct ckrm_core_class *core = pres->core; - struct tcp_opt *tp; - int a = mres->my_id; - int z; - - if (a == 0) - z = NUM_ACCEPT_QUEUES; - else - z = a + 1; - - // XXX Instead of holding a class_lock introduce a rw - // lock to be write locked by listen callbacks and read locked here. - // - VK - class_lock(pres->core); - list_for_each_entry(ns, &core->objlist, ckrm_link) { - tp = tcp_sk(ns->ns_sk); - for (; a < z; a++) { - taq->acceptq_wait_time += tp->acceptq[a].aq_wait_time; - taq->acceptq_qcount += tp->acceptq[a].aq_qcount; - taq->acceptq_count += tp->acceptq[a].aq_count; - taq++; - } - } - class_unlock(pres->core); -} - -static int laq_get_stats(void *my_res, struct seq_file *sfile) -{ - ckrm_laq_res_t *res = my_res; - ckrm_laq_res_t *parent; - struct tcp_acceptq_info taq[NUM_ACCEPT_QUEUES]; - int rc = 0; - - if (!res) - return -EINVAL; - - if (!res->pcore) { - // something is badly wrong - printk(KERN_ERR "socketaq internal inconsistency\n"); - return -EBADF; - } - - parent = ckrm_get_res_class(res->pcore, my_resid, ckrm_laq_res_t); - if (!parent) { // socketclass does not have a stat interface - printk(KERN_ERR "socketaq internal fs inconsistency\n"); - return -EINVAL; - } - - memset(taq, 0, sizeof(struct tcp_acceptq_info) * NUM_ACCEPT_QUEUES); - - switch (res->my_depth) { - - default: - case 0: - printk(KERN_ERR "socket class bad entry\n"); - rc = -EBADF; - break; - - case 1: // can't be read from. this is internal default. - // return -EINVAL - rc = -EINVAL; - break; - - case 2: // return the default and total - ckrm_lock_hier(res->core); // block any deletes - laq_get_aq_stats(res, res, &taq[0]); - laq_print_aq_stats(sfile, &taq[0], 0); - ckrm_unlock_hier(res->core); // block any deletes - break; - - case 3: - ckrm_lock_hier(parent->core); // block any deletes - laq_get_aq_stats(parent, res, &taq[res->my_id]); - laq_print_aq_stats(sfile, &taq[res->my_id], res->my_id); - ckrm_unlock_hier(parent->core); // block any deletes - break; - } - - return rc; -} - -/* - * The network connection is reclassified to this class. Update its shares. 
- * The socket lock is held. - */ -static void laq_change_resclass(void *n, void *old, void *r) -{ - struct ckrm_net_struct *ns = (struct ckrm_net_struct *)n; - struct ckrm_laq_res *res = (struct ckrm_laq_res *)r; - unsigned int aq_ratio[NUM_ACCEPT_QUEUES]; - - if (res->my_depth != 2) - return; - - // a change to my_depth == 3 ie. the accept classes cannot happen. - // there is no target file - if (res->my_depth == 2) { // it is one of the socket classes - ckrm_lock_hier(res->pcore); - // share rule: hold parent resource lock. then self. - // However, since my_depth == 1 is a generic class it is not - // needed here. Self lock is enough. - spin_lock(&res->reslock); - calculate_aq_ratios(res, aq_ratio); - class_lock(res->pcore); - laq_set_aq_value(ns, aq_ratio); - class_unlock(res->pcore); - spin_unlock(&res->reslock); - ckrm_unlock_hier(res->pcore); - } - - return; -} - -struct ckrm_res_ctlr laq_rcbs = { - .res_name = "laq", - .resid = -1, // dynamically assigned - .res_alloc = laq_res_alloc, - .res_free = laq_res_free, - .set_share_values = laq_set_share_values, - .get_share_values = laq_get_share_values, - .get_stats = laq_get_stats, - .change_resclass = laq_change_resclass, - //.res_initcls = laq_res_initcls, //HUBERTUS: unnecessary !! -}; - -int __init init_ckrm_laq_res(void) -{ - struct ckrm_classtype *clstype; - int resid; - - clstype = ckrm_find_classtype_by_name("socketclass"); - if (clstype == NULL) { - printk(KERN_INFO " Unknown ckrm classtype"); - return -ENOENT; - } - - if (my_resid == -1) { - resid = ckrm_register_res_ctlr(clstype, &laq_rcbs); - if (resid >= 0) - my_resid = resid; - printk(KERN_DEBUG "........init_ckrm_listen_aq_res -> %d\n", my_resid); - } - return 0; - -} - -void __exit exit_ckrm_laq_res(void) -{ - ckrm_unregister_res_ctlr(&laq_rcbs); - my_resid = -1; -} - -module_init(init_ckrm_laq_res) - module_exit(exit_ckrm_laq_res) - - MODULE_LICENSE("GPL"); diff --git a/kernel/ckrm/ckrm_mem.c b/kernel/ckrm/ckrm_mem.c index c6c594a96..667ac9c67 100644 --- a/kernel/ckrm/ckrm_mem.c +++ b/kernel/ckrm/ckrm_mem.c @@ -52,7 +52,6 @@ EXPORT_SYMBOL(ckrm_tot_lru_pages); static ckrm_mem_res_t *ckrm_mem_root_class; atomic_t ckrm_mem_real_count = ATOMIC_INIT(0); EXPORT_SYMBOL(ckrm_mem_real_count); -static void ckrm_mem_evaluate_all_pages(void); /* Initialize rescls values * May be called on each rcfs unmount or as part of error recovery @@ -90,7 +89,7 @@ mem_res_initcls_one(void *my_res) res->pg_guar = CKRM_SHARE_DONTCARE; res->pg_limit = CKRM_SHARE_DONTCARE; - res->pg_unused = 0; + res->pg_unused = CKRM_SHARE_DONTCARE; } static void * @@ -180,25 +179,20 @@ mem_res_free(void *my_res) if (!res) return; - res->shares.my_guarantee = 0; - res->shares.my_limit = 0; - res->pg_guar = 0; - res->pg_limit = 0; - res->pg_unused = 0; - parres = ckrm_get_res_class(res->parent, mem_rcbs.resid, ckrm_mem_res_t); + // return child's limit/guarantee to parent node if (parres) { child_guarantee_changed(&parres->shares, res->shares.my_guarantee, 0); child_maxlimit_changed_local(parres); } - ckrm_mem_evaluate_all_pages(); - res->core = NULL; - + res->shares.my_guarantee = 0; + res->shares.my_limit = 0; spin_lock(&ckrm_mem_lock); list_del(&res->mcls_list); spin_unlock(&ckrm_mem_lock); mem_class_put(res); + return; } @@ -361,14 +355,8 @@ mem_change_resclass(void *tsk, void *old, void *new) } } - spin_unlock(&mm->peertask_lock); ckrm_mem_evaluate_mm(mm); - /* - printk("chg_cls: task <%s:%d> mm %p oldmm %s newmm %s o %s n %s\n", - task->comm, task->pid, mm, prev_mmcls ? 
prev_mmcls->core->name: - "NULL", mm->memclass ? mm->memclass->core->name : "NULL", - o ? o->core->name: "NULL", n ? n->core->name: "NULL"); - */ + spin_unlock(&mm->peertask_lock); return; } @@ -497,7 +485,7 @@ set_usage_flags(ckrm_mem_res_t *res) guar = (res->pg_guar > 0) ? res->pg_guar : 0; range = res->pg_limit - guar; - if ((tot_usage > (guar + ((110 * range) / 100))) && + if ((tot_usage > (guar + ((120 * range) / 100))) && (res->pg_lent > (guar + ((25 * range) / 100)))) { set_flags_of_children(res, CLS_PARENT_OVER); } @@ -508,10 +496,6 @@ set_usage_flags(ckrm_mem_res_t *res) res->reclaim_flags |= CLS_OVER_100; } else if (cls_usage > (guar + ((3 * range) / 4))) { res->reclaim_flags |= CLS_OVER_75; - } else if (cls_usage > (guar + (range / 2))) { - res->reclaim_flags |= CLS_OVER_50; - } else if (cls_usage > (guar + (range / 4))) { - res->reclaim_flags |= CLS_OVER_25; } else if (cls_usage > guar) { res->reclaim_flags |= CLS_OVER_GUAR; } else { @@ -562,16 +546,15 @@ ckrm_get_reclaim_bits(unsigned int *flags, unsigned int *extract) { int i, j, mask = 0; - if (*flags == 0) { - *extract = 0; + if (*extract == 0 || *flags == 0) { return; } - if (*flags & CLS_SHRINK) { *extract = CLS_SHRINK; *flags = 0; return; } + i = fls(*flags); for (j = i-1; j > 0; j--) { @@ -583,16 +566,12 @@ ckrm_get_reclaim_bits(unsigned int *flags, unsigned int *extract) } void -ckrm_at_limit(ckrm_mem_res_t *cls) +ckrm_near_limit(ckrm_mem_res_t *cls) { -#ifndef AT_LIMIT_SUPPORT -#warning "ckrm_at_limit disabled due to problems with memory hog tests" -#else struct zone *zone; unsigned long now = jiffies; - if (!cls || (cls->pg_limit == CKRM_SHARE_DONTCARE) || - ((cls->flags & MEM_AT_LIMIT) == MEM_AT_LIMIT)) { + if (!cls || ((cls->flags & MEM_NEAR_LIMIT) == MEM_NEAR_LIMIT)) { return; } if ((cls->last_shrink + (10 * HZ)) < now) { // 10 seconds since last ? 
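
The hunk above raises the parent-overuse trigger in set_usage_flags() from 110% to 120% of a class's guarantee-to-limit range, drops the 50% and 25% reclaim steps (CLS_OVER_50 / CLS_OVER_25), and renames the at-limit path to ckrm_near_limit() with the MEM_NEAR_LIMIT flag. A minimal, self-contained sketch of the threshold arithmetic as it reads after this change; the guarantee and limit figures below are invented for illustration and are not taken from the patch:

    /* Illustrative only: how the CLS_PARENT_OVER trigger moves when the
     * multiplier in set_usage_flags() goes from 110 to 120.  guar and
     * limit are assumed example values, not values from this patch.
     */
    #include <stdio.h>

    int main(void)
    {
        int guar  = 1000;               /* stands in for res->pg_guar  (pages) */
        int limit = 2000;               /* stands in for res->pg_limit (pages) */
        int range = limit - guar;       /* as computed in set_usage_flags()    */

        printf("old trigger: tot_usage > %d pages\n", guar + (110 * range) / 100);
        printf("new trigger: tot_usage > %d pages\n", guar + (120 * range) / 100);
        printf("lent-page condition (unchanged): pg_lent > %d pages\n",
               guar + (25 * range) / 100);
        return 0;
    }

With these assumed numbers the parent-over flag now fires 100 pages later (2200 instead of 2100); the remaining ladder of CLS_OVER_100 / CLS_OVER_75 / CLS_OVER_GUAR is otherwise unchanged apart from the removed 50% and 25% steps.
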
@@ -606,17 +585,14 @@ ckrm_at_limit(ckrm_mem_res_t *cls) spin_lock(&ckrm_mem_lock); list_add(&cls->shrink_list, &ckrm_shrink_list); spin_unlock(&ckrm_mem_lock); - cls->flags |= MEM_AT_LIMIT; + cls->flags |= MEM_NEAR_LIMIT; for_each_zone(zone) { wakeup_kswapd(zone); break; // only once is enough } -#endif // AT_LIMIT_SUPPORT } -static int unmapped = 0, changed = 0, unchanged = 0, maxnull = 0, -anovma = 0, fnovma = 0; -static void +static int ckrm_mem_evaluate_page_anon(struct page* page) { ckrm_mem_res_t* pgcls = page_class(page); @@ -624,12 +600,10 @@ ckrm_mem_evaluate_page_anon(struct page* page) struct anon_vma *anon_vma = (struct anon_vma *) page->mapping; struct vm_area_struct *vma; struct mm_struct* mm; - int v = 0; spin_lock(&anon_vma->lock); BUG_ON(list_empty(&anon_vma->head)); list_for_each_entry(vma, &anon_vma->head, anon_vma_node) { - v++; mm = vma->vm_mm; if (!maxshareclass || ckrm_mem_share_compare(maxshareclass, mm->memclass) < 0) { @@ -637,20 +611,15 @@ ckrm_mem_evaluate_page_anon(struct page* page) } } spin_unlock(&anon_vma->lock); - if (!v) - anovma++; - if (!maxshareclass) - maxnull++; if (maxshareclass && (pgcls != maxshareclass)) { ckrm_change_page_class(page, maxshareclass); - changed++; - } else - unchanged++; - return; + return 1; + } + return 0; } -static void +static int ckrm_mem_evaluate_page_file(struct page* page) { ckrm_mem_res_t* pgcls = page_class(page); @@ -660,132 +629,69 @@ ckrm_mem_evaluate_page_file(struct page* page) pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); struct prio_tree_iter iter; struct mm_struct* mm; - int v = 0; if (!mapping) - return; + return 0; if (!spin_trylock(&mapping->i_mmap_lock)) - return; + return 0; while ((vma = vma_prio_tree_next(vma, &mapping->i_mmap, &iter, pgoff, pgoff)) != NULL) { - v++; mm = vma->vm_mm; if (!maxshareclass || ckrm_mem_share_compare(maxshareclass,mm->memclass)<0) maxshareclass = mm->memclass; } spin_unlock(&mapping->i_mmap_lock); - if (!v) - fnovma++; - if (!maxshareclass) - maxnull++; - if (maxshareclass && pgcls != maxshareclass) { ckrm_change_page_class(page, maxshareclass); - changed++; - } else - unchanged++; - return; + return 1; + } + return 0; } -static void +static int ckrm_mem_evaluate_page(struct page* page) { + int changed = 0; + if (page->mapping) { if (PageAnon(page)) - ckrm_mem_evaluate_page_anon(page); + changed = ckrm_mem_evaluate_page_anon(page); else - ckrm_mem_evaluate_page_file(page); - } else - unmapped++; - return; -} - -static void -ckrm_mem_evaluate_all_pages() -{ - struct page *page; - struct zone *zone; - int active = 0, inactive = 0, cleared = 0; - int act_cnt, inact_cnt, idx; - ckrm_mem_res_t *res; - - spin_lock(&ckrm_mem_lock); - list_for_each_entry(res, &ckrm_memclass_list, mcls_list) { - res->tmp_cnt = 0; + changed = ckrm_mem_evaluate_page_file(page); } - spin_unlock(&ckrm_mem_lock); - - for_each_zone(zone) { - spin_lock_irq(&zone->lru_lock); - list_for_each_entry(page, &zone->inactive_list, lru) { - ckrm_mem_evaluate_page(page); - active++; - page_class(page)->tmp_cnt++; - if (!test_bit(PG_ckrm_account, &page->flags)) - cleared++; - } - list_for_each_entry(page, &zone->active_list, lru) { - ckrm_mem_evaluate_page(page); - inactive++; - page_class(page)->tmp_cnt++; - if (!test_bit(PG_ckrm_account, &page->flags)) - cleared++; - } - spin_unlock_irq(&zone->lru_lock); - } - printk(KERN_DEBUG "all_pages: active %d inactive %d cleared %d\n", - active, inactive, cleared); - spin_lock(&ckrm_mem_lock); - list_for_each_entry(res, &ckrm_memclass_list, mcls_list) { - 
act_cnt = 0; inact_cnt = 0; idx = 0; - for_each_zone(zone) { - act_cnt += res->nr_active[idx]; - inact_cnt += res->nr_inactive[idx]; - idx++; - } - printk(KERN_DEBUG "all_pages: %s: tmp_cnt %d; act_cnt %d inact_cnt %d\n", - res->core->name, res->tmp_cnt, act_cnt, inact_cnt); - } - spin_unlock(&ckrm_mem_lock); - - // check all mm's in the system to see which memclass they are attached - // to. - return; + return changed; } -static /*inline*/ int +static inline int class_migrate_pmd(struct mm_struct* mm, struct vm_area_struct* vma, pmd_t* pmdir, unsigned long address, unsigned long end) { - pte_t *pte, *orig_pte; + pte_t* pte; unsigned long pmd_end; if (pmd_none(*pmdir)) return 0; BUG_ON(pmd_bad(*pmdir)); - orig_pte = pte = pte_offset_map(pmdir,address); + pte = pte_offset_map(pmdir,address); pmd_end = (address+PMD_SIZE)&PMD_MASK; if (end>pmd_end) end = pmd_end; do { if (pte_present(*pte)) { - BUG_ON(mm->memclass == NULL); - ckrm_change_page_class(pte_page(*pte), mm->memclass); - // ckrm_mem_evaluate_page(pte_page(*pte)); + ckrm_mem_evaluate_page(pte_page(*pte)); } address += PAGE_SIZE; pte++; } while(address && (addressmemclass != (void *)maxshareclass)) { + if (mm->memclass != (void *)maxshareclass) { + mem_class_get(maxshareclass); if (mm->memclass) mem_class_put(mm->memclass); mm->memclass = maxshareclass; - mem_class_get(maxshareclass); /* Go through all VMA to migrate pages */ down_read(&mm->mmap_sem); @@ -870,6 +776,26 @@ ckrm_mem_evaluate_mm(struct mm_struct* mm) return; } +void +ckrm_mem_evaluate_page_byadd(struct page* page, struct mm_struct* mm) +{ + ckrm_mem_res_t *pgcls = page_class(page); + ckrm_mem_res_t *chgcls = mm->memclass ? mm->memclass : GET_MEM_CLASS(current); + + if (!chgcls || pgcls == chgcls) + return; + + if (!page->mapcount) { + ckrm_change_page_class(page, chgcls); + return; + } + if (ckrm_mem_share_compare(pgcls, chgcls) < 0) { + ckrm_change_page_class(page, chgcls); + return; + } + return; +} + void ckrm_init_mm_to_task(struct mm_struct * mm, struct task_struct *task) { @@ -879,26 +805,10 @@ ckrm_init_mm_to_task(struct mm_struct * mm, struct task_struct *task) list_del_init(&task->mm_peers); } list_add_tail(&task->mm_peers, &mm->tasklist); - spin_unlock(&mm->peertask_lock); if (mm->memclass != GET_MEM_CLASS(task)) ckrm_mem_evaluate_mm(mm); + spin_unlock(&mm->peertask_lock); return; } -int -ckrm_memclass_valid(ckrm_mem_res_t *cls) -{ - ckrm_mem_res_t *tmp; - - spin_lock(&ckrm_mem_lock); - list_for_each_entry(tmp, &ckrm_memclass_list, mcls_list) { - if (tmp == cls) { - spin_unlock(&ckrm_mem_lock); - return 1; - } - } - spin_unlock(&ckrm_mem_lock); - return 0; -} - MODULE_LICENSE("GPL"); diff --git a/kernel/ckrm/ckrm_sockc.c b/kernel/ckrm/ckrm_sockc.c index 8ccadfa39..a8a3b4bd5 100644 --- a/kernel/ckrm/ckrm_sockc.c +++ b/kernel/ckrm/ckrm_sockc.c @@ -59,7 +59,7 @@ struct ckrm_sock_class { static struct ckrm_sock_class sockclass_dflt_class = { }; -#define SOCKET_CLASS_TYPE_NAME "socketclass" +#define SOCKET_CLASS_TYPE_NAME "socket_class" const char *dflt_sockclass_name = SOCKET_CLASS_TYPE_NAME; @@ -464,16 +464,6 @@ sock_forced_reclassify(struct ckrm_core_class *target, const char *options) if (!options) return -EINVAL; - if (target == NULL) { - unsigned long id = simple_strtol(options,NULL,0); - if (!capable(CAP_NET_ADMIN)) - return -EPERM; - if (id != 0) - return -EINVAL; - printk(KERN_DEBUG "sock_class: reclassify all not net implemented\n"); - return 0; - } - while ((p = strsep((char **)&options, ",")) != NULL) { substring_t args[MAX_OPT_ARGS]; int token; @@ 
-553,7 +543,7 @@ static void sock_reclassify_class(struct ckrm_sock_class *cls) void __init ckrm_meta_init_sockclass(void) { - printk(KERN_DEBUG "...... Initializing ClassType<%s> ........\n", + printk("...... Initializing ClassType<%s> ........\n", CT_sockclass.name); // intialize the default class ckrm_init_core_class(&CT_sockclass, class_core(&sockclass_dflt_class), diff --git a/kernel/ckrm/ckrm_numtasks.c b/kernel/ckrm/ckrm_tasks.c similarity index 90% rename from kernel/ckrm/ckrm_numtasks.c rename to kernel/ckrm/ckrm_tasks.c index 61517aee0..ee539216e 100644 --- a/kernel/ckrm/ckrm_numtasks.c +++ b/kernel/ckrm/ckrm_tasks.c @@ -25,7 +25,6 @@ #include #include #include -#include #include #include #include @@ -191,11 +190,6 @@ static void numtasks_put_ref_local(void *arg) res = ckrm_get_res_class(core, resid, ckrm_numtasks_t); if (res == NULL) return; - if (unlikely(atomic_read(&res->cnt_cur_alloc) == 0)) { - printk(KERN_WARNING "numtasks_put_ref: Trying to decrement " - "counter below 0\n"); - return; - } atomic_dec(&res->cnt_cur_alloc); if (atomic_read(&res->cnt_borrowed) > 0) { atomic_dec(&res->cnt_borrowed); @@ -249,13 +243,10 @@ static void numtasks_res_free(void *my_res) parres = ckrm_get_res_class(res->parent, resid, ckrm_numtasks_t); - if (unlikely(atomic_read(&res->cnt_cur_alloc) < 0)) { - printk(KERN_WARNING "numtasks_res: counter below 0\n"); - } - if (unlikely(atomic_read(&res->cnt_cur_alloc) > 0 || - atomic_read(&res->cnt_borrowed) > 0)) { - printk(KERN_WARNING "numtasks_res_free: resource still " - "alloc'd %p\n", res); + if (unlikely(atomic_read(&res->cnt_cur_alloc) != 0 || + atomic_read(&res->cnt_borrowed))) { + printk(KERN_ERR + "numtasks_res_free: resource still alloc'd %p\n", res); if ((borrowed = atomic_read(&res->cnt_borrowed)) > 0) { for (i = 0; i < borrowed; i++) { numtasks_put_ref_local(parres->core); @@ -307,9 +298,9 @@ recalc_and_propagate(ckrm_numtasks_t * res, ckrm_numtasks_t * parres) if (parres->cnt_guarantee == CKRM_SHARE_DONTCARE) { res->cnt_guarantee = CKRM_SHARE_DONTCARE; } else if (par->total_guarantee) { - u64 temp = (u64) self->my_guarantee * parres->cnt_guarantee; - do_div(temp, par->total_guarantee); - res->cnt_guarantee = (int) temp; + res->cnt_guarantee = + (self->my_guarantee * parres->cnt_guarantee) + / par->total_guarantee; } else { res->cnt_guarantee = 0; } @@ -317,9 +308,8 @@ recalc_and_propagate(ckrm_numtasks_t * res, ckrm_numtasks_t * parres) if (parres->cnt_limit == CKRM_SHARE_DONTCARE) { res->cnt_limit = CKRM_SHARE_DONTCARE; } else if (par->max_limit) { - u64 temp = (u64) self->my_limit * parres->cnt_limit; - do_div(temp, par->max_limit); - res->cnt_limit = (int) temp; + res->cnt_limit = (self->my_limit * parres->cnt_limit) + / par->max_limit; } else { res->cnt_limit = 0; } @@ -328,9 +318,9 @@ recalc_and_propagate(ckrm_numtasks_t * res, ckrm_numtasks_t * parres) if (res->cnt_guarantee == CKRM_SHARE_DONTCARE) { res->cnt_unused = CKRM_SHARE_DONTCARE; } else if (self->total_guarantee) { - u64 temp = (u64) self->unused_guarantee * res->cnt_guarantee; - do_div(temp, self->total_guarantee); - res->cnt_unused = (int) temp; + res->cnt_unused = (self->unused_guarantee * + res->cnt_guarantee) / + self->total_guarantee; } else { res->cnt_unused = 0; } @@ -376,9 +366,9 @@ static int numtasks_set_share_values(void *my_res, struct ckrm_shares *new) if (parres->cnt_guarantee == CKRM_SHARE_DONTCARE) { parres->cnt_unused = CKRM_SHARE_DONTCARE; } else if (par->total_guarantee) { - u64 temp = (u64) par->unused_guarantee * parres->cnt_guarantee; - 
do_div(temp, par->total_guarantee); - parres->cnt_unused = (int) temp; + parres->cnt_unused = (par->unused_guarantee * + parres->cnt_guarantee) / + par->total_guarantee; } else { parres->cnt_unused = 0; } @@ -425,11 +415,10 @@ static int numtasks_get_stats(void *my_res, struct seq_file *sfile) #ifdef NUMTASKS_DEBUG seq_printf(sfile, "cur_alloc %d; borrowed %d; cnt_guar %d; cnt_limit %d " - "cnt_unused %d, unused_guarantee %d, cur_max_limit %d\n", + "unused_guarantee %d, cur_max_limit %d\n", atomic_read(&res->cnt_cur_alloc), atomic_read(&res->cnt_borrowed), res->cnt_guarantee, - res->cnt_limit, res->cnt_unused, - res->shares.unused_guarantee, + res->cnt_limit, res->shares.unused_guarantee, res->shares.cur_max_limit); #endif @@ -453,7 +442,7 @@ static int numtasks_set_config(void *my_res, const char *cfgstr) if (!res) return -EINVAL; - printk(KERN_DEBUG "numtasks config='%s'\n", cfgstr); + printk("numtasks config='%s'\n", cfgstr); return 0; } @@ -505,7 +494,7 @@ int __init init_ckrm_numtasks_res(void) if (resid == -1) { resid = ckrm_register_res_ctlr(clstype, &numtasks_rcbs); - printk(KERN_DEBUG "........init_ckrm_numtasks_res -> %d\n", resid); + printk("........init_ckrm_numtasks_res -> %d\n", resid); if (resid != -1) { ckrm_numtasks_register(numtasks_get_ref_local, numtasks_put_ref_local); diff --git a/kernel/ckrm/ckrm_numtasks_stub.c b/kernel/ckrm/ckrm_tasks_stub.c similarity index 100% rename from kernel/ckrm/ckrm_numtasks_stub.c rename to kernel/ckrm/ckrm_tasks_stub.c diff --git a/kernel/ckrm/ckrm_tc.c b/kernel/ckrm/ckrm_tc.c index af95644f2..316266494 100644 --- a/kernel/ckrm/ckrm_tc.c +++ b/kernel/ckrm/ckrm_tc.c @@ -318,7 +318,7 @@ static void cb_taskclass_fork(struct task_struct *tsk) ckrm_task_unlock(tsk->parent); } if (!list_empty(&tsk->taskclass_link)) - printk(KERN_WARNING "BUG in cb_fork.. tsk (%s:%d> already linked\n", + printk("BUG in cb_fork.. tsk (%s:%d> already linked\n", tsk->comm, tsk->pid); ckrm_set_taskclass(tsk, cls, NULL, CKRM_EVENT_FORK); @@ -397,7 +397,7 @@ DECLARE_MUTEX(async_serializer); // serialize all async functions * We use a hybrid by comparing ratio nr_threads/pidmax */ -static int ckrm_reclassify_all_tasks(void) +static void ckrm_reclassify_all_tasks(void) { extern int pid_max; @@ -407,11 +407,6 @@ static int ckrm_reclassify_all_tasks(void) int ratio; int use_bitmap; - /* Check permissions */ - if ((!capable(CAP_SYS_NICE)) && (!capable(CAP_SYS_RESOURCE))) { - return -EPERM; - } - ratio = curpidmax / nr_threads; if (curpidmax <= PID_MAX_DEFAULT) { use_bitmap = 1; @@ -422,7 +417,6 @@ static int ckrm_reclassify_all_tasks(void) ce_protect(&CT_taskclass); retry: - if (use_bitmap == 0) { // go through it in one walk read_lock(&tasklist_lock); @@ -496,13 +490,40 @@ static int ckrm_reclassify_all_tasks(void) } else { read_unlock(&tasklist_lock); } - pos++; } } } ce_release(&CT_taskclass); - return 0; +} + +int ckrm_reclassify(int pid) +{ + struct task_struct *tsk; + int rc = 0; + + down(&async_serializer); // protect again race condition + if (pid < 0) { + // do we want to treat this as process group .. 
should YES ToDo + rc = -EINVAL; + } else if (pid == 0) { + // reclassify all tasks in the system + ckrm_reclassify_all_tasks(); + } else { + // reclassify particular pid + read_lock(&tasklist_lock); + if ((tsk = find_task_by_pid(pid)) != NULL) { + get_task_struct(tsk); + read_unlock(&tasklist_lock); + CE_CLASSIFY_TASK_PROTECT(CKRM_EVENT_RECLASSIFY, tsk); + put_task_struct(tsk); + } else { + read_unlock(&tasklist_lock); + rc = -EINVAL; + } + } + up(&async_serializer); + return rc; } /* @@ -525,7 +546,7 @@ static void ckrm_reclassify_class_tasks(struct ckrm_task_class *cls) atomic_read(&cls->core.hnode.parent->refcnt)); // If no CE registered for this classtype, following will be needed // repeatedly; - ce_regd = atomic_read(&class_core(cls)->classtype->ce_regd); + ce_regd = class_core(cls)->classtype->ce_regd; cnode = &(class_core(cls)->hnode); parcls = class_type(ckrm_task_class_t, cnode->parent); @@ -574,21 +595,20 @@ static void ckrm_reclassify_class_tasks(struct ckrm_task_class *cls) } /* - * Change the core class of the given task + * Change the core class of the given task. */ int ckrm_forced_reclassify_pid(pid_t pid, struct ckrm_task_class *cls) { struct task_struct *tsk; - if (cls && !ckrm_validate_and_grab_core(class_core(cls))) + if (!ckrm_validate_and_grab_core(class_core(cls))) return -EINVAL; read_lock(&tasklist_lock); if ((tsk = find_task_by_pid(pid)) == NULL) { read_unlock(&tasklist_lock); - if (cls) - ckrm_core_drop(class_core(cls)); + ckrm_core_drop(class_core(cls)); return -EINVAL; } get_task_struct(tsk); @@ -597,21 +617,19 @@ int ckrm_forced_reclassify_pid(pid_t pid, struct ckrm_task_class *cls) /* Check permissions */ if ((!capable(CAP_SYS_NICE)) && (!capable(CAP_SYS_RESOURCE)) && (current->user != tsk->user)) { - if (cls) - ckrm_core_drop(class_core(cls)); + ckrm_core_drop(class_core(cls)); put_task_struct(tsk); return -EPERM; } - ce_protect(&CT_taskclass); - if (cls == NULL) - CE_CLASSIFY_TASK(CKRM_EVENT_RECLASSIFY,tsk); - else - ckrm_set_taskclass(tsk, cls, NULL, CKRM_EVENT_MANUAL); + down(&async_serializer); // protect again race condition + ce_protect(&CT_taskclass); + ckrm_set_taskclass(tsk, cls, NULL, CKRM_EVENT_MANUAL); ce_release(&CT_taskclass); put_task_struct(tsk); + up(&async_serializer); return 0; } @@ -669,7 +687,7 @@ static int ckrm_free_task_class(struct ckrm_core_class *core) void __init ckrm_meta_init_taskclass(void) { - printk(KERN_DEBUG "...... Initializing ClassType<%s> ........\n", + printk("...... Initializing ClassType<%s> ........\n", CT_taskclass.name); // intialize the default class ckrm_init_core_class(&CT_taskclass, class_core(&taskclass_dflt_class), @@ -703,25 +721,16 @@ static int tc_forced_reclassify(struct ckrm_core_class *target, const char *obj) pid_t pid; int rc = -EINVAL; - pid = (pid_t) simple_strtol(obj, NULL, 0); - - down(&async_serializer); // protect again race condition with reclassify_class - if (pid < 0) { - // do we want to treat this as process group .. TBD - rc = -EINVAL; - } else if (pid == 0) { - rc = (target == NULL) ? 
ckrm_reclassify_all_tasks() : -EINVAL; - } else { - struct ckrm_task_class *cls = NULL; - if (target) - cls = class_type(ckrm_task_class_t,target); - rc = ckrm_forced_reclassify_pid(pid,cls); + pid = (pid_t) simple_strtoul(obj, NULL, 10); + if (pid > 0) { + rc = ckrm_forced_reclassify_pid(pid, + class_type(ckrm_task_class_t, + target)); } - up(&async_serializer); return rc; } -#if 0 +#if 1 /****************************************************************************** * Debugging Task Classes: Utility functions @@ -737,7 +746,7 @@ void check_tasklist_sanity(struct ckrm_task_class *cls) class_lock(core); if (list_empty(&core->objlist)) { class_lock(core); - printk(KERN_DEBUG "check_tasklist_sanity: class %s empty list\n", + printk("check_tasklist_sanity: class %s empty list\n", core->name); return; } @@ -746,14 +755,14 @@ void check_tasklist_sanity(struct ckrm_task_class *cls) container_of(lh1, struct task_struct, taskclass_link); if (count++ > 20000) { - printk(KERN_WARNING "list is CORRUPTED\n"); + printk("list is CORRUPTED\n"); break; } if (tsk->taskclass != cls) { const char *tclsname; tclsname = (tsk->taskclass) ? class_core(tsk->taskclass)->name:"NULL"; - printk(KERN_WARNING "sanity: task %s:%d has ckrm_core " + printk("sanity: task %s:%d has ckrm_core " "|%s| but in list |%s|\n", tsk->comm, tsk->pid, tclsname, core->name); } @@ -767,7 +776,7 @@ void ckrm_debug_free_task_class(struct ckrm_task_class *tskcls) struct task_struct *proc, *thread; int count = 0; - printk(KERN_DEBUG "Analyze Error <%s> %d\n", + printk("Analyze Error <%s> %d\n", class_core(tskcls)->name, atomic_read(&(class_core(tskcls)->refcnt))); @@ -779,7 +788,7 @@ void ckrm_debug_free_task_class(struct ckrm_task_class *tskcls) const char *tclsname; tclsname = (thread->taskclass) ? 
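/*
 * Summary of the reclassification entry points as they stand after the
 * hunks above:
 *
 *   ckrm_reclassify(pid), new helper, serialized by async_serializer:
 *     pid  < 0  -> -EINVAL (process-group handling left as a TODO)
 *     pid == 0  -> ckrm_reclassify_all_tasks()
 *     pid  > 0  -> CE_CLASSIFY_TASK_PROTECT(CKRM_EVENT_RECLASSIFY, tsk)
 *
 *   tc_forced_reclassify(target, obj), the rcfs write handler, now
 *     parses obj with simple_strtoul() and only accepts pid > 0,
 *     forwarding to ckrm_forced_reclassify_pid(pid, class of target);
 *     the old pid == 0 / NULL-target path is gone from this handler.
 *
 * ckrm_forced_reclassify_pid() now requires a non-NULL class (its
 * cls == NULL branches are removed) and takes async_serializer itself.
 * The CAP_SYS_NICE/CAP_SYS_RESOURCE check formerly done in
 * ckrm_reclassify_all_tasks() is dropped; only the per-pid path in
 * ckrm_forced_reclassify_pid() still checks capabilities.
 */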
class_core(thread->taskclass)->name :"NULL"; - printk(KERN_DEBUG "%d thread=<%s:%d> -> <%s> <%lx>\n", count, + printk("%d thread=<%s:%d> -> <%s> <%lx>\n", count, thread->comm, thread->pid, tclsname, thread->flags & PF_EXITING); } @@ -787,7 +796,7 @@ void ckrm_debug_free_task_class(struct ckrm_task_class *tskcls) class_unlock(class_core(tskcls)); read_unlock(&tasklist_lock); - printk(KERN_DEBUG "End Analyze Error <%s> %d\n", + printk("End Analyze Error <%s> %d\n", class_core(tskcls)->name, atomic_read(&(class_core(tskcls)->refcnt))); } diff --git a/kernel/ckrm/ckrmutils.c b/kernel/ckrm/ckrmutils.c index d54e7b563..c56a2ae1c 100644 --- a/kernel/ckrm/ckrmutils.c +++ b/kernel/ckrm/ckrmutils.c @@ -96,6 +96,7 @@ void child_maxlimit_changed(struct ckrm_shares *parent, int new_limit) return; } + /* * Caller is responsible for holding any lock to protect the data * structures passed to this function @@ -110,18 +111,26 @@ set_shares(struct ckrm_shares *new, struct ckrm_shares *cur, // Check total_guarantee for correctness if (new->total_guarantee <= CKRM_SHARE_DONTCARE) { + printk(KERN_ERR "new->total_guarantee %d <= CKRM_SHARE_DONTCARE\n", + new->total_guarantee); goto set_share_err; } else if (new->total_guarantee == CKRM_SHARE_UNCHANGED) { ; // do nothing } else if (cur_usage_guar > new->total_guarantee) { + printk(KERN_ERR "cur_usage_guar %d > new->total_guarantee %d\n", + cur_usage_guar,new->total_guarantee); goto set_share_err; } // Check max_limit for correctness if (new->max_limit <= CKRM_SHARE_DONTCARE) { + printk(KERN_ERR "new->max_limit %d <= CKRM_SHARE_DONTCARE\n", + new->max_limit); goto set_share_err; } else if (new->max_limit == CKRM_SHARE_UNCHANGED) { ; // do nothing } else if (cur->cur_max_limit > new->max_limit) { + printk(KERN_ERR "cur->cur_max_limit %d > new->max_limit %d\n", + cur->cur_max_limit, new->max_limit); goto set_share_err; } // Check my_guarantee for correctness @@ -130,6 +139,8 @@ set_shares(struct ckrm_shares *new, struct ckrm_shares *cur, } else if (new->my_guarantee == CKRM_SHARE_DONTCARE) { ; // do nothing } else if (par && increase_by > par->unused_guarantee) { + printk(KERN_ERR "increase_by %d > par->unused_guarantee %d\n", + increase_by, par->unused_guarantee); goto set_share_err; } // Check my_limit for correctness @@ -139,6 +150,8 @@ set_shares(struct ckrm_shares *new, struct ckrm_shares *cur, ; // do nothing } else if (par && new->my_limit > par->max_limit) { // I can't get more limit than my parent's limit + printk(KERN_ERR "new->my_limit %d > par->max_limit %d\n", + new->my_limit,par->max_limit); goto set_share_err; } @@ -152,6 +165,8 @@ set_shares(struct ckrm_shares *new, struct ckrm_shares *cur, ; // do nothing earlier setting would've // taken care of it } else if (new->my_guarantee > cur->my_limit) { + printk(KERN_ERR "new->my_guarantee %d > cur->my_limit %d\n", + new->my_guarantee,par->max_limit); goto set_share_err; } } else { // new->my_limit has a valid value @@ -159,9 +174,13 @@ set_shares(struct ckrm_shares *new, struct ckrm_shares *cur, ; // do nothing } else if (new->my_guarantee == CKRM_SHARE_UNCHANGED) { if (cur->my_guarantee > new->my_limit) { + printk(KERN_ERR "cur->my_guarantee %d > new->my_limit %d\n", + cur->my_guarantee,new->my_limit); goto set_share_err; } } else if (new->my_guarantee > new->my_limit) { + printk(KERN_ERR "new->my_guarantee %d > new->my_limit %d\n", + new->my_guarantee,new->my_limit); goto set_share_err; } } diff --git a/kernel/ckrm/rbce/bitvector.h b/kernel/ckrm/rbce/bitvector.h index 098cc2327..4f53f9847 100644 --- 
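/*
 * The KERN_ERR diagnostics added to set_shares() above report which
 * validation step rejected the new share values.  One apparent slip
 * worth flagging: in the "new->my_guarantee > cur->my_limit" branch the
 * format string names cur->my_limit, but the argument actually passed
 * is par->max_limit, so that message would print the wrong value.
 */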
a/kernel/ckrm/rbce/bitvector.h +++ b/kernel/ckrm/rbce/bitvector.h @@ -136,12 +136,12 @@ inline static void bitvector_print(int flag, bitvector_t * vec) return; } if (vec == NULL) { - printk(KERN_DEBUG "v<0>-NULL\n"); + printk("v<0>-NULL\n"); return; } - printk(KERN_DEBUG "v<%d>-", sz = vec->size); + printk("v<%d>-", sz = vec->size); for (i = 0; i < sz; i++) { - printk(KERN_DEBUG "%c", test_bit(i, vec->bits) ? '1' : '0'); + printk("%c", test_bit(i, vec->bits) ? '1' : '0'); } return; } diff --git a/kernel/ckrm/rbce/info.h b/kernel/ckrm/rbce/info.h index 7263b22e1..3bc13b519 100644 --- a/kernel/ckrm/rbce/info.h +++ b/kernel/ckrm/rbce/info.h @@ -1,6 +1,12 @@ static char *info = "1. Magic files\n" "\t|--rbce_info - read only file detailing how to setup and use RBCE.\n\n" + "\t|--rbce_reclassify - contains nothing. Writing a pid to it" + "reclassifies\n" + "\tthe given task according to the current set of rules.\n" + "\tWriting 0 to it reclassifies all tasks in the system according to the \n" + "\tsurrent set of rules. This is typically done by the user/sysadmin \n" + "\tafter changing/creating rules. \n\n" "\t|--rbce_state - determines whether RBCE is currently active" " or inactive.\n" "\tWriting 1 (0) activates (deactivates) the CE. Reading the file\n" diff --git a/kernel/ckrm/rbce/rbce_fs.c b/kernel/ckrm/rbce/rbce_fs.c index 187e7cdba..bb92fb94c 100644 --- a/kernel/ckrm/rbce/rbce_fs.c +++ b/kernel/ckrm/rbce/rbce_fs.c @@ -1,26 +1,6 @@ -/* RCFS API for Rule-based Classification Engine (RBCE) and - * Consolidated RBCE module code (combined) - * - * Copyright (C) Hubertus Franke, IBM Corp. 2003 - * (C) Chandra Seetharaman, IBM Corp. 2003 - * (C) Vivek Kashyap, IBM Corp. 2004 - * - * Module for loading of classification policies and providing - * a user API for Class-based Kernel Resource Management (CKRM) - * - * Latest version, more details at http://ckrm.sf.net - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it would be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. - * +/* + * This file is released under the GPL. 
*/ - #include #include #include @@ -74,6 +54,12 @@ rbce_write(struct file *file, const char __user * buf, if (*ptr == '\n') { *ptr = '\0'; } +#if 0 + if (!strcmp(file->f_dentry->d_name.name, "rbce_reclassify")) { + pid = simple_strtol(line, NULL, 0); + rc = reclassify_pid(pid); + } else +#endif if (!strcmp(file->f_dentry->d_name.name, "rbce_tag")) { pid = simple_strtol(line, &ptr, 0); rc = set_tasktag(pid, ptr + 1); // expected syntax "pid tag" @@ -101,7 +87,8 @@ static int rbce_show(struct seq_file *seq, void *offset) char result[256]; memset(result, 0, 256); - if (!strcmp(file->f_dentry->d_name.name, "rbce_tag")) { + if (!strcmp(file->f_dentry->d_name.name, "rbce_reclassify") || + !strcmp(file->f_dentry->d_name.name, "rbce_tag")) { return -EPERM; } if (!strcmp(file->f_dentry->d_name.name, "rbce_state")) { @@ -130,7 +117,8 @@ static int rbce_close(struct inode *ino, struct file *file) { const char *name = file->f_dentry->d_name.name; - if (strcmp(name, "rbce_state") && + if (strcmp(name, "rbce_reclassify") && + strcmp(name, "rbce_state") && strcmp(name, "rbce_tag") && strcmp(name, "rbce_info")) { if (!rule_exists(name)) { @@ -304,9 +292,11 @@ rbce_create(struct inode *dir, struct dentry *dentry, struct dentry *pd = list_entry(dir->i_dentry.next, struct dentry, d_alias); - // Under /ce only "rbce_state", "rbce_tag" and "rbce_info" are allowed + // Under /ce only "rbce_reclassify", "rbce_state", "rbce_tag" and + // "rbce_info" are allowed if (!strcmp(pd->d_name.name, "ce")) { - if (strcmp(dentry->d_name.name, "rbce_state") && + if (strcmp(dentry->d_name.name, "rbce_reclassify") && + strcmp(dentry->d_name.name, "rbce_state") && strcmp(dentry->d_name.name, "rbce_tag") && strcmp(dentry->d_name.name, "rbce_info")) { return -EINVAL; @@ -329,7 +319,7 @@ rbce_symlink(struct inode *dir, struct dentry *dentry, const char *symname) /******************************* Magic files ********************/ -#define RBCE_NR_MAGF 5 +#define RBCE_NR_MAGF 6 struct rcfs_magf rbce_magf_files[RBCE_NR_MAGF] = { { .name = "ce", @@ -351,6 +341,11 @@ struct rcfs_magf rbce_magf_files[RBCE_NR_MAGF] = { .mode = RCFS_DEFAULT_FILE_MODE, .i_fop = &rbce_file_operations, }, + { + .name = "rbce_reclassify", + .mode = RCFS_DEFAULT_FILE_MODE, + .i_fop = &rbce_file_operations, + }, { .name = "rules", .mode = (RCFS_DEFAULT_DIR_MODE | S_IWUSR), @@ -422,7 +417,7 @@ static struct inode_operations rbce_dir_inode_operations = { static void rbce_put_super(struct super_block *sb) { module_put(THIS_MODULE); - printk(KERN_DEBUG "rbce_put_super called\n"); + printk("rbce_put_super called\n"); } static struct super_operations rbce_ops = { diff --git a/kernel/ckrm/rbce/rbcemod.c b/kernel/ckrm/rbce/rbcemod.c index 555ba0a4e..fa8d2c470 100644 --- a/kernel/ckrm/rbce/rbcemod.c +++ b/kernel/ckrm/rbce/rbcemod.c @@ -1,5 +1,4 @@ -/* Rule-based Classification Engine (RBCE) and - * Consolidated RBCE module code (combined) +/* Rule-based Classification Engine (RBCE) module * * Copyright (C) Hubertus Franke, IBM Corp. 2003 * (C) Chandra Seetharaman, IBM Corp. 2003 @@ -15,10 +14,6 @@ * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * - * This program is distributed in the hope that it would be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
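/*
 * Plumbing for the new rbce_reclassify magic file, as wired up above:
 * info.h documents it (writing a pid reclassifies that task against the
 * current rules; writing 0 reclassifies every task, typically after
 * rule changes), rbce_magf_files[] grows from 5 to 6 entries so the
 * file is created under the ce directory, and rbce_show(), rbce_close()
 * and rbce_create() treat it like the other magic files.  The write
 * handler that would call reclassify_pid() is still compiled out with
 * "#if 0" in rbce_write(), so writes currently fall through to the
 * rbce_tag/rule handling.
 */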
- * */ /* Changes @@ -54,7 +49,7 @@ #include #include #include "bitvector.h" -#include +#include "rbce.h" #define DEBUG @@ -179,8 +174,6 @@ int termop_2_vecidx[RBCE_RULE_INVALID] = { #define POLICY_ACTION_REDO_ALL 0x02 // Recompute all rule flags #define POLICY_ACTION_PACK_TERMS 0x04 // Time to pack the terms -const int use_persistent_state = 1; - struct ckrm_eng_callback ckrm_ecbs; // Term vector state @@ -254,7 +247,7 @@ int rbcedebug = 0x00; #define DBG_RULE ( 0x20 ) #define DBG_POLICY ( 0x40 ) -#define DPRINTK(x, y...) if (rbcedebug & (x)) printk(KERN_DEBUG y) +#define DPRINTK(x, y...) if (rbcedebug & (x)) printk(y) // debugging selectively enabled through /proc/sys/debug/rbce static void print_context_vectors(void) @@ -265,9 +258,9 @@ static void print_context_vectors(void) return; } for (i = 0; i < NUM_TERM_MASK_VECTOR; i++) { - printk(KERN_DEBUG "%d: ", i); + printk("%d: ", i); bitvector_print(DBG_OPTIMIZATION, gl_mask_vecs[i]); - printk(KERN_DEBUG "\n"); + printk("\n"); } } #else @@ -506,7 +499,7 @@ rbce_class_deletecb(const char *classname, void *classobj, int classtype) } notify_class_action(cls, 0); cls->classobj = NULL; - list_for_each_entry(pos, &rules_list[classtype], link) { + list_for_each_entry(pos, &rules_list[cls->classtype], link) { rule = (struct rbce_rule *)pos; if (rule->target_class) { if (!strcmp @@ -517,6 +510,7 @@ rbce_class_deletecb(const char *classname, void *classobj, int classtype) } } } + put_class(cls); if ((cls = find_class_name(classname)) != NULL) { printk(KERN_ERR "rbce ERROR: class %s exists in rbce after " @@ -1343,49 +1337,65 @@ int rule_exists(const char *rname) static struct rbce_private_data *create_private_data(struct rbce_private_data *, int); -static inline -void reset_evaluation(struct rbce_private_data *pdata,int termflag) +int rbce_ckrm_reclassify(int pid) { - /* reset TAG ruleterm evaluation results to pick up - * on next classification event - */ - if (use_persistent_state && gl_mask_vecs[termflag]) { - bitvector_and_not( pdata->eval, pdata->eval, - gl_mask_vecs[termflag] ); - bitvector_and_not( pdata->true, pdata->true, - gl_mask_vecs[termflag] ); - } + printk("ckrm_reclassify_pid ignored\n"); + return -EINVAL; +} + +int reclassify_pid(int pid) +{ + struct task_struct *tsk; + + // FIXME: Need to treat -pid as process group + if (pid < 0) { + return -EINVAL; + } + + if (pid == 0) { + rbce_ckrm_reclassify(0); // just reclassify all tasks. 
+ } + // if pid is +ve take control of the task, start evaluating it + if ((tsk = find_task_by_pid(pid)) == NULL) { + return -EINVAL; + } + + if (unlikely(!RBCE_DATA(tsk))) { + RBCE_DATAP(tsk) = create_private_data(NULL, 0); + if (!RBCE_DATA(tsk)) { + return -ENOMEM; + } + } + RBCE_DATA(tsk)->evaluate = 1; + rbce_ckrm_reclassify(pid); + return 0; } - + int set_tasktag(int pid, char *tag) { char *tp; - int rc = 0; struct task_struct *tsk; struct rbce_private_data *pdata; - int len; if (!tag) { return -EINVAL; } - len = strlen(tag) + 1; - tp = kmalloc(len, GFP_ATOMIC); - if (!tp) { - return -ENOMEM; - } - strncpy(tp,tag,len); - read_lock(&tasklist_lock); if ((tsk = find_task_by_pid(pid)) == NULL) { - rc = -EINVAL; - goto out; + return -EINVAL; + } + + tp = kmalloc(strlen(tag) + 1, GFP_ATOMIC); + + if (!tp) { + return -ENOMEM; } if (unlikely(!RBCE_DATA(tsk))) { RBCE_DATAP(tsk) = create_private_data(NULL, 0); if (!RBCE_DATA(tsk)) { - rc = -ENOMEM; - goto out; + kfree(tp); + return -ENOMEM; } } pdata = RBCE_DATA(tsk); @@ -1393,13 +1403,10 @@ int set_tasktag(int pid, char *tag) kfree(pdata->app_tag); } pdata->app_tag = tp; - reset_evaluation(pdata,RBCE_TERMFLAG_TAG); - - out: - read_unlock(&tasklist_lock); - if (rc != 0) - kfree(tp); - return rc; + strcpy(pdata->app_tag, tag); + rbce_ckrm_reclassify(pid); + + return 0; } /*====================== Classification Functions =======================*/ @@ -1816,7 +1823,7 @@ static inline int valid_pdata(struct rbce_private_data *pdata) } } spin_unlock(&pdata_lock); - printk(KERN_WARNING "INVALID/CORRUPT PDATA %p\n", pdata); + printk("INVALID/CORRUPT PDATA %p\n", pdata); return 0; } @@ -1829,7 +1836,7 @@ static inline void store_pdata(struct rbce_private_data *pdata) while (i < MAX_PDATA) { if (pdata_arr[pdata_next] == NULL) { - printk(KERN_DEBUG "storing %p at %d, count %d\n", pdata, + printk("storing %p at %d, count %d\n", pdata, pdata_next, pdata_count); pdata_arr[pdata_next++] = pdata; if (pdata_next == MAX_PDATA) { @@ -1844,7 +1851,7 @@ static inline void store_pdata(struct rbce_private_data *pdata) spin_unlock(&pdata_lock); } if (i == MAX_PDATA) { - printk(KERN_DEBUG "PDATA BUFFER FULL pdata_count %d pdata %p\n", + printk("PDATA BUFFER FULL pdata_count %d pdata %p\n", pdata_count, pdata); } } @@ -1856,7 +1863,7 @@ static inline void unstore_pdata(struct rbce_private_data *pdata) spin_lock(&pdata_lock); for (i = 0; i < MAX_PDATA; i++) { if (pdata_arr[i] == pdata) { - printk(KERN_DEBUG "unstoring %p at %d, count %d\n", pdata, + printk("unstoring %p at %d, count %d\n", pdata, i, pdata_count); pdata_arr[i] = NULL; pdata_count--; @@ -1866,7 +1873,7 @@ static inline void unstore_pdata(struct rbce_private_data *pdata) } spin_unlock(&pdata_lock); if (i == MAX_PDATA) { - printk(KERN_DEBUG "pdata %p not found in the stored array\n", + printk("pdata %p not found in the stored array\n", pdata); } } @@ -1881,6 +1888,8 @@ static inline void unstore_pdata(struct rbce_private_data *pdata) #endif // PDATA_DEBUG +const int use_persistent_state = 1; + /* * Allocate and initialize a rbce_private_data data structure. 
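/*
 * Note on the path added above: rbce_ckrm_reclassify() is only a stub
 * here (it prints "ckrm_reclassify_pid ignored" and returns -EINVAL),
 * so reclassify_pid() effectively just ensures the task has RBCE
 * private data and sets its evaluate flag; the pid == 0 branch falls
 * through into the per-pid lookup rather than returning.  The reworked
 * set_tasktag() also drops the tasklist_lock that the removed version
 * held around find_task_by_pid().
 */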
* @@ -1929,7 +1938,7 @@ static struct rbce_private_data *create_private_data(struct rbce_private_data // pdata->evaluate = src->evaluate; // if(src->app_tag) { // int len = strlen(src->app_tag)+1; - // printk(KERN_DEBUG "CREATE_PRIVATE: apptag %s len %d\n", + // printk("CREATE_PRIVATE: apptag %s len %d\n", // src->app_tag,len); // pdata->app_tag = kmalloc(len, GFP_ATOMIC); // if (pdata->app_tag) { @@ -2252,7 +2261,6 @@ void *rbce_tc_classify(enum ckrm_event event, ...) va_list args; void *cls = NULL; struct task_struct *tsk; - struct rbce_private_data *pdata; va_start(args, event); tsk = va_arg(args, struct task_struct *); @@ -2262,7 +2270,7 @@ void *rbce_tc_classify(enum ckrm_event event, ...) * [ CKRM_LATCHABLE_EVENTS .. CKRM_NONLATCHABLE_EVENTS ) */ - // printk(KERN_DEBUG "tc_classify %p:%d:%s '%s'\n",tsk,tsk->pid, + // printk("tc_classify %p:%d:%s '%s'\n",tsk,tsk->pid, // tsk->comm,event_names[event]); switch (event) { @@ -2307,14 +2315,11 @@ void *rbce_tc_classify(enum ckrm_event event, ...) break; case CKRM_EVENT_RECLASSIFY: - if ((pdata = (RBCE_DATA(tsk)))) { - pdata->evaluate = 1; - } cls = rbce_classify(tsk, NULL, RBCE_TERMFLAG_ALL, tc_classtype); break; } - // printk(KERN_DEBUG "tc_classify %p:%d:%s '%s' ==> %p\n",tsk,tsk->pid, + // printk("tc_classify %p:%d:%s '%s' ==> %p\n",tsk,tsk->pid, // tsk->comm,event_names[event],cls); return cls; @@ -2323,7 +2328,7 @@ void *rbce_tc_classify(enum ckrm_event event, ...) #ifndef RBCE_EXTENSION static void rbce_tc_notify(int event, void *core, struct task_struct *tsk) { - printk(KERN_DEBUG "tc_manual %p:%d:%s '%s'\n", tsk, tsk->pid, tsk->comm, + printk("tc_manual %p:%d:%s '%s'\n", tsk, tsk->pid, tsk->comm, event_names[event]); if (event != CKRM_EVENT_MANUAL) return; @@ -2402,40 +2407,38 @@ struct ce_regtable_struct ce_regtable[] = { {NULL} }; -static void unregister_classtype_engines(void) - { +static int register_classtype_engines(void) +{ int rc; struct ce_regtable_struct *ceptr = ce_regtable; while (ceptr->name) { - if (*ceptr->clsvar >= 0) { - printk(KERN_DEBUG "ce unregister with <%s>\n",ceptr->name); - while ((rc = ckrm_unregister_engine(ceptr->name)) == -EAGAIN) - ; - printk(KERN_DEBUG "ce unregister with <%s> rc=%d\n",ceptr->name,rc); - *ceptr->clsvar = -1; - } + rc = ckrm_register_engine(ceptr->name, ceptr->cbs); + printk("ce register with <%s> typeId=%d\n", ceptr->name, rc); + if ((rc < 0) && (rc != -ENOENT)) + return (rc); + if (rc != -ENOENT) + *ceptr->clsvar = rc; ceptr++; } - } + return 0; +} -static int register_classtype_engines(void) +static void unregister_classtype_engines(void) { int rc; struct ce_regtable_struct *ceptr = ce_regtable; while (ceptr->name) { - rc = ckrm_register_engine(ceptr->name, ceptr->cbs); - printk(KERN_DEBUG "ce register with <%s> typeId=%d\n",ceptr->name,rc); - if ((rc < 0) && (rc != -ENOENT)) { - unregister_classtype_engines(); - return (rc); + if (*ceptr->clsvar >= 0) { + printk("ce unregister with <%s>\n", ceptr->name); + rc = ckrm_unregister_engine(ceptr->name); + printk("ce unregister with <%s> rc=%d\n", ceptr->name, + rc); + *ceptr->clsvar = -1; } - if (rc != -ENOENT) - *ceptr->clsvar = rc; ceptr++; } - return 0; } // =========== /proc/sysctl/debug/rbce debug stuff ============= @@ -2506,7 +2509,7 @@ int init_rbce(void) { int rc, i, line; - printk(KERN_DEBUG "<1>\nInstalling \'%s\' module\n", modname); + printk("<1>\nInstalling \'%s\' module\n", modname); for (i = 0; i < CKRM_MAX_CLASSTYPES; i++) { INIT_LIST_HEAD(&rules_list[i]); @@ -2555,7 +2558,7 @@ int init_rbce(void) exit_rbce_ext(); out: - 
printk(KERN_DEBUG "<1>%s: error installing rc=%d line=%d\n", __FUNCTION__, rc, + printk("<1>%s: error installing rc=%d line=%d\n", __FUNCTION__, rc, line); return rc; } @@ -2564,19 +2567,19 @@ void exit_rbce(void) { int i; - printk(KERN_DEBUG "<1>Removing \'%s\' module\n", modname); + printk("<1>Removing \'%s\' module\n", modname); stop_debug(); exit_rbce_ext(); // Print warnings if lists are not empty, which is a bug if (!list_empty(&class_list)) { - printk(KERN_DEBUG "exit_rbce: Class list is not empty\n"); + printk("exit_rbce: Class list is not empty\n"); } for (i = 0; i < CKRM_MAX_CLASSTYPES; i++) { if (!list_empty(&rules_list[i])) { - printk(KERN_DEBUG "exit_rbce: Rules list for classtype %d" + printk("exit_rbce: Rules list for classtype %d" " is not empty\n", i); } } @@ -2594,6 +2597,7 @@ EXPORT_SYMBOL(rule_exists); EXPORT_SYMBOL(change_rule); EXPORT_SYMBOL(delete_rule); EXPORT_SYMBOL(rename_rule); +EXPORT_SYMBOL(reclassify_pid); EXPORT_SYMBOL(set_tasktag); module_init(init_rbce); diff --git a/kernel/ckrm/rbce/rbcemod_ext.c b/kernel/ckrm/rbce/rbcemod_ext.c index 3cae550f7..b7886ebf4 100644 --- a/kernel/ckrm/rbce/rbcemod_ext.c +++ b/kernel/ckrm/rbce/rbcemod_ext.c @@ -3,7 +3,7 @@ * Copyright (C) Hubertus Franke, IBM Corp. 2003 * * Extension to be included into RBCE to collect delay and sample information - * Requires user daemon e.g. crbcedmn to activate. + * requires user daemon to activate. * * Latest version, more details at http://ckrm.sf.net * @@ -12,13 +12,8 @@ * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * - * This program is distributed in the hope that it would be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. - * */ - /******************************************************************************* * * User-Kernel Communication Channel (UKCC) @@ -62,10 +57,10 @@ static int ukcc_fileop_notify(int rchan_id, { static int readers = 0; if (fileop == RELAY_FILE_OPEN) { - // printk(KERN_DEBUG "got fileop_notify RELAY_FILE_OPEN for file %p\n", + // printk("got fileop_notify RELAY_FILE_OPEN for file %p\n", // filp); if (readers) { - printk(KERN_DEBUG "only one client allowed, backoff .... \n"); + printk("only one client allowed, backoff .... \n"); return -EPERM; } if (!try_module_get(THIS_MODULE)) @@ -74,7 +69,7 @@ static int ukcc_fileop_notify(int rchan_id, client_attached(); } else if (fileop == RELAY_FILE_CLOSE) { - // printk(KERN_DEBUG "got fileop_notify RELAY_FILE_CLOSE for file %p\n", + // printk("got fileop_notify RELAY_FILE_CLOSE for file %p\n", // filp); client_detached(); readers--; @@ -109,10 +104,10 @@ static int create_ukcc_channel(void) channel_flags, &ukcc_callbacks, 0, 0, 0, 0, 0, 0, NULL, 0); if (ukcc_channel < 0) - printk(KERN_DEBUG "crbce: ukcc creation failed, errcode: %d\n", + printk("crbce: ukcc creation failed, errcode: %d\n", ukcc_channel); else - printk(KERN_DEBUG "crbce: ukcc created (%u KB)\n", + printk("crbce: ukcc created (%u KB)\n", UKCC_TOTAL_BUFFER_SIZE >> 10); return ukcc_channel; } @@ -144,9 +139,9 @@ static inline void close_ukcc_channel(void) (r),(l),-1,NULL) > 0); \ chan_state = chan_isok ? 
UKCC_OK : UKCC_STANDBY; \ if (chan_wasok && !chan_isok) { \ - printk(KERN_DEBUG "Channel stalled\n"); \ + printk("Channel stalled\n"); \ } else if (!chan_wasok && chan_isok) { \ - printk(KERN_DEBUG "Channel continues\n"); \ + printk("Channel continues\n"); \ } \ } while (0) @@ -288,7 +283,7 @@ send_task_record(struct task_struct *tsk, int event, return 0; pdata = RBCE_DATA(tsk); if (pdata == NULL) { - // printk(KERN_DEBUG "send [%d]<%s>: no pdata\n",tsk->pid,tsk->comm); + // printk("send [%d]<%s>: no pdata\n",tsk->pid,tsk->comm); return 0; } if (send_forced || (delta_mode == 0) @@ -384,7 +379,7 @@ static void send_task_data(void) rec_set_timehdr(&limrec, CRBCE_REC_DATA_DELIMITER, 0, 0); rec_send(&limrec); - // printk(KERN_DEBUG "send_task_data mode=%d t#=%d s#=%d\n", + // printk("send_task_data mode=%d t#=%d s#=%d\n", // delta_mode,taskcnt,sendcnt); } @@ -503,7 +498,7 @@ static void sample_task_data(unsigned long unused) } while_each_thread(proc, thread); read_unlock(&tasklist_lock); -// printk(KERN_DEBUG "sample_timer: run=%d wait=%d\n",run,wait); +// printk("sample_timer: run=%d wait=%d\n",run,wait); start_sample_timer(); } @@ -513,7 +508,7 @@ static void ukcc_cmd_deliver(int rchan_id, char *from, u32 len) struct crbce_cmd_done cmdret; int rc = 0; -// printk(KERN_DEBUG "ukcc_cmd_deliver: %d %d len=%d:%d\n",cmdrec->type, +// printk("ukcc_cmd_deliver: %d %d len=%d:%d\n",cmdrec->type, // cmdrec->cmd,cmdrec->len,len); cmdrec->len = len; // add this to reflection so the user doesn't @@ -578,20 +573,20 @@ static void ukcc_cmd_deliver(int rchan_id, char *from, u32 len) cmdret.hdr.cmd = cmdrec->cmd; cmdret.rc = rc; rec_send(&cmdret); -// printk(KERN_DEBUG "ukcc_cmd_deliver ACK: %d %d rc=%d %d\n",cmdret.hdr.type, +// printk("ukcc_cmd_deliver ACK: %d %d rc=%d %d\n",cmdret.hdr.type, // cmdret.hdr.cmd,rc,sizeof(cmdret)); } static void client_attached(void) { - printk(KERN_DEBUG "client [%d]<%s> attached to UKCC\n", current->pid, + printk("client [%d]<%s> attached to UKCC\n", current->pid, current->comm); relay_reset(ukcc_channel); } static void client_detached(void) { - printk(KERN_DEBUG "client [%d]<%s> detached to UKCC\n", current->pid, + printk("client [%d]<%s> detached to UKCC\n", current->pid, current->comm); chan_state = UKCC_STANDBY; stop_sample_timer(); diff --git a/kernel/ckrm/rbce/token.c b/kernel/ckrm/rbce/token.c index 32446fb2b..0ace80a50 100644 --- a/kernel/ckrm/rbce/token.c +++ b/kernel/ckrm/rbce/token.c @@ -1,24 +1,3 @@ -/* Tokens for Rule-based Classification Engine (RBCE) and - * Consolidated RBCE module code (combined) - * - * Copyright (C) Hubertus Franke, IBM Corp. 2003 - * (C) Chandra Seetharaman, IBM Corp. 2003 - * (C) Vivek Kashyap, IBM Corp. 2004 - * - * Latest version, more details at http://ckrm.sf.net - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it would be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
- * - * - */ - #include #include @@ -197,7 +176,7 @@ rules_parse(char *rule_defn, struct rbce_rule_term **rterms, int *term_mask) nterms = 0; while (*rp++) { - if (*rp == '>' || *rp == '<' || *rp == '=' || *rp == '!') { + if (*rp == '>' || *rp == '<' || *rp == '=') { nterms++; } } @@ -293,7 +272,7 @@ rules_parse(char *rule_defn, struct rbce_rule_term **rterms, int *term_mask) *term_mask = 0; } /* else { for (i = 0; i < nterms; i++) { - printk(KERN_DEBUG "token: i %d; op %d, operator %d, str %ld\n", + printk("token: i %d; op %d, operator %d, str %ld\n", i, terms[i].op, terms[i].operator, terms[i].u.id); } } */ diff --git a/kernel/ckrm_classqueue.c b/kernel/ckrm_classqueue.c index 0400844a3..1929aaf4e 100644 --- a/kernel/ckrm_classqueue.c +++ b/kernel/ckrm_classqueue.c @@ -133,42 +133,11 @@ void classqueue_update_prio(struct classqueue_struct *cq, //add to new positon, round robin for classes with same priority list_add_tail(&(node->list), &cq->array.queue[index]); - __set_bit(index, cq->array.bitmap); + __set_bit(index, cq->array.bitmap); + node->index = index; } -/** - *classqueue_get_min_prio: return the priority of the last node in queue - * - * this function can be called without runqueue lock held - */ -static inline int classqueue_get_min_prio(struct classqueue_struct *cq) -{ - cq_node_t *result = NULL; - int pos; - - /* - * search over the bitmap to get the first class in the queue - */ - pos = find_next_bit(cq->array.bitmap, CLASSQUEUE_SIZE, cq->base_offset); - //do circular search from the beginning - if (pos >= CLASSQUEUE_SIZE) - pos = find_first_bit(cq->array.bitmap, CLASSQUEUE_SIZE); - - if (pos < CLASSQUEUE_SIZE) { - result = list_entry(cq->array.queue[pos].next, cq_node_t, list); - if (list_empty(&cq->array.queue[pos])) - result = NULL; - } - if (result) - return result->prio; - else - return 0; -} - -/** - * this function must be called with runqueue lock held - */ cq_node_t *classqueue_get_head(struct classqueue_struct *cq) { cq_node_t *result = NULL; @@ -178,9 +147,9 @@ cq_node_t *classqueue_get_head(struct classqueue_struct *cq) * search over the bitmap to get the first class in the queue */ pos = find_next_bit(cq->array.bitmap, CLASSQUEUE_SIZE, cq->base_offset); - //do circular search from the beginning - if (pos >= CLASSQUEUE_SIZE) + if (pos >= CLASSQUEUE_SIZE) { //do circular search from the beginning pos = find_first_bit(cq->array.bitmap, CLASSQUEUE_SIZE); + } if (pos < CLASSQUEUE_SIZE) { BUG_ON(list_empty(&cq->array.queue[pos])); @@ -193,17 +162,15 @@ cq_node_t *classqueue_get_head(struct classqueue_struct *cq) * Moving the end of queue forward * the new_base here is logical, we need to translate to the abosule position */ -void classqueue_update_base(struct classqueue_struct *cq) +void classqueue_update_base(struct classqueue_struct *cq, int new_base) { - int new_base; - - if (! 
cq_nr_member(cq)) { + if (!cq_nr_member(cq)) { cq->base_offset = -1; //not defined return; } - new_base = classqueue_get_min_prio(cq); - + // assert(new_base >= cq->base); + if (new_base > cq->base) { cq->base_offset = get_index(cq, &new_base); cq->base = new_base; diff --git a/kernel/ckrm_sched.c b/kernel/ckrm_sched.c index 5142b2eaa..ba716d4c5 100644 --- a/kernel/ckrm_sched.c +++ b/kernel/ckrm_sched.c @@ -15,202 +15,57 @@ #include #include -rwlock_t class_list_lock = RW_LOCK_UNLOCKED; -LIST_HEAD(active_cpu_classes); // list of active cpu classes; anchor - -struct ckrm_cpu_class default_cpu_class_obj; - -struct ckrm_cpu_class * get_default_cpu_class(void) { - return (&default_cpu_class_obj); -} - /*******************************************************/ /* CVT Management */ /*******************************************************/ +#define CVT_WINDOW_SIZE (CLASSQUEUE_SIZE << CLASS_BONUS_RATE) +static CVT_t max_CVT = CVT_WINDOW_SIZE; -static inline void check_inactive_class(ckrm_lrq_t * lrq,CVT_t cur_cvt) +/* + * Also ensure that the classes global cvt is upgraded to the + * minimum CVT in the system, as a class might not have run for a while + */ +static void update_global_cvt(struct ckrm_cpu_class *cpu_class, int cpu) { + struct ckrm_local_runqueue *class_queue = + get_ckrm_local_runqueue(cpu_class, cpu); CVT_t min_cvt; - CVT_t bonus; - - //just a safty measure - if (unlikely(! cur_cvt)) - return; + CVT_t local_cvt_old = class_queue->local_cvt; -#ifndef INTERACTIVE_BONUS_SUPPORT -#warning "ACB taking out interactive bonus calculation" - bonus = 0; -#else - /* - * Always leaving a small bonus for inactive classes - * allows them to compete for cycles immediately when the become - * active. This should improve interactive behavior - */ - bonus = INTERACTIVE_BONUS(lrq); + spin_lock(&cvt_lock); + if (class_queue->uncounted_cvt) { + cpu_class->global_cvt += class_queue->uncounted_cvt; + class_queue->uncounted_cvt = 0; + } + min_cvt = max_CVT - CVT_WINDOW_SIZE; + if (cpu_class->global_cvt < min_cvt) + cpu_class->global_cvt = min_cvt; + else if (cpu_class->global_cvt > max_CVT) + max_CVT = cpu_class->global_cvt; + +/* update local cvt from global cvt*/ +#if 0 + class_queue->local_cvt = cpu_class->global_cvt; #endif + spin_unlock(&cvt_lock); - //cvt can't be negative - if (cur_cvt > bonus) - min_cvt = cur_cvt - bonus; - else - min_cvt = 0; - - if (lrq->local_cvt < min_cvt) { - CVT_t lost_cvt; - - lost_cvt = scale_cvt(min_cvt - lrq->local_cvt,lrq); - lrq->local_cvt = min_cvt; - - /* add what the class lost to its savings*/ - lrq->savings += lost_cvt; - if (lrq->savings > MAX_SAVINGS) - lrq->savings = MAX_SAVINGS; - } else if (lrq->savings) { - /* - *if a class saving and falling behind - * then start to use it saving in a leaking bucket way - */ - CVT_t savings_used; - - savings_used = scale_cvt((lrq->local_cvt - min_cvt),lrq); - if (savings_used > lrq->savings) - savings_used = lrq->savings; - - if (savings_used > SAVINGS_LEAK_SPEED) - savings_used = SAVINGS_LEAK_SPEED; - - BUG_ON(lrq->savings < savings_used); - lrq->savings -= savings_used; - unscale_cvt(savings_used,lrq); - BUG_ON(lrq->local_cvt < savings_used); -#ifndef CVT_SAVINGS_SUPPORT -#warning "ACB taking out cvt saving" -#else - lrq->local_cvt -= savings_used; -#endif - } + if (class_queue->local_cvt != local_cvt_old) + update_class_priority(class_queue); } /* - * return the max_cvt of all the classes - */ -static inline CVT_t get_max_cvt(int this_cpu) -{ - struct ckrm_cpu_class *clsptr; - ckrm_lrq_t * lrq; - CVT_t max_cvt; - - max_cvt 
= 0; - - /*update class time, at the same time get max_cvt */ - list_for_each_entry(clsptr, &active_cpu_classes, links) { - lrq = get_ckrm_lrq(clsptr, this_cpu); - if (lrq->local_cvt > max_cvt) - max_cvt = lrq->local_cvt; - } - - return max_cvt; -} - -/** - * update_class_cputime - updates cvt of inactive classes - * -- an inactive class shouldn't starve others when it comes back - * -- the cpu time it lost when it's inactive should be accumulated - * -- its accumulated saving should be compensated (in a leaky bucket fashion) - * * class_list_lock must have been acquired */ -void update_class_cputime(int this_cpu) +void update_global_cvts(int this_cpu) { struct ckrm_cpu_class *clsptr; - ckrm_lrq_t * lrq; - CVT_t cur_cvt; - - /* - * a class's local_cvt must not be significantly smaller than min_cvt - * of active classes otherwise, it will starve other classes when it - * is reactivated. - * - * Hence we keep all local_cvt's within a range of the min_cvt off - * all active classes (approximated by the local_cvt of the currently - * running class) and account for how many cycles where thus taken - * from an inactive class building a savings (not to exceed a few seconds) - * for a class to gradually make up upon reactivation, without - * starvation of other classes. - * - */ - cur_cvt = get_local_cur_cvt(this_cpu); + struct ckrm_local_runqueue *class_queue; - /* - * cur_cvt == 0 means the system is now idle - * in this case, we use max_cvt as cur_cvt - * max_cvt roughly represents the cvt of the class - * that has just finished running - * - * fairness wouldn't be a problem since we account for whatever lost in savings - * if the system is not busy, the system responsiveness is not a problem. - * still fine if the sytem is busy, but happened to be idle at this certain point - * since bias toward interactive classes (class priority) is a more important way to improve system responsiveness - */ - if (unlikely(! 
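/*
 * After these hunks the per-class global CVT is kept inside a sliding
 * window below the system-wide maximum:
 *
 *   CVT_WINDOW_SIZE = CLASSQUEUE_SIZE << CLASS_BONUS_RATE
 *   global_cvt(class) is raised to at least max_CVT - CVT_WINDOW_SIZE,
 *   and max_CVT is raised whenever a class's global_cvt exceeds it.
 *
 * Per the comment in the added code, this keeps a class that has been
 * idle for a long time from coming back with an arbitrarily small CVT
 * and starving the classes that kept running.  The savings and
 * interactive-bonus accounting visible in the removed lines is dropped
 * by this patch.
 */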
cur_cvt)) { - cur_cvt = get_max_cvt(this_cpu); - //return; - } - - /* - * - check the local cvt of all the classes - * - update total_ns received by the class - * - do a usage sampling for the whole class - */ + /*for each class*/ list_for_each_entry(clsptr, &active_cpu_classes, links) { - lrq = get_ckrm_lrq(clsptr, this_cpu); - - spin_lock(&clsptr->stat.stat_lock); - clsptr->stat.total_ns += lrq->uncounted_ns; - ckrm_sample_usage(clsptr); - spin_unlock(&clsptr->stat.stat_lock); - lrq->uncounted_ns = 0; - - check_inactive_class(lrq,cur_cvt); + update_global_cvt(clsptr, this_cpu); + class_queue = get_ckrm_local_runqueue(clsptr, this_cpu); + clsptr->stat.total_ns += class_queue->uncounted_ns; + class_queue->uncounted_ns = 0; } } - -/*******************************************************/ -/* PID load balancing stuff */ -/*******************************************************/ -#define PID_SAMPLE_T 32 -#define PID_KP 20 -#define PID_KI 60 -#define PID_KD 20 - -/** - * sample pid load periodically - */ -void ckrm_load_sample(ckrm_load_t* pid,int cpu) -{ - long load; - long err; - - if (jiffies % PID_SAMPLE_T) - return; - - adjust_local_weight(); - - load = ckrm_cpu_load(cpu); - err = load - pid->load_p; - pid->load_d = err; - pid->load_p = load; - pid->load_i *= 9; - pid->load_i += load; - pid->load_i /= 10; -} - -long pid_get_pressure(ckrm_load_t* ckrm_load, int local_group) -{ - long pressure; - pressure = ckrm_load->load_p * PID_KP; - pressure += ckrm_load->load_i * PID_KI; - pressure += ckrm_load->load_d * PID_KD; - pressure /= 100; - return pressure; -} diff --git a/kernel/exit.c b/kernel/exit.c index 60075cbb3..5bc8fff46 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -26,7 +26,6 @@ #include #include #include -#include #include #include @@ -524,12 +523,6 @@ static inline void __exit_mm(struct task_struct * tsk) task_lock(tsk); tsk->mm = NULL; up_read(&mm->mmap_sem); -#ifdef CONFIG_CKRM_RES_MEM - spin_lock(&mm->peertask_lock); - list_del_init(&tsk->mm_peers); - ckrm_mem_evaluate_mm(mm); - spin_unlock(&mm->peertask_lock); -#endif enter_lazy_tlb(mm, current); task_unlock(tsk); mmput(mm); @@ -866,6 +859,9 @@ asmlinkage NORET_TYPE void do_exit(long code) module_put(tsk->binfmt->module); tsk->exit_code = code; +#ifdef CONFIG_CKRM_TYPE_TASKCLASS + numtasks_put_ref(tsk->taskclass); +#endif exit_notify(tsk); #ifdef CONFIG_NUMA mpol_free(tsk->mempolicy); diff --git a/kernel/exit.c.orig b/kernel/exit.c.orig new file mode 100644 index 000000000..f53583e2b --- /dev/null +++ b/kernel/exit.c.orig @@ -0,0 +1,1192 @@ +/* + * linux/kernel/exit.c + * + * Copyright (C) 1991, 1992 Linus Torvalds + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +extern void sem_exit (void); +extern struct task_struct *child_reaper; + +int getrusage(struct task_struct *, int, struct rusage __user *); + +static void __unhash_process(struct task_struct *p) +{ + nr_threads--; + detach_pid(p, PIDTYPE_PID); + detach_pid(p, PIDTYPE_TGID); + if (thread_group_leader(p)) { + detach_pid(p, PIDTYPE_PGID); + detach_pid(p, PIDTYPE_SID); + if (p->pid) + __get_cpu_var(process_counts)--; + } + + REMOVE_LINKS(p); +} + +void release_task(struct task_struct * p) +{ + int zap_leader; + task_t *leader; + struct dentry *proc_dentry; + +repeat: + BUG_ON(p->state < TASK_ZOMBIE); + + atomic_dec(&p->user->processes); + 
spin_lock(&p->proc_lock); + proc_dentry = proc_pid_unhash(p); + write_lock_irq(&tasklist_lock); + if (unlikely(p->ptrace)) + __ptrace_unlink(p); + BUG_ON(!list_empty(&p->ptrace_list) || !list_empty(&p->ptrace_children)); + __exit_signal(p); + __exit_sighand(p); + __unhash_process(p); + + /* + * If we are the last non-leader member of the thread + * group, and the leader is zombie, then notify the + * group leader's parent process. (if it wants notification.) + */ + zap_leader = 0; + leader = p->group_leader; + if (leader != p && thread_group_empty(leader) && leader->state == TASK_ZOMBIE) { + BUG_ON(leader->exit_signal == -1); + do_notify_parent(leader, leader->exit_signal); + /* + * If we were the last child thread and the leader has + * exited already, and the leader's parent ignores SIGCHLD, + * then we are the one who should release the leader. + * + * do_notify_parent() will have marked it self-reaping in + * that case. + */ + zap_leader = (leader->exit_signal == -1); + } + + p->parent->cutime += p->utime + p->cutime; + p->parent->cstime += p->stime + p->cstime; + p->parent->cmin_flt += p->min_flt + p->cmin_flt; + p->parent->cmaj_flt += p->maj_flt + p->cmaj_flt; + p->parent->cnvcsw += p->nvcsw + p->cnvcsw; + p->parent->cnivcsw += p->nivcsw + p->cnivcsw; + sched_exit(p); + write_unlock_irq(&tasklist_lock); + spin_unlock(&p->proc_lock); + proc_pid_flush(proc_dentry); + release_thread(p); + put_task_struct(p); + + p = leader; + if (unlikely(zap_leader)) + goto repeat; +} + +/* we are using it only for SMP init */ + +void unhash_process(struct task_struct *p) +{ + struct dentry *proc_dentry; + + spin_lock(&p->proc_lock); + proc_dentry = proc_pid_unhash(p); + write_lock_irq(&tasklist_lock); + __unhash_process(p); + write_unlock_irq(&tasklist_lock); + spin_unlock(&p->proc_lock); + proc_pid_flush(proc_dentry); +} + +/* + * This checks not only the pgrp, but falls back on the pid if no + * satisfactory pgrp is found. I dunno - gdb doesn't work correctly + * without this... + */ +int session_of_pgrp(int pgrp) +{ + struct task_struct *p; + struct list_head *l; + struct pid *pid; + int sid = -1; + + read_lock(&tasklist_lock); + for_each_task_pid(pgrp, PIDTYPE_PGID, p, l, pid) + if (p->signal->session > 0) { + sid = p->signal->session; + goto out; + } + p = find_task_by_pid(pgrp); + if (p) + sid = p->signal->session; +out: + read_unlock(&tasklist_lock); + + return sid; +} + +/* + * Determine if a process group is "orphaned", according to the POSIX + * definition in 2.2.2.52. Orphaned process groups are not to be affected + * by terminal-generated stop signals. Newly orphaned process groups are + * to receive a SIGHUP and a SIGCONT. + * + * "I ask you, have you ever known what it is to be an orphan?" + */ +static int will_become_orphaned_pgrp(int pgrp, task_t *ignored_task) +{ + struct task_struct *p; + struct list_head *l; + struct pid *pid; + int ret = 1; + + for_each_task_pid(pgrp, PIDTYPE_PGID, p, l, pid) { + if (p == ignored_task + || p->state >= TASK_ZOMBIE + || p->real_parent->pid == 1) + continue; + if (process_group(p->real_parent) != pgrp + && p->real_parent->signal->session == p->signal->session) { + ret = 0; + break; + } + } + return ret; /* (sighing) "Often!" 
*/ +} + +int is_orphaned_pgrp(int pgrp) +{ + int retval; + + read_lock(&tasklist_lock); + retval = will_become_orphaned_pgrp(pgrp, NULL); + read_unlock(&tasklist_lock); + + return retval; +} + +static inline int has_stopped_jobs(int pgrp) +{ + int retval = 0; + struct task_struct *p; + struct list_head *l; + struct pid *pid; + + for_each_task_pid(pgrp, PIDTYPE_PGID, p, l, pid) { + if (p->state != TASK_STOPPED) + continue; + + /* If p is stopped by a debugger on a signal that won't + stop it, then don't count p as stopped. This isn't + perfect but it's a good approximation. */ + if (unlikely (p->ptrace) + && p->exit_code != SIGSTOP + && p->exit_code != SIGTSTP + && p->exit_code != SIGTTOU + && p->exit_code != SIGTTIN) + continue; + + retval = 1; + break; + } + return retval; +} + +/** + * reparent_to_init() - Reparent the calling kernel thread to the init task. + * + * If a kernel thread is launched as a result of a system call, or if + * it ever exits, it should generally reparent itself to init so that + * it is correctly cleaned up on exit. + * + * The various task state such as scheduling policy and priority may have + * been inherited from a user process, so we reset them to sane values here. + * + * NOTE that reparent_to_init() gives the caller full capabilities. + */ +void reparent_to_init(void) +{ + write_lock_irq(&tasklist_lock); + + ptrace_unlink(current); + /* Reparent to init */ + REMOVE_LINKS(current); + current->parent = child_reaper; + current->real_parent = child_reaper; + SET_LINKS(current); + + /* Set the exit signal to SIGCHLD so we signal init on exit */ + current->exit_signal = SIGCHLD; + + if ((current->policy == SCHED_NORMAL) && (task_nice(current) < 0)) + set_user_nice(current, 0); + /* cpus_allowed? */ + /* rt_priority? */ + /* signals? */ + security_task_reparent_to_init(current); + memcpy(current->rlim, init_task.rlim, sizeof(*(current->rlim))); + atomic_inc(&(INIT_USER->__count)); + switch_uid(INIT_USER); + + write_unlock_irq(&tasklist_lock); +} + +void __set_special_pids(pid_t session, pid_t pgrp) +{ + struct task_struct *curr = current; + + if (curr->signal->session != session) { + detach_pid(curr, PIDTYPE_SID); + curr->signal->session = session; + attach_pid(curr, PIDTYPE_SID, session); + } + if (process_group(curr) != pgrp) { + detach_pid(curr, PIDTYPE_PGID); + curr->signal->pgrp = pgrp; + attach_pid(curr, PIDTYPE_PGID, pgrp); + } +} + +void set_special_pids(pid_t session, pid_t pgrp) +{ + write_lock_irq(&tasklist_lock); + __set_special_pids(session, pgrp); + write_unlock_irq(&tasklist_lock); +} + +/* + * Let kernel threads use this to say that they + * allow a certain signal (since daemonize() will + * have disabled all of them by default). + */ +int allow_signal(int sig) +{ + if (sig < 1 || sig > _NSIG) + return -EINVAL; + + spin_lock_irq(¤t->sighand->siglock); + sigdelset(¤t->blocked, sig); + if (!current->mm) { + /* Kernel threads handle their own signals. 
+ Let the signal code know it'll be handled, so + that they don't get converted to SIGKILL or + just silently dropped */ + current->sighand->action[(sig)-1].sa.sa_handler = (void *)2; + } + recalc_sigpending(); + spin_unlock_irq(¤t->sighand->siglock); + return 0; +} + +EXPORT_SYMBOL(allow_signal); + +int disallow_signal(int sig) +{ + if (sig < 1 || sig > _NSIG) + return -EINVAL; + + spin_lock_irq(¤t->sighand->siglock); + sigaddset(¤t->blocked, sig); + recalc_sigpending(); + spin_unlock_irq(¤t->sighand->siglock); + return 0; +} + +EXPORT_SYMBOL(disallow_signal); + +/* + * Put all the gunge required to become a kernel thread without + * attached user resources in one place where it belongs. + */ + +void daemonize(const char *name, ...) +{ + va_list args; + struct fs_struct *fs; + sigset_t blocked; + + va_start(args, name); + vsnprintf(current->comm, sizeof(current->comm), name, args); + va_end(args); + + /* + * If we were started as result of loading a module, close all of the + * user space pages. We don't need them, and if we didn't close them + * they would be locked into memory. + */ + exit_mm(current); + + set_special_pids(1, 1); + current->signal->tty = NULL; + + /* Block and flush all signals */ + sigfillset(&blocked); + sigprocmask(SIG_BLOCK, &blocked, NULL); + flush_signals(current); + + /* Become as one with the init task */ + + exit_fs(current); /* current->fs->count--; */ + fs = init_task.fs; + current->fs = fs; + atomic_inc(&fs->count); + exit_files(current); + current->files = init_task.files; + atomic_inc(¤t->files->count); + + reparent_to_init(); +} + +EXPORT_SYMBOL(daemonize); + +static inline void close_files(struct files_struct * files) +{ + int i, j; + + j = 0; + for (;;) { + unsigned long set; + i = j * __NFDBITS; + if (i >= files->max_fdset || i >= files->max_fds) + break; + set = files->open_fds->fds_bits[j++]; + while (set) { + if (set & 1) { + struct file * file = xchg(&files->fd[i], NULL); + if (file) + filp_close(file, files); + } + i++; + set >>= 1; + } + } +} + +struct files_struct *get_files_struct(struct task_struct *task) +{ + struct files_struct *files; + + task_lock(task); + files = task->files; + if (files) + atomic_inc(&files->count); + task_unlock(task); + + return files; +} + +void fastcall put_files_struct(struct files_struct *files) +{ + if (atomic_dec_and_test(&files->count)) { + close_files(files); + /* + * Free the fd and fdset arrays if we expanded them. 
+ */ + if (files->fd != &files->fd_array[0]) + free_fd_array(files->fd, files->max_fds); + if (files->max_fdset > __FD_SETSIZE) { + free_fdset(files->open_fds, files->max_fdset); + free_fdset(files->close_on_exec, files->max_fdset); + } + kmem_cache_free(files_cachep, files); + } +} + +EXPORT_SYMBOL(put_files_struct); + +static inline void __exit_files(struct task_struct *tsk) +{ + struct files_struct * files = tsk->files; + + if (files) { + task_lock(tsk); + tsk->files = NULL; + task_unlock(tsk); + put_files_struct(files); + } +} + +void exit_files(struct task_struct *tsk) +{ + __exit_files(tsk); +} + +static inline void __put_fs_struct(struct fs_struct *fs) +{ + /* No need to hold fs->lock if we are killing it */ + if (atomic_dec_and_test(&fs->count)) { + dput(fs->root); + mntput(fs->rootmnt); + dput(fs->pwd); + mntput(fs->pwdmnt); + if (fs->altroot) { + dput(fs->altroot); + mntput(fs->altrootmnt); + } + kmem_cache_free(fs_cachep, fs); + } +} + +void put_fs_struct(struct fs_struct *fs) +{ + __put_fs_struct(fs); +} + +static inline void __exit_fs(struct task_struct *tsk) +{ + struct fs_struct * fs = tsk->fs; + + if (fs) { + task_lock(tsk); + tsk->fs = NULL; + task_unlock(tsk); + __put_fs_struct(fs); + } +} + +void exit_fs(struct task_struct *tsk) +{ + __exit_fs(tsk); +} + +EXPORT_SYMBOL_GPL(exit_fs); + +/* + * Turn us into a lazy TLB process if we + * aren't already.. + */ +static inline void __exit_mm(struct task_struct * tsk) +{ + struct mm_struct *mm = tsk->mm; + + mm_release(tsk, mm); + if (!mm) + return; + /* + * Serialize with any possible pending coredump. + * We must hold mmap_sem around checking core_waiters + * and clearing tsk->mm. The core-inducing thread + * will increment core_waiters for each thread in the + * group with ->mm != NULL. + */ + down_read(&mm->mmap_sem); + if (mm->core_waiters) { + up_read(&mm->mmap_sem); + down_write(&mm->mmap_sem); + if (!--mm->core_waiters) + complete(mm->core_startup_done); + up_write(&mm->mmap_sem); + + wait_for_completion(&mm->core_done); + down_read(&mm->mmap_sem); + } + atomic_inc(&mm->mm_count); + if (mm != tsk->active_mm) BUG(); + /* more a memory barrier than a real lock */ + task_lock(tsk); + tsk->mm = NULL; + up_read(&mm->mmap_sem); + enter_lazy_tlb(mm, current); + task_unlock(tsk); + mmput(mm); +} + +void exit_mm(struct task_struct *tsk) +{ + __exit_mm(tsk); +} + +EXPORT_SYMBOL(exit_mm); + +static inline void choose_new_parent(task_t *p, task_t *reaper, task_t *child_reaper) +{ + /* + * Make sure we're not reparenting to ourselves and that + * the parent is not a zombie. + */ + if (p == reaper || reaper->state >= TASK_ZOMBIE) + p->real_parent = child_reaper; + else + p->real_parent = reaper; + if (p->parent == p->real_parent) + BUG(); +} + +static inline void reparent_thread(task_t *p, task_t *father, int traced) +{ + /* We don't want people slaying init. */ + if (p->exit_signal != -1) + p->exit_signal = SIGCHLD; + p->self_exec_id++; + + if (p->pdeath_signal) + /* We already hold the tasklist_lock here. */ + group_send_sig_info(p->pdeath_signal, (void *) 0, p); + + /* Move the child from its dying parent to the new one. */ + if (unlikely(traced)) { + /* Preserve ptrace links if someone else is tracing this child. */ + list_del_init(&p->ptrace_list); + if (p->parent != p->real_parent) + list_add(&p->ptrace_list, &p->real_parent->ptrace_children); + } else { + /* If this child is being traced, then we're the one tracing it + * anyway, so let go of it. 
+ */ + p->ptrace = 0; + list_del_init(&p->sibling); + p->parent = p->real_parent; + list_add_tail(&p->sibling, &p->parent->children); + + /* If we'd notified the old parent about this child's death, + * also notify the new parent. + */ + if (p->state == TASK_ZOMBIE && p->exit_signal != -1 && + thread_group_empty(p)) + do_notify_parent(p, p->exit_signal); + } + + /* + * process group orphan check + * Case ii: Our child is in a different pgrp + * than we are, and it was the only connection + * outside, so the child pgrp is now orphaned. + */ + if ((process_group(p) != process_group(father)) && + (p->signal->session == father->signal->session)) { + int pgrp = process_group(p); + + if (will_become_orphaned_pgrp(pgrp, NULL) && has_stopped_jobs(pgrp)) { + __kill_pg_info(SIGHUP, (void *)1, pgrp); + __kill_pg_info(SIGCONT, (void *)1, pgrp); + } + } +} + +/* + * When we die, we re-parent all our children. + * Try to give them to another thread in our thread + * group, and if no such member exists, give it to + * the global child reaper process (ie "init") + */ +static inline void forget_original_parent(struct task_struct * father) +{ + struct task_struct *p, *reaper = father; + struct list_head *_p, *_n; + + reaper = father->group_leader; + if (reaper == father) + reaper = child_reaper; + + /* + * There are only two places where our children can be: + * + * - in our child list + * - in our ptraced child list + * + * Search them and reparent children. + */ + list_for_each_safe(_p, _n, &father->children) { + p = list_entry(_p,struct task_struct,sibling); + if (father == p->real_parent) { + choose_new_parent(p, reaper, child_reaper); + reparent_thread(p, father, 0); + } else { + ptrace_unlink (p); + if (p->state == TASK_ZOMBIE && p->exit_signal != -1 && + thread_group_empty(p)) + do_notify_parent(p, p->exit_signal); + } + } + list_for_each_safe(_p, _n, &father->ptrace_children) { + p = list_entry(_p,struct task_struct,ptrace_list); + choose_new_parent(p, reaper, child_reaper); + reparent_thread(p, father, 1); + } +} + +/* + * Send signals to all our closest relatives so that they know + * to properly mourn us.. + */ +static void exit_notify(struct task_struct *tsk) +{ + int state; + struct task_struct *t; + + ckrm_cb_exit(tsk); + + if (signal_pending(tsk) && !tsk->signal->group_exit + && !thread_group_empty(tsk)) { + /* + * This occurs when there was a race between our exit + * syscall and a group signal choosing us as the one to + * wake up. It could be that we are the only thread + * alerted to check for pending signals, but another thread + * should be woken now to take the signal since we will not. + * Now we'll wake all the threads in the group just to make + * sure someone gets all the pending signals. + */ + read_lock(&tasklist_lock); + spin_lock_irq(&tsk->sighand->siglock); + for (t = next_thread(tsk); t != tsk; t = next_thread(t)) + if (!signal_pending(t) && !(t->flags & PF_EXITING)) { + recalc_sigpending_tsk(t); + if (signal_pending(t)) + signal_wake_up(t, 0); + } + spin_unlock_irq(&tsk->sighand->siglock); + read_unlock(&tasklist_lock); + } + + write_lock_irq(&tasklist_lock); + + /* + * This does two things: + * + * A. Make init inherit all the child processes + * B. Check to see if any process groups have become orphaned + * as a result of our exiting, and if they have any stopped + * jobs, send them a SIGHUP and then a SIGCONT. 
(POSIX 3.2.2.2) + */ + + forget_original_parent(tsk); + BUG_ON(!list_empty(&tsk->children)); + + /* + * Check to see if any process groups have become orphaned + * as a result of our exiting, and if they have any stopped + * jobs, send them a SIGHUP and then a SIGCONT. (POSIX 3.2.2.2) + * + * Case i: Our father is in a different pgrp than we are + * and we were the only connection outside, so our pgrp + * is about to become orphaned. + */ + + t = tsk->real_parent; + + if ((process_group(t) != process_group(tsk)) && + (t->signal->session == tsk->signal->session) && + will_become_orphaned_pgrp(process_group(tsk), tsk) && + has_stopped_jobs(process_group(tsk))) { + __kill_pg_info(SIGHUP, (void *)1, process_group(tsk)); + __kill_pg_info(SIGCONT, (void *)1, process_group(tsk)); + } + + /* Let father know we died + * + * Thread signals are configurable, but you aren't going to use + * that to send signals to arbitary processes. + * That stops right now. + * + * If the parent exec id doesn't match the exec id we saved + * when we started then we know the parent has changed security + * domain. + * + * If our self_exec id doesn't match our parent_exec_id then + * we have changed execution domain as these two values started + * the same after a fork. + * + */ + + if (tsk->exit_signal != SIGCHLD && tsk->exit_signal != -1 && + ( tsk->parent_exec_id != t->self_exec_id || + tsk->self_exec_id != tsk->parent_exec_id) + && !capable(CAP_KILL)) + tsk->exit_signal = SIGCHLD; + + + /* If something other than our normal parent is ptracing us, then + * send it a SIGCHLD instead of honoring exit_signal. exit_signal + * only has special meaning to our real parent. + */ + if (tsk->exit_signal != -1 && thread_group_empty(tsk)) { + int signal = tsk->parent == tsk->real_parent ? tsk->exit_signal : SIGCHLD; + do_notify_parent(tsk, signal); + } else if (tsk->ptrace) { + do_notify_parent(tsk, SIGCHLD); + } + + state = TASK_ZOMBIE; + if (tsk->exit_signal == -1 && tsk->ptrace == 0) + state = TASK_DEAD; + tsk->state = state; + tsk->flags |= PF_DEAD; + + /* + * Clear these here so that update_process_times() won't try to deliver + * itimer, profile or rlimit signals to this task while it is in late exit. + */ + tsk->it_virt_value = 0; + tsk->it_prof_value = 0; + tsk->rlim[RLIMIT_CPU].rlim_cur = RLIM_INFINITY; + + /* + * In the preemption case it must be impossible for the task + * to get runnable again, so use "_raw_" unlock to keep + * preempt_count elevated until we schedule(). + * + * To avoid deadlock on SMP, interrupts must be unmasked. If we + * don't, subsequently called functions (e.g, wait_task_inactive() + * via release_task()) will spin, with interrupt flags + * unwittingly blocked, until the other task sleeps. That task + * may itself be waiting for smp_call_function() to answer and + * complete, and with interrupts blocked that will never happen. 
+ */ + _raw_write_unlock(&tasklist_lock); + local_irq_enable(); + + /* If the process is dead, release it - nobody will wait for it */ + if (state == TASK_DEAD) + release_task(tsk); + +} + +asmlinkage NORET_TYPE void do_exit(long code) +{ + struct task_struct *tsk = current; + + if (unlikely(in_interrupt())) + panic("Aiee, killing interrupt handler!"); + if (unlikely(!tsk->pid)) + panic("Attempted to kill the idle task!"); + if (unlikely(tsk->pid == 1)) + panic("Attempted to kill init!"); + if (tsk->io_context) + exit_io_context(); + tsk->flags |= PF_EXITING; + del_timer_sync(&tsk->real_timer); + + if (unlikely(in_atomic())) + printk(KERN_INFO "note: %s[%d] exited with preempt_count %d\n", + current->comm, current->pid, + preempt_count()); + + profile_exit_task(tsk); + + if (unlikely(current->ptrace & PT_TRACE_EXIT)) { + current->ptrace_message = code; + ptrace_notify((PTRACE_EVENT_EXIT << 8) | SIGTRAP); + } + + acct_process(code); + __exit_mm(tsk); + + exit_sem(tsk); + __exit_files(tsk); + __exit_fs(tsk); + exit_namespace(tsk); + exit_thread(); +#ifdef CONFIG_NUMA + mpol_free(tsk->mempolicy); +#endif + + if (tsk->signal->leader) + disassociate_ctty(1); + + module_put(tsk->thread_info->exec_domain->module); + if (tsk->binfmt) + module_put(tsk->binfmt->module); + + tsk->exit_code = code; +#ifdef CONFIG_CKRM_TYPE_TASKCLASS + numtasks_put_ref(tsk->taskclass); +#endif + exit_notify(tsk); + schedule(); + BUG(); + /* Avoid "noreturn function does return". */ + for (;;) ; +} + +NORET_TYPE void complete_and_exit(struct completion *comp, long code) +{ + if (comp) + complete(comp); + + do_exit(code); +} + +EXPORT_SYMBOL(complete_and_exit); + +asmlinkage long sys_exit(int error_code) +{ + do_exit((error_code&0xff)<<8); +} + +task_t fastcall *next_thread(task_t *p) +{ + struct pid_link *link = p->pids + PIDTYPE_TGID; + struct list_head *tmp, *head = &link->pidptr->task_list; + +#ifdef CONFIG_SMP + if (!p->sighand) + BUG(); + if (!spin_is_locked(&p->sighand->siglock) && + !rwlock_is_locked(&tasklist_lock)) + BUG(); +#endif + tmp = link->pid_chain.next; + if (tmp == head) + tmp = head->next; + + return pid_task(tmp, PIDTYPE_TGID); +} + +EXPORT_SYMBOL(next_thread); + +/* + * Take down every thread in the group. This is called by fatal signals + * as well as by sys_exit_group (below). + */ +NORET_TYPE void +do_group_exit(int exit_code) +{ + BUG_ON(exit_code & 0x80); /* core dumps don't get here */ + + if (current->signal->group_exit) + exit_code = current->signal->group_exit_code; + else if (!thread_group_empty(current)) { + struct signal_struct *const sig = current->signal; + struct sighand_struct *const sighand = current->sighand; + read_lock(&tasklist_lock); + spin_lock_irq(&sighand->siglock); + if (sig->group_exit) + /* Another thread got here before we took the lock. */ + exit_code = sig->group_exit_code; + else { + sig->group_exit = 1; + sig->group_exit_code = exit_code; + zap_other_threads(current); + } + spin_unlock_irq(&sighand->siglock); + read_unlock(&tasklist_lock); + } + + do_exit(exit_code); + /* NOTREACHED */ +} + +/* + * this kills every thread in the thread group. Note that any externally + * wait4()-ing process will get the correct exit code - even if this + * thread is not the thread group leader. 
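Illustrative aside, not part of the patch: sys_exit() above passes (error_code & 0xff) << 8 to do_exit(), and wait_task_zombie() hands that value straight back to the waiter, which is why the standard W* macros decode it. A minimal user-space sketch of that round trip (the demo program is an assumption for illustration only, nothing in it comes from the kernel tree):

/* Shows the status encoding that do_exit()/wait_task_zombie() pass back
 * to wait4(): sys_exit() stores (error_code & 0xff) << 8, which is exactly
 * what WEXITSTATUS() extracts.
 */
#include <stdio.h>
#include <stdlib.h>
#include <sys/types.h>
#include <sys/wait.h>
#include <unistd.h>

int main(void)
{
	pid_t pid = fork();
	int status;

	if (pid < 0)
		return 1;
	if (pid == 0)
		exit(42);			/* becomes (42 & 0xff) << 8 in the kernel */

	if (waitpid(pid, &status, 0) < 0) {
		perror("waitpid");
		return 1;
	}
	if (WIFEXITED(status))			/* low byte is 0, so this was a normal exit */
		printf("child exited with %d (raw status 0x%x)\n",
		       WEXITSTATUS(status), status);
	return 0;
}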
+ */ +asmlinkage void sys_exit_group(int error_code) +{ + do_group_exit((error_code & 0xff) << 8); +} + +static int eligible_child(pid_t pid, int options, task_t *p) +{ + if (pid > 0) { + if (p->pid != pid) + return 0; + } else if (!pid) { + if (process_group(p) != process_group(current)) + return 0; + } else if (pid != -1) { + if (process_group(p) != -pid) + return 0; + } + + /* + * Do not consider detached threads that are + * not ptraced: + */ + if (p->exit_signal == -1 && !p->ptrace) + return 0; + + /* Wait for all children (clone and not) if __WALL is set; + * otherwise, wait for clone children *only* if __WCLONE is + * set; otherwise, wait for non-clone children *only*. (Note: + * A "clone" child here is one that reports to its parent + * using a signal other than SIGCHLD.) */ + if (((p->exit_signal != SIGCHLD) ^ ((options & __WCLONE) != 0)) + && !(options & __WALL)) + return 0; + /* + * Do not consider thread group leaders that are + * in a non-empty thread group: + */ + if (current->tgid != p->tgid && delay_group_leader(p)) + return 2; + + if (security_task_wait(p)) + return 0; + + return 1; +} + +/* + * Handle sys_wait4 work for one task in state TASK_ZOMBIE. We hold + * read_lock(&tasklist_lock) on entry. If we return zero, we still hold + * the lock and this task is uninteresting. If we return nonzero, we have + * released the lock and the system call should return. + */ +static int wait_task_zombie(task_t *p, unsigned int __user *stat_addr, struct rusage __user *ru) +{ + unsigned long state; + int retval; + + /* + * Try to move the task's state to DEAD + * only one thread is allowed to do this: + */ + state = xchg(&p->state, TASK_DEAD); + if (state != TASK_ZOMBIE) { + BUG_ON(state != TASK_DEAD); + return 0; + } + if (unlikely(p->exit_signal == -1 && p->ptrace == 0)) + /* + * This can only happen in a race with a ptraced thread + * dying on another processor. + */ + return 0; + + /* + * Now we are sure this task is interesting, and no other + * thread can reap it because we set its state to TASK_DEAD. + */ + read_unlock(&tasklist_lock); + + retval = ru ? getrusage(p, RUSAGE_BOTH, ru) : 0; + if (!retval && stat_addr) { + if (p->signal->group_exit) + retval = put_user(p->signal->group_exit_code, stat_addr); + else + retval = put_user(p->exit_code, stat_addr); + } + if (retval) { + p->state = TASK_ZOMBIE; + return retval; + } + retval = p->pid; + if (p->real_parent != p->parent) { + write_lock_irq(&tasklist_lock); + /* Double-check with lock held. */ + if (p->real_parent != p->parent) { + __ptrace_unlink(p); + p->state = TASK_ZOMBIE; + /* If this is a detached thread, this is where it goes away. */ + if (p->exit_signal == -1) { + /* release_task takes the lock itself. */ + write_unlock_irq(&tasklist_lock); + release_task (p); + } + else { + do_notify_parent(p, p->exit_signal); + write_unlock_irq(&tasklist_lock); + } + p = NULL; + } + else + write_unlock_irq(&tasklist_lock); + } + if (p != NULL) + release_task(p); + BUG_ON(!retval); + return retval; +} + +/* + * Handle sys_wait4 work for one task in state TASK_STOPPED. We hold + * read_lock(&tasklist_lock) on entry. If we return zero, we still hold + * the lock and this task is uninteresting. If we return nonzero, we have + * released the lock and the system call should return. 
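Illustrative aside, not part of the patch: eligible_child() above implements the classic waitpid() pid conventions (pid > 0 means that exact child, pid == 0 the caller's process group, pid == -1 any child, pid < -1 the process group -pid). A small user-space sketch of the pid < -1 case; the demo program and its setpgid() dance are assumptions for illustration only:

#include <stdio.h>
#include <sys/types.h>
#include <sys/wait.h>
#include <unistd.h>

int main(void)
{
	pid_t child = fork();
	int status;

	if (child < 0)
		return 1;
	if (child == 0) {
		setpgid(0, 0);			/* child moves into its own process group */
		_exit(0);
	}
	setpgid(child, child);			/* both sides set it to dodge the startup race */
	if (waitpid(-child, &status, 0) == child)	/* pid < -1: wait for pgrp 'child' */
		printf("reaped pid %d from process group %d\n", child, child);
	return 0;
}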
+ */ +static int wait_task_stopped(task_t *p, int delayed_group_leader, + unsigned int __user *stat_addr, + struct rusage __user *ru) +{ + int retval, exit_code; + + if (!p->exit_code) + return 0; + if (delayed_group_leader && !(p->ptrace & PT_PTRACED) && + p->signal && p->signal->group_stop_count > 0) + /* + * A group stop is in progress and this is the group leader. + * We won't report until all threads have stopped. + */ + return 0; + + /* + * Now we are pretty sure this task is interesting. + * Make sure it doesn't get reaped out from under us while we + * give up the lock and then examine it below. We don't want to + * keep holding onto the tasklist_lock while we call getrusage and + * possibly take page faults for user memory. + */ + get_task_struct(p); + read_unlock(&tasklist_lock); + write_lock_irq(&tasklist_lock); + + /* + * This uses xchg to be atomic with the thread resuming and setting + * it. It must also be done with the write lock held to prevent a + * race with the TASK_ZOMBIE case. + */ + exit_code = xchg(&p->exit_code, 0); + if (unlikely(p->state > TASK_STOPPED)) { + /* + * The task resumed and then died. Let the next iteration + * catch it in TASK_ZOMBIE. Note that exit_code might + * already be zero here if it resumed and did _exit(0). + * The task itself is dead and won't touch exit_code again; + * other processors in this function are locked out. + */ + p->exit_code = exit_code; + exit_code = 0; + } + if (unlikely(exit_code == 0)) { + /* + * Another thread in this function got to it first, or it + * resumed, or it resumed and then died. + */ + write_unlock_irq(&tasklist_lock); + put_task_struct(p); + read_lock(&tasklist_lock); + return 0; + } + + /* move to end of parent's list to avoid starvation */ + remove_parent(p); + add_parent(p, p->parent); + + write_unlock_irq(&tasklist_lock); + + retval = ru ? getrusage(p, RUSAGE_BOTH, ru) : 0; + if (!retval && stat_addr) + retval = put_user((exit_code << 8) | 0x7f, stat_addr); + if (!retval) + retval = p->pid; + put_task_struct(p); + + BUG_ON(!retval); + return retval; +} + +asmlinkage long sys_wait4(pid_t pid,unsigned int __user *stat_addr, int options, struct rusage __user *ru) +{ + DECLARE_WAITQUEUE(wait, current); + struct task_struct *tsk; + int flag, retval; + + if (options & ~(WNOHANG|WUNTRACED|__WNOTHREAD|__WCLONE|__WALL)) + return -EINVAL; + + add_wait_queue(¤t->wait_chldexit,&wait); +repeat: + flag = 0; + current->state = TASK_INTERRUPTIBLE; + read_lock(&tasklist_lock); + tsk = current; + do { + struct task_struct *p; + struct list_head *_p; + int ret; + + list_for_each(_p,&tsk->children) { + p = list_entry(_p,struct task_struct,sibling); + + ret = eligible_child(pid, options, p); + if (!ret) + continue; + flag = 1; + + switch (p->state) { + case TASK_STOPPED: + if (!(options & WUNTRACED) && + !(p->ptrace & PT_PTRACED)) + continue; + retval = wait_task_stopped(p, ret == 2, + stat_addr, ru); + if (retval != 0) /* He released the lock. */ + goto end_wait4; + break; + case TASK_ZOMBIE: + /* + * Eligible but we cannot release it yet: + */ + if (ret == 2) + continue; + retval = wait_task_zombie(p, stat_addr, ru); + if (retval != 0) /* He released the lock. 
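Illustrative aside, not part of the patch: wait_task_stopped() above reports a stopped child as (exit_code << 8) | 0x7f, and sys_wait4() only considers TASK_STOPPED children when WUNTRACED is set (or the child is ptraced). A user-space sketch of what that looks like to the waiter (demo program only, not kernel code):

#include <signal.h>
#include <stdio.h>
#include <sys/types.h>
#include <sys/wait.h>
#include <unistd.h>

int main(void)
{
	pid_t pid = fork();
	int status;

	if (pid < 0)
		return 1;
	if (pid == 0) {
		raise(SIGSTOP);			/* child stops itself */
		_exit(7);
	}
	waitpid(pid, &status, WUNTRACED);	/* kernel reports (SIGSTOP << 8) | 0x7f */
	if (WIFSTOPPED(status))
		printf("child stopped by signal %d (raw status 0x%x)\n",
		       WSTOPSIG(status), status);
	kill(pid, SIGCONT);
	waitpid(pid, &status, 0);		/* now reap the real exit */
	if (WIFEXITED(status))
		printf("child exited with %d\n", WEXITSTATUS(status));
	return 0;
}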
*/ + goto end_wait4; + break; + } + } + if (!flag) { + list_for_each (_p,&tsk->ptrace_children) { + p = list_entry(_p,struct task_struct,ptrace_list); + if (!eligible_child(pid, options, p)) + continue; + flag = 1; + break; + } + } + if (options & __WNOTHREAD) + break; + tsk = next_thread(tsk); + if (tsk->signal != current->signal) + BUG(); + } while (tsk != current); + read_unlock(&tasklist_lock); + if (flag) { + retval = 0; + if (options & WNOHANG) + goto end_wait4; + retval = -ERESTARTSYS; + if (signal_pending(current)) + goto end_wait4; + schedule(); + goto repeat; + } + retval = -ECHILD; +end_wait4: + current->state = TASK_RUNNING; + remove_wait_queue(¤t->wait_chldexit,&wait); + return retval; +} + +#ifdef __ARCH_WANT_SYS_WAITPID + +/* + * sys_waitpid() remains for compatibility. waitpid() should be + * implemented by calling sys_wait4() from libc.a. + */ +asmlinkage long sys_waitpid(pid_t pid, unsigned __user *stat_addr, int options) +{ + return sys_wait4(pid, stat_addr, options, NULL); +} + +#endif diff --git a/kernel/fork.c b/kernel/fork.c index 195394433..df85a9daa 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -42,7 +42,6 @@ #include #include #include -#include #include #include @@ -272,9 +271,6 @@ static struct task_struct *dup_task_struct(struct task_struct *orig) ckrm_cb_newtask(tsk); /* One for us, one for whoever does the "release_task()" (usually parent) */ atomic_set(&tsk->usage,2); -#ifdef CONFIG_CKRM_RES_MEM - INIT_LIST_HEAD(&tsk->mm_peers); -#endif return tsk; } @@ -427,10 +423,6 @@ static struct mm_struct * mm_init(struct mm_struct * mm) mm->ioctx_list = NULL; mm->default_kioctx = (struct kioctx)INIT_KIOCTX(mm->default_kioctx, *mm); mm->free_area_cache = TASK_UNMAPPED_BASE; -#ifdef CONFIG_CKRM_RES_MEM - INIT_LIST_HEAD(&mm->tasklist); - mm->peertask_lock = SPIN_LOCK_UNLOCKED; -#endif if (likely(!mm_alloc_pgd(mm))) { mm->def_flags = 0; @@ -452,10 +444,6 @@ struct mm_struct * mm_alloc(void) if (mm) { memset(mm, 0, sizeof(*mm)); mm = mm_init(mm); -#ifdef CONFIG_CKRM_RES_MEM - mm->memclass = GET_MEM_CLASS(current); - mem_class_get(mm->memclass); -#endif } return mm; } @@ -471,13 +459,6 @@ void fastcall __mmdrop(struct mm_struct *mm) mm_free_pgd(mm); destroy_context(mm); clr_vx_info(&mm->mm_vx_info); -#ifdef CONFIG_CKRM_RES_MEM - /* class can be null and mm's tasklist can be empty here */ - if (mm->memclass) { - mem_class_put(mm->memclass); - mm->memclass = NULL; - } -#endif free_mm(mm); } @@ -607,7 +588,6 @@ static int copy_mm(unsigned long clone_flags, struct task_struct * tsk) good_mm: tsk->mm = mm; tsk->active_mm = mm; - ckrm_init_mm_to_task(mm, tsk); return 0; free_pt: @@ -1148,7 +1128,6 @@ struct task_struct *copy_process(unsigned long clone_flags, } else link_pid(p, p->pids + PIDTYPE_TGID, &p->group_leader->pids[PIDTYPE_TGID].pid); - p->ioprio = current->ioprio; nr_threads++; /* p is copy of current */ vxi = p->vx_info; diff --git a/kernel/itimer.c b/kernel/itimer.c index 5bf6c881c..6918cb746 100644 --- a/kernel/itimer.c +++ b/kernel/itimer.c @@ -68,9 +68,7 @@ void it_real_fn(unsigned long __data) struct task_struct * p = (struct task_struct *) __data; unsigned long interval; - if (send_group_sig_info(SIGALRM, SEND_SIG_PRIV, p)) - printk("*warning*: failed to send SIGALRM to %u\n", p->pid); - + send_group_sig_info(SIGALRM, SEND_SIG_PRIV, p); interval = p->it_real_incr; if (interval) { if (interval > (unsigned long) LONG_MAX) diff --git a/kernel/panic.c b/kernel/panic.c index 37f3e82de..290bf0d1e 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -23,8 +23,8 @@ 
#include #endif -int panic_timeout = 900; -int panic_on_oops = 1; +int panic_timeout; +int panic_on_oops; int tainted; void (*dump_function_ptr)(const char *, const struct pt_regs *) = 0; diff --git a/kernel/sched.c b/kernel/sched.c index 20b09215e..b4512b77b 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -163,21 +163,6 @@ EXPORT_SYMBOL(dump_oncpu); #define LOW_CREDIT(p) \ ((p)->interactive_credit < -CREDIT_LIMIT) -#ifdef CONFIG_CKRM_CPU_SCHEDULE -/* - * if belong to different class, compare class priority - * otherwise compare task priority - */ -#define TASK_PREEMPTS_CURR(p, rq) \ - ( ((p)->cpu_class != (rq)->curr->cpu_class) \ - && ((rq)->curr != (rq)->idle) && ((p) != (rq)->idle )) \ - ? class_preempts_curr((p),(rq)->curr) \ - : ((p)->prio < (rq)->curr->prio) -#else -#define TASK_PREEMPTS_CURR(p, rq) \ - ((p)->prio < (rq)->curr->prio) -#endif - /* * BASE_TIMESLICE scales user-nice values [ -20 ... 19 ] * to time slice values. @@ -193,71 +178,14 @@ EXPORT_SYMBOL(dump_oncpu); ((MAX_TIMESLICE - MIN_TIMESLICE) * \ (MAX_PRIO-1 - (p)->static_prio) / (MAX_USER_PRIO-1))) -unsigned int task_timeslice(task_t *p) +static unsigned int task_timeslice(task_t *p) { return BASE_TIMESLICE(p); } #define task_hot(p, now, sd) ((now) - (p)->timestamp < (sd)->cache_hot_time) -/* - * These are the runqueue data structures: - */ - -typedef struct runqueue runqueue_t; -#include -#include - -/* - * This is the main, per-CPU runqueue data structure. - * - * Locking rule: those places that want to lock multiple runqueues - * (such as the load balancing or the thread migration code), lock - * acquire operations must be ordered by ascending &runqueue. - */ -struct runqueue { - spinlock_t lock; - - /* - * nr_running and cpu_load should be in the same cacheline because - * remote CPUs use both these fields when doing load calculation. - */ - unsigned long nr_running; -#if defined(CONFIG_SMP) - unsigned long cpu_load; -#endif - unsigned long long nr_switches, nr_preempt; - unsigned long expired_timestamp, nr_uninterruptible; - unsigned long long timestamp_last_tick; - task_t *curr, *idle; - struct mm_struct *prev_mm; -#ifdef CONFIG_CKRM_CPU_SCHEDULE - struct classqueue_struct classqueue; - ckrm_load_t ckrm_load; -#else - prio_array_t *active, *expired, arrays[2]; -#endif - int best_expired_prio; - atomic_t nr_iowait; - -#ifdef CONFIG_SMP - struct sched_domain *sd; - - /* For active balancing */ - int active_balance; - int push_cpu; - - task_t *migration_thread; - struct list_head migration_queue; -#endif - -#ifdef CONFIG_VSERVER_HARDCPU - struct list_head hold_queue; - int idle_tokens; -#endif -}; - -static DEFINE_PER_CPU(struct runqueue, runqueues); +DEFINE_PER_CPU(struct runqueue, runqueues); #define for_each_domain(cpu, domain) \ for (domain = cpu_rq(cpu)->sd; domain; domain = domain->parent) @@ -276,111 +204,121 @@ static DEFINE_PER_CPU(struct runqueue, runqueues); # define task_running(rq, p) ((rq)->curr == (p)) #endif +#ifdef CONFIG_CKRM_CPU_SCHEDULE +#include +spinlock_t cvt_lock = SPIN_LOCK_UNLOCKED; +rwlock_t class_list_lock = RW_LOCK_UNLOCKED; +LIST_HEAD(active_cpu_classes); // list of active cpu classes; anchor +struct ckrm_cpu_class default_cpu_class_obj; + /* - * task_rq_lock - lock the runqueue a given task resides on and disable - * interrupts. Note the ordering: we can safely lookup the task_rq without - * explicitly disabling preemption. 
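Illustrative aside, not part of the patch: task_timeslice() above is just BASE_TIMESLICE(), a linear map from static priority to slice length. The sketch below evaluates it with the usual 2.6.8 constants, which are assumptions here since this hunk does not show them (HZ=1000, MIN_TIMESLICE=5 ticks, MAX_TIMESLICE=200 ticks, MAX_PRIO=140, MAX_USER_PRIO=40, static_prio = 120 + nice):

#include <stdio.h>

#define MIN_TIMESLICE	5			/* ticks; HZ=1000 assumed */
#define MAX_TIMESLICE	200
#define MAX_PRIO	140
#define MAX_USER_PRIO	40

static int base_timeslice(int static_prio)
{
	return MIN_TIMESLICE + (MAX_TIMESLICE - MIN_TIMESLICE) *
		(MAX_PRIO - 1 - static_prio) / (MAX_USER_PRIO - 1);
}

int main(void)
{
	int nice_levels[] = { -20, -10, 0, 10, 19 };
	unsigned i;

	for (i = 0; i < sizeof(nice_levels) / sizeof(nice_levels[0]); i++) {
		int nice = nice_levels[i];
		printf("nice %3d -> static_prio %3d -> %3d ms\n",
		       nice, 120 + nice, base_timeslice(120 + nice));
	}
	/* prints 200, 150, 100, 50 and 5 ms respectively */
	return 0;
}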
+ * the minimum CVT allowed is the base_cvt + * otherwise, it will starve others */ -static runqueue_t *task_rq_lock(task_t *p, unsigned long *flags) +CVT_t get_min_cvt(int cpu) { - struct runqueue *rq; - -repeat_lock_task: - local_irq_save(*flags); - rq = task_rq(p); - spin_lock(&rq->lock); - if (unlikely(rq != task_rq(p))) { - spin_unlock_irqrestore(&rq->lock, *flags); - goto repeat_lock_task; - } - return rq; -} + cq_node_t *node; + struct ckrm_local_runqueue * lrq; + CVT_t min_cvt; -static inline void task_rq_unlock(runqueue_t *rq, unsigned long *flags) -{ - spin_unlock_irqrestore(&rq->lock, *flags); + node = classqueue_get_head(bpt_queue(cpu)); + lrq = (node) ? class_list_entry(node) : NULL; + + if (lrq) + min_cvt = lrq->local_cvt; + else + min_cvt = 0; + + return min_cvt; } /* - * rq_lock - lock a given runqueue and disable interrupts. + * update the classueue base for all the runqueues + * TODO: we can only update half of the min_base to solve the movebackward issue */ -static runqueue_t *this_rq_lock(void) -{ - runqueue_t *rq; +static inline void check_update_class_base(int this_cpu) { + unsigned long min_base = 0xFFFFFFFF; + cq_node_t *node; + int i; - local_irq_disable(); - rq = this_rq(); - spin_lock(&rq->lock); + if (! cpu_online(this_cpu)) return; - return rq; + /* + * find the min_base across all the processors + */ + for_each_online_cpu(i) { + /* + * I should change it to directly use bpt->base + */ + node = classqueue_get_head(bpt_queue(i)); + if (node && node->prio < min_base) { + min_base = node->prio; + } + } + if (min_base != 0xFFFFFFFF) + classqueue_update_base(bpt_queue(this_cpu),min_base); } -static inline void rq_unlock(runqueue_t *rq) +static inline void ckrm_rebalance_tick(int j,int this_cpu) { - spin_unlock_irq(&rq->lock); +#ifdef CONFIG_CKRM_CPU_SCHEDULE + read_lock(&class_list_lock); + if (!(j % CVT_UPDATE_TICK)) + update_global_cvts(this_cpu); + +#define CKRM_BASE_UPDATE_RATE 400 + if (! (jiffies % CKRM_BASE_UPDATE_RATE)) + check_update_class_base(this_cpu); + + read_unlock(&class_list_lock); +#endif } -#ifdef CONFIG_CKRM_CPU_SCHEDULE -static inline ckrm_lrq_t *rq_get_next_class(struct runqueue *rq) +static inline struct ckrm_local_runqueue *rq_get_next_class(struct runqueue *rq) { cq_node_t *node = classqueue_get_head(&rq->classqueue); return ((node) ? class_list_entry(node) : NULL); } -/* - * return the cvt of the current running class - * if no current running class, return 0 - * assume cpu is valid (cpu_online(cpu) == 1) - */ -CVT_t get_local_cur_cvt(int cpu) -{ - ckrm_lrq_t * lrq = rq_get_next_class(cpu_rq(cpu)); - - if (lrq) - return lrq->local_cvt; - else - return 0; -} - static inline struct task_struct * rq_get_next_task(struct runqueue* rq) { prio_array_t *array; struct task_struct *next; - ckrm_lrq_t *queue; - int idx; + struct ckrm_local_runqueue *queue; int cpu = smp_processor_id(); - - // it is guaranteed be the ( rq->nr_running > 0 ) check in - // schedule that a task will be found. 
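Illustrative aside, not part of the patch: with CONFIG_CKRM_CPU_SCHEDULE the next task is picked in two levels, first the class at the head of the CVT-ordered classqueue (rq_get_next_class()), then the top-priority task inside that class's local runqueue (rq_get_next_task()). The standalone sketch below uses made-up toy types (toy_class, pick()) purely to show the shape of that selection:

#include <stdio.h>
#include <limits.h>

#define NCLASS 3
#define NPRIO  8

struct toy_class {
	unsigned long cvt;		/* plays the role of the class's local_cvt */
	int nr_running[NPRIO];		/* tasks queued per priority level */
};

static int pick(struct toy_class cls[NCLASS], int *prio_out)
{
	int best = -1, i, p;
	unsigned long best_cvt = ULONG_MAX;

	for (i = 0; i < NCLASS; i++) {		/* level 1: runnable class with smallest CVT */
		int has_work = 0;
		for (p = 0; p < NPRIO; p++)
			has_work |= cls[i].nr_running[p];
		if (has_work && cls[i].cvt < best_cvt) {
			best_cvt = cls[i].cvt;
			best = i;
		}
	}
	if (best < 0)
		return -1;			/* nothing runnable anywhere: idle */
	for (p = 0; p < NPRIO; p++)		/* level 2: top priority inside that class */
		if (cls[best].nr_running[p])
			break;
	*prio_out = p;
	return best;
}

int main(void)
{
	struct toy_class cls[NCLASS] = {
		{ .cvt = 500, .nr_running = { [3] = 1 } },
		{ .cvt = 200, .nr_running = { [5] = 2 } },
		{ .cvt = 900, .nr_running = { 0 } },
	};
	int prio = -1;
	int c = pick(cls, &prio);

	printf("next task comes from class %d at priority %d\n", c, prio);
	return 0;
}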
- + + next = rq->idle; retry_next_class: - queue = rq_get_next_class(rq); - // BUG_ON( !queue ); - - array = queue->active; - if (unlikely(!array->nr_active)) { - queue->active = queue->expired; - queue->expired = array; - queue->expired_timestamp = 0; + if ((queue = rq_get_next_class(rq))) { + array = queue->active; + //check switch active/expired queue + if (unlikely(!queue->active->nr_active)) { + queue->active = queue->expired; + queue->expired = array; + queue->expired_timestamp = 0; + + if (queue->active->nr_active) + set_top_priority(queue, + find_first_bit(queue->active->bitmap, MAX_PRIO)); + else { + classqueue_dequeue(queue->classqueue, + &queue->classqueue_linkobj); + cpu_demand_event(get_rq_local_stat(queue,cpu),CPU_DEMAND_DEQUEUE,0); + } - if (queue->active->nr_active) - set_top_priority(queue, - find_first_bit(queue->active->bitmap, MAX_PRIO)); - else { - classqueue_dequeue(queue->classqueue, - &queue->classqueue_linkobj); - cpu_demand_event(get_rq_local_stat(queue,cpu),CPU_DEMAND_DEQUEUE,0); + goto retry_next_class; } - goto retry_next_class; + BUG_ON(!queue->active->nr_active); + next = task_list_entry(array->queue[queue->top_priority].next); } - // BUG_ON(!array->nr_active); - - idx = queue->top_priority; - // BUG_ON (idx == MAX_PRIO); - next = task_list_entry(array->queue[idx].next); return next; } -#else /*! CONFIG_CKRM_CPU_SCHEDULE*/ + +static inline void rq_load_inc(runqueue_t *rq, struct task_struct *p) { rq->ckrm_cpu_load += cpu_class_weight(p->cpu_class); } +static inline void rq_load_dec(runqueue_t *rq, struct task_struct *p) { rq->ckrm_cpu_load -= cpu_class_weight(p->cpu_class); } + +#else /*CONFIG_CKRM_CPU_SCHEDULE*/ + static inline struct task_struct * rq_get_next_task(struct runqueue* rq) { prio_array_t *array; @@ -407,15 +345,61 @@ static inline struct task_struct * rq_get_next_task(struct runqueue* rq) static inline void class_enqueue_task(struct task_struct* p, prio_array_t *array) { } static inline void class_dequeue_task(struct task_struct* p, prio_array_t *array) { } static inline void init_cpu_classes(void) { } -#define rq_ckrm_load(rq) NULL -static inline void ckrm_sched_tick(int j,int this_cpu,void* name) {} +static inline void rq_load_inc(runqueue_t *rq, struct task_struct *p) { } +static inline void rq_load_dec(runqueue_t *rq, struct task_struct *p) { } #endif /* CONFIG_CKRM_CPU_SCHEDULE */ + +/* + * task_rq_lock - lock the runqueue a given task resides on and disable + * interrupts. Note the ordering: we can safely lookup the task_rq without + * explicitly disabling preemption. + */ +runqueue_t *task_rq_lock(task_t *p, unsigned long *flags) +{ + struct runqueue *rq; + +repeat_lock_task: + local_irq_save(*flags); + rq = task_rq(p); + spin_lock(&rq->lock); + if (unlikely(rq != task_rq(p))) { + spin_unlock_irqrestore(&rq->lock, *flags); + goto repeat_lock_task; + } + return rq; +} + +void task_rq_unlock(runqueue_t *rq, unsigned long *flags) +{ + spin_unlock_irqrestore(&rq->lock, *flags); +} + +/* + * rq_lock - lock a given runqueue and disable interrupts. + */ +static runqueue_t *this_rq_lock(void) +{ + runqueue_t *rq; + + local_irq_disable(); + rq = this_rq(); + spin_lock(&rq->lock); + + return rq; +} + +static inline void rq_unlock(runqueue_t *rq) +{ + spin_unlock_irq(&rq->lock); +} + /* * Adding/removing a task to/from a priority array: */ -static void dequeue_task(struct task_struct *p, prio_array_t *array) +void dequeue_task(struct task_struct *p, prio_array_t *array) { + BUG_ON(! 
array); array->nr_active--; list_del(&p->run_list); if (list_empty(array->queue + p->prio)) @@ -423,7 +407,7 @@ static void dequeue_task(struct task_struct *p, prio_array_t *array) class_dequeue_task(p,array); } -static void enqueue_task(struct task_struct *p, prio_array_t *array) +void enqueue_task(struct task_struct *p, prio_array_t *array) { list_add_tail(&p->run_list, array->queue + p->prio); __set_bit(p->prio, array->bitmap); @@ -487,6 +471,7 @@ static inline void __activate_task(task_t *p, runqueue_t *rq) { enqueue_task(p, rq_active(p,rq)); rq->nr_running++; + rq_load_inc(rq,p); } /* @@ -496,6 +481,7 @@ static inline void __activate_idle_task(task_t *p, runqueue_t *rq) { enqueue_task_head(p, rq_active(p,rq)); rq->nr_running++; + rq_load_inc(rq,p); } static void recalc_task_prio(task_t *p, unsigned long long now) @@ -627,6 +613,7 @@ static void activate_task(task_t *p, runqueue_t *rq, int local) static void deactivate_task(struct task_struct *p, runqueue_t *rq) { rq->nr_running--; + rq_load_dec(rq,p); if (p->state == TASK_UNINTERRUPTIBLE) rq->nr_uninterruptible++; dequeue_task(p, p->array); @@ -1000,10 +987,6 @@ void fastcall sched_fork(task_t *p) INIT_LIST_HEAD(&p->run_list); p->array = NULL; spin_lock_init(&p->switch_lock); -#ifdef CONFIG_CKRM_CPU_SCHEDULE - cpu_demand_event(&p->demand_stat,CPU_DEMAND_INIT,0); -#endif - #ifdef CONFIG_PREEMPT /* * During context-switch we hold precisely one spinlock, which @@ -1079,7 +1062,7 @@ void fastcall wake_up_forked_process(task_t * p) p->array = current->array; p->array->nr_active++; rq->nr_running++; - class_enqueue_task(p,p->array); + rq_load_inc(rq,p); } task_rq_unlock(rq, &flags); } @@ -1412,7 +1395,7 @@ lock_again: p->array = current->array; p->array->nr_active++; rq->nr_running++; - class_enqueue_task(p,p->array); + rq_load_inc(rq,p); } } else { /* Not the local CPU - must adjust timestamp */ @@ -1517,9 +1500,13 @@ void pull_task(runqueue_t *src_rq, prio_array_t *src_array, task_t *p, { dequeue_task(p, src_array); src_rq->nr_running--; + rq_load_dec(src_rq,p); + set_task_cpu(p, this_cpu); this_rq->nr_running++; + rq_load_inc(this_rq,p); enqueue_task(p, this_array); + p->timestamp = (p->timestamp - src_rq->timestamp_last_tick) + this_rq->timestamp_last_tick; /* @@ -1559,61 +1546,133 @@ int can_migrate_task(task_t *p, runqueue_t *rq, int this_cpu, } #ifdef CONFIG_CKRM_CPU_SCHEDULE -static inline int ckrm_preferred_task(task_t *tmp,long min, long max, - int phase, enum idle_type idle) + +struct ckrm_cpu_class *find_unbalanced_class(int busiest_cpu, int this_cpu, unsigned long *cls_imbalance) { - long pressure = task_load(tmp); - - if (pressure > max) - return 0; + struct ckrm_cpu_class *most_unbalanced_class = NULL; + struct ckrm_cpu_class *clsptr; + int max_unbalance = 0; - if ((idle == NOT_IDLE) && ! 
phase && (pressure <= min)) - return 0; - return 1; + list_for_each_entry(clsptr,&active_cpu_classes,links) { + struct ckrm_local_runqueue *this_lrq = get_ckrm_local_runqueue(clsptr,this_cpu); + struct ckrm_local_runqueue *busiest_lrq = get_ckrm_local_runqueue(clsptr,busiest_cpu); + int unbalance_degree; + + unbalance_degree = (local_queue_nr_running(busiest_lrq) - local_queue_nr_running(this_lrq)) * cpu_class_weight(clsptr); + if (unbalance_degree >= *cls_imbalance) + continue; // already looked at this class + + if (unbalance_degree > max_unbalance) { + max_unbalance = unbalance_degree; + most_unbalanced_class = clsptr; + } + } + *cls_imbalance = max_unbalance; + return most_unbalanced_class; } + /* - * move tasks for a specic local class - * return number of tasks pulled + * find_busiest_queue - find the busiest runqueue among the cpus in cpumask. */ -static inline int ckrm_cls_move_tasks(ckrm_lrq_t* src_lrq,ckrm_lrq_t*dst_lrq, - runqueue_t *this_rq, - runqueue_t *busiest, - struct sched_domain *sd, - int this_cpu, - enum idle_type idle, - long* pressure_imbalance) +static int find_busiest_cpu(runqueue_t *this_rq, int this_cpu, int idle, + int *imbalance) { - prio_array_t *array, *dst_array; + int cpu_load, load, max_load, i, busiest_cpu; + runqueue_t *busiest, *rq_src; + + + /*Hubertus ... the concept of nr_running is replace with cpu_load */ + cpu_load = this_rq->ckrm_cpu_load; + + busiest = NULL; + busiest_cpu = -1; + + max_load = -1; + for_each_online_cpu(i) { + rq_src = cpu_rq(i); + load = rq_src->ckrm_cpu_load; + + if ((load > max_load) && (rq_src != this_rq)) { + busiest = rq_src; + busiest_cpu = i; + max_load = load; + } + } + + if (likely(!busiest)) + goto out; + + *imbalance = max_load - cpu_load; + + /* It needs an at least ~25% imbalance to trigger balancing. */ + if (!idle && ((*imbalance)*4 < max_load)) { + busiest = NULL; + goto out; + } + + double_lock_balance(this_rq, busiest); + /* + * Make sure nothing changed since we checked the + * runqueue length. + */ + if (busiest->ckrm_cpu_load <= cpu_load) { + spin_unlock(&busiest->lock); + busiest = NULL; + } +out: + return (busiest ? 
busiest_cpu : -1); +} + +static int load_balance(int this_cpu, runqueue_t *this_rq, + struct sched_domain *sd, enum idle_type idle) +{ + int imbalance, idx; + int busiest_cpu; + runqueue_t *busiest; + prio_array_t *array; struct list_head *head, *curr; task_t *tmp; - int idx; - int pulled = 0; - int phase = -1; - long pressure_min, pressure_max; - /*hzheng: magic : 90% balance is enough*/ - long balance_min = *pressure_imbalance / 10; -/* - * we don't want to migrate tasks that will reverse the balance - * or the tasks that make too small difference - */ -#define CKRM_BALANCE_MAX_RATIO 100 -#define CKRM_BALANCE_MIN_RATIO 1 - start: - phase ++; + struct ckrm_local_runqueue * busiest_local_queue; + struct ckrm_cpu_class *clsptr; + int weight; + unsigned long cls_imbalance; // so we can retry other classes + + // need to update global CVT based on local accumulated CVTs + read_lock(&class_list_lock); + busiest_cpu = find_busiest_cpu(this_rq, this_cpu, idle, &imbalance); + if (busiest_cpu == -1) + goto out; + + busiest = cpu_rq(busiest_cpu); + + /* + * We only want to steal a number of tasks equal to 1/2 the imbalance, + * otherwise we'll just shift the imbalance to the new queue: + */ + imbalance /= 2; + + /* now find class on that runqueue with largest inbalance */ + cls_imbalance = 0xFFFFFFFF; + + retry_other_class: + clsptr = find_unbalanced_class(busiest_cpu, this_cpu, &cls_imbalance); + if (!clsptr) + goto out_unlock; + + busiest_local_queue = get_ckrm_local_runqueue(clsptr,busiest_cpu); + weight = cpu_class_weight(clsptr); + /* * We first consider expired tasks. Those will likely not be * executed in the near future, and they are most likely to * be cache-cold, thus switching CPUs has the least effect * on them. */ - if (src_lrq->expired->nr_active) { - array = src_lrq->expired; - dst_array = dst_lrq->expired; - } else { - array = src_lrq->active; - dst_array = dst_lrq->active; - } + if (busiest_local_queue->expired->nr_active) + array = busiest_local_queue->expired; + else + array = busiest_local_queue->active; new_array: /* Start searching at priority 0: */ @@ -1624,15 +1683,11 @@ static inline int ckrm_cls_move_tasks(ckrm_lrq_t* src_lrq,ckrm_lrq_t*dst_lrq, else idx = find_next_bit(array->bitmap, MAX_PRIO, idx); if (idx >= MAX_PRIO) { - if (array == src_lrq->expired && src_lrq->active->nr_active) { - array = src_lrq->active; - dst_array = dst_lrq->active; + if (array == busiest_local_queue->expired && busiest_local_queue->active->nr_active) { + array = busiest_local_queue->active; goto new_array; } - if ((! phase) && (! 
pulled) && (idle != IDLE)) - goto start; //try again - else - goto out; //finished search for this lrq + goto retry_other_class; } head = array->queue + idx; @@ -1642,365 +1697,42 @@ static inline int ckrm_cls_move_tasks(ckrm_lrq_t* src_lrq,ckrm_lrq_t*dst_lrq, curr = curr->prev; - if (!can_migrate_task(tmp, busiest, this_cpu, sd, idle)) { + if (!can_migrate_task(tmp, busiest, this_cpu, sd,idle)) { if (curr != head) goto skip_queue; idx++; goto skip_bitmap; } - - pressure_min = *pressure_imbalance * CKRM_BALANCE_MIN_RATIO/100; - pressure_max = *pressure_imbalance * CKRM_BALANCE_MAX_RATIO/100; + pull_task(busiest, array, tmp, this_rq, rq_active(tmp,this_rq),this_cpu); /* - * skip the tasks that will reverse the balance too much + * tmp BUG FIX: hzheng + * load balancing can make the busiest local queue empty + * thus it should be removed from bpt */ - if (ckrm_preferred_task(tmp,pressure_min,pressure_max,phase,idle)) { - *pressure_imbalance -= task_load(tmp); - pull_task(busiest, array, tmp, - this_rq, dst_array, this_cpu); - pulled++; - - if (*pressure_imbalance <= balance_min) - goto out; + if (! local_queue_nr_running(busiest_local_queue)) { + classqueue_dequeue(busiest_local_queue->classqueue,&busiest_local_queue->classqueue_linkobj); + cpu_demand_event(get_rq_local_stat(busiest_local_queue,busiest_cpu),CPU_DEMAND_DEQUEUE,0); } - - if (curr != head) - goto skip_queue; - idx++; - goto skip_bitmap; - out: - return pulled; -} - -static inline long ckrm_rq_imbalance(runqueue_t *this_rq,runqueue_t *dst_rq) -{ - long imbalance; - /* - * make sure after balance, imbalance' > - imbalance/2 - * we don't want the imbalance be reversed too much - */ - imbalance = pid_get_pressure(rq_ckrm_load(dst_rq),0) - - pid_get_pressure(rq_ckrm_load(this_rq),1); - imbalance /= 2; - return imbalance; -} -/* - * try to balance the two runqueues - * - * Called with both runqueues locked. - * if move_tasks is called, it will try to move at least one task over - */ -static int move_tasks(runqueue_t *this_rq, int this_cpu, runqueue_t *busiest, - unsigned long max_nr_move, struct sched_domain *sd, - enum idle_type idle) -{ - struct ckrm_cpu_class *clsptr,*vip_cls = NULL; - ckrm_lrq_t* src_lrq,*dst_lrq; - long pressure_imbalance, pressure_imbalance_old; - int src_cpu = task_cpu(busiest->curr); - struct list_head *list; - int pulled = 0; - long imbalance; - - imbalance = ckrm_rq_imbalance(this_rq,busiest); - - if ((idle == NOT_IDLE && imbalance <= 0) || busiest->nr_running <= 1) - goto out; - - //try to find the vip class - list_for_each_entry(clsptr,&active_cpu_classes,links) { - src_lrq = get_ckrm_lrq(clsptr,src_cpu); - - if (! lrq_nr_running(src_lrq)) - continue; - - if (! vip_cls || cpu_class_weight(vip_cls) < cpu_class_weight(clsptr) ) - { - vip_cls = clsptr; - } + imbalance -= weight; + if (!idle && (imbalance>0)) { + if (curr != head) + goto skip_queue; + idx++; + goto skip_bitmap; } - - /* - * do search from the most significant class - * hopefully, less tasks will be migrated this way - */ - clsptr = vip_cls; - - move_class: - if (! clsptr) - goto out; - - - src_lrq = get_ckrm_lrq(clsptr,src_cpu); - if (! lrq_nr_running(src_lrq)) - goto other_class; - - dst_lrq = get_ckrm_lrq(clsptr,this_cpu); - - //how much pressure for this class should be transferred - pressure_imbalance = src_lrq->lrq_load * imbalance/src_lrq->local_weight; - if (pulled && ! 
pressure_imbalance) - goto other_class; - - pressure_imbalance_old = pressure_imbalance; - - //move tasks - pulled += - ckrm_cls_move_tasks(src_lrq,dst_lrq, - this_rq, - busiest, - sd,this_cpu,idle, - &pressure_imbalance); - - /* - * hzheng: 2 is another magic number - * stop balancing if the imbalance is less than 25% of the orig - */ - if (pressure_imbalance <= (pressure_imbalance_old >> 2)) - goto out; - - //update imbalance - imbalance *= pressure_imbalance / pressure_imbalance_old; - other_class: - //who is next? - list = clsptr->links.next; - if (list == &active_cpu_classes) - list = list->next; - clsptr = list_entry(list, typeof(*clsptr), links); - if (clsptr != vip_cls) - goto move_class; + out_unlock: + spin_unlock(&busiest->lock); out: - return pulled; -} - -/** - * ckrm_check_balance - is load balancing necessary? - * return 0 if load balancing is not necessary - * otherwise return the average load of the system - * also, update nr_group - * - * heuristics: - * no load balancing if it's load is over average - * no load balancing if it's load is far more than the min - * task: - * read the status of all the runqueues - */ -static unsigned long ckrm_check_balance(struct sched_domain *sd, int this_cpu, - enum idle_type idle, int* nr_group) -{ - struct sched_group *group = sd->groups; - unsigned long min_load, max_load, avg_load; - unsigned long total_load, this_load, total_pwr; - - max_load = this_load = total_load = total_pwr = 0; - min_load = 0xFFFFFFFF; - *nr_group = 0; - - do { - cpumask_t tmp; - unsigned long load; - int local_group; - int i, nr_cpus = 0; - - /* Tally up the load of all CPUs in the group */ - cpus_and(tmp, group->cpumask, cpu_online_map); - if (unlikely(cpus_empty(tmp))) - goto nextgroup; - - avg_load = 0; - local_group = cpu_isset(this_cpu, group->cpumask); - - for_each_cpu_mask(i, tmp) { - load = pid_get_pressure(rq_ckrm_load(cpu_rq(i)),local_group); - nr_cpus++; - avg_load += load; - } - - if (!nr_cpus) - goto nextgroup; - - total_load += avg_load; - total_pwr += group->cpu_power; - - /* Adjust by relative CPU power of the group */ - avg_load = (avg_load * SCHED_LOAD_SCALE) / group->cpu_power; - - if (local_group) { - this_load = avg_load; - goto nextgroup; - } else if (avg_load > max_load) { - max_load = avg_load; - } - if (avg_load < min_load) { - min_load = avg_load; - } -nextgroup: - group = group->next; - *nr_group = *nr_group + 1; - } while (group != sd->groups); - - if (!max_load || this_load >= max_load) - goto out_balanced; - - avg_load = (SCHED_LOAD_SCALE * total_load) / total_pwr; - - /* hzheng: debugging: 105 is a magic number - * 100*max_load <= sd->imbalance_pct*this_load) - * should use imbalance_pct instead - */ - if (this_load > avg_load - || 100*max_load < 105*this_load - || 100*min_load < 70*this_load - ) - goto out_balanced; - - return avg_load; - out_balanced: - return 0; -} - -/** - * any group that has above average load is considered busy - * find the busiest queue from any of busy group - */ -static runqueue_t * -ckrm_find_busy_queue(struct sched_domain *sd, int this_cpu, - unsigned long avg_load, enum idle_type idle, - int nr_group) -{ - struct sched_group *group; - runqueue_t * busiest=NULL; - unsigned long rand; - - group = sd->groups; - rand = get_ckrm_rand(nr_group); - nr_group = 0; - - do { - unsigned long load,total_load,max_load; - cpumask_t tmp; - int i; - runqueue_t * grp_busiest; - - cpus_and(tmp, group->cpumask, cpu_online_map); - if (unlikely(cpus_empty(tmp))) - goto find_nextgroup; - - total_load = 0; - max_load = 0; - 
grp_busiest = NULL; - for_each_cpu_mask(i, tmp) { - load = pid_get_pressure(rq_ckrm_load(cpu_rq(i)),0); - total_load += load; - if (load > max_load) { - max_load = load; - grp_busiest = cpu_rq(i); - } - } - - total_load = (total_load * SCHED_LOAD_SCALE) / group->cpu_power; - if (total_load > avg_load) { - busiest = grp_busiest; - if (nr_group >= rand) - break; - } - find_nextgroup: - group = group->next; - nr_group ++; - } while (group != sd->groups); - - return busiest; -} - -/** - * load_balance - pressure based load balancing algorithm used by ckrm - */ -static int ckrm_load_balance(int this_cpu, runqueue_t *this_rq, - struct sched_domain *sd, enum idle_type idle) -{ - runqueue_t *busiest; - unsigned long avg_load; - int nr_moved,nr_group; - - avg_load = ckrm_check_balance(sd, this_cpu, idle, &nr_group); - if (! avg_load) - goto out_balanced; - - busiest = ckrm_find_busy_queue(sd,this_cpu,avg_load,idle,nr_group); - if (! busiest) - goto out_balanced; - /* - * This should be "impossible", but since load - * balancing is inherently racy and statistical, - * it could happen in theory. - */ - if (unlikely(busiest == this_rq)) { - WARN_ON(1); - goto out_balanced; - } - - nr_moved = 0; - if (busiest->nr_running > 1) { - /* - * Attempt to move tasks. If find_busiest_group has found - * an imbalance but busiest->nr_running <= 1, the group is - * still unbalanced. nr_moved simply stays zero, so it is - * correctly treated as an imbalance. - */ - double_lock_balance(this_rq, busiest); - nr_moved = move_tasks(this_rq, this_cpu, busiest, - 0,sd, idle); - spin_unlock(&busiest->lock); - if (nr_moved) { - adjust_local_weight(); - } - } - - if (!nr_moved) - sd->nr_balance_failed ++; - else - sd->nr_balance_failed = 0; - - /* We were unbalanced, so reset the balancing interval */ - sd->balance_interval = sd->min_interval; - - return nr_moved; - -out_balanced: - /* tune up the balancing interval */ - if (sd->balance_interval < sd->max_interval) - sd->balance_interval *= 2; - + read_unlock(&class_list_lock); return 0; } -/* - * this_rq->lock is already held - */ -static inline int load_balance_newidle(int this_cpu, runqueue_t *this_rq, - struct sched_domain *sd) -{ - int ret; - read_lock(&class_list_lock); - ret = ckrm_load_balance(this_cpu,this_rq,sd,NEWLY_IDLE); - read_unlock(&class_list_lock); - return ret; -} -static inline int load_balance(int this_cpu, runqueue_t *this_rq, - struct sched_domain *sd, enum idle_type idle) +static inline void idle_balance(int this_cpu, runqueue_t *this_rq) { - int ret; - - spin_lock(&this_rq->lock); - read_lock(&class_list_lock); - ret= ckrm_load_balance(this_cpu,this_rq,sd,NEWLY_IDLE); - read_unlock(&class_list_lock); - spin_unlock(&this_rq->lock); - return ret; } -#else /*! CONFIG_CKRM_CPU_SCHEDULE */ +#else /* CONFIG_CKRM_CPU_SCHEDULE */ /* * move_tasks tries to move up to max_nr_move tasks from busiest to this_rq, * as part of a balancing operation within "domain". Returns the number of @@ -2365,8 +2097,6 @@ static int load_balance_newidle(int this_cpu, runqueue_t *this_rq, out: return nr_moved; } -#endif /* CONFIG_CKRM_CPU_SCHEDULE*/ - /* * idle_balance is called by schedule() if this_cpu is about to become @@ -2452,6 +2182,7 @@ next_group: group = group->next; } while (group != sd->groups); } +#endif /* CONFIG_CKRM_CPU_SCHEDULE*/ /* * rebalance_tick will get called every timer tick, on every CPU. 
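Illustrative aside, not part of the patch: the CKRM find_busiest_cpu()/load_balance() pair above compares ckrm_cpu_load values, requires roughly a 25% gap before acting (the imbalance * 4 < max_load test bails out), and then moves only half of the gap so the imbalance is not simply shifted onto the other queue. A worked example of that arithmetic with made-up load numbers:

#include <stdio.h>

int main(void)
{
	int this_load = 60, max_load = 100, idle = 0;
	int imbalance = max_load - this_load;		/* 40 */

	if (!idle && imbalance * 4 < max_load) {	/* 160 < 100 ? no, so balance */
		printf("below the ~25%% threshold, no balancing\n");
		return 0;
	}
	imbalance /= 2;					/* steal only half the gap */
	printf("move about %d units of load from the busiest cpu\n", imbalance);
	return 0;
}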
@@ -2472,6 +2203,8 @@ static void rebalance_tick(int this_cpu, runqueue_t *this_rq, unsigned long j = jiffies + CPU_OFFSET(this_cpu); struct sched_domain *sd; + ckrm_rebalance_tick(j,this_cpu); + /* Update our load */ old_load = this_rq->cpu_load; this_load = this_rq->nr_running * SCHED_LOAD_SCALE; @@ -2510,7 +2243,9 @@ static void rebalance_tick(int this_cpu, runqueue_t *this_rq, */ static inline void rebalance_tick(int cpu, runqueue_t *rq, enum idle_type idle) { + ckrm_rebalance_tick(jiffies,cpu); } + static inline void idle_balance(int cpu, runqueue_t *rq) { } @@ -2531,7 +2266,8 @@ static inline int wake_priority_sleeper(runqueue_t *rq) return 0; } -DEFINE_PER_CPU(struct kernel_stat, kstat); +DEFINE_PER_CPU(struct kernel_stat, kstat) = { { 0 } }; + EXPORT_PER_CPU_SYMBOL(kstat); /* @@ -2555,7 +2291,7 @@ EXPORT_PER_CPU_SYMBOL(kstat); #define EXPIRED_STARVING(rq) \ (STARVATION_LIMIT && ((rq)->expired_timestamp && \ (jiffies - (rq)->expired_timestamp >= \ - STARVATION_LIMIT * (lrq_nr_running(rq)) + 1))) + STARVATION_LIMIT * (local_queue_nr_running(rq)) + 1))) #endif /* @@ -2587,10 +2323,8 @@ void scheduler_tick(int user_ticks, int sys_ticks) } if (p == rq->idle) { -#ifdef CONFIG_VSERVER_HARDCPU if (!--rq->idle_tokens && !list_empty(&rq->hold_queue)) set_need_resched(); -#endif if (atomic_read(&rq->nr_iowait) > 0) cpustat->iowait += sys_ticks; @@ -2598,7 +2332,6 @@ void scheduler_tick(int user_ticks, int sys_ticks) cpustat->idle += sys_ticks; if (wake_priority_sleeper(rq)) goto out; - ckrm_sched_tick(jiffies,cpu,rq_ckrm_load(rq)); rebalance_tick(cpu, rq, IDLE); return; } @@ -2637,10 +2370,11 @@ void scheduler_tick(int user_ticks, int sys_ticks) } goto out_unlock; } +#warning MEF PLANETLAB: "if (vx_need_resched(p)) was if (!--p->time_slice) */" if (vx_need_resched(p)) { #ifdef CONFIG_CKRM_CPU_SCHEDULE /* Hubertus ... we can abstract this out */ - ckrm_lrq_t* rq = get_task_lrq(p); + struct ckrm_local_runqueue* rq = get_task_class_queue(p); #endif dequeue_task(p, rq->active); set_tsk_need_resched(p); @@ -2687,7 +2421,6 @@ void scheduler_tick(int user_ticks, int sys_ticks) out_unlock: spin_unlock(&rq->lock); out: - ckrm_sched_tick(jiffies,cpu,rq_ckrm_load(rq)); rebalance_tick(cpu, rq, NOT_IDLE); } @@ -2837,19 +2570,6 @@ need_resched: spin_lock_irq(&rq->lock); -#ifdef CONFIG_CKRM_CPU_SCHEDULE - if (prev != rq->idle) { - unsigned long long run = now - prev->timestamp; - ckrm_lrq_t * lrq = get_task_lrq(prev); - - lrq->lrq_load -= task_load(prev); - cpu_demand_event(&prev->demand_stat,CPU_DEMAND_DESCHEDULE,run); - lrq->lrq_load += task_load(prev); - - cpu_demand_event(get_task_lrq_stat(prev),CPU_DEMAND_DESCHEDULE,run); - update_local_cvt(prev, run); - } -#endif /* * if entering off of a kernel preemption go straight * to picking the next task. 
@@ -2898,17 +2618,17 @@ pick_next: #endif if (unlikely(!rq->nr_running)) { idle_balance(cpu, rq); - if (!rq->nr_running) { - next = rq->idle; -#ifdef CONFIG_CKRM_CPU_SCHEDULE - rq->expired_timestamp = 0; -#endif - wake_sleeping_dependent(cpu, rq); - goto switch_tasks; - } + if (!rq->nr_running) { + next = rq->idle; + rq->expired_timestamp = 0; + wake_sleeping_dependent(cpu, rq); + goto switch_tasks; + } } next = rq_get_next_task(rq); + if (next == rq->idle) + goto switch_tasks; if (dependent_sleeper(cpu, rq, next)) { next = rq->idle; @@ -2950,6 +2670,14 @@ switch_tasks: rq->nr_preempt++; RCU_qsctr(task_cpu(prev))++; +#ifdef CONFIG_CKRM_CPU_SCHEDULE + if (prev != rq->idle) { + unsigned long long run = now - prev->timestamp; + cpu_demand_event(get_task_local_stat(prev),CPU_DEMAND_DESCHEDULE,run); + update_local_cvt(prev, run); + } +#endif + prev->sleep_avg -= run_time; if ((long)prev->sleep_avg <= 0) { prev->sleep_avg = 0; @@ -2992,6 +2720,7 @@ switch_tasks: } EXPORT_SYMBOL(schedule); + #ifdef CONFIG_PREEMPT /* * this is is the entry point to schedule() from in-kernel preemption @@ -4092,6 +3821,7 @@ static void __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu) if (!cpu_isset(dest_cpu, p->cpus_allowed)) goto out; + set_task_cpu(p, dest_cpu); if (p->array) { /* * Sync timestamp with rq_dest's before activating. @@ -4102,12 +3832,10 @@ static void __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu) p->timestamp = p->timestamp - rq_src->timestamp_last_tick + rq_dest->timestamp_last_tick; deactivate_task(p, rq_src); - set_task_cpu(p, dest_cpu); activate_task(p, rq_dest, 0); if (TASK_PREEMPTS_CURR(p, rq_dest)) resched_task(rq_dest->curr); - } else - set_task_cpu(p, dest_cpu); + } out: double_rq_unlock(rq_src, rq_dest); @@ -4142,7 +3870,9 @@ static int migration_thread(void * data) } if (rq->active_balance) { +#ifndef CONFIG_CKRM_CPU_SCHEDULE active_load_balance(rq, cpu); +#endif rq->active_balance = 0; } @@ -4617,6 +4347,9 @@ void __init sched_init(void) { runqueue_t *rq; int i; +#ifndef CONFIG_CKRM_CPU_SCHEDULE + int j, k; +#endif #ifdef CONFIG_SMP /* Set up an initial dummy domain for early boot */ @@ -4635,50 +4368,46 @@ void __init sched_init(void) sched_group_init.next = &sched_group_init; sched_group_init.cpu_power = SCHED_LOAD_SCALE; #endif + init_cpu_classes(); for (i = 0; i < NR_CPUS; i++) { #ifndef CONFIG_CKRM_CPU_SCHEDULE - int j, k; prio_array_t *array; - +#endif rq = cpu_rq(i); spin_lock_init(&rq->lock); - for (j = 0; j < 2; j++) { - array = rq->arrays + j; - for (k = 0; k < MAX_PRIO; k++) { - INIT_LIST_HEAD(array->queue + k); - __clear_bit(k, array->bitmap); - } - // delimiter for bitsearch - __set_bit(MAX_PRIO, array->bitmap); - } - +#ifndef CONFIG_CKRM_CPU_SCHEDULE rq->active = rq->arrays; rq->expired = rq->arrays + 1; #else - rq = cpu_rq(i); - spin_lock_init(&rq->lock); + rq->ckrm_cpu_load = 0; #endif - rq->best_expired_prio = MAX_PRIO; #ifdef CONFIG_SMP rq->sd = &sched_domain_init; rq->cpu_load = 0; -#ifdef CONFIG_CKRM_CPU_SCHEDULE - ckrm_load_init(rq_ckrm_load(rq)); -#endif rq->active_balance = 0; rq->push_cpu = 0; rq->migration_thread = NULL; INIT_LIST_HEAD(&rq->migration_queue); #endif -#ifdef CONFIG_VSERVER_HARDCPU INIT_LIST_HEAD(&rq->hold_queue); -#endif atomic_set(&rq->nr_iowait, 0); + +#ifndef CONFIG_CKRM_CPU_SCHEDULE + for (j = 0; j < 2; j++) { + array = rq->arrays + j; + for (k = 0; k < MAX_PRIO; k++) { + INIT_LIST_HEAD(array->queue + k); + __clear_bit(k, array->bitmap); + } + // delimiter for bitsearch + __set_bit(MAX_PRIO, array->bitmap); + 
} +#endif } /* @@ -4690,8 +4419,7 @@ void __init sched_init(void) rq->idle = current; set_task_cpu(current, smp_processor_id()); #ifdef CONFIG_CKRM_CPU_SCHEDULE - cpu_demand_event(&(current)->demand_stat,CPU_DEMAND_INIT,0); - current->cpu_class = get_default_cpu_class(); + current->cpu_class = default_cpu_class; current->array = NULL; #endif wake_up_forked_process(current); @@ -4785,30 +4513,10 @@ EXPORT_SYMBOL(task_running_sys); #ifdef CONFIG_CKRM_CPU_SCHEDULE /** * return the classqueue object of a certain processor + * Note: not supposed to be used in performance sensitive functions */ struct classqueue_struct * get_cpu_classqueue(int cpu) { return (& (cpu_rq(cpu)->classqueue) ); } - -/** - * _ckrm_cpu_change_class - change the class of a task - */ -void _ckrm_cpu_change_class(task_t *tsk, struct ckrm_cpu_class *newcls) -{ - prio_array_t *array; - struct runqueue *rq; - unsigned long flags; - - rq = task_rq_lock(tsk,&flags); - array = tsk->array; - if (array) { - dequeue_task(tsk,array); - tsk->cpu_class = newcls; - enqueue_task(tsk,rq_active(tsk,rq)); - } else - tsk->cpu_class = newcls; - - task_rq_unlock(rq,&flags); -} #endif diff --git a/kernel/signal.c b/kernel/signal.c index e4282d2de..b3574b096 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -603,28 +603,17 @@ static int check_kill_permission(int sig, struct siginfo *info, struct task_struct *t) { int error = -EINVAL; - int user; - if (sig < 0 || sig > _NSIG) return error; - - user = (!info || - (info != SEND_SIG_PRIV && - info != SEND_SIG_FORCED && - SI_FROMUSER(info))); - error = -EPERM; - if (user && (sig != SIGCONT || - current->signal->session != t->signal->session) + if ((!info || ((unsigned long)info != 1 && + (unsigned long)info != 2 && SI_FROMUSER(info))) + && ((sig != SIGCONT) || + (current->signal->session != t->signal->session)) && (current->euid ^ t->suid) && (current->euid ^ t->uid) && (current->uid ^ t->suid) && (current->uid ^ t->uid) && !capable(CAP_KILL)) return error; - - error = -ESRCH; - if (user && !vx_check(vx_task_xid(t), VX_ADMIN|VX_IDENT)) - return error; - return security_task_kill(t, info, sig); } @@ -1066,6 +1055,9 @@ int group_send_sig_info(int sig, struct siginfo *info, struct task_struct *p) unsigned long flags; int ret; + if (!vx_check(vx_task_xid(p), VX_ADMIN|VX_WATCH|VX_IDENT)) + return -ESRCH; + ret = check_kill_permission(sig, info, p); if (!ret && sig && p->sighand) { spin_lock_irqsave(&p->sighand->siglock, flags); diff --git a/kernel/vserver/inode.c b/kernel/vserver/inode.c index 3e8120bd3..dda881895 100644 --- a/kernel/vserver/inode.c +++ b/kernel/vserver/inode.c @@ -170,37 +170,6 @@ int vc_set_iattr(uint32_t id, void __user *data) return ret; } -int vc_iattr_ioctl(struct dentry *de, unsigned int cmd, unsigned long arg) -{ - void __user *data = (void __user *)arg; - struct vcmd_ctx_iattr_v1 vc_data; - int ret; - - /* - * I don't think we need any dget/dput pairs in here as long as - * this function is always called from sys_ioctl i.e., de is - * a field of a struct file that is guaranteed not to be freed. 
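Illustrative aside, not part of the patch: the check_kill_permission() hunk above chains XORs of uids; a ^ b is non-zero exactly when a != b, so the whole chain reads "deny unless the sender's euid/uid matches the target's suid/uid somewhere, or the sender has CAP_KILL". A tiny host-side sketch (uid_pairs_all_differ() is a made-up helper name):

#include <stdio.h>

static int uid_pairs_all_differ(unsigned e, unsigned u, unsigned ts, unsigned tu)
{
	/* mirrors (euid ^ suid) && (euid ^ uid) && (uid ^ suid) && (uid ^ uid) */
	return (e ^ ts) && (e ^ tu) && (u ^ ts) && (u ^ tu);
}

int main(void)
{
	/* sender's euid equals the target's uid: one pair matches, so allowed */
	printf("deny? %d\n", uid_pairs_all_differ(1000, 1000, 0, 1000));	/* 0 */
	/* completely unrelated users: all four pairs differ, so denied */
	printf("deny? %d\n", uid_pairs_all_differ(1000, 1000, 0, 0));		/* 1 */
	return 0;
}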
- */ - if (cmd == FIOC_SETIATTR) { - if (!capable(CAP_SYS_ADMIN) || !capable(CAP_LINUX_IMMUTABLE)) - return -EPERM; - if (copy_from_user (&vc_data, data, sizeof(vc_data))) - return -EFAULT; - ret = __vc_set_iattr(de, - &vc_data.xid, &vc_data.flags, &vc_data.mask); - } - else { - if (!vx_check(0, VX_ADMIN)) - return -ENOSYS; - ret = __vc_get_iattr(de->d_inode, - &vc_data.xid, &vc_data.flags, &vc_data.mask); - } - - if (!ret && copy_to_user (data, &vc_data, sizeof(vc_data))) - ret = -EFAULT; - return ret; -} - #ifdef CONFIG_VSERVER_LEGACY #include diff --git a/kernel/vserver/sysctl.c b/kernel/vserver/sysctl.c index e1f2cacc7..298c62f18 100644 --- a/kernel/vserver/sysctl.c +++ b/kernel/vserver/sysctl.c @@ -200,6 +200,4 @@ static ctl_table vserver_table[] = { EXPORT_SYMBOL_GPL(vx_debug_dlim); -EXPORT_SYMBOL_GPL(vx_debug_nid); -EXPORT_SYMBOL_GPL(vx_debug_xid); diff --git a/mm/Makefile b/mm/Makefile index 60fbbce51..b7866b1a5 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -7,13 +7,11 @@ mmu-$(CONFIG_MMU) := fremap.o highmem.o madvise.o memory.o mincore.o \ mlock.o mmap.o mprotect.o mremap.o msync.o rmap.o \ shmem.o vmalloc.o -obj-y := bootmem.o filemap.o mempool.o fadvise.o \ +obj-y := bootmem.o filemap.o mempool.o oom_kill.o fadvise.o \ page_alloc.o page-writeback.o pdflush.o prio_tree.o \ readahead.o slab.o swap.o truncate.o vmscan.o \ $(mmu-y) -obj-$(CONFIG_OOM_KILL) += oom_kill.o -obj-$(CONFIG_OOM_PANIC) += oom_panic.o obj-$(CONFIG_SWAP) += page_io.o swap_state.o swapfile.o thrash.o obj-$(CONFIG_X86_4G) += usercopy.o obj-$(CONFIG_HUGETLBFS) += hugetlb.o diff --git a/mm/memory.c b/mm/memory.c index 0dfb74060..6c44ecca0 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1650,9 +1650,8 @@ retry: */ /* Only go through if we didn't race with anybody else... */ if (pte_none(*page_table)) { - if (!PageReserved(new_page)) - //++mm->rss; - vx_rsspages_inc(mm); + if (!PageReserved(new_page)) + ++mm->rss; flush_icache_page(vma, new_page); entry = mk_pte(new_page, vma->vm_page_prot); if (write_access) diff --git a/mm/oom_panic.c b/mm/oom_panic.c deleted file mode 100644 index b782934ac..000000000 --- a/mm/oom_panic.c +++ /dev/null @@ -1,51 +0,0 @@ -/* - * Just panic() instead of the default behavior of selecting processes - * for death. - * - * Based on - * Modular OOM handlers for 2.6.4 (C) 2003,2004 Tvrtko A. Ursulin - * and - * linux/mm/oom_kill.c (C) 1998,2000 Rik van Riel. - * - * Mark Huang - * - * $Id$ - */ - -#include -#include -#include - -/** - * out_of_memory - is the system out of memory? - */ -void out_of_memory(int gfp_mask) -{ - /* - * oom_lock protects out_of_memory()'s static variables. - * It's a global lock; this is not performance-critical. - */ - static spinlock_t oom_lock = SPIN_LOCK_UNLOCKED; - static unsigned long count; - - spin_lock(&oom_lock); - - /* - * If we have gotten only a few failures, - * we're not really oom. - */ - if (++count < 10) - goto out_unlock; - - /* - * Ok, really out of memory. Panic. 
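Illustrative aside, not part of the patch: the removed mm/oom_panic.c simply tolerated a handful of allocation failures and then panicked instead of selecting processes to kill. A host-side sketch of that policy (out_of_memory_demo() is a made-up stand-in for the removed function):

#include <stdio.h>
#include <stdlib.h>

static void out_of_memory_demo(void)
{
	static unsigned long count;

	if (++count < 10)		/* a few failures are not "really" OOM */
		return;
	fprintf(stderr, "oom: giving up after %lu failures\n", count);
	abort();			/* stands in for panic("Out Of Memory") */
}

int main(void)
{
	int i;

	for (i = 0; i < 12; i++)
		out_of_memory_demo();	/* aborts on the 10th call */
	return 0;
}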
- */ - - printk("oom-killer: gfp_mask=0x%x\n", gfp_mask); - show_free_areas(); - - panic("Out Of Memory"); - -out_unlock: - spin_unlock(&oom_lock); -} diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 675b061b7..152299c39 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -33,7 +33,6 @@ #include #include #include -#include #include @@ -277,7 +276,6 @@ free_pages_bulk(struct zone *zone, int count, /* have to delete it as __free_pages_bulk list manipulates */ list_del(&page->lru); __free_pages_bulk(page, base, zone, area, order); - ckrm_clear_page_class(page); ret++; } spin_unlock_irqrestore(&zone->lock, flags); @@ -624,10 +622,6 @@ __alloc_pages(unsigned int gfp_mask, unsigned int order, might_sleep_if(wait); - if (!ckrm_class_limit_ok((GET_MEM_CLASS(current)))) { - return NULL; - } - zones = zonelist->zones; /* the list of zones suitable for gfp_mask */ if (zones[0] == NULL) /* no zones in the zonelist */ return NULL; @@ -757,7 +751,6 @@ nopage: return NULL; got_pg: kernel_map_pages(page, 1 << order, 1); - ckrm_set_pages_class(page, 1 << order, GET_MEM_CLASS(current)); return page; } diff --git a/mm/vmscan.c b/mm/vmscan.c index e01d5c98d..95e02701a 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -37,13 +37,6 @@ #include #include -#include - -#ifndef AT_LIMIT_SUPPORT -#warning "ckrm_at_limit disabled due to problems with memory hog tests -- seting ckrm_shrink_list_empty to true" -#undef ckrm_shrink_list_empty -#define ckrm_shrink_list_empty() (1) -#endif /* possible outcome of pageout() */ typedef enum { @@ -78,9 +71,6 @@ struct scan_control { /* This context's GFP mask */ unsigned int gfp_mask; - /* Flag used by CKRM */ - unsigned int ckrm_flags; - int may_writepage; }; @@ -559,23 +549,19 @@ static void shrink_cache(struct zone *zone, struct scan_control *sc) { LIST_HEAD(page_list); struct pagevec pvec; - int max_scan = sc->nr_to_scan, nr_pass; - unsigned int ckrm_flags = sc->ckrm_flags, bit_flag; + int max_scan = sc->nr_to_scan; pagevec_init(&pvec, 1); lru_add_drain(); spin_lock_irq(&zone->lru_lock); -redo: - ckrm_get_reclaim_bits(&ckrm_flags, &bit_flag); - nr_pass = zone->nr_inactive; while (max_scan > 0) { struct page *page; int nr_taken = 0; int nr_scan = 0; int nr_freed; - while (nr_pass-- && nr_scan++ < SWAP_CLUSTER_MAX && + while (nr_scan++ < SWAP_CLUSTER_MAX && !list_empty(&zone->inactive_list)) { page = lru_to_page(&zone->inactive_list); @@ -593,25 +579,15 @@ redo: SetPageLRU(page); list_add(&page->lru, &zone->inactive_list); continue; - } else if (bit_flag && !ckrm_kick_page(page, bit_flag)) { - __put_page(page); - SetPageLRU(page); -#ifdef CONFIG_CKRM_MEM_LRUORDER_CHANGE - list_add_tail(&page->lru, &zone->inactive_list); -#else - list_add(&page->lru, &zone->inactive_list); -#endif - continue; } list_add(&page->lru, &page_list); - ckrm_mem_dec_inactive(page); nr_taken++; } zone->nr_inactive -= nr_taken; zone->pages_scanned += nr_taken; spin_unlock_irq(&zone->lru_lock); - if ((bit_flag == 0) && (nr_taken == 0)) + if (nr_taken == 0) goto done; max_scan -= nr_scan; @@ -644,9 +620,6 @@ redo: spin_lock_irq(&zone->lru_lock); } } - if (ckrm_flags && (nr_pass <= 0)) { - goto redo; - } } spin_unlock_irq(&zone->lru_lock); done: @@ -686,17 +659,11 @@ refill_inactive_zone(struct zone *zone, struct scan_control *sc) long mapped_ratio; long distress; long swap_tendency; - unsigned int ckrm_flags = sc->ckrm_flags, bit_flag; - int nr_pass; lru_add_drain(); pgmoved = 0; spin_lock_irq(&zone->lru_lock); -redo: - ckrm_get_reclaim_bits(&ckrm_flags, &bit_flag); - nr_pass = zone->nr_active; - while 
(pgscanned < nr_pages && !list_empty(&zone->active_list) && - nr_pass) { + while (pgscanned < nr_pages && !list_empty(&zone->active_list)) { page = lru_to_page(&zone->active_list); prefetchw_prev_lru_page(page, &zone->active_list, flags); if (!TestClearPageLRU(page)) @@ -712,24 +679,11 @@ redo: __put_page(page); SetPageLRU(page); list_add(&page->lru, &zone->active_list); - pgscanned++; - } else if (bit_flag && !ckrm_kick_page(page, bit_flag)) { - __put_page(page); - SetPageLRU(page); -#ifdef CONFIG_CKRM_MEM_LRUORDER_CHANGE - list_add_tail(&page->lru, &zone->active_list); -#else - list_add(&page->lru, &zone->active_list); -#endif } else { list_add(&page->lru, &l_hold); - ckrm_mem_dec_active(page); pgmoved++; - pgscanned++; - } - if (!--nr_pass && ckrm_flags) { - goto redo; } + pgscanned++; } zone->nr_active -= pgmoved; spin_unlock_irq(&zone->lru_lock); @@ -804,7 +758,6 @@ redo: if (!TestClearPageActive(page)) BUG(); list_move(&page->lru, &zone->inactive_list); - ckrm_mem_inc_inactive(page); pgmoved++; if (!pagevec_add(&pvec, page)) { zone->nr_inactive += pgmoved; @@ -833,7 +786,6 @@ redo: BUG(); BUG_ON(!PageActive(page)); list_move(&page->lru, &zone->active_list); - ckrm_mem_inc_active(page); pgmoved++; if (!pagevec_add(&pvec, page)) { zone->nr_active += pgmoved; @@ -881,7 +833,6 @@ shrink_zone(struct zone *zone, struct scan_control *sc) sc->nr_to_reclaim = SWAP_CLUSTER_MAX; while (nr_active || nr_inactive) { - sc->ckrm_flags = ckrm_setup_reclamation(); if (nr_active) { sc->nr_to_scan = min(nr_active, (unsigned long)SWAP_CLUSTER_MAX); @@ -897,118 +848,9 @@ shrink_zone(struct zone *zone, struct scan_control *sc) if (sc->nr_to_reclaim <= 0) break; } - ckrm_teardown_reclamation(); } } -#if defined(CONFIG_CKRM_RES_MEM) && defined(AT_LIMIT_SUPPORT) -// This function needs to be given more thought. -// Shrink the class to be at 90% of its limit -static void -ckrm_shrink_class(ckrm_mem_res_t *cls) -{ - struct scan_control sc; - struct zone *zone; - int zindex = 0, active_credit = 0, inactive_credit = 0; - - if (ckrm_test_set_shrink(cls)) { // set the SHRINK bit atomically - // if it is already set somebody is working on it. so... 
leave - return; - } - sc.nr_mapped = read_page_state(nr_mapped); - sc.nr_scanned = 0; - sc.ckrm_flags = ckrm_get_reclaim_flags(cls); - sc.nr_reclaimed = 0; - sc.priority = 0; // always very high priority - - for_each_zone(zone) { - int zone_total, zone_limit, active_limit, inactive_limit; - int active_over, inactive_over; - unsigned long nr_active, nr_inactive; - u64 temp; - - zone->temp_priority = zone->prev_priority; - zone->prev_priority = sc.priority; - - zone_total = zone->nr_active + zone->nr_inactive + zone->free_pages; - - temp = (u64) cls->pg_limit * zone_total; - do_div(temp, ckrm_tot_lru_pages); - zone_limit = (int) temp; - active_limit = (6 * zone_limit) / 10; // 2/3rd in active list - inactive_limit = (3 * zone_limit) / 10; // 1/3rd in inactive list - - active_over = cls->nr_active[zindex] - active_limit + active_credit; - inactive_over = active_over + - (cls->nr_inactive[zindex] - inactive_limit) + inactive_credit; - - if (active_over > 0) { - zone->nr_scan_active += active_over + 1; - nr_active = zone->nr_scan_active; - active_credit = 0; - } else { - active_credit += active_over; - nr_active = 0; - } - - if (inactive_over > 0) { - zone->nr_scan_inactive += inactive_over; - nr_inactive = zone->nr_scan_inactive; - inactive_credit = 0; - } else { - inactive_credit += inactive_over; - nr_inactive = 0; - } - while (nr_active || nr_inactive) { - if (nr_active) { - sc.nr_to_scan = min(nr_active, - (unsigned long)SWAP_CLUSTER_MAX); - nr_active -= sc.nr_to_scan; - refill_inactive_zone(zone, &sc); - } - - if (nr_inactive) { - sc.nr_to_scan = min(nr_inactive, - (unsigned long)SWAP_CLUSTER_MAX); - nr_inactive -= sc.nr_to_scan; - shrink_cache(zone, &sc); - if (sc.nr_to_reclaim <= 0) - break; - } - } - zone->prev_priority = zone->temp_priority; - zindex++; - } - ckrm_clear_shrink(cls); -} - -static void -ckrm_shrink_classes(void) -{ - ckrm_mem_res_t *cls; - - spin_lock(&ckrm_mem_lock); - while (!ckrm_shrink_list_empty()) { - cls = list_entry(ckrm_shrink_list.next, ckrm_mem_res_t, - shrink_list); - spin_unlock(&ckrm_mem_lock); - ckrm_shrink_class(cls); - spin_lock(&ckrm_mem_lock); - list_del(&cls->shrink_list); - cls->flags &= ~MEM_AT_LIMIT; - } - spin_unlock(&ckrm_mem_lock); -} - -#else - -#if defined(CONFIG_CKRM_RES_MEM) && !defined(AT_LIMIT_SUPPORT) -#warning "disabling ckrm_at_limit -- setting ckrm_shrink_classes to noop " -#endif - -#define ckrm_shrink_classes() do { } while(0) -#endif - /* * This is the direct reclaim path, for page-allocating processes. 
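
The ckrm_shrink_class() routine removed just above splits a class's page limit across zones in proportion to each zone's share of all LRU pages, targets 6/10 of that per-zone quota for the active list and 3/10 for the inactive list, and carries any unused headroom forward to the next zone as a credit. The following is a minimal userspace sketch of that arithmetic only; the zone sizes, class limit and per-class page counts are made-up illustrative values, not taken from the patch.

/* Userspace sketch of the per-zone quota arithmetic in the removed
 * ckrm_shrink_class() (mm/vmscan.c).  All numbers below are made up. */
#include <stdio.h>
#include <stdint.h>

int main(void)
{
	long zone_total[3]      = {  4096,  65536, 131072 }; /* active+inactive+free per zone */
	long cls_nr_active[3]   = {  1024,  30000,  20000 }; /* pages the class holds (active) */
	long cls_nr_inactive[3] = {  1024,  20000,  10000 }; /* pages the class holds (inactive) */
	long tot_lru_pages      = 4096 + 65536 + 131072;
	long cls_pg_limit       = 50000;                      /* stands in for cls->pg_limit */

	long active_credit = 0, inactive_credit = 0;

	for (int z = 0; z < 3; z++) {
		/* class limit scaled by this zone's share of all LRU pages */
		long zone_limit = (long)((int64_t)cls_pg_limit * zone_total[z]
					 / tot_lru_pages);
		long active_limit   = (6 * zone_limit) / 10;
		long inactive_limit = (3 * zone_limit) / 10;

		long active_over   = cls_nr_active[z] - active_limit + active_credit;
		long inactive_over = active_over +
				(cls_nr_inactive[z] - inactive_limit) + inactive_credit;

		long scan_active = 0, scan_inactive = 0;
		if (active_over > 0) {
			scan_active = active_over + 1;  /* pages to push off the active list */
			active_credit = 0;
		} else {
			active_credit += active_over;   /* under quota: carry credit forward */
		}
		if (inactive_over > 0) {
			scan_inactive = inactive_over;  /* pages to reclaim from the inactive list */
			inactive_credit = 0;
		} else {
			inactive_credit += inactive_over;
		}

		printf("zone %d: limit=%ld scan_active=%ld scan_inactive=%ld\n",
		       z, zone_limit, scan_active, scan_inactive);
	}
	return 0;
}
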
We only * try to reclaim pages from zones which will satisfy the caller's allocation @@ -1315,9 +1157,6 @@ static int kswapd(void *p) finish_wait(&pgdat->kswapd_wait, &wait); try_to_clip_inodes(); - if (!ckrm_shrink_list_empty()) - ckrm_shrink_classes(); - else balance_pgdat(pgdat, 0); } return 0; @@ -1328,7 +1167,7 @@ static int kswapd(void *p) */ void wakeup_kswapd(struct zone *zone) { - if ((zone->free_pages > zone->pages_low) && ckrm_shrink_list_empty()) + if (zone->free_pages > zone->pages_low) return; if (!waitqueue_active(&zone->zone_pgdat->kswapd_wait)) return; diff --git a/net/core/sock.c b/net/core/sock.c index 266397922..d5b2d9105 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -331,18 +331,6 @@ int sock_setsockopt(struct socket *sock, int level, int optname, clear_bit(SOCK_PASS_CRED, &sock->flags); break; - case SO_SETXID: - if (current->xid) { - ret = -EPERM; - break; - } - if (val < 0 || val > MAX_S_CONTEXT) { - ret = -EINVAL; - break; - } - sk->sk_xid = val; - break; - case SO_TIMESTAMP: sk->sk_rcvtstamp = valbool; if (valbool) diff --git a/net/ipv4/netfilter/ip_conntrack_core.c b/net/ipv4/netfilter/ip_conntrack_core.c index 05fbb43cc..00a89f4f8 100644 --- a/net/ipv4/netfilter/ip_conntrack_core.c +++ b/net/ipv4/netfilter/ip_conntrack_core.c @@ -670,10 +670,8 @@ init_conntrack(const struct ip_conntrack_tuple *tuple, conntrack->ct_general.destroy = destroy_conntrack; conntrack->tuplehash[IP_CT_DIR_ORIGINAL].tuple = *tuple; conntrack->tuplehash[IP_CT_DIR_ORIGINAL].ctrack = conntrack; - conntrack->xid[IP_CT_DIR_ORIGINAL] = -1; conntrack->tuplehash[IP_CT_DIR_REPLY].tuple = repl_tuple; conntrack->tuplehash[IP_CT_DIR_REPLY].ctrack = conntrack; - conntrack->xid[IP_CT_DIR_REPLY] = -1; for (i=0; i < IP_CT_NUMBER; i++) conntrack->infos[i].master = &conntrack->ct_general; diff --git a/net/ipv4/netfilter/ip_conntrack_pptp.c b/net/ipv4/netfilter/ip_conntrack_pptp.c deleted file mode 100644 index 29ab1a495..000000000 --- a/net/ipv4/netfilter/ip_conntrack_pptp.c +++ /dev/null @@ -1,712 +0,0 @@ -/* - * ip_conntrack_pptp.c - Version 2.0 - * - * Connection tracking support for PPTP (Point to Point Tunneling Protocol). - * PPTP is a a protocol for creating virtual private networks. - * It is a specification defined by Microsoft and some vendors - * working with Microsoft. PPTP is built on top of a modified - * version of the Internet Generic Routing Encapsulation Protocol. - * GRE is defined in RFC 1701 and RFC 1702. Documentation of - * PPTP can be found in RFC 2637 - * - * (C) 2000-2003 by Harald Welte - * - * Development of this code funded by Astaro AG (http://www.astaro.com/) - * - * Limitations: - * - We blindly assume that control connections are always - * established in PNS->PAC direction. 
This is a violation - * of RFFC2673 - * - * TODO: - finish support for multiple calls within one session - * (needs expect reservations in newnat) - * - testing of incoming PPTP calls - * - * Changes: - * 2002-02-05 - Version 1.3 - * - Call ip_conntrack_unexpect_related() from - * pptp_timeout_related() to destroy expectations in case - * CALL_DISCONNECT_NOTIFY or tcp fin packet was seen - * (Philip Craig ) - * - Add Version information at module loadtime - * 2002-02-10 - Version 1.6 - * - move to C99 style initializers - * - remove second expectation if first arrives - * 2004-10-22 - Version 2.0 - * - merge Mandrake's 2.6.x port with recent 2.6.x API changes - * - fix lots of linear skb assumptions from Mandrake's port - * - */ - -#include -#include -#include -#include -#include -#include - -#include -#include -#include -#include - -#define IP_CT_PPTP_VERSION "2.0" - -MODULE_LICENSE("GPL"); -MODULE_AUTHOR("Harald Welte "); -MODULE_DESCRIPTION("Netfilter connection tracking helper module for PPTP"); - -DECLARE_LOCK(ip_pptp_lock); - -#if 0 -#include "ip_conntrack_pptp_priv.h" -#define DEBUGP(format, args...) printk(KERN_DEBUG "%s:%s: " format, __FILE__, __FUNCTION__, ## args) -#else -#define DEBUGP(format, args...) -#endif - -#define SECS *HZ -#define MINS * 60 SECS -#define HOURS * 60 MINS -#define DAYS * 24 HOURS - -#define PPTP_GRE_TIMEOUT (10 MINS) -#define PPTP_GRE_STREAM_TIMEOUT (5 DAYS) - -static int pptp_expectfn(struct ip_conntrack *ct) -{ - struct ip_conntrack *master; - struct ip_conntrack_expect *exp; - - DEBUGP("increasing timeouts\n"); - /* increase timeout of GRE data channel conntrack entry */ - ct->proto.gre.timeout = PPTP_GRE_TIMEOUT; - ct->proto.gre.stream_timeout = PPTP_GRE_STREAM_TIMEOUT; - - master = master_ct(ct); - if (!master) { - DEBUGP(" no master!!!\n"); - return 0; - } - - exp = ct->master; - if (!exp) { - DEBUGP("no expectation!!\n"); - return 0; - } - - DEBUGP("completing tuples with ct info\n"); - /* we can do this, since we're unconfirmed */ - if (ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.u.gre.key == - htonl(master->help.ct_pptp_info.pac_call_id)) { - /* assume PNS->PAC */ - ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.u.gre.key = - htonl(master->help.ct_pptp_info.pns_call_id); - ct->tuplehash[IP_CT_DIR_REPLY].tuple.dst.u.gre.key = - htonl(master->help.ct_pptp_info.pns_call_id); - } else { - /* assume PAC->PNS */ - ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.u.gre.key = - htonl(master->help.ct_pptp_info.pac_call_id); - ct->tuplehash[IP_CT_DIR_REPLY].tuple.dst.u.gre.key = - htonl(master->help.ct_pptp_info.pac_call_id); - } - - /* delete other expectation */ - if (exp->expected_list.next != &exp->expected_list) { - struct ip_conntrack_expect *other_exp; - struct list_head *cur_item, *next; - - for (cur_item = master->sibling_list.next; - cur_item != &master->sibling_list; cur_item = next) { - next = cur_item->next; - other_exp = list_entry(cur_item, - struct ip_conntrack_expect, - expected_list); - /* remove only if occurred at same sequence number */ - if (other_exp != exp && other_exp->seq == exp->seq) { - DEBUGP("unexpecting other direction\n"); - ip_ct_gre_keymap_destroy(other_exp); - ip_conntrack_unexpect_related(other_exp); - } - } - } - - return 0; -} - -/* timeout GRE data connections */ -static int pptp_timeout_related(struct ip_conntrack *ct) -{ - struct list_head *cur_item, *next; - struct ip_conntrack_expect *exp; - - /* FIXME: do we have to lock something ? 
*/ - for (cur_item = ct->sibling_list.next; - cur_item != &ct->sibling_list; cur_item = next) { - next = cur_item->next; - exp = list_entry(cur_item, struct ip_conntrack_expect, - expected_list); - - ip_ct_gre_keymap_destroy(exp); - if (!exp->sibling) { - ip_conntrack_unexpect_related(exp); - continue; - } - - DEBUGP("setting timeout of conntrack %p to 0\n", - exp->sibling); - exp->sibling->proto.gre.timeout = 0; - exp->sibling->proto.gre.stream_timeout = 0; - /* refresh_acct will not modify counters if skb == NULL */ - ip_ct_refresh_acct(exp->sibling, 0, NULL, 0); - } - - return 0; -} - -/* expect GRE connections (PNS->PAC and PAC->PNS direction) */ -static inline int -exp_gre(struct ip_conntrack *master, - u_int32_t seq, - u_int16_t callid, - u_int16_t peer_callid) -{ - struct ip_conntrack_tuple inv_tuple; - struct ip_conntrack_tuple exp_tuples[] = { - /* tuple in original direction, PNS->PAC */ - { .src = { .ip = master->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.ip, - .u = { .gre = { .key = htonl(ntohs(peer_callid)) } } - }, - .dst = { .ip = master->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.ip, - .u = { .gre = { .key = htonl(ntohs(callid)) } }, - .protonum = IPPROTO_GRE - }, - }, - /* tuple in reply direction, PAC->PNS */ - { .src = { .ip = master->tuplehash[IP_CT_DIR_REPLY].tuple.src.ip, - .u = { .gre = { .key = htonl(ntohs(callid)) } } - }, - .dst = { .ip = master->tuplehash[IP_CT_DIR_REPLY].tuple.dst.ip, - .u = { .gre = { .key = htonl(ntohs(peer_callid)) } }, - .protonum = IPPROTO_GRE - }, - } - }, *exp_tuple; - - for (exp_tuple = exp_tuples; exp_tuple < &exp_tuples[2]; exp_tuple++) { - struct ip_conntrack_expect *exp; - - exp = ip_conntrack_expect_alloc(); - if (exp == NULL) - return 1; - - memcpy(&exp->tuple, exp_tuple, sizeof(exp->tuple)); - - exp->mask.src.ip = 0xffffffff; - exp->mask.src.u.all = 0; - exp->mask.dst.u.all = 0; - exp->mask.dst.u.gre.key = 0xffffffff; - exp->mask.dst.ip = 0xffffffff; - exp->mask.dst.protonum = 0xffff; - - exp->seq = seq; - exp->expectfn = pptp_expectfn; - - exp->help.exp_pptp_info.pac_call_id = ntohs(callid); - exp->help.exp_pptp_info.pns_call_id = ntohs(peer_callid); - - DEBUGP("calling expect_related "); - DUMP_TUPLE_RAW(&exp->tuple); - - /* Add GRE keymap entries */ - if (ip_ct_gre_keymap_add(exp, &exp->tuple, 0) != 0) { - kfree(exp); - return 1; - } - - invert_tuplepr(&inv_tuple, &exp->tuple); - if (ip_ct_gre_keymap_add(exp, &inv_tuple, 1) != 0) { - ip_ct_gre_keymap_destroy(exp); - kfree(exp); - return 1; - } - - if (ip_conntrack_expect_related(exp, master) != 0) { - ip_ct_gre_keymap_destroy(exp); - kfree(exp); - DEBUGP("cannot expect_related()\n"); - return 1; - } - } - - return 0; -} - -static inline int -pptp_inbound_pkt(struct sk_buff *skb, - struct tcphdr *tcph, - unsigned int ctlhoff, - size_t datalen, - struct ip_conntrack *ct) -{ - struct PptpControlHeader _ctlh, *ctlh; - unsigned int reqlen; - union pptp_ctrl_union _pptpReq, *pptpReq; - struct ip_ct_pptp_master *info = &ct->help.ct_pptp_info; - u_int16_t msg, *cid, *pcid; - u_int32_t seq; - - ctlh = skb_header_pointer(skb, ctlhoff, sizeof(_ctlh), &_ctlh); - if (unlikely(!ctlh)) { - DEBUGP("error during skb_header_pointer\n"); - return NF_ACCEPT; - } - - reqlen = datalen - sizeof(struct pptp_pkt_hdr) - sizeof(_ctlh); - pptpReq = skb_header_pointer(skb, ctlhoff+sizeof(struct pptp_pkt_hdr), - reqlen, &_pptpReq); - if (unlikely(!pptpReq)) { - DEBUGP("error during skb_header_pointer\n"); - return NF_ACCEPT; - } - - msg = ntohs(ctlh->messageType); - DEBUGP("inbound control message %s\n", 
strMName[msg]); - - switch (msg) { - case PPTP_START_SESSION_REPLY: - if (reqlen < sizeof(_pptpReq.srep)) { - DEBUGP("%s: short packet\n", strMName[msg]); - break; - } - - /* server confirms new control session */ - if (info->sstate < PPTP_SESSION_REQUESTED) { - DEBUGP("%s without START_SESS_REQUEST\n", - strMName[msg]); - break; - } - if (pptpReq->srep.resultCode == PPTP_START_OK) - info->sstate = PPTP_SESSION_CONFIRMED; - else - info->sstate = PPTP_SESSION_ERROR; - break; - - case PPTP_STOP_SESSION_REPLY: - if (reqlen < sizeof(_pptpReq.strep)) { - DEBUGP("%s: short packet\n", strMName[msg]); - break; - } - - /* server confirms end of control session */ - if (info->sstate > PPTP_SESSION_STOPREQ) { - DEBUGP("%s without STOP_SESS_REQUEST\n", - strMName[msg]); - break; - } - if (pptpReq->strep.resultCode == PPTP_STOP_OK) - info->sstate = PPTP_SESSION_NONE; - else - info->sstate = PPTP_SESSION_ERROR; - break; - - case PPTP_OUT_CALL_REPLY: - if (reqlen < sizeof(_pptpReq.ocack)) { - DEBUGP("%s: short packet\n", strMName[msg]); - break; - } - - /* server accepted call, we now expect GRE frames */ - if (info->sstate != PPTP_SESSION_CONFIRMED) { - DEBUGP("%s but no session\n", strMName[msg]); - break; - } - if (info->cstate != PPTP_CALL_OUT_REQ && - info->cstate != PPTP_CALL_OUT_CONF) { - DEBUGP("%s without OUTCALL_REQ\n", strMName[msg]); - break; - } - if (pptpReq->ocack.resultCode != PPTP_OUTCALL_CONNECT) { - info->cstate = PPTP_CALL_NONE; - break; - } - - cid = &pptpReq->ocack.callID; - pcid = &pptpReq->ocack.peersCallID; - - info->pac_call_id = ntohs(*cid); - - if (htons(info->pns_call_id) != *pcid) { - DEBUGP("%s for unknown callid %u\n", - strMName[msg], ntohs(*pcid)); - break; - } - - DEBUGP("%s, CID=%X, PCID=%X\n", strMName[msg], - ntohs(*cid), ntohs(*pcid)); - - info->cstate = PPTP_CALL_OUT_CONF; - - seq = ntohl(tcph->seq) + sizeof(struct pptp_pkt_hdr) - + sizeof(struct PptpControlHeader) - + ((void *)pcid - (void *)pptpReq); - - if (exp_gre(ct, seq, *cid, *pcid) != 0) - printk("ip_conntrack_pptp: error during exp_gre\n"); - break; - - case PPTP_IN_CALL_REQUEST: - if (reqlen < sizeof(_pptpReq.icack)) { - DEBUGP("%s: short packet\n", strMName[msg]); - break; - } - - /* server tells us about incoming call request */ - if (info->sstate != PPTP_SESSION_CONFIRMED) { - DEBUGP("%s but no session\n", strMName[msg]); - break; - } - pcid = &pptpReq->icack.peersCallID; - DEBUGP("%s, PCID=%X\n", strMName[msg], ntohs(*pcid)); - info->cstate = PPTP_CALL_IN_REQ; - info->pac_call_id = ntohs(*pcid); - break; - - case PPTP_IN_CALL_CONNECT: - if (reqlen < sizeof(_pptpReq.iccon)) { - DEBUGP("%s: short packet\n", strMName[msg]); - break; - } - - /* server tells us about incoming call established */ - if (info->sstate != PPTP_SESSION_CONFIRMED) { - DEBUGP("%s but no session\n", strMName[msg]); - break; - } - if (info->sstate != PPTP_CALL_IN_REP - && info->sstate != PPTP_CALL_IN_CONF) { - DEBUGP("%s but never sent IN_CALL_REPLY\n", - strMName[msg]); - break; - } - - pcid = &pptpReq->iccon.peersCallID; - cid = &info->pac_call_id; - - if (info->pns_call_id != ntohs(*pcid)) { - DEBUGP("%s for unknown CallID %u\n", - strMName[msg], ntohs(*cid)); - break; - } - - DEBUGP("%s, PCID=%X\n", strMName[msg], ntohs(*pcid)); - info->cstate = PPTP_CALL_IN_CONF; - - /* we expect a GRE connection from PAC to PNS */ - seq = ntohl(tcph->seq) + sizeof(struct pptp_pkt_hdr) - + sizeof(struct PptpControlHeader) - + ((void *)pcid - (void *)pptpReq); - - if (exp_gre(ct, seq, *cid, *pcid) != 0) - printk("ip_conntrack_pptp: error during 
exp_gre\n"); - - break; - - case PPTP_CALL_DISCONNECT_NOTIFY: - if (reqlen < sizeof(_pptpReq.disc)) { - DEBUGP("%s: short packet\n", strMName[msg]); - break; - } - - /* server confirms disconnect */ - cid = &pptpReq->disc.callID; - DEBUGP("%s, CID=%X\n", strMName[msg], ntohs(*cid)); - info->cstate = PPTP_CALL_NONE; - - /* untrack this call id, unexpect GRE packets */ - pptp_timeout_related(ct); - break; - - case PPTP_WAN_ERROR_NOTIFY: - break; - - case PPTP_ECHO_REQUEST: - case PPTP_ECHO_REPLY: - /* I don't have to explain these ;) */ - break; - default: - DEBUGP("invalid %s (TY=%d)\n", (msg <= PPTP_MSG_MAX) - ? strMName[msg]:strMName[0], msg); - break; - } - - return NF_ACCEPT; - -} - -static inline int -pptp_outbound_pkt(struct sk_buff *skb, - struct tcphdr *tcph, - unsigned int ctlhoff, - size_t datalen, - struct ip_conntrack *ct) -{ - struct PptpControlHeader _ctlh, *ctlh; - unsigned int reqlen; - union pptp_ctrl_union _pptpReq, *pptpReq; - struct ip_ct_pptp_master *info = &ct->help.ct_pptp_info; - u_int16_t msg, *cid, *pcid; - - ctlh = skb_header_pointer(skb, ctlhoff, sizeof(_ctlh), &_ctlh); - if (!ctlh) - return NF_ACCEPT; - - reqlen = datalen - sizeof(struct pptp_pkt_hdr) - sizeof(_ctlh); - pptpReq = skb_header_pointer(skb, ctlhoff+sizeof(_ctlh), reqlen, - &_pptpReq); - if (!pptpReq) - return NF_ACCEPT; - - msg = ntohs(ctlh->messageType); - DEBUGP("outbound control message %s\n", strMName[msg]); - - switch (msg) { - case PPTP_START_SESSION_REQUEST: - /* client requests for new control session */ - if (info->sstate != PPTP_SESSION_NONE) { - DEBUGP("%s but we already have one", - strMName[msg]); - } - info->sstate = PPTP_SESSION_REQUESTED; - break; - case PPTP_STOP_SESSION_REQUEST: - /* client requests end of control session */ - info->sstate = PPTP_SESSION_STOPREQ; - break; - - case PPTP_OUT_CALL_REQUEST: - if (reqlen < sizeof(_pptpReq.ocreq)) { - DEBUGP("%s: short packet\n", strMName[msg]); - break; - } - - /* client initiating connection to server */ - if (info->sstate != PPTP_SESSION_CONFIRMED) { - DEBUGP("%s but no session\n", - strMName[msg]); - break; - } - info->cstate = PPTP_CALL_OUT_REQ; - /* track PNS call id */ - cid = &pptpReq->ocreq.callID; - DEBUGP("%s, CID=%X\n", strMName[msg], ntohs(*cid)); - info->pns_call_id = ntohs(*cid); - break; - case PPTP_IN_CALL_REPLY: - if (reqlen < sizeof(_pptpReq.icack)) { - DEBUGP("%s: short packet\n", strMName[msg]); - break; - } - - /* client answers incoming call */ - if (info->cstate != PPTP_CALL_IN_REQ - && info->cstate != PPTP_CALL_IN_REP) { - DEBUGP("%s without incall_req\n", - strMName[msg]); - break; - } - if (pptpReq->icack.resultCode != PPTP_INCALL_ACCEPT) { - info->cstate = PPTP_CALL_NONE; - break; - } - pcid = &pptpReq->icack.peersCallID; - if (info->pac_call_id != ntohs(*pcid)) { - DEBUGP("%s for unknown call %u\n", - strMName[msg], ntohs(*pcid)); - break; - } - DEBUGP("%s, CID=%X\n", strMName[msg], ntohs(*pcid)); - /* part two of the three-way handshake */ - info->cstate = PPTP_CALL_IN_REP; - info->pns_call_id = ntohs(pptpReq->icack.callID); - break; - - case PPTP_CALL_CLEAR_REQUEST: - /* client requests hangup of call */ - if (info->sstate != PPTP_SESSION_CONFIRMED) { - DEBUGP("CLEAR_CALL but no session\n"); - break; - } - /* FUTURE: iterate over all calls and check if - * call ID is valid. 
We don't do this without newnat, - * because we only know about last call */ - info->cstate = PPTP_CALL_CLEAR_REQ; - break; - case PPTP_SET_LINK_INFO: - break; - case PPTP_ECHO_REQUEST: - case PPTP_ECHO_REPLY: - /* I don't have to explain these ;) */ - break; - default: - DEBUGP("invalid %s (TY=%d)\n", (msg <= PPTP_MSG_MAX)? - strMName[msg]:strMName[0], msg); - /* unknown: no need to create GRE masq table entry */ - break; - } - - return NF_ACCEPT; -} - - -/* track caller id inside control connection, call expect_related */ -static int -conntrack_pptp_help(struct sk_buff *skb, - struct ip_conntrack *ct, enum ip_conntrack_info ctinfo) - -{ - struct pptp_pkt_hdr _pptph, *pptph; - - struct tcphdr _tcph, *tcph; - u_int32_t tcplen = skb->len - skb->nh.iph->ihl * 4; - u_int32_t datalen; - void *datalimit; - int dir = CTINFO2DIR(ctinfo); - struct ip_ct_pptp_master *info = &ct->help.ct_pptp_info; - unsigned int nexthdr_off; - - int oldsstate, oldcstate; - int ret; - - /* don't do any tracking before tcp handshake complete */ - if (ctinfo != IP_CT_ESTABLISHED - && ctinfo != IP_CT_ESTABLISHED+IP_CT_IS_REPLY) { - DEBUGP("ctinfo = %u, skipping\n", ctinfo); - return NF_ACCEPT; - } - - nexthdr_off = skb->nh.iph->ihl*4; - tcph = skb_header_pointer(skb, skb->nh.iph->ihl*4, sizeof(_tcph), - &_tcph); - if (!tcph) - return NF_ACCEPT; - - /* not a complete TCP header? */ - if (tcplen < sizeof(struct tcphdr) || tcplen < tcph->doff * 4) { - DEBUGP("tcplen = %u\n", tcplen); - return NF_ACCEPT; - } - - - datalen = tcplen - tcph->doff * 4; - - /* checksum invalid? */ - if (tcp_v4_check(tcph, tcplen, skb->nh.iph->saddr, skb->nh.iph->daddr, - csum_partial((char *) tcph, tcplen, 0))) { - printk(KERN_NOTICE __FILE__ ": bad csum\n"); - /* W2K PPTP server sends TCP packets with wrong checksum :(( */ - //return NF_ACCEPT; - } - - if (tcph->fin || tcph->rst) { - DEBUGP("RST/FIN received, timeouting GRE\n"); - /* can't do this after real newnat */ - info->cstate = PPTP_CALL_NONE; - - /* untrack this call id, unexpect GRE packets */ - pptp_timeout_related(ct); - } - - nexthdr_off += tcph->doff*4; - pptph = skb_header_pointer(skb, skb->nh.iph->ihl*4 + tcph->doff*4, - sizeof(_pptph), &_pptph); - if (!pptph) { - DEBUGP("no full PPTP header, can't track\n"); - return NF_ACCEPT; - } - - datalimit = (void *) pptph + datalen; - - /* if it's not a control message we can't do anything with it */ - if (ntohs(pptph->packetType) != PPTP_PACKET_CONTROL || - ntohl(pptph->magicCookie) != PPTP_MAGIC_COOKIE) { - DEBUGP("not a control packet\n"); - return NF_ACCEPT; - } - - oldsstate = info->sstate; - oldcstate = info->cstate; - - LOCK_BH(&ip_pptp_lock); - - nexthdr_off += sizeof(_pptph); - /* FIXME: We just blindly assume that the control connection is always - * established from PNS->PAC. 
However, RFC makes no guarantee */ - if (dir == IP_CT_DIR_ORIGINAL) - /* client -> server (PNS -> PAC) */ - ret = pptp_outbound_pkt(skb, tcph, nexthdr_off, datalen, ct); - else - /* server -> client (PAC -> PNS) */ - ret = pptp_inbound_pkt(skb, tcph, nexthdr_off, datalen, ct); - DEBUGP("sstate: %d->%d, cstate: %d->%d\n", - oldsstate, info->sstate, oldcstate, info->cstate); - UNLOCK_BH(&ip_pptp_lock); - - return ret; -} - -/* control protocol helper */ -static struct ip_conntrack_helper pptp = { - .list = { NULL, NULL }, - .name = "pptp", - .flags = IP_CT_HELPER_F_REUSE_EXPECT, - .me = THIS_MODULE, - .max_expected = 2, - .timeout = 0, - .tuple = { .src = { .ip = 0, - .u = { .tcp = { .port = - __constant_htons(PPTP_CONTROL_PORT) } } - }, - .dst = { .ip = 0, - .u = { .all = 0 }, - .protonum = IPPROTO_TCP - } - }, - .mask = { .src = { .ip = 0, - .u = { .tcp = { .port = 0xffff } } - }, - .dst = { .ip = 0, - .u = { .all = 0 }, - .protonum = 0xffff - } - }, - .help = conntrack_pptp_help -}; - -/* ip_conntrack_pptp initialization */ -static int __init init(void) -{ - int retcode; - - DEBUGP(__FILE__ ": registering helper\n"); - if ((retcode = ip_conntrack_helper_register(&pptp))) { - printk(KERN_ERR "Unable to register conntrack application " - "helper for pptp: %d\n", retcode); - return -EIO; - } - - printk("ip_conntrack_pptp version %s loaded\n", IP_CT_PPTP_VERSION); - return 0; -} - -static void __exit fini(void) -{ - ip_conntrack_helper_unregister(&pptp); - printk("ip_conntrack_pptp version %s unloaded\n", IP_CT_PPTP_VERSION); -} - -module_init(init); -module_exit(fini); - -EXPORT_SYMBOL(ip_pptp_lock); diff --git a/net/ipv4/netfilter/ip_conntrack_pptp_priv.h b/net/ipv4/netfilter/ip_conntrack_pptp_priv.h deleted file mode 100644 index 6b52564e8..000000000 --- a/net/ipv4/netfilter/ip_conntrack_pptp_priv.h +++ /dev/null @@ -1,24 +0,0 @@ -#ifndef _IP_CT_PPTP_PRIV_H -#define _IP_CT_PPTP_PRIV_H - -/* PptpControlMessageType names */ -static const char *strMName[] = { - "UNKNOWN_MESSAGE", - "START_SESSION_REQUEST", - "START_SESSION_REPLY", - "STOP_SESSION_REQUEST", - "STOP_SESSION_REPLY", - "ECHO_REQUEST", - "ECHO_REPLY", - "OUT_CALL_REQUEST", - "OUT_CALL_REPLY", - "IN_CALL_REQUEST", - "IN_CALL_REPLY", - "IN_CALL_CONNECT", - "CALL_CLEAR_REQUEST", - "CALL_DISCONNECT_NOTIFY", - "WAN_ERROR_NOTIFY", - "SET_LINK_INFO" -}; - -#endif diff --git a/net/ipv4/netfilter/ip_conntrack_proto_gre.c b/net/ipv4/netfilter/ip_conntrack_proto_gre.c deleted file mode 100644 index 013f759cc..000000000 --- a/net/ipv4/netfilter/ip_conntrack_proto_gre.c +++ /dev/null @@ -1,349 +0,0 @@ -/* - * ip_conntrack_proto_gre.c - Version 2.0 - * - * Connection tracking protocol helper module for GRE. - * - * GRE is a generic encapsulation protocol, which is generally not very - * suited for NAT, as it has no protocol-specific part as port numbers. - * - * It has an optional key field, which may help us distinguishing two - * connections between the same two hosts. - * - * GRE is defined in RFC 1701 and RFC 1702, as well as RFC 2784 - * - * PPTP is built on top of a modified version of GRE, and has a mandatory - * field called "CallID", which serves us for the same purpose as the key - * field in plain GRE. 
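
Throughout the PPTP conntrack helper deleted above, a 16-bit call ID from the control channel is stored in the 32-bit GRE key slot of a tuple as htonl(ntohs(callid)) and recovered the same way when expectations are matched. A small userspace sketch of that round trip, using the ordinary byte-order helpers; the sample value is arbitrary.

/* Sketch of the call-ID <-> GRE-key conversion used by the removed
 * PPTP helper: a 16-bit call ID becomes a 32-bit network-order GRE
 * key via htonl(ntohs(id)).  Userspace only; 0x1234 is arbitrary. */
#include <stdio.h>
#include <stdint.h>
#include <arpa/inet.h>

int main(void)
{
	uint16_t wire_callid = htons(0x1234);             /* call ID as it appears on the wire */
	uint32_t gre_key = htonl(ntohs(wire_callid));     /* value stored in the tuple's gre.key */
	uint16_t host_callid = (uint16_t)ntohl(gre_key);  /* recovered again for matching */

	printf("wire 0x%04x -> key 0x%08x -> host 0x%04x\n",
	       ntohs(wire_callid), ntohl(gre_key), host_callid);
	return 0;
}
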
- * - * Documentation about PPTP can be found in RFC 2637 - * - * (C) 2000-2004 by Harald Welte - * - * Development of this code funded by Astaro AG (http://www.astaro.com/) - * - */ - -#include -#include -#include -#include -#include -#include -#include -#include - -#include - -DECLARE_RWLOCK(ip_ct_gre_lock); -#define ASSERT_READ_LOCK(x) MUST_BE_READ_LOCKED(&ip_ct_gre_lock) -#define ASSERT_WRITE_LOCK(x) MUST_BE_WRITE_LOCKED(&ip_ct_gre_lock) - -#include -#include -#include -#include - -#include -#include - -MODULE_LICENSE("GPL"); -MODULE_AUTHOR("Harald Welte "); -MODULE_DESCRIPTION("netfilter connection tracking protocol helper for GRE"); - -/* shamelessly stolen from ip_conntrack_proto_udp.c */ -#define GRE_TIMEOUT (30*HZ) -#define GRE_STREAM_TIMEOUT (180*HZ) - -#if 0 -#define DEBUGP(format, args...) printk(KERN_DEBUG "%s:%s: " format, __FILE__, __FUNCTION__, ## args) -#define DUMP_TUPLE_GRE(x) printk("%u.%u.%u.%u:0x%x -> %u.%u.%u.%u:0x%x\n", \ - NIPQUAD((x)->src.ip), ntohl((x)->src.u.gre.key), \ - NIPQUAD((x)->dst.ip), ntohl((x)->dst.u.gre.key)) -#else -#define DEBUGP(x, args...) -#define DUMP_TUPLE_GRE(x) -#endif - -/* GRE KEYMAP HANDLING FUNCTIONS */ -static LIST_HEAD(gre_keymap_list); - -static inline int gre_key_cmpfn(const struct ip_ct_gre_keymap *km, - const struct ip_conntrack_tuple *t) -{ - return ((km->tuple.src.ip == t->src.ip) && - (km->tuple.dst.ip == t->dst.ip) && - (km->tuple.dst.protonum == t->dst.protonum) && - (km->tuple.dst.u.all == t->dst.u.all)); -} - -/* look up the source key for a given tuple */ -static u_int32_t gre_keymap_lookup(struct ip_conntrack_tuple *t) -{ - struct ip_ct_gre_keymap *km; - u_int32_t key; - - READ_LOCK(&ip_ct_gre_lock); - km = LIST_FIND(&gre_keymap_list, gre_key_cmpfn, - struct ip_ct_gre_keymap *, t); - if (!km) { - READ_UNLOCK(&ip_ct_gre_lock); - return 0; - } - - key = km->tuple.src.u.gre.key; - READ_UNLOCK(&ip_ct_gre_lock); - - return key; -} - -/* add a single keymap entry, associate with specified expect */ -int ip_ct_gre_keymap_add(struct ip_conntrack_expect *exp, - struct ip_conntrack_tuple *t, int reply) -{ - struct ip_ct_gre_keymap *km; - - km = kmalloc(sizeof(*km), GFP_ATOMIC); - if (!km) - return -1; - - /* initializing list head should be sufficient */ - memset(km, 0, sizeof(*km)); - - memcpy(&km->tuple, t, sizeof(*t)); - - if (!reply) - exp->proto.gre.keymap_orig = km; - else - exp->proto.gre.keymap_reply = km; - - DEBUGP("adding new entry %p: ", km); - DUMP_TUPLE_GRE(&km->tuple); - - WRITE_LOCK(&ip_ct_gre_lock); - list_append(&gre_keymap_list, km); - WRITE_UNLOCK(&ip_ct_gre_lock); - - return 0; -} - -/* change the tuple of a keymap entry (used by nat helper) */ -void ip_ct_gre_keymap_change(struct ip_ct_gre_keymap *km, - struct ip_conntrack_tuple *t) -{ - if (!km) - { - printk(KERN_WARNING - "NULL GRE conntrack keymap change requested\n"); - return; - } - - DEBUGP("changing entry %p to: ", km); - DUMP_TUPLE_GRE(t); - - WRITE_LOCK(&ip_ct_gre_lock); - memcpy(&km->tuple, t, sizeof(km->tuple)); - WRITE_UNLOCK(&ip_ct_gre_lock); -} - -/* destroy the keymap entries associated with specified expect */ -void ip_ct_gre_keymap_destroy(struct ip_conntrack_expect *exp) -{ - DEBUGP("entering for exp %p\n", exp); - WRITE_LOCK(&ip_ct_gre_lock); - if (exp->proto.gre.keymap_orig) { - DEBUGP("removing %p from list\n", exp->proto.gre.keymap_orig); - list_del(&exp->proto.gre.keymap_orig->list); - kfree(exp->proto.gre.keymap_orig); - exp->proto.gre.keymap_orig = NULL; - } - if (exp->proto.gre.keymap_reply) { - DEBUGP("removing %p from list\n", 
exp->proto.gre.keymap_reply); - list_del(&exp->proto.gre.keymap_reply->list); - kfree(exp->proto.gre.keymap_reply); - exp->proto.gre.keymap_reply = NULL; - } - WRITE_UNLOCK(&ip_ct_gre_lock); -} - - -/* PUBLIC CONNTRACK PROTO HELPER FUNCTIONS */ - -/* invert gre part of tuple */ -static int gre_invert_tuple(struct ip_conntrack_tuple *tuple, - const struct ip_conntrack_tuple *orig) -{ - tuple->dst.u.gre.key = orig->src.u.gre.key; - tuple->src.u.gre.key = orig->dst.u.gre.key; - - return 1; -} - -/* gre hdr info to tuple */ -static int gre_pkt_to_tuple(const struct sk_buff *skb, - unsigned int dataoff, - struct ip_conntrack_tuple *tuple) -{ - struct gre_hdr _grehdr, *grehdr; - struct gre_hdr_pptp _pgrehdr, *pgrehdr; - u_int32_t srckey; - - grehdr = skb_header_pointer(skb, dataoff, sizeof(_grehdr), &_grehdr); - /* PPTP header is variable length, only need up to the call_id field */ - pgrehdr = skb_header_pointer(skb, dataoff, 8, &_pgrehdr); - - if (!grehdr || !pgrehdr) - return 0; - - switch (grehdr->version) { - case GRE_VERSION_1701: - if (!grehdr->key) { - DEBUGP("Can't track GRE without key\n"); - return 0; - } - tuple->dst.u.gre.key = *(gre_key(grehdr)); - break; - - case GRE_VERSION_PPTP: - if (ntohs(grehdr->protocol) != GRE_PROTOCOL_PPTP) { - DEBUGP("GRE_VERSION_PPTP but unknown proto\n"); - return 0; - } - tuple->dst.u.gre.key = htonl(ntohs(pgrehdr->call_id)); - break; - - default: - printk(KERN_WARNING "unknown GRE version %hu\n", - grehdr->version); - return 0; - } - - srckey = gre_keymap_lookup(tuple); - - tuple->src.u.gre.key = srckey; -#if 0 - DEBUGP("found src key %x for tuple ", ntohl(srckey)); - DUMP_TUPLE_GRE(tuple); -#endif - - return 1; -} - -/* print gre part of tuple */ -static unsigned int gre_print_tuple(char *buffer, - const struct ip_conntrack_tuple *tuple) -{ - return sprintf(buffer, "srckey=0x%x dstkey=0x%x ", - ntohl(tuple->src.u.gre.key), - ntohl(tuple->dst.u.gre.key)); -} - -/* print private data for conntrack */ -static unsigned int gre_print_conntrack(char *buffer, - const struct ip_conntrack *ct) -{ - return sprintf(buffer, "timeout=%u, stream_timeout=%u ", - (ct->proto.gre.timeout / HZ), - (ct->proto.gre.stream_timeout / HZ)); -} - -/* Returns verdict for packet, and may modify conntrack */ -static int gre_packet(struct ip_conntrack *ct, - const struct sk_buff *skb, - enum ip_conntrack_info conntrackinfo) -{ - /* If we've seen traffic both ways, this is a GRE connection. - * Extend timeout. */ - if (ct->status & IPS_SEEN_REPLY) { - ip_ct_refresh_acct(ct, conntrackinfo, skb, - ct->proto.gre.stream_timeout); - /* Also, more likely to be important, and not a probe. */ - set_bit(IPS_ASSURED_BIT, &ct->status); - } else - ip_ct_refresh_acct(ct, conntrackinfo, skb, - ct->proto.gre.timeout); - - return NF_ACCEPT; -} - -/* Called when a new connection for this protocol found. */ -static int gre_new(struct ip_conntrack *ct, - const struct sk_buff *skb) -{ - DEBUGP(": "); - DUMP_TUPLE_GRE(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple); - - /* initialize to sane value. Ideally a conntrack helper - * (e.g. 
in case of pptp) is increasing them */ - ct->proto.gre.stream_timeout = GRE_STREAM_TIMEOUT; - ct->proto.gre.timeout = GRE_TIMEOUT; - - return 1; -} - -/* Called when a conntrack entry has already been removed from the hashes - * and is about to be deleted from memory */ -static void gre_destroy(struct ip_conntrack *ct) -{ - struct ip_conntrack_expect *master = ct->master; - - DEBUGP(" entering\n"); - - if (!master) { - DEBUGP("no master exp for ct %p\n", ct); - return; - } - - ip_ct_gre_keymap_destroy(master); -} - -/* protocol helper struct */ -static struct ip_conntrack_protocol gre = { - .proto = IPPROTO_GRE, - .name = "gre", - .pkt_to_tuple = gre_pkt_to_tuple, - .invert_tuple = gre_invert_tuple, - .print_tuple = gre_print_tuple, - .print_conntrack = gre_print_conntrack, - .packet = gre_packet, - .new = gre_new, - .destroy = gre_destroy, - .exp_matches_pkt = NULL, - .me = THIS_MODULE -}; - -/* ip_conntrack_proto_gre initialization */ -static int __init init(void) -{ - int retcode; - - if ((retcode = ip_conntrack_protocol_register(&gre))) { - printk(KERN_ERR "Unable to register conntrack protocol " - "helper for gre: %d\n", retcode); - return -EIO; - } - - return 0; -} - -static void __exit fini(void) -{ - struct list_head *pos, *n; - - /* delete all keymap entries */ - WRITE_LOCK(&ip_ct_gre_lock); - list_for_each_safe(pos, n, &gre_keymap_list) { - DEBUGP("deleting keymap %p at module unload time\n", pos); - list_del(pos); - kfree(pos); - } - WRITE_UNLOCK(&ip_ct_gre_lock); - - ip_conntrack_protocol_unregister(&gre); -} - -EXPORT_SYMBOL(ip_ct_gre_keymap_add); -EXPORT_SYMBOL(ip_ct_gre_keymap_change); -EXPORT_SYMBOL(ip_ct_gre_keymap_destroy); - -module_init(init); -module_exit(fini); diff --git a/net/ipv4/netfilter/ip_conntrack_standalone.c b/net/ipv4/netfilter/ip_conntrack_standalone.c index fd688f4fe..0c935eddf 100644 --- a/net/ipv4/netfilter/ip_conntrack_standalone.c +++ b/net/ipv4/netfilter/ip_conntrack_standalone.c @@ -102,17 +102,17 @@ print_conntrack(char *buffer, struct ip_conntrack *conntrack) len += print_tuple(buffer + len, &conntrack->tuplehash[IP_CT_DIR_ORIGINAL].tuple, proto); - len += sprintf(buffer + len, "xid=%d ", conntrack->xid[IP_CT_DIR_ORIGINAL]); if (!(test_bit(IPS_SEEN_REPLY_BIT, &conntrack->status))) len += sprintf(buffer + len, "[UNREPLIED] "); len += print_tuple(buffer + len, &conntrack->tuplehash[IP_CT_DIR_REPLY].tuple, proto); - len += sprintf(buffer + len, "xid=%d ", conntrack->xid[IP_CT_DIR_REPLY]); if (test_bit(IPS_ASSURED_BIT, &conntrack->status)) len += sprintf(buffer + len, "[ASSURED] "); len += sprintf(buffer + len, "use=%u ", atomic_read(&conntrack->ct_general.use)); + len += sprintf(buffer + len, "sxid=%d dxid=%d ", + conntrack->xid[IP_CT_DIR_ORIGINAL], conntrack->xid[IP_CT_DIR_REPLY]); len += sprintf(buffer + len, "\n"); return len; diff --git a/net/ipv4/netfilter/ip_nat_pptp.c b/net/ipv4/netfilter/ip_nat_pptp.c deleted file mode 100644 index 2bbb815e9..000000000 --- a/net/ipv4/netfilter/ip_nat_pptp.c +++ /dev/null @@ -1,477 +0,0 @@ -/* - * ip_nat_pptp.c - Version 2.0 - * - * NAT support for PPTP (Point to Point Tunneling Protocol). - * PPTP is a a protocol for creating virtual private networks. - * It is a specification defined by Microsoft and some vendors - * working with Microsoft. PPTP is built on top of a modified - * version of the Internet Generic Routing Encapsulation Protocol. - * GRE is defined in RFC 1701 and RFC 1702. 
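
The GRE conntrack protocol helper deleted above keeps a global keymap list; gre_keymap_lookup() returns the stored source key of the first entry whose source IP, destination IP, protocol number and destination key all match the tuple being resolved, or 0 when nothing matches. A minimal userspace sketch of that matching, with a plain array standing in for the kernel list and an illustrative struct layout that is not the kernel's.

/* Userspace sketch of the removed GRE keymap lookup: return the stored
 * source key for a tuple by matching src/dst IP, protocol and dst key.
 * The struct below is an illustration, not the kernel tuple layout. */
#include <stdio.h>
#include <stdint.h>

struct tuple {
	uint32_t src_ip, dst_ip;
	uint8_t  protonum;
	uint32_t dst_key;   /* dst.u.gre.key in the kernel tuple */
	uint32_t src_key;   /* src.u.gre.key in the kernel tuple */
};

static uint32_t keymap_lookup(const struct tuple *map, int n,
			      const struct tuple *t)
{
	for (int i = 0; i < n; i++) {
		if (map[i].src_ip == t->src_ip &&
		    map[i].dst_ip == t->dst_ip &&
		    map[i].protonum == t->protonum &&
		    map[i].dst_key == t->dst_key)
			return map[i].src_key;  /* found: reuse the stored source key */
	}
	return 0;  /* no mapping known for this tuple */
}

int main(void)
{
	struct tuple map[] = {
		{ .src_ip = 0x0a000001, .dst_ip = 0x0a000002,
		  .protonum = 47, .dst_key = 0x1234, .src_key = 0x5678 },
	};
	struct tuple probe = { .src_ip = 0x0a000001, .dst_ip = 0x0a000002,
			       .protonum = 47, .dst_key = 0x1234 };

	printf("src key = 0x%x\n", keymap_lookup(map, 1, &probe));
	return 0;
}
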
Documentation of - * PPTP can be found in RFC 2637 - * - * (C) 2000-2004 by Harald Welte - * - * Development of this code funded by Astaro AG (http://www.astaro.com/) - * - * TODO: - Support for multiple calls within one session - * (needs netfilter newnat code) - * - NAT to a unique tuple, not to TCP source port - * (needs netfilter tuple reservation) - * - * Changes: - * 2002-02-10 - Version 1.3 - * - Use ip_nat_mangle_tcp_packet() because of cloned skb's - * in local connections (Philip Craig ) - * - add checks for magicCookie and pptp version - * - make argument list of pptp_{out,in}bound_packet() shorter - * - move to C99 style initializers - * - print version number at module loadtime - * 2003-09-22 - Version 1.5 - * - use SNATed tcp sourceport as callid, since we get called before - * TCP header is mangled (Philip Craig ) - * 2004-10-22 - Version 2.0 - * - kernel 2.6.x version - * - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#define IP_NAT_PPTP_VERSION "2.0" - -MODULE_LICENSE("GPL"); -MODULE_AUTHOR("Harald Welte "); -MODULE_DESCRIPTION("Netfilter NAT helper module for PPTP"); - - -#if 0 -#include "ip_conntrack_pptp_priv.h" -#define DEBUGP(format, args...) printk(KERN_DEBUG __FILE__ ":" __FUNCTION__ \ - ": " format, ## args) -#else -#define DEBUGP(format, args...) -#endif - -static unsigned int -pptp_nat_expected(struct sk_buff **pskb, - unsigned int hooknum, - struct ip_conntrack *ct, - struct ip_nat_info *info) -{ - struct ip_conntrack *master = master_ct(ct); - struct ip_nat_multi_range mr; - struct ip_ct_pptp_master *ct_pptp_info; - struct ip_nat_pptp *nat_pptp_info; - u_int32_t newip, newcid; - int ret; - - IP_NF_ASSERT(info); - IP_NF_ASSERT(master); - IP_NF_ASSERT(!(info->initialized & (1 << HOOK2MANIP(hooknum)))); - - DEBUGP("we have a connection!\n"); - - LOCK_BH(&ip_pptp_lock); - ct_pptp_info = &master->help.ct_pptp_info; - nat_pptp_info = &master->nat.help.nat_pptp_info; - - /* need to alter GRE tuple because conntrack expectfn() used 'wrong' - * (unmanipulated) values */ - if (HOOK2MANIP(hooknum) == IP_NAT_MANIP_DST) { - DEBUGP("completing tuples with NAT info \n"); - /* we can do this, since we're unconfirmed */ - if (ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.u.gre.key == - htonl(ct_pptp_info->pac_call_id)) { - /* assume PNS->PAC */ - ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.u.gre.key = - htonl(nat_pptp_info->pns_call_id); - ct->tuplehash[IP_CT_DIR_REPLY].tuple.dst.u.gre.key = - htonl(nat_pptp_info->pns_call_id); - newip = master->tuplehash[IP_CT_DIR_REPLY].tuple.src.ip; - newcid = htonl(nat_pptp_info->pac_call_id); - } else { - /* assume PAC->PNS */ - ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.u.gre.key = - htonl(nat_pptp_info->pac_call_id); - ct->tuplehash[IP_CT_DIR_REPLY].tuple.dst.u.gre.key = - htonl(nat_pptp_info->pac_call_id); - newip = master->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.ip; - newcid = htonl(nat_pptp_info->pns_call_id); - } - } else { - if (ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.u.gre.key == - htonl(ct_pptp_info->pac_call_id)) { - /* assume PNS->PAC */ - newip = master->tuplehash[IP_CT_DIR_REPLY].tuple.dst.ip; - newcid = htonl(ct_pptp_info->pns_call_id); - } - else { - /* assume PAC->PNS */ - newip = master->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.ip; - newcid = htonl(ct_pptp_info->pac_call_id); - } - } - - mr.rangesize = 1; - mr.range[0].flags = IP_NAT_RANGE_MAP_IPS | IP_NAT_RANGE_PROTO_SPECIFIED; - mr.range[0].min_ip = mr.range[0].max_ip = newip; - 
mr.range[0].min = mr.range[0].max = - ((union ip_conntrack_manip_proto ) { newcid }); - DEBUGP("change ip to %u.%u.%u.%u\n", - NIPQUAD(newip)); - DEBUGP("change key to 0x%x\n", ntohl(newcid)); - ret = ip_nat_setup_info(ct, &mr, hooknum); - - UNLOCK_BH(&ip_pptp_lock); - - return ret; - -} - -/* outbound packets == from PNS to PAC */ -static inline unsigned int -pptp_outbound_pkt(struct sk_buff **pskb, - struct ip_conntrack *ct, - enum ip_conntrack_info ctinfo, - struct ip_conntrack_expect *exp) - -{ - struct iphdr *iph = (*pskb)->nh.iph; - struct tcphdr *tcph = (void *) iph + iph->ihl*4; - struct pptp_pkt_hdr *pptph = (struct pptp_pkt_hdr *) - ((void *)tcph + tcph->doff*4); - - struct PptpControlHeader *ctlh; - union pptp_ctrl_union *pptpReq; - struct ip_ct_pptp_master *ct_pptp_info = &ct->help.ct_pptp_info; - struct ip_nat_pptp *nat_pptp_info = &ct->nat.help.nat_pptp_info; - - u_int16_t msg, *cid = NULL, new_callid; - - /* FIXME: size checks !!! */ - ctlh = (struct PptpControlHeader *) ((void *) pptph + sizeof(*pptph)); - pptpReq = (void *) ((void *) ctlh + sizeof(*ctlh)); - - new_callid = htons(ct_pptp_info->pns_call_id); - - switch (msg = ntohs(ctlh->messageType)) { - case PPTP_OUT_CALL_REQUEST: - cid = &pptpReq->ocreq.callID; - /* FIXME: ideally we would want to reserve a call ID - * here. current netfilter NAT core is not able to do - * this :( For now we use TCP source port. This breaks - * multiple calls within one control session */ - - /* save original call ID in nat_info */ - nat_pptp_info->pns_call_id = ct_pptp_info->pns_call_id; - - /* don't use tcph->source since we are at a DSTmanip - * hook (e.g. PREROUTING) and pkt is not mangled yet */ - new_callid = ct->tuplehash[IP_CT_DIR_REPLY].tuple.dst.u.tcp.port; - - /* save new call ID in ct info */ - ct_pptp_info->pns_call_id = ntohs(new_callid); - break; - case PPTP_IN_CALL_REPLY: - cid = &pptpReq->icreq.callID; - break; - case PPTP_CALL_CLEAR_REQUEST: - cid = &pptpReq->clrreq.callID; - break; - default: - DEBUGP("unknown outbound packet 0x%04x:%s\n", msg, - (msg <= PPTP_MSG_MAX)? 
strMName[msg]:strMName[0]); - /* fall through */ - - case PPTP_SET_LINK_INFO: - /* only need to NAT in case PAC is behind NAT box */ - case PPTP_START_SESSION_REQUEST: - case PPTP_START_SESSION_REPLY: - case PPTP_STOP_SESSION_REQUEST: - case PPTP_STOP_SESSION_REPLY: - case PPTP_ECHO_REQUEST: - case PPTP_ECHO_REPLY: - /* no need to alter packet */ - return NF_ACCEPT; - } - - IP_NF_ASSERT(cid); - - DEBUGP("altering call id from 0x%04x to 0x%04x\n", - ntohs(*cid), ntohs(new_callid)); - - /* mangle packet */ - ip_nat_mangle_tcp_packet(pskb, ct, ctinfo, (void *)cid - (void *)pptph, - sizeof(new_callid), (char *)&new_callid, - sizeof(new_callid)); - - return NF_ACCEPT; -} - -/* inbound packets == from PAC to PNS */ -static inline unsigned int -pptp_inbound_pkt(struct sk_buff **pskb, - struct ip_conntrack *ct, - enum ip_conntrack_info ctinfo, - struct ip_conntrack_expect *oldexp) -{ - struct iphdr *iph = (*pskb)->nh.iph; - struct tcphdr *tcph = (void *) iph + iph->ihl*4; - struct pptp_pkt_hdr *pptph = (struct pptp_pkt_hdr *) - ((void *)tcph + tcph->doff*4); - - struct PptpControlHeader *ctlh; - union pptp_ctrl_union *pptpReq; - struct ip_ct_pptp_master *ct_pptp_info = &ct->help.ct_pptp_info; - struct ip_nat_pptp *nat_pptp_info = &ct->nat.help.nat_pptp_info; - - u_int16_t msg, new_cid = 0, new_pcid, *pcid = NULL, *cid = NULL; - u_int32_t old_dst_ip; - - struct ip_conntrack_tuple t, inv_t; - struct ip_conntrack_tuple *orig_t, *reply_t; - - /* FIXME: size checks !!! */ - ctlh = (struct PptpControlHeader *) ((void *) pptph + sizeof(*pptph)); - pptpReq = (void *) ((void *) ctlh + sizeof(*ctlh)); - - new_pcid = htons(nat_pptp_info->pns_call_id); - - switch (msg = ntohs(ctlh->messageType)) { - case PPTP_OUT_CALL_REPLY: - pcid = &pptpReq->ocack.peersCallID; - cid = &pptpReq->ocack.callID; - if (!oldexp) { - DEBUGP("outcall but no expectation\n"); - break; - } - old_dst_ip = oldexp->tuple.dst.ip; - t = oldexp->tuple; - invert_tuplepr(&inv_t, &t); - - /* save original PAC call ID in nat_info */ - nat_pptp_info->pac_call_id = ct_pptp_info->pac_call_id; - - /* alter expectation */ - orig_t = &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple; - reply_t = &ct->tuplehash[IP_CT_DIR_REPLY].tuple; - if (t.src.ip == orig_t->src.ip && t.dst.ip == orig_t->dst.ip) { - /* expectation for PNS->PAC direction */ - t.src.u.gre.key = htonl(nat_pptp_info->pns_call_id); - t.dst.u.gre.key = htonl(ct_pptp_info->pac_call_id); - inv_t.src.ip = reply_t->src.ip; - inv_t.dst.ip = reply_t->dst.ip; - inv_t.src.u.gre.key = htonl(nat_pptp_info->pac_call_id); - inv_t.dst.u.gre.key = htonl(ct_pptp_info->pns_call_id); - } else { - /* expectation for PAC->PNS direction */ - t.src.u.gre.key = htonl(nat_pptp_info->pac_call_id); - t.dst.u.gre.key = htonl(ct_pptp_info->pns_call_id); - inv_t.src.ip = orig_t->src.ip; - inv_t.dst.ip = orig_t->dst.ip; - inv_t.src.u.gre.key = htonl(nat_pptp_info->pns_call_id); - inv_t.dst.u.gre.key = htonl(ct_pptp_info->pac_call_id); - } - - if (!ip_conntrack_change_expect(oldexp, &t)) { - DEBUGP("successfully changed expect\n"); - } else { - DEBUGP("can't change expect\n"); - } - ip_ct_gre_keymap_change(oldexp->proto.gre.keymap_orig, &t); - ip_ct_gre_keymap_change(oldexp->proto.gre.keymap_reply, &inv_t); - break; - case PPTP_IN_CALL_CONNECT: - pcid = &pptpReq->iccon.peersCallID; - if (!oldexp) - break; - old_dst_ip = oldexp->tuple.dst.ip; - t = oldexp->tuple; - - /* alter expectation, no need for callID */ - if (t.dst.ip == ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.ip) { - /* expectation for PNS->PAC direction */ - 
t.src.ip = ct->tuplehash[IP_CT_DIR_REPLY].tuple.dst.ip; - } else { - /* expectation for PAC->PNS direction */ - t.dst.ip = ct->tuplehash[IP_CT_DIR_REPLY].tuple.dst.ip; - } - - if (!ip_conntrack_change_expect(oldexp, &t)) { - DEBUGP("successfully changed expect\n"); - } else { - DEBUGP("can't change expect\n"); - } - break; - case PPTP_IN_CALL_REQUEST: - /* only need to nat in case PAC is behind NAT box */ - break; - case PPTP_WAN_ERROR_NOTIFY: - pcid = &pptpReq->wanerr.peersCallID; - break; - case PPTP_CALL_DISCONNECT_NOTIFY: - pcid = &pptpReq->disc.callID; - break; - - default: - DEBUGP("unknown inbound packet %s\n", - (msg <= PPTP_MSG_MAX)? strMName[msg]:strMName[0]); - /* fall through */ - - case PPTP_START_SESSION_REQUEST: - case PPTP_START_SESSION_REPLY: - case PPTP_STOP_SESSION_REQUEST: - case PPTP_STOP_SESSION_REPLY: - case PPTP_ECHO_REQUEST: - case PPTP_ECHO_REPLY: - /* no need to alter packet */ - return NF_ACCEPT; - } - - /* mangle packet */ - IP_NF_ASSERT(pcid); - DEBUGP("altering peer call id from 0x%04x to 0x%04x\n", - ntohs(*pcid), ntohs(new_pcid)); - ip_nat_mangle_tcp_packet(pskb, ct, ctinfo, (void *)pcid - (void *)pptph, - sizeof(new_pcid), (char *)&new_pcid, - sizeof(new_pcid)); - - if (new_cid) { - IP_NF_ASSERT(cid); - DEBUGP("altering call id from 0x%04x to 0x%04x\n", - ntohs(*cid), ntohs(new_cid)); - ip_nat_mangle_tcp_packet(pskb, ct, ctinfo, - (void *)cid - (void *)pptph, - sizeof(new_cid), (char *)&new_cid, - sizeof(new_cid)); - } - - /* great, at least we don't need to resize packets */ - return NF_ACCEPT; -} - - -static unsigned int tcp_help(struct ip_conntrack *ct, - struct ip_conntrack_expect *exp, - struct ip_nat_info *info, - enum ip_conntrack_info ctinfo, - unsigned int hooknum, struct sk_buff **pskb) -{ - struct iphdr *iph = (*pskb)->nh.iph; - struct tcphdr *tcph = (void *) iph + iph->ihl*4; - unsigned int datalen = (*pskb)->len - iph->ihl*4 - tcph->doff*4; - struct pptp_pkt_hdr *pptph; - - int dir; - - DEBUGP("entering\n"); - - /* Only mangle things once: DST for original direction - and SRC for reply direction. */ - dir = CTINFO2DIR(ctinfo); - if (!((HOOK2MANIP(hooknum) == IP_NAT_MANIP_SRC - && dir == IP_CT_DIR_ORIGINAL) - || (HOOK2MANIP(hooknum) == IP_NAT_MANIP_DST - && dir == IP_CT_DIR_REPLY))) { - DEBUGP("Not touching dir %s at hook %s\n", - dir == IP_CT_DIR_ORIGINAL ? "ORIG" : "REPLY", - hooknum == NF_IP_POST_ROUTING ? "POSTROUTING" - : hooknum == NF_IP_PRE_ROUTING ? "PREROUTING" - : hooknum == NF_IP_LOCAL_OUT ? "OUTPUT" - : hooknum == NF_IP_LOCAL_IN ? 
"INPUT" : "???"); - return NF_ACCEPT; - } - - /* if packet is too small, just skip it */ - if (datalen < sizeof(struct pptp_pkt_hdr)+ - sizeof(struct PptpControlHeader)) { - DEBUGP("pptp packet too short\n"); - return NF_ACCEPT; - } - - pptph = (struct pptp_pkt_hdr *) ((void *)tcph + tcph->doff*4); - - /* if it's not a control message, we can't handle it */ - if (ntohs(pptph->packetType) != PPTP_PACKET_CONTROL || - ntohl(pptph->magicCookie) != PPTP_MAGIC_COOKIE) { - DEBUGP("not a pptp control packet\n"); - return NF_ACCEPT; - } - - LOCK_BH(&ip_pptp_lock); - - if (dir == IP_CT_DIR_ORIGINAL) { - /* reuqests sent by client to server (PNS->PAC) */ - pptp_outbound_pkt(pskb, ct, ctinfo, exp); - } else { - /* response from the server to the client (PAC->PNS) */ - pptp_inbound_pkt(pskb, ct, ctinfo, exp); - } - - UNLOCK_BH(&ip_pptp_lock); - - return NF_ACCEPT; -} - -/* nat helper struct for control connection */ -static struct ip_nat_helper pptp_tcp_helper = { - .list = { NULL, NULL }, - .name = "pptp", - .flags = IP_NAT_HELPER_F_ALWAYS, - .me = THIS_MODULE, - .tuple = { .src = { .ip = 0, - .u = { .tcp = { .port = - __constant_htons(PPTP_CONTROL_PORT) } - } - }, - .dst = { .ip = 0, - .u = { .all = 0 }, - .protonum = IPPROTO_TCP - } - }, - - .mask = { .src = { .ip = 0, - .u = { .tcp = { .port = 0xFFFF } } - }, - .dst = { .ip = 0, - .u = { .all = 0 }, - .protonum = 0xFFFF - } - }, - .help = tcp_help, - .expect = pptp_nat_expected -}; - - -static int __init init(void) -{ - DEBUGP("%s: registering NAT helper\n", __FILE__); - if (ip_nat_helper_register(&pptp_tcp_helper)) { - printk(KERN_ERR "Unable to register NAT application helper " - "for pptp\n"); - return -EIO; - } - - printk("ip_nat_pptp version %s loaded\n", IP_NAT_PPTP_VERSION); - return 0; -} - -static void __exit fini(void) -{ - DEBUGP("cleanup_module\n" ); - ip_nat_helper_unregister(&pptp_tcp_helper); - printk("ip_nat_pptp version %s unloaded\n", IP_NAT_PPTP_VERSION); -} - -module_init(init); -module_exit(fini); diff --git a/net/ipv4/netfilter/ip_nat_proto_gre.c b/net/ipv4/netfilter/ip_nat_proto_gre.c deleted file mode 100644 index 5691a102a..000000000 --- a/net/ipv4/netfilter/ip_nat_proto_gre.c +++ /dev/null @@ -1,210 +0,0 @@ -/* - * ip_nat_proto_gre.c - Version 2.0 - * - * NAT protocol helper module for GRE. - * - * GRE is a generic encapsulation protocol, which is generally not very - * suited for NAT, as it has no protocol-specific part as port numbers. - * - * It has an optional key field, which may help us distinguishing two - * connections between the same two hosts. - * - * GRE is defined in RFC 1701 and RFC 1702, as well as RFC 2784 - * - * PPTP is built on top of a modified version of GRE, and has a mandatory - * field called "CallID", which serves us for the same purpose as the key - * field in plain GRE. - * - * Documentation about PPTP can be found in RFC 2637 - * - * (C) 2000-2004 by Harald Welte - * - * Development of this code funded by Astaro AG (http://www.astaro.com/) - * - */ - -#include -#include -#include -#include -#include -#include -#include - -MODULE_LICENSE("GPL"); -MODULE_AUTHOR("Harald Welte "); -MODULE_DESCRIPTION("Netfilter NAT protocol helper module for GRE"); - -#if 0 -#define DEBUGP(format, args...) printk(KERN_DEBUG __FILE__ ":" __FUNCTION__ \ - ": " format, ## args) -#else -#define DEBUGP(x, args...) 
-#endif - -/* is key in given range between min and max */ -static int -gre_in_range(const struct ip_conntrack_tuple *tuple, - enum ip_nat_manip_type maniptype, - const union ip_conntrack_manip_proto *min, - const union ip_conntrack_manip_proto *max) -{ - u_int32_t key; - - if (maniptype == IP_NAT_MANIP_SRC) - key = tuple->src.u.gre.key; - else - key = tuple->dst.u.gre.key; - - return ntohl(key) >= ntohl(min->gre.key) - && ntohl(key) <= ntohl(max->gre.key); -} - -/* generate unique tuple ... */ -static int -gre_unique_tuple(struct ip_conntrack_tuple *tuple, - const struct ip_nat_range *range, - enum ip_nat_manip_type maniptype, - const struct ip_conntrack *conntrack) -{ - u_int32_t min, i, range_size; - u_int32_t key = 0, *keyptr; - - if (maniptype == IP_NAT_MANIP_SRC) - keyptr = &tuple->src.u.gre.key; - else - keyptr = &tuple->dst.u.gre.key; - - if (!(range->flags & IP_NAT_RANGE_PROTO_SPECIFIED)) { - DEBUGP("%p: NATing GRE PPTP\n", conntrack); - min = 1; - range_size = 0xffff; - } else { - min = ntohl(range->min.gre.key); - range_size = ntohl(range->max.gre.key) - min + 1; - } - - DEBUGP("min = %u, range_size = %u\n", min, range_size); - - for (i = 0; i < range_size; i++, key++) { - *keyptr = htonl(min + key % range_size); - if (!ip_nat_used_tuple(tuple, conntrack)) - return 1; - } - - DEBUGP("%p: no NAT mapping\n", conntrack); - - return 0; -} - -/* manipulate a GRE packet according to maniptype */ -static int -gre_manip_pkt(struct sk_buff **pskb, - unsigned int hdroff, - const struct ip_conntrack_manip *manip, - enum ip_nat_manip_type maniptype) -{ - struct gre_hdr *greh; - struct gre_hdr_pptp *pgreh; - - if (!skb_ip_make_writable(pskb, hdroff + sizeof(*pgreh))) - return 0; - - greh = (void *)(*pskb)->data + hdroff; - pgreh = (struct gre_hdr_pptp *) greh; - - /* we only have destination manip of a packet, since 'source key' - * is not present in the packet itself */ - if (maniptype == IP_NAT_MANIP_DST) { - /* key manipulation is always dest */ - switch (greh->version) { - case 0: - if (!greh->key) { - DEBUGP("can't nat GRE w/o key\n"); - break; - } - if (greh->csum) { - /* FIXME: Never tested this code... 
*/ - *(gre_csum(greh)) = - ip_nat_cheat_check(~*(gre_key(greh)), - manip->u.gre.key, - *(gre_csum(greh))); - } - *(gre_key(greh)) = manip->u.gre.key; - break; - case GRE_VERSION_PPTP: - DEBUGP("call_id -> 0x%04x\n", - ntohl(manip->u.gre.key)); - pgreh->call_id = htons(ntohl(manip->u.gre.key)); - break; - default: - DEBUGP("can't nat unknown GRE version\n"); - return 0; - break; - } - } - return 1; -} - -/* print out a nat tuple */ -static unsigned int -gre_print(char *buffer, - const struct ip_conntrack_tuple *match, - const struct ip_conntrack_tuple *mask) -{ - unsigned int len = 0; - - if (mask->src.u.gre.key) - len += sprintf(buffer + len, "srckey=0x%x ", - ntohl(match->src.u.gre.key)); - - if (mask->dst.u.gre.key) - len += sprintf(buffer + len, "dstkey=0x%x ", - ntohl(match->src.u.gre.key)); - - return len; -} - -/* print a range of keys */ -static unsigned int -gre_print_range(char *buffer, const struct ip_nat_range *range) -{ - if (range->min.gre.key != 0 - || range->max.gre.key != 0xFFFF) { - if (range->min.gre.key == range->max.gre.key) - return sprintf(buffer, "key 0x%x ", - ntohl(range->min.gre.key)); - else - return sprintf(buffer, "keys 0x%u-0x%u ", - ntohl(range->min.gre.key), - ntohl(range->max.gre.key)); - } else - return 0; -} - -/* nat helper struct */ -static struct ip_nat_protocol gre = { - .name = "GRE", - .protonum = IPPROTO_GRE, - .manip_pkt = gre_manip_pkt, - .in_range = gre_in_range, - .unique_tuple = gre_unique_tuple, - .print = gre_print, - .print_range = gre_print_range -}; - -static int __init init(void) -{ - if (ip_nat_protocol_register(&gre)) - return -EIO; - - return 0; -} - -static void __exit fini(void) -{ - ip_nat_protocol_unregister(&gre); -} - -module_init(init); -module_exit(fini); diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 70945b48a..7bbe1cb55 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -1822,7 +1822,7 @@ process: * packet. 
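
The gre_unique_tuple() routine in the NAT protocol helper deleted above picks a free key by walking the allowed range (falling back to 1..0xffff when no range was specified) and probing each candidate, wrapping modulo the range size, until one is unused. A standalone sketch of that search loop; the key_in_use() predicate below is only a stand-in for the kernel's ip_nat_used_tuple().

/* Sketch of the key search in the removed gre_unique_tuple(): walk the
 * candidate range until a key is found that is not already in use. */
#include <stdio.h>
#include <stdint.h>

static int key_in_use(uint32_t key)
{
	return key < 5;  /* pretend the first few keys are already taken */
}

static int pick_unique_key(uint32_t min, uint32_t range_size, uint32_t *out)
{
	uint32_t key = 0;

	for (uint32_t i = 0; i < range_size; i++, key++) {
		uint32_t candidate = min + key % range_size;
		if (!key_in_use(candidate)) {
			*out = candidate;  /* free mapping found */
			return 1;
		}
	}
	return 0;  /* range exhausted, no NAT mapping possible */
}

int main(void)
{
	uint32_t key;

	/* no explicit range given: the helper falls back to 1..0xffff */
	if (pick_unique_key(1, 0xffff, &key))
		printf("picked key 0x%x\n", key);
	return 0;
}
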
*/ if (inet_stream_ops.bind != inet_bind && - (int) sk->sk_xid > 0 && sk->sk_xid != skb->xid) + (int) sk->sk_xid >= 0 && sk->sk_xid != skb->xid) goto discard_it; if (sk->sk_state == TCP_TIME_WAIT) diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c index 844a087b0..5edc92cf8 100644 --- a/net/packet/af_packet.c +++ b/net/packet/af_packet.c @@ -451,7 +451,7 @@ static int packet_rcv(struct sk_buff *skb, struct net_device *dev, struct packe sk = pt->af_packet_priv; po = pkt_sk(sk); - if ((int) sk->sk_xid > 0 && sk->sk_xid != skb->xid) + if (sk->sk_xid && sk->sk_xid != skb->xid) goto drop; skb->dev = dev; diff --git a/scripts/kernel-2.6-planetlab.spec b/scripts/kernel-2.6-planetlab.spec index 4e2be569b..81aec8077 100644 --- a/scripts/kernel-2.6-planetlab.spec +++ b/scripts/kernel-2.6-planetlab.spec @@ -22,7 +22,7 @@ Summary: The Linux kernel (the core of the Linux operating system) %define kversion 2.6.%{sublevel} %define rpmversion 2.6.%{sublevel} %define rhbsys %([ -r /etc/beehive-root ] && echo || echo .`whoami`) -%define release 1.521.2.6.planetlab%{?date:.%{date}} +%define release 1.planetlab%{?date:.%{date}} %define signmodules 0 %define KVERREL %{PACKAGE_VERSION}-%{PACKAGE_RELEASE} @@ -62,11 +62,6 @@ Summary: The Linux kernel (the core of the Linux operating system) # %define kernel_prereq fileutils, module-init-tools, initscripts >= 5.83, mkinitrd >= 3.5.5 -Vendor: PlanetLab -Packager: PlanetLab Central -Distribution: PlanetLab 3.0 -URL: http://cvs.planet-lab.org/cvs/linux-2.6 - Name: kernel Group: System Environment/Kernel License: GPLv2 @@ -178,19 +173,6 @@ Group: System Environment/Kernel %description uml This package includes a user mode version of the Linux kernel. -%package vserver -Summary: A placeholder RPM that provides kernel and kernel-drm - -Group: System Environment/Kernel -Provides: kernel = %{version} -Provides: kernel-drm = 4.3.0 - -%description vserver -VServers do not require and cannot use kernels, but some RPMs have -implicit or explicit dependencies on the "kernel" package -(e.g. tcpdump). This package installs no files but provides the -necessary dependencies to make rpm and yum happy. 
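
The two hunks above change the per-socket xid guard: the TCP receive path now runs the mismatch check whenever (int)sk->sk_xid is non-negative rather than strictly positive, and packet_rcv() now uses a plain non-zero test instead of the signed comparison. A tiny userspace sketch that simply evaluates the old and new predicates side by side for a few sample xid values (the values are illustrative).

/* Sketch comparing the old and new xid guards changed above in the TCP
 * receive path and in packet_rcv().  Sample xid values are arbitrary. */
#include <stdio.h>
#include <stdint.h>

int main(void)
{
	uint32_t samples[] = { 0, 1, 100, 0xffffffffu };
	uint32_t skb_xid = 7;  /* xid carried by the incoming skb */

	for (unsigned i = 0; i < sizeof(samples) / sizeof(samples[0]); i++) {
		uint32_t sk_xid = samples[i];

		int tcp_old = (int)sk_xid > 0  && sk_xid != skb_xid;
		int tcp_new = (int)sk_xid >= 0 && sk_xid != skb_xid;
		int pkt_old = (int)sk_xid > 0  && sk_xid != skb_xid;
		int pkt_new = sk_xid           && sk_xid != skb_xid;

		printf("sk_xid=%10u  tcp: old=%d new=%d  packet: old=%d new=%d\n",
		       sk_xid, tcp_old, tcp_new, pkt_old, pkt_new);
	}
	return 0;
}

For an xid whose int conversion is negative, both old guards and the new TCP guard skip the check; only the new packet_rcv() guard treats it like any other non-zero xid. For xid 0, only the new TCP guard performs the comparison at all.
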
- %prep %setup -n linux-%{kversion} @@ -258,7 +240,7 @@ BuildKernel() { grep "__crc_$i\$" System.map >> $RPM_BUILD_ROOT/boot/System.map-$KernelVer ||: done rm -f exported -# install -m 644 init/kerntypes.o $RPM_BUILD_ROOT/boot/Kerntypes-$KernelVer + install -m 644 init/kerntypes.o $RPM_BUILD_ROOT/boot/Kerntypes-$KernelVer install -m 644 .config $RPM_BUILD_ROOT/boot/config-$KernelVer rm -f System.map cp arch/*/boot/bzImage $RPM_BUILD_ROOT/%{image_install_path}/vmlinuz-$KernelVer @@ -429,7 +411,7 @@ fi # make some useful links pushd /boot > /dev/null ; { ln -sf System.map-%{KVERREL} System.map -# ln -sf Kerntypes-%{KVERREL} Kerntypes + ln -sf Kerntypes-%{KVERREL} Kerntypes ln -sf config-%{KVERREL} config ln -sf initrd-%{KVERREL}.img initrd-boot ln -sf vmlinuz-%{KVERREL} kernel-boot @@ -468,7 +450,7 @@ fi %files %defattr(-,root,root) /%{image_install_path}/vmlinuz-%{KVERREL} -#/boot/Kerntypes-%{KVERREL} +/boot/Kerntypes-%{KVERREL} /boot/System.map-%{KVERREL} /boot/config-%{KVERREL} %dir /lib/modules/%{KVERREL} @@ -481,7 +463,7 @@ fi %files smp %defattr(-,root,root) /%{image_install_path}/vmlinuz-%{KVERREL}smp -#/boot/Kerntypes-%{KVERREL}smp +/boot/Kerntypes-%{KVERREL}smp /boot/System.map-%{KVERREL}smp /boot/config-%{KVERREL}smp %dir /lib/modules/%{KVERREL}smp @@ -511,11 +493,6 @@ fi /usr/share/doc/kernel-doc-%{kversion}/Documentation/* %endif - -%files vserver -%defattr(-,root,root) -# no files - %changelog * Thu Sep 16 2004 Mark Huang - merge to Fedora Core 2 2.6.8-1.521 -- 2.47.0
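
One closing aside on the ip_nat_pptp.c helper removed earlier in this patch: for an outgoing OUT_CALL_REQUEST it reuses the SNATed TCP source port as the new call ID and rewrites the 16-bit field in place, handing the mangling routine the field's offset from the start of the PPTP header ((void *)cid - (void *)pptph). A self-contained userspace sketch of that offset-and-rewrite step; the header structs below are simplified stand-ins, not the real PPTP wire layout.

/* Sketch of the in-place 16-bit rewrite done by the removed NAT helper:
 * compute the offset of the call-ID field from the PPTP header and
 * overwrite it with a new network-order value.  Struct layouts here are
 * simplified stand-ins, not the real PPTP headers. */
#include <stdio.h>
#include <string.h>
#include <stdint.h>
#include <arpa/inet.h>

struct fake_pptp_hdr { uint16_t len, type; uint32_t cookie; };
struct fake_ctl_hdr  { uint16_t message_type, reserved; };
struct fake_ocreq    { uint16_t call_id, call_serial; };

int main(void)
{
	uint32_t buf[16] = { 0 };                 /* word-aligned packet buffer */
	unsigned char *pkt = (unsigned char *)buf;

	struct fake_pptp_hdr *pptph = (struct fake_pptp_hdr *)pkt;
	struct fake_ocreq *req = (struct fake_ocreq *)
		(pkt + sizeof(*pptph) + sizeof(struct fake_ctl_hdr));

	req->call_id = htons(1);                  /* client's original call ID */

	/* offset of the field being rewritten, relative to the PPTP header,
	 * mirroring the "(void *)cid - (void *)pptph" offset in the helper */
	size_t off = (unsigned char *)&req->call_id - (unsigned char *)pptph;

	uint16_t new_callid = htons(40000);       /* e.g. the SNATed TCP source port */
	memcpy(pkt + off, &new_callid, sizeof(new_callid));

	printf("rewrote call id at offset %zu to %u\n", off, ntohs(new_callid));
	return 0;
}
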