This commit was manufactured by cvs2svn to create tag 'after-ckrm_E16-cpu-controller-v9rc1'.

author:    Planet-Lab Support <support@planet-lab.org>  Fri, 21 Jan 2005 03:34:32 +0000 (03:34 +0000)
committer: Planet-Lab Support <support@planet-lab.org>  Fri, 21 Jan 2005 03:34:32 +0000 (03:34 +0000)

92 files changed:
.cvsignore [new file with mode: 0644]
Documentation/ckrm/cpusched [new file with mode: 0644]
MAINTAINERS
Makefile
arch/i386/Kconfig
arch/i386/boot/.cvsignore [new file with mode: 0644]
arch/i386/boot/compressed/.cvsignore [new file with mode: 0644]
arch/i386/boot/compressed/misc.c
arch/i386/boot/tools/.cvsignore [new file with mode: 0644]
arch/i386/defconfig
arch/i386/kernel/.cvsignore [new file with mode: 0644]
arch/i386/kernel/Makefile
arch/i386/kernel/apic.c
arch/i386/kernel/asm-offsets.c
arch/i386/kernel/entry.S
arch/i386/kernel/i386_ksyms.c
arch/i386/kernel/i8259.c
arch/i386/kernel/init_task.c
arch/i386/kernel/io_apic.c
arch/i386/kernel/irq.c
arch/i386/kernel/machine_kexec.c [new file with mode: 0644]
arch/i386/kernel/process.c
arch/i386/kernel/reboot.c
arch/i386/kernel/relocate_kernel.S [new file with mode: 0644]
configs/kernel-2.6.8-i686-planetlab.config
drivers/block/cfq-iosched-orig.c [deleted file]
drivers/block/cfq-iosched.c
drivers/block/ckrm-io.c
drivers/block/ckrm-iostub.c
drivers/char/.cvsignore [new file with mode: 0644]
drivers/pci/.cvsignore [new file with mode: 0644]
drivers/scsi/aic7xxx/.cvsignore [new file with mode: 0644]
fs/aio.c
include/.cvsignore [new file with mode: 0644]
include/asm-i386/.cvsignore [new file with mode: 0644]
include/asm-i386/apicdef.h
include/asm-i386/irq.h
include/asm-i386/kexec.h [new file with mode: 0644]
include/asm-i386/module.h
include/asm-i386/processor.h
include/asm-i386/segment.h
include/asm-i386/thread_info.h
include/linux/.cvsignore [new file with mode: 0644]
include/linux/ckrm-io.h
include/linux/ckrm_ce.h
include/linux/ckrm_classqueue.h
include/linux/ckrm_rc.h
include/linux/ckrm_sched.h
include/linux/ckrm_tc.h
include/linux/fs.h
include/linux/kexec.h [new file with mode: 0644]
include/linux/mm.h
include/linux/netfilter_ipv4/ip_conntrack.h
include/linux/netfilter_ipv4/ip_conntrack_tuple.h
include/linux/reboot.h
include/linux/sched.h
include/linux/skbuff.h
init/Kconfig
kernel/.cvsignore [new file with mode: 0644]
kernel/Makefile
kernel/ckrm/Makefile
kernel/ckrm/ckrm.c
kernel/ckrm/ckrm_cpu_class.c
kernel/ckrm/ckrm_cpu_monitor.c
kernel/ckrm/ckrm_laq.c [deleted file]
kernel/ckrm/ckrm_listenaq.c
kernel/ckrm/rbce/rbcemod.c
kernel/ckrm_classqueue.c
kernel/ckrm_sched.c
kernel/kexec.c [new file with mode: 0644]
kernel/sched.c
kernel/sys.c
lib/.cvsignore [new file with mode: 0644]
net/ipv4/netfilter/Kconfig
net/ipv4/netfilter/Makefile
net/ipv4/netfilter/ip_conntrack_amanda.c
net/ipv4/netfilter/ip_conntrack_core.c
net/ipv4/netfilter/ip_conntrack_proto_generic.c
net/ipv4/netfilter/ip_conntrack_proto_gre.c
net/ipv4/netfilter/ip_conntrack_proto_icmp.c
net/ipv4/netfilter/ip_conntrack_proto_tcp.c
net/ipv4/netfilter/ip_conntrack_proto_udp.c
net/ipv4/netfilter/ip_conntrack_standalone.c
net/ipv4/netfilter/ip_nat_core.c
net/ipv4/udp.c
scripts/.cvsignore [new file with mode: 0644]
scripts/basic/.cvsignore [new file with mode: 0644]
scripts/kconfig/.cvsignore [new file with mode: 0644]
scripts/kernel-2.6-planetlab.spec
scripts/lxdialog/.cvsignore [new file with mode: 0644]
scripts/mod/.cvsignore [new file with mode: 0644]
usr/.cvsignore [new file with mode: 0644]

diff --git a/.cvsignore b/.cvsignore
new file mode 100644 (file)
index 0000000..5e7d074
--- /dev/null
@@ -0,0 +1,13 @@
+.config
+.tmp_System.map
+.tmp_kallsyms1.S
+.tmp_kallsyms2.S
+.tmp_kallsyms3.S
+.tmp_versions
+.tmp_vmlinux1
+.tmp_vmlinux2
+.tmp_vmlinux3
+.version
+Module.symvers
+System.map
+vmlinux
diff --git a/Documentation/ckrm/cpusched b/Documentation/ckrm/cpusched
new file mode 100644 (file)
index 0000000..01f7f23
--- /dev/null
@@ -0,0 +1,86 @@
+CKRM CPU Scheduling 
+===================
+
+Overview
+--------
+
+In CKRM, cpu scheduling is based on a two-level scheduling decision.
+Every time a new task is to be selected, the scheduler first
+determines which class to run next and then schedules the next task
+within the selected class.
+
+The scheduling within a class is performed using the default Linux
+O(1) scheduler.
+
+The class scheduler also follows the O(1) principle and works as
+follows: 
+
+Each class maintains a local runqueue per cpu, a <struct
+ckrm_runqueue> or <lrq> for short.  The existing O(1) scheduler is
+used to schedule within an <lrq>.
+
+Each lrq is assigned a weight that mirrors the effective shares of
+its class.  Every time a task executes, its weighted cycles are
+charged against its class.  Classes thus progress in a measure called
+cumulative virtual time (CVT).  In essence, the class with the
+smallest CVT is selected next.  Provisions are made to preserve
+interactivity and to avoid starving classes that have slept for a
+long time.
+
+Load balancing on an SMP system is performed by balancing the load of
+each class across CPUs, so that the CPUs carry equal load and each
+class maintains its share on the system as a whole.
+
+Because CKRM uses a class hierarchy, cycles that a class leaves unused
+are redistributed among its busy siblings.
+
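+As an illustration only (this is not the kernel implementation, and
+the structure and function names below are invented for the example),
+the first-level decision can be sketched in C roughly as follows:
+
+    #include <stddef.h>
+
+    struct toy_class {
+        unsigned long long cvt;   /* cumulative virtual time */
+        int nr_runnable;          /* runnable tasks in this class's lrq */
+    };
+
+    /* Level 1: pick the runnable class with the smallest CVT.
+     * Level 2 (not shown): run the O(1) scheduler on that class's lrq.
+     */
+    static struct toy_class *pick_class(struct toy_class *cls, size_t n)
+    {
+        struct toy_class *best = NULL;
+        size_t i;
+
+        for (i = 0; i < n; i++) {
+            if (!cls[i].nr_runnable)
+                continue;
+            if (!best || cls[i].cvt < best->cvt)
+                best = &cls[i];
+        }
+        return best;
+    }
+
+After the selected task runs, the cycles it consumed are charged,
+scaled by the class weight, to that class's CVT, so classes advance
+at rates that reflect their shares.
+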
+Enabling the CKRM CPU scheduler
+-------------------------------
+
+The scheduler is integrated into the Linux scheduler and therefore
+cannot be loaded dynamically like other CKRM schedulers.
+
+However, it can be enabled at boot time or dynamically at run time.
+
+The boot options "ckrmcpu" and "nockrmcpu" enable or disable the CKRM
+cpu scheduler at boot time.  Currently the scheduler is disabled by
+default.
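+
+For example, to enable the scheduler at boot, append "ckrmcpu" to the
+kernel command line in the boot loader configuration.  The entry below
+is only an illustration; the kernel image and root device are
+placeholders:
+
+    kernel /boot/vmlinuz-2.6.8 ro root=/dev/sda1 ckrmcpu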
+
+# cat /rcfs/taskclass/config 
+
+"res=cpu,mode=enabled" indicates that the CKRM cpu scheduler is
+enabled.
+
+"res=cpu,mode=disabled" indicates that the CKRM cpu scheduler is
+disabled.
+
+The same strings can be used to change the scheduling mode dynamically
+at runtime.  For example, to activate the scheduler:
+
+# echo "res=cpu,mode=enabled" > /rcfs/taskclass/config
+
+# cat /rcfs/taskclass/*/stats
+
+The cpu portion of the statistics is shown as:
+
+    "cpu-usage(2,10,60)= 290 340 510"
+
+The three numbers give the cpu load over the last 2, 10 and 60
+seconds, scaled so that a value of 1000 corresponds to 100%.
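+Applied to the sample line above, that works out to:
+
+    290 / 1000 = 29.0%   over the last  2 seconds
+    340 / 1000 = 34.0%   over the last 10 seconds
+    510 / 1000 = 51.0%   over the last 60 seconds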
+
+For debugging purposes, additional information can be printed, but
+that format should not be relied upon.
+
+Use `echo "res=cpu,usage_detail=3"` (written to the same config file
+shown above) for the highest level of usage detail.  Please consult
+the source code for the specifics.
+
+Assigning shares
+----------------
+
+Follows the general approach described under ckrm_basics.
+
+# echo "res=cpu,guarantee=val" > shares   
+
+sets the minimum guarantee of a class.
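+
+For example, assuming a task class directory named "classA" has
+already been created under /rcfs/taskclass (the class name is purely
+illustrative):
+
+# echo "res=cpu,guarantee=250" > /rcfs/taskclass/classA/shares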
+
+
+
index c8c25df..523f115 100644 (file)
@@ -1226,6 +1226,17 @@ W:       http://nfs.sourceforge.net/
 W:     http://www.cse.unsw.edu.au/~neilb/patches/linux-devel/
 S:     Maintained
 
+KEXEC
+P:     Eric Biederman
+P:     Randy Dunlap
+M:     ebiederm@xmission.com
+M:     rddunlap@osdl.org
+W:     http://www.xmission.com/~ebiederm/files/kexec/
+W:     http://developer.osdl.org/rddunlap/kexec/
+L:     linux-kernel@vger.kernel.org
+L:     fastboot@osdl.org
+S:     Maintained
+
 LANMEDIA WAN CARD DRIVER
 P:     Andrew Stanley-Jones
 M:     asj@lanmedia.com
index 4d94580..c576843 100644 (file)
--- a/Makefile
+++ b/Makefile
@@ -1,7 +1,7 @@
 VERSION = 2
 PATCHLEVEL = 6
 SUBLEVEL = 8
-EXTRAVERSION = -1.521.2.5.planetlab
+EXTRAVERSION = -1.521.3.planetlab
 NAME=Zonked Quokka
 
 # *DOCUMENTATION*
@@ -453,6 +453,10 @@ ifndef CONFIG_FRAME_POINTER
 CFLAGS         += -fomit-frame-pointer
 endif
 
+ifdef CONFIG_X86_STACK_CHECK
+CFLAGS         += -p
+endif
+
 ifdef CONFIG_DEBUG_INFO
 CFLAGS         += -g
 endif
index 15b003b..3a3ba7f 100644 (file)
@@ -926,6 +926,74 @@ config REGPARM
        generate incorrect output with certain kernel constructs when
        -mregparm=3 is used.
 
+config IRQSTACKS
+       bool "Use separate IRQ stacks"
+       help
+       If you say Y here the kernel will use a separate IRQ stack on each
+       cpu to handle interrupts.
+
+config STACK_SIZE_SHIFT
+       int "Kernel stack size (12 => 4KB, 13 => 8KB, 14 => 16KB)"
+       range 12 14
+       default 12 if IRQSTACKS
+       default 13
+       help
+       Select kernel stack size.  4KB stacks are best as they let
+       the system scale further.  Use 8KB stacks if you have an 
+       experimental kernel where a stack overflow with a 4KB stack
+       might occur.  Use 16KB stacks if you want to safely support
+       Windows device drivers using either Linuxant or ndiswrapper.
+
+config STACK_WARN
+       int "Print stack trace when stack grows beyond specified bytes"
+       default 4096 if IRQSTACKS
+       default 4096
+       help
+       The kernel will print a stack trace when the current stack exceeds
+       the specified size.
+
+config X86_STACK_CHECK
+       bool "Check for stack overflows"
+       default n
+       help
+       Say Y here to have the kernel attempt to detect when the per-task
+       kernel stack overflows.
+
+       Some older versions of gcc don't handle the -p option correctly.
+       Kernprof is affected by the same problem, which is described here:
+       http://oss.sgi.com/projects/kernprof/faq.html#Q9
+
+       Basically, if you get oopses in __free_pages_ok during boot when
+       you have this turned on, you need to fix gcc. The Redhat 2.96
+       version and gcc-3.x seem to work.
+
+       If not debugging a stack overflow problem, say N.
+
+config STACK_PANIC
+       int "Panic when stack is within specified bytes of the stack limit"
+       depends on X86_STACK_CHECK
+       default 512 if IRQSTACKS
+       default 512
+       help
+       Panic if the stack grows to within the specified number of bytes
+       of the stack limit.
+
+config KEXEC
+       bool "kexec system call (EXPERIMENTAL)"
+       depends on EXPERIMENTAL
+       help
+         kexec is a system call that implements the ability to shut down
+         your current kernel and to start another kernel.  It is like a
+         reboot but it is independent of the system firmware.  And like a
+         reboot you can start any kernel with it, not just Linux.
+
+         The name comes from the similarity to the exec system call.
+
+         It is an ongoing process to be certain the hardware in a machine
+         is properly shut down, so do not be surprised if this code does not
+         initially work for you.  It may help to enable device hotplugging
+         support.  As of this writing the exact hardware interface is
+         strongly in flux, so no good recommendation can be made.
+
 endmenu
 
 
diff --git a/arch/i386/boot/.cvsignore b/arch/i386/boot/.cvsignore
new file mode 100644 (file)
index 0000000..2d8a3af
--- /dev/null
@@ -0,0 +1,4 @@
+bootsect
+bzImage
+setup
+vmlinux.bin
diff --git a/arch/i386/boot/compressed/.cvsignore b/arch/i386/boot/compressed/.cvsignore
new file mode 100644 (file)
index 0000000..96b1b00
--- /dev/null
@@ -0,0 +1,3 @@
+vmlinux
+vmlinux.bin
+vmlinux.bin.gz
index fa67045..8745683 100644 (file)
@@ -380,3 +380,6 @@ asmlinkage int decompress_kernel(struct moveparams *mv, void *rmode)
        if (high_loaded) close_output_buffer_if_we_run_high(mv);
        return high_loaded;
 }
+
+/* We don't actually check for stack overflows this early. */
+__asm__(".globl mcount ; mcount: ret\n");
diff --git a/arch/i386/boot/tools/.cvsignore b/arch/i386/boot/tools/.cvsignore
new file mode 100644 (file)
index 0000000..378eac2
--- /dev/null
@@ -0,0 +1 @@
+build
index aed3bc2..ed2bbb5 100644 (file)
@@ -1221,7 +1221,7 @@ CONFIG_OPROFILE=y
 CONFIG_EARLY_PRINTK=y
 CONFIG_DEBUG_SPINLOCK_SLEEP=y
 # CONFIG_FRAME_POINTER is not set
-CONFIG_4KSTACKS=y
+# CONFIG_4KSTACKS is not set
 CONFIG_X86_FIND_SMP_CONFIG=y
 CONFIG_X86_MPPARSE=y
 
diff --git a/arch/i386/kernel/.cvsignore b/arch/i386/kernel/.cvsignore
new file mode 100644 (file)
index 0000000..21c2876
--- /dev/null
@@ -0,0 +1,2 @@
+asm-offsets.s
+vmlinux.lds.s
index a056d50..ab1ef80 100644 (file)
@@ -23,6 +23,7 @@ obj-$(CONFIG_X86_TRAMPOLINE)  += trampoline.o
 obj-$(CONFIG_X86_MPPARSE)      += mpparse.o
 obj-$(CONFIG_X86_LOCAL_APIC)   += apic.o nmi.o
 obj-$(CONFIG_X86_IO_APIC)      += io_apic.o
+obj-$(CONFIG_KEXEC)            += machine_kexec.o relocate_kernel.o
 obj-$(CONFIG_X86_NUMAQ)                += numaq.o
 obj-$(CONFIG_X86_SUMMIT_NUMA)  += summit.o
 obj-$(CONFIG_MODULES)          += module.o
index ecf2b63..eb4d416 100644 (file)
@@ -193,6 +193,36 @@ void disconnect_bsp_APIC(void)
                outb(0x70, 0x22);
                outb(0x00, 0x23);
        }
+       else {
+               /* Go back to Virtual Wire compatibility mode */
+               unsigned long value;
+
+               /* For the spurious interrupt use vector F, and enable it */
+               value = apic_read(APIC_SPIV);
+               value &= ~APIC_VECTOR_MASK;
+               value |= APIC_SPIV_APIC_ENABLED;
+               value |= 0xf;
+               apic_write_around(APIC_SPIV, value);
+
+               /* For LVT0 make it edge triggered, active high, external and enabled */
+               value = apic_read(APIC_LVT0);
+               value &= ~(APIC_MODE_MASK | APIC_SEND_PENDING |
+                       APIC_INPUT_POLARITY | APIC_LVT_REMOTE_IRR |
+                       APIC_LVT_LEVEL_TRIGGER | APIC_LVT_MASKED );
+               value |= APIC_LVT_REMOTE_IRR | APIC_SEND_PENDING;
+               value = SET_APIC_DELIVERY_MODE(value, APIC_MODE_EXINT);
+               apic_write_around(APIC_LVT0, value);
+
+               /* For LVT1 make it edge triggered, active high, nmi and enabled */
+               value = apic_read(APIC_LVT1);
+               value &= ~(
+                       APIC_MODE_MASK | APIC_SEND_PENDING |
+                       APIC_INPUT_POLARITY | APIC_LVT_REMOTE_IRR |
+                       APIC_LVT_LEVEL_TRIGGER | APIC_LVT_MASKED);
+               value |= APIC_LVT_REMOTE_IRR | APIC_SEND_PENDING;
+               value = SET_APIC_DELIVERY_MODE(value, APIC_MODE_NMI);
+               apic_write_around(APIC_LVT1, value);
+       }
 }
 
 void disable_local_APIC(void)
 }
 
 void disable_local_APIC(void)
index 43943f8..b03f579 100644 (file)
@@ -7,11 +7,11 @@
 #include <linux/sched.h>
 #include <linux/signal.h>
 #include <linux/personality.h>
+#include <linux/thread_info.h>
 #include <asm/ucontext.h>
 #include "sigframe.h"
 #include <asm/fixmap.h>
 #include <asm/processor.h>
-#include <asm/thread_info.h>
 
 #define DEFINE(sym, val) \
         asm volatile("\n->" #sym " %0 " #val : : "i" (val))
 
index 3ac7418..dfbade1 100644 (file)
@@ -1029,8 +1029,55 @@ ENTRY(sys_call_table)
        .long sys_mq_timedreceive       /* 280 */
        .long sys_mq_notify
        .long sys_mq_getsetattr
-       .long sys_ni_syscall            /* reserved for kexec */
+       .long sys_kexec_load
        .long sys_ioprio_set
        .long sys_ioprio_get            /* 285 */
 
 syscall_table_size=(.-sys_call_table)
+
+#ifdef CONFIG_X86_STACK_CHECK
+.data
+.globl stack_overflowed
+stack_overflowed:
+       .long 0
+.text
+
+ENTRY(mcount)
+#warning stack check enabled
+       push %eax
+       movl $(THREAD_SIZE - 1),%eax
+       andl %esp,%eax
+       cmpl $STACK_WARN,%eax
+       jle 1f
+2:
+       popl %eax
+       ret
+1:
+       /* prevent infinite recursion from call to mcount from the
+        * stack_overflow function.  Need to revisit this code for
+        * SMP based systems.
+        */
+       lock; btsl $0,stack_overflowed
+       jc 2b
+
+       /* prepare to jmp to stack_overflow directly, as if it were 
+        * called directly by the caller of mcount.  
+        */
+       pushl %ebp
+       pushl %ebx
+       pushl %esi
+       pushl %edi
+       
+       call stack_overflow
+       /* Note that stack_overflow() will clear the stack_overflowed
+        * variable.
+        */
+
+       popl %edi
+       popl %esi
+       popl %ebx
+       popl %ebp
+       
+       popl %eax       
+       ret
+#endif
index 5a50c53..584982c 100644 (file)
@@ -188,6 +188,12 @@ EXPORT_SYMBOL(atomic_dec_and_lock);
 
 EXPORT_SYMBOL(__PAGE_KERNEL);
 
+#ifdef CONFIG_X86_STACK_CHECK
+extern void mcount(void);
+EXPORT_SYMBOL(mcount);
+#endif
+
+
 #ifdef CONFIG_HIGHMEM
 EXPORT_SYMBOL(kmap);
 EXPORT_SYMBOL(kunmap);
index 97653d2..7141d27 100644 (file)
@@ -244,9 +244,21 @@ static int i8259A_resume(struct sys_device *dev)
        return 0;
 }
 
+static int i8259A_shutdown(struct sys_device *dev)
+{
+      /* Put the i8259A into a quiescent state that
+       * the kernel initialization code can get it
+       * out of.
+       */
+      outb(0xff, 0x21);       /* mask all of 8259A-1 */
+      outb(0xff, 0xA1);       /* mask all of 8259A-2 */
+      return 0;
+}
+
 static struct sysdev_class i8259_sysdev_class = {
        set_kset_name("i8259"),
        .resume = i8259A_resume,
 static struct sysdev_class i8259_sysdev_class = {
        set_kset_name("i8259"),
        .resume = i8259A_resume,
+        .shutdown = i8259A_shutdown,
 };
 
 static struct sys_device device_i8259A = {
 };
 
 static struct sys_device device_i8259A = {
index 7422d73..30cfd40 100644 (file)
@@ -29,6 +29,13 @@ union thread_union init_thread_union
        __attribute__((__section__(".data.init_task"))) =
                { INIT_THREAD_INFO(init_task, init_thread_union) };
 
+#ifdef CONFIG_X86_STACK_CHECK
+union thread_union stack_overflow_stack
+ __attribute__((__section__(".data.init_task"))) =
+               { INIT_THREAD_INFO(init_task, stack_overflow_stack) };
+#endif
+
+
 /*
  * Initial task structure.
  *
index 39af35d..f600e67 100644 (file)
@@ -1604,11 +1604,42 @@ static void __init enable_IO_APIC(void)
  */
 void disable_IO_APIC(void)
 {
+       int pin;
        /*
         * Clear the IO-APIC before rebooting:
         */
        clear_IO_APIC();
 
+       /*
+        * If the i8259 is routed through an IOAPIC,
+        * put that IOAPIC in virtual wire mode
+        * so legacy interrupts can be delivered.
+        */
+       pin = find_isa_irq_pin(0, mp_ExtINT);
+       if (pin != -1) {
+               struct IO_APIC_route_entry entry;
+               unsigned long flags;
+
+               memset(&entry, 0, sizeof(entry));
+               entry.mask            = 0; /* Enabled */
+               entry.trigger         = 0; /* Edge */
+               entry.irr             = 0;
+               entry.polarity        = 0; /* High */
+               entry.delivery_status = 0;
+               entry.dest_mode       = 0; /* Physical */
+               entry.delivery_mode   = 7; /* ExtInt */
+               entry.vector          = 0;
+               entry.dest.physical.physical_dest = 0;
+
+
+               /*
+                * Add it to the IO-APIC irq-routing table:
+                */
+               spin_lock_irqsave(&ioapic_lock, flags);
+               io_apic_write(0, 0x11+2*pin, *(((int *)&entry)+1));
+               io_apic_write(0, 0x10+2*pin, *(((int *)&entry)+0));
+               spin_unlock_irqrestore(&ioapic_lock, flags);
+       }
        disconnect_bsp_APIC();
 }
 
index 22f7fc7..1c8beda 100644 (file)
@@ -76,8 +76,10 @@ static void register_irq_proc (unsigned int irq);
 /*
  * per-CPU IRQ handling stacks
  */
+#ifdef CONFIG_IRQSTACKS
 union irq_ctx *hardirq_ctx[NR_CPUS];
 union irq_ctx *softirq_ctx[NR_CPUS];
+#endif
 
 /*
  * Special irq handlers.
@@ -220,6 +222,9 @@ asmlinkage int handle_IRQ_event(unsigned int irq,
        int status = 1; /* Force the "do bottom halves" bit */
        int retval = 0;
 
+       if (!(action->flags & SA_INTERRUPT))
+               local_irq_enable();
+
        do {
                status |= action->flags;
                retval |= action->handler(irq, action->dev_id, regs);
@@ -489,10 +494,12 @@ asmlinkage unsigned int do_IRQ(struct pt_regs regs)
                u32 *isp;
                union irq_ctx * curctx;
                union irq_ctx * irqctx;
-
+#ifdef CONFIG_IRQSTACKS
                curctx = (union irq_ctx *) current_thread_info();
                irqctx = hardirq_ctx[smp_processor_id()];
-
+#else
+               curctx = irqctx = (union irq_ctx *)0;
+#endif
                spin_unlock(&desc->lock);
 
                /*
@@ -536,7 +543,6 @@ asmlinkage unsigned int do_IRQ(struct pt_regs regs)
                        break;
                desc->status &= ~IRQ_PENDING;
        }
-
        desc->status &= ~IRQ_INPROGRESS;
 
 out:
@@ -1095,6 +1101,7 @@ void init_irq_proc (void)
 }
 
 
+#ifdef CONFIG_IRQSTACKS
 /*
  * These should really be __section__(".bss.page_aligned") as well, but
  * gcc's 3.0 and earlier don't handle that correctly.
@@ -1174,3 +1181,4 @@ asmlinkage void do_softirq(void)
 }
 
 EXPORT_SYMBOL(do_softirq);
+#endif
diff --git a/arch/i386/kernel/machine_kexec.c b/arch/i386/kernel/machine_kexec.c
new file mode 100644 (file)
index 0000000..3a9e878
--- /dev/null
@@ -0,0 +1,208 @@
+/*
+ * machine_kexec.c - handle transition of Linux booting another kernel
+ * Copyright (C) 2002-2004 Eric Biederman  <ebiederm@xmission.com>
+ *
+ * This source code is licensed under the GNU General Public License,
+ * Version 2.  See the file COPYING for more details.
+ */
+
+#include <linux/mm.h>
+#include <linux/kexec.h>
+#include <linux/delay.h>
+#include <asm/pgtable.h>
+#include <asm/pgalloc.h>
+#include <asm/tlbflush.h>
+#include <asm/mmu_context.h>
+#include <asm/io.h>
+#include <asm/apic.h>
+#include <asm/cpufeature.h>
+
+static inline unsigned long read_cr3(void)
+{
+       unsigned long cr3;
+       asm volatile("movl %%cr3,%0": "=r"(cr3));
+       return cr3;
+}
+
+#define PAGE_ALIGNED __attribute__ ((__aligned__(PAGE_SIZE)))
+
+#define L0_ATTR (_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | _PAGE_DIRTY)
+#define L1_ATTR (_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | _PAGE_DIRTY)
+#define L2_ATTR (_PAGE_PRESENT)
+
+#define LEVEL0_SIZE (1UL << 12UL)
+
+#ifndef CONFIG_X86_PAE
+#define LEVEL1_SIZE (1UL << 22UL)
+static u32 pgtable_level1[1024] PAGE_ALIGNED;
+
+static void identity_map_page(unsigned long address)
+{
+       unsigned long level1_index, level2_index;
+       u32 *pgtable_level2;
+
+       /* Find the current page table */
+       pgtable_level2 = __va(read_cr3());
+
+       /* Find the indexes of the physical address to identity map */
+       level1_index = (address % LEVEL1_SIZE)/LEVEL0_SIZE;
+       level2_index = address / LEVEL1_SIZE;
+
+       /* Identity map the page table entry */
+       pgtable_level1[level1_index] = address | L0_ATTR;
+       pgtable_level2[level2_index] = __pa(pgtable_level1) | L1_ATTR;
+
+       /* Flush the tlb so the new mapping takes effect.
+        * Global tlb entries are not flushed but that is not an issue.
+        */
+       load_cr3(pgtable_level2);
+}
+
+#else
+#define LEVEL1_SIZE (1UL << 21UL)
+#define LEVEL2_SIZE (1UL << 30UL)
+static u64 pgtable_level1[512] PAGE_ALIGNED;
+static u64 pgtable_level2[512] PAGE_ALIGNED;
+
+static void identity_map_page(unsigned long address)
+{
+       unsigned long level1_index, level2_index, level3_index;
+       u64 *pgtable_level3;
+
+       /* Find the current page table */
+       pgtable_level3 = __va(read_cr3());
+
+       /* Find the indexes of the physical address to identity map */
+       level1_index = (address % LEVEL1_SIZE)/LEVEL0_SIZE;
+       level2_index = (address % LEVEL2_SIZE)/LEVEL1_SIZE;
+       level3_index = address / LEVEL2_SIZE;
+
+       /* Identity map the page table entry */
+       pgtable_level1[level1_index] = address | L0_ATTR;
+       pgtable_level2[level2_index] = __pa(pgtable_level1) | L1_ATTR;
+       set_64bit(&pgtable_level3[level3_index], __pa(pgtable_level2) | L2_ATTR);
+
+       /* Flush the tlb so the new mapping takes effect.
+        * Global tlb entries are not flushed but that is not an issue.
+        */
+       load_cr3(pgtable_level3);
+}
+#endif
+
+
+static void set_idt(void *newidt, __u16 limit)
+{
+       unsigned char curidt[6];
+
+       /* ia32 supports unaligned loads & stores */
+       (*(__u16 *)(curidt)) = limit;
+       (*(__u32 *)(curidt +2)) = (unsigned long)(newidt);
+
+       __asm__ __volatile__ (
+               "lidt %0\n"
+               : "=m" (curidt)
+               );
+};
+
+
+static void set_gdt(void *newgdt, __u16 limit)
+{
+       unsigned char curgdt[6];
+
+       /* ia32 supports unaligned loads & stores */
+       (*(__u16 *)(curgdt)) = limit;
+       (*(__u32 *)(curgdt +2)) = (unsigned long)(newgdt);
+
+       __asm__ __volatile__ (
+               "lgdt %0\n"
+               : "=m" (curgdt)
+               );
+};
+
+static void load_segments(void)
+{
+#define __STR(X) #X
+#define STR(X) __STR(X)
+
+       __asm__ __volatile__ (
+               "\tljmp $"STR(__KERNEL_CS)",$1f\n"
+               "\t1:\n"
+               "\tmovl $"STR(__KERNEL_DS)",%eax\n"
+               "\tmovl %eax,%ds\n"
+               "\tmovl %eax,%es\n"
+               "\tmovl %eax,%fs\n"
+               "\tmovl %eax,%gs\n"
+               "\tmovl %eax,%ss\n"
+               );
+#undef STR
+#undef __STR
+}
+
+typedef asmlinkage void (*relocate_new_kernel_t)(
+       unsigned long indirection_page, unsigned long reboot_code_buffer,
+       unsigned long start_address, unsigned int has_pae);
+
+const extern unsigned char relocate_new_kernel[];
+extern void relocate_new_kernel_end(void);
+const extern unsigned int relocate_new_kernel_size;
+
+/*
+ * Do whatever setup is needed on the image and the
+ * reboot code buffer to allow us to avoid allocations
+ * later.  Currently nothing.
+ */
+int machine_kexec_prepare(struct kimage *image)
+{
+       return 0;
+}
+
+void machine_kexec_cleanup(struct kimage *image)
+{
+}
+
+/*
+ * Do not allocate memory (or fail in any way) in machine_kexec().
+ * We are past the point of no return, committed to rebooting now.
+ */
+void machine_kexec(struct kimage *image)
+{
+       unsigned long indirection_page;
+       unsigned long reboot_code_buffer;
+       relocate_new_kernel_t rnk;
+
+       /* Interrupts aren't acceptable while we reboot */
+       local_irq_disable();
+
+       /* Compute some offsets */
+       reboot_code_buffer = page_to_pfn(image->control_code_page) << PAGE_SHIFT;
+       indirection_page = image->head & PAGE_MASK;
+
+       /* Set up an identity mapping for the reboot_code_buffer */
+       identity_map_page(reboot_code_buffer);
+
+       /* copy it out */
+       memcpy((void *)reboot_code_buffer, relocate_new_kernel, relocate_new_kernel_size);
+
+       /* The segment registers are funny things: they are
+        * automatically loaded from a table in memory whenever you
+        * set them to a specific selector, but this table is never
+        * accessed again unless you set the segment to a different
+        * selector.
+        *
+        * The more common model is a cache where the behind-the-scenes
+        * work is done, but which is also dropped at arbitrary times.
+        *
+        * I take advantage of this here by force loading the
+        * segments, before I zap the gdt with an invalid value.
+        */
+       load_segments();
+       /* The gdt & idt are now invalid.
+        * If you want to load them you must set up your own idt & gdt.
+        */
+       set_gdt(phys_to_virt(0),0);
+       set_idt(phys_to_virt(0),0);
+
+       /* now call it */
+       rnk = (relocate_new_kernel_t) reboot_code_buffer;
+       (*rnk)(indirection_page, reboot_code_buffer, image->start, cpu_has_pae);
+}
index 3093d1f..e8a01f2 100644 (file)
@@ -219,6 +219,32 @@ static int __init idle_setup (char *str)
 
 __setup("idle=", idle_setup);
 
+void stack_overflow(void)
+{
+        extern unsigned long stack_overflowed;
+        unsigned long esp = current_stack_pointer();
+       int panicing = ((esp&(THREAD_SIZE-1)) <= STACK_PANIC);
+
+       oops_in_progress = 1;
+       printk( "esp: 0x%lx masked: 0x%lx STACK_PANIC:0x%lx %d %d\n",
+               esp, (esp&(THREAD_SIZE-1)), STACK_PANIC, 
+               (((esp&(THREAD_SIZE-1)) <= STACK_PANIC)), panicing);
+       show_trace(current,(void*)esp);
+
+       if (panicing)
+         panic("stack overflow\n");
+
+       oops_in_progress = 0;
+
+       /* Just let it happen once per task, as otherwise it goes nuts
+        * in printing stack traces.  This means that I need to dump
+        * the stack_overflowed boolean into the task or thread_info
+        * structure.  For now just turn it off altogether.
+        */
+
+       /* stack_overflowed = 0; */
+}
+
 void show_regs(struct pt_regs * regs)
 {
        unsigned long cr0 = 0L, cr2 = 0L, cr3 = 0L, cr4 = 0L;
index e8d5cd3..85e89f9 100644 (file)
@@ -23,7 +23,6 @@ static int reboot_mode;
 int reboot_thru_bios;
 
 #ifdef CONFIG_SMP
-int reboot_smp = 0;
 static int reboot_cpu = -1;
 /* shamelessly grabbed from lib/vsprintf.c for readability */
 #define is_digit(c)    ((c) >= '0' && (c) <= '9')
@@ -85,33 +84,9 @@ static int __init set_bios_reboot(struct dmi_system_id *d)
        return 0;
 }
 
-/*
- * Some machines require the "reboot=s"  commandline option, this quirk makes that automatic.
- */
-static int __init set_smp_reboot(struct dmi_system_id *d)
-{
-#ifdef CONFIG_SMP
-       if (!reboot_smp) {
-               reboot_smp = 1;
-               printk(KERN_INFO "%s series board detected. Selecting SMP-method for reboots.\n", d->ident);
-       }
-#endif
-       return 0;
-}
-
-/*
- * Some machines require the "reboot=b,s"  commandline option, this quirk makes that automatic.
- */
-static int __init set_smp_bios_reboot(struct dmi_system_id *d)
-{
-       set_smp_reboot(d);
-       set_bios_reboot(d);
-       return 0;
-}
-
 static struct dmi_system_id __initdata reboot_dmi_table[] = {
        {       /* Handle problems with rebooting on Dell 1300's */
-               .callback = set_smp_bios_reboot,
+               .callback = set_bios_reboot,
                .ident = "Dell PowerEdge 1300",
                .matches = {
                        DMI_MATCH(DMI_SYS_VENDOR, "Dell Computer Corporation"),
@@ -294,41 +269,32 @@ void machine_real_restart(unsigned char *code, int length)
                                : "i" ((void *) (0x1000 - sizeof (real_mode_switch) - 100)));
 }
 
-void machine_restart(char * __unused)
+void machine_shutdown(void)
 {
 #ifdef CONFIG_SMP
-       int cpuid;
-       
-       cpuid = GET_APIC_ID(apic_read(APIC_ID));
-
-       if (reboot_smp) {
-
-               /* check to see if reboot_cpu is valid 
-                  if its not, default to the BSP */
-               if ((reboot_cpu == -1) ||  
-                     (reboot_cpu > (NR_CPUS -1))  || 
-                     !physid_isset(cpuid, phys_cpu_present_map))
-                       reboot_cpu = boot_cpu_physical_apicid;
-
-               reboot_smp = 0;  /* use this as a flag to only go through this once*/
-               /* re-run this function on the other CPUs
-                  it will fall though this section since we have 
-                  cleared reboot_smp, and do the reboot if it is the
-                  correct CPU, otherwise it halts. */
-               if (reboot_cpu != cpuid)
-                       smp_call_function((void *)machine_restart , NULL, 1, 0);
+        int reboot_cpu_id;
+
+        /* The boot cpu is always logical cpu 0 */
+        reboot_cpu_id = 0;
+
+        /* See if a command line override has been given */
+       if ((reboot_cpu_id != -1) && (reboot_cpu < NR_CPUS) &&
+               cpu_isset(reboot_cpu, cpu_online_map)) {
+                reboot_cpu_id = reboot_cpu;
        }
 
-       /* if reboot_cpu is still -1, then we want a tradional reboot, 
-          and if we are not running on the reboot_cpu,, halt */
-       if ((reboot_cpu != -1) && (cpuid != reboot_cpu)) {
-               for (;;)
-               __asm__ __volatile__ ("hlt");
+       /* Make certain the cpu I'm rebooting on is online */
+        if (!cpu_isset(reboot_cpu_id, cpu_online_map)) {
+                reboot_cpu_id = smp_processor_id();
        }
-       /*
-        * Stop all CPUs and turn off local APICs and the IO-APIC, so
-        * other OSs see a clean IRQ state.
+
+        /* Make certain I only run on the appropriate processor */
+        set_cpus_allowed(current, cpumask_of_cpu(reboot_cpu_id));
+
+        /* O.K. Now that I'm on the appropriate processor, stop
+         * all of the others, and disable their local APICs.
         */
+
        if (!netdump_mode)
                smp_send_stop();
 #elif defined(CONFIG_X86_LOCAL_APIC)
@@ -341,6 +307,11 @@ void machine_restart(char * __unused)
 #ifdef CONFIG_X86_IO_APIC
        disable_IO_APIC();
 #endif
+}
+
+void machine_restart(char * __unused)
+{
+        machine_shutdown();
 
        if (!reboot_thru_bios) {
                if (efi_enabled) {
 
new file mode 100644 (file)
index 0000000..54be4c2
--- /dev/null
@@ -0,0 +1,118 @@
+/*
+ * relocate_kernel.S - put the kernel image in place to boot
+ * Copyright (C) 2002-2004 Eric Biederman  <ebiederm@xmission.com>
+ *
+ * This source code is licensed under the GNU General Public License,
+ * Version 2.  See the file COPYING for more details.
+ */
+
+#include <linux/linkage.h>
+
+       /*
+        * Must be relocatable PIC code callable as a C function; once
+        * it starts it cannot use the previous process's stack.
+        */
+       .globl relocate_new_kernel
+relocate_new_kernel:
+       /* read the arguments and say goodbye to the stack */
+       movl  4(%esp), %ebx /* indirection_page */
+       movl  8(%esp), %ebp /* reboot_code_buffer */
+       movl  12(%esp), %edx /* start address */
+       movl  16(%esp), %ecx /* cpu_has_pae */
+
+       /* zero out flags, and disable interrupts */
+       pushl $0
+       popfl
+
+       /* set a new stack at the bottom of our page... */
+       lea   4096(%ebp), %esp
+
+       /* store the parameters back on the stack */
+       pushl   %edx /* store the start address */
+
+       /* Set cr0 to a known state:
+        * 31 0 == Paging disabled
+        * 18 0 == Alignment check disabled
+        * 16 0 == Write protect disabled
+        * 3  0 == No task switch
+        * 2  0 == Don't do FP software emulation.
+        * 0  1 == Protected mode enabled
+        */
+       movl    %cr0, %eax
+       andl    $~((1<<31)|(1<<18)|(1<<16)|(1<<3)|(1<<2)), %eax
+       orl     $(1<<0), %eax
+       movl    %eax, %cr0
+
+       /* clear cr4 if applicable */
+       testl   %ecx, %ecx
+       jz      1f
+       /* Set cr4 to a known state:
+        * Setting everything to zero seems safe.
+        */
+       movl    %cr4, %eax
+       andl    $0, %eax
+       movl    %eax, %cr4
+
+       jmp 1f
+1:
+
+       /* Flush the TLB (needed?) */
+       xorl    %eax, %eax
+       movl    %eax, %cr3
+
+       /* Do the copies */
+       cld
+0:     /* top, read another word for the indirection page */
+       movl    %ebx, %ecx
+       movl    (%ebx), %ecx
+       addl    $4, %ebx
+       testl   $0x1,   %ecx  /* is it a destination page */
+       jz      1f
+       movl    %ecx,   %edi
+       andl    $0xfffff000, %edi
+       jmp     0b
+1:
+       testl   $0x2,   %ecx  /* is it an indirection page */
+       jz      1f
+       movl    %ecx,   %ebx
+       andl    $0xfffff000, %ebx
+       jmp     0b
+1:
+       testl   $0x4,   %ecx /* is it the done indicator */
+       jz      1f
+       jmp     2f
+1:
+       testl   $0x8,   %ecx /* is it the source indicator */
+       jz      0b           /* Ignore it otherwise */
+       movl    %ecx,   %esi /* For every source page do a copy */
+       andl    $0xfffff000, %esi
+
+       movl    $1024, %ecx
+       rep ; movsl
+       jmp     0b
+
+2:
+
+       /* To be certain of avoiding problems with self-modifying code
+        * I need to execute a serializing instruction here.
+        * So I flush the TLB, it's handy, and not processor dependent.
+        */
+       xorl    %eax, %eax
+       movl    %eax, %cr3
+
+       /* set all of the registers to known values */
+       /* leave %esp alone */
+
+       xorl    %eax, %eax
+       xorl    %ebx, %ebx
+       xorl    %ecx, %ecx
+       xorl    %edx, %edx
+       xorl    %esi, %esi
+       xorl    %edi, %edi
+       xorl    %ebp, %ebp
+       ret
+relocate_new_kernel_end:
+
+       .globl relocate_new_kernel_size
+relocate_new_kernel_size:
+       .long relocate_new_kernel_end - relocate_new_kernel
index ea66387..8cc762f 100644 (file)
@@ -30,8 +30,9 @@ CONFIG_RCFS_FS=y
 CONFIG_CKRM_TYPE_TASKCLASS=y
 CONFIG_CKRM_RES_NUMTASKS=y
 CONFIG_CKRM_CPU_SCHEDULE=y
-CONFIG_CKRM_RES_BLKIO=y
+# CONFIG_CKRM_RES_BLKIO is not set
 # CONFIG_CKRM_RES_MEM is not set
+CONFIG_CKRM_CPU_SCHEDULE_AT_BOOT=y
 # CONFIG_CKRM_TYPE_SOCKETCLASS is not set
 CONFIG_CKRM_RBCE=y
 CONFIG_SYSCTL=y
@@ -140,6 +141,12 @@ CONFIG_HIGHPTE=y
 # CONFIG_MATH_EMULATION is not set
 CONFIG_MTRR=y
 CONFIG_REGPARM=y
+CONFIG_IRQSTACKS=y
+CONFIG_STACK_SIZE_SHIFT=13
+CONFIG_STACK_WARN=4000
+CONFIG_X86_STACK_CHECK=y
+CONFIG_STACK_PANIC=512
+CONFIG_KEXEC=y
 
 #
 # Power management options (ACPI, APM)
@@ -211,7 +218,7 @@ CONFIG_PREVENT_FIRMWARE_BUILD=y
 #
 # Block devices
 #
-# CONFIG_BLK_DEV_FD is not set
+CONFIG_BLK_DEV_FD=m
 # CONFIG_BLK_DEV_XD is not set
 CONFIG_BLK_CPQ_DA=m
 CONFIG_BLK_CPQ_CISS_DA=m
diff --git a/drivers/block/cfq-iosched-orig.c b/drivers/block/cfq-iosched-orig.c
deleted file mode 100644 (file)
index 977d32d..0000000
+++ /dev/null
@@ -1,706 +0,0 @@
-/*
- *  linux/drivers/block/cfq-iosched.c
- *
- *  CFQ, or complete fairness queueing, disk scheduler.
- *
- *  Based on ideas from a previously unfinished io
- *  scheduler (round robin per-process disk scheduling) and Andrea Arcangeli.
- *
- *  Copyright (C) 2003 Jens Axboe <axboe@suse.de>
- */
-#include <linux/kernel.h>
-#include <linux/fs.h>
-#include <linux/blkdev.h>
-#include <linux/elevator.h>
-#include <linux/bio.h>
-#include <linux/config.h>
-#include <linux/module.h>
-#include <linux/slab.h>
-#include <linux/init.h>
-#include <linux/compiler.h>
-#include <linux/hash.h>
-#include <linux/rbtree.h>
-#include <linux/mempool.h>
-
-/*
- * tunables
- */
-static int cfq_quantum = 4;
-static int cfq_queued = 8;
-
-#define CFQ_QHASH_SHIFT                6
-#define CFQ_QHASH_ENTRIES      (1 << CFQ_QHASH_SHIFT)
-#define list_entry_qhash(entry)        list_entry((entry), struct cfq_queue, cfq_hash)
-
-#define CFQ_MHASH_SHIFT                8
-#define CFQ_MHASH_BLOCK(sec)   ((sec) >> 3)
-#define CFQ_MHASH_ENTRIES      (1 << CFQ_MHASH_SHIFT)
-#define CFQ_MHASH_FN(sec)      (hash_long(CFQ_MHASH_BLOCK((sec)),CFQ_MHASH_SHIFT))
-#define ON_MHASH(crq)          !list_empty(&(crq)->hash)
-#define rq_hash_key(rq)                ((rq)->sector + (rq)->nr_sectors)
-#define list_entry_hash(ptr)   list_entry((ptr), struct cfq_rq, hash)
-
-#define list_entry_cfqq(ptr)   list_entry((ptr), struct cfq_queue, cfq_list)
-
-#define RQ_DATA(rq)            ((struct cfq_rq *) (rq)->elevator_private)
-
-static kmem_cache_t *crq_pool;
-static kmem_cache_t *cfq_pool;
-static mempool_t *cfq_mpool;
-
-struct cfq_data {
-       struct list_head rr_list;
-       struct list_head *dispatch;
-       struct list_head *cfq_hash;
-
-       struct list_head *crq_hash;
-
-       unsigned int busy_queues;
-       unsigned int max_queued;
-
-       mempool_t *crq_pool;
-};
-
-struct cfq_queue {
-       struct list_head cfq_hash;
-       struct list_head cfq_list;
-       struct rb_root sort_list;
-       int pid;
-       int queued[2];
-#if 0
-       /*
-        * with a simple addition like this, we can do io priorities. almost.
-        * does need a split request free list, too.
-        */
-       int io_prio
-#endif
-};
-
-struct cfq_rq {
-       struct rb_node rb_node;
-       sector_t rb_key;
-
-       struct request *request;
-
-       struct cfq_queue *cfq_queue;
-
-       struct list_head hash;
-};
-
-static void cfq_put_queue(struct cfq_data *cfqd, struct cfq_queue *cfqq);
-static struct cfq_queue *cfq_find_cfq_hash(struct cfq_data *cfqd, int pid);
-static void cfq_dispatch_sort(struct list_head *head, struct cfq_rq *crq);
-
-/*
- * lots of deadline iosched dupes, can be abstracted later...
- */
-static inline void __cfq_del_crq_hash(struct cfq_rq *crq)
-{
-       list_del_init(&crq->hash);
-}
-
-static inline void cfq_del_crq_hash(struct cfq_rq *crq)
-{
-       if (ON_MHASH(crq))
-               __cfq_del_crq_hash(crq);
-}
-
-static void cfq_remove_merge_hints(request_queue_t *q, struct cfq_rq *crq)
-{
-       cfq_del_crq_hash(crq);
-
-       if (q->last_merge == crq->request)
-               q->last_merge = NULL;
-}
-
-static inline void cfq_add_crq_hash(struct cfq_data *cfqd, struct cfq_rq *crq)
-{
-       struct request *rq = crq->request;
-
-       BUG_ON(ON_MHASH(crq));
-
-       list_add(&crq->hash, &cfqd->crq_hash[CFQ_MHASH_FN(rq_hash_key(rq))]);
-}
-
-static struct request *cfq_find_rq_hash(struct cfq_data *cfqd, sector_t offset)
-{
-       struct list_head *hash_list = &cfqd->crq_hash[CFQ_MHASH_FN(offset)];
-       struct list_head *entry, *next = hash_list->next;
-
-       while ((entry = next) != hash_list) {
-               struct cfq_rq *crq = list_entry_hash(entry);
-               struct request *__rq = crq->request;
-
-               next = entry->next;
-
-               BUG_ON(!ON_MHASH(crq));
-
-               if (!rq_mergeable(__rq)) {
-                       __cfq_del_crq_hash(crq);
-                       continue;
-               }
-
-               if (rq_hash_key(__rq) == offset)
-                       return __rq;
-       }
-
-       return NULL;
-}
-
-/*
- * rb tree support functions
- */
-#define RB_NONE                (2)
-#define RB_EMPTY(node) ((node)->rb_node == NULL)
-#define RB_CLEAR(node) ((node)->rb_color = RB_NONE)
-#define RB_CLEAR_ROOT(root)    ((root)->rb_node = NULL)
-#define ON_RB(node)    ((node)->rb_color != RB_NONE)
-#define rb_entry_crq(node)     rb_entry((node), struct cfq_rq, rb_node)
-#define rq_rb_key(rq)          (rq)->sector
-
-static inline void cfq_del_crq_rb(struct cfq_queue *cfqq, struct cfq_rq *crq)
-{
-       if (ON_RB(&crq->rb_node)) {
-               cfqq->queued[rq_data_dir(crq->request)]--;
-               rb_erase(&crq->rb_node, &cfqq->sort_list);
-               crq->cfq_queue = NULL;
-       }
-}
-
-static struct cfq_rq *
-__cfq_add_crq_rb(struct cfq_queue *cfqq, struct cfq_rq *crq)
-{
-       struct rb_node **p = &cfqq->sort_list.rb_node;
-       struct rb_node *parent = NULL;
-       struct cfq_rq *__crq;
-
-       while (*p) {
-               parent = *p;
-               __crq = rb_entry_crq(parent);
-
-               if (crq->rb_key < __crq->rb_key)
-                       p = &(*p)->rb_left;
-               else if (crq->rb_key > __crq->rb_key)
-                       p = &(*p)->rb_right;
-               else
-                       return __crq;
-       }
-
-       rb_link_node(&crq->rb_node, parent, p);
-       return 0;
-}
-
-static void
-cfq_add_crq_rb(struct cfq_data *cfqd, struct cfq_queue *cfqq,struct cfq_rq *crq)
-{
-       struct request *rq = crq->request;
-       struct cfq_rq *__alias;
-
-       crq->rb_key = rq_rb_key(rq);
-       cfqq->queued[rq_data_dir(rq)]++;
-retry:
-       __alias = __cfq_add_crq_rb(cfqq, crq);
-       if (!__alias) {
-               rb_insert_color(&crq->rb_node, &cfqq->sort_list);
-               crq->cfq_queue = cfqq;
-               return;
-       }
-
-       cfq_del_crq_rb(cfqq, __alias);
-       cfq_dispatch_sort(cfqd->dispatch, __alias);
-       goto retry;
-}
-
-static struct request *
-cfq_find_rq_rb(struct cfq_data *cfqd, sector_t sector)
-{
-       struct cfq_queue *cfqq = cfq_find_cfq_hash(cfqd, current->tgid);
-       struct rb_node *n;
-
-       if (!cfqq)
-               goto out;
-
-       n = cfqq->sort_list.rb_node;
-       while (n) {
-               struct cfq_rq *crq = rb_entry_crq(n);
-
-               if (sector < crq->rb_key)
-                       n = n->rb_left;
-               else if (sector > crq->rb_key)
-                       n = n->rb_right;
-               else
-                       return crq->request;
-       }
-
-out:
-       return NULL;
-}
-
-static void cfq_remove_request(request_queue_t *q, struct request *rq)
-{
-       struct cfq_data *cfqd = q->elevator.elevator_data;
-       struct cfq_rq *crq = RQ_DATA(rq);
-
-       if (crq) {
-               struct cfq_queue *cfqq = crq->cfq_queue;
-
-               cfq_remove_merge_hints(q, crq);
-               list_del_init(&rq->queuelist);
-
-               if (cfqq) {
-                       cfq_del_crq_rb(cfqq, crq);
-
-                       if (RB_EMPTY(&cfqq->sort_list))
-                               cfq_put_queue(cfqd, cfqq);
-               }
-       }
-}
-
-static int
-cfq_merge(request_queue_t *q, struct request **req, struct bio *bio)
-{
-       struct cfq_data *cfqd = q->elevator.elevator_data;
-       struct request *__rq;
-       int ret;
-
-       ret = elv_try_last_merge(q, bio);
-       if (ret != ELEVATOR_NO_MERGE) {
-               __rq = q->last_merge;
-               goto out_insert;
-       }
-
-       __rq = cfq_find_rq_hash(cfqd, bio->bi_sector);
-       if (__rq) {
-               BUG_ON(__rq->sector + __rq->nr_sectors != bio->bi_sector);
-
-               if (elv_rq_merge_ok(__rq, bio)) {
-                       ret = ELEVATOR_BACK_MERGE;
-                       goto out;
-               }
-       }
-
-       __rq = cfq_find_rq_rb(cfqd, bio->bi_sector + bio_sectors(bio));
-       if (__rq) {
-               if (elv_rq_merge_ok(__rq, bio)) {
-                       ret = ELEVATOR_FRONT_MERGE;
-                       goto out;
-               }
-       }
-
-       return ELEVATOR_NO_MERGE;
-out:
-       q->last_merge = __rq;
-out_insert:
-       *req = __rq;
-       return ret;
-}
-
-static void cfq_merged_request(request_queue_t *q, struct request *req)
-{
-       struct cfq_data *cfqd = q->elevator.elevator_data;
-       struct cfq_rq *crq = RQ_DATA(req);
-
-       cfq_del_crq_hash(crq);
-       cfq_add_crq_hash(cfqd, crq);
-
-       if (ON_RB(&crq->rb_node) && (rq_rb_key(req) != crq->rb_key)) {
-               struct cfq_queue *cfqq = crq->cfq_queue;
-
-               cfq_del_crq_rb(cfqq, crq);
-               cfq_add_crq_rb(cfqd, cfqq, crq);
-       }
-
-       q->last_merge = req;
-}
-
-static void
-cfq_merged_requests(request_queue_t *q, struct request *req,
-                   struct request *next)
-{
-       cfq_merged_request(q, req);
-       cfq_remove_request(q, next);
-}
-
-static void cfq_dispatch_sort(struct list_head *head, struct cfq_rq *crq)
-{
-       struct list_head *entry = head;
-       struct request *__rq;
-
-       if (!list_empty(head)) {
-               __rq = list_entry_rq(head->next);
-
-               if (crq->request->sector < __rq->sector) {
-                       entry = head->prev;
-                       goto link;
-               }
-       }
-
-       while ((entry = entry->prev) != head) {
-               __rq = list_entry_rq(entry);
-
-               if (crq->request->sector <= __rq->sector)
-                       break;
-       }
-
-link:
-       list_add_tail(&crq->request->queuelist, entry);
-}
-
-static inline void
-__cfq_dispatch_requests(request_queue_t *q, struct cfq_data *cfqd,
-                       struct cfq_queue *cfqq)
-{
-       struct cfq_rq *crq = rb_entry_crq(rb_first(&cfqq->sort_list));
-
-       cfq_del_crq_rb(cfqq, crq);
-       cfq_remove_merge_hints(q, crq);
-       cfq_dispatch_sort(cfqd->dispatch, crq);
-}
-
-static int cfq_dispatch_requests(request_queue_t *q, struct cfq_data *cfqd)
-{
-       struct cfq_queue *cfqq;
-       struct list_head *entry, *tmp;
-       int ret, queued, good_queues;
-
-       if (list_empty(&cfqd->rr_list))
-               return 0;
-
-       queued = ret = 0;
-restart:
-       good_queues = 0;
-       list_for_each_safe(entry, tmp, &cfqd->rr_list) {
-               cfqq = list_entry_cfqq(cfqd->rr_list.next);
-
-               BUG_ON(RB_EMPTY(&cfqq->sort_list));
-
-               __cfq_dispatch_requests(q, cfqd, cfqq);
-
-               if (RB_EMPTY(&cfqq->sort_list))
-                       cfq_put_queue(cfqd, cfqq);
-               else
-                       good_queues++;
-
-               queued++;
-               ret = 1;
-       }
-
-       if ((queued < cfq_quantum) && good_queues)
-               goto restart;
-
-       return ret;
-}
-
-static struct request *cfq_next_request(request_queue_t *q)
-{
-       struct cfq_data *cfqd = q->elevator.elevator_data;
-       struct request *rq;
-
-       if (!list_empty(cfqd->dispatch)) {
-               struct cfq_rq *crq;
-dispatch:
-               rq = list_entry_rq(cfqd->dispatch->next);
-
-               crq = RQ_DATA(rq);
-               if (crq)
-                       cfq_remove_merge_hints(q, crq);
-
-               return rq;
-       }
-
-       if (cfq_dispatch_requests(q, cfqd))
-               goto dispatch;
-
-       return NULL;
-}
-
-static inline struct cfq_queue *
-__cfq_find_cfq_hash(struct cfq_data *cfqd, int pid, const int hashval)
-{
-       struct list_head *hash_list = &cfqd->cfq_hash[hashval];
-       struct list_head *entry;
-
-       list_for_each(entry, hash_list) {
-               struct cfq_queue *__cfqq = list_entry_qhash(entry);
-
-               if (__cfqq->pid == pid)
-                       return __cfqq;
-       }
-
-       return NULL;
-}
-
-static struct cfq_queue *cfq_find_cfq_hash(struct cfq_data *cfqd, int pid)
-{
-       const int hashval = hash_long(current->tgid, CFQ_QHASH_SHIFT);
-
-       return __cfq_find_cfq_hash(cfqd, pid, hashval);
-}
-
-static void cfq_put_queue(struct cfq_data *cfqd, struct cfq_queue *cfqq)
-{
-       cfqd->busy_queues--;
-       list_del(&cfqq->cfq_list);
-       list_del(&cfqq->cfq_hash);
-       mempool_free(cfqq, cfq_mpool);
-}
-
-static struct cfq_queue *cfq_get_queue(struct cfq_data *cfqd, int pid)
-{
-       const int hashval = hash_long(current->tgid, CFQ_QHASH_SHIFT);
-       struct cfq_queue *cfqq = __cfq_find_cfq_hash(cfqd, pid, hashval);
-
-       if (!cfqq) {
-               cfqq = mempool_alloc(cfq_mpool, GFP_NOIO);
-
-               INIT_LIST_HEAD(&cfqq->cfq_hash);
-               INIT_LIST_HEAD(&cfqq->cfq_list);
-               RB_CLEAR_ROOT(&cfqq->sort_list);
-
-               cfqq->pid = pid;
-               cfqq->queued[0] = cfqq->queued[1] = 0;
-               list_add(&cfqq->cfq_hash, &cfqd->cfq_hash[hashval]);
-       }
-
-       return cfqq;
-}
-
-static void cfq_enqueue(struct cfq_data *cfqd, struct cfq_rq *crq)
-{
-       struct cfq_queue *cfqq;
-
-       cfqq = cfq_get_queue(cfqd, current->tgid);
-
-       cfq_add_crq_rb(cfqd, cfqq, crq);
-
-       if (list_empty(&cfqq->cfq_list)) {
-               list_add(&cfqq->cfq_list, &cfqd->rr_list);
-               cfqd->busy_queues++;
-       }
-}
-
-static void
-cfq_insert_request(request_queue_t *q, struct request *rq, int where)
-{
-       struct cfq_data *cfqd = q->elevator.elevator_data;
-       struct cfq_rq *crq = RQ_DATA(rq);
-
-       switch (where) {
-               case ELEVATOR_INSERT_BACK:
-                       while (cfq_dispatch_requests(q, cfqd))
-                               ;
-                       list_add_tail(&rq->queuelist, cfqd->dispatch);
-                       break;
-               case ELEVATOR_INSERT_FRONT:
-                       list_add(&rq->queuelist, cfqd->dispatch);
-                       break;
-               case ELEVATOR_INSERT_SORT:
-                       BUG_ON(!blk_fs_request(rq));
-                       cfq_enqueue(cfqd, crq);
-                       break;
-               default:
-                       printk("%s: bad insert point %d\n", __FUNCTION__,where);
-                       return;
-       }
-
-       if (rq_mergeable(rq)) {
-               cfq_add_crq_hash(cfqd, crq);
-
-               if (!q->last_merge)
-                       q->last_merge = rq;
-       }
-}
-
-static int cfq_queue_empty(request_queue_t *q)
-{
-       struct cfq_data *cfqd = q->elevator.elevator_data;
-
-       if (list_empty(cfqd->dispatch) && list_empty(&cfqd->rr_list))
-               return 1;
-
-       return 0;
-}
-
-static struct request *
-cfq_former_request(request_queue_t *q, struct request *rq)
-{
-       struct cfq_rq *crq = RQ_DATA(rq);
-       struct rb_node *rbprev = rb_prev(&crq->rb_node);
-
-       if (rbprev)
-               return rb_entry_crq(rbprev)->request;
-
-       return NULL;
-}
-
-static struct request *
-cfq_latter_request(request_queue_t *q, struct request *rq)
-{
-       struct cfq_rq *crq = RQ_DATA(rq);
-       struct rb_node *rbnext = rb_next(&crq->rb_node);
-
-       if (rbnext)
-               return rb_entry_crq(rbnext)->request;
-
-       return NULL;
-}
-
-static int cfq_may_queue(request_queue_t *q, int rw)
-{
-       struct cfq_data *cfqd = q->elevator.elevator_data;
-       struct cfq_queue *cfqq;
-       int ret = 1;
-
-       if (!cfqd->busy_queues)
-               goto out;
-
-       cfqq = cfq_find_cfq_hash(cfqd, current->tgid);
-       if (cfqq) {
-               int limit = (q->nr_requests - cfq_queued) / cfqd->busy_queues;
-
-               if (limit < 3)
-                       limit = 3;
-               else if (limit > cfqd->max_queued)
-                       limit = cfqd->max_queued;
-
-               if (cfqq->queued[rw] > limit)
-                       ret = 0;
-       }
-out:
-       return ret;
-}
-
-static void cfq_put_request(request_queue_t *q, struct request *rq)
-{
-       struct cfq_data *cfqd = q->elevator.elevator_data;
-       struct cfq_rq *crq = RQ_DATA(rq);
-
-       if (crq) {
-               BUG_ON(q->last_merge == rq);
-               BUG_ON(ON_MHASH(crq));
-
-               mempool_free(crq, cfqd->crq_pool);
-               rq->elevator_private = NULL;
-       }
-}
-
-static int cfq_set_request(request_queue_t *q, struct request *rq, int gfp_mask)
-{
-       struct cfq_data *cfqd = q->elevator.elevator_data;
-       struct cfq_rq *crq = mempool_alloc(cfqd->crq_pool, gfp_mask);
-
-       if (crq) {
-               RB_CLEAR(&crq->rb_node);
-               crq->request = rq;
-               crq->cfq_queue = NULL;
-               INIT_LIST_HEAD(&crq->hash);
-               rq->elevator_private = crq;
-               return 0;
-       }
-
-       return 1;
-}
-
-static void cfq_exit(request_queue_t *q, elevator_t *e)
-{
-       struct cfq_data *cfqd = e->elevator_data;
-
-       e->elevator_data = NULL;
-       mempool_destroy(cfqd->crq_pool);
-       kfree(cfqd->crq_hash);
-       kfree(cfqd->cfq_hash);
-       kfree(cfqd);
-}
-
-static int cfq_init(request_queue_t *q, elevator_t *e)
-{
-       struct cfq_data *cfqd;
-       int i;
-
-       cfqd = kmalloc(sizeof(*cfqd), GFP_KERNEL);
-       if (!cfqd)
-               return -ENOMEM;
-
-       memset(cfqd, 0, sizeof(*cfqd));
-       INIT_LIST_HEAD(&cfqd->rr_list);
-
-       cfqd->crq_hash = kmalloc(sizeof(struct list_head) * CFQ_MHASH_ENTRIES, GFP_KERNEL);
-       if (!cfqd->crq_hash)
-               goto out_crqhash;
-
-       cfqd->cfq_hash = kmalloc(sizeof(struct list_head) * CFQ_QHASH_ENTRIES, GFP_KERNEL);
-       if (!cfqd->cfq_hash)
-               goto out_cfqhash;
-
-       cfqd->crq_pool = mempool_create(BLKDEV_MIN_RQ, mempool_alloc_slab, mempool_free_slab, crq_pool);
-       if (!cfqd->crq_pool)
-               goto out_crqpool;
-
-       for (i = 0; i < CFQ_MHASH_ENTRIES; i++)
-               INIT_LIST_HEAD(&cfqd->crq_hash[i]);
-       for (i = 0; i < CFQ_QHASH_ENTRIES; i++)
-               INIT_LIST_HEAD(&cfqd->cfq_hash[i]);
-
-       cfqd->dispatch = &q->queue_head;
-       e->elevator_data = cfqd;
-
-       /*
-        * just set it to some high value, we want anyone to be able to queue
-        * some requests. fairness is handled differently
-        */
-       cfqd->max_queued = q->nr_requests;
-       q->nr_requests = 8192;
-
-       return 0;
-out_crqpool:
-       kfree(cfqd->cfq_hash);
-out_cfqhash:
-       kfree(cfqd->crq_hash);
-out_crqhash:
-       kfree(cfqd);
-       return -ENOMEM;
-}
-
-static int __init cfq_slab_setup(void)
-{
-       crq_pool = kmem_cache_create("crq_pool", sizeof(struct cfq_rq), 0, 0,
-                                       NULL, NULL);
-
-       if (!crq_pool)
-               panic("cfq_iosched: can't init crq pool\n");
-
-       cfq_pool = kmem_cache_create("cfq_pool", sizeof(struct cfq_queue), 0, 0,
-                                       NULL, NULL);
-
-       if (!cfq_pool)
-               panic("cfq_iosched: can't init cfq pool\n");
-
-       cfq_mpool = mempool_create(64, mempool_alloc_slab, mempool_free_slab, cfq_pool);
-
-       if (!cfq_mpool)
-               panic("cfq_iosched: can't init cfq mpool\n");
-
-       return 0;
-}
-
-subsys_initcall(cfq_slab_setup);
-
-elevator_t iosched_cfq = {
-       .elevator_name =                "cfq",
-       .elevator_merge_fn =            cfq_merge,
-       .elevator_merged_fn =           cfq_merged_request,
-       .elevator_merge_req_fn =        cfq_merged_requests,
-       .elevator_next_req_fn =         cfq_next_request,
-       .elevator_add_req_fn =          cfq_insert_request,
-       .elevator_remove_req_fn =       cfq_remove_request,
-       .elevator_queue_empty_fn =      cfq_queue_empty,
-       .elevator_former_req_fn =       cfq_former_request,
-       .elevator_latter_req_fn =       cfq_latter_request,
-       .elevator_set_req_fn =          cfq_set_request,
-       .elevator_put_req_fn =          cfq_put_request,
-       .elevator_may_queue_fn =        cfq_may_queue,
-       .elevator_init_fn =             cfq_init,
-       .elevator_exit_fn =             cfq_exit,
-};
-
-EXPORT_SYMBOL(iosched_cfq);
index 7b45a80..70d66c5 100644 (file)
@@ -39,8 +39,6 @@
 #error Cannot support this many io priority levels
 #endif
 
-#define LIMIT_DEBUG   1
-
 /*
  * tunables
  */
@@ -52,6 +50,10 @@ static int cfq_queued = 4;
 static int cfq_grace_rt = HZ / 100 ?: 1;
 static int cfq_grace_idle = HZ / 10;
 
+#define CFQ_EPOCH              1000000000
+#define CFQ_SECTORATE          1000   
+#define CFQ_HMAX_PCT           80
+
 #define CFQ_QHASH_SHIFT                6
 #define CFQ_QHASH_ENTRIES      (1 << CFQ_QHASH_SHIFT)
 #define list_entry_qhash(entry)        hlist_entry((entry), struct cfq_queue, cfq_hash)
@@ -69,13 +71,6 @@ static int cfq_grace_idle = HZ / 10;
 #define cfq_account_io(crq)    \
        ((crq)->ioprio != IOPRIO_IDLE && (crq)->ioprio != IOPRIO_RT)
 
-/* define to be 50 ms for now; make tunable later */
-#define CFQ_EPOCH              50000
-/* Needs to be made tunable right away, in MiB/s */
-#define CFQ_DISKBW             10       
-/* Temporary global limit, as percent of available b/w, for each "class" */
-#define CFQ_TEMPLIM            10
-
 /*
  * defines how we distribute bandwidth (can be tgid, uid, etc)
  */
@@ -87,18 +82,22 @@ static int cfq_grace_idle = HZ / 10;
  */
 
 #if defined(CONFIG_CKRM_RES_BLKIO) || defined(CONFIG_CKRM_RES_BLKIO_MODULE)
-extern inline void *cki_hash_key(struct task_struct *tsk);
-extern inline int cki_ioprio(struct task_struct *tsk);
-#define cfq_hash_key(current)   ((int)cki_hash_key((current)))
-#define cfq_ioprio(current)    (cki_ioprio((current)))
+extern void *cki_hash_key(struct task_struct *tsk);
+extern int cki_ioprio(struct task_struct *tsk);
+extern void *cki_cfqpriv(struct task_struct *tsk); 
+
+#define cfq_hash_key(tsk)   ((int)cki_hash_key((tsk)))
+#define cfq_ioprio(tsk)        (cki_ioprio((tsk)))
+#define cfq_cfqpriv(cfqd,tsk)  (cki_cfqpriv((tsk)))
 
 #else
-#define cfq_hash_key(current)  ((current)->tgid)
+#define cfq_hash_key(tsk)      ((tsk)->tgid)
+#define cfq_cfqpriv(cfqd,tsk)  (&(((cfqd)->cid[(tsk)->ioprio]).cfqpriv))
 
 /*
  * move to io_context
  */
-#define cfq_ioprio(current)    ((current)->ioprio)
+#define cfq_ioprio(tsk)        ((tsk)->ioprio)
 #endif
 
 #define CFQ_WAIT_RT    0
@@ -125,16 +124,12 @@ struct io_prio_data {
        atomic_t cum_sectors_in,cum_sectors_out;    
        atomic_t cum_queues_in,cum_queues_out;
 
-#ifdef LIMIT_DEBUG
-       int nskip;
-       unsigned long navsec;
-       unsigned long csectorate;
-       unsigned long lsectorate;
-#endif
+       cfqlim_t cfqpriv;       /* data for enforcing limits */
 
        struct list_head prio_list;
        int last_rq;
        int last_sectors;
+
 };
 
 /*
@@ -179,8 +174,9 @@ struct cfq_data {
        unsigned int cfq_grace_rt;
        unsigned int cfq_grace_idle;
 
-       unsigned long cfq_epoch;        /* duration for limit enforcement */
-       unsigned long cfq_epochsectors; /* max sectors dispatchable/epoch */
+       unsigned int cfq_epoch;
+       unsigned int cfq_hmax_pct;
+       unsigned int cfq_qsectorate;
 };
 
 /*
@@ -194,14 +190,34 @@ struct cfq_queue {
        int queued[2];
        int ioprio;
 
+       /* limit related settings/stats obtained 
+          either from io_prio_data or ckrm I/O class
+       */
+       struct cfqlim *cfqpriv; 
+
+       u64 epstart;            /* current epoch's starting timestamp (ns) */
+       u64 epsector[2];        /* Total sectors dispatched in [0] previous
+                                * and [1] current epoch
+                                */
+       
        unsigned long avsec;            /* avg sectors dispatched/epoch */
-       unsigned long long lastime;     /* timestamp of last request served */
-       unsigned long sectorate;        /* limit for sectors served/epoch */
+//     unsigned long long lastime;     /* timestamp of last request served */
+//     unsigned long sectorate;        /* limit for sectors served/epoch */
        int skipped;                    /* queue skipped at last dispatch ? */
+
+       /* Per queue timer to suspend/resume queue from processing */
+       struct timer_list timer;
+       unsigned long wait_end;
+       unsigned long flags;
+       struct work_struct work;
+
+       struct cfq_data *cfqd;
 };
 
+
+
 /*
- * per-request structure
+ * Per-request structure
  */
 struct cfq_rq {
        struct cfq_queue *cfq_queue;
@@ -516,69 +532,101 @@ link:
        list_add_tail(&crq->request->queuelist, entry);
 }
 
-/*
- * remove from io scheduler core and put on dispatch list for service
- */
+struct cfq_queue *dcfqq;
+u64 dtmp;
+
+
+
+/* Over how many ns is sectorate defined */
+#define NS4SCALE  (100000000)
+
 static inline int
 static inline int
-__cfq_dispatch_requests(request_queue_t *q, struct cfq_data *cfqd,
-                       struct cfq_queue *cfqq)
+__cfq_check_limit(struct cfq_data *cfqd,struct cfq_queue *cfqq, int dontskip)
 {
        struct cfq_rq *crq;
-       unsigned long long ts, gap;
-       unsigned long newavsec;
+       unsigned long long ts, gap, epoch, tmp;
+       unsigned long newavsec, sectorate;
 
        crq = rb_entry_crq(rb_first(&cfqq->sort_list));
 
-#if 1
-       /* Determine if queue should be skipped for being overshare */
        ts = sched_clock();
-       gap = ts - cfqq->lastime;
-#ifdef LIMIT_DEBUG
-       cfqq->sectorate = (cfqd->cfq_epochsectors 
-                          * CFQ_TEMPLIM)/100;
-       
-#endif
-       if ((gap >= cfqd->cfq_epoch) || (gap < 0)) {
-               cfqq->avsec = crq->nr_sectors ; 
-               cfqq->lastime = ts;
+       gap = ts - cfqq->epstart;
+       epoch = cfqd->cfq_epoch;
+
+       sectorate = atomic_read(&cfqq->cfqpriv->sectorate);
+//     sectorate = atomic_read(&(cfqd->cid[crq->ioprio].sectorate));
+
+       dcfqq = cfqq;
+
+       if ((gap >= epoch) || (gap < 0)) {
+
+               if (gap >= (epoch << 1)) {
+                       cfqq->epsector[0] = 0;
+                       cfqq->epstart = ts ; 
+               } else {
+                       cfqq->epsector[0] = cfqq->epsector[1];
+                       cfqq->epstart += epoch;
+               } 
+               cfqq->epsector[1] = 0;
+               gap = ts - cfqq->epstart;
+
+               tmp  = (cfqq->epsector[0] + crq->nr_sectors) * NS4SCALE;
+               do_div(tmp,epoch+gap);
+
+               cfqq->avsec = (unsigned long)tmp;
+               cfqq->skipped = 0;
+               cfqq->epsector[1] += crq->nr_sectors;
+               
+               cfqq->cfqpriv->navsec = cfqq->avsec;
+               cfqq->cfqpriv->sec[0] = cfqq->epsector[0];
+               cfqq->cfqpriv->sec[1] = cfqq->epsector[1];
+               cfqq->cfqpriv->timedout++;
+               /*
+               cfqd->cid[crq->ioprio].navsec = cfqq->avsec;
+               cfqd->cid[crq->ioprio].sec[0] = cfqq->epsector[0];
+               cfqd->cid[crq->ioprio].sec[1] = cfqq->epsector[1];
+               cfqd->cid[crq->ioprio].timedout++;
+               */
+               return 0;
        } else {
-               u64 tmp;
-               /* Age old average and accumalate request to be served */
-
-//             tmp = (u64) (cfqq->avsec * gap) ;
-//             do_div(tmp, cfqd->cfq_epoch);
-               newavsec = (unsigned long)(cfqq->avsec >> 1) + crq->nr_sectors;
-//             if (crq->ioprio >= 0 && crq->ioprio <= 20)
-//                     cfqd->cid[crq->ioprio].lsectorate = newavsec; 
-//             atomic_set(&(cfqd->cid[crq->ioprio].lsectorate),
-//                        newavsec);
-
-               if ((newavsec < cfqq->sectorate) || cfqq->skipped) {
+               
+               tmp = (cfqq->epsector[0] + cfqq->epsector[1] + crq->nr_sectors)
+                       * NS4SCALE;
+               do_div(tmp,epoch+gap);
+
+               newavsec = (unsigned long)tmp;
+               if ((newavsec < sectorate) || dontskip) {
                        cfqq->avsec = newavsec ;
-                       cfqq->lastime = ts;
                        cfqq->skipped = 0;
+                       cfqq->epsector[1] += crq->nr_sectors;
+                       cfqq->cfqpriv->navsec = cfqq->avsec;
+                       cfqq->cfqpriv->sec[1] = cfqq->epsector[1];
+                       /*
+                       cfqd->cid[crq->ioprio].navsec = cfqq->avsec;
+                       cfqd->cid[crq->ioprio].sec[1] = cfqq->epsector[1];
+                       */
                } else {
-                       /* queue over share ; skip once */
                        cfqq->skipped = 1;
-#ifdef LIMIT_DEBUG     
-//                     atomic_inc(&(cfqd->cid[crq->ioprio].nskip));
-//                     if (crq->ioprio >= 0 && crq->ioprio <= 20)
-//                             cfqd->cid[crq->ioprio].nskip++;
-#endif
-                       return 0;
+                       /* pause q's processing till avsec drops to 
+                          cfq_hmax_pct % of its value */
+                       tmp = (epoch+gap) * (100-cfqd->cfq_hmax_pct);
+                       do_div(tmp,1000000*cfqd->cfq_hmax_pct);
+                       cfqq->wait_end = jiffies+msecs_to_jiffies(tmp);
                }
-       }
-#endif
+       }                       
+}
 
 
-#ifdef LIMIT_DEBUG
-//     if (crq->ioprio >= 0 && crq->ioprio <= 20) {
-//             cfqd->cid[crq->ioprio].navsec = cfqq->avsec;
-//             cfqd->cid[crq->ioprio].csectorate = cfqq->sectorate;
-//     }
+/*
+ * remove from io scheduler core and put on dispatch list for service
+ */
+static inline int
+__cfq_dispatch_requests(request_queue_t *q, struct cfq_data *cfqd,
+                       struct cfq_queue *cfqq)
+{
+       struct cfq_rq *crq;
+
+       crq = rb_entry_crq(rb_first(&cfqq->sort_list));
 
 
-//     atomic_set(&(cfqd->cid[crq->ioprio].navsec),cfqq->avsec);
-//     atomic_set(&(cfqd->cid[crq->ioprio].csectorate),cfqq->sectorate);
-#endif
        cfq_dispatch_sort(cfqd, cfqq, crq);
 
        /*
        cfq_dispatch_sort(cfqd, cfqq, crq);
 
        /*
@@ -593,44 +641,83 @@ cfq_dispatch_requests(request_queue_t *q, int prio, int max_rq, int max_sectors)
 {
        struct cfq_data *cfqd = q->elevator.elevator_data;
        struct list_head *plist = &cfqd->cid[prio].rr_list;
 {
        struct cfq_data *cfqd = q->elevator.elevator_data;
        struct list_head *plist = &cfqd->cid[prio].rr_list;
+       struct cfq_queue *cfqq;
        struct list_head *entry, *nxt;
        int q_rq, q_io;
        struct list_head *entry, *nxt;
        int q_rq, q_io;
-       int ret ;
+       int first_round,busy_queues,busy_unlimited;
+
 
        /*
         * for each queue at this prio level, dispatch a request
         */
        q_rq = q_io = 0;
 
        /*
         * for each queue at this prio level, dispatch a request
         */
        q_rq = q_io = 0;
+       first_round=1;
+ restart:
+       busy_unlimited = 0;
+       busy_queues = 0;
        list_for_each_safe(entry, nxt, plist) {
        list_for_each_safe(entry, nxt, plist) {
-               struct cfq_queue *cfqq = list_entry_cfqq(entry);
+               cfqq = list_entry_cfqq(entry);
 
                BUG_ON(RB_EMPTY(&cfqq->sort_list));
 
                BUG_ON(RB_EMPTY(&cfqq->sort_list));
+               busy_queues++;
 
 
-               ret = __cfq_dispatch_requests(q, cfqd, cfqq);
-               if (ret <= 0) {
-                       continue; /* skip queue */
-                       /* can optimize more by moving q to end of plist ? */
+               
+               if (first_round || busy_unlimited)
+                       __cfq_check_limit(cfqd,cfqq,0);
+               else
+                       __cfq_check_limit(cfqd,cfqq,1);
+
+               if (cfqq->skipped) {
+                       cfqq->cfqpriv->nskip++;
+                       /* cfqd->cid[prio].nskip++; */
+                       busy_queues--;
+                       if (time_before(jiffies, cfqq->wait_end)) {
+                               list_del(&cfqq->cfq_list);
+                               mod_timer(&cfqq->timer,cfqq->wait_end);
+                       }
+                       continue;
                }
                }
-               q_io += ret ;
-               q_rq++ ;
+               busy_unlimited++;
+
+               q_io += __cfq_dispatch_requests(q, cfqd, cfqq);
+               q_rq++;
 
 
-               if (RB_EMPTY(&cfqq->sort_list))
+               if (RB_EMPTY(&cfqq->sort_list)) {
+                       busy_unlimited--;
+                       busy_queues--;
                        cfq_put_queue(cfqd, cfqq);
-               /*
-                * if we hit the queue limit, put the string of serviced
-                * queues at the back of the pending list
-                */
+               } 
+
                if (q_io >= max_sectors || q_rq >= max_rq) {
+#if 0
                        struct list_head *prv = nxt->prev;
 
                        if (prv != plist) {
                                list_del(plist);
                                list_add(plist, prv);
                        }
+#endif
                        break;
                }
        }
 
+       if ((q_io < max_sectors) && (q_rq < max_rq) && 
+           (busy_queues || first_round))
+       {
+               first_round = 0;
+               goto restart;
+       } else {
+               /*
+                * if we hit the queue limit, put the string of serviced
+                * queues at the back of the pending list
+                */
+               struct list_head *prv = nxt->prev;
+               if (prv != plist) {
+                       list_del(plist);
+                       list_add(plist, prv);
+               }
+       }
+
        cfqd->cid[prio].last_rq = q_rq;
        cfqd->cid[prio].last_sectors = q_io;
        return q_rq;
@@ -806,6 +893,29 @@ static void cfq_put_queue(struct cfq_data *cfqd, struct cfq_queue *cfqq)
        mempool_free(cfqq, cfq_mpool);
 }
 
+static void cfq_pauseq_timer(unsigned long data)
+{
+       struct cfq_queue *cfqq = (struct cfq_queue *) data;
+       kblockd_schedule_work(&cfqq->work);
+}
+
+static void cfq_pauseq_work(void *data)
+{
+       struct cfq_queue *cfqq = (struct cfq_queue *) data;
+       struct cfq_data *cfqd = cfqq->cfqd;
+       request_queue_t *q = cfqd->queue;
+       unsigned long flags;
+       
+       spin_lock_irqsave(q->queue_lock, flags);
+       list_add_tail(&cfqq->cfq_list,&cfqd->cid[cfqq->ioprio].rr_list);
+       cfqq->skipped = 0;
+       if (cfq_next_request(q))
+               q->request_fn(q);
+       spin_unlock_irqrestore(q->queue_lock, flags);
+
+       //del_timer(&cfqq->timer);
+}      
+
 static struct cfq_queue *__cfq_get_queue(struct cfq_data *cfqd, int hashkey,
                                         int gfp_mask)
 {
@@ -833,9 +943,22 @@ retry:
                INIT_LIST_HEAD(&cfqq->cfq_list);
                cfqq->hash_key = cfq_hash_key(current);
                cfqq->ioprio = cfq_ioprio(current);
-               cfqq->avsec = 0 ;
-               cfqq->lastime = sched_clock();
-               cfqq->sectorate = (cfqd->cfq_epochsectors * CFQ_TEMPLIM)/100;
+               
+               cfqq->cfqpriv = cfq_cfqpriv(cfqd,current);
+               if (!cfqq->cfqpriv)
+                       cfqq->cfqpriv = &((cfqd->cid[cfqq->ioprio]).cfqpriv);
+
+               cfqq->epstart = sched_clock();
+               /* epsector, avsec, skipped initialized to zero by memset */
+               
+               init_timer(&cfqq->timer);
+               cfqq->timer.function = cfq_pauseq_timer;
+               cfqq->timer.data = (unsigned long) cfqq;
+
+               INIT_WORK(&cfqq->work, cfq_pauseq_work, cfqq); 
+
+               cfqq->cfqd = cfqd ;
+
                hlist_add_head(&cfqq->cfq_hash, &cfqd->cfq_hash[hashval]);
        }
 
@@ -1132,6 +1255,8 @@ static void cfq_exit(request_queue_t *q, elevator_t *e)
        kfree(cfqd);
 }
 
+       
+
 static void cfq_timer(unsigned long data)
 {
        struct cfq_data *cfqd = (struct cfq_data *) data;
@@ -1182,12 +1307,12 @@ static int cfq_init(request_queue_t *q, elevator_t *e)
                atomic_set(&cid->cum_sectors_out,0);            
                atomic_set(&cid->cum_queues_in,0);
                atomic_set(&cid->cum_queues_out,0);
-#if 0
-               atomic_set(&cid->nskip,0);
-               atomic_set(&cid->navsec,0);
-               atomic_set(&cid->csectorate,0);
-               atomic_set(&cid->lsectorate,0);
-#endif
+
+               
+               atomic_set(&((cid->cfqpriv).sectorate),CFQ_SECTORATE);
+               (cid->cfqpriv).nskip = 0;
+               (cid->cfqpriv).navsec = 0;
+               (cid->cfqpriv).timedout = 0;
        }
 
        cfqd->crq_hash = kmalloc(sizeof(struct hlist_head) * CFQ_MHASH_ENTRIES,
@@ -1217,6 +1342,9 @@ static int cfq_init(request_queue_t *q, elevator_t *e)
        cfqd->cfq_idle_quantum_io = cfq_idle_quantum_io;
        cfqd->cfq_grace_rt = cfq_grace_rt;
        cfqd->cfq_grace_idle = cfq_grace_idle;
+       
+       cfqd->cfq_epoch = CFQ_EPOCH;
+       cfqd->cfq_hmax_pct = CFQ_HMAX_PCT;
 
        q->nr_requests <<= 2;
 
@@ -1224,14 +1352,6 @@ static int cfq_init(request_queue_t *q, elevator_t *e)
        e->elevator_data = cfqd;
        cfqd->queue = q;
 
-       cfqd->cfq_epoch = CFQ_EPOCH;
-       if (q->hardsect_size)
-               cfqd->cfq_epochsectors = ((CFQ_DISKBW * 1000000)/
-                                     q->hardsect_size)* (1000000 / CFQ_EPOCH);
-       else
-               cfqd->cfq_epochsectors = ((CFQ_DISKBW * 1000000)/512)
-                       * (1000000 / CFQ_EPOCH) ;
-
        return 0;
 out_crqpool:
        kfree(cfqd->cfq_hash);
@@ -1302,6 +1422,8 @@ SHOW_FUNCTION(cfq_idle_quantum_io_show, cfqd->cfq_idle_quantum_io);
 SHOW_FUNCTION(cfq_queued_show, cfqd->cfq_queued);
 SHOW_FUNCTION(cfq_grace_rt_show, cfqd->cfq_grace_rt);
 SHOW_FUNCTION(cfq_grace_idle_show, cfqd->cfq_grace_idle);
+SHOW_FUNCTION(cfq_epoch_show, cfqd->cfq_epoch);
+SHOW_FUNCTION(cfq_hmax_pct_show, cfqd->cfq_hmax_pct);
 #undef SHOW_FUNCTION
 
 #define STORE_FUNCTION(__FUNC, __PTR, MIN, MAX)                                \
@@ -1321,63 +1443,38 @@ STORE_FUNCTION(cfq_idle_quantum_io_store, &cfqd->cfq_idle_quantum_io, 4, INT_MAX
 STORE_FUNCTION(cfq_queued_store, &cfqd->cfq_queued, 1, INT_MAX);
 STORE_FUNCTION(cfq_grace_rt_store, &cfqd->cfq_grace_rt, 0, INT_MAX);
 STORE_FUNCTION(cfq_grace_idle_store, &cfqd->cfq_grace_idle, 0, INT_MAX);
+STORE_FUNCTION(cfq_epoch_store, &cfqd->cfq_epoch, 0, INT_MAX);
+STORE_FUNCTION(cfq_hmax_pct_store, &cfqd->cfq_hmax_pct, 1, 100);
 #undef STORE_FUNCTION
 
 
-static ssize_t cfq_epoch_show(struct cfq_data *cfqd, char *page)
-{
-       return sprintf(page, "%lu\n", cfqd->cfq_epoch);
-}
-
-static ssize_t cfq_epoch_store(struct cfq_data *cfqd, const char *page, size_t count)
-{
-       char *p = (char *) page;
-       cfqd->cfq_epoch = simple_strtoul(p, &p, 10);
-       return count;
-}
-
-static ssize_t cfq_epochsectors_show(struct cfq_data *cfqd, char *page)
-{
-       return sprintf(page, "%lu\n", cfqd->cfq_epochsectors);
-}
-
-static ssize_t 
-cfq_epochsectors_store(struct cfq_data *cfqd, const char *page, size_t count)
-{
-       char *p = (char *) page;
-       cfqd->cfq_epochsectors = simple_strtoul(p, &p, 10);
-       return count;
-}
-
 /* Additional entries to get priority level data */
 static ssize_t
 cfq_prio_show(struct cfq_data *cfqd, char *page, unsigned int priolvl)
 {
-       int r1,r2,s1,s2,q1,q2;
+    //int r1,r2,s1,s2,q1,q2;
 
        if (!(priolvl >= IOPRIO_IDLE && priolvl <= IOPRIO_RT)) 
                return 0;
        
 
+       /*
        r1 = (int)atomic_read(&(cfqd->cid[priolvl].cum_rq_in));
        r2 = (int)atomic_read(&(cfqd->cid[priolvl].cum_rq_out));
        s1 = (int)atomic_read(&(cfqd->cid[priolvl].cum_sectors_in));
        s2 = (int)atomic_read(&(cfqd->cid[priolvl].cum_sectors_out));
        q1 = (int)atomic_read(&(cfqd->cid[priolvl].cum_queues_in)); 
        q2 = (int)atomic_read(&(cfqd->cid[priolvl].cum_queues_out));
-       
-       return sprintf(page,"skip %d avsec %lu rate %lu new %lu"
-                      "rq (%d,%d) sec (%d,%d) q (%d,%d)\n",
-                      cfqd->cid[priolvl].nskip,
-                      cfqd->cid[priolvl].navsec,
-                      cfqd->cid[priolvl].csectorate,
-                      cfqd->cid[priolvl].lsectorate,
-//                    atomic_read(&cfqd->cid[priolvl].nskip),
-//                    atomic_read(&cfqd->cid[priolvl].navsec),
-//                    atomic_read(&cfqd->cid[priolvl].csectorate),
-//                    atomic_read(&cfqd->cid[priolvl].lsectorate),
-                      r1,r2,
-                      s1,s2,
-                      q1,q2);
+       */
+
+       return sprintf(page,"skip %d timdout %d avsec %lu rate %ld "
+                      " sec0 %lu sec1 %lu\n",
+                      cfqd->cid[priolvl].cfqpriv.nskip,
+                      cfqd->cid[priolvl].cfqpriv.timedout,
+                      cfqd->cid[priolvl].cfqpriv.navsec,
+                      atomic_read(&(cfqd->cid[priolvl].cfqpriv.sectorate)),
+                      (unsigned long)cfqd->cid[priolvl].cfqpriv.sec[0],
+                      (unsigned long)cfqd->cid[priolvl].cfqpriv.sec[1]);
+
 }
 
 #define SHOW_PRIO_DATA(__PRIOLVL)                                               \
@@ -1411,12 +1508,25 @@ SHOW_PRIO_DATA(20);
 
 static ssize_t cfq_prio_store(struct cfq_data *cfqd, const char *page, size_t count, int priolvl)
 {      
 
+
+       char *p = (char *) page;
+       int val;
+
+       val = (int) simple_strtoul(p, &p, 10);
+
+       atomic_set(&(cfqd->cid[priolvl].cfqpriv.sectorate),val);
+       cfqd->cid[priolvl].cfqpriv.nskip = 0;
+       cfqd->cid[priolvl].cfqpriv.navsec = 0;
+       cfqd->cid[priolvl].cfqpriv.timedout = 0;
+
+#if 0
        atomic_set(&(cfqd->cid[priolvl].cum_rq_in),0);
        atomic_set(&(cfqd->cid[priolvl].cum_rq_out),0);
        atomic_set(&(cfqd->cid[priolvl].cum_sectors_in),0);
        atomic_set(&(cfqd->cid[priolvl].cum_sectors_out),0);
        atomic_set(&(cfqd->cid[priolvl].cum_queues_in),0);
        atomic_set(&(cfqd->cid[priolvl].cum_queues_out),0);
+#endif
 
        return count;
 }
 
@@ -1491,10 +1601,10 @@ static struct cfq_fs_entry cfq_epoch_entry = {
        .show = cfq_epoch_show,
        .store = cfq_epoch_store,
 };
-static struct cfq_fs_entry cfq_epochsectors_entry = {
-       .attr = {.name = "epochsectors", .mode = S_IRUGO | S_IWUSR },
-       .show = cfq_epochsectors_show,
-       .store = cfq_epochsectors_store,
+static struct cfq_fs_entry cfq_hmax_pct_entry = {
+       .attr = {.name = "hmaxpct", .mode = S_IRUGO | S_IWUSR },
+       .show = cfq_hmax_pct_show,
+       .store = cfq_hmax_pct_store,
 };
 
 #define P_0_STR   "p0"
@@ -1558,7 +1668,7 @@ static struct attribute *default_attrs[] = {
        &cfq_grace_rt_entry.attr,
        &cfq_grace_idle_entry.attr,
        &cfq_epoch_entry.attr,
-       &cfq_epochsectors_entry.attr,
+       &cfq_hmax_pct_entry.attr,
        &cfq_prio_0_entry.attr,
        &cfq_prio_1_entry.attr,
        &cfq_prio_2_entry.attr,
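The per-queue throttling that this patch adds to __cfq_check_limit() reduces to a running sector-rate average taken over one epoch plus the time elapsed in the current epoch. The following standalone sketch (not part of the commit; only the constants mirror the patch, the sector counts and gap are made-up inputs) shows that arithmetic, including the pause length computed when a queue overshoots its sectorate:

#include <stdio.h>

#define CFQ_EPOCH    1000000000ULL  /* epoch length in ns, as in the patch */
#define NS4SCALE      100000000ULL  /* window over which sectorate is defined */
#define CFQ_HMAX_PCT           80   /* resume once avsec falls to this percentage */

int main(void)
{
	unsigned long long epsector[2] = { 800, 600 }; /* sectors in previous/current epoch (illustrative) */
	unsigned long long gap = 400000000ULL;         /* ns elapsed in the current epoch (illustrative) */
	unsigned long long nr_sectors = 64;            /* size of the pending request (illustrative) */
	unsigned long sectorate = 1000;                /* per-class limit, CFQ_SECTORATE by default */

	/* average rate if this request were dispatched now */
	unsigned long newavsec = (unsigned long)
		((epsector[0] + epsector[1] + nr_sectors) * NS4SCALE / (CFQ_EPOCH + gap));

	if (newavsec < sectorate) {
		printf("dispatch: avsec %lu is under the limit %lu\n", newavsec, sectorate);
	} else {
		/* same pause formula the patch uses before re-queueing the cfq_queue */
		unsigned long long wait_ms =
			(CFQ_EPOCH + gap) * (100 - CFQ_HMAX_PCT) / (1000000ULL * CFQ_HMAX_PCT);
		printf("skip: avsec %lu, pause for about %llu ms\n", newavsec, wait_ms);
	}
	return 0;
}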
index 7edfce7..8991026 100644 (file)
 #include <linux/ckrm_tc.h>
 #include <linux/ckrm-io.h>
 
-/* Tie to cfq priorities */
-#define CKI_IOPRIO_NORM                IOPRIO_NORM
+/* sectorate == 512 byte sectors served in CFQ_EPOCH ns*/
 
 
-/* Divisor to get fraction of bandwidth represented by an IOPRIO value */
-/* FIXME: Will not work if IOPRIO_NR > 100 */
-#define CKI_IOPRIO_DIV         (IOPRIO_NR-1)
-/* Minimum ioprio value to be assigned to a class */
-#define CKI_IOPRIO_MIN         1
+/* CKI_ROOTSECTORATE needs to be made configurable from outside */
+#define CKI_ROOTSECTORATE      100000
+#define CKI_MINSECTORATE       100
 
 #define CKI_IOUSAGE_UNIT       512
 
@@ -52,7 +49,12 @@ typedef struct ckrm_io_stats{
        unsigned long        blksz;  /* size of bandwidth unit */
        atomic_t             blkrd;  /* read units submitted to DD */
        atomic_t             blkwr; /* write units submitted to DD */
-       
+
+       int nskip;                      /* # times q skipped    */
+       unsigned long navsec;           /* avg sectors serviced */
+       int timedout;                   /* # times gap > epoch  */
+       u64 sec[2];                     /* sectors serviced in 
+                                          prev & curr epochs   */
 } cki_stats_t;          /* per class I/O statistics */
 
 /* Note
 } cki_stats_t;          /* per class I/O statistics */
 
 /* Note
@@ -75,8 +77,12 @@ typedef struct ckrm_io_class {
         * in local units. 
         */
 
         * in local units. 
         */
 
+       cfqlim_t cfqpriv;       /* Data common with cfq priolvl's */    
+
+
        int cnt_guarantee; /* Allocation as parent */
        int cnt_unused;    /* Allocation to default subclass */
        int cnt_guarantee; /* Allocation as parent */
        int cnt_unused;    /* Allocation to default subclass */
+       int cnt_limit;
 
        /* Statistics, for class and default subclass */
        cki_stats_t stats; 
 
        /* Statistics, for class and default subclass */
        cki_stats_t stats; 
@@ -85,19 +91,16 @@ typedef struct ckrm_io_class {
 } cki_icls_t;
 
 
 } cki_icls_t;
 
 
-
 /* Internal functions */
 static inline void cki_reset_stats(cki_stats_t *usg);
 static inline void init_icls_one(cki_icls_t *icls);
 /* Internal functions */
 static inline void cki_reset_stats(cki_stats_t *usg);
 static inline void init_icls_one(cki_icls_t *icls);
-static inline int cki_div(int *a, int b, int c);
-//static inline int cki_recalc(cki_icls_t *icls, int rel2abs);
 static void cki_recalc_propagate(cki_icls_t *res, cki_icls_t *parres);
 
 /* External functions e.g. interface to ioscheduler */
 void *cki_tsk_icls (struct task_struct *tsk);
 int cki_tsk_ioprio (struct task_struct *tsk);
 
 static void cki_recalc_propagate(cki_icls_t *res, cki_icls_t *parres);
 
 /* External functions e.g. interface to ioscheduler */
 void *cki_tsk_icls (struct task_struct *tsk);
 int cki_tsk_ioprio (struct task_struct *tsk);
 
-extern void cki_cfq_set(icls_tsk_t tskicls, icls_ioprio_t tskioprio);
+extern void cki_cfq_set(icls_tsk_t tskicls, icls_ioprio_t tskioprio, icls_tsk_t tskcfqpriv);
 
 /* CKRM Resource Controller API functions */
 static void * cki_alloc(struct ckrm_core_class *this,
 
 /* CKRM Resource Controller API functions */
 static void * cki_alloc(struct ckrm_core_class *this,
@@ -139,45 +142,27 @@ static inline void init_icls_stats(cki_icls_t *icls)
 
 static inline void init_icls_one(cki_icls_t *icls)
 {
 
 static inline void init_icls_one(cki_icls_t *icls)
 {
-       // Assign zero as initial guarantee otherwise creations
-       // could fail due to inadequate share
-
-       //icls->shares.my_guarantee = 
-       //      (CKI_IOPRIO_MIN * CKRM_SHARE_DFLT_TOTAL_GUARANTEE) / 
-       //      CKI_IOPRIO_DIV ;
-       icls->shares.my_guarantee = 0;
-       icls->shares.my_limit = CKRM_SHARE_DFLT_TOTAL_GUARANTEE;
-       icls->shares.total_guarantee = CKRM_SHARE_DFLT_TOTAL_GUARANTEE;
-       icls->shares.max_limit = CKRM_SHARE_DFLT_TOTAL_GUARANTEE;
+       /* Zero initial guarantee for scalable creation of
+          multiple classes */
 
 
-       icls->shares.unused_guarantee = icls->shares.total_guarantee - 
-               icls->shares.my_guarantee;
-       icls->shares.cur_max_limit = CKRM_SHARE_DFLT_TOTAL_GUARANTEE;
-
-
-       icls->cnt_guarantee = icls->cnt_unused = IOPRIO_IDLE;
+       /* Try out a new set */
+       
+       icls->shares.my_guarantee = CKRM_SHARE_DONTCARE;
+       icls->shares.my_limit = CKRM_SHARE_DONTCARE;
+       icls->shares.total_guarantee = CKRM_SHARE_DFLT_TOTAL_GUARANTEE;
+       icls->shares.max_limit = CKRM_SHARE_DFLT_MAX_LIMIT;
+       icls->shares.unused_guarantee = icls->shares.total_guarantee;
+       icls->shares.cur_max_limit = 0;
 
 
-       //Same rationale icls->ioprio = CKI_IOPRIO_MIN;
-       //IOPRIO_IDLE equivalence to zero my_guarantee (set above) relies
-       //on former being zero.
+       icls->cnt_guarantee = CKRM_SHARE_DONTCARE;
+       icls->cnt_unused = CKRM_SHARE_DONTCARE;
+       icls->cnt_limit = CKRM_SHARE_DONTCARE;
        
        init_icls_stats(icls);
 }
 
        
-
-static inline int cki_div(int *a, int b, int c)
-{
-       u64 temp = (u64) b * c ;
-       do_div(temp,CKI_IOPRIO_DIV);
-       *a = (int) temp;
-
-       return 0;
-}
-       
-
-/* Recalculate absolute shares from relative (rel2abs=1)
- * or vice versa (rel2abs=0) 
- * Caller should have a lock on icls
+/* Recalculate absolute shares from relative
+ * Caller should hold a lock on icls
  */
 
 static void cki_recalc_propagate(cki_icls_t *res, cki_icls_t *parres)
@@ -186,17 +171,17 @@ static void cki_recalc_propagate(cki_icls_t *res, cki_icls_t *parres)
        ckrm_core_class_t *child = NULL;
        cki_icls_t *childres;
        int resid = cki_rcbs.resid;
+       u64 temp;
 
        if (parres) {
                struct ckrm_shares *par = &parres->shares;
                struct ckrm_shares *self = &res->shares;
 
 
 
-
                if (parres->cnt_guarantee == CKRM_SHARE_DONTCARE) {
                        res->cnt_guarantee = CKRM_SHARE_DONTCARE;
                } else if (par->total_guarantee) {
-                       u64 temp = (u64) self->my_guarantee * 
+                       temp = (u64) self->my_guarantee * 
                                parres->cnt_guarantee;
                        do_div(temp, par->total_guarantee);
                        res->cnt_guarantee = (int) temp;
@@ -204,16 +189,36 @@ static void cki_recalc_propagate(cki_icls_t *res, cki_icls_t *parres)
                        res->cnt_guarantee = 0;
                }
 
+
+               if (parres->cnt_limit == CKRM_SHARE_DONTCARE) {
+                       res->cnt_limit = CKRM_SHARE_DONTCARE;
+                       atomic_set(&res->cfqpriv.sectorate,CKI_MINSECTORATE);
+               } else {
+                       if (par->max_limit) {
+                               temp = (u64) self->my_limit * 
+                                       parres->cnt_limit;
+                               do_div(temp, par->max_limit);
+                               res->cnt_limit = (int) temp;
+                       } else {
+                               res->cnt_limit = 0;
+                       }
+                       atomic_set(&res->cfqpriv.sectorate,res->cnt_limit);
+               }
+               
                if (res->cnt_guarantee == CKRM_SHARE_DONTCARE) {
                        res->cnt_unused = CKRM_SHARE_DONTCARE;
-               } else if (self->total_guarantee) {
-                       u64 temp = (u64) self->unused_guarantee * 
-                               res->cnt_guarantee;
-                       do_div(temp, self->total_guarantee);
-                       res->cnt_unused = (int) temp;
                } else {
-                       res->cnt_unused = 0;
+                       if (self->total_guarantee) {
+                               temp = (u64) self->unused_guarantee * 
+                                       res->cnt_guarantee;
+                               do_div(temp, self->total_guarantee);
+                               res->cnt_unused = (int) temp;
+                       } else {
+                               res->cnt_unused = 0;
+                       }
+
                }
+               
        }
        // propagate to children
        ckrm_lock_hier(res->core);
@@ -228,50 +233,6 @@ static void cki_recalc_propagate(cki_icls_t *res, cki_icls_t *parres)
        ckrm_unlock_hier(res->core);
 }
 
-#if 0
-static inline int cki_recalc(cki_icls_t *icls, int rel2abs)
-{
-       u64 temp;
-
-       if (icls->parent == NULL) {
-               /* Root, as parent, always gets all */
-
-               temp = icls->shares.my_guarantee * (IOPRIO_NR-1);
-               do_div(temp, icls->shares.total_guarantee);
-
-               icls->total = IOPRIO_NR-1;
-               icls->ioprio = temp ;
-               icls->unused = icls->total - icls->ioprio;
-//             icls->unused = (IOPRIO_NR-1)-icls->ioprio;
-
-       } else {
-               cki_icls_t *parres;
-               int partot ;
-               
-               parres = ckrm_get_res_class(icls->parent,
-                                           cki_rcbs.resid,
-                                           cki_icls_t);
-               if (!parres) {
-                       printk(KERN_ERR "cki_recalc: error getting "
-                              "resclass from core \n");
-                       return -EINVAL;
-               }
-
-
-               temp = (icls->shares.my_guarantee * 
-                       parres->total);
-               do_div(temp, parres->shares.total_guarantee);
-
-               icls->ioprio = temp;
-               icls->unused = 0;
-
-       }
-       
-       return 0;
-
-}
-#endif
-
 void *cki_tsk_icls(struct task_struct *tsk)
 {
        return (void *) ckrm_get_res_class(class_core(tsk->taskclass),
 void *cki_tsk_icls(struct task_struct *tsk)
 {
        return (void *) ckrm_get_res_class(class_core(tsk->taskclass),
@@ -279,12 +240,19 @@ void *cki_tsk_icls(struct task_struct *tsk)
 }
 
 int cki_tsk_ioprio(struct task_struct *tsk)
 }
 
 int cki_tsk_ioprio(struct task_struct *tsk)
+{
+       /* Don't use I/O priorities for now */
+       return IOPRIO_NORM;
+}
+
+void *cki_tsk_cfqpriv(struct task_struct *tsk)
 {
        cki_icls_t *icls = ckrm_get_res_class(class_core(tsk->taskclass),
                                           cki_rcbs.resid, cki_icls_t);
 {
        cki_icls_t *icls = ckrm_get_res_class(class_core(tsk->taskclass),
                                           cki_rcbs.resid, cki_icls_t);
-       return icls->cnt_unused;
+       return (void *)&(icls->cfqpriv);
 }
 
 }
 
+
 static void *cki_alloc(struct ckrm_core_class *core,
                         struct ckrm_core_class *parent)
 {
 static void *cki_alloc(struct ckrm_core_class *core,
                         struct ckrm_core_class *parent)
 {
@@ -301,43 +269,13 @@ static void *cki_alloc(struct ckrm_core_class *core,
        icls->parent = parent;
        icls->shares_lock = SPIN_LOCK_UNLOCKED;
 
        icls->parent = parent;
        icls->shares_lock = SPIN_LOCK_UNLOCKED;
 
-       if (parent == NULL) {
-
-               /* Root class gets same as "normal" CFQ priorities to
-                * retain compatibility of behaviour in the absence of 
-                * other classes
-                */
-
-               icls->cnt_guarantee = icls->cnt_unused = IOPRIO_NR-1; 
-
-               /* Default gets normal, not minimum */
-               //icls->unused = IOPRIO_NORM;
-               //icls->unused = icls->guarantee-icls->myguarantee;
-               //icls->limit = icls->mylimit = IOPRIO_NR;
-
-               /* Compute shares in abstract units */
-               icls->shares.total_guarantee = CKRM_SHARE_DFLT_TOTAL_GUARANTEE;
-
-               // my_guarantee for root is meaningless. Set to default
-               icls->shares.my_guarantee = CKRM_SHARE_DFLT_TOTAL_GUARANTEE;
+       init_icls_one(icls);
 
 
-               icls->shares.unused_guarantee = 
-                       CKRM_SHARE_DFLT_TOTAL_GUARANTEE;
-
-               //temp = (u64) icls->cnt_unused * icls->shares.total_guarantee;
-               //do_div(temp, CKI_IOPRIO_DIV); 
-               // temp now has root's default's share
-               //icls->shares.unused_guarantee = 
-               // icls->shares.total_guarantee - temp; 
-
-               icls->shares.my_limit = CKRM_SHARE_DFLT_TOTAL_GUARANTEE;
-               icls->shares.max_limit = CKRM_SHARE_DFLT_TOTAL_GUARANTEE;
-               icls->shares.cur_max_limit = CKRM_SHARE_DFLT_TOTAL_GUARANTEE;
-
-       } else {
-               init_icls_one(icls);
-               /* No propagation to parent needed if icls'
-                  initial share is zero */
+       if (parent == NULL) {
+               icls->cnt_guarantee =  CKI_ROOTSECTORATE;
+               icls->cnt_unused = CKI_ROOTSECTORATE;
+               icls->cnt_limit = CKI_ROOTSECTORATE;
+               atomic_set(&(icls->cfqpriv.sectorate),icls->cnt_limit);
        }
        try_module_get(THIS_MODULE);
        return icls;
@@ -345,7 +283,10 @@ static void *cki_alloc(struct ckrm_core_class *core,
 
 static void cki_free(void *res)
 {
 
-       cki_icls_t *icls = res, *parres;
+       cki_icls_t *icls = res, *parres, *childres;
+       ckrm_core_class_t *child = NULL;
+       int maxlimit, resid = cki_rcbs.resid;
+
        
        if (!res)
                return;
@@ -361,9 +302,7 @@ static void cki_free(void *res)
         *
         */
 
         *
         */
 
-       parres = ckrm_get_res_class(icls->parent,
-                                   cki_rcbs.resid,
-                                   cki_icls_t);
+       parres = ckrm_get_res_class(icls->parent, resid, cki_icls_t);
        if (!parres) {
                printk(KERN_ERR "cki_free: error getting "
                       "resclass from core \n");
@@ -372,8 +311,23 @@ static void cki_free(void *res)
 
        /* Update parent's shares */
        spin_lock(&parres->shares_lock);
 
+
        child_guarantee_changed(&parres->shares, icls->shares.my_guarantee, 0);
        parres->cnt_unused += icls->cnt_guarantee;
+
+       // run thru parent's children and get the new max_limit of the parent
+       ckrm_lock_hier(parres->core);
+       maxlimit = 0;
+       while ((child = ckrm_get_next_child(parres->core, child)) != NULL) {
+               childres = ckrm_get_res_class(child, resid, cki_icls_t);
+               if (maxlimit < childres->shares.my_limit) {
+                       maxlimit = childres->shares.my_limit;
+               }
+       }
+       ckrm_unlock_hier(parres->core);
+       if (parres->shares.cur_max_limit < maxlimit) {
+               parres->shares.cur_max_limit = maxlimit;
+       }
        spin_unlock(&parres->shares_lock);
 
        kfree(res);
@@ -388,26 +342,15 @@ static int cki_setshare(void *res, struct ckrm_shares *new)
        struct ckrm_shares *cur, *par;
        int rc = -EINVAL, resid = cki_rcbs.resid;
 
-       if (!icls) {
-               printk(KERN_ERR "No class\n");
+       if (!icls) 
                return rc;
-       }
 
        cur = &icls->shares; 
-
-       /* limits not supported */
-       if ((new->max_limit != CKRM_SHARE_UNCHANGED)
-           || (new->my_limit != CKRM_SHARE_UNCHANGED)) {
-               printk(KERN_ERR "limits not supported\n");
-               return -EINVAL;
-       }
-
        if (icls->parent) {
                parres =
                    ckrm_get_res_class(icls->parent, resid, cki_icls_t);
                if (!parres) {
-                       printk(KERN_ERR "cki_setshare: error getting "
-                              "resclass from core \n");
+                       pr_debug("cki_setshare: invalid resclass\n");
                        return -EINVAL;
                }
                spin_lock(&parres->shares_lock);
@@ -420,10 +363,8 @@ static int cki_setshare(void *res, struct ckrm_shares *new)
        }
 
        rc = set_shares(new, cur, par);
-       printk(KERN_ERR "rc from set_shares %d\n", rc);
 
        if ((!rc) && parres) {
-               
                if (parres->cnt_guarantee == CKRM_SHARE_DONTCARE) {
                        parres->cnt_unused = CKRM_SHARE_DONTCARE;
                } else if (par->total_guarantee) {
@@ -435,17 +376,6 @@ static int cki_setshare(void *res, struct ckrm_shares *new)
                        parres->cnt_unused = 0;
                }
                cki_recalc_propagate(res, parres);
-       
-#if 0
-               int old = icls->ioprio;
-               
-               rc = cki_recalc(icls,0);
-
-               if (!rc && parres) {
-                       int raise_tot = icls->ioprio - old ;
-                       parres->unused -= raise_tot ;
-               }
-#endif
        }
        spin_unlock(&icls->shares_lock);
        if (icls->parent) {
@@ -471,15 +401,15 @@ static int cki_getstats(void *res, struct seq_file *sfile)
        if (!icls)
                return -EINVAL;
 
-/*     
-       seq_printf(sfile, "%d my_read\n",atomic_read(&icls->mystats.blkrd));
-       seq_printf(sfile, "%d my_write\n",atomic_read(&icls->mystats.blkwr));
-       seq_printf(sfile, "%d total_read\n",atomic_read(&icls->stats.blkrd));
-       seq_printf(sfile, "%d total_write\n",atomic_read(&icls->stats.blkwr));
-*/
-       
-       seq_printf(sfile, "%d total ioprio\n",icls->cnt_guarantee);
-       seq_printf(sfile, "%d unused/default ioprio\n",icls->cnt_unused);
+       seq_printf(sfile, "abs limit %d\n",icls->cnt_limit);
+       seq_printf(sfile, "skip %d timdout %d avsec %lu rate %ld "
+                  " sec0 %ld sec1 %ld\n",
+                  icls->cfqpriv.nskip,
+                  icls->cfqpriv.timedout,
+                  icls->cfqpriv.navsec,
+                  atomic_read(&(icls->cfqpriv.sectorate)),
+                  (unsigned long)icls->cfqpriv.sec[0],
+                  (unsigned long)icls->cfqpriv.sec[1]);
 
        return 0;
 }
 
@@ -554,7 +484,7 @@ int __init cki_init(void)
                resid = ckrm_register_res_ctlr(clstype, &cki_rcbs);
                if (resid != -1) {
                        cki_rcbs.classtype = clstype;
-                       cki_cfq_set(cki_tsk_icls,cki_tsk_ioprio);
+                       cki_cfq_set(cki_tsk_icls,cki_tsk_ioprio,cki_tsk_cfqpriv);
                }
        }
        
@@ -566,7 +496,7 @@ void __exit cki_exit(void)
        ckrm_unregister_res_ctlr(&cki_rcbs);
        cki_rcbs.resid = -1;
        cki_rcbs.classtype = NULL; 
-       cki_cfq_set(NULL,NULL);
+       cki_cfq_set(NULL,NULL,NULL);
 }
 
 module_init(cki_init)
 }
 
 module_init(cki_init)
index c325d8e..f401254 100644 (file)
@@ -25,13 +25,14 @@ static spinlock_t stub_lock = SPIN_LOCK_UNLOCKED;
 
 static icls_tsk_t tskiclstub;
 static icls_ioprio_t tskiopriostub;
 
 static icls_tsk_t tskiclstub;
 static icls_ioprio_t tskiopriostub;
+static icls_tsk_t tskcfqprivstub;
 
 
-
-void cki_cfq_set(icls_tsk_t tskicls, icls_ioprio_t tskioprio)
+void cki_cfq_set(icls_tsk_t tskicls, icls_ioprio_t tskioprio, icls_tsk_t tskcfqpriv)
 {
        spin_lock(&stub_lock);
        tskiclstub = tskicls;
        tskiopriostub = tskioprio;
+       tskcfqprivstub = tskcfqpriv;
        spin_unlock(&stub_lock);
 }
 
@@ -59,6 +60,19 @@ int cki_ioprio(struct task_struct *tsk)
        return ret;
 }
 
+void *cki_cfqpriv(struct task_struct *tsk)
+{
+       void *ret;
+       spin_lock(&stub_lock);
+       if (tskiclstub)
+               ret = (*tskcfqprivstub)(tsk);
+       else 
+               ret = NULL;
+       spin_unlock(&stub_lock);
+       return ret;
+}    
+
 EXPORT_SYMBOL(cki_cfq_set);
 EXPORT_SYMBOL(cki_hash_key);
 EXPORT_SYMBOL(cki_ioprio);
+EXPORT_SYMBOL(cki_cfqpriv);
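The stub layer above decouples the CFQ elevator from the CKRM I/O controller module: the scheduler only ever calls through function pointers that the module installs at load time and clears at unload. A compressed sketch of that pattern (simplified types; nothing here is taken verbatim from the tree):

#include <stdio.h>
#include <stddef.h>

typedef void *(*icls_tsk_t)(void *tsk);

/* stand-in for the stub pointer guarded by stub_lock in ckrm-iostub.c */
static icls_tsk_t tskcfqprivstub;

static void cki_cfq_set_sketch(icls_tsk_t fn)
{
	tskcfqprivstub = fn;   /* module init installs a callback, module exit passes NULL */
}

static void *cki_cfqpriv_sketch(void *tsk)
{
	/* fall back to NULL when no controller module is registered */
	return tskcfqprivstub ? tskcfqprivstub(tsk) : NULL;
}

static void *fake_cfqpriv(void *tsk) { return tsk; }  /* plays the role of cki_tsk_cfqpriv() */

int main(void)
{
	int task;
	printf("unregistered: %p\n", cki_cfqpriv_sketch(&task));
	cki_cfq_set_sketch(fake_cfqpriv);
	printf("registered:   %p\n", cki_cfqpriv_sketch(&task));
	cki_cfq_set_sketch(NULL);
	return 0;
}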
diff --git a/drivers/char/.cvsignore b/drivers/char/.cvsignore
new file mode 100644 (file)
index 0000000..83683a2
--- /dev/null
@@ -0,0 +1,2 @@
+consolemap_deftbl.c
+defkeymap.c
diff --git a/drivers/pci/.cvsignore b/drivers/pci/.cvsignore
new file mode 100644 (file)
index 0000000..d5b21d9
--- /dev/null
@@ -0,0 +1,3 @@
+classlist.h
+devlist.h
+gen-devlist
diff --git a/drivers/scsi/aic7xxx/.cvsignore b/drivers/scsi/aic7xxx/.cvsignore
new file mode 100644 (file)
index 0000000..a1a7fcd
--- /dev/null
@@ -0,0 +1,4 @@
+aic79xx_reg.h
+aic79xx_seq.h
+aic7xxx_reg.h
+aic7xxx_seq.h
index 9e7b592..2335a07 100644 (file)
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -543,7 +543,7 @@ struct kioctx *lookup_ioctx(unsigned long ctx_id)
        return ioctx;
 }
 
-static void use_mm(struct mm_struct *mm)
+void use_mm(struct mm_struct *mm)
 {
        struct mm_struct *active_mm;
 
diff --git a/include/.cvsignore b/include/.cvsignore
new file mode 100644 (file)
index 0000000..04204c7
--- /dev/null
@@ -0,0 +1 @@
+config
diff --git a/include/asm-i386/.cvsignore b/include/asm-i386/.cvsignore
new file mode 100644 (file)
index 0000000..4ec57ad
--- /dev/null
@@ -0,0 +1 @@
+asm_offsets.h
index c689554..9513dd8 100644 (file)
@@ -86,6 +86,7 @@
 #define                        APIC_LVT_REMOTE_IRR             (1<<14)
 #define                        APIC_INPUT_POLARITY             (1<<13)
 #define                        APIC_SEND_PENDING               (1<<12)
+#define                        APIC_MODE_MASK                  0x700
 #define                        GET_APIC_DELIVERY_MODE(x)       (((x)>>8)&0x7)
 #define                        SET_APIC_DELIVERY_MODE(x,y)     (((x)&~0x700)|((y)<<8))
 #define                                APIC_MODE_FIXED         0x0
index d1a4dd6..43917d9 100644 (file)
@@ -39,6 +39,7 @@ union irq_ctx {
        u32                     stack[THREAD_SIZE/sizeof(u32)];
 };
 
+#ifdef CONFIG_IRQSTACKS
 extern union irq_ctx *hardirq_ctx[NR_CPUS];
 extern union irq_ctx *softirq_ctx[NR_CPUS];
 
@@ -46,6 +47,10 @@ extern void irq_ctx_init(int cpu);
 
 #define __ARCH_HAS_DO_SOFTIRQ
 
+#else
+#define irq_ctx_init(cpu) do { ; } while (0)
+#endif
+
 struct irqaction;
 struct pt_regs;
 asmlinkage int handle_IRQ_event(unsigned int, struct pt_regs *,
diff --git a/include/asm-i386/kexec.h b/include/asm-i386/kexec.h
new file mode 100644 (file)
index 0000000..eb8fd98
--- /dev/null
@@ -0,0 +1,25 @@
+#ifndef _I386_KEXEC_H
+#define _I386_KEXEC_H
+
+#include <asm/fixmap.h>
+
+/*
+ * KEXEC_SOURCE_MEMORY_LIMIT maximum page get_free_page can return.
+ * I.e. Maximum page that is mapped directly into kernel memory,
+ * and kmap is not required.
+ *
+ * Someone correct me if FIXADDR_START - PAGEOFFSET is not the correct
+ * calculation for the amount of memory directly mappable into the
+ * kernel memory space.
+ */
+
+/* Maximum physical address we can use pages from */
+#define KEXEC_SOURCE_MEMORY_LIMIT (-1UL)
+/* Maximum address we can reach in physical address mode */
+#define KEXEC_DESTINATION_MEMORY_LIMIT (-1UL)
+/* Maximum address we can use for the control code buffer */
+#define KEXEC_CONTROL_MEMORY_LIMIT TASK_SIZE
+
+#define KEXEC_CONTROL_CODE_SIZE        4096
+
+#endif /* _I386_KEXEC_H */
index 614d05f..263c6f7 100644 (file)
@@ -60,7 +60,19 @@ struct mod_arch_specific
 #define MODULE_REGPARM ""
 #endif
 
+#if (CONFIG_STACK_SIZE_SHIFT < 12)
+#define MODULE_STACKSIZE "TINYSTACKS "
+#elif (CONFIG_STACK_SIZE_SHIFT == 12)
 #define MODULE_STACKSIZE "4KSTACKS "
+#elif (CONFIG_STACK_SIZE_SHIFT == 13)
+#define MODULE_STACKSIZE "8KSTACKS "
+#elif (CONFIG_STACK_SIZE_SHIFT == 14)
+#define MODULE_STACKSIZE "16KSTACKS "
+#elif (CONFIG_STACK_SIZE_SHIFT > 14)
+#define MODULE_STACKSIZE "HUGESTACKS "
+#else
+#define MODULE_STACKSIZE ""
+#endif
 
 #define MODULE_ARCH_VERMAGIC MODULE_PROC_FAMILY MODULE_REGPARM MODULE_STACKSIZE
 
index cd8708b..3651a3b 100644 (file)
@@ -400,10 +400,10 @@ struct tss_struct {
 
 #define ARCH_MIN_TASKALIGN     16
 
-
-#define STACK_PAGE_COUNT       (4096/PAGE_SIZE)
-
-
+#if ((1<<CONFIG_STACK_SIZE_SHIFT) < PAGE_SIZE)
+#error (1<<CONFIG_STACK_SIZE_SHIFT) must be at least PAGE_SIZE
+#endif
+#define STACK_PAGE_COUNT       ((1<<CONFIG_STACK_SIZE_SHIFT)/PAGE_SIZE)
 
 
 struct thread_struct {
index abe3440..ed44e47 100644 (file)
@@ -95,4 +95,6 @@
  */
 #define IDT_ENTRIES 256
 
+#define KERN_PHYS_OFFSET (CONFIG_KERN_PHYS_OFFSET * 0x100000)
+
 #endif
index d941e6d..da74573 100644 (file)
@@ -54,9 +54,10 @@ struct thread_info {
 #endif
 
 #define PREEMPT_ACTIVE         0x4000000
-#define THREAD_SIZE            (4096)
+#define THREAD_SIZE            (1<<CONFIG_STACK_SIZE_SHIFT)
+#define STACK_WARN             (CONFIG_STACK_WARN)
+#define STACK_PANIC            (0x200ul)
 
 
-#define STACK_WARN             (THREAD_SIZE/8)
 /*
  * macros/functions for gaining access to the thread information structure
  *
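The three headers above (module.h, processor.h, thread_info.h) all derive their stack constants from the new CONFIG_STACK_SIZE_SHIFT option. A quick worked example of that arithmetic, assuming PAGE_SIZE = 4096 as on i386 (a standalone restatement, not kernel code):

#include <stdio.h>

#define PAGE_SIZE 4096UL

static const char *stack_tag(int shift)          /* mirrors the MODULE_STACKSIZE ladder */
{
        if (shift < 12)  return "TINYSTACKS ";
        if (shift == 12) return "4KSTACKS ";
        if (shift == 13) return "8KSTACKS ";
        if (shift == 14) return "16KSTACKS ";
        return "HUGESTACKS ";
}

int main(void)
{
        int shift;

        for (shift = 12; shift <= 14; shift++) {
                unsigned long thread_size = 1UL << shift;       /* THREAD_SIZE */
                unsigned long pages = thread_size / PAGE_SIZE;  /* STACK_PAGE_COUNT */
                printf("shift=%d  THREAD_SIZE=%lu  STACK_PAGE_COUNT=%lu  vermagic tag=%s\n",
                       shift, thread_size, pages, stack_tag(shift));
        }
        return 0;
}

With the default 4K stacks (shift 12) this prints THREAD_SIZE=4096 and STACK_PAGE_COUNT=1; shift 13 doubles both, and the #error guard in processor.h only requires that the result stays at least one page.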
diff --git a/include/linux/.cvsignore b/include/linux/.cvsignore
new file mode 100644 (file)
index 0000000..c1cddb6
--- /dev/null
@@ -0,0 +1,3 @@
+autoconf.h
+compile.h
+version.h
index 36040b9..70277c7 100644 (file)
@@ -34,6 +34,7 @@ typedef int (*icls_ioprio_t) (struct task_struct *tsk);
 
 extern void *cki_tsk_icls (struct task_struct *tsk);
 extern int cki_tsk_ioprio (struct task_struct *tsk);
+extern void *cki_tsk_cfqpriv (struct task_struct *tsk);
 
 #endif /* CONFIG_CKRM_RES_BLKIO */
 
index f4e91e9..3a7c743 100644 (file)
@@ -90,7 +90,7 @@ typedef struct rbce_eng_callback {
 extern int ckrm_register_engine(const char *name, ckrm_eng_callback_t *);
 extern int ckrm_unregister_engine(const char *name);
 
-extern void *ckrm_classobj(char *, int *classtype);
+extern void *ckrm_classobj(const char *, int *classtype);
 extern int get_exe_path_name(struct task_struct *t, char *filename,
                             int max_size);
 
index 3041c81..a02794d 100644 (file)
  * Aug 28, 2003
  *        Created.
  * July 07, 2004
- *   clean up, add comments     
+ *   clean up, add comments
+ *
+ *
+ * Overview:
+ * ---------
+ *
+ * Please read Documentation/ckrm/cpu_sched for a general overview of
+ * how the O(1) CKRM scheduler works.
+ *
+ * ckrm_classqueue.h provides the definitions used to maintain the
+ * per-cpu class runqueue.
  *   
  */
 
 
 #include <linux/list.h>
 
-#define CLASSQUEUE_SIZE 1024   // acb: changed from 128
-//#define CLASSQUEUE_SIZE 128
+#warning mef: is classqueue_size big enough for PlanetLab
+#define CLASSQUEUE_SIZE_SHIFT  7
+#define CLASSQUEUE_SIZE ( 1 << CLASSQUEUE_SIZE_SHIFT )
 #define CQ_BITMAP_SIZE ((((CLASSQUEUE_SIZE+1+7)/8)+sizeof(long)-1)/sizeof(long))
 
 /**
  * struct cq_prio_array: duplicates prio_array defined in sched.c 
- *
- * I duplicate this data structure to make ckrm_classqueue implementation more modular
  */
 struct cq_prio_array {
        int nr_active;
@@ -49,42 +58,50 @@ struct cq_prio_array {
  * @base: base priority
  * @base_offset: index in array for the base
  *
- * classqueue can be thought of as runqueue of classes (instead of runqueue of tasks)
- * as task runqueue, each processor has a classqueue
- * a class enters the classqueue when the first task in this class local runqueue shows up
- * a class enters the classqueue when the last task in the local runqueue leaves
- * class local runqueues are ordered based their priority
- *
- * status:
- *   hzheng: is 32bit base long enough?
+ * classqueue can be thought of as a runqueue of lrq's (the per-cpu objects
+ * of a CKRM class), instead of a runqueue of tasks:
+ * - a class's local lrq is enqueued into the local classqueue when the
+ *   first task is enqueued in the lrq.
+ * - a class's local lrq is removed from the local classqueue when the
+ *   last task is dequeued from the lrq.
+ * - lrq's are ordered based on their priority (determined elsewhere;
+ *   for CKRM it is calculated from the lrq's progress (cvt) and urgency (top_priority))
  */
+
 struct classqueue_struct {
-       struct cq_prio_array array;
+       int enabled;                   // support dynamic on/off
        unsigned long base;
        unsigned long base_offset;
+       struct cq_prio_array array;
 };
 
 /** 
- * struct cq_node_struct - the link object between class local runqueue and classqueue
+ * struct cq_node_struct:
+ * - the link object between class local runqueue and classqueue
  * @list: links the class local runqueue to classqueue
- * @prio: class priority, which is caculated based on it's progress (cvt) and urgency (top_priority)
+ * @prio: class priority
  * @index: real index into the classqueue array, calculated based on priority
- *
- * NOTE: make sure list is empty when it's not in classqueue
  */
 struct cq_node_struct {
        struct list_head list;
        int prio;
        int index;
+       /*
+        * set when the class jumps out of the classqueue window;
+        * a class with this flag set must be repositioned whenever the classqueue slides its window.
+        * real_prio is valid when need_repos is set
+        */
+       int real_prio;
+       int need_repos; 
 };
 typedef struct cq_node_struct cq_node_t;
 
-typedef unsigned long long CVT_t;      // cummulative virtual time
-
 static inline void cq_node_init(cq_node_t * node)
 {
        node->prio = 0;
        node->index = -1;
+       node->real_prio = 0;
+       node->need_repos = 0;
        INIT_LIST_HEAD(&node->list);
 }
 
@@ -95,23 +112,18 @@ static inline int cls_in_classqueue(cq_node_t * node)
 }
 
 /*initialize the data structure*/
-int classqueue_init(struct classqueue_struct *cq);
+int classqueue_init(struct classqueue_struct *cq, int enabled);
 
 
-/*add the class to classqueue*/
-void classqueue_enqueue(struct classqueue_struct *cq, cq_node_t * node, int prio);
+/*add the class to classqueue at given priority */
+void classqueue_enqueue(struct classqueue_struct *cq, 
+                       cq_node_t * node, int prio);
 
 
-/**
- * classqueue_dequeue - remove the class from classqueue
- * 
- * internal:
- *   called when the last task is removed from the queue
- *   checked on load balancing and schedule
- *   hzheng: why don't I call it on class_dequeue_task?
- */
+/*remove the class from classqueue */
 void classqueue_dequeue(struct classqueue_struct *cq, cq_node_t * node);
 
 /*change the position of the class in classqueue*/
-void classqueue_update_prio(struct classqueue_struct *cq, cq_node_t * node, int new_prio);
+void classqueue_update_prio(struct classqueue_struct *cq, 
+                           cq_node_t * node, int new_prio);
 
 /*return the first class in classqueue*/
 cq_node_t *classqueue_get_head(struct classqueue_struct *cq);
@@ -122,7 +134,8 @@ void classqueue_update_base(struct classqueue_struct *cq);
 /**
  * class_compare_prio: compare the priority of this two nodes
  */
-static inline int class_compare_prio(struct cq_node_struct* node1, struct cq_node_struct* node2)
+static inline int class_compare_prio(struct cq_node_struct* node1, 
+                                    struct cq_node_struct* node2)
 {
        return ( node1->prio - node2->prio);
 }
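The comments above describe the classqueue only abstractly: an array of priority bins (plus a bitmap of non-empty bins) holding per-cpu lrq nodes, with the head taken from the lowest-valued non-empty bin. A compact user-space sketch of that idea, mirroring classqueue_enqueue()/classqueue_get_head() in spirit but simplified (no base/window sliding, LIFO bins, hypothetical helper names):

#include <stdio.h>

#define CQ_SIZE 128                        /* mirrors CLASSQUEUE_SIZE (1 << 7) */

struct cq_node { struct cq_node *next; int prio; };

/* One bucket per priority value; the kernel version also keeps a bitmap of
 * non-empty buckets so the lookup can use find_first_bit() instead of a scan. */
struct classqueue { struct cq_node *bins[CQ_SIZE]; };

static void cq_enqueue(struct classqueue *cq, struct cq_node *node, int prio)
{
        node->prio = prio;
        node->next = cq->bins[prio];       /* LIFO for brevity; the kernel keeps FIFOs */
        cq->bins[prio] = node;
}

static struct cq_node *cq_get_head(struct classqueue *cq)
{
        int prio;

        for (prio = 0; prio < CQ_SIZE; prio++)
                if (cq->bins[prio])
                        return cq->bins[prio];
        return NULL;                       /* classqueue is empty */
}

int main(void)
{
        static struct classqueue cq;
        struct cq_node a, b;

        cq_enqueue(&cq, &a, 40);
        cq_enqueue(&cq, &b, 12);
        printf("next class to run sits in bin %d\n", cq_get_head(&cq)->prio);  /* 12 */
        return 0;
}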
index 1bf2d07..a134dbc 100644 (file)
@@ -113,7 +113,6 @@ typedef struct ckrm_res_ctlr {
 #define CKRM_MAX_TYPENAME_LEN       32
 
 typedef struct ckrm_classtype {
-       /* Hubertus:   Rearrange slots later for cache friendliness */
 
        /* resource controllers */
        spinlock_t res_ctlrs_lock;  // protect res ctlr related data
@@ -238,27 +237,6 @@ extern int ckrm_init_core_class(struct ckrm_classtype *clstype,
                                struct ckrm_core_class *parent,
                                const char *name);
 extern int ckrm_release_core_class(struct ckrm_core_class *);  
-// Hubertus .. can disappear after cls del debugging
-extern struct ckrm_res_ctlr *ckrm_resctlr_lookup(struct ckrm_classtype *type,
-                                                const char *resname);
-
-#if 0
-
-// Hubertus ... need to straighten out all these I don't think we will even 
-// call this or are we 
-
-/* interface to the RCFS filesystem */
-extern struct ckrm_core_class *ckrm_alloc_core_class(struct ckrm_core_class *,
-                                                    const char *, int);
-
-// Reclassify the given pid to the given core class by force
-extern void ckrm_forced_reclassify_pid(int, struct ckrm_core_class *);
-
-// Reclassify the given net_struct  to the given core class by force
-extern void ckrm_forced_reclassify_laq(struct ckrm_net_struct *,
-                                      struct ckrm_core_class *);
-
-#endif
 
 extern void ckrm_lock_hier(struct ckrm_core_class *);
 extern void ckrm_unlock_hier(struct ckrm_core_class *);
@@ -290,12 +268,6 @@ extern int ckrm_class_set_shares(struct ckrm_core_class *core,
 extern int ckrm_class_reset_stats(struct ckrm_core_class *core,
                                  const char *resname, const char *unused);
 
-#if 0
-extern void ckrm_ns_hold(struct ckrm_net_struct *);
-extern void ckrm_ns_put(struct ckrm_net_struct *);
-extern void *ckrm_set_rootcore_byname(char *, void *);
-#endif
-
 static inline void ckrm_core_grab(struct ckrm_core_class *core)
 {
        if (core)
@@ -329,7 +301,6 @@ static inline unsigned int ckrm_is_core_valid(ckrm_core_class_t * core)
            )
 
 extern struct ckrm_classtype *ckrm_classtypes[];       
-/* should provide a different interface */
 
 /*-----------------------------------------------------------------------------
  * CKRM event callback specification for the classtypes or resource controllers 
index 3611c2d..dc00aea 100644 (file)
@@ -3,8 +3,6 @@
  * Copyright (C) Haoqiang Zheng,  IBM Corp. 2004
  * Copyright (C) Hubertus Franke,  IBM Corp. 2004
  * 
- * Latest version, more details at http://ckrm.sf.net
- * 
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License as published by
  * the Free Software Foundation; either version 2 of the License, or
  *
  */
 
+/*
+ * Overview:
+ * ---------
+ *
+ * Please read Documentation/ckrm/cpu_sched for a general overview of
+ * how the O(1) CKRM scheduler works.
+ *
+ * ckrm_sched.h provides the definitions for the per-class local runqueue.
+ *
+ */
+   
 #ifndef _CKRM_SCHED_H
 #define _CKRM_SCHED_H
 
@@ -27,18 +36,31 @@ struct prio_array {
        struct list_head queue[MAX_PRIO];
 };
 
-#ifdef CONFIG_CKRM_CPU_SCHEDULE
-#define rq_active(p,rq)   (get_task_lrq(p)->active)
-#define rq_expired(p,rq)  (get_task_lrq(p)->expired)
-int __init init_ckrm_sched_res(void);
-#else
+
+#ifndef CONFIG_CKRM_CPU_SCHEDULE
+
 #define rq_active(p,rq)   (rq->active)
 #define rq_expired(p,rq)  (rq->expired)
 static inline void init_ckrm_sched_res(void) {}
 static inline int ckrm_cpu_monitor_init(void) {return 0;}
-#endif //CONFIG_CKRM_CPU_SCHEDULE
 
 
-#ifdef CONFIG_CKRM_CPU_SCHEDULE
+#else
+
+#define rq_active(p,rq)   (get_task_lrq(p)->active)
+#define rq_expired(p,rq)  (get_task_lrq(p)->expired)
+
+enum ckrm_sched_mode {
+       CKRM_SCHED_MODE_DISABLED, /* always use default linux scheduling     */
+                                 /* effectively disables the ckrm scheduler */
+       CKRM_SCHED_MODE_ENABLED  /* always uses ckrm scheduling behavior    */
+};
+
+extern unsigned int ckrm_sched_mode;     /* true internal sched_mode (DIS/EN ABLED) */
+
+int __init init_ckrm_sched_res(void);
+
+typedef unsigned long long CVT_t;      // cumulative virtual time
+
 struct ckrm_runqueue {
        cq_node_t classqueue_linkobj;   /*links in classqueue */
        struct ckrm_cpu_class *cpu_class;       // class it belongs to
@@ -52,6 +74,7 @@ struct ckrm_runqueue {
           reset to jiffies if expires
         */
        unsigned long expired_timestamp;
+        int best_expired_prio;
 
        /* 
         * highest priority of tasks in active
@@ -62,23 +85,38 @@ struct ckrm_runqueue {
        CVT_t local_cvt;
 
        unsigned long lrq_load;
-       int local_weight; 
 
 
+       /* Three different weights are distinguished:
+        * local_weight, skewed_weight, over_weight:
+        *
+        * - local_weight:  main weight to drive CVT progression
+        * - over_weight:   weight to reduce savings when over its guarantee
+        * - skewed_weight: weight to use when local_weight to small
+        *                  avoids starvation problems.
+        */
+       int local_weight;   
+       int over_weight;
+       int skewed_weight;
 
        /*
 
        /*
-        * unused CPU time accumulated while thoe class 
+        * unused CPU time accumulated while the class 
         * is inactive goes to savings
         * 
         * initialized to be 0
         * a class can't accumulate more than SAVING_THRESHOLD of savings
         */
-       unsigned long long savings;
+       CVT_t savings;
 
        unsigned long magic;    //for debugging
-};
+} ____cacheline_aligned_in_smp;
+
+#define CKRM_LRQ_MAGIC (0xACDC0702)
 
 typedef struct ckrm_runqueue ckrm_lrq_t;
 
+#define ckrm_cpu_disabled() (ckrm_sched_mode == CKRM_SCHED_MODE_DISABLED)   
+#define ckrm_cpu_enabled()  (ckrm_sched_mode == CKRM_SCHED_MODE_ENABLED)   
+
 /**
  * ckrm_cpu_class_stat - cpu usage statistics maintained for each class
  * 
@@ -103,24 +141,31 @@ struct ckrm_cpu_class_stat {
         */
        int eshare;
        int meshare;
+
+       /* a boolean indicates if the class has savings or not */
+       int has_savings; 
+
+       /*
+        * a temporary value used by reorder_surplus_queue 
+        */
+       int demand_per_share;
 };
 
 #define CKRM_CPU_CLASS_MAGIC 0x7af2abe3
 
-#define USAGE_SAMPLE_FREQ HZ  //sample every 1 seconds
-#define NS_PER_SAMPLE (USAGE_SAMPLE_FREQ*(NSEC_PER_SEC/HZ))
-#define USAGE_WINDOW_SIZE 60  //keep the last 60 sample
+#define USAGE_SAMPLE_FREQ  (HZ)  //sample every 1 seconds
+#define USAGE_MAX_HISTORY  (60)  // keep the last 60 usage samples
+#define NS_PER_SAMPLE      (USAGE_SAMPLE_FREQ*(NSEC_PER_SEC/HZ))
 
 struct ckrm_usage {
-       unsigned long samples[USAGE_WINDOW_SIZE]; //record usages 
-       unsigned long sample_pointer; //pointer for the sliding window
-       unsigned long long last_ns; //ns for last sample
-       long long last_sample_jiffies; //in number of jiffies
+       unsigned long samples[USAGE_MAX_HISTORY]; //record usages 
+       unsigned long sample_pointer;  // pointer for the sliding window
+       unsigned long long last_ns;    // ns for last sample
+       long long last_sample_jiffies; // in number of jiffies
 };
 
 /*
- * manages the class status
- * there should be only one instance of this object for each class in the whole system  
+ * CPU controller object allocated for each CLASS
  */
 struct ckrm_cpu_class {
        struct ckrm_core_class *core;
@@ -129,12 +174,16 @@ struct ckrm_cpu_class {
        spinlock_t cnt_lock;    // always grab parent's lock first and then child's
        struct ckrm_cpu_class_stat stat;
        struct list_head links; // for linking up in cpu classes
-       ckrm_lrq_t local_queues[NR_CPUS];       // runqueues 
+       struct list_head surplus_queue; //used for surplus allocation
+       ckrm_lrq_t* local_queues[NR_CPUS];      // runqueues 
        struct ckrm_usage usage;
        unsigned long magic;    //for debugging
+#ifdef __SIMULATOR__
+       int class_id;
+#endif
 };
 
-#define cpu_class_weight(cls) (cls->stat.meshare)
+#define cpu_class_weight(cls)   (SHARE_TO_WEIGHT(cls->stat.meshare))
 #define local_class_weight(lrq) (lrq->local_weight)
 
 static inline int valid_cpu_class(struct ckrm_cpu_class * cls)
@@ -150,7 +199,7 @@ static inline void ckrm_usage_init(struct ckrm_usage* usage)
 {
        int i;
 
-       for (i=0; i < USAGE_WINDOW_SIZE; i++)
+       for (i=0; i < USAGE_MAX_HISTORY; i++)
                usage->samples[i] = 0;
        usage->sample_pointer = 0;
        usage->last_ns = 0;
@@ -188,49 +237,21 @@ static inline void ckrm_sample_usage(struct ckrm_cpu_class* clsptr)
        //      printk("sample = %llu jiffies=%lu \n",cur_sample, jiffies);
 
        usage->sample_pointer ++;
-       if (usage->sample_pointer >= USAGE_WINDOW_SIZE)
+       if (usage->sample_pointer >= USAGE_MAX_HISTORY)
                usage->sample_pointer = 0;
 }
 
-//duration is specified in number of jiffies
-//return the usage in percentage
-static inline int get_ckrm_usage(struct ckrm_cpu_class* clsptr, int duration)
-{
-       int nr_samples = duration/USAGE_SAMPLE_FREQ?:1;
-       struct ckrm_usage* usage = &clsptr->usage;
-       unsigned long long total = 0;
-       int i, idx;
-
-       if (nr_samples > USAGE_WINDOW_SIZE)
-               nr_samples = USAGE_WINDOW_SIZE;
-
-       idx = usage->sample_pointer;    
-       for (i = 0; i< nr_samples; i++) {
-               if (! idx)
-                       idx = USAGE_WINDOW_SIZE;
-               idx --;
-               total += usage->samples[idx];
-       }
-        total *= 100;
-        do_div(total,nr_samples);
-        do_div(total,NS_PER_SAMPLE);
-       do_div(total,cpus_weight(cpu_online_map));
-        return total;
-}
-
-
 #define lrq_nr_running(lrq) \
              (lrq->active->nr_active + lrq->expired->nr_active)
 
-static inline ckrm_lrq_t *
-get_ckrm_lrq(struct ckrm_cpu_class*cls, int cpu)
+static inline ckrm_lrq_t *get_ckrm_lrq(struct ckrm_cpu_class*cls, int cpu)
 {
-       return &(cls->local_queues[cpu]);
+       return cls->local_queues[cpu];
 }
 
 static inline ckrm_lrq_t *get_task_lrq(struct task_struct *p)
 {
-       return &(p->cpu_class->local_queues[task_cpu(p)]);
+       return p->cpu_class->local_queues[task_cpu(p)];
 }
 
 #define task_list_entry(list)  list_entry(list,struct task_struct,run_list)
@@ -247,16 +268,16 @@ void init_cpu_classes(void);
 void init_cpu_class(struct ckrm_cpu_class *cls,ckrm_shares_t* shares);
 void ckrm_cpu_change_class(void *task, void *old, void *new);
 
-
 #define CPU_DEMAND_ENQUEUE 0
 #define CPU_DEMAND_DEQUEUE 1
 #define CPU_DEMAND_DESCHEDULE 2
 #define CPU_DEMAND_INIT 3
 
 /*functions exported by ckrm_cpu_monitor.c*/
+int update_effectives(void);
 void ckrm_cpu_monitor(int check_min);
 int ckrm_cpu_monitor_init(void);
-void ckrm_cpu_stat_init(struct ckrm_cpu_class_stat *stat);
+void ckrm_cpu_stat_init(struct ckrm_cpu_class_stat *stat, int eshares);
 void cpu_demand_event(struct ckrm_cpu_demand_stat* local_stat, int event, unsigned long long len);
 void adjust_local_weight(void);
 
@@ -290,61 +311,53 @@ void adjust_local_weight(void);
  *
  *******************************************************************/
 
-#define CLASS_QUANTIZER 16     //shift from ns to increase class bonus
-#define PRIORITY_QUANTIZER 2   //controls how much a high prio task can borrow
-
-#define CKRM_SHARE_ACCURACY 13
-#define NSEC_PER_MS 1000000
-#define NSEC_PER_JIFFIES (NSEC_PER_SEC/HZ)
-
-
-#define MAX_SAVINGS_ABSOLUTE (10LLU*NSEC_PER_SEC)  // 10 seconds
-
-#define CVT_UPDATE_TICK     ((HZ/2)?:1)
-
-// ABSOLUTE_CKRM_TUNING determines whether classes can make up
-// lost time in absolute time or in relative values
-
-#define ABSOLUTE_CKRM_TUNING         // preferred due to more predictable behavior
-
-#ifdef ABSOLUTE_CKRM_TUNING
-
-#define MAX_SAVINGS        MAX_SAVINGS_ABSOLUTE
-//an absolute bonus of 200ms for classes when reactivated
-#define INTERACTIVE_BONUS(lrq) ((200*NSEC_PER_MS)/local_class_weight(lrq))
-#define SAVINGS_LEAK_SPEED (CVT_UPDATE_TICK/10*NSEC_PER_JIFFIES)
-
-#define scale_cvt(val,lrq)   ((val)*local_class_weight(lrq))
-#define unscale_cvt(val,lrq) (do_div(val,local_class_weight(lrq)))
-
-#else
-
-#define MAX_SAVINGS (MAX_SAVINGS_ABSOLUTE >> CKRM_SHARE_ACCURACY) 
 /*
 /*
- * to improve system responsiveness
- * an inactive class is put a little bit ahead of the current class when it wakes up
- * the amount is set in normalized term to simplify the calculation
- * for class with 100% share, it can be 2s ahead
- * while for class with 10% share, it can be 200ms ahead
+ * The class priority is biased toward classes with high priority tasks.
+ * But we need to prevent this bias from starving other classes.
+ * If a class has a nice value of -20, how much can it starve the default class?
+ * priority bonus =  (120-100) >> PRIORITY_QUANTIZER, 
+ * if PRIORITY_QUANTIZER = 2, then it's 5 steps ahead
+ * A class without bonus thus can't get to run until: 
+ * bonus * CKRM_MAX_WEIGHT * CVT_INC_PERSHARE = (120-100) >> PRIORITY_QUANTIZER
+ *  (1 << CKRM_WEIGHT_SHIFT)
+ *  (1 << CLASS_QUANTIZER) 
+*/
+
+/* 
+ * CKRM_WEIGHT_SHIFT and CLASS_QUANTIZER control how much a class with
+ * high priority tasks can starve a normal priority class, so they should
+ * be constant. CLASS_QUANTIZER should not be too small, otherwise we
+ * don't have enough bins in the classqueue.
+ * The ideal value of CLASS_QUANTIZER is 20, but a little smaller is acceptable
  */
  */
-#define INTERACTIVE_BONUS(lrq) (2*NSEC_PER_MS)  
 
 
-/*
- * normalized savings can't be more than MAX_NORMALIZED_SAVINGS
- * based on the current configuration
- * this means that a class with share 100% will accumulate 10s at most
- * while a class with 1% of the share can only accumulate 100ms
+#define CLASS_QUANTIZER     (18)// shift from ns to increase class bonus
+#define PRIORITY_QUANTIZER  (2) // how much a high prio task can borrow
+#define CKRM_WEIGHT_SHIFT   (8) // 1/2^x == finest weight granularity
+#define CKRM_MAX_WEIGHT     (1<<CKRM_WEIGHT_SHIFT)  // - " -
+
+/* SHARES:
+ * shares are set in a hierarchical path. Since specified share settings 
+ * of a class (c) are relative to the parent (p) and its totals
+ * the shares can get very small, dependent on how many classes are 
+ * specified.
  */
  */
+#define CKRM_SHARE_SHIFT (13)  
+#define CKRM_SHARE_MAX   (1 << CKRM_SHARE_SHIFT)
 
 
-//a class with share 100% can get 100ms every 500ms
-//while a class with share 10% can only get 10ms every 500ms
-#define SAVINGS_LEAK_SPEED ((CVT_UPDATE_TICK/5*NSEC_PER_JIFFIES) >> CKRM_SHARE_ACCURACY)
+#define SHARE_TO_WEIGHT(x) ((x) >> (CKRM_SHARE_SHIFT - CKRM_WEIGHT_SHIFT))
+#define WEIGHT_TO_SHARE(x) ((x) << (CKRM_SHARE_SHIFT - CKRM_WEIGHT_SHIFT))
 
 
-#define scale_cvt(val,lrq)   (val)
-#define unscale_cvt(val,lrq) (val)
+/* Other constants */
 
 
-#endif
+#define NSEC_PER_MS          (1000000)
+#define NSEC_PER_JIFFIES     (NSEC_PER_SEC/HZ)
 
 
+#define MAX_SAVINGS_ABSOLUTE (4LLU*NSEC_PER_SEC)  // 4 seconds
+#define CVT_UPDATE_TICK      ((HZ/2)?:1)
+#define MAX_SAVINGS          MAX_SAVINGS_ABSOLUTE
+#define SAVINGS_LEAK_SPEED   (CVT_UPDATE_TICK/10*NSEC_PER_JIFFIES)
 
 /**
  * get_effective_prio: return the effective priority of a class local queue
@@ -361,6 +374,7 @@ static inline int get_effective_prio(ckrm_lrq_t * lrq)
        int prio;
 
        prio = lrq->local_cvt >> CLASS_QUANTIZER;  // cumulative usage
+#define URGENCY_SUPPORT 1
 #ifndef URGENCY_SUPPORT
 #warning "ACB removing urgency calculation from get_effective_prio"
 #else
@@ -414,84 +428,11 @@ static inline unsigned long task_load(struct task_struct* p)
 }
 
 /*
- * runqueue load is the local_weight of all the classes on this cpu
- * must be called with class_list_lock held
+ * moved to ckrm_sched.c
+ * but may need to make it static inline to improve performance
  */
  */
-static inline unsigned long ckrm_cpu_load(int cpu)
-{
-       struct ckrm_cpu_class *clsptr;
-       ckrm_lrq_t* lrq;
-       struct ckrm_cpu_demand_stat* l_stat;
-       int total_load = 0;
-       int load;
-
-       list_for_each_entry(clsptr,&active_cpu_classes,links) {
-               lrq =  get_ckrm_lrq(clsptr,cpu);
-               l_stat = get_cls_local_stat(clsptr,cpu);
-               load = lrq->local_weight;
-               if (l_stat->cpu_demand < load)
-                       load = l_stat->cpu_demand;
-               total_load += load;
-       }       
-       return total_load;
-}
-
-static inline void class_enqueue_task(struct task_struct *p,
-                                     prio_array_t * array)
-{
-       ckrm_lrq_t *lrq;
-       int effective_prio;
-
-       lrq = get_task_lrq(p);
-
-       cpu_demand_event(&p->demand_stat,CPU_DEMAND_ENQUEUE,0);
-       lrq->lrq_load += task_load(p);
-
-       if ((p->prio < lrq->top_priority) && (array == lrq->active))
-               set_top_priority(lrq, p->prio); 
-
-       if (! cls_in_classqueue(&lrq->classqueue_linkobj)) {
-               cpu_demand_event(get_task_lrq_stat(p),CPU_DEMAND_ENQUEUE,0);
-               effective_prio = get_effective_prio(lrq);
-               classqueue_enqueue(lrq->classqueue, &lrq->classqueue_linkobj, effective_prio);
-       } 
-
-}
-
-static inline void class_dequeue_task(struct task_struct *p,
-                                     prio_array_t * array)
-{
-       ckrm_lrq_t *lrq = get_task_lrq(p);
-       unsigned long load = task_load(p);
-
-       BUG_ON(lrq->lrq_load < load);
-       lrq->lrq_load -= load;
-
-       cpu_demand_event(&p->demand_stat,CPU_DEMAND_DEQUEUE,0);
-
-       if ((array == lrq->active) && (p->prio == lrq->top_priority)
-           && list_empty(&(array->queue[p->prio])))
-               set_top_priority(lrq,
-                                find_next_bit(array->bitmap, MAX_PRIO,
-                                              p->prio));
-}
-
-/*
- *  called after a task is switched out. Update the local cvt accounting 
- *  we need to stick with long instead of long long due to nonexistent 64-bit division
- */
-static inline void update_local_cvt(struct task_struct *p, unsigned long nsec)
-{
-       ckrm_lrq_t * lrq = get_task_lrq(p);
-
-       unsigned long cvt_inc = nsec / local_class_weight(lrq);
-
-       lrq->local_cvt += cvt_inc;
-       lrq->uncounted_ns += nsec;
-
-       update_class_priority(lrq);
-}
-
+void update_local_cvt(struct task_struct *p, unsigned long nsec);
+                                                                                
 static inline int class_preempts_curr(struct task_struct * p, struct task_struct* curr)
 {
        struct cq_node_struct* node1 = &(get_task_lrq(p)->classqueue_linkobj);
@@ -518,11 +459,14 @@ static inline int get_ckrm_rand(unsigned long val)
        return rand;
 }
 
-void update_class_cputime(int this_cpu);
+void update_class_cputime(int this_cpu, int idle);
 
 /**********************************************/
 /*          PID_LOAD_BALANCING                */
 /**********************************************/
+
+#define CPU_PID_CTRL_TICK 32
+
 struct ckrm_load_struct {
        unsigned long load_p;   /*propotional*/
        unsigned long load_i;   /*integral   */
@@ -538,26 +482,12 @@ static inline void ckrm_load_init(ckrm_load_t* ckrm_load) {
 }
 
 void ckrm_load_sample(ckrm_load_t* ckrm_load,int cpu);
-long pid_get_pressure(ckrm_load_t* ckrm_load, int local_group);
+long ckrm_get_pressure(ckrm_load_t* ckrm_load, int local_group);
 #define rq_ckrm_load(rq) (&((rq)->ckrm_load))
 
-static inline void ckrm_sched_tick(unsigned long j,int this_cpu,struct ckrm_load_struct* ckrm_load)
-{
-       read_lock(&class_list_lock);
-       
-#ifdef CONFIG_SMP
-       ckrm_load_sample(ckrm_load,this_cpu);
-#endif
 
 
-       if (! (j % CVT_UPDATE_TICK)) {
-               //              printk("ckrm_sched j=%lu\n",j);
-               classqueue_update_base(get_cpu_classqueue(this_cpu));
-               update_class_cputime(this_cpu);
-       }
+#endif /*CONFIG_CKRM_CPU_SCHEDULE */
 
 
-       read_unlock(&class_list_lock);
-}
+#endif
 
 
-#endif //CONFIG_CKRM_CPU_SCHEDULE
 
 
-#endif
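To make the new share/weight scaling concrete: with CKRM_SHARE_SHIFT = 13 and CKRM_WEIGHT_SHIFT = 8, SHARE_TO_WEIGHT() simply drops the bottom 5 bits of an effective share, and (per the comment above) a nice -20 task can add at most (120-100) >> PRIORITY_QUANTIZER = 5 bins of classqueue bonus. A standalone restatement of the macros with a few sample values (not kernel code):

#include <stdio.h>

#define CKRM_WEIGHT_SHIFT 8
#define CKRM_MAX_WEIGHT   (1 << CKRM_WEIGHT_SHIFT)    /* 256 */
#define CKRM_SHARE_SHIFT  13
#define CKRM_SHARE_MAX    (1 << CKRM_SHARE_SHIFT)     /* 8192 */

#define SHARE_TO_WEIGHT(x) ((x) >> (CKRM_SHARE_SHIFT - CKRM_WEIGHT_SHIFT))
#define WEIGHT_TO_SHARE(x) ((x) << (CKRM_SHARE_SHIFT - CKRM_WEIGHT_SHIFT))

int main(void)
{
        /* a class owning the whole share range maps to the maximum weight */
        printf("SHARE_TO_WEIGHT(%d) = %d\n", CKRM_SHARE_MAX, SHARE_TO_WEIGHT(CKRM_SHARE_MAX));          /* 256 */
        /* a 25% effective share maps to a quarter of the weight range */
        printf("SHARE_TO_WEIGHT(%d) = %d\n", CKRM_SHARE_MAX / 4, SHARE_TO_WEIGHT(CKRM_SHARE_MAX / 4));  /* 64 */
        /* anything below 32 shares truncates to weight 0, which is why the cpu
         * controller later clamps local_weight to at least 1 */
        printf("SHARE_TO_WEIGHT(31) = %d\n", SHARE_TO_WEIGHT(31));   /* 0 */
        printf("WEIGHT_TO_SHARE(1)  = %d\n", WEIGHT_TO_SHARE(1));    /* 32 */
        return 0;
}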
index 5650dd3..0caa797 100644 (file)
@@ -1,3 +1,17 @@
+/* include/linux/ckrm_tc.h - general definitions for the CKRM TaskClass
+ *
+ * Copyright (C) Hubertus Franke,  IBM Corp. 2004
+ * 
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ */
+
+#ifndef _CKRM_TC_H
+#define _CKRM_TC_H
+
 #include <linux/ckrm_rc.h>
 
 #define TASK_CLASS_TYPE_NAME "taskclass"
@@ -11,3 +25,5 @@ typedef struct ckrm_task_class {
 #define TC_MF_IDX  0
 
 extern int ckrm_forced_reclassify_pid(int pid, struct ckrm_task_class *cls);
+
+#endif // _CKRM_TC_H
index ece31a7..11067b7 100644 (file)
@@ -1603,6 +1603,15 @@ static inline void free_secdata(void *secdata)
 asmlinkage int sys_ioprio_set(int ioprio);
 asmlinkage int sys_ioprio_get(void);
 
+/* common structure for cfq & ckrm I/O controller */
+typedef struct cfqlim {
+       int nskip;
+       unsigned long navsec;
+       int timedout;
+       atomic_t sectorate;
+       u64 sec[2];
+} cfqlim_t ;
+
 
 #endif /* __KERNEL__ */
 #endif /* _LINUX_FS_H */
diff --git a/include/linux/kexec.h b/include/linux/kexec.h
new file mode 100644 (file)
index 0000000..8bd6c6b
--- /dev/null
@@ -0,0 +1,56 @@
+#ifndef LINUX_KEXEC_H
+#define LINUX_KEXEC_H
+
+#ifdef CONFIG_KEXEC
+#include <linux/types.h>
+#include <linux/list.h>
+#include <asm/kexec.h>
+
+/*
+ * This structure is used to hold the arguments that are used when loading
+ * kernel binaries.
+ */
+
+typedef unsigned long kimage_entry_t;
+#define IND_DESTINATION  0x1
+#define IND_INDIRECTION  0x2
+#define IND_DONE         0x4
+#define IND_SOURCE       0x8
+
+#define KEXEC_SEGMENT_MAX 8
+struct kexec_segment {
+       void *buf;
+       size_t bufsz;
+       void *mem;
+       size_t memsz;
+};
+
+struct kimage {
+       kimage_entry_t head;
+       kimage_entry_t *entry;
+       kimage_entry_t *last_entry;
+
+       unsigned long destination;
+
+       unsigned long start;
+       struct page *control_code_page;
+
+       unsigned long nr_segments;
+       struct kexec_segment segment[KEXEC_SEGMENT_MAX];
+
+       struct list_head control_pages;
+       struct list_head dest_pages;
+       struct list_head unuseable_pages;
+};
+
+
+/* kexec interface functions */
+extern void machine_kexec(struct kimage *image);
+extern int machine_kexec_prepare(struct kimage *image);
+extern void machine_kexec_cleanup(struct kimage *image);
+extern asmlinkage long sys_kexec(unsigned long entry, long nr_segments,
+       struct kexec_segment *segments);
+extern struct page *kimage_alloc_control_pages(struct kimage *image, unsigned int order);
+extern struct kimage *kexec_image;
+#endif
+#endif /* LINUX_KEXEC_H */
index 3fb1893..83c64bb 100644 (file)
@@ -581,7 +581,7 @@ int clear_page_dirty_for_io(struct page *page);
  */
 typedef int (*shrinker_t)(int nr_to_scan, unsigned int gfp_mask);
 
-extern long do_mprotect(struct mm_struct *mm, unsigned long start, 
+asmlinkage long do_mprotect(struct mm_struct *mm, unsigned long start, 
                        size_t len, unsigned long prot);
 
 /*
index a325de5..f2ded11 100644 (file)
@@ -52,19 +52,23 @@ enum ip_conntrack_status {
 
 #include <linux/netfilter_ipv4/ip_conntrack_tcp.h>
 #include <linux/netfilter_ipv4/ip_conntrack_icmp.h>
+#include <linux/netfilter_ipv4/ip_conntrack_proto_gre.h>
 
 /* per conntrack: protocol private data */
 union ip_conntrack_proto {
        /* insert conntrack proto private data here */
+       struct ip_ct_gre gre;
        struct ip_ct_tcp tcp;
        struct ip_ct_icmp icmp;
 };
 
 union ip_conntrack_expect_proto {
        /* insert expect proto private data here */
+       struct ip_ct_gre_expect gre;
 };
 
 /* Add protocol helper include file here */
+#include <linux/netfilter_ipv4/ip_conntrack_pptp.h>
 #include <linux/netfilter_ipv4/ip_conntrack_amanda.h>
 #include <linux/netfilter_ipv4/ip_conntrack_ftp.h>
 #include <linux/netfilter_ipv4/ip_conntrack_irc.h>
@@ -72,6 +76,7 @@ union ip_conntrack_expect_proto {
 /* per expectation: application helper private data */
 union ip_conntrack_expect_help {
        /* insert conntrack helper private data (expect) here */
+       struct ip_ct_pptp_expect exp_pptp_info;
        struct ip_ct_amanda_expect exp_amanda_info;
        struct ip_ct_ftp_expect exp_ftp_info;
        struct ip_ct_irc_expect exp_irc_info;
@@ -86,16 +91,19 @@ union ip_conntrack_expect_help {
 /* per conntrack: application helper private data */
 union ip_conntrack_help {
        /* insert conntrack helper private data (master) here */
+       struct ip_ct_pptp_master ct_pptp_info;
        struct ip_ct_ftp_master ct_ftp_info;
        struct ip_ct_irc_master ct_irc_info;
 };
 
 #ifdef CONFIG_IP_NF_NAT_NEEDED
 #include <linux/netfilter_ipv4/ip_nat.h>
+#include <linux/netfilter_ipv4/ip_nat_pptp.h>
 
 /* per conntrack: nat application helper private data */
 union ip_conntrack_nat_help {
        /* insert nat helper private data here */
+       struct ip_nat_pptp nat_pptp_info;
 };
 #endif
 
@@ -157,6 +165,12 @@ struct ip_conntrack_expect
        union ip_conntrack_expect_help help;
 };
 
+struct ip_conntrack_counter
+{
+       u_int64_t packets;
+       u_int64_t bytes;
+};
+
 struct ip_conntrack_helper;
 
 struct ip_conntrack
@@ -174,6 +188,11 @@ struct ip_conntrack
        /* Timer function; drops refcnt when it goes off. */
        struct timer_list timeout;
 
+#ifdef CONFIG_IP_NF_CT_ACCT
+       /* Accounting Information (same cache line as other written members) */
+       struct ip_conntrack_counter counters[IP_CT_DIR_MAX];
+#endif
+
        /* If we're expecting another related connection, this will be
            in expected linked list */
        struct list_head sibling_list;
@@ -249,8 +268,10 @@ extern int invert_tuplepr(struct ip_conntrack_tuple *inverse,
                          const struct ip_conntrack_tuple *orig);
 
 /* Refresh conntrack for this many jiffies */
-extern void ip_ct_refresh(struct ip_conntrack *ct,
-                         unsigned long extra_jiffies);
+extern void ip_ct_refresh_acct(struct ip_conntrack *ct,
+                              enum ip_conntrack_info ctinfo,
+                              const struct sk_buff *skb,
+                              unsigned long extra_jiffies);
 
 /* These are for NAT.  Icky. */
 /* Call me when a conntrack is destroyed. */
index 1e76911..d2bd0be 100644 (file)
@@ -14,7 +14,7 @@
 union ip_conntrack_manip_proto
 {
        /* Add other protocols here. */
-       u_int16_t all;
+       u_int32_t all;
 
        struct {
                u_int16_t port;
@@ -25,6 +25,9 @@ union ip_conntrack_manip_proto
        struct {
                u_int16_t id;
        } icmp;
+       struct {
+               u_int32_t key;
+       } gre;
 };
 
 /* The manipulable part of the tuple. */
@@ -44,7 +47,7 @@ struct ip_conntrack_tuple
                u_int32_t ip;
                union {
                        /* Add other protocols here. */
-                       u_int16_t all;
+                       u_int32_t all;
 
                        struct {
                                u_int16_t port;
@@ -55,6 +58,9 @@ struct ip_conntrack_tuple
                        struct {
                                u_int8_t type, code;
                        } icmp;
+                       struct {
+                               u_int32_t key;
+                       } gre;
                } u;
 
                /* The protocol. */
@@ -80,10 +86,16 @@ enum ip_conntrack_dir
 #ifdef __KERNEL__
 
 #define DUMP_TUPLE(tp)                                         \
-DEBUGP("tuple %p: %u %u.%u.%u.%u:%hu -> %u.%u.%u.%u:%hu\n",    \
+DEBUGP("tuple %p: %u %u.%u.%u.%u:%u -> %u.%u.%u.%u:%u\n",      \
        (tp), (tp)->dst.protonum,                               \
-       NIPQUAD((tp)->src.ip), ntohs((tp)->src.u.all),          \
-       NIPQUAD((tp)->dst.ip), ntohs((tp)->dst.u.all))
+       NIPQUAD((tp)->src.ip), ntohl((tp)->src.u.all),          \
+       NIPQUAD((tp)->dst.ip), ntohl((tp)->dst.u.all))
+
+#define DUMP_TUPLE_RAW(x)                                              \
+       DEBUGP("tuple %p: %u %u.%u.%u.%u:0x%08x -> %u.%u.%u.%u:0x%08x\n",\
+       (x), (x)->dst.protonum,                                         \
+       NIPQUAD((x)->src.ip), ntohl((x)->src.u.all),                    \
+       NIPQUAD((x)->dst.ip), ntohl((x)->dst.u.all))
 
 #define CTINFO2DIR(ctinfo) ((ctinfo) >= IP_CT_IS_REPLY ? IP_CT_DIR_REPLY : IP_CT_DIR_ORIGINAL)
 
index d60fafc..5460e94 100644 (file)
@@ -51,6 +51,8 @@ extern void machine_restart(char *cmd);
 extern void machine_halt(void);
 extern void machine_power_off(void);
 
+extern void machine_shutdown(void);
+
 #endif
 
 #endif /* _LINUX_REBOOT_H */
index dd50052..eda93cb 100644 (file)
@@ -607,7 +607,6 @@ struct task_struct {
        spinlock_t  ckrm_tsklock; 
        void       *ce_data;
 #ifdef CONFIG_CKRM_TYPE_TASKCLASS
-       // .. Hubertus should change to CONFIG_CKRM_TYPE_TASKCLASS 
        struct ckrm_task_class *taskclass;
        struct list_head        taskclass_link;
 #ifdef CONFIG_CKRM_CPU_SCHEDULE
index 111bb73..5156e43 100644 (file)
@@ -1106,6 +1106,20 @@ extern void             skb_copy_and_csum_dev(const struct sk_buff *skb, u8 *to);
 extern void           skb_split(struct sk_buff *skb,
                                 struct sk_buff *skb1, const u32 len);
 
+static inline void *skb_header_pointer(const struct sk_buff *skb, int offset,
+                                      int len, void *buffer)
+{
+       int hlen = skb_headlen(skb);
+
+       if (offset + len <= hlen)
+               return skb->data + offset;
+
+       if (skb_copy_bits(skb, offset, buffer, len) < 0)
+               return NULL;
+
+       return buffer;
+}
+
 extern void skb_init(void);
 extern void skb_add_mtu(int mtu);
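skb_header_pointer() returns a pointer straight into the linear skb data when the requested range is already there, and otherwise copies the bytes into the caller-supplied buffer; callers must therefore use the returned pointer (not the buffer) and check for NULL on truncated packets. A sketch of the typical calling pattern in netfilter-style code (kernel context assumed, i.e. <linux/skbuff.h> and <linux/udp.h>; the function name here is hypothetical):

/* Read a UDP header that may live outside the linear area of the skb. */
static int peek_udp_ports(const struct sk_buff *skb, unsigned int dataoff,
                          u_int16_t *sport, u_int16_t *dport)
{
        struct udphdr _hdr, *hp;

        /* returns skb->data + dataoff when the header is linear, otherwise
         * copies sizeof(_hdr) bytes into _hdr and returns &_hdr */
        hp = skb_header_pointer(skb, dataoff, sizeof(_hdr), &_hdr);
        if (hp == NULL)
                return -1;              /* truncated packet */

        *sport = ntohs(hp->source);
        *dport = ntohs(hp->dest);
        return 0;
}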
 
index 64ca2fc..5d28bb7 100644 (file)
@@ -214,6 +214,18 @@ config CKRM_MEM_LRUORDER_CHANGE
          Changing this to yes reduces the checking overhead but violates the approximate
          LRU order that is maintained by the paging subsystem.
 
+config CKRM_CPU_SCHEDULE_AT_BOOT
+       bool "Turn on at boot time"
+       depends on CKRM_CPU_SCHEDULE
+       default n
+       help
+         Enable CKRM CPU Scheduler at boot time. Otherwise
+         it can be turned on dynamically at runtime. If not
+         turned on the default Linux Scheduler behavior 
+         will be obtained.
+
+         Say N if unsure, Y to use this feature
+
 config CKRM_TYPE_SOCKETCLASS
        bool "Class Manager for socket groups"
        depends on CKRM
diff --git a/kernel/.cvsignore b/kernel/.cvsignore
new file mode 100644 (file)
index 0000000..21426e9
--- /dev/null
@@ -0,0 +1,2 @@
+config_data.gz
+config_data.h
index ec50010..455ec1e 100644 (file)
@@ -23,6 +23,7 @@ obj-$(CONFIG_MODULE_SIG) += module-verify.o
 obj-$(CONFIG_KALLSYMS) += kallsyms.o
 obj-$(CONFIG_PM) += power/
 obj-$(CONFIG_BSD_PROCESS_ACCT) += acct.o
+obj-$(CONFIG_KEXEC) += kexec.o
 obj-$(CONFIG_COMPAT) += compat.o
 obj-$(CONFIG_IKCONFIG) += configs.o
 obj-$(CONFIG_IKCONFIG_PROC) += configs.o
index b325309..4956dcb 100644 (file)
@@ -8,6 +8,6 @@ endif
     obj-$(CONFIG_CKRM_TYPE_TASKCLASS)  += ckrm_tc.o
     obj-$(CONFIG_CKRM_RES_NUMTASKS)    += ckrm_numtasks.o
     obj-$(CONFIG_CKRM_TYPE_SOCKETCLASS) += ckrm_sockc.o
-    obj-$(CONFIG_CKRM_RES_LISTENAQ)    += ckrm_laq.o
+    obj-$(CONFIG_CKRM_RES_LISTENAQ)    += ckrm_listenaq.o
     obj-$(CONFIG_CKRM_CPU_SCHEDULE)     += ckrm_cpu_class.o ckrm_cpu_monitor.o
     obj-$(CONFIG_CKRM_RES_MEM)                 += ckrm_mem.o
index f1cfb26..e732fdf 100644 (file)
@@ -82,6 +82,7 @@ inline unsigned int is_res_regd(struct ckrm_classtype *clstype, int resid)
            );
 }
 
+static 
 struct ckrm_res_ctlr *ckrm_resctlr_lookup(struct ckrm_classtype *clstype,
                                          const char *resname)
 {
@@ -101,10 +102,8 @@ struct ckrm_res_ctlr *ckrm_resctlr_lookup(struct ckrm_classtype *clstype,
        return NULL;
 }
 
-EXPORT_SYMBOL(ckrm_resctlr_lookup);
-
 /* given a classname return the class handle and its classtype*/
 /* given a classname return the class handle and its classtype*/
-void *ckrm_classobj(char *classname, int *classTypeID)
+void *ckrm_classobj(const char *classname, int *classTypeID)
 {
        int i;
 
 {
        int i;
 
@@ -864,7 +863,10 @@ int ckrm_class_show_shares(struct ckrm_core_class *core, struct seq_file *seq)
                atomic_inc(&clstype->nr_resusers[i]);
                rcbs = clstype->res_ctlrs[i];
                if (rcbs && rcbs->get_share_values) {
-                       (*rcbs->get_share_values) (core->res_class[i], &shares);
+                       int rc = (*rcbs->get_share_values)(core->res_class[i], 
+                                                          &shares);
+                       if (rc == -ENOSYS) 
+                               continue;
                        seq_printf(seq,"res=%s,guarantee=%d,limit=%d,"
                                   "total_guarantee=%d,max_limit=%d\n",
                                   rcbs->res_name, shares.my_guarantee,
index 917875b..1bf482f 100644 (file)
 #include <linux/ckrm_sched.h>
 #include <linux/ckrm_classqueue.h>
 #include <linux/seq_file.h>
+#include <linux/parser.h>
+
+#define CPU_CTRL_NAME  "cpu"
 
 struct ckrm_res_ctlr cpu_rcbs;
 
 
+#define CKRM_CPU_USAGE_DETAIL_MAX 3
+static int usage_detail = 3;  /* 0: show usage 
+                              * 1: show settings
+                              * 2: show effectives
+                              * 3: show per runqueue stats
+                              */
+
+static int ckrm_cpu_set_mode(enum ckrm_sched_mode mode);
+
+/*
+ * update effective share setting after:
+ * -- remove class
+ * -- change class share
+ * we don't need to call update_effectives() when adding a new class, since 
+ * the default grt of a new class is 0
+ * CAUTION: might need a lock here
+ */
+static inline void update_class_effectives(void) 
+{
+       //      update_effectives();
+       ckrm_cpu_monitor(0);
+}
+
 /**
  * insert_cpu_class - insert a class to active_cpu_class list
  *
@@ -38,49 +64,81 @@ static inline void insert_cpu_class(struct ckrm_cpu_class *cls)
 /*
  *  initialize a class object and its local queues
  */
+
+CVT_t get_min_cvt_locking(int cpu);
+ckrm_lrq_t *rq_get_dflt_lrq(int cpu);
+
+static void init_cpu_class_lrq(struct ckrm_cpu_class *cls, 
+                              int cpu, int isdflt)
+{
+       int j,k;
+       ckrm_lrq_t *queue = cls->local_queues[cpu];
+
+       queue->active   = queue->arrays;
+       queue->expired  = queue->arrays+1;
+       
+       for (j = 0; j < 2; j++) {
+               prio_array_t *array = queue->arrays + j;
+               for (k = 0; k < MAX_PRIO; k++) {
+                       INIT_LIST_HEAD(array->queue + k);
+                       __clear_bit(k, array->bitmap);
+               }
+               // delimiter for bitsearch
+               __set_bit(MAX_PRIO, array->bitmap);
+               array->nr_active = 0;
+       }
+       
+       queue->expired_timestamp = 0;
+       queue->best_expired_prio = MAX_PRIO;
+       
+       queue->cpu_class = cls;
+       queue->classqueue = get_cpu_classqueue(cpu);
+       queue->top_priority = MAX_PRIO;
+       cq_node_init(&queue->classqueue_linkobj);
+       queue->local_cvt = isdflt ? 0 : get_min_cvt_locking(cpu);
+       queue->lrq_load = 0;
+       queue->local_weight = cpu_class_weight(cls);
+       if (queue->local_weight == 0)
+               queue->local_weight = 1;
+       queue->over_weight = 0;
+       queue->skewed_weight = CKRM_MAX_WEIGHT/2; /*otherwise class might starve on start*/
+       queue->uncounted_ns = 0;
+       queue->savings = 0;
+       queue->magic = CKRM_LRQ_MAGIC;
+}
+
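For reference, the delimiter bit that init_cpu_class_lrq() sets at MAX_PRIO exists so that the first-set-bit search over the priority bitmap always terminates, even when every priority list is empty. A standalone user-space sketch of that pattern (MAX_PRIO and the helpers here are illustrative stand-ins, not the kernel's implementations):

#include <stdio.h>
#include <string.h>

#define MAX_PRIO 8                              /* assumed small value for the demo */
#define NBITS    (MAX_PRIO + 1)

static void set_bit_(int nr, unsigned char *map)   { map[nr / 8] |=  (unsigned char)(1u << (nr % 8)); }
static void clear_bit_(int nr, unsigned char *map) { map[nr / 8] &= (unsigned char)~(1u << (nr % 8)); }

/* linear stand-in for sched_find_first_bit() */
static int first_bit(const unsigned char *map)
{
        int i;
        for (i = 0; i < NBITS; i++)
                if (map[i / 8] & (1u << (i % 8)))
                        return i;
        return NBITS;
}

int main(void)
{
        unsigned char bitmap[(NBITS + 7) / 8];

        memset(bitmap, 0, sizeof(bitmap));
        set_bit_(MAX_PRIO, bitmap);             /* delimiter: the search never runs past it */
        printf("empty run queue  -> prio %d (== MAX_PRIO)\n", first_bit(bitmap));

        set_bit_(3, bitmap);                    /* enqueue something at priority 3 */
        printf("after enqueue    -> prio %d\n", first_bit(bitmap));

        clear_bit_(3, bitmap);                  /* dequeue it again */
        printf("after dequeue    -> prio %d\n", first_bit(bitmap));
        return 0;
}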
 void init_cpu_class(struct ckrm_cpu_class *cls,ckrm_shares_t* shares) 
 {
-       int i,j,k;      
-       prio_array_t *array;    
-       ckrm_lrq_t* queue;
+       int i;      
+       int isdflt;
+       struct ckrm_cpu_class *dfltcls;
+
+       dfltcls = get_default_cpu_class();
+
+       isdflt = (cls==dfltcls);
 
        cls->shares = *shares;
        cls->cnt_lock = SPIN_LOCK_UNLOCKED;
 
-       ckrm_cpu_stat_init(&cls->stat);
+       ckrm_cpu_stat_init(&cls->stat,isdflt ? CKRM_SHARE_MAX : 1);
        ckrm_usage_init(&cls->usage);
        cls->magic = CKRM_CPU_CLASS_MAGIC;
 
-       for (i = 0 ; i < NR_CPUS ; i++) {
-               queue = &cls->local_queues[i];
-               queue->active  = queue->arrays;
-               queue->expired = queue->arrays+1;
-               
-               for (j = 0; j < 2; j++) {
-                       array = queue->arrays + j;
-                       for (k = 0; k < MAX_PRIO; k++) {
-                               INIT_LIST_HEAD(array->queue + k);
-                               __clear_bit(k, array->bitmap);
-                       }
-                       // delimiter for bitsearch
-                       __set_bit(MAX_PRIO, array->bitmap);
-                       array->nr_active = 0;
+       memset(cls->local_queues,0,NR_CPUS*sizeof(ckrm_lrq_t*));
+       
+       if (isdflt) {
+               for (i=0; i< NR_CPUS; i++) {
+                       cls->local_queues[i] = rq_get_dflt_lrq(i);
+                       init_cpu_class_lrq(cls,i,1);
+               }
+       } else {
+               for_each_cpu(i) {
+                       cls->local_queues[i] = kmalloc(sizeof(ckrm_lrq_t),
+                                                      GFP_KERNEL);
+                       BUG_ON(cls->local_queues[i]==NULL);
+                       init_cpu_class_lrq(cls,i,0);
                }
-
-               queue->expired_timestamp = 0;
-               
-               queue->cpu_class = cls;
-               queue->classqueue = get_cpu_classqueue(i);
-               queue->top_priority = MAX_PRIO;
-               cq_node_init(&queue->classqueue_linkobj);
-               queue->local_cvt = 0;
-               queue->lrq_load = 0;
-               queue->local_weight = cpu_class_weight(cls);
-               queue->uncounted_ns = 0;
-               queue->savings = 0;
-               queue->magic = 0x43FF43D7;
        }
 
-       // add to class list
        write_lock(&class_list_lock);
        insert_cpu_class(cls);
        write_unlock(&class_list_lock);
@@ -100,14 +158,14 @@ struct ckrm_cpu_class * ckrm_get_cpu_class(struct ckrm_core_class *core)
 {
        struct ckrm_cpu_class * cls;
        cls = ckrm_get_res_class(core, cpu_rcbs.resid, struct ckrm_cpu_class);
-       if (valid_cpu_class(cls))
-               return cls;
+       if (valid_cpu_class(cls))
+               return (ckrm_cpu_enabled() ? cls : get_default_cpu_class());
        else
                return NULL;
 }
 
-
-void* ckrm_alloc_cpu_class(struct ckrm_core_class *core, struct ckrm_core_class *parent) 
+void* ckrm_alloc_cpu_class(struct ckrm_core_class *core, 
+                          struct ckrm_core_class *parent) 
 {              
        struct ckrm_cpu_class *cls;
 
@@ -128,7 +186,7 @@ void* ckrm_alloc_cpu_class(struct ckrm_core_class *core, struct ckrm_core_class
                        set_default_share(&shares);
                        init_cpu_class(cls,&shares);
                        cls->core = core;
-                       cls->parent = parent;
+                       cls->parent = parent;                   
                }
        } else
                printk(KERN_ERR"alloc_cpu_class failed\n");
@@ -136,15 +194,14 @@ void* ckrm_alloc_cpu_class(struct ckrm_core_class *core, struct ckrm_core_class
        return cls;
 }              
 
-/*
- * hzheng: this is not a stable implementation
- *         need to check race condition issue here
- */            
+void ckrm_cpu_class_queue_delete_sync(struct ckrm_cpu_class *clsptr);
+
 static void ckrm_free_cpu_class(void *my_res) 
 {                      
        struct ckrm_cpu_class *cls = my_res, *parres, *childres;
        ckrm_core_class_t *child = NULL;
        int maxlimit;
+       int i;
 
        if (!cls) 
                return;
 
@@ -179,10 +236,19 @@ static void ckrm_free_cpu_class(void *my_res)
        list_del(&cls->links);
        write_unlock(&class_list_lock);
 
+       ckrm_cpu_class_queue_delete_sync(cls);
+
+       for_each_cpu(i) {
+               ckrm_lrq_t *lrq = get_ckrm_lrq(cls,i);
+               if (!lrq) continue;
+               lrq->magic = -99;
+               kfree(lrq);
+       }
        kfree(cls);
 
-       //call ckrm_cpu_monitor after class removed
-       ckrm_cpu_monitor(0);
+       //call ckrm_cpu_monitor after class is removed
+       if (ckrm_cpu_enabled())
+               update_class_effectives();
 }                              
 
 /*
@@ -194,8 +260,12 @@ int ckrm_cpu_set_share(void *my_res, struct ckrm_shares *new_share)
         struct ckrm_shares *cur = &cls->shares, *par;
         int rc = -EINVAL;
 
-        if (!cls) 
-                return rc;
+       if (ckrm_cpu_disabled())
+               return -ENOSYS;
+        if (!cls)
+               return rc;
+       if (new_share->total_guarantee > CKRM_SHARE_MAX)
+               return -E2BIG;
 
         if (cls->parent) {
                 parres = ckrm_get_cpu_class(cls->parent);
 
@@ -215,7 +285,7 @@ int ckrm_cpu_set_share(void *my_res, struct ckrm_shares *new_share)
                new_share->my_guarantee = 0;
 
        rc = set_shares(new_share, cur, par);
-       if (cur->my_limit == CKRM_SHARE_DONTCARE)
+       if (!rc && cur->my_limit == CKRM_SHARE_DONTCARE)
                cur->my_limit = cur->max_limit;
 
 
@@ -225,7 +295,7 @@ int ckrm_cpu_set_share(void *my_res, struct ckrm_shares *new_share)
        }
 
        //call ckrm_cpu_monitor after changes are changed
-       ckrm_cpu_monitor(0);
+       update_class_effectives();
 
        return rc;
 }                                                      
 
@@ -235,22 +305,90 @@ static int ckrm_cpu_get_share(void *my_res,
 {                      
        struct ckrm_cpu_class *cls = my_res;
 
-       if (!cls) 
+       if (ckrm_cpu_disabled())
+               return -ENOSYS;
+        if (!cls)
                return -EINVAL;
+
        *shares = cls->shares;
        return 0;
 }                              
 
+/*
+ *   ckrm_cpu_get_usage():
+ *     obtain a sequence of <num> usage values
+ *     returns the number of usage values reported.
+ *
+ *     report IN:  specifies the sequence of jiffies for which to report
+ *                 must be ordered (smallest first)
+ *            OUT: returns the usage in each field
+ *
+ */
+
+
+int ckrm_cpu_get_usage(struct ckrm_cpu_class* clsptr, 
+                      int num, ulong report[])
+{
+       struct ckrm_usage* usage = &clsptr->usage;
+       unsigned long long total = 0;
+       int i, idx, cur, num_ofs;
+
+       num_ofs = cur = i = 0;
+       idx = usage->sample_pointer;    
+
+       for ( num_ofs = 0; num_ofs < num ; num_ofs++ ) {
+               int nr_samples;
+               int duration = report[num_ofs]; 
+               unsigned long long totval = 0;
+
+               nr_samples = duration/USAGE_SAMPLE_FREQ?:1;
+               
+               if (nr_samples > USAGE_MAX_HISTORY)
+                       nr_samples = USAGE_MAX_HISTORY;
+
+               for ( ; i< nr_samples; i++) {
+                       if (! idx)
+                               idx = USAGE_MAX_HISTORY;
+                       idx --;
+                       total += usage->samples[idx];
+               }
+               totval = total * 1000;
+               do_div(totval,NS_PER_SAMPLE);
+               do_div(totval,nr_samples * cpus_weight(cpu_online_map));
+               report[num_ofs] = totval;
+       }
+
+        return num;
+}
+
 int ckrm_cpu_get_stats(void *my_res, struct seq_file * sfile)
 {
        struct ckrm_cpu_class *cls = my_res;
        struct ckrm_cpu_class_stat* stat = &cls->stat;
        ckrm_lrq_t* lrq;
        int i;
 int ckrm_cpu_get_stats(void *my_res, struct seq_file * sfile)
 {
        struct ckrm_cpu_class *cls = my_res;
        struct ckrm_cpu_class_stat* stat = &cls->stat;
        ckrm_lrq_t* lrq;
        int i;
+       ulong usage[3] = { 2*HZ, 10*HZ, 60*HZ };
 
 
-       if (!cls) 
+       if (!cls || ckrm_cpu_disabled())
                return -EINVAL;
 
                return -EINVAL;
 
+       ckrm_cpu_get_usage(cls,3,usage);
+
+       /* after full stabilization this will become the only cpu usage statistic
+        */
+
+       seq_printf(sfile, "cpu-usage(2,10,60)= %lu %lu %lu\n",
+                  usage[0],usage[1],usage[2]);
+
+       if (usage_detail < 1) 
+               return 0;
+
+       /* for the extended statistics, we can decide later whether to make them
+        * available through config options; either way they should be reported
+        * in a more concise form
+        * during stabilization, this is OK
+        */
+
        seq_printf(sfile, "-------- CPU Class Status Start---------\n");
        seq_printf(sfile, "Share:\n\tgrt= %d limit= %d total_grt= %d max_limit= %d\n",
                   cls->shares.my_guarantee,
@@ -261,26 +399,35 @@ int ckrm_cpu_get_stats(void *my_res, struct seq_file * sfile)
                   cls->shares.unused_guarantee,
                   cls->shares.cur_max_limit);
 
+       if (usage_detail < 2) 
+               goto out;
+
        seq_printf(sfile, "Effective:\n\tegrt= %d\n",stat->egrt);
        seq_printf(sfile, "\tmegrt= %d\n",stat->megrt);
        seq_printf(sfile, "\tehl= %d\n",stat->ehl);
        seq_printf(sfile, "\tmehl= %d\n",stat->mehl);
        seq_printf(sfile, "\teshare= %d\n",stat->eshare);
-       seq_printf(sfile, "\tmeshare= %d\n",cpu_class_weight(cls));
+       seq_printf(sfile, "\tmeshare= %d\n",stat->meshare);
        seq_printf(sfile, "\tmax_demand= %lu\n",stat->max_demand);
        seq_printf(sfile, "\ttotal_ns= %llu\n",stat->total_ns);
-       seq_printf(sfile, "\tusage(2,10,60)= %d %d %d\n",
-                  get_ckrm_usage(cls,2*HZ),
-                  get_ckrm_usage(cls,10*HZ),
-                  get_ckrm_usage(cls,60*HZ)
-                  );
+       seq_printf(sfile, "\tusage(2,10,60)= %lu %lu %lu\n",
+                  usage[0],usage[1],usage[2]);
+
+       if (usage_detail < 3) 
+               goto out;
+
+       /* provide per run queue information */
        for_each_online_cpu(i) {
                lrq = get_ckrm_lrq(cls,i);              
-               seq_printf(sfile, "\tlrq %d demand= %lu weight= %d lrq_load= %lu cvt= %llu sav= %llu\n",i,stat->local_stats[i].cpu_demand,local_class_weight(lrq),lrq->lrq_load,lrq->local_cvt,lrq->savings);
+               seq_printf(sfile, "\tlrq %d demand= %lu weight= %d "
+                          "lrq_load= %lu cvt= %llu sav= %llu\n",
+                          i,stat->local_stats[i].cpu_demand,
+                          local_class_weight(lrq),lrq->lrq_load,
+                          lrq->local_cvt,lrq->savings);
        }
 
+out:
        seq_printf(sfile, "-------- CPU Class Status END ---------\n");
-
        return 0;
 }
 
@@ -296,10 +443,34 @@ void ckrm_cpu_change_class(void *task, void *old, void *new)
        if (!task || ! old || !new)
                return; 
 
+       if (ckrm_cpu_disabled())
+               newcls = get_default_cpu_class();
        _ckrm_cpu_change_class(tsk,newcls);
 }                                                      
 
-/*dummy function, not used*/
+enum config_token_t {
+       config_usage_detail,   /* define usage level                      */
+       config_disable,        /* always use default linux scheduling     */
+                              /* effectively disables the ckrm scheduler */
+       config_enable,         /* always uses ckrm scheduling behavior    */
+       config_err             /* parsing error */
+};
+
+#define CKRM_SCHED_MODE_DISABLED_STR "disabled"
+#define CKRM_SCHED_MODE_ENABLED_STR  "enabled"
+
+static char *ckrm_sched_mode_str[] = { 
+               CKRM_SCHED_MODE_DISABLED_STR,
+               CKRM_SCHED_MODE_ENABLED_STR
+};
+
+static match_table_t config_tokens = {
+       { config_disable,      "mode="CKRM_SCHED_MODE_DISABLED_STR },
+       { config_enable,       "mode="CKRM_SCHED_MODE_ENABLED_STR  },
+       { config_usage_detail, "usage_detail=%u"                   },
+       { config_err,          NULL                                }
+};
+
 static int ckrm_cpu_show_config(void *my_res, struct seq_file *sfile)
 {
        struct ckrm_cpu_class *cls = my_res;
@@ -307,23 +478,61 @@ static int ckrm_cpu_show_config(void *my_res, struct seq_file *sfile)
        if (!cls) 
                return -EINVAL;
 
-       seq_printf(sfile, "cls=%s,parameter=somevalue\n","ckrm_cpu class");
+       seq_printf(sfile, "res=%s,mode=%s",
+                  CPU_CTRL_NAME,ckrm_sched_mode_str[ckrm_sched_mode]);
+       if (!ckrm_cpu_disabled())  /* enabled || mixed */
+               seq_printf(sfile, ",usage_detail=%u",usage_detail);
+       seq_printf(sfile,"\n");
        return 0;
 }
 
-/*dummy function, not used*/
 static int ckrm_cpu_set_config(void *my_res, const char *cfgstr)
 {
        struct ckrm_cpu_class *cls = my_res;
+       char *p;
+       char **cfgstr_p = (char**)&cfgstr;
+       substring_t args[MAX_OPT_ARGS];
+       int option,rc;
+       enum ckrm_sched_mode new_sched_mode;
 
        if (!cls) 
                return -EINVAL;
 
-       printk(KERN_DEBUG "ckrm_cpu config='%s'\n",cfgstr);
-       return 0;
+
+       new_sched_mode = ckrm_sched_mode;       
+       rc = 0;
+
+       while ((p = strsep(cfgstr_p, ",")) != NULL) {
+               int token;
+               if (!*p)
+                       continue;
+               
+               token = match_token(p, config_tokens, args);
+               switch (token) {
+               case config_usage_detail:
+                       if (ckrm_cpu_disabled() || 
+                           (match_int(&args[0], &option)) ||
+                           (option > CKRM_CPU_USAGE_DETAIL_MAX))
+                       {
+                               return -EINVAL;
+                       }
+                       usage_detail = option;
+                       break;
+               case config_disable:
+                       new_sched_mode = CKRM_SCHED_MODE_DISABLED;
+                       break;
+               case config_enable:
+                       new_sched_mode = CKRM_SCHED_MODE_ENABLED;
+                       break;
+               case config_err:
+                       return -EINVAL;
+               }
+       }
+       rc = ckrm_cpu_set_mode(new_sched_mode);
+       return rc;
 }
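The accepted configuration strings are comma-separated tokens of the form "mode=enabled", "mode=disabled" and "usage_detail=<0..3>". A rough user-space model of that parsing, with strsep()/sscanf() standing in for the kernel's match_token()/match_int() from <linux/parser.h> (the helper names and exact behaviour here are illustrative only):

#include <stdio.h>
#include <string.h>

enum mode { MODE_DISABLED, MODE_ENABLED };

static int parse_cpu_config(char *cfg, enum mode *mode, int *usage_detail)
{
        char *p;

        while ((p = strsep(&cfg, ",")) != NULL) {
                int v;

                if (!*p)
                        continue;
                if (strcmp(p, "mode=disabled") == 0)
                        *mode = MODE_DISABLED;
                else if (strcmp(p, "mode=enabled") == 0)
                        *mode = MODE_ENABLED;
                else if (sscanf(p, "usage_detail=%d", &v) == 1 && v >= 0 && v <= 3)
                        *usage_detail = v;
                else
                        return -1;      /* mirrors returning -EINVAL on config_err */
        }
        return 0;
}

int main(void)
{
        char buf[] = "mode=enabled,usage_detail=2";
        enum mode m = MODE_DISABLED;
        int detail = 3;

        if (parse_cpu_config(buf, &m, &detail) == 0)
                printf("mode=%s usage_detail=%d\n",
                       m == MODE_ENABLED ? "enabled" : "disabled", detail);
        return 0;
}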
        
 struct ckrm_res_ctlr cpu_rcbs = {
-       .res_name          = "cpu",
+       .res_name          = CPU_CTRL_NAME,
        .res_hdepth        = 1,
        .resid             = -1,
        .res_alloc         = ckrm_alloc_cpu_class,
@@ -364,14 +573,69 @@ void init_cpu_classes(void)
 
        //init classqueues for each processor
        for (i=0; i < NR_CPUS; i++)
 
-               classqueue_init(get_cpu_classqueue(i)); 
+               classqueue_init(get_cpu_classqueue(i),ckrm_cpu_enabled()); 
 
 
-       /*
-        * hzheng: initialize the default cpu class
-        *  required for E14/E15 since ckrm_init is called after sched_init
-        */
        ckrm_alloc_cpu_class(NULL,NULL);
 }
 
+void ckrm_cpu_class_queue_update(int on);
+void ckrm_cpu_start_monitor(void);
+void ckrm_cpu_kill_monitor(void);
+
+static int ckrm_cpu_set_mode(enum ckrm_sched_mode mode) 
+{
+        struct task_struct *proc, *tsk;
+       struct ckrm_cpu_class *new_cls = NULL;
+       int i;
+
+       if (mode == ckrm_sched_mode)
+               return 0;
+
+       printk("ckrm_cpu_set_mode from <%s> to <%s> pid=%d\n",
+                  ckrm_sched_mode_str[ckrm_sched_mode],
+                  ckrm_sched_mode_str[mode], 
+                  current->pid);
+
+       if (mode == CKRM_SCHED_MODE_DISABLED) {
+               ckrm_cpu_kill_monitor();
+               new_cls = get_default_cpu_class();
+       } else {
+               ckrm_cpu_class_queue_update(1);
+       }
+                             
+       /* run through the task list twice to catch everyone:
+        * existing tasks first, then any that raced with the mode switch
+        */
+
+        read_lock(&tasklist_lock);
+
+       ckrm_sched_mode = mode;
+       /* we have to run through the list twice
+        * first catch all existing tasks
+        * and then deal with some potential race condition
+        */
+       for ( i=2 ; i-- ; ) {
+               /* lock class_list_lock ? */
+       
+               do_each_thread(proc, tsk) {
+                       if (mode == CKRM_SCHED_MODE_ENABLED) {
+                               new_cls = ckrm_get_res_class(class_core(tsk->taskclass),
+                                                            cpu_rcbs.resid,
+                                                            struct ckrm_cpu_class);
+                       }       
+                       _ckrm_cpu_change_class(tsk,new_cls);
+               } while_each_thread(proc, tsk);
+       }
+        read_unlock(&tasklist_lock);
+
+       if (mode == CKRM_SCHED_MODE_DISABLED) 
+               ckrm_cpu_class_queue_update(0);
+       else 
+               ckrm_cpu_start_monitor();
+       return 0;
+}
 
 EXPORT_SYMBOL(ckrm_get_cpu_class);
 
+
+
+
index d8c199a..d8d6bd3 100644 (file)
 #include <asm/div64.h>
 #include <linux/ckrm_sched.h>
 
+// #define CONFIG_CKRM_SUPPORT_MAXLIMITS
+
 #define CPU_MONITOR_INTERVAL (HZ) /*how often do we adjust the shares*/
-#define CKRM_SHARE_MAX (1<<CKRM_SHARE_ACCURACY)
 
 #define CKRM_CPU_DEMAND_RUN 0
 #define CKRM_CPU_DEMAND_SLEEP 1
 
 #define CKRM_CPU_DEMAND_RUN 0
 #define CKRM_CPU_DEMAND_SLEEP 1
-//sample task cpu demand every 64ms
-#define CPU_DEMAND_TASK_RECALC  (64000000LL)
-#define CPU_DEMAND_CLASS_RECALC (256000000LL)
+//sample task cpu demand every 32ms
+#define CPU_DEMAND_TASK_RECALC  ( 32*1000*1000LL)
+#define CPU_DEMAND_CLASS_RECALC (256*1000*1000LL)
 #define CPU_DEMAND_TP_CLASS 0
 #define CPU_DEMAND_TP_TASK 1
 
 #define CPU_DEMAND_TP_CLASS 0
 #define CPU_DEMAND_TP_TASK 1
 
+static void update_ckrm_idle(unsigned long surplus);
+
+void cpu_demand_check_sleep(struct ckrm_cpu_class_stat *stat, int cpu);
+int alloc_surplus(struct ckrm_core_class *root_core);
 extern struct ckrm_cpu_class *ckrm_get_cpu_class(struct ckrm_core_class *core);
-void update_ckrm_idle(unsigned long surplus);
 
 /*interface to share definition*/
 
+static inline int get_my_grt(struct ckrm_cpu_class *cls)
+{
+       return cls->shares.unused_guarantee;
+}
+
 static inline int get_soft_limit(struct ckrm_cpu_class *cls)
 {
        return cls->shares.my_limit;
@@ -63,6 +72,57 @@ static inline int get_myhard_limit(struct ckrm_cpu_class *cls)
        return cls->shares.total_guarantee;
 }
 
+static inline void set_eshare(struct ckrm_cpu_class_stat *stat,
+                                      int new_share)
+{
+       if (!new_share)
+               new_share = 1;
+
+       BUG_ON(new_share < 0);
+       stat->eshare = new_share;
+}
+
+static inline void set_meshare(struct ckrm_cpu_class_stat *stat,
+                                           int new_share)
+{
+       if (!new_share)
+               new_share = 1;
+
+       BUG_ON(new_share < 0);
+       stat->meshare = new_share;
+}
+
+/**
+ *get_self_cpu_demand - get cpu demand of the class itself (excluding children)
+ *
+ * self_cpu_demand = sum(cpu demand of all local queues) 
+ */
+static inline unsigned long get_self_cpu_demand(struct ckrm_cpu_class_stat *stat)
+{
+       int cpu_demand = 0;
+       int i;
+       int cpuonline = 0;
+
+       for_each_online_cpu(i) {
+               cpu_demand_check_sleep(stat,i);
+               cpu_demand += stat->local_stats[i].cpu_demand;
+               cpuonline ++;
+       }
+
+       return (cpu_demand/cpuonline);
+}
+
+/*
+ * my max demand = min(cpu_demand, my effective hard limit)
+ */
+static inline unsigned long get_mmax_demand(struct ckrm_cpu_class_stat* stat) 
+{
+       unsigned long mmax_demand = get_self_cpu_demand(stat);
+       if (mmax_demand > stat->mehl)
+               mmax_demand = stat->mehl;
+
+       return mmax_demand;
+}
 
 static inline void cpu_demand_stat_init(struct ckrm_cpu_demand_stat* local_stat, int type)
 {
 
@@ -85,7 +145,7 @@ static inline void cpu_demand_stat_init(struct ckrm_cpu_demand_stat* local_stat,
        }
 }
 
-void ckrm_cpu_stat_init(struct ckrm_cpu_class_stat *stat)
+void ckrm_cpu_stat_init(struct ckrm_cpu_class_stat *stat, int eshares)
 {
        int i;
 
@@ -93,7 +153,7 @@ void ckrm_cpu_stat_init(struct ckrm_cpu_class_stat *stat)
        stat->total_ns = 0;
        stat->max_demand = 0;
 
-       for (i=0; i< NR_CPUS; i++) {
+       for (i=0; i<NR_CPUS; i++) {
                cpu_demand_stat_init(&stat->local_stats[i],CPU_DEMAND_TP_CLASS);
        }
 
@@ -102,10 +162,517 @@ void ckrm_cpu_stat_init(struct ckrm_cpu_class_stat *stat)
        stat->ehl = CKRM_SHARE_MAX; /*default: no limit*/
        stat->mehl = CKRM_SHARE_MAX; /*default: no limit */
 
-       stat->eshare = CKRM_SHARE_MAX;
-       stat->meshare = CKRM_SHARE_MAX;
+       stat->eshare = eshares;
+       stat->meshare = eshares;
+
+       stat->has_savings = 0;  
+       stat->demand_per_share = 0;
+
+}
+
+#if 0  // keep handy for debugging if necessary
+void ckrm_cpu_class_dump(struct ckrm_cpu_class *clsptr,int num)
+{
+       struct ckrm_cpu_class_stat* stat = &clsptr->stat;
+       printk("%d> %p[%d] mg=%d lim=%d tg=%d maxlim=%d ug=%d\n",num,
+               clsptr, (clsptr == get_default_cpu_class()),
+               clsptr->shares.my_guarantee, 
+               clsptr->shares.my_limit, 
+               clsptr->shares.total_guarantee,
+               clsptr->shares.max_limit, 
+               clsptr->shares.unused_guarantee);
+       printk("      egrt=%d megrt=%d ehl=%d mehl=%d esh=%d mesh=%d\n",
+               stat->egrt,stat->megrt,stat->ehl,stat->mehl,
+               stat->eshare,stat->meshare);
+}
+#endif
+
+/**********************************************/
+/*          surplus allocation                */
+/**********************************************/
+
+/*
+ * surplus = egrt - demand
+ * if surplus < 0, surplus = 0 
+ */
+static inline int get_node_surplus(struct ckrm_cpu_class *cls)
+{
+       int surplus = cls->stat.egrt - cls->stat.max_demand;
+
+       if (surplus < 0)
+               surplus = 0;
+
+       return surplus;
+}
+
+/*
+ * consume savings in advance because this class give surplus to others
+ * this is a quick hack, should be integrated with balance_savings()
+ */
+static inline void consumed_surplus_savings(struct ckrm_cpu_class *clsptr, 
+                                           int savings_consumed) 
+{
+       long long total_savings;
+       ckrm_lrq_t* lrq;
+       int i;
+       int cpu_online = 0;
+       
+       total_savings = 0;
+       for_each_online_cpu(i) {
+               lrq = get_ckrm_lrq(clsptr,i);
+               total_savings += lrq->savings;
+               cpu_online ++;
+       }
+       
+       total_savings -= savings_consumed;
+       if (total_savings < 0)
+               total_savings = 0;
+
+       //get the average savings
+       do_div(total_savings,cpu_online);       
+       for_each_online_cpu(i) {
+               lrq = get_ckrm_lrq(clsptr,i);
+               lrq->savings = total_savings;
+       }
+}
+
+static inline int get_my_node_surplus(struct ckrm_cpu_class *cls)
+{
+       int surplus = cls->stat.megrt - get_mmax_demand(&cls->stat);
+       int savings_consumed;
+
+       if (surplus < 0)
+               surplus = 0;
+
+       /*
+        * a quick hack for the hierarchy savings distribution;
+        * it may not be the right way to do this
+        *
+        * since this node gives its surplus to other nodes,
+        * its savings should be consumed
+        * assuming CPU_MONITOR_INTERVAL = (HZ), savings_consumed is roughly
+        * how much of the savings will be consumed over the next second
+        */
+       if (surplus) {
+               savings_consumed = surplus * HZ * (NSEC_PER_MS >> CKRM_SHARE_SHIFT);
+               consumed_surplus_savings(cls, savings_consumed) ;
+       }
+
+       return surplus;
+}
+
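To make the units of that heuristic concrete: surplus is a share value out of (1 << CKRM_SHARE_SHIFT), and HZ * NSEC_PER_MS is about one second in nanoseconds (exactly one second when HZ is 1000), so savings_consumed is roughly "the fraction of a CPU being given away, expressed as nanoseconds of the next second". A small standalone check, with assumed values for the kernel constants:

#include <stdio.h>

#define CKRM_SHARE_SHIFT 13                     /* assumed */
#define CKRM_SHARE_MAX   (1 << CKRM_SHARE_SHIFT)
#define HZ               1000
#define NSEC_PER_MS      1000000ULL

int main(void)
{
        /* class gives away a quarter of a CPU worth of guarantee */
        long long surplus = CKRM_SHARE_MAX / 4;
        long long savings_consumed =
                surplus * HZ * (NSEC_PER_MS >> CKRM_SHARE_SHIFT);

        printf("surplus=%lld/%d -> savings consumed over ~1s: %lld ns (~%.2f s of CPU)\n",
               surplus, CKRM_SHARE_MAX, savings_consumed,
               (double)savings_consumed / 1e9);
        return 0;
}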
+/*
+ * all the classes in the queue consume the surplus in order
+ * each class consumes an amount proportional to its egrt
+ */
+static int consume_surplus_in_order(struct list_head* queue,
+                                          struct ckrm_cpu_class *p_cls,
+                                          int total_surplus)
+{
+       int total_grt = 0;
+       struct ckrm_cpu_class *clsptr;  
+
+       /*
+        * get total_grt of the classes in the queue
+        * total_grt can be maintained instead of recalculated each time
+        */
+       list_for_each_entry(clsptr,queue,surplus_queue) {
+               if (unlikely(clsptr == p_cls))
+                       total_grt += clsptr->stat.megrt;
+               else
+                       total_grt += clsptr->stat.egrt;
+       }
+
+       if (! total_grt)
+               goto consume_out;
+       
+       //allocate in order
+       list_for_each_entry(clsptr,queue,surplus_queue) {               
+               int surplus_per_share;
+               int consumed, my_grt;
+
+               BUG_ON(! total_grt);
+               surplus_per_share = 
+                       (total_surplus << CKRM_SHARE_SHIFT) / total_grt;
+
+               if (surplus_per_share <= 0)
+                       break;
+
+               if (unlikely(clsptr == p_cls))  //self_node consuming
+                       my_grt =  clsptr->stat.megrt;
+               else
+                       my_grt = clsptr->stat.egrt;
+
+               BUG_ON(clsptr->stat.demand_per_share <= 0);
+
+               if (clsptr->stat.demand_per_share < surplus_per_share)
+                       surplus_per_share = clsptr->stat.demand_per_share;
+
+               consumed = surplus_per_share * my_grt;
+               consumed >>= CKRM_SHARE_SHIFT;
+               total_surplus -= consumed;
+               BUG_ON(total_surplus < 0);
+               total_grt -= my_grt;
+
+               if (unlikely(clsptr == p_cls))
+                       set_meshare(&clsptr->stat,clsptr->stat.meshare + consumed);                     
+               else
+                       set_eshare(&clsptr->stat,clsptr->stat.eshare + consumed);
+       }       
+ consume_out:  
+       if (total_surplus <= 1) //if total_suplus too small, no need to allocate again
+               total_surplus = 0;
+       return total_surplus;
+}
+
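The in-order pass above is fixed-point arithmetic: the remaining surplus is first turned into a per-guarantee rate, (total_surplus << CKRM_SHARE_SHIFT) / total_grt, capped by each class's demand_per_share, and then scaled back by that class's own guarantee. A standalone sketch of a single pass over a flat array (CKRM_SHARE_SHIFT is an assumed value and the class layout is simplified):

#include <stdio.h>

#define CKRM_SHARE_SHIFT 13                     /* assumed */

struct cls {
        const char *name;
        int grt;                /* effective guarantee */
        int demand_per_share;   /* remaining demand per unit of grt, fixed point */
        int eshare;             /* effective share being grown */
};

/* distribute 'surplus' proportionally to grt, capped by per-class demand */
static int consume_in_order(struct cls *c, int n, int surplus)
{
        int total_grt = 0, i;

        for (i = 0; i < n; i++)
                total_grt += c[i].grt;

        for (i = 0; i < n && total_grt > 0; i++) {
                int per_share = (surplus << CKRM_SHARE_SHIFT) / total_grt;
                int consumed;

                if (per_share <= 0)
                        break;
                if (per_share > c[i].demand_per_share)
                        per_share = c[i].demand_per_share;

                consumed = (per_share * c[i].grt) >> CKRM_SHARE_SHIFT;
                c[i].eshare += consumed;
                surplus     -= consumed;
                total_grt   -= c[i].grt;
        }
        return surplus <= 1 ? 0 : surplus;      /* drop rounding dust, as the kernel does */
}

int main(void)
{
        struct cls c[] = {
                { "A", 100, 1 << (CKRM_SHARE_SHIFT - 2), 0 },   /* wants 0.25 per grt unit */
                { "B", 300, 1 << CKRM_SHARE_SHIFT,       0 },   /* wants a full unit */
        };
        int left = consume_in_order(c, 2, 400);

        printf("A got %d, B got %d, surplus left %d\n", c[0].eshare, c[1].eshare, left);
        return 0;
}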
+/*
+ * link all the children of parent and the parent itself using their surplus_queue field
+ * link the whole queue using src_queue
+ * return -1 if anything goes wrong
+ */
+static int get_class_surplus_queue(struct ckrm_core_class *parent,
+                                  struct list_head* src_queue)
+{
+       struct ckrm_core_class *child_core = NULL;
+       struct ckrm_cpu_class *p_cls,*c_cls;
+       int ret = -1;
+
+       p_cls = ckrm_get_cpu_class(parent);
+       if (! p_cls)
+               goto link_out;
+
+       INIT_LIST_HEAD(src_queue);
+
+       //add the parent node itself
+       list_add(&p_cls->surplus_queue,src_queue);
+       do {
+               child_core = ckrm_get_next_child(parent, child_core);
+               if (child_core) {
+                       c_cls = ckrm_get_cpu_class(child_core);                         
+                       if (! c_cls)
+                               goto link_out;
+                       list_add(&c_cls->surplus_queue,src_queue);
+               }
+       } while (child_core);
+
+       ret = 0;
+
+ link_out:
+       return ret;
+}
+
+/*
+ * insert the class to queue based on stat->demand_per_share
+ * status: tested
+ */
+static void insert_surplus_queue(struct list_head* queue, struct ckrm_cpu_class *clsptr)
+{
+       struct ckrm_cpu_class *cur_cls = NULL;  
+       int end_of_queue = 1;
+
+       list_for_each_entry(cur_cls,queue,surplus_queue) {
+               if (cur_cls->stat.demand_per_share >= clsptr->stat.demand_per_share) {
+                       end_of_queue = 0;
+                       break;
+               }
+       }
+
+       //insert the clsptr
+       if (! cur_cls || end_of_queue)
+               list_add_tail(&clsptr->surplus_queue,queue);
+       else
+               list_add_tail(&clsptr->surplus_queue,&cur_cls->surplus_queue);
+}
+
+/*
+ * copy all classes in src_queue to dst_queue,
+ * reorder the classes based on their normalized demand 
+ * if a class is already saturated (eshare >= demand), also remove it from src_queue
+ * return the total guarantee of the selected classes
+ *
+ * @src_queue: source queue
+ * @dst_queue: destination queue
+ * @check_sl: check soft limit
+ * @check_savings: only class has savings should be considered
+ */
+
+static unsigned long reorder_surplus_queue(struct list_head* src_queue, 
+                                          struct list_head* dst_queue, 
+                                          int check_sl, int check_savings, 
+                                          struct ckrm_cpu_class *p_cls) 
+{
+       struct ckrm_cpu_class *clsptr, *tmp;    
+
+       INIT_LIST_HEAD(dst_queue);
+
+       list_for_each_entry_safe(clsptr,tmp,src_queue,surplus_queue) {
+               struct ckrm_cpu_class_stat* stat = &clsptr->stat;
+               int inc_limit;
+               int max_demand, eshare, esl,grt;
+
+               if (unlikely(clsptr == p_cls)) {
+                       max_demand = get_mmax_demand(stat);
+                       eshare  = stat->meshare;
+                       esl = get_mysoft_limit(clsptr);
+                       grt = stat->megrt;
+               } else {
+                       max_demand = stat->max_demand;
+                       eshare = stat->eshare;
+                       esl = get_soft_limit(clsptr);
+                       grt = stat->egrt;
+               }
+
+               //hard limit and demand limit
+               inc_limit = max_demand - eshare;
+               
+               //no additional share needed
+               if (inc_limit <= 0 || ! grt) {
+                       list_del(&clsptr->surplus_queue);
+                       continue;
+               }
+                       
+               //or no more savings
+               if (check_savings && ! stat->has_savings)
+                       continue;
+               
+               //check soft limit
+               if (check_sl) {
+                       int soft_limit;
+
+                       soft_limit = p_cls->stat.eshare * esl
+                               / p_cls->shares.total_guarantee;
+
+                       if (soft_limit < max_demand)
+                               inc_limit = soft_limit - eshare;
+                       if ( inc_limit <= 0)   /* can turn negative */
+                               continue;
+               }
+
+               BUG_ON(! grt);
+               //get the stat->demand_per_share
+               stat->demand_per_share = 
+                       (inc_limit << CKRM_SHARE_SHIFT) / grt;  
+
+               list_del_init(&clsptr->surplus_queue);
+               //insert the class to the queue
+               insert_surplus_queue(dst_queue,clsptr);
+       }
+       return 0;
+}
+
+/*
+ * get all the surplus that should be reallocated to the children
+ */
+static inline int get_total_surplus(struct ckrm_cpu_class *p_cls,
+                                   struct ckrm_core_class *parent) 
+{
+       struct ckrm_cpu_class *c_cls;
+       int total_surplus;
+       struct ckrm_core_class *child_core = NULL;
+
+       //additional share assigned to this sub node from parent
+       total_surplus = p_cls->stat.eshare - p_cls->stat.egrt;
+       BUG_ON(total_surplus < 0);
+
+       //surplus of this node
+       total_surplus += get_my_node_surplus(p_cls);
+       do {
+               child_core = ckrm_get_next_child(parent, child_core);
+               if (child_core) {
+                       c_cls = ckrm_get_cpu_class(child_core);                         
+                       if (! c_cls) {
+                               total_surplus = 0;
+                               break;
+                       }
+
+                       total_surplus += get_node_surplus(c_cls);                       
+               }
+       } while (child_core);
+
+       return total_surplus;
+}
+/**
+ * alloc_surplus_node: re-allocate the shares for a single level
+ * @parent: parent node
+ * return the remaining surplus
+ *
+ * The surplus reallocation policy is as follows.
+ * -- classes that have eshare >= demand don't need any additional share,
+ *     so they don't participate in the surplus allocation.
+ * -- all the other classes received share in this order:
+ * 1. has savings, not over soft limit
+ * 2. has savings, but over soft limit
+ * 3. no savings, not over soft limit
+ * 4. no savings, over soft limit
+ * 
+ * In each of the 4 levels above, classes get surplus proportionally to their guarantees
+ */
+static int alloc_surplus_node(struct ckrm_core_class *parent)
+{
+       struct ckrm_cpu_class *p_cls;
+       int total_surplus;
+       int ret = -1;
+       struct list_head src_queue, dst_queue;
+
+       p_cls = ckrm_get_cpu_class(parent);
+       if (! p_cls) //safety check
+               goto realloc_out;
+
+       ret = 0;
+       total_surplus = get_total_surplus(p_cls,parent);
+
+       if (! total_surplus) //no surplus to be allocated 
+               goto realloc_out;
+
+       /* 
+        * first round, allocated to tasks with savings, check_sl
+        */
+       get_class_surplus_queue(parent,&src_queue);
+       reorder_surplus_queue(&src_queue, &dst_queue, 1, 1,p_cls);
+       if (! list_empty(&dst_queue)) {
+               total_surplus = consume_surplus_in_order(&dst_queue,p_cls,total_surplus);
+               if (! total_surplus)
+                       goto realloc_out;
+       }
+
+       /* 
+        * second round, check savings, but no check_sl
+        */
+       //merge the src_queue and dst_queue and reorder
+       list_splice(&dst_queue, &src_queue);
+       reorder_surplus_queue(&src_queue, &dst_queue, 0, 1,p_cls);
+       if (! list_empty(&dst_queue)) {
+               total_surplus = consume_surplus_in_order(&dst_queue,p_cls,total_surplus);
+               if (! total_surplus)
+                       goto realloc_out;
+       }
+
+       /* 
+        * third round, no check savings, but check_sl
+        */
+       //merge the src_queue and dst_queue and reorder
+       list_splice(&dst_queue, &src_queue);
+       reorder_surplus_queue(&src_queue, &dst_queue, 1, 0,p_cls);
+       if (! list_empty(&dst_queue)) {
+               total_surplus = consume_surplus_in_order(&dst_queue,p_cls,total_surplus);
+               if (! total_surplus)
+                       goto realloc_out;
+       }
+       /* 
+        * fourth round, no check savings, no check_sl
+        */
+       //merge the src_queue and dst_queue and reorder
+       list_splice(&dst_queue, &src_queue);
+       reorder_surplus_queue(&src_queue, &dst_queue, 0, 0,p_cls);
+       if (! list_empty(&dst_queue))
+               total_surplus = consume_surplus_in_order(&dst_queue,p_cls,total_surplus);       
+       
+ realloc_out:
+       return ret;
+}
+
+/*
+ * return true if the class total savings > MIN_SAVINGS 
+ */
+static int balance_local_savings(struct ckrm_cpu_class *clsptr, int cpu_online)
+{
+       unsigned long long total_savings;
+       ckrm_lrq_t* lrq;
+       int i;
+#define CLASS_MIN_SAVINGS (10 * NSEC_PER_MS)
+       
+       total_savings = 0;
+       for_each_online_cpu(i) {
+               lrq = get_ckrm_lrq(clsptr,i);
+               total_savings += lrq->savings;
+       }
+
+       if (total_savings < CLASS_MIN_SAVINGS)
+               return 0;
+
+       //get the average savings
+       do_div(total_savings,cpu_online);       
+       for_each_online_cpu(i) {
+               lrq = get_ckrm_lrq(clsptr,i);
+               lrq->savings = total_savings;
+       }
+
+       /*
+        * hzheng: this is another quick hack
+        * only say I have savings when this node has more demand
+        * ignoring the requirement of child classes
+        */
+       if (clsptr->stat.megrt < get_mmax_demand(&clsptr->stat))
+               return 1;
+       else
+               return 0;
+}
+
+/*
+ * check savings status
+ * set has_savings field if the class or its sub class has savings
+ */
+static void check_savings_status(struct ckrm_core_class *root_core)
+{
+       struct ckrm_cpu_class *clsptr;
+       int cpu_online;
+
+       cpu_online = cpus_weight(cpu_online_map);       
+
+       //class status: demand, share,total_ns prio, index
+       list_for_each_entry(clsptr,&active_cpu_classes,links) 
+               clsptr->stat.has_savings = balance_local_savings(clsptr,cpu_online);
+}
+
+/**
+ * alloc_surplus - reallocate unused shares
+ *
+ * class A's unused share should be allocated to its siblings
+ * the re-allocation goes downward from the top
+ */
+int alloc_surplus(struct ckrm_core_class *root_core)
+{
+       struct ckrm_core_class *cur_core, *child_core;
+       //      struct ckrm_cpu_class *cls;
+       int ret = -1;
+
+       check_savings_status(root_core);
+
+       /*initialize*/
+       cur_core = root_core;
+       child_core = NULL;
+       //      cls = ckrm_get_cpu_class(cur_core);
+
+       /*the ckrm idle tasks get all that's remaining*/
+       /*hzheng: uncomment the following line for hard limit support */
+       //      update_ckrm_idle(CKRM_SHARE_MAX - cls->stat.max_demand);
+       
+ repeat:
+       //check exit
+       if (!cur_core)
+               return 0;
+
+       //visit this node only once
+       if (! child_core) 
+               if ( alloc_surplus_node(cur_core) < 0 )
+                       return ret;
+
+       //next child
+       child_core = ckrm_get_next_child(cur_core, child_core);
+       if (child_core) {
+               //go down
+               cur_core = child_core;
+               child_core = NULL;
+               goto repeat;
+       } else {                //no more child, go back
+               child_core = cur_core;
+               cur_core = child_core->hnode.parent;
+       }
+       goto repeat;
 }
 
 }
 
+
+
 /**********************************************/
 /*          cpu demand                        */
 /**********************************************/
 /**********************************************/
 /*          cpu demand                        */
 /**********************************************/
@@ -134,27 +701,29 @@ void ckrm_cpu_stat_init(struct ckrm_cpu_class_stat *stat)
  * how often should we recalculate the cpu demand
  * the number is in ns
  */
  * how often should we recalculate the cpu demand
  * the number is in ns
  */
-static inline void update_cpu_demand_stat(struct ckrm_cpu_demand_stat* local_stat,int state, unsigned long long len)
+static inline void update_cpu_demand_stat(struct ckrm_cpu_demand_stat* local_stat,
+                                         int state, unsigned long long len)
 {      
        local_stat->total += len;
        if (state == CKRM_CPU_DEMAND_RUN)
                local_stat->run += len;
 
        if (local_stat->total >= local_stat->recalc_interval) {
 {      
        local_stat->total += len;
        if (state == CKRM_CPU_DEMAND_RUN)
                local_stat->run += len;
 
        if (local_stat->total >= local_stat->recalc_interval) {
-               local_stat->total >>= CKRM_SHARE_ACCURACY;
-               if (unlikely(local_stat->run > 0xFFFFFFFF))
-                       local_stat->run = 0xFFFFFFFF;
+               local_stat->total >>= CKRM_SHARE_SHIFT;
+               if (unlikely(local_stat->run > ULONG_MAX))
+                       local_stat->run = ULONG_MAX;
 
 
-               if (local_stat->total > 0xFFFFFFFF) 
-                       local_stat->total = 0xFFFFFFFF;
+               if (unlikely(local_stat->total > ULONG_MAX))
+                       local_stat->total = ULONG_MAX;
                        
                do_div(local_stat->run,(unsigned long)local_stat->total);
 
                        
 
-               if (local_stat->total > 0xFFFFFFFF) //happens after very long sleep
+               if (unlikely(local_stat->total > ULONG_MAX)) {
+                       //happens after very long sleep
                        local_stat->cpu_demand = local_stat->run;
-               else {
-                       local_stat->cpu_demand += local_stat->run;
-                       local_stat->cpu_demand >>= 1;
+               } else { 
+                       local_stat->cpu_demand =
+                            (local_stat->cpu_demand + local_stat->run) >> 1;
                }
                local_stat->total = 0;
                local_stat->run = 0;
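For reference, the update above keeps cpu_demand as a fixed-point fraction of wall time spent running (scaled by 1 << CKRM_SHARE_SHIFT) and folds each completed interval in with weight 1/2, i.e. an exponentially weighted moving average. A standalone model of folding in one window (CKRM_SHARE_SHIFT is an assumed value; the clamping and overflow handling are omitted):

#include <stdio.h>

#define CKRM_SHARE_SHIFT 13                     /* assumed */

struct demand_stat {
        unsigned long long run;        /* ns spent running in this window */
        unsigned long long total;      /* ns of wall time in this window  */
        unsigned long cpu_demand;      /* EWMA, fixed point out of 1<<SHIFT */
};

static void fold_window(struct demand_stat *s)
{
        /* run/total scaled to 1<<SHIFT, then averaged with the previous demand */
        unsigned long ratio =
                (unsigned long)(s->run / (s->total >> CKRM_SHARE_SHIFT));

        s->cpu_demand = (s->cpu_demand + ratio) >> 1;
        s->run = s->total = 0;
}

int main(void)
{
        struct demand_stat s = { 0, 0, 1 << CKRM_SHARE_SHIFT }; /* start at 100% */

        /* window: ran 64ms out of 256ms of wall time -> 25% busy */
        s.run   =  64 * 1000000ULL;
        s.total = 256 * 1000000ULL;
        fold_window(&s);

        printf("demand = %lu / %d (~%.1f%%)\n",
               s.cpu_demand, 1 << CKRM_SHARE_SHIFT,
               100.0 * s.cpu_demand / (1 << CKRM_SHARE_SHIFT));
        return 0;
}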
@@ -190,57 +759,25 @@ void cpu_demand_event(struct ckrm_cpu_demand_stat* local_stat, int event, unsign
                break;
        default:
                BUG();
-       }
-}
-
-/** 
- * check all the class local queue
- * 
- * to deal with excessive long run/sleep state
- * -- whenever the the ckrm_cpu_monitor is called, check if the class is in sleep state, if yes, then update sleep record
- */
-static inline void cpu_demand_check_sleep(struct ckrm_cpu_class_stat *stat, int cpu)
-{
-       struct ckrm_cpu_demand_stat * local_stat = &stat->local_stats[cpu];
-       unsigned long long sleep,now;
-       if (local_stat->last_sleep) {
-               now = sched_clock();
-               sleep = now - local_stat->last_sleep;
-               local_stat->last_sleep = now;
-               update_cpu_demand_stat(local_stat,CKRM_CPU_DEMAND_SLEEP,sleep);
-       }
-}
-
-/**
- *get_self_cpu_demand - get cpu demand of the class itself (excluding children)
- *
- * self_cpu_demand = sum(cpu demand of all local queues) 
- */
-static inline unsigned long get_self_cpu_demand(struct ckrm_cpu_class_stat *stat)
-{
-       int cpu_demand = 0;
-       int i;
-       int cpuonline = 0;
-
-       for_each_online_cpu(i) {
-               cpu_demand_check_sleep(stat,i);
-               cpu_demand += stat->local_stats[i].cpu_demand;
-               cpuonline ++;
-       }
-
-       return (cpu_demand/cpuonline);
+       }
 }
 
-/*
- * my max demand = min(cpu_demand, my effective hard limit)
+/** 
+ * check all the class local queue
+ * 
+ * to deal with excessive long run/sleep state
+ * -- whenever the ckrm_cpu_monitor is called, check if the class is in sleep state; if yes, then update the sleep record
  */
-static inline unsigned long get_mmax_demand(struct ckrm_cpu_class_stat* stat) 
+void cpu_demand_check_sleep(struct ckrm_cpu_class_stat *stat, int cpu)
 {
-       unsigned long mmax_demand = get_self_cpu_demand(stat);
-       if (mmax_demand > stat->mehl)
-               mmax_demand = stat->mehl;
-
-       return mmax_demand;
+       struct ckrm_cpu_demand_stat * local_stat = &stat->local_stats[cpu];
+       unsigned long long sleep,now;
+       if (local_stat->last_sleep) {
+               now = sched_clock();
+               sleep = now - local_stat->last_sleep;
+               local_stat->last_sleep = now;
+               update_cpu_demand_stat(local_stat,CKRM_CPU_DEMAND_SLEEP,sleep);
+       }
 }
 
 /**
@@ -301,26 +838,6 @@ static int update_max_demand(struct ckrm_core_class *root_core)
 /**********************************************/
 /*          effective guarantee & limit       */
 /**********************************************/
-static inline void set_eshare(struct ckrm_cpu_class_stat *stat,
-                                      int new_share)
-{
-       if (!new_share)
-               new_share = 1;
-
-       BUG_ON(new_share < 0);
-       stat->eshare = new_share;
-}
-
-static inline void set_meshare(struct ckrm_cpu_class_stat *stat,
-                                           int new_share)
-{
-       if (!new_share)
-               new_share = 1;
-
-       BUG_ON(new_share < 0);
-       stat->meshare = new_share;
-}
-
 /**
  *update_child_effective - update egrt, ehl, mehl for all children of parent
  *@parent: the parent node
 /**
  *update_child_effective - update egrt, ehl, mehl for all children of parent
  *@parent: the parent node
@@ -346,7 +863,7 @@ static int update_child_effective(struct ckrm_core_class *parent)
                    p_cls->stat.egrt *
                    c_cls->shares.my_guarantee / p_cls->shares.total_guarantee;
 
                    p_cls->stat.egrt *
                    c_cls->shares.my_guarantee / p_cls->shares.total_guarantee;
 
-               c_cls->stat.megrt = c_cls->stat.egrt * c_cls->shares.unused_guarantee
+               c_cls->stat.megrt = c_cls->stat.egrt * get_my_grt(c_cls)
                        / c_cls->shares.total_guarantee;
                
                c_cls->stat.ehl =
@@ -372,8 +889,9 @@ static int update_child_effective(struct ckrm_core_class *parent)
  *
  * return -1 if anything wrong happened (eg: the structure changed during the process)
  */
  *
  * return -1 if anything wrong happened (eg: the structure changed during the process)
  */
-static int update_effectives(struct ckrm_core_class *root_core)
+int update_effectives(void)
 {
 {
+       struct ckrm_core_class *root_core = get_default_cpu_class()->core;
        struct ckrm_core_class *cur_core, *child_core;
        struct ckrm_cpu_class *cls;
        int ret = -1;
        struct ckrm_core_class *cur_core, *child_core;
        struct ckrm_cpu_class *cls;
        int ret = -1;
@@ -384,7 +902,7 @@ static int update_effectives(struct ckrm_core_class *root_core)
 
        //initialize the effectives for root 
        cls->stat.egrt = CKRM_SHARE_MAX; /*egrt of the root is always 100% */
 
        //initialize the effectives for root 
        cls->stat.egrt = CKRM_SHARE_MAX; /*egrt of the root is always 100% */
-       cls->stat.megrt = cls->stat.egrt * cls->shares.unused_guarantee
+       cls->stat.megrt = cls->stat.egrt * get_my_grt(cls)
                / cls->shares.total_guarantee;
        cls->stat.ehl = CKRM_SHARE_MAX * get_hard_limit(cls)
                / cls->shares.total_guarantee;
                / cls->shares.total_guarantee;
        cls->stat.ehl = CKRM_SHARE_MAX * get_hard_limit(cls)
                / cls->shares.total_guarantee;
@@ -418,288 +936,11 @@ static int update_effectives(struct ckrm_core_class *root_core)
 }
 
 /**********************************************/
 }
 
 /**********************************************/
-/*          surplus allocation                */
+/*           CKRM Idle Tasks                  */
 /**********************************************/
 
 /**********************************************/
 
-/*
- * surplus = egrt - demand
- * if surplus < 0, surplus = 0 
- */
-static inline int get_node_surplus(struct ckrm_cpu_class *cls)
-{
-       int surplus = cls->stat.egrt - cls->stat.max_demand;
-
-       if (surplus < 0)
-               surplus = 0;
-
-       return surplus;
-}
-
-static inline int get_my_node_surplus(struct ckrm_cpu_class *cls)
-{
-       int surplus = cls->stat.megrt - get_mmax_demand(&cls->stat);
-
-       if (surplus < 0)
-               surplus = 0;
-
-       return surplus;
-}
-
-/**
- * consume_surplus: decides how much surplus a node can consume
- * @ckeck_sl: if check_sl is set, then check soft_limitx
- * return how much consumed
- *
- * implements all the CKRM Scheduling Requirement
- * assume c_cls is valid
- */
-static inline int consume_surplus(int surplus,
-                                      struct ckrm_cpu_class *c_cls,
-                                      struct ckrm_cpu_class *p_cls,
-                                      int check_sl
-                                      )
-{
-       int consumed = 0;
-       int inc_limit;
-       int total_grt = p_cls->shares.total_guarantee;
-
-       BUG_ON(surplus < 0);
-
-       /*can't consume more than demand or hard limit*/
-       if (c_cls->stat.eshare >= c_cls->stat.max_demand)
-               goto out;
-
-       //the surplus allocation is propotional to grt
-       consumed =
-               surplus * c_cls->shares.my_guarantee / total_grt;
-
-       if (! consumed) //no more share
-               goto out;
-
-       //hard limit and demand limit
-       inc_limit = c_cls->stat.max_demand - c_cls->stat.eshare;
-
-       if (check_sl) {
-               int esl = p_cls->stat.eshare * get_soft_limit(c_cls)
-                       /total_grt;
-               if (esl < c_cls->stat.max_demand)
-                       inc_limit = esl - c_cls->stat.eshare;
-       }
-
-       if (consumed > inc_limit)
-               consumed = inc_limit;
-
-        BUG_ON(consumed < 0);
- out:          
-       return consumed;
-}
-
-/*
- * how much a node can consume for itself?
- */
-static inline int consume_self_surplus(int surplus,
-                                      struct ckrm_cpu_class *p_cls,
-                                      int check_sl
-                                      )
-{
-       int consumed = 0;
-       int inc_limit;
-       int total_grt = p_cls->shares.total_guarantee;
-       int max_demand = get_mmax_demand(&p_cls->stat);
-
-       BUG_ON(surplus < 0);
-
-       /*can't consume more than demand or hard limit*/
-       if (p_cls->stat.meshare >= max_demand)
-               goto out;
-
-       //the surplus allocation is propotional to grt
-       consumed =
-               surplus * p_cls->shares.unused_guarantee / total_grt;
-
-       if (! consumed) //no more share
-               goto out;
-
-       //hard limit and demand limit
-       inc_limit = max_demand - p_cls->stat.meshare;
-
-       if (check_sl) {
-               int mesl = p_cls->stat.eshare * get_mysoft_limit(p_cls)
-                       /total_grt;
-               if (mesl < max_demand)
-                       inc_limit = mesl - p_cls->stat.meshare;
-       }
-
-       if (consumed > inc_limit)
-               consumed = inc_limit;
-
-        BUG_ON(consumed < 0);
- out:          
-       return consumed;
-}
-
-
-/*
- * allocate surplus to all its children and also its default class
- */
-static int alloc_surplus_single_round(
-                                     int surplus,
-                                     struct ckrm_core_class *parent,
-                                     struct ckrm_cpu_class *p_cls,
-                                     int check_sl)
-{
-       struct ckrm_cpu_class *c_cls;
-       struct ckrm_core_class *child_core = NULL;
-       int total_consumed = 0,consumed;
-
-       //first allocate to the default class
-       consumed  =
-               consume_self_surplus(surplus,p_cls,check_sl);
-
-       if (consumed > 0) {
-               set_meshare(&p_cls->stat,p_cls->stat.meshare + consumed);
-               total_consumed += consumed;
-       }
-
-       do {
-               child_core = ckrm_get_next_child(parent, child_core);
-               if (child_core)  {
-                       c_cls = ckrm_get_cpu_class(child_core);
-                       if (! c_cls)
-                               return -1;
-
-                       consumed    =
-                               consume_surplus(surplus, c_cls,
-                                                    p_cls,check_sl);
-                       if (consumed > 0) {
-                               set_eshare(&c_cls->stat,c_cls->stat.eshare + consumed);
-                               total_consumed += consumed;
-                       }
-               }
-       } while (child_core);
-
-       return total_consumed;
-}
-
-/**
- * alloc_surplus_node: re-allocate the shares for children under parent
- * @parent: parent node
- * return the remaining surplus
- *
- * task:
- *  1. get total surplus
- *  2. allocate surplus
- *  3. set the effective_share of each node
- */
-static int alloc_surplus_node(struct ckrm_core_class *parent)
-{
-       struct ckrm_cpu_class *p_cls,*c_cls;
-       int total_surplus,consumed;
-       int check_sl;
-       int ret = -1;
-       struct ckrm_core_class *child_core = NULL;
-
-       p_cls = ckrm_get_cpu_class(parent);
-       if (! p_cls)
-               goto realloc_out;
-
-       /*
-        * get total surplus
-        */
-       total_surplus = p_cls->stat.eshare - p_cls->stat.egrt;
-       BUG_ON(total_surplus < 0);
-       total_surplus += get_my_node_surplus(p_cls);
-
-       do {
-               child_core = ckrm_get_next_child(parent, child_core);
-               if (child_core) {
-                       c_cls = ckrm_get_cpu_class(child_core);                         
-                       if (! c_cls)
-                               goto realloc_out;
-
-                       total_surplus += get_node_surplus(c_cls);
-               }
-       } while (child_core);
-
-
-       if (! total_surplus) {
-               ret = 0;
-               goto realloc_out;
-       }
-
-       /* 
-        * distributing the surplus 
-        * first with the check_sl enabled
-        * once all the tasks has research the soft limit, disable check_sl and try again
-        */
-       
-       check_sl = 1;
-       do {
-               consumed = alloc_surplus_single_round(total_surplus,parent,p_cls,check_sl);
-               if (consumed < 0) //something is wrong
-                       goto realloc_out;
-
-               if (! consumed)
-                       check_sl = 0;
-               else
-                       total_surplus -= consumed;
-
-       } while ((total_surplus > 0) && (consumed || check_sl) );
-
-       ret = 0;
-       
- realloc_out:
-       return ret;
-}
-
-/**
- * alloc_surplus - reallocate unused shares
- *
- * class A's usused share should be allocated to its siblings
- * the re-allocation goes downward from the top
- */
-static int alloc_surplus(struct ckrm_core_class *root_core)
-{
-       struct ckrm_core_class *cur_core, *child_core;
-       //      struct ckrm_cpu_class *cls;
-       int ret = -1;
-
-       /*initialize*/
-       cur_core = root_core;
-       child_core = NULL;
-       //      cls = ckrm_get_cpu_class(cur_core);
-
-       /*the ckrm idle tasks get all what's remaining*/
-       /*hzheng: uncomment the following like for hard limit support */
-       //      update_ckrm_idle(CKRM_SHARE_MAX - cls->stat.max_demand);
-       
- repeat:
-       //check exit
-       if (!cur_core)
-               return 0;
-
-       //visit this node only once
-       if (! child_core) 
-               if ( alloc_surplus_node(cur_core) < 0 )
-                       return ret;
-
-       //next child
-       child_core = ckrm_get_next_child(cur_core, child_core);
-       if (child_core) {
-               //go down
-               cur_core = child_core;
-               child_core = NULL;
-               goto repeat;
-       } else {                //no more child, go back
-               child_core = cur_core;
-               cur_core = child_core->hnode.parent;
-       }
-       goto repeat;
-}
+#ifdef CONFIG_CKRM_SUPPORT_MAXLIMITS
 
-/**********************************************/
-/*           CKRM Idle Tasks                  */
-/**********************************************/
 struct ckrm_cpu_class ckrm_idle_class_obj, *ckrm_idle_class;
 struct task_struct* ckrm_idle_tasks[NR_CPUS];
 
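
The block removed above implemented the surplus pass: each class's unused guarantee (egrt - max_demand, floored at zero) was handed to its siblings in proportion to their my_guarantee, capped by the child's remaining demand, hard limit and, on the first rounds, soft limit, repeating until nothing more could be consumed. A minimal standalone sketch of that proportional split, using hypothetical demo_* names and omitting the soft/hard-limit caps:

/*
 * Illustrative sketch only (not patch content): the proportional split
 * performed by the removed alloc_surplus_single_round().  demo_* names
 * are hypothetical; soft/hard-limit capping is omitted for brevity.
 */
static int demo_distribute_surplus(int surplus, int total_guarantee,
                                   const int *child_guarantee,
                                   const int *child_demand,
                                   int *child_share, int nr_children)
{
        int i, consumed, total_consumed = 0;

        for (i = 0; i < nr_children; i++) {
                /* slice of the surplus proportional to my_guarantee */
                consumed = surplus * child_guarantee[i] / total_guarantee;
                /* a child never consumes beyond its remaining demand */
                if (consumed > child_demand[i] - child_share[i])
                        consumed = child_demand[i] - child_share[i];
                if (consumed > 0) {
                        child_share[i] += consumed;
                        total_consumed += consumed;
                }
        }
        return total_consumed;  /* caller repeats until nothing is consumed */
}
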
@@ -710,7 +951,7 @@ static inline int get_nr_idle(unsigned long surplus)
        int nr_idle = 0; 
        
        nr_idle = surplus * cpu_online;
-       nr_idle >>= CKRM_SHARE_ACCURACY;
+       nr_idle >>= CKRM_SHARE_SHIFT;
 
        if (surplus) 
                nr_idle ++;
@@ -722,7 +963,8 @@ static inline int get_nr_idle(unsigned long surplus)
 }
 
 /**
- * update_ckrm_idle: update the status of the idle class according to the new surplus
+ * update_ckrm_idle: update the status of the idle class according 
+ *                   to the new surplus
  * surplus: new system surplus
  *
  * Task:
@@ -816,6 +1058,20 @@ void ckrm_start_ckrm_idle(void)
        }
 }
 
+void ckrm_stop_ckrm_idle(void)
+{
+       BUG_ON(1);   // not yet implemented
+}
+
+#else
+
+static inline void ckrm_start_ckrm_idle(void) { };
+static inline void ckrm_stop_ckrm_idle(void) { };
+static inline void update_ckrm_idle(unsigned long surplus) { };
+
+#endif
+
+
 /**********************************************/
 /*          Local Weight                      */
 /**********************************************/
@@ -831,8 +1087,19 @@ static void adjust_lrq_weight(struct ckrm_cpu_class *clsptr, int cpu_online)
        int i;
        unsigned long class_weight;
        unsigned long long lw;  
-
-       //get total pressure
+       struct ckrm_cpu_class_stat *stat;
+       unsigned long oweight;
+       unsigned long skewed_limit;
+       /*
+        * if a local queue gets less than 1/SKEWED_SHARE_RATIO of the eshare
+        * then we set the skewed_share 
+        */
+#define SKEWED_SHARE_RATIO 8
+#define SKEWED_WEIGHT_MIN 3
+       
+       /* get total pressure of the class, if there is not pressure (.. class is
+        * idle, then leave the weights as is
+        */
        for_each_online_cpu(i) {
                lrq = get_ckrm_lrq(clsptr,i);
                total_pressure += lrq->lrq_load;
@@ -841,32 +1108,61 @@ static void adjust_lrq_weight(struct ckrm_cpu_class *clsptr, int cpu_online)
        if (! total_pressure)
                return;
        
+       stat = &clsptr->stat;
+
        class_weight = cpu_class_weight(clsptr) * cpu_online;
 
+       /* calculate or skewed limit weight */
+       skewed_limit = SHARE_TO_WEIGHT(stat->meshare/SKEWED_SHARE_RATIO);
+       if (skewed_limit < SKEWED_WEIGHT_MIN)
+               skewed_limit = SKEWED_WEIGHT_MIN;
+
+       /* calculate over_weight */     
+       BUG_ON(stat->meshare < stat->megrt);
+       oweight = ((stat->meshare - stat->megrt) << CKRM_SHARE_SHIFT) / stat->meshare;
+       oweight = SHARE_TO_WEIGHT(oweight);
+
        /*
         * update weight for each cpu, minimun is 1
         */
        for_each_online_cpu(i) {
                lrq = get_ckrm_lrq(clsptr,i);
-               if (! lrq->lrq_load)
-                       /*give idle class a high share to boost interactiveness */
+               lrq->over_weight = oweight;
+               if (! lrq->lrq_load) {
+                       /* give idle class a high share to boost 
+                        * interactiveness 
+                        */
                        lw = cpu_class_weight(clsptr); 
-               else {
-                       lw = lrq->lrq_load * class_weight;
+                       if (unlikely(lw==0))
+                               lw = 1;
+               } else {
+                       lw = lrq->lrq_load;
+                       lw *= class_weight;
                        do_div(lw,total_pressure);
-                       if (!lw)
+                       if (unlikely(lw==0))
                                lw = 1;
-                       else if (lw > CKRM_SHARE_MAX)
-                               lw = CKRM_SHARE_MAX;
-               }
-               
+                       else if (unlikely(lw > CKRM_MAX_WEIGHT))
+                               lw = CKRM_MAX_WEIGHT;
+               }       
+               BUG_ON(lw > CKRM_MAX_WEIGHT);
+
+               /* 
+                * set is_skewed and local_weight in proper order
+                * to avoid race condition
+                */
                lrq->local_weight = lw;
+               if (lw < skewed_limit) 
+                       lrq->skewed_weight = skewed_limit;
+               else
+                       lrq->skewed_weight = 0;
+               BUG_ON((local_class_weight(lrq) == 1) && (! lrq->skewed_weight));
        }
 }
 
 /*
  * assume called with class_list_lock read lock held
  */
+
 void adjust_local_weight(void)
 {
        static spinlock_t lock = SPIN_LOCK_UNLOCKED; 
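
The adjust_lrq_weight() hunks above split a class's weight across CPUs in proportion to each local runqueue's share of the total load, clamp the result to [1, CKRM_MAX_WEIGHT], and record an over_weight plus a skewed_weight floor for queues that receive less than 1/SKEWED_SHARE_RATIO of the effective share. A standalone sketch of just the load-proportional clamp; demo_* names and DEMO_MAX_WEIGHT are illustrative stand-ins, not patch symbols:

#include <asm/div64.h>          /* do_div(), as used by the patch */

#define DEMO_MAX_WEIGHT 1024    /* stand-in for CKRM_MAX_WEIGHT */

static unsigned long demo_local_weight(unsigned long lrq_load,
                                       unsigned long class_weight,
                                       unsigned long total_pressure)
{
        unsigned long long lw;

        /* total_pressure must be non-zero; the patch checks this in the caller */
        if (!lrq_load)
                return class_weight ? class_weight : 1; /* idle queue keeps full weight */

        lw = (unsigned long long)lrq_load * class_weight;
        do_div(lw, total_pressure);     /* divide lw in place */
        if (lw == 0)
                lw = 1;
        else if (lw > DEMO_MAX_WEIGHT)
                lw = DEMO_MAX_WEIGHT;
        return (unsigned long)lw;
}
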
@@ -904,9 +1200,11 @@ void ckrm_cpu_monitor(int check_min)
        static unsigned long long last_check = 0;
        struct ckrm_core_class *root_core = get_default_cpu_class()->core;
        unsigned long long now; 
-#define MIN_CPU_MONITOR_INTERVAL 100000000UL
+       int loc;
+
+#define MIN_CPU_MONITOR_INTERVAL (100*1000*1000)  /* 100 MSEC */
 
-       if (!root_core)
+       if (ckrm_cpu_disabled() || !root_core)
                return;
 
        //do nothing if someone already holding the lock
@@ -918,29 +1216,37 @@ void ckrm_cpu_monitor(int check_min)
        now = sched_clock();
 
        //consecutive check should be at least 100ms apart
-       if (check_min && ((now - last_check) < MIN_CPU_MONITOR_INTERVAL))
-               goto outunlock;
+       if (check_min && (now - last_check < MIN_CPU_MONITOR_INTERVAL))
+               goto outunlock_np;
 
        last_check = now;
 
-       if (update_effectives(root_core) != 0)
+       if (update_effectives() != 0) {
+               loc = 0;
                goto outunlock;
+       }
        
-       if (update_max_demand(root_core) != 0)
+       if (update_max_demand(root_core) != 0) {
+               loc = 1;
                goto outunlock;
+       }
        
-#ifndef ALLOC_SURPLUS_SUPPORT
-#warning "MEF taking out alloc_surplus"
-#else
-       if (alloc_surplus(root_core) != 0)
+#warning mef: alloc_surplus call back in system;
+       if (alloc_surplus(root_core) != 0) {
+               loc = 2;
                goto outunlock;
-#endif
+       }
        
        adjust_local_weight();
 
- outunlock:    
+ outunlock_np:
        read_unlock(&class_list_lock);
        spin_unlock(&lock);
+       return;
+
+ outunlock:    
+       printk("ckrm_cpu_monitor(%d) exits prematurely cause=%d\n",check_min,loc);
+       goto outunlock_np;
 }
 
 /*****************************************************/
@@ -952,6 +1258,8 @@ static int thread_exit = 0;
 static int ckrm_cpu_monitord(void *nothing)
 {
        daemonize("ckrm_cpu_ctrld");
+       printk("cpu_monitord started\n");
+       thread_exit = 0;
        for (;;) {
                /*sleep for sometime before next try*/
                set_current_state(TASK_INTERRUPTIBLE);
@@ -967,15 +1275,19 @@ static int ckrm_cpu_monitord(void *nothing)
        return 0;
 }
 
-void ckrm_start_monitor(void)
+void ckrm_cpu_start_monitor(void)
 {
+       if (cpu_monitor_pid != -1) {
+               /* already started ... */
+               return;
+       }       
        cpu_monitor_pid = kernel_thread(ckrm_cpu_monitord, 0, CLONE_KERNEL);
        if (cpu_monitor_pid < 0) {
                printk(KERN_DEBUG "ckrm_cpu_monitord for failed\n");
        }
 }
 
-void ckrm_kill_monitor(void)
+void ckrm_cpu_kill_monitor(void)
 {
        printk(KERN_DEBUG "killing process %d\n", cpu_monitor_pid);
        if (cpu_monitor_pid > 0) {
@@ -987,22 +1299,12 @@ void ckrm_kill_monitor(void)
        }
 }
 
-int ckrm_cpu_monitor_init(void)
+static int __init ckrm_cpu_init_monitor(void)
 {
-       ckrm_start_monitor();
-       /*hzheng: uncomment the following like for hard limit support */
-       //      ckrm_start_ckrm_idle();
+       if (ckrm_cpu_enabled()) 
+               ckrm_cpu_start_monitor();
        return 0;
 }
 
-void ckrm_cpu_monitor_exit(void)
-{
-       ckrm_kill_monitor();
-}
-
-module_init(ckrm_cpu_monitor_init);
-module_exit(ckrm_cpu_monitor_exit);
+__initcall(ckrm_cpu_init_monitor);
 
-MODULE_AUTHOR("Haoqiang Zheng <hzheng@cs.columbia.edu>");
-MODULE_DESCRIPTION("Hierarchical CKRM CPU Resource Monitor");
-MODULE_LICENSE("GPL");
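
Taken together, the ckrm_cpu_monitor.c hunks above leave the monitor as a kernel thread (started by __initcall when CKRM CPU scheduling is enabled) that wakes periodically and, at most once every MIN_CPU_MONITOR_INTERVAL (100 ms), walks the class hierarchy. A condensed sketch of one monitor pass, with locking, the rate limit and the premature-exit printk left out; demo_monitor_pass() is a hypothetical wrapper around the patch's own helpers:

/*
 * Condensed sketch of one monitor pass after this patch (locking, the
 * 100ms rate limit and the error printk are omitted).
 */
static void demo_monitor_pass(struct ckrm_core_class *root_core)
{
        if (update_effectives() != 0)           /* propagate egrt/ehl/esl down the tree */
                return;
        if (update_max_demand(root_core) != 0)  /* refresh per-class demand */
                return;
        if (alloc_surplus(root_core) != 0)      /* hand unused shares to siblings */
                return;
        adjust_local_weight();                  /* push results into per-cpu weights */
}
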
diff --git a/kernel/ckrm/ckrm_laq.c b/kernel/ckrm/ckrm_laq.c
deleted file mode 100644 (file)
index b64205a..0000000
+++ /dev/null
@@ -1,495 +0,0 @@
-/* ckrm_socketaq.c - accept queue resource controller
- *
- * Copyright (C) Vivek Kashyap,      IBM Corp. 2004
- * 
- * Latest version, more details at http://ckrm.sf.net
- * 
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- */
-
-/* Changes
- * Initial version
- */
-
-/* Code Description: TBD
- *
- */
-
-#include <linux/module.h>
-#include <linux/init.h>
-#include <linux/slab.h>
-#include <asm/errno.h>
-#include <linux/list.h>
-#include <linux/spinlock.h>
-#include <linux/ckrm.h>
-#include <linux/ckrm_rc.h>
-#include <net/tcp.h>
-
-#include <linux/ckrm_net.h>
-
-#define hnode_2_core(ptr) \
-        ((ptr) ? container_of(ptr, struct ckrm_core_class, hnode) : NULL)
-
-#define CKRM_SAQ_MAX_DEPTH     3       // 0 => /rcfs
-                                 // 1 => socket_aq
-                                 // 2 => socket_aq/listen_class
-                                 // 3 => socket_aq/listen_class/accept_queues
-                                 // 4 => Not allowed
-
-typedef struct ckrm_laq_res {
-       spinlock_t reslock;
-       atomic_t refcnt;
-       struct ckrm_shares shares;
-       struct ckrm_core_class *core;
-       struct ckrm_core_class *pcore;
-       int my_depth;
-       int my_id;
-       unsigned int min_ratio;
-} ckrm_laq_res_t;
-
-static int my_resid = -1;
-
-extern struct ckrm_core_class *rcfs_create_under_netroot(char *, int, int);
-extern struct ckrm_core_class *rcfs_make_core(struct dentry *,
-                                             struct ckrm_core_class *);
-
-void laq_res_hold(struct ckrm_laq_res *res)
-{
-       atomic_inc(&res->refcnt);
-       return;
-}
-
-void laq_res_put(struct ckrm_laq_res *res)
-{
-       if (atomic_dec_and_test(&res->refcnt))
-               kfree(res);
-       return;
-}
-
-/* Initialize rescls values
- */
-static void laq_res_initcls(void *my_res)
-{
-       ckrm_laq_res_t *res = my_res;
-
-       res->shares.my_guarantee = CKRM_SHARE_DONTCARE;
-       res->shares.my_limit = CKRM_SHARE_DONTCARE;
-       res->shares.total_guarantee = CKRM_SHARE_DFLT_TOTAL_GUARANTEE;
-       res->shares.max_limit = CKRM_SHARE_DFLT_MAX_LIMIT;
-       res->shares.unused_guarantee = CKRM_SHARE_DFLT_TOTAL_GUARANTEE;
-       res->shares.cur_max_limit = 0;
-}
-
-static int atoi(char *s)
-{
-       int k = 0;
-       while (*s)
-               k = *s++ - '0' + (k * 10);
-       return k;
-}
-
-static char *laq_get_name(struct ckrm_core_class *c)
-{
-       char *p = (char *)c->name;
-
-       while (*p)
-               p++;
-       while (*p != '/' && p != c->name)
-               p--;
-
-       return ++p;
-}
-
-static void *laq_res_alloc(struct ckrm_core_class *core,
-                          struct ckrm_core_class *parent)
-{
-       ckrm_laq_res_t *res, *pres;
-       int pdepth;
-
-       if (parent)
-               pres = ckrm_get_res_class(parent, my_resid, ckrm_laq_res_t);
-       else
-               pres = NULL;
-
-       if (core == core->classtype->default_class)
-               pdepth = 1;
-       else {
-               if (!parent)
-                       return NULL;
-               pdepth = 1 + pres->my_depth;
-       }
-
-       res = kmalloc(sizeof(ckrm_laq_res_t), GFP_ATOMIC);
-       if (res) {
-               memset(res, 0, sizeof(res));
-               spin_lock_init(&res->reslock);
-               laq_res_hold(res);
-               res->my_depth = pdepth;
-               if (pdepth == 2)        // listen class
-                       res->my_id = 0;
-               else if (pdepth == 3)
-                       res->my_id = atoi(laq_get_name(core));
-               res->core = core;
-               res->pcore = parent;
-
-               // rescls in place, now initialize contents other than 
-               // hierarchy pointers
-               laq_res_initcls(res);   // acts as initialising value
-       }
-
-       return res;
-}
-
-static void laq_res_free(void *my_res)
-{
-       ckrm_laq_res_t *res = (ckrm_laq_res_t *) my_res;
-       ckrm_laq_res_t *parent;
-
-       if (!res)
-               return;
-
-       if (res->my_depth != 3) {
-               kfree(res);
-               return;
-       }
-
-       parent = ckrm_get_res_class(res->pcore, my_resid, ckrm_laq_res_t);
-       if (!parent)            // Should never happen
-               return;
-
-       spin_lock(&parent->reslock);
-       spin_lock(&res->reslock);
-
-       // return child's guarantee to parent node
-       // Limits have no meaning for accept queue control
-       child_guarantee_changed(&parent->shares, res->shares.my_guarantee, 0);
-
-       spin_unlock(&res->reslock);
-       laq_res_put(res);
-       spin_unlock(&parent->reslock);
-       return;
-}
-
-/**************************************************************************
- *                     SHARES                                          ***
- **************************************************************************/
-
-void laq_set_aq_value(struct ckrm_net_struct *ns, unsigned int *aq_ratio)
-{
-       int i;
-       struct tcp_opt *tp;
-
-       tp = tcp_sk(ns->ns_sk);
-       for (i = 0; i < NUM_ACCEPT_QUEUES; i++)
-               tp->acceptq[i].aq_ratio = aq_ratio[i];
-       return;
-}
-void laq_set_aq_values(ckrm_laq_res_t * parent, unsigned int *aq_ratio)
-{
-
-       struct ckrm_net_struct *ns;
-       struct ckrm_core_class *core = parent->core;
-
-       class_lock(core);
-       list_for_each_entry(ns, &core->objlist, ckrm_link) {
-               laq_set_aq_value(ns, aq_ratio);
-       }
-       class_unlock(core);
-       return;
-}
-
-static void calculate_aq_ratios(ckrm_laq_res_t * res, unsigned int *aq_ratio)
-{
-       struct ckrm_hnode *chnode;
-       ckrm_laq_res_t *child;
-       unsigned int min;
-       int i;
-
-       min = aq_ratio[0] = (unsigned int)res->shares.unused_guarantee;
-
-       list_for_each_entry(chnode, &res->core->hnode.children, siblings) {
-               child = hnode_2_core(chnode)->res_class[my_resid];
-
-               aq_ratio[child->my_id] =
-                   (unsigned int)child->shares.my_guarantee;
-               if (aq_ratio[child->my_id] == CKRM_SHARE_DONTCARE)
-                       aq_ratio[child->my_id] = 0;
-               if (aq_ratio[child->my_id] &&
-                   ((unsigned int)aq_ratio[child->my_id] < min))
-                       min = (unsigned int)child->shares.my_guarantee;
-       }
-
-       if (min == 0) {
-               min = 1;
-               // default takes all if nothing specified
-               aq_ratio[0] = 1;        
-       }
-       res->min_ratio = min;
-
-       for (i = 0; i < NUM_ACCEPT_QUEUES; i++)
-               aq_ratio[i] = aq_ratio[i] / min;
-}
-
-static int laq_set_share_values(void *my_res, struct ckrm_shares *shares)
-{
-       ckrm_laq_res_t *res = my_res;
-       ckrm_laq_res_t *parent;
-       unsigned int aq_ratio[NUM_ACCEPT_QUEUES];
-       int rc = 0;
-
-       if (!res)
-               return -EINVAL;
-
-       if (!res->pcore) {
-               // something is badly wrong
-               printk(KERN_ERR "socketaq internal inconsistency\n");
-               return -EBADF;
-       }
-
-       parent = ckrm_get_res_class(res->pcore, my_resid, ckrm_laq_res_t);
-       if (!parent)            // socketclass does not have a share interface
-               return -EINVAL;
-
-       // Ensure that we ignore limit values
-       shares->my_limit = CKRM_SHARE_DONTCARE;
-       shares->max_limit = CKRM_SHARE_UNCHANGED;
-
-       if (res->my_depth == 0) {
-               printk(KERN_ERR "socketaq bad entry\n");
-               return -EBADF;
-       } else if (res->my_depth == 1) {
-               // can't be written to. This is an internal default.
-               return -EINVAL;
-       } else if (res->my_depth == 2) {
-               //nothin to inherit
-               if (!shares->total_guarantee) {
-                       return -EINVAL;
-               }
-               parent = res;
-               shares->my_guarantee = CKRM_SHARE_DONTCARE;
-       } else if (res->my_depth == 3) {
-               // accept queue itself. 
-               shares->total_guarantee = CKRM_SHARE_UNCHANGED;
-       }
-
-       ckrm_lock_hier(parent->pcore);
-       spin_lock(&parent->reslock);
-       rc = set_shares(shares, &res->shares,
-                       (parent == res) ? NULL : &parent->shares);
-       if (rc) {
-               spin_unlock(&res->reslock);
-               ckrm_unlock_hier(res->pcore);
-               return rc;
-       }
-       calculate_aq_ratios(parent, aq_ratio);
-       laq_set_aq_values(parent, aq_ratio);
-       spin_unlock(&parent->reslock);
-       ckrm_unlock_hier(parent->pcore);
-
-       return rc;
-}
-
-static int laq_get_share_values(void *my_res, struct ckrm_shares *shares)
-{
-       ckrm_laq_res_t *res = my_res;
-
-       if (!res)
-               return -EINVAL;
-       *shares = res->shares;
-       return 0;
-}
-
-/**************************************************************************
- *                     STATS                                           ***
- **************************************************************************/
-
-void
-laq_print_aq_stats(struct seq_file *sfile, struct tcp_acceptq_info *taq, int i)
-{
-       seq_printf(sfile, "Class %d connections:\n\taccepted: %u\n\t"
-                  "queued: %u\n\twait_time: %u\n",
-                  i, taq->acceptq_count, taq->acceptq_qcount,
-                  jiffies_to_msecs(taq->acceptq_wait_time));
-
-       if (i)
-               return;
-
-       for (i = 1; i < NUM_ACCEPT_QUEUES; i++) {
-               taq[0].acceptq_wait_time += taq[i].acceptq_wait_time;
-               taq[0].acceptq_qcount += taq[i].acceptq_qcount;
-               taq[0].acceptq_count += taq[i].acceptq_count;
-       }
-
-       seq_printf(sfile, "Totals :\n\taccepted: %u\n\t"
-                  "queued: %u\n\twait_time: %u\n",
-                  taq->acceptq_count, taq->acceptq_qcount,
-                  jiffies_to_msecs(taq->acceptq_wait_time));
-
-       return;
-}
-
-void
-laq_get_aq_stats(ckrm_laq_res_t * pres, ckrm_laq_res_t * mres,
-                struct tcp_acceptq_info *taq)
-{
-       struct ckrm_net_struct *ns;
-       struct ckrm_core_class *core = pres->core;
-       struct tcp_opt *tp;
-       int a = mres->my_id;
-       int z;
-
-       if (a == 0)
-               z = NUM_ACCEPT_QUEUES;
-       else
-               z = a + 1;
-
-       // XXX Instead of holding a  class_lock introduce a rw
-       // lock to be write locked by listen callbacks and read locked here.
-       // - VK
-       class_lock(pres->core);
-       list_for_each_entry(ns, &core->objlist, ckrm_link) {
-               tp = tcp_sk(ns->ns_sk);
-               for (; a < z; a++) {
-                       taq->acceptq_wait_time += tp->acceptq[a].aq_wait_time;
-                       taq->acceptq_qcount += tp->acceptq[a].aq_qcount;
-                       taq->acceptq_count += tp->acceptq[a].aq_count;
-                       taq++;
-               }
-       }
-       class_unlock(pres->core);
-}
-
-static int laq_get_stats(void *my_res, struct seq_file *sfile)
-{
-       ckrm_laq_res_t *res = my_res;
-       ckrm_laq_res_t *parent;
-       struct tcp_acceptq_info taq[NUM_ACCEPT_QUEUES];
-       int rc = 0;
-
-       if (!res)
-               return -EINVAL;
-
-       if (!res->pcore) {
-               // something is badly wrong
-               printk(KERN_ERR "socketaq internal inconsistency\n");
-               return -EBADF;
-       }
-
-       parent = ckrm_get_res_class(res->pcore, my_resid, ckrm_laq_res_t);
-       if (!parent) {          // socketclass does not have a stat interface
-               printk(KERN_ERR "socketaq internal fs inconsistency\n");
-               return -EINVAL;
-       }
-
-       memset(taq, 0, sizeof(struct tcp_acceptq_info) * NUM_ACCEPT_QUEUES);
-
-       switch (res->my_depth) {
-
-       default:
-       case 0:
-               printk(KERN_ERR "socket class bad entry\n");
-               rc = -EBADF;
-               break;
-
-       case 1:         // can't be read from. this is internal default.
-               // return -EINVAL
-               rc = -EINVAL;
-               break;
-
-       case 2:         // return the default and total
-               ckrm_lock_hier(res->core);      // block any deletes
-               laq_get_aq_stats(res, res, &taq[0]);
-               laq_print_aq_stats(sfile, &taq[0], 0);
-               ckrm_unlock_hier(res->core);    // block any deletes
-               break;
-
-       case 3:
-               ckrm_lock_hier(parent->core);   // block any deletes
-               laq_get_aq_stats(parent, res, &taq[res->my_id]);
-               laq_print_aq_stats(sfile, &taq[res->my_id], res->my_id);
-               ckrm_unlock_hier(parent->core); // block any deletes
-               break;
-       }
-
-       return rc;
-}
-
-/*
- * The network connection is reclassified to this class. Update its shares.
- * The socket lock is held. 
- */
-static void laq_change_resclass(void *n, void *old, void *r)
-{
-       struct ckrm_net_struct *ns = (struct ckrm_net_struct *)n;
-       struct ckrm_laq_res *res = (struct ckrm_laq_res *)r;
-       unsigned int aq_ratio[NUM_ACCEPT_QUEUES];
-
-       if (res->my_depth != 2)
-               return;
-
-       // a change to my_depth == 3 ie. the accept classes cannot happen.
-       // there is no target file
-       if (res->my_depth == 2) {       // it is one of the socket classes
-               ckrm_lock_hier(res->pcore);
-               // share rule: hold parent resource lock. then self.
-               // However, since my_depth == 1 is a generic class it is not
-               // needed here. Self lock is enough.
-               spin_lock(&res->reslock);
-               calculate_aq_ratios(res, aq_ratio);
-               class_lock(res->pcore);
-               laq_set_aq_value(ns, aq_ratio);
-               class_unlock(res->pcore);
-               spin_unlock(&res->reslock);
-               ckrm_unlock_hier(res->pcore);
-       }
-
-       return;
-}
-
-struct ckrm_res_ctlr laq_rcbs = {
-       .res_name = "laq",
-       .resid = -1,            // dynamically assigned
-       .res_alloc = laq_res_alloc,
-       .res_free = laq_res_free,
-       .set_share_values = laq_set_share_values,
-       .get_share_values = laq_get_share_values,
-       .get_stats = laq_get_stats,
-       .change_resclass = laq_change_resclass,
-       //.res_initcls       = laq_res_initcls,  //HUBERTUS: unnecessary !!
-};
-
-int __init init_ckrm_laq_res(void)
-{
-       struct ckrm_classtype *clstype;
-       int resid;
-
-       clstype = ckrm_find_classtype_by_name("socketclass");
-       if (clstype == NULL) {
-               printk(KERN_INFO " Unknown ckrm classtype<socketclass>");
-               return -ENOENT;
-       }
-
-       if (my_resid == -1) {
-               resid = ckrm_register_res_ctlr(clstype, &laq_rcbs);
-               if (resid >= 0)
-                       my_resid = resid;
-               printk(KERN_DEBUG "........init_ckrm_listen_aq_res -> %d\n", my_resid);
-       }
-       return 0;
-
-}
-
-void __exit exit_ckrm_laq_res(void)
-{
-       ckrm_unregister_res_ctlr(&laq_rcbs);
-       my_resid = -1;
-}
-
-module_init(init_ckrm_laq_res)
-    module_exit(exit_ckrm_laq_res)
-
-    MODULE_LICENSE("GPL");
index 0fe8586..103e3f9 100644 (file)
@@ -1,4 +1,4 @@
-/* ckrm_socketaq.c - accept queue resource controller
+/* ckrm_listenaq.c - accept queue resource controller
  *
  * Copyright (C) Vivek Kashyap,      IBM Corp. 2004
  * 
@@ -251,7 +251,7 @@ static int laq_set_share_values(void *my_res, struct ckrm_shares *shares)
        }
 
        parent = ckrm_get_res_class(res->pcore, my_resid, ckrm_laq_res_t);
-       if (!parent)            // socket_class does not have a share interface
+       if (!parent)            // socketclass does not have a share interface
                return -EINVAL;
 
        // Ensure that we ignore limit values
@@ -380,7 +380,7 @@ static int laq_get_stats(void *my_res, struct seq_file *sfile)
        }
 
        parent = ckrm_get_res_class(res->pcore, my_resid, ckrm_laq_res_t);
-       if (!parent) {          // socket_class does not have a stat interface
+       if (!parent) {          // socketclass does not have a stat interface
                printk(KERN_ERR "socketaq internal fs inconsistency\n");
                return -EINVAL;
        }
@@ -451,7 +451,7 @@ static void laq_change_resclass(void *n, void *old, void *r)
 }
 
 struct ckrm_res_ctlr laq_rcbs = {
-       .res_name = "laq",
+       .res_name = "listenaq",
        .resid = -1,            // dynamically assigned
        .res_alloc = laq_res_alloc,
        .res_free = laq_res_free,
@@ -467,9 +467,9 @@ int __init init_ckrm_laq_res(void)
        struct ckrm_classtype *clstype;
        int resid;
 
-       clstype = ckrm_find_classtype_by_name("socket_class");
+       clstype = ckrm_find_classtype_by_name("socketclass");
        if (clstype == NULL) {
-               printk(KERN_INFO " Unknown ckrm classtype<socket_class>");
+               printk(KERN_INFO " Unknown ckrm classtype<socketclass>");
                return -ENOENT;
        }
 
index 555ba0a..143b259 100644 (file)
@@ -422,7 +422,7 @@ static struct rbce_class *create_rbce_class(const char *classname,
        return cls;
 }
 
-static struct rbce_class *get_class(char *classname, int *classtype)
+static struct rbce_class *get_class(const char *classname, int *classtype)
 {
        struct rbce_class *cls;
        void *classobj;
index 0400844..fd7f8a2 100644 (file)
 #include <linux/ckrm_classqueue.h>
 
 #define cq_nr_member(cq) (cq->array.nr_active)
+#define CLASSQUEUE_MASK   (CLASSQUEUE_SIZE - 1)  
 
 /**
- * get_index - translate the logical priority to the real index in the queue
+ * get_node_index - 
+ *      translate the logical priority to the real index in the queue
  * 
  * validate the position
  * a valid prio is [cq->base,cq->base + size -1]
+ * check whether node is supposed to be enqeued beyond above window and 
+ * if so set the need_repos flag 
  */
-static inline unsigned long get_index(struct classqueue_struct *cq, int *prio)
+static inline unsigned long get_node_index(struct classqueue_struct *cq, 
+                                          cq_node_t * node)
 {
        unsigned long index;
        int max_prio;
@@ -43,22 +48,24 @@ static inline unsigned long get_index(struct classqueue_struct *cq, int *prio)
                return 0;
 
        max_prio = cq->base + (CLASSQUEUE_SIZE - 1);
-       if (*prio > max_prio)
-               *prio = max_prio;
-       if (*prio < cq->base)
-               *prio = cq->base;
+       if (unlikely(node->prio > max_prio)) {
+               node->real_prio = node->prio;
+               node->prio = max_prio;
+               node->need_repos = 1;
+       } else
+               node->need_repos = 0;
 
 
-               index = (cq->base_offset + (*prio - cq->base)) ;
-       if (index >= CLASSQUEUE_SIZE)
-               index -= CLASSQUEUE_SIZE;
+       if (unlikely(node->prio < cq->base))
+               node->prio = cq->base;
 
 
-       return index;
+               index = (cq->base_offset + (node->prio - cq->base)) ;
+       return ( index & CLASSQUEUE_MASK );   // ensure its in limits
 }
 
 /**
  * initialize a class queue object
  */
-int classqueue_init(struct classqueue_struct *cq)
+int classqueue_init(struct classqueue_struct *cq, int enabled)
 {
        int i;
        struct cq_prio_array *array;
@@ -73,7 +80,8 @@ int classqueue_init(struct classqueue_struct *cq)
        array->nr_active = 0;
 
        cq->base = 0;
-       cq->base_offset = -1;   //not valid yet
+       cq->base_offset = 0;
+       cq->enabled = enabled;
 
        return 0;
 }
@@ -87,8 +95,8 @@ void classqueue_enqueue(struct classqueue_struct *cq,
        int index;
 
        //get real index
-       if (cq_nr_member(cq)) {
-               index = get_index(cq, &prio);
+       if (cq_nr_member(cq)) {         
+               index = get_node_index(cq, node);
        } else {                //the first one
                cq->base = prio;
                cq->base_offset = 0;
@@ -123,8 +131,8 @@ void classqueue_update_prio(struct classqueue_struct *cq,
        if (! cls_in_classqueue(node)) 
                return;
 
-       index = get_index(cq, &new_pos);
        node->prio = new_pos;
+       index = get_node_index(cq, node);
 
        //remove from the original position
        list_del_init(&(node->list));
@@ -137,10 +145,32 @@ void classqueue_update_prio(struct classqueue_struct *cq,
        node->index = index;
 }
 
+
+static inline void __classqueue_update_base(struct classqueue_struct *cq, 
+                                           int new_base)
+{
+       int max_prio; 
+       if (unlikely(new_base <= cq->base)) // base will never move back
+               return; 
+       if (unlikely(!cq_nr_member(cq))) {  
+               cq->base_offset = 0;
+               cq->base = new_base;        // is this necessary ??
+               return;
+       }
+           
+       max_prio = cq->base + (CLASSQUEUE_SIZE - 1);
+       if (unlikely(new_base > max_prio))
+               new_base = max_prio;
+
+               cq->base_offset = (cq->base_offset + (new_base - cq->base)) & CLASSQUEUE_MASK; 
+       cq->base = new_base;
+}
 /**
  *classqueue_get_min_prio: return the priority of the last node in queue
  *
  * this function can be called without runqueue lock held
+ * return 0 if there's nothing in the queue
  */
 static inline int classqueue_get_min_prio(struct classqueue_struct *cq)
 {
@@ -171,9 +201,13 @@ static inline int classqueue_get_min_prio(struct classqueue_struct *cq)
  */
 cq_node_t *classqueue_get_head(struct classqueue_struct *cq)
 {
-       cq_node_t *result = NULL;
+       cq_node_t *node;
        int pos;
+       int index;
+       int new_base;
 
+search_again:
+       node = NULL;
        /* 
         * search over the bitmap to get the first class in the queue
         */
@@ -183,10 +217,38 @@ cq_node_t *classqueue_get_head(struct classqueue_struct *cq)
                pos = find_first_bit(cq->array.bitmap, CLASSQUEUE_SIZE);
 
        if (pos < CLASSQUEUE_SIZE) {
-               BUG_ON(list_empty(&cq->array.queue[pos]));
-               result = list_entry(cq->array.queue[pos].next, cq_node_t, list);
+               //BUG_ON(list_empty(&cq->array.queue[pos]));
+               node = list_entry(cq->array.queue[pos].next, cq_node_t, list);
        }
-       return result;
+
+       //check if the node need to be repositioned
+       if (likely(! node || ! node->need_repos)) 
+               return node;
+
+       // We need to reposition this node in the class queue
+       // BUG_ON(node->prio == node->real_prio);
+       
+       //remove from the original position
+       list_del_init(&(node->list));
+       if (list_empty(&cq->array.queue[node->index]))
+         __clear_bit(node->index, cq->array.bitmap);
+       
+       new_base = classqueue_get_min_prio(cq);
+       node->prio = node->real_prio;
+       
+       if (! new_base)
+               new_base  = node->real_prio;
+       else if (node->real_prio < new_base)
+               new_base  = node->real_prio;
+       __classqueue_update_base(cq,new_base);
+       
+       index = get_node_index(cq, node);               
+       //add to new positon, round robin for classes with same priority
+       list_add_tail(&(node->list), &cq->array.queue[index]);
+       __set_bit(index, cq->array.bitmap);     
+       node->index = index;
+       
+       goto search_again;              
 }
 
 /**
@@ -198,14 +260,11 @@ void classqueue_update_base(struct classqueue_struct *cq)
        int new_base;
        
        if (! cq_nr_member(cq)) {
-               cq->base_offset = -1;   //not defined
+               cq->base = 0;
+               cq->base_offset = 0;
                return;
        }
 
        new_base = classqueue_get_min_prio(cq);
-       
-       if (new_base > cq->base) {
-               cq->base_offset = get_index(cq, &new_base);
-               cq->base = new_base;
-       }
+               __classqueue_update_base(cq,new_base);
 }
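
The ckrm_classqueue.c changes above replace the old explicit wrap (subtract CLASSQUEUE_SIZE on overflow) with a mask, which only works because CLASSQUEUE_SIZE is a power of two; they also defer out-of-window priorities through need_repos/real_prio and reposition such nodes lazily in classqueue_get_head(). A throwaway illustration of the masking equivalence; the DEMO_* constants are hypothetical:

/*
 * With a power-of-two queue size, masking gives the same circular
 * index as the old "subtract the size on overflow" code.  E.g. with
 * size 128:
 *     base_offset = 100, prio - base = 40
 *     (100 + 40) & 127 == 140 - 128 == 12
 */
#define DEMO_SIZE 128                   /* must be a power of two */
#define DEMO_MASK (DEMO_SIZE - 1)

static inline unsigned long demo_wrap(unsigned long base_offset,
                                      unsigned long delta)
{
        unsigned long index = base_offset + delta;  /* both inputs < DEMO_SIZE */

        return index & DEMO_MASK;  /* old form: if (index >= DEMO_SIZE) index -= DEMO_SIZE; */
}
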
index 5142b2e..26ffc69 100644 (file)
@@ -20,6 +20,28 @@ LIST_HEAD(active_cpu_classes);   // list of active cpu classes; anchor
 
 struct ckrm_cpu_class default_cpu_class_obj;
 
+unsigned int ckrm_sched_mode __cacheline_aligned_in_smp = 
+#ifdef CONFIG_CKRM_CPU_SCHEDULE_AT_BOOT
+                       CKRM_SCHED_MODE_ENABLED;
+#else
+                       CKRM_SCHED_MODE_DISABLED;
+#endif
+
+static int __init ckrm_cpu_enabled_setup(char *str)
+{
+       ckrm_sched_mode = CKRM_SCHED_MODE_ENABLED;
+       return 1;
+}
+
+static int __init ckrm_cpu_disabled_setup(char *str)
+{
+       ckrm_sched_mode = CKRM_SCHED_MODE_DISABLED;
+       return 1;
+}
+
+__setup("ckrmcpu",  ckrm_cpu_enabled_setup);
+__setup("nockrmcpu",ckrm_cpu_disabled_setup);
+
 struct ckrm_cpu_class * get_default_cpu_class(void) {
        return (&default_cpu_class_obj);
 }
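
The __setup() handlers added above mean the CPU controller can be switched at boot, independent of the CONFIG_CKRM_CPU_SCHEDULE_AT_BOOT default, by passing ckrmcpu or nockrmcpu on the kernel command line; for example (illustrative boot entry, image name and root device are placeholders):

    kernel /vmlinuz-2.6.8-planetlab ro root=/dev/sda1 nockrmcpu
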
@@ -28,7 +50,10 @@ struct ckrm_cpu_class * get_default_cpu_class(void) {
 /*                CVT Management                       */
 /*******************************************************/
 
-static inline void check_inactive_class(ckrm_lrq_t * lrq,CVT_t cur_cvt)
+//an absolute bonus of 200ms for classes when reactivated
+#define INTERACTIVE_BONUS(lrq) ((200*NSEC_PER_MS)/local_class_weight(lrq))
+
+static void check_inactive_class(ckrm_lrq_t * lrq,CVT_t cur_cvt)
 {
        CVT_t min_cvt;
        CVT_t bonus;
@@ -37,6 +62,7 @@ static inline void check_inactive_class(ckrm_lrq_t * lrq,CVT_t cur_cvt)
        if (unlikely(! cur_cvt))
                return; 
 
+#define INTERACTIVE_BONUS_SUPPORT 1
 #ifndef INTERACTIVE_BONUS_SUPPORT
 #warning "ACB taking out interactive bonus calculation"        
        bonus = 0;
@@ -50,51 +76,40 @@ static inline void check_inactive_class(ckrm_lrq_t * lrq,CVT_t cur_cvt)
 #endif
 
        //cvt can't be negative
-       if (cur_cvt > bonus)
+       if (likely(cur_cvt > bonus))
                min_cvt = cur_cvt - bonus;
        else
                min_cvt = 0;
-       
-       if (lrq->local_cvt < min_cvt) {
+
+       if (lrq->local_cvt < min_cvt) { 
+               //      if (lrq->local_cvt < min_cvt && ! lrq_nr_running(lrq)) {
                CVT_t lost_cvt;
 
-               lost_cvt = scale_cvt(min_cvt - lrq->local_cvt,lrq);
+               if (unlikely(lrq->local_cvt == 0)) {
+                       lrq->local_cvt = cur_cvt;
+                       return;
+               }
+               lost_cvt = min_cvt - lrq->local_cvt;
+               lost_cvt *= local_class_weight(lrq);
                lrq->local_cvt = min_cvt;
+               BUG_ON(lost_cvt < 0);
 
                /* add what the class lost to its savings*/
-               lrq->savings += lost_cvt;
+#if 1 /*zhq debugging*/
+               lrq->savings += lost_cvt;              
+#endif
                if (lrq->savings > MAX_SAVINGS)
                        lrq->savings = MAX_SAVINGS; 
-       } else if (lrq->savings) {
-               /*
-                *if a class saving and falling behind
-                * then start to use it saving in a leaking bucket way
-                */
-               CVT_t savings_used;
-
-               savings_used = scale_cvt((lrq->local_cvt - min_cvt),lrq);
-               if (savings_used > lrq->savings)
-                       savings_used = lrq->savings;
-               
-               if (savings_used > SAVINGS_LEAK_SPEED)
-                       savings_used = SAVINGS_LEAK_SPEED;
-
-               BUG_ON(lrq->savings < savings_used);
-               lrq->savings -= savings_used;
-               unscale_cvt(savings_used,lrq);
-               BUG_ON(lrq->local_cvt < savings_used);
-#ifndef CVT_SAVINGS_SUPPORT
-#warning "ACB taking out cvt saving"
-#else
-               lrq->local_cvt -= savings_used;
+#if 0 /* zhq debugging*/
+               printk("lrq= %x savings: %llu lost= %llu\n",(int)lrq,lrq->savings,lost_cvt);
 #endif
-       }               
+       }
 }
 
 /*
  * return the max_cvt of all the classes
  */
-static inline CVT_t get_max_cvt(int this_cpu)
+CVT_t get_max_cvt(int this_cpu)
 {
         struct ckrm_cpu_class *clsptr;
         ckrm_lrq_t * lrq;
@@ -102,7 +117,6 @@ static inline CVT_t get_max_cvt(int this_cpu)
 
         max_cvt = 0;
 
-        /*update class time, at the same time get max_cvt */
         list_for_each_entry(clsptr, &active_cpu_classes, links) {
                 lrq = get_ckrm_lrq(clsptr, this_cpu);
                 if (lrq->local_cvt > max_cvt)
@@ -112,6 +126,23 @@ static inline CVT_t get_max_cvt(int this_cpu)
        return max_cvt;
 }
 
+CVT_t get_min_cvt(int this_cpu)
+{
+        struct ckrm_cpu_class *clsptr;
+        ckrm_lrq_t * lrq;
+        CVT_t max_cvt;
+
+        max_cvt = 0xFFFFFFFFFFFFFLLU;
+
+        list_for_each_entry(clsptr, &active_cpu_classes, links) {
+                lrq = get_ckrm_lrq(clsptr, this_cpu);
+                if (lrq->local_cvt < max_cvt)
+                        max_cvt = lrq->local_cvt;
+        }
+
+       return max_cvt;
+}
+
 /**
  * update_class_cputime - updates cvt of inactive classes
  * -- an inactive class shouldn't starve others when it comes back
@@ -120,7 +151,7 @@ static inline CVT_t get_max_cvt(int this_cpu)
  * 
  * class_list_lock must have been acquired 
  */
-void update_class_cputime(int this_cpu)
+void update_class_cputime(int this_cpu, int idle)
 {
        struct ckrm_cpu_class *clsptr;
        ckrm_lrq_t * lrq;
 {
        struct ckrm_cpu_class *clsptr;
        ckrm_lrq_t * lrq;
@@ -178,24 +209,45 @@ void update_class_cputime(int this_cpu)
 /*******************************************************/
 /*                PID load balancing stuff             */
 /*******************************************************/
-#define PID_SAMPLE_T 32
 #define PID_KP 20
 #define PID_KI 60
 #define PID_KD 20
 
+/*
+ * runqueue load is the local_weight of all the classes on this cpu
+ * must be called with class_list_lock held
+ */
+static unsigned long ckrm_cpu_load(int cpu)
+{
+       struct ckrm_cpu_class *clsptr;
+       ckrm_lrq_t* lrq;
+       struct ckrm_cpu_demand_stat* l_stat;
+       int total_load = 0;
+       int load;
+
+       list_for_each_entry(clsptr,&active_cpu_classes,links) {
+               lrq =  get_ckrm_lrq(clsptr,cpu);
+               l_stat = get_cls_local_stat(clsptr,cpu);
+
+               load = WEIGHT_TO_SHARE(lrq->local_weight);
+               
+               if (l_stat->cpu_demand < load)
+                       load = l_stat->cpu_demand;
+               total_load += load;
+       }       
+       return total_load;
+}
+
+
 /**
  * sample pid load periodically
  */
 /**
  * sample pid load periodically
  */
+
 void ckrm_load_sample(ckrm_load_t* pid,int cpu)
 {
        long load;
        long err;
 
 void ckrm_load_sample(ckrm_load_t* pid,int cpu)
 {
        long load;
        long err;
 
-       if (jiffies % PID_SAMPLE_T)
-               return;
-
-       adjust_local_weight();  
-
        load = ckrm_cpu_load(cpu);
        err = load - pid->load_p;
        pid->load_d = err;
        load = ckrm_cpu_load(cpu);
        err = load - pid->load_p;
        pid->load_d = err;
@@ -205,7 +257,7 @@ void ckrm_load_sample(ckrm_load_t* pid,int cpu)
        pid->load_i /= 10;
 }
 
        pid->load_i /= 10;
 }
 
-long pid_get_pressure(ckrm_load_t* ckrm_load, int local_group)
+long ckrm_get_pressure(ckrm_load_t* ckrm_load, int local_group)
 {
        long pressure;
        pressure = ckrm_load->load_p * PID_KP;
 {
        long pressure;
        pressure = ckrm_load->load_p * PID_KP;
@@ -214,3 +266,58 @@ long pid_get_pressure(ckrm_load_t* ckrm_load, int local_group)
        pressure /= 100;
        return pressure;
 }
        pressure /= 100;
        return pressure;
 }
+
+/*
+ *  called after a task is switched out. Update the local cvt accounting 
+ *  we need to stick with long instead of long long due to nonexistent 
+ *  64-bit division
+ */
+void update_local_cvt(struct task_struct *p, unsigned long nsec)
+{
+       ckrm_lrq_t * lrq = get_task_lrq(p);
+       unsigned long cvt_inc;
+
+       /*
+        * consume from savings if eshare is larger than egrt
+        */
+       if (lrq->savings && lrq->over_weight) {
+               unsigned long savings_used;
+
+               savings_used = nsec;
+               savings_used >>= CKRM_WEIGHT_SHIFT;
+               savings_used *= lrq->over_weight;
+               if (savings_used > lrq->savings)
+                       savings_used = lrq->savings;
+               lrq->savings -= savings_used;   
+       }
+
+       //BUG_ON(local_class_weight(lrq) == 0);
+       cvt_inc = nsec / local_class_weight(lrq); 
+
+       /* 
+        * For a certain processor, CKRM allocates CPU time propotional 
+        * to the class's local_weight. So once a class consumed nsec, 
+        * it will wait for X (nsec) for its next turn.
+        *
+        * X is calculated based on the following fomular
+        *     nsec / local_weight < X / (CKRM_MAX_WEIGHT - local_weight)
+        * if local_weight is small, then approximated as
+        *     nsec / local_weight < X / (CKRM_MAX_WEIGHT)
+        */
+#define CVT_STARVATION_LIMIT (200LL*NSEC_PER_MS)
+#define CVT_STARVATION_INC_LIMIT (CVT_STARVATION_LIMIT >> CKRM_WEIGHT_SHIFT)
+
+       if (unlikely(lrq->skewed_weight)) {
+               unsigned long long starvation_limit = CVT_STARVATION_INC_LIMIT;
+               
+               starvation_limit *= local_class_weight(lrq);
+               if (unlikely(cvt_inc > starvation_limit))         
+                       cvt_inc = nsec / lrq->skewed_weight;
+       }
+
+       /* now update the CVT accounting */
+
+       lrq->local_cvt += cvt_inc;
+       lrq->uncounted_ns += nsec;
+       update_class_priority(lrq);
+}
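
The comment inside update_local_cvt() above gives the waiting-time bound nsec / local_weight < X / (CKRM_MAX_WEIGHT - local_weight). A worked instance of that bound follows; the numbers are illustrative and CKRM_MAX_WEIGHT is assumed to be 1024 here, which may not match the real constant:

/*
 * With local_weight = 256 (a quarter of the assumed total of 1024) and
 * nsec = 4ms:
 *
 *     X > nsec * (CKRM_MAX_WEIGHT - local_weight) / local_weight
 *       = 4ms * (1024 - 256) / 256 = 12ms
 *
 * so the class runs about 4ms out of every 16ms, i.e. a quarter of the
 * CPU, matching its weight.
 */
static inline unsigned long demo_min_wait_ns(unsigned long nsec,
                                             unsigned long local_weight,
                                             unsigned long max_weight)
{
        return nsec * (max_weight - local_weight) / local_weight;
}
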
diff --git a/kernel/kexec.c b/kernel/kexec.c
new file mode 100644 (file)
index 0000000..b59023f
--- /dev/null
@@ -0,0 +1,640 @@
+/*
+ * kexec.c - kexec system call
+ * Copyright (C) 2002-2004 Eric Biederman  <ebiederm@xmission.com>
+ *
+ * This source code is licensed under the GNU General Public License,
+ * Version 2.  See the file COPYING for more details.
+ */
+
+#include <linux/mm.h>
+#include <linux/file.h>
+#include <linux/slab.h>
+#include <linux/fs.h>
+#include <linux/kexec.h>
+#include <linux/spinlock.h>
+#include <linux/list.h>
+#include <linux/highmem.h>
+#include <net/checksum.h>
+#include <asm/page.h>
+#include <asm/uaccess.h>
+#include <asm/io.h>
+#include <asm/system.h>
+
+/*
+ * When kexec transitions to the new kernel there is a one-to-one
+ * mapping between physical and virtual addresses.  On processors
+ * where you can disable the MMU this is trivial, and easy.  For
+ * others it is still a simple predictable page table to setup.
+ *
+ * In that environment kexec copies the new kernel to its final
+ * resting place.  This means I can only support memory whose
+ * physical address can fit in an unsigned long.  In particular
+ * addresses where (pfn << PAGE_SHIFT) > ULONG_MAX cannot be handled.
+ * If the assembly stub has more restrictive requirements
+ * KEXEC_SOURCE_MEMORY_LIMIT and KEXEC_DEST_MEMORY_LIMIT can be
+ * defined more restrictively in <asm/kexec.h>.
+ *
+ * The code for the transition from the current kernel to the
+ * new kernel is placed in the control_code_buffer, whose size
+ * is given by KEXEC_CONTROL_CODE_SIZE.  In the best case only a single
+ * page of memory is necessary, but some architectures require more.
+ * Because this memory must be identity mapped in the transition from
+ * virtual to physical addresses it must live in the range
+ * 0 - TASK_SIZE, as only the user space mappings are arbitrarily
+ * modifiable.
+ *
+ * The assembly stub in the control code buffer is passed a linked list
+ * of descriptor pages detailing the source pages of the new kernel,
+ * and the destination addresses of those source pages.  As this data
+ * structure is not used in the context of the current OS, it must
+ * be self-contained.
+ *
+ * The code has been made to work with highmem pages and will use a
+ * destination page in its final resting place (if it happens
+ * to allocate it).  The end product of this is that most of the
+ * physical address space, and most of RAM can be used.
+ *
+ * Future directions include:
+ *  - allocating a page table with the control code buffer identity
+ *    mapped, to simplify machine_kexec and make kexec_on_panic more
+ *    reliable.
+ */
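The loader below works on an array of kexec_segment descriptors copied in from user space. The sketch that follows is inferred from how the fields are used in this file (buf/bufsz for the user buffer, mem/memsz for the destination); the authoritative definition is in include/linux/kexec.h, added elsewhere in this commit, so the exact types here are an assumption.

    #include <stddef.h>

    /* Assumed field layout, inferred from the uses of segment->buf,
     * ->bufsz, ->mem and ->memsz below; not copied from the real header. */
    struct kexec_segment_sketch {
            void   *buf;    /* user-space buffer holding this piece of the image */
            size_t  bufsz;  /* bytes valid in buf                                */
            void   *mem;    /* physical destination address                      */
            size_t  memsz;  /* destination size; the tail beyond bufsz is zeroed */
    };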
+
+/*
+ * KIMAGE_NO_DEST is an impossible destination address..., for
+ * allocating pages whose destination address we do not care about.
+ */
+#define KIMAGE_NO_DEST (-1UL)
+
+static int kimage_is_destination_range(
+       struct kimage *image, unsigned long start, unsigned long end);
+static struct page *kimage_alloc_page(struct kimage *image, unsigned int gfp_mask, unsigned long dest);
+
+
+static int kimage_alloc(struct kimage **rimage,
+       unsigned long nr_segments, struct kexec_segment *segments)
+{
+       int result;
+       struct kimage *image;
+       size_t segment_bytes;
+       unsigned long i;
+
+       /* Allocate a controlling structure */
+       result = -ENOMEM;
+       image = kmalloc(sizeof(*image), GFP_KERNEL);
+       if (!image) {
+               goto out;
+       }
+       memset(image, 0, sizeof(*image));
+       image->head = 0;
+       image->entry = &image->head;
+       image->last_entry = &image->head;
+
+       /* Initialize the list of control pages */
+       INIT_LIST_HEAD(&image->control_pages);
+
+       /* Initialize the list of destination pages */
+       INIT_LIST_HEAD(&image->dest_pages);
+
+       /* Initialize the list of unuseable pages */
+       INIT_LIST_HEAD(&image->unuseable_pages);
+
+       /* Read in the segments */
+       image->nr_segments = nr_segments;
+       segment_bytes = nr_segments * sizeof*segments;
+       result = copy_from_user(image->segment, segments, segment_bytes);
+       if (result)
+               goto out;
+
+       /*
+        * Verify we have good destination addresses.  The caller is
+        * responsible for making certain we don't attempt to load
+        * the new image into invalid or reserved areas of RAM.  This
+        * just verifies it is an address we can use.
+        */
+       result = -EADDRNOTAVAIL;
+       for (i = 0; i < nr_segments; i++) {
+               unsigned long mend;
+               mend = ((unsigned long)(image->segment[i].mem)) +
+                       image->segment[i].memsz;
+               if (mend >= KEXEC_DESTINATION_MEMORY_LIMIT)
+                       goto out;
+       }
+
+       /*
+        * Find a location for the control code buffer, and add it
+        * the vector of segments so that it's pages will also be
+        * counted as destination pages.
+        */
+       result = -ENOMEM;
+       image->control_code_page = kimage_alloc_control_pages(image,
+               get_order(KEXEC_CONTROL_CODE_SIZE));
+       if (!image->control_code_page) {
+               printk(KERN_ERR "Could not allocate control_code_buffer\n");
+               goto out;
+       }
+
+       result = 0;
+ out:
+       if (result == 0) {
+               *rimage = image;
+       } else {
+               kfree(image);
+       }
+       return result;
+}
+
+static int kimage_is_destination_range(
+       struct kimage *image, unsigned long start, unsigned long end)
+{
+       unsigned long i;
+
+       for (i = 0; i < image->nr_segments; i++) {
+               unsigned long mstart, mend;
+               mstart = (unsigned long)image->segment[i].mem;
+               mend   = mstart + image->segment[i].memsz;
+               if ((end > mstart) && (start < mend)) {
+                       return 1;
+               }
+       }
+       return 0;
+}
+
+static struct page *kimage_alloc_pages(unsigned int gfp_mask, unsigned int order)
+{
+       struct page *pages;
+       pages = alloc_pages(gfp_mask, order);
+       if (pages) {
+               unsigned int count, i;
+               pages->mapping = NULL;
+               pages->private = order;
+               count = 1 << order;
+               for(i = 0; i < count; i++) {
+                       SetPageReserved(pages + i);
+               }
+       }
+       return pages;
+}
+
+static void kimage_free_pages(struct page *page)
+{
+       unsigned int order, count, i;
+       order = page->private;
+       count = 1 << order;
+       for(i = 0; i < count; i++) {
+               ClearPageReserved(page + i);
+       }
+       __free_pages(page, order);
+}
+
+static void kimage_free_page_list(struct list_head *list)
+{
+       struct list_head *pos, *next;
+       list_for_each_safe(pos, next, list) {
+               struct page *page;
+
+               page = list_entry(pos, struct page, lru);
+               list_del(&page->lru);
+
+               kimage_free_pages(page);
+       }
+}
+
+struct page *kimage_alloc_control_pages(struct kimage *image, unsigned int order)
+{
+       /* Control pages are special, they are the intermediaries
+        * that are needed while we copy the rest of the pages
+        * to their final resting place.  As such they must
+        * not conflict with either the destination addresses
+        * or memory the kernel is already using.
+        *
+        * The only case where we really need more than one of
+        * these is for architectures where we cannot disable
+        * the MMU and must instead generate an identity mapped
+        * page table for all of the memory.
+        *
+        * At worst this runs in O(N) of the image size.
+        */
+       struct list_head extra_pages;
+       struct page *pages;
+       unsigned int count;
+
+       count = 1 << order;
+       INIT_LIST_HEAD(&extra_pages);
+
+       /* Loop while I can allocate a page and the page allocated
+        * is a destination page.
+        */
+       do {
+               unsigned long pfn, epfn, addr, eaddr;
+               pages = kimage_alloc_pages(GFP_KERNEL, order);
+               if (!pages)
+                       break;
+               pfn   = page_to_pfn(pages);
+               epfn  = pfn + count;
+               addr  = pfn << PAGE_SHIFT;
+               eaddr = epfn << PAGE_SHIFT;
+               if ((epfn >= (KEXEC_CONTROL_MEMORY_LIMIT >> PAGE_SHIFT)) ||
+                       kimage_is_destination_range(image, addr, eaddr))
+               {
+                       list_add(&pages->lru, &extra_pages);
+                       pages = NULL;
+               }
+       } while(!pages);
+       if (pages) {
+               /* Remember the allocated page... */
+               list_add(&pages->lru, &image->control_pages);
+
+               /* Because the page is already in its destination
+                * location we will never allocate another page at
+                * that address.  Therefore kimage_alloc_pages
+                * will not return it (again) and we don't need
+                * to give it an entry in image->segment[].
+                */
+       }
+       /* Deal with the destination pages I have inadvertently allocated.
+        *
+        * Ideally I would convert multi-page allocations into single
+        * page allocations, and add everything to image->dest_pages.
+        *
+        * For now it is simpler to just free the pages.
+        */
+       kimage_free_page_list(&extra_pages);
+       return pages;
+
+}
+
+static int kimage_add_entry(struct kimage *image, kimage_entry_t entry)
+{
+       if (*image->entry != 0) {
+               image->entry++;
+       }
+       if (image->entry == image->last_entry) {
+               kimage_entry_t *ind_page;
+               struct page *page;
+               page = kimage_alloc_page(image, GFP_KERNEL, KIMAGE_NO_DEST);
+               if (!page) {
+                       return -ENOMEM;
+               }
+               ind_page = page_address(page);
+               *image->entry = virt_to_phys(ind_page) | IND_INDIRECTION;
+               image->entry = ind_page;
+               image->last_entry =
+                       ind_page + ((PAGE_SIZE/sizeof(kimage_entry_t)) - 1);
+       }
+       *image->entry = entry;
+       image->entry++;
+       *image->entry = 0;
+       return 0;
+}
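kimage_add_entry builds the flat list that the relocation stub later replays while a one-to-one mapping is in place. The sketch below shows how such a list would be consumed; the IND_* flag values and PAGE_SIZE are assumptions standing in for the definitions in <linux/kexec.h> and <asm/page.h>.

    #include <string.h>

    /* Hedged sketch of how the machine-specific stub walks the list built
     * by kimage_add_entry(); flag values and page size are assumed. */
    #define SK_PAGE_SIZE       4096UL
    #define SK_PAGE_MASK       (~(SK_PAGE_SIZE - 1))
    #define SK_IND_DESTINATION 0x1UL
    #define SK_IND_INDIRECTION 0x2UL
    #define SK_IND_DONE        0x4UL
    #define SK_IND_SOURCE      0x8UL

    static void apply_entries(unsigned long *ptr)
    {
            void *dest = NULL;
            unsigned long entry;

            while ((entry = *ptr) && !(entry & SK_IND_DONE)) {
                    if (entry & SK_IND_DESTINATION) {
                            /* set the running destination pointer */
                            dest = (void *)(entry & SK_PAGE_MASK);
                            ptr++;
                    } else if (entry & SK_IND_SOURCE) {
                            /* copy one source page to the destination */
                            memcpy(dest, (void *)(entry & SK_PAGE_MASK), SK_PAGE_SIZE);
                            dest = (char *)dest + SK_PAGE_SIZE;
                            ptr++;
                    } else if (entry & SK_IND_INDIRECTION) {
                            /* continue reading entries from the next page */
                            ptr = (unsigned long *)(entry & SK_PAGE_MASK);
                    } else {
                            ptr++;  /* empty slot */
                    }
            }
    }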
+
+static int kimage_set_destination(
+       struct kimage *image, unsigned long destination)
+{
+       int result;
+
+       destination &= PAGE_MASK;
+       result = kimage_add_entry(image, destination | IND_DESTINATION);
+       if (result == 0) {
+               image->destination = destination;
+       }
+       return result;
+}
+
+
+static int kimage_add_page(struct kimage *image, unsigned long page)
+{
+       int result;
+
+       page &= PAGE_MASK;
+       result = kimage_add_entry(image, page | IND_SOURCE);
+       if (result == 0) {
+               image->destination += PAGE_SIZE;
+       }
+       return result;
+}
+
+
+static void kimage_free_extra_pages(struct kimage *image)
+{
+       /* Walk through and free any extra destination pages I may have */
+       kimage_free_page_list(&image->dest_pages);
+
+       /* Walk through and free any unuseable pages I have cached */
+       kimage_free_page_list(&image->unuseable_pages);
+
+}
+static int kimage_terminate(struct kimage *image)
+{
+       int result;
+
+       result = kimage_add_entry(image, IND_DONE);
+       if (result == 0) {
+               /* Point at the terminating element */
+               image->entry--;
+               kimage_free_extra_pages(image);
+       }
+       return result;
+}
+
+#define for_each_kimage_entry(image, ptr, entry) \
+       for (ptr = &image->head; (entry = *ptr) && !(entry & IND_DONE); \
+               ptr = (entry & IND_INDIRECTION)? \
+                       phys_to_virt((entry & PAGE_MASK)): ptr +1)
+
+static void kimage_free_entry(kimage_entry_t entry)
+{
+       struct page *page;
+
+       page = pfn_to_page(entry >> PAGE_SHIFT);
+       kimage_free_pages(page);
+}
+
+static void kimage_free(struct kimage *image)
+{
+       kimage_entry_t *ptr, entry;
+       kimage_entry_t ind = 0;
+
+       if (!image)
+               return;
+       kimage_free_extra_pages(image);
+       for_each_kimage_entry(image, ptr, entry) {
+               if (entry & IND_INDIRECTION) {
+                       /* Free the previous indirection page */
+                       if (ind & IND_INDIRECTION) {
+                               kimage_free_entry(ind);
+                       }
+                       /* Save this indirection page until we are
+                        * done with it.
+                        */
+                       ind = entry;
+               }
+               else if (entry & IND_SOURCE) {
+                       kimage_free_entry(entry);
+               }
+       }
+       /* Free the final indirection page */
+       if (ind & IND_INDIRECTION) {
+               kimage_free_entry(ind);
+       }
+
+       /* Handle any machine specific cleanup */
+       machine_kexec_cleanup(image);
+
+       /* Free the kexec control pages... */
+       kimage_free_page_list(&image->control_pages);
+       kfree(image);
+}
+
+static kimage_entry_t *kimage_dst_used(struct kimage *image, unsigned long page)
+{
+       kimage_entry_t *ptr, entry;
+       unsigned long destination = 0;
+
+       for_each_kimage_entry(image, ptr, entry) {
+               if (entry & IND_DESTINATION) {
+                       destination = entry & PAGE_MASK;
+               }
+               else if (entry & IND_SOURCE) {
+                       if (page == destination) {
+                               return ptr;
+                       }
+                       destination += PAGE_SIZE;
+               }
+       }
+       return 0;
+}
+
+static struct page *kimage_alloc_page(struct kimage *image, unsigned int gfp_mask, unsigned long destination)
+{
+       /*
+        * Here we implement safeguards to ensure that a source page
+        * is not copied to its destination page before the data on
+        * the destination page is no longer useful.
+        *
+        * To do this we maintain the invariant that a source page is
+        * either its own destination page, or it is not a
+        * destination page at all.
+        *
+        * That is slightly stronger than required, but it makes the
+        * proof that no problems can occur trivial, and the
+        * implementation simple to verify.
+        *
+        * When allocating all pages normally this algorithm will run
+        * in O(N) time, but in the worst case it will run in O(N^2)
+        * time.   If the runtime is a problem the data structures can
+        * be fixed.
+        */
+       struct page *page;
+       unsigned long addr;
+
+       /*
+        * Walk through the list of destination pages, and see if I
+        * have a match.
+        */
+       list_for_each_entry(page, &image->dest_pages, lru) {
+               addr = page_to_pfn(page) << PAGE_SHIFT;
+               if (addr == destination) {
+                       list_del(&page->lru);
+                       return page;
+               }
+       }
+       page = NULL;
+       while (1) {
+               kimage_entry_t *old;
+
+               /* Allocate a page, if we run out of memory give up */
+               page = kimage_alloc_pages(gfp_mask, 0);
+               if (!page) {
+                       return 0;
+               }
+               /* If the page cannot be used file it away */
+               if (page_to_pfn(page) > (KEXEC_SOURCE_MEMORY_LIMIT >> PAGE_SHIFT)) {
+                       list_add(&page->lru, &image->unuseable_pages);
+                       continue;
+               }
+               addr = page_to_pfn(page) << PAGE_SHIFT;
+
+               /* If it is the destination page we want, use it */
+               if (addr == destination)
+                       break;
+
+               /* If the page is not a destination page use it */
+               if (!kimage_is_destination_range(image, addr, addr + PAGE_SIZE))
+                       break;
+
+               /*
+                * I know that the page is someone's destination page.
+                * See if there is already a source page for this
+                * destination page.  And if so swap the source pages.
+                */
+               old = kimage_dst_used(image, addr);
+               if (old) {
+                       /* If so move it */
+                       unsigned long old_addr;
+                       struct page *old_page;
+
+                       old_addr = *old & PAGE_MASK;
+                       old_page = pfn_to_page(old_addr >> PAGE_SHIFT);
+                       copy_highpage(page, old_page);
+                       *old = addr | (*old & ~PAGE_MASK);
+
+                       /* The old page I have found cannot be a
+                        * destination page, so return it.
+                        */
+                       addr = old_addr;
+                       page = old_page;
+                       break;
+               }
+               else {
+                       /* Place the page on the destination list; I
+                        * will use it later.
+                        */
+                       list_add(&page->lru, &image->dest_pages);
+               }
+       }
+       return page;
+}
+
+static int kimage_load_segment(struct kimage *image,
+       struct kexec_segment *segment)
+{
+       unsigned long mstart;
+       int result;
+       unsigned long offset;
+       unsigned long offset_end;
+       unsigned char *buf;
+
+       result = 0;
+       buf = segment->buf;
+       mstart = (unsigned long)segment->mem;
+
+       offset_end = segment->memsz;
+
+       result = kimage_set_destination(image, mstart);
+       if (result < 0) {
+               goto out;
+       }
+       for (offset = 0;  offset < segment->memsz; offset += PAGE_SIZE) {
+               struct page *page;
+               char *ptr;
+               size_t size, leader;
+               page = kimage_alloc_page(image, GFP_HIGHUSER, mstart + offset);
+               if (page == 0) {
+                       result  = -ENOMEM;
+                       goto out;
+               }
+               result = kimage_add_page(image, page_to_pfn(page) << PAGE_SHIFT);
+               if (result < 0) {
+                       goto out;
+               }
+               ptr = kmap(page);
+               if (segment->bufsz < offset) {
+                       /* We are past the end; zero the whole page */
+                       memset(ptr, 0, PAGE_SIZE);
+                       kunmap(page);
+                       continue;
+               }
+               size = PAGE_SIZE;
+               leader = 0;
+               if ((offset == 0)) {
+                       leader = mstart & ~PAGE_MASK;
+               }
+               if (leader) {
+                       /* We are on the first page; zero the unused portion */
+                       memset(ptr, 0, leader);
+                       size -= leader;
+                       ptr += leader;
+               }
+               if (size > (segment->bufsz - offset)) {
+                       size = segment->bufsz - offset;
+               }
+               if (size < (PAGE_SIZE - leader)) {
+                       /* zero the trailing part of the page */
+                       memset(ptr + size, 0, (PAGE_SIZE - leader) - size);
+               }
+               result = copy_from_user(ptr, buf + offset, size);
+               kunmap(page);
+               if (result) {
+                       result = (result < 0) ? result : -EIO;
+                       goto out;
+               }
+       }
+ out:
+       return result;
+}
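To see how the leader/size/trailing-zero arithmetic above carves up an unaligned segment, here is a stand-alone sketch with made-up numbers (a destination 0x200 bytes into a page and a buffer shorter than memsz); it reproduces only the bookkeeping, not the page allocation or copy_from_user.

    #include <stdio.h>

    #define SK_PAGE_SIZE 4096UL
    #define SK_PAGE_MASK (~(SK_PAGE_SIZE - 1))

    int main(void)
    {
            unsigned long mstart = 0x100200;   /* hypothetical, not page aligned */
            unsigned long bufsz  = 5000;       /* bytes supplied by user space   */
            unsigned long memsz  = 3 * SK_PAGE_SIZE;
            unsigned long offset;

            for (offset = 0; offset < memsz; offset += SK_PAGE_SIZE) {
                    unsigned long size = SK_PAGE_SIZE, leader = 0;

                    if (bufsz < offset) {      /* wholly past the data: zero page */
                            printf("chunk@%#lx: all zeroes\n", mstart + offset);
                            continue;
                    }
                    if (offset == 0)
                            leader = mstart & ~SK_PAGE_MASK;   /* 0x200 here */
                    size -= leader;
                    if (size > bufsz - offset)
                            size = bufsz - offset;
                    printf("chunk@%#lx: lead-zero %lu, copy %lu, tail-zero %lu\n",
                           mstart + offset, leader, size,
                           SK_PAGE_SIZE - leader - size);
            }
            return 0;
    }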
+
+/*
+ * Exec Kernel system call: for obvious reasons only root may call it.
+ *
+ * This call breaks up into three pieces.
+ * - A generic part which loads the new kernel from the current
+ *   address space, and very carefully places the data in the
+ *   allocated pages.
+ *
+ * - A generic part that interacts with the kernel and tells all of
+ *   the devices to shut down, preventing ongoing DMAs and placing
+ *   the devices in a consistent state so a later kernel can
+ *   reinitialize them.
+ *
+ * - A machine specific part that includes the syscall number
+ *   and then copies the image to its final destination and
+ *   jumps into the image at entry.
+ *
+ * kexec does not sync or unmount filesystems, so if you need
+ * that to happen you have to do it yourself.
+ */
+struct kimage *kexec_image = NULL;
+
+asmlinkage long sys_kexec_load(unsigned long entry, unsigned long nr_segments,
+       struct kexec_segment *segments, unsigned long flags)
+{
+       struct kimage *image;
+       int result;
+
+       /* We only trust the superuser with rebooting the system. */
+       if (!capable(CAP_SYS_BOOT))
+               return -EPERM;
+
+       /*
+        * In case we need just a little bit of special behavior for
+        * reboot on panic.
+        */
+       if (flags != 0)
+               return -EINVAL;
+
+       if (nr_segments > KEXEC_SEGMENT_MAX)
+               return -EINVAL;
+
+       image = NULL;
+       result = 0;
+
+       if (nr_segments > 0) {
+               unsigned long i;
+               result = kimage_alloc(&image, nr_segments, segments);
+               if (result) {
+                       goto out;
+               }
+               result = machine_kexec_prepare(image);
+               if (result) {
+                       goto out;
+               }
+               image->start = entry;
+               for (i = 0; i < nr_segments; i++) {
+                       result = kimage_load_segment(image, &image->segment[i]);
+                       if (result) {
+                               goto out;
+                       }
+               }
+               result = kimage_terminate(image);
+               if (result) {
+                       goto out;
+               }
+       }
+
+       image = xchg(&kexec_image, image);
+
+ out:
+       kimage_free(image);
+       return result;
+}
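From user space the call would look roughly like the hypothetical sketch below. __NR_kexec_load is assumed to be wired up by the architecture part of this commit, and the segment structure mirrors the assumed layout noted earlier; none of this is taken from real kexec-tools code.

    #define _GNU_SOURCE
    #include <stdlib.h>
    #include <string.h>
    #include <unistd.h>
    #include <sys/syscall.h>

    struct kexec_segment_user {            /* assumed layout, see linux/kexec.h */
            void *buf;   size_t bufsz;
            void *mem;   size_t memsz;
    };

    /* Load a single-segment image at physical address dest, entry point entry. */
    int load_kernel_blob(void *blob, size_t len, unsigned long dest, unsigned long entry)
    {
            struct kexec_segment_user seg = {
                    .buf = blob, .bufsz = len,
                    .mem = (void *)dest,
                    .memsz = (len + 4095) & ~4095UL,   /* round up to a page */
            };
    #ifdef __NR_kexec_load
            return syscall(__NR_kexec_load, entry, 1UL, &seg, 0UL);
    #else
            (void)seg; (void)entry;
            return -1;                                  /* syscall not wired up */
    #endif
    }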
diff --git a/kernel/sched.c b/kernel/sched.c
index 20b0921..42af615 100644 (file)
@@ -45,6 +45,8 @@
 #include <asm/tlb.h>
 
 #include <asm/unistd.h>
+#include <linux/ckrm_classqueue.h>
+#include <linux/ckrm_sched.h>
 
 #ifdef CONFIG_NUMA
 #define cpu_to_node_mask(cpu) node_to_cpumask(cpu_to_node(cpu))
 
@@ -205,8 +207,6 @@ unsigned int task_timeslice(task_t *p)
  */
 
 typedef struct runqueue runqueue_t;
-#include <linux/ckrm_classqueue.h>
-#include <linux/ckrm_sched.h>
 
 /*
  * This is the main, per-CPU runqueue data structure.
 
@@ -227,17 +227,19 @@ struct runqueue {
        unsigned long cpu_load;
 #endif
        unsigned long long nr_switches, nr_preempt;
-       unsigned long expired_timestamp, nr_uninterruptible;
+       unsigned long nr_uninterruptible;
        unsigned long long timestamp_last_tick;
        task_t *curr, *idle;
        struct mm_struct *prev_mm;
 #ifdef CONFIG_CKRM_CPU_SCHEDULE
        struct classqueue_struct classqueue;   
        ckrm_load_t ckrm_load;
+       ckrm_lrq_t   dflt_lrq; /* local runqueue of the default class */
 #else
         prio_array_t *active, *expired, arrays[2];
-#endif
+       unsigned long expired_timestamp;
        int best_expired_prio;
+#endif
        atomic_t nr_iowait;
 
 #ifdef CONFIG_SMP
@@ -320,10 +322,72 @@ static inline void rq_unlock(runqueue_t *rq)
        spin_unlock_irq(&rq->lock);
 }
 
+static inline void idle_balance(int this_cpu, runqueue_t *this_rq);
+static inline void wake_sleeping_dependent(int cpu, runqueue_t *rq);
+
 #ifdef CONFIG_CKRM_CPU_SCHEDULE
+
+#define ckrm_rq_cpu_disabled(rq) (!rq->classqueue.enabled)
+#define ckrm_rq_cpu_enabled(rq)  ( rq->classqueue.enabled)
+
+static inline void class_enqueue_task(struct task_struct *p,
+                                     prio_array_t * array)
+{
+       ckrm_lrq_t *lrq;
+       int effective_prio;
+       
+       if (ckrm_rq_cpu_disabled(task_rq(p)))
+               return;
+       
+       lrq = get_task_lrq(p);
+       // BUG_ON(lrq==NULL); 
+       
+       cpu_demand_event(&p->demand_stat,CPU_DEMAND_ENQUEUE,0);
+       lrq->lrq_load += task_load(p);
+       
+       if ((p->prio < lrq->top_priority) && (array == lrq->active))
+               set_top_priority(lrq, p->prio); 
+       
+       if (! cls_in_classqueue(&lrq->classqueue_linkobj)) {
+               cpu_demand_event(get_task_lrq_stat(p),CPU_DEMAND_ENQUEUE,0);
+               effective_prio = get_effective_prio(lrq);
+               classqueue_enqueue(lrq->classqueue, &lrq->classqueue_linkobj, 
+                                  effective_prio);
+       } 
+       
+}
+
+static inline void class_dequeue_task(struct task_struct *p,
+                                     prio_array_t * array)
+{
+       ckrm_lrq_t *lrq;
+       unsigned long load;
+       
+       if (ckrm_rq_cpu_disabled(task_rq(p)))
+               return;
+       
+       lrq = get_task_lrq(p);
+       load = task_load(p); 
+       
+       // BUG_ON(lrq->lrq_load < load);        
+       
+       lrq->lrq_load -= load;
+       
+       cpu_demand_event(&p->demand_stat,CPU_DEMAND_DEQUEUE,0);
+       
+       if ((array == lrq->active) && (p->prio == lrq->top_priority)
+           && list_empty(&(array->queue[p->prio])))
+               set_top_priority(lrq,find_next_bit(array->bitmap, MAX_PRIO,
+                                                  p->prio));
+}
+
 static inline ckrm_lrq_t *rq_get_next_class(struct runqueue *rq)
 {
-       cq_node_t *node = classqueue_get_head(&rq->classqueue);
+       cq_node_t *node;
+
+       if (ckrm_rq_cpu_disabled(rq)) 
+               return &rq->dflt_lrq;
+       node = classqueue_get_head(&rq->classqueue);
        return ((node) ? class_list_entry(node) : NULL);
 }
 
@@ -342,51 +406,189 @@ CVT_t get_local_cur_cvt(int cpu)
                return 0;
 }
 
-static inline struct task_struct * rq_get_next_task(struct runqueue* rq) 
+static inline struct task_struct * rq_get_next_task(struct runqueue* rq,
+                                                   int cpu) 
 {
        prio_array_t               *array;
        struct task_struct         *next;
        ckrm_lrq_t *queue;
        int idx;
-       int cpu = smp_processor_id();
 
 
-       // it is guaranteed be the ( rq->nr_running > 0 ) check in 
-       // schedule that a task will be found.
+       if (ckrm_rq_cpu_disabled(rq)) {
+               /* original code from schedule(void) 
+                * see also code in non CKRM configuration
+                */
+               struct list_head *array_queue;
+               ckrm_lrq_t  *lrq = get_ckrm_lrq(get_default_cpu_class(),cpu);
+
+               if (unlikely(!rq->nr_running)) {
+                       idle_balance(cpu, rq);
+                       if (!rq->nr_running) {
+                               rq->dflt_lrq.expired_timestamp = 0;
+                               wake_sleeping_dependent(cpu, rq);
+                               return NULL;
+                       }
+               }
+
+               array = lrq->active;
+               if (unlikely(!array->nr_active)) {
+                       /*
+                        * Switch the active and expired arrays.
+                        */
+                       lrq->active = lrq->expired;
+                       lrq->expired = array;
+                       array = lrq->active; 
+                       lrq->expired_timestamp = 0;
+                       lrq->best_expired_prio = MAX_PRIO;
+               }
+
+               idx = sched_find_first_bit(array->bitmap);
+               array_queue = array->queue + idx;
+               next = list_entry(array_queue->next, task_t, run_list);
+               return next;
+       }
 
 
+       /*-- CKRM SCHEDULER --*/
+       
  retry_next_class:
+       /* we can't use (rq->nr_running == 0) to declare idleness;
+        * first we have to make sure that the class runqueue is properly
+        * processed. This is due to two facts/requirements:
+        * (a) when the last task is removed from an lrq we do not remove
+        *     the lrq from the class runqueue. As a result the lrq is
+        *     selected again and we can perform the necessary
+        *     expired switches.
+        * (b) outstanding expired switches must be performed.
+        */
+
        queue = rq_get_next_class(rq);
-       // BUG_ON( !queue );
+       if (unlikely(queue == NULL)) {
+               idle_balance(cpu, rq);
+               if (!rq->nr_running) {
+                       rq->dflt_lrq.expired_timestamp = 0;
+                       wake_sleeping_dependent(cpu, rq);
+                       return NULL;
+               }
+               goto retry_next_class; // try again
+       }
 
        array = queue->active;
        if (unlikely(!array->nr_active)) {
                queue->active = queue->expired;
                queue->expired = array;
 
+               array = queue->active;
                queue->expired_timestamp = 0;
 
-               if (queue->active->nr_active)
+               if (array->nr_active)
                        set_top_priority(queue,
-                                        find_first_bit(queue->active->bitmap, MAX_PRIO));
+                                        find_first_bit(array->bitmap,MAX_PRIO));
                else {
+                       /* since we do not dequeue a lrq when it becomes empty
+                        * but rely on the switching mechanism, we must dequeue
+                        * at this point
+                        */
                        classqueue_dequeue(queue->classqueue,
                                           &queue->classqueue_linkobj);
-                       cpu_demand_event(get_rq_local_stat(queue,cpu),CPU_DEMAND_DEQUEUE,0);
+                       cpu_demand_event(get_rq_local_stat(queue,cpu),
+                                        CPU_DEMAND_DEQUEUE,0);
                }
                goto retry_next_class;                          
        }
-       // BUG_ON(!array->nr_active);
 
        idx = queue->top_priority;
 
-       // BUG_ON (idx == MAX_PRIO);
+       //BUG_ON(!array->nr_active);
+       //BUG_ON(idx == MAX_PRIO);
+       //BUG_ON(list_empty(array->queue+idx));
        next = task_list_entry(array->queue[idx].next);
        return next;
 }
+
+static inline void ckrm_account_task(struct runqueue* rq, 
+                                    struct task_struct *prev, 
+                                    unsigned long long now)
+{
+       if ((prev != rq->idle) && ckrm_rq_cpu_enabled(rq) ) {
+               unsigned long long run = now - prev->timestamp;
+               ckrm_lrq_t * lrq = get_task_lrq(prev);
+
+               lrq->lrq_load -= task_load(prev);
+               cpu_demand_event(&prev->demand_stat,CPU_DEMAND_DESCHEDULE,run);
+               lrq->lrq_load += task_load(prev);
+
+               cpu_demand_event(get_task_lrq_stat(prev),CPU_DEMAND_DESCHEDULE,run);
+               update_local_cvt(prev, run);
+       }
+
+}
+
+#ifdef CONFIG_SMP
+#define COND_SMP(dflt,cond) (cond)
+#else
+#define COND_SMP(dflt,cond) (dflt)
+#endif
+
+static inline void ckrm_sched_tick(unsigned long j,int this_cpu, int idle,
+                                  runqueue_t *rq)
+{
+       /* first determine whether we have to do anything
+        * without grabbing the global lock
+        */
+
+       int sample, update;
+
+#ifdef __SIMULATOR__
+       if ((this_cpu == 0) && (j % 1000) == 0) {
+               ckrm_cpu_monitor(1);
+       }
+#endif
+       
+       if (ckrm_rq_cpu_disabled(rq))
+               return;
+       
+       update = (j % CVT_UPDATE_TICK);
+       sample = COND_SMP(1,(j % CPU_PID_CTRL_TICK)); 
+       
+// avoid taking the global class_list lock on every tick 
+       if (likely(update && sample))
+               return;   // nothing to be done;
+       
+       read_lock(&class_list_lock);
+       
+#ifdef CONFIG_SMP
+       if (sample==0) {
+               ckrm_load_sample(rq_ckrm_load(rq),this_cpu);
+       }
+#endif
+       
+       if (update==0) {
+               classqueue_update_base(get_cpu_classqueue(this_cpu));
+               update_class_cputime(this_cpu,idle);
+               // occasionally we need to call the weight adjustment
+               // for SMP systems
+               if (COND_SMP(0,(this_cpu==0)))
+                       adjust_local_weight();   
+       }
+       
+       read_unlock(&class_list_lock);
+}
+
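The modulo tests above are what keep class_list_lock off the fast path: the lock is only taken on ticks where one of the counters wraps. A small sketch with made-up tick constants:

    #include <stdio.h>

    #define TOY_CVT_UPDATE_TICK   8
    #define TOY_CPU_PID_CTRL_TICK 32

    int main(void)
    {
            int taken = 0;

            for (unsigned long j = 0; j < 1000; j++) {
                    int update = j % TOY_CVT_UPDATE_TICK;
                    int sample = j % TOY_CPU_PID_CTRL_TICK;

                    if (update && sample)
                            continue;       /* common case: nothing to do */
                    taken++;                /* would take class_list_lock here */
            }
            printf("lock taken on %d of 1000 ticks\n", taken);
            return 0;
    }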
 #else /*! CONFIG_CKRM_CPU_SCHEDULE*/
-static inline struct task_struct * rq_get_next_task(struct runqueue* rq) 
+static inline struct task_struct * rq_get_next_task(struct runqueue* rq,
+                                                   int cpu) 
 {
        prio_array_t *array;
         struct list_head *queue;
        int idx;
 
+       if (unlikely(!rq->nr_running)) {
+               idle_balance(cpu, rq);
+                if (!rq->nr_running) {
+                        rq->expired_timestamp = 0;
+                        wake_sleeping_dependent(cpu, rq);
+                        return NULL;
+                }
+       }
        array = rq->active;
        if (unlikely(!array->nr_active)) {
                /*
@@ -404,11 +606,17 @@ static inline struct task_struct * rq_get_next_task(struct runqueue* rq)
        return list_entry(queue->next, task_t, run_list);
 }
 
-static inline void class_enqueue_task(struct task_struct* p, prio_array_t *array) { }
-static inline void class_dequeue_task(struct task_struct* p, prio_array_t *array) { }
+static inline void class_enqueue_task(struct task_struct* p, 
+                                     prio_array_t *array) { }
+static inline void class_dequeue_task(struct task_struct* p, 
+                                     prio_array_t *array) { }
 static inline void init_cpu_classes(void) { }
+static inline void ckrm_sched_tick(int j,int this_cpu,int idle, void* arg) {}
+static inline void ckrm_account_task(struct runqueue* rq, struct 
+                                    task_struct *prev, 
+                                    unsigned long long now)  { }
 #define rq_ckrm_load(rq) NULL
-static inline void ckrm_sched_tick(int j,int this_cpu,void* name) {}
+
 #endif  /* CONFIG_CKRM_CPU_SCHEDULE */
 
 /*
@@ -1558,261 +1766,129 @@ int can_migrate_task(task_t *p, runqueue_t *rq, int this_cpu,
        return 1;
 }
 
-#ifdef CONFIG_CKRM_CPU_SCHEDULE
-static inline int ckrm_preferred_task(task_t *tmp,long min, long max, 
-                                     int phase, enum idle_type idle)
-{
-       long pressure = task_load(tmp);
-       
-       if (pressure > max) 
-               return 0;
-
-       if ((idle == NOT_IDLE) && ! phase && (pressure <= min))
-               return 0;
-       return 1;
-}
-
 /*
 /*
- * move tasks for a specic local class
- * return number of tasks pulled
+ * move_tasks tries to move up to max_nr_move tasks from busiest to this_rq,
+ * as part of a balancing operation within "domain". Returns the number of
+ * tasks moved.
+ *
+ * Called with both runqueues locked.
  */
  */
-static inline int ckrm_cls_move_tasks(ckrm_lrq_t* src_lrq,ckrm_lrq_t*dst_lrq,
-                                     runqueue_t *this_rq,
-                                     runqueue_t *busiest,
-                                     struct sched_domain *sd,
-                                     int this_cpu,
-                                     enum idle_type idle,
-                                     long* pressure_imbalance) 
+static int move_tasks(runqueue_t *this_rq, int this_cpu, runqueue_t *busiest,
+                     unsigned long max_nr_move, struct sched_domain *sd,
+                     enum idle_type idle)
 {
        prio_array_t *array, *dst_array;
        struct list_head *head, *curr;
 {
        prio_array_t *array, *dst_array;
        struct list_head *head, *curr;
+       int idx, pulled = 0;
        task_t *tmp;
        task_t *tmp;
-       int idx;
-       int pulled = 0;
-       int phase = -1;
-       long pressure_min, pressure_max;
-       /*hzheng: magic : 90% balance is enough*/
-       long balance_min = *pressure_imbalance / 10; 
-/*
- * we don't want to migrate tasks that will reverse the balance
- *     or the tasks that make too small difference
- */
-#define CKRM_BALANCE_MAX_RATIO 100
-#define CKRM_BALANCE_MIN_RATIO 1
- start:
-       phase ++;
+#ifdef CONFIG_CKRM_CPU_SCHEDULE
+       /* need to distinguish between the runqueues and the class
+        * local runqueues.
+        * we know we can get here only if the dflt class is present
+        */
+       ckrm_lrq_t *l_this_rq = &this_rq->dflt_lrq;
+       ckrm_lrq_t *l_busiest = &busiest->dflt_lrq;
+#else
+#define l_busiest busiest
+#define l_this_rq this_rq
+#endif
+
+       if (max_nr_move <= 0 || busiest->nr_running <= 1)
+               goto out;
+
        /*
         * We first consider expired tasks. Those will likely not be
         * executed in the near future, and they are most likely to
         * be cache-cold, thus switching CPUs has the least effect
         * on them.
         */
-       if (src_lrq->expired->nr_active) {
-               array = src_lrq->expired;
-               dst_array = dst_lrq->expired;
+       if (l_busiest->expired->nr_active) {
+               array = l_busiest->expired;
+               dst_array = l_this_rq->expired;
        } else {
-               array = src_lrq->active;
-               dst_array = dst_lrq->active;
+               array = l_busiest->active;
+               dst_array = l_this_rq->active;
        }
-       
- new_array:
+
+new_array:
        /* Start searching at priority 0: */
        idx = 0;
- skip_bitmap:
+skip_bitmap:
        if (!idx)
                idx = sched_find_first_bit(array->bitmap);
        else
                idx = find_next_bit(array->bitmap, MAX_PRIO, idx);
        if (idx >= MAX_PRIO) {
-               if (array == src_lrq->expired && src_lrq->active->nr_active) {
-                       array = src_lrq->active;
-                       dst_array = dst_lrq->active;
+               if (array == l_busiest->expired && l_busiest->active->nr_active) {
+                       array = l_busiest->active;
+                       dst_array = l_this_rq->active;
                        goto new_array;
                }
-               if ((! phase) && (! pulled) && (idle != IDLE))
-                       goto start; //try again
-               else 
-                       goto out; //finished search for this lrq
+               goto out;
        }
-       
+
        head = array->queue + idx;
        curr = head->prev;
- skip_queue:
+skip_queue:
        tmp = list_entry(curr, task_t, run_list);
-       
+
        curr = curr->prev;
-       
+
        if (!can_migrate_task(tmp, busiest, this_cpu, sd, idle)) {
                if (curr != head)
                        goto skip_queue;
                idx++;
                goto skip_bitmap;
        }
+       pull_task(busiest, array, tmp, this_rq, dst_array, this_cpu);
+       pulled++;
 
 
-       pressure_min = *pressure_imbalance * CKRM_BALANCE_MIN_RATIO/100;
-       pressure_max = *pressure_imbalance * CKRM_BALANCE_MAX_RATIO/100;
-       /*
-        * skip the tasks that will reverse the balance too much
-        */
-       if (ckrm_preferred_task(tmp,pressure_min,pressure_max,phase,idle)) {
-               *pressure_imbalance -= task_load(tmp);
-               pull_task(busiest, array, tmp, 
-                         this_rq, dst_array, this_cpu);
-               pulled++;
-
-               if (*pressure_imbalance <= balance_min)
-                       goto out;
+       /* We only want to steal up to the prescribed number of tasks. */
+       if (pulled < max_nr_move) {
+               if (curr != head)
+                       goto skip_queue;
+               idx++;
+               goto skip_bitmap;
        }
-               
-       if (curr != head)
-               goto skip_queue;
-       idx++;
-       goto skip_bitmap;
- out:         
+out:
        return pulled;
 }
 
-static inline long ckrm_rq_imbalance(runqueue_t *this_rq,runqueue_t *dst_rq)
-{
-       long imbalance;
-       /*
-        * make sure after balance, imbalance' > - imbalance/2
-        * we don't want the imbalance be reversed too much
-        */
-       imbalance = pid_get_pressure(rq_ckrm_load(dst_rq),0) 
-               - pid_get_pressure(rq_ckrm_load(this_rq),1);
-       imbalance /= 2;
-       return imbalance;
-}
-
 /*
- * try to balance the two runqueues
- *
- * Called with both runqueues locked.
- * if move_tasks is called, it will try to move at least one task over
+ * find_busiest_group finds and returns the busiest CPU group within the
+ * domain. It calculates and returns the number of tasks which should be
+ * moved to restore balance via the imbalance parameter.
  */
-static int move_tasks(runqueue_t *this_rq, int this_cpu, runqueue_t *busiest,
-                     unsigned long max_nr_move, struct sched_domain *sd,
-                     enum idle_type idle)
+static struct sched_group *
+find_busiest_group(struct sched_domain *sd, int this_cpu,
+                  unsigned long *imbalance, enum idle_type idle)
 {
-       struct ckrm_cpu_class *clsptr,*vip_cls = NULL;
-       ckrm_lrq_t* src_lrq,*dst_lrq;
-       long pressure_imbalance, pressure_imbalance_old;
-       int src_cpu = task_cpu(busiest->curr);
-       struct list_head *list;
-       int pulled = 0;
-       long imbalance;
-
-       imbalance =  ckrm_rq_imbalance(this_rq,busiest);
+       struct sched_group *busiest = NULL, *this = NULL, *group = sd->groups;
+       unsigned long max_load, avg_load, total_load, this_load, total_pwr;
 
 
-       if ((idle == NOT_IDLE && imbalance <= 0) || busiest->nr_running <= 1)
-               goto out;
+       max_load = this_load = total_load = total_pwr = 0;
 
 
-       //try to find the vip class
-        list_for_each_entry(clsptr,&active_cpu_classes,links) {
-               src_lrq = get_ckrm_lrq(clsptr,src_cpu);
+       do {
+               cpumask_t tmp;
+               unsigned long load;
+               int local_group;
+               int i, nr_cpus = 0;
 
 
-               if (! lrq_nr_running(src_lrq))
-                       continue;
-
-               if (! vip_cls || cpu_class_weight(vip_cls) < cpu_class_weight(clsptr) )  
-                       {
-                               vip_cls = clsptr;
-                       }
-       }
-
-       /*
-        * do search from the most significant class
-        * hopefully, less tasks will be migrated this way
-        */
-       clsptr = vip_cls;
-
- move_class:
-       if (! clsptr)
-               goto out;
-       
-
-       src_lrq = get_ckrm_lrq(clsptr,src_cpu);
-       if (! lrq_nr_running(src_lrq))
-               goto other_class;
-       
-       dst_lrq = get_ckrm_lrq(clsptr,this_cpu);
-
-       //how much pressure for this class should be transferred
-       pressure_imbalance = src_lrq->lrq_load * imbalance/src_lrq->local_weight;
-       if (pulled && ! pressure_imbalance) 
-               goto other_class;
-       
-       pressure_imbalance_old = pressure_imbalance;
-       
-       //move tasks
-       pulled += 
-               ckrm_cls_move_tasks(src_lrq,dst_lrq,
-                                   this_rq,
-                                   busiest,
-                                   sd,this_cpu,idle,
-                                   &pressure_imbalance);
-
-       /* 
-        * hzheng: 2 is another magic number
-        * stop balancing if the imbalance is less than 25% of the orig
-        */
-       if (pressure_imbalance <= (pressure_imbalance_old >> 2))
-               goto out;
-               
-       //update imbalance
-       imbalance *= pressure_imbalance / pressure_imbalance_old;
- other_class:
-       //who is next?
-       list = clsptr->links.next;
-       if (list == &active_cpu_classes)
-               list = list->next;
-       clsptr = list_entry(list, typeof(*clsptr), links);
-       if (clsptr != vip_cls)
-               goto move_class;
- out:
-       return pulled;
-}
-
-/**
- * ckrm_check_balance - is load balancing necessary?
- * return 0 if load balancing is not necessary
- * otherwise return the average load of the system
- * also, update nr_group
- *
- * heuristics: 
- *   no load balancing if it's load is over average
- *   no load balancing if it's load is far more than the min
- * task:
- *   read the status of all the runqueues
- */
-static unsigned long ckrm_check_balance(struct sched_domain *sd, int this_cpu,
-                                            enum idle_type idle, int* nr_group)
-{
-       struct sched_group *group = sd->groups;
-       unsigned long min_load, max_load, avg_load;
-       unsigned long total_load, this_load, total_pwr;
-
-       max_load = this_load = total_load = total_pwr = 0;
-       min_load = 0xFFFFFFFF;
-       *nr_group = 0;
-
-       do {
-               cpumask_t tmp;
-               unsigned long load;
-               int local_group;
-               int i, nr_cpus = 0;
+               local_group = cpu_isset(this_cpu, group->cpumask);
 
                /* Tally up the load of all CPUs in the group */
 
+               avg_load = 0;
                cpus_and(tmp, group->cpumask, cpu_online_map);
                if (unlikely(cpus_empty(tmp)))
                        goto nextgroup;
 
-               avg_load = 0;
-               local_group = cpu_isset(this_cpu, group->cpumask);
-
                for_each_cpu_mask(i, tmp) {
-                       load = pid_get_pressure(rq_ckrm_load(cpu_rq(i)),local_group);
+                       /* Bias balancing toward cpus of our domain */
+                       if (local_group)
+                               load = target_load(i);
+                       else
+                               load = source_load(i);
+
                        nr_cpus++;
                        avg_load += load;
                }
@@ -1828,386 +1904,86 @@ static unsigned long ckrm_check_balance(struct sched_domain *sd, int this_cpu,
 
                if (local_group) {
                        this_load = avg_load;
 
+                       this = group;
                        goto nextgroup;
                } else if (avg_load > max_load) {
                        max_load = avg_load;
-               }      
-               if (avg_load < min_load) {
-                       min_load = avg_load;
+                       busiest = group;
                }
 nextgroup:
                group = group->next;
-               *nr_group = *nr_group + 1;
        } while (group != sd->groups);
 
-       if (!max_load || this_load >= max_load)
+       if (!busiest || this_load >= max_load)
                goto out_balanced;
 
        avg_load = (SCHED_LOAD_SCALE * total_load) / total_pwr;
 
-       /* hzheng: debugging: 105 is a magic number
-        * 100*max_load <= sd->imbalance_pct*this_load)
-        * should use imbalance_pct instead
-        */
-       if (this_load > avg_load 
-           || 100*max_load < 105*this_load
-           || 100*min_load < 70*this_load
-           )
+       if (this_load >= avg_load ||
+                       100*max_load <= sd->imbalance_pct*this_load)
                goto out_balanced;
 
                goto out_balanced;
 
-       return avg_load;
- out_balanced:
-       return 0;
-}
-
-/**
- * any group that has above average load is considered busy
- * find the busiest queue from any of busy group
- */
-static runqueue_t *
-ckrm_find_busy_queue(struct sched_domain *sd, int this_cpu,
-                    unsigned long avg_load, enum idle_type idle,
-                    int nr_group)
-{
-       struct sched_group *group;
-       runqueue_t * busiest=NULL;
-       unsigned long rand;
-       
-       group = sd->groups;
-       rand = get_ckrm_rand(nr_group);
-       nr_group = 0;
+       /*
+        * We're trying to get all the cpus to the average_load, so we don't
+        * want to push ourselves above the average load, nor do we wish to
+        * reduce the max loaded cpu below the average load, as either of these
+        * actions would just result in more rebalancing later, and ping-pong
+        * tasks around. Thus we look for the minimum possible imbalance.
+        * Negative imbalances (*we* are more loaded than anyone else) will
+        * be counted as no imbalance for these purposes -- we can't fix that
+        * by pulling tasks to us.  Be careful of negative numbers as they'll
+        * appear as very large values with unsigned longs.
+        */
+       *imbalance = min(max_load - avg_load, avg_load - this_load);
 
 
-       do {
-               unsigned long load,total_load,max_load;
-               cpumask_t tmp;
-               int i;
-               runqueue_t * grp_busiest;
+       /* How much load to actually move to equalise the imbalance */
+       *imbalance = (*imbalance * min(busiest->cpu_power, this->cpu_power))
+                               / SCHED_LOAD_SCALE;
 
 
-               cpus_and(tmp, group->cpumask, cpu_online_map);
-               if (unlikely(cpus_empty(tmp)))
-                       goto find_nextgroup;
+       if (*imbalance < SCHED_LOAD_SCALE - 1) {
+               unsigned long pwr_now = 0, pwr_move = 0;
+               unsigned long tmp;
 
 
-               total_load = 0;
-               max_load = 0;
-               grp_busiest = NULL;
-               for_each_cpu_mask(i, tmp) {
-                       load = pid_get_pressure(rq_ckrm_load(cpu_rq(i)),0);
-                       total_load += load;
-                       if (load > max_load) {
-                               max_load = load;
-                               grp_busiest = cpu_rq(i);
-                       }                               
+               if (max_load - this_load >= SCHED_LOAD_SCALE*2) {
+                       *imbalance = 1;
+                       return busiest;
                }
 
-               total_load = (total_load * SCHED_LOAD_SCALE) / group->cpu_power;
-               if (total_load > avg_load) {
-                       busiest = grp_busiest;
-                       if (nr_group >= rand)
-                               break;
-               }
-       find_nextgroup:         
-               group = group->next;
-               nr_group ++;
-       } while (group != sd->groups);
+               /*
+                * OK, we don't have enough imbalance to justify moving tasks,
+                * however we may be able to increase total CPU power used by
+                * moving them.
+                */
 
 
-       return busiest;
-}
+               pwr_now += busiest->cpu_power*min(SCHED_LOAD_SCALE, max_load);
+               pwr_now += this->cpu_power*min(SCHED_LOAD_SCALE, this_load);
+               pwr_now /= SCHED_LOAD_SCALE;
 
 
-/**
- * load_balance - pressure based load balancing algorithm used by ckrm
- */
-static int ckrm_load_balance(int this_cpu, runqueue_t *this_rq,
-                       struct sched_domain *sd, enum idle_type idle)
-{
-       runqueue_t *busiest;
-       unsigned long avg_load;
-       int nr_moved,nr_group;
+               /* Amount of load we'd subtract */
+               tmp = SCHED_LOAD_SCALE*SCHED_LOAD_SCALE/busiest->cpu_power;
+               if (max_load > tmp)
+                       pwr_move += busiest->cpu_power*min(SCHED_LOAD_SCALE,
+                                                       max_load - tmp);
 
 
-       avg_load = ckrm_check_balance(sd, this_cpu, idle, &nr_group);
-       if (! avg_load)
-               goto out_balanced;
+               /* Amount of load we'd add */
+               tmp = SCHED_LOAD_SCALE*SCHED_LOAD_SCALE/this->cpu_power;
+               if (max_load < tmp)
+                       tmp = max_load;
+               pwr_move += this->cpu_power*min(SCHED_LOAD_SCALE, this_load + tmp);
+               pwr_move /= SCHED_LOAD_SCALE;
 
 
-       busiest = ckrm_find_busy_queue(sd,this_cpu,avg_load,idle,nr_group);
-       if (! busiest)
-               goto out_balanced;
-       /*
-        * This should be "impossible", but since load
-        * balancing is inherently racy and statistical,
-        * it could happen in theory.
-        */
-       if (unlikely(busiest == this_rq)) {
-               WARN_ON(1);
-               goto out_balanced;
-       }
+               /* Move if we gain another 8th of a CPU worth of throughput */
+               if (pwr_move < pwr_now + SCHED_LOAD_SCALE / 8)
+                       goto out_balanced;
 
 
-       nr_moved = 0;
-       if (busiest->nr_running > 1) {
-               /*
-                * Attempt to move tasks. If find_busiest_group has found
-                * an imbalance but busiest->nr_running <= 1, the group is
-                * still unbalanced. nr_moved simply stays zero, so it is
-                * correctly treated as an imbalance.
-                */
-               double_lock_balance(this_rq, busiest);
-               nr_moved = move_tasks(this_rq, this_cpu, busiest,
-                                     0,sd, idle);              
-               spin_unlock(&busiest->lock);
-               if (nr_moved) {
-                       adjust_local_weight();
-               }
+               *imbalance = 1;
+               return busiest;
        }
 
-       if (!nr_moved) 
-               sd->nr_balance_failed ++;
-       else
-               sd->nr_balance_failed  = 0;             
-
-       /* We were unbalanced, so reset the balancing interval */
-       sd->balance_interval = sd->min_interval;
+       /* Get rid of the scaling factor, rounding down as we divide */
+       *imbalance = (*imbalance + 1) / SCHED_LOAD_SCALE;
 
 
-       return nr_moved;
-
-out_balanced:
-       /* tune up the balancing interval */
-       if (sd->balance_interval < sd->max_interval)
-               sd->balance_interval *= 2;
-
-       return 0;
-}
-
-/*
- * this_rq->lock is already held
- */
-static inline int load_balance_newidle(int this_cpu, runqueue_t *this_rq,
-                                      struct sched_domain *sd)
-{
-       int ret;
-       read_lock(&class_list_lock);
-       ret = ckrm_load_balance(this_cpu,this_rq,sd,NEWLY_IDLE);
-       read_unlock(&class_list_lock);
-       return ret;
-}
-
-static inline int load_balance(int this_cpu, runqueue_t *this_rq,
-                       struct sched_domain *sd, enum idle_type idle)
-{
-       int ret;
-
-       spin_lock(&this_rq->lock);
-       read_lock(&class_list_lock);
-       ret= ckrm_load_balance(this_cpu,this_rq,sd,NEWLY_IDLE);
-       read_unlock(&class_list_lock);
-       spin_unlock(&this_rq->lock);
-       return ret;
-}
-#else /*! CONFIG_CKRM_CPU_SCHEDULE */
-/*
- * move_tasks tries to move up to max_nr_move tasks from busiest to this_rq,
- * as part of a balancing operation within "domain". Returns the number of
- * tasks moved.
- *
- * Called with both runqueues locked.
- */
-static int move_tasks(runqueue_t *this_rq, int this_cpu, runqueue_t *busiest,
-                     unsigned long max_nr_move, struct sched_domain *sd,
-                     enum idle_type idle)
-{
-       prio_array_t *array, *dst_array;
-       struct list_head *head, *curr;
-       int idx, pulled = 0;
-       task_t *tmp;
-
-       if (max_nr_move <= 0 || busiest->nr_running <= 1)
-               goto out;
-
-       /*
-        * We first consider expired tasks. Those will likely not be
-        * executed in the near future, and they are most likely to
-        * be cache-cold, thus switching CPUs has the least effect
-        * on them.
-        */
-       if (busiest->expired->nr_active) {
-               array = busiest->expired;
-               dst_array = this_rq->expired;
-       } else {
-               array = busiest->active;
-               dst_array = this_rq->active;
-       }
-
-new_array:
-       /* Start searching at priority 0: */
-       idx = 0;
-skip_bitmap:
-       if (!idx)
-               idx = sched_find_first_bit(array->bitmap);
-       else
-               idx = find_next_bit(array->bitmap, MAX_PRIO, idx);
-       if (idx >= MAX_PRIO) {
-               if (array == busiest->expired && busiest->active->nr_active) {
-                       array = busiest->active;
-                       dst_array = this_rq->active;
-                       goto new_array;
-               }
-               goto out;
-       }
-
-       head = array->queue + idx;
-       curr = head->prev;
-skip_queue:
-       tmp = list_entry(curr, task_t, run_list);
-
-       curr = curr->prev;
-
-       if (!can_migrate_task(tmp, busiest, this_cpu, sd, idle)) {
-               if (curr != head)
-                       goto skip_queue;
-               idx++;
-               goto skip_bitmap;
-       }
-       pull_task(busiest, array, tmp, this_rq, dst_array, this_cpu);
-       pulled++;
-
-       /* We only want to steal up to the prescribed number of tasks. */
-       if (pulled < max_nr_move) {
-               if (curr != head)
-                       goto skip_queue;
-               idx++;
-               goto skip_bitmap;
-       }
-out:
-       return pulled;
-}
-
-/*
- * find_busiest_group finds and returns the busiest CPU group within the
- * domain. It calculates and returns the number of tasks which should be
- * moved to restore balance via the imbalance parameter.
- */
-static struct sched_group *
-find_busiest_group(struct sched_domain *sd, int this_cpu,
-                  unsigned long *imbalance, enum idle_type idle)
-{
-       struct sched_group *busiest = NULL, *this = NULL, *group = sd->groups;
-       unsigned long max_load, avg_load, total_load, this_load, total_pwr;
-
-       max_load = this_load = total_load = total_pwr = 0;
-
-       do {
-               cpumask_t tmp;
-               unsigned long load;
-               int local_group;
-               int i, nr_cpus = 0;
-
-               local_group = cpu_isset(this_cpu, group->cpumask);
-
-               /* Tally up the load of all CPUs in the group */
-               avg_load = 0;
-               cpus_and(tmp, group->cpumask, cpu_online_map);
-               if (unlikely(cpus_empty(tmp)))
-                       goto nextgroup;
-
-               for_each_cpu_mask(i, tmp) {
-                       /* Bias balancing toward cpus of our domain */
-                       if (local_group)
-                               load = target_load(i);
-                       else
-                               load = source_load(i);
-
-                       nr_cpus++;
-                       avg_load += load;
-               }
-
-               if (!nr_cpus)
-                       goto nextgroup;
-
-               total_load += avg_load;
-               total_pwr += group->cpu_power;
-
-               /* Adjust by relative CPU power of the group */
-               avg_load = (avg_load * SCHED_LOAD_SCALE) / group->cpu_power;
-
-               if (local_group) {
-                       this_load = avg_load;
-                       this = group;
-                       goto nextgroup;
-               } else if (avg_load > max_load) {
-                       max_load = avg_load;
-                       busiest = group;
-               }
-nextgroup:
-               group = group->next;
-       } while (group != sd->groups);
-
-       if (!busiest || this_load >= max_load)
-               goto out_balanced;
-
-       avg_load = (SCHED_LOAD_SCALE * total_load) / total_pwr;
-
-       if (this_load >= avg_load ||
-                       100*max_load <= sd->imbalance_pct*this_load)
-               goto out_balanced;
-
-       /*
-        * We're trying to get all the cpus to the average_load, so we don't
-        * want to push ourselves above the average load, nor do we wish to
-        * reduce the max loaded cpu below the average load, as either of these
-        * actions would just result in more rebalancing later, and ping-pong
-        * tasks around. Thus we look for the minimum possible imbalance.
-        * Negative imbalances (*we* are more loaded than anyone else) will
-        * be counted as no imbalance for these purposes -- we can't fix that
-        * by pulling tasks to us.  Be careful of negative numbers as they'll
-        * appear as very large values with unsigned longs.
-        */
-       *imbalance = min(max_load - avg_load, avg_load - this_load);
-
-       /* How much load to actually move to equalise the imbalance */
-       *imbalance = (*imbalance * min(busiest->cpu_power, this->cpu_power))
-                               / SCHED_LOAD_SCALE;
-
-       if (*imbalance < SCHED_LOAD_SCALE - 1) {
-               unsigned long pwr_now = 0, pwr_move = 0;
-               unsigned long tmp;
-
-               if (max_load - this_load >= SCHED_LOAD_SCALE*2) {
-                       *imbalance = 1;
-                       return busiest;
-               }
-
-               /*
-                * OK, we don't have enough imbalance to justify moving tasks,
-                * however we may be able to increase total CPU power used by
-                * moving them.
-                */
-
-               pwr_now += busiest->cpu_power*min(SCHED_LOAD_SCALE, max_load);
-               pwr_now += this->cpu_power*min(SCHED_LOAD_SCALE, this_load);
-               pwr_now /= SCHED_LOAD_SCALE;
-
-               /* Amount of load we'd subtract */
-               tmp = SCHED_LOAD_SCALE*SCHED_LOAD_SCALE/busiest->cpu_power;
-               if (max_load > tmp)
-                       pwr_move += busiest->cpu_power*min(SCHED_LOAD_SCALE,
-                                                       max_load - tmp);
-
-               /* Amount of load we'd add */
-               tmp = SCHED_LOAD_SCALE*SCHED_LOAD_SCALE/this->cpu_power;
-               if (max_load < tmp)
-                       tmp = max_load;
-               pwr_move += this->cpu_power*min(SCHED_LOAD_SCALE, this_load + tmp);
-               pwr_move /= SCHED_LOAD_SCALE;
-
-               /* Move if we gain another 8th of a CPU worth of throughput */
-               if (pwr_move < pwr_now + SCHED_LOAD_SCALE / 8)
-                       goto out_balanced;
-
-               *imbalance = 1;
-               return busiest;
-       }
-
-       /* Get rid of the scaling factor, rounding down as we divide */
-       *imbalance = (*imbalance + 1) / SCHED_LOAD_SCALE;
-
-       return busiest;
+       return busiest;
 
 out_balanced:
        if (busiest && (idle == NEWLY_IDLE ||
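
For reference, the pwr_now/pwr_move branch above can be modelled in ordinary user-space C. The sketch below assumes SCHED_LOAD_SCALE is 128 and invents the load and cpu_power figures; it only reproduces the arithmetic of the throughput estimate and the "gain at least an 8th of a CPU" cut-off, not the surrounding scheduler state.

#include <stdio.h>

#define SCHED_LOAD_SCALE 128UL          /* assumed value, for illustration only */

static unsigned long min_ul(unsigned long a, unsigned long b)
{
    return a < b ? a : b;
}

/* Would pulling a single task still raise total throughput by >= 1/8 CPU? */
static int worth_moving_one_task(unsigned long busiest_power, unsigned long this_power,
                                 unsigned long max_load, unsigned long this_load)
{
    unsigned long pwr_now = 0, pwr_move = 0, tmp;

    pwr_now += busiest_power * min_ul(SCHED_LOAD_SCALE, max_load);
    pwr_now += this_power * min_ul(SCHED_LOAD_SCALE, this_load);
    pwr_now /= SCHED_LOAD_SCALE;

    /* load that one task would remove from the busiest group */
    tmp = SCHED_LOAD_SCALE * SCHED_LOAD_SCALE / busiest_power;
    if (max_load > tmp)
        pwr_move += busiest_power * min_ul(SCHED_LOAD_SCALE, max_load - tmp);

    /* load that the same task would add to this group */
    tmp = SCHED_LOAD_SCALE * SCHED_LOAD_SCALE / this_power;
    if (max_load < tmp)
        tmp = max_load;
    pwr_move += this_power * min_ul(SCHED_LOAD_SCALE, this_load + tmp);
    pwr_move /= SCHED_LOAD_SCALE;

    return pwr_move >= pwr_now + SCHED_LOAD_SCALE / 8;
}

int main(void)
{
    /* a group at ~1.5x scale next to an almost idle one: worth moving */
    printf("%d\n", worth_moving_one_task(128, 128, 192, 16));
    return 0;
}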
@@ -2249,6 +2025,17 @@ static runqueue_t *find_busiest_queue(struct sched_group *group)
  *
  * Called with this_rq unlocked.
  */
+
+static inline int ckrm_load_balance(int this_cpu, runqueue_t *this_rq,
+                                   struct sched_domain *sd, 
+                                   enum idle_type idle)
+#ifndef CONFIG_CKRM_CPU_SCHEDULE
+{
+       return -1;
+}
+#endif
+;
+
 static int load_balance(int this_cpu, runqueue_t *this_rq,
                        struct sched_domain *sd, enum idle_type idle)
 {
@@ -2259,6 +2046,9 @@ static int load_balance(int this_cpu, runqueue_t *this_rq,
 
        spin_lock(&this_rq->lock);
 
+       if ((nr_moved = ckrm_load_balance(this_cpu,this_rq,sd,idle)) != -1)
+               goto out_balanced;
+
        group = find_busiest_group(sd, this_cpu, &imbalance, idle);
        if (!group)
                goto out_balanced;
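
The ckrm_load_balance() prototype added above relies on a small preprocessor idiom: the declaration is always emitted, and when CONFIG_CKRM_CPU_SCHEDULE is disabled a "{ return -1; }" body is pasted straight after it, so load_balance() can unconditionally test for -1 ("CKRM did not handle this") and fall through to the stock balancer. A stand-alone illustration of the same trick, using a made-up FEATURE_FOO switch rather than the kernel's config symbol:

#include <stdio.h>

/* #define FEATURE_FOO */

static inline int foo_hook(int arg)
#ifndef FEATURE_FOO
{                               /* feature compiled out: stub that says "not handled" */
    (void)arg;
    return -1;
}
#endif
;   /* terminates the bare declaration when FEATURE_FOO is defined;
       it is only a harmless extra ';' after the stub otherwise */

#ifdef FEATURE_FOO
static inline int foo_hook(int arg)
{
    return arg * 2;             /* stand-in for the real implementation defined later */
}
#endif

int main(void)
{
    printf("foo_hook(21) = %d\n", foo_hook(21));
    return 0;
}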
@@ -2344,8 +2134,12 @@ static int load_balance_newidle(int this_cpu, runqueue_t *this_rq,
        struct sched_group *group;
        runqueue_t *busiest = NULL;
        unsigned long imbalance;
-       int nr_moved = 0;
+       int nr_moved;
+
+       if ((nr_moved = ckrm_load_balance(this_cpu,this_rq,sd,NEWLY_IDLE)) != -1)
+               goto out;
 
+       nr_moved = 0;
        group = find_busiest_group(sd, this_cpu, &imbalance, NEWLY_IDLE);
        if (!group)
                goto out;
@@ -2365,8 +2159,6 @@ static int load_balance_newidle(int this_cpu, runqueue_t *this_rq,
 out:
        return nr_moved;
 }
-#endif /* CONFIG_CKRM_CPU_SCHEDULE*/
-
 
 /*
  * idle_balance is called by schedule() if this_cpu is about to become
@@ -2472,6 +2264,8 @@ static void rebalance_tick(int this_cpu, runqueue_t *this_rq,
        unsigned long j = jiffies + CPU_OFFSET(this_cpu);
        struct sched_domain *sd;
 
+       ckrm_sched_tick(j,this_cpu,(idle != NOT_IDLE),this_rq);
+
        /* Update our load */
        old_load = this_rq->cpu_load;
        this_load = this_rq->nr_running * SCHED_LOAD_SCALE;
@@ -2510,7 +2304,9 @@ static void rebalance_tick(int this_cpu, runqueue_t *this_rq,
  */
 static inline void rebalance_tick(int cpu, runqueue_t *rq, enum idle_type idle)
 {
+       ckrm_sched_tick(jiffies,cpu,(idle != NOT_IDLE),rq);
 }
+
 static inline void idle_balance(int cpu, runqueue_t *rq)
 {
 }
@@ -2547,15 +2343,19 @@ EXPORT_PER_CPU_SYMBOL(kstat);
 
 #ifndef CONFIG_CKRM_CPU_SCHEDULE
 #define EXPIRED_STARVING(rq) \
-       ((STARVATION_LIMIT && ((rq)->expired_timestamp && \
+               ((STARVATION_LIMIT && ((rq)->expired_timestamp && \
                (jiffies - (rq)->expired_timestamp >= \
                        STARVATION_LIMIT * ((rq)->nr_running) + 1))) || \
                        ((rq)->curr->static_prio > (rq)->best_expired_prio))
 #else
+/* we need to scale the starvation limit based on weight:
+ * classes with a small weight get a longer expiration starvation window
+ */
 #define EXPIRED_STARVING(rq) \
-               (STARVATION_LIMIT && ((rq)->expired_timestamp && \
+                ((STARVATION_LIMIT && ((rq)->expired_timestamp && \
                (jiffies - (rq)->expired_timestamp >= \
-                       STARVATION_LIMIT * (lrq_nr_running(rq)) + 1)))
+                       (((STARVATION_LIMIT * (lrq_nr_running(rq)) + 1)*CKRM_MAX_WEIGHT)/rq->local_weight)))) || \
+                       (this_rq()->curr->static_prio > (rq)->best_expired_prio))
 #endif
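
The CKRM form of EXPIRED_STARVING above stretches the starvation window in inverse proportion to the local class weight, so a lightly weighted class may sit on its expired array longer before the scheduler forces an array switch. A user-space model of just that scaling (the STARVATION_LIMIT and CKRM_MAX_WEIGHT values are made up for the example):

#include <stdio.h>

#define STARVATION_LIMIT 10     /* illustrative ticks per runnable task */
#define CKRM_MAX_WEIGHT  100    /* illustrative full-machine weight */

/* jiffies the expired array may age before the class counts as starving */
static unsigned long starvation_threshold(unsigned long nr_running,
                                          unsigned long local_weight)
{
    return ((STARVATION_LIMIT * nr_running + 1) * CKRM_MAX_WEIGHT) / local_weight;
}

int main(void)
{
    /* halving the weight doubles the tolerated age of the expired array */
    printf("%lu %lu\n", starvation_threshold(4, 50), starvation_threshold(4, 25));
    return 0;
}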
 
 /*
@@ -2598,7 +2398,6 @@ void scheduler_tick(int user_ticks, int sys_ticks)
                        cpustat->idle += sys_ticks;
                if (wake_priority_sleeper(rq))
                        goto out;
-               ckrm_sched_tick(jiffies,cpu,rq_ckrm_load(rq));
                rebalance_tick(cpu, rq, IDLE);
                return;
        }
@@ -2639,8 +2438,11 @@ void scheduler_tick(int user_ticks, int sys_ticks)
        }
        if (vx_need_resched(p)) {
 #ifdef CONFIG_CKRM_CPU_SCHEDULE
-               /* Hubertus ... we can abstract this out */
-               ckrm_lrq_t* rq = get_task_lrq(p);
+               /* we redefine RQ to be a local runqueue */
+               ckrm_lrq_t* rq;
+               runqueue_t *cpu_rq = this_rq();
+               rq = ckrm_rq_cpu_enabled(cpu_rq) ? get_task_lrq(p) 
+                                                : &(cpu_rq->dflt_lrq);
 #endif
                dequeue_task(p, rq->active);
                set_tsk_need_resched(p);
@@ -2652,8 +2454,8 @@ void scheduler_tick(int user_ticks, int sys_ticks)
                        rq->expired_timestamp = jiffies;
                if (!TASK_INTERACTIVE(p) || EXPIRED_STARVING(rq)) {
                        enqueue_task(p, rq->expired);
-                       if (p->static_prio < this_rq()->best_expired_prio)
-                               this_rq()->best_expired_prio = p->static_prio;
+                       if (p->static_prio < rq->best_expired_prio)
+                               rq->best_expired_prio = p->static_prio;
                } else
                        enqueue_task(p, rq->active);
        } else {
@@ -2687,7 +2489,6 @@ void scheduler_tick(int user_ticks, int sys_ticks)
 out_unlock:
        spin_unlock(&rq->lock);
 out:
-       ckrm_sched_tick(jiffies,cpu,rq_ckrm_load(rq));
        rebalance_tick(cpu, rq, NOT_IDLE);
 }
 
@@ -2788,21 +2589,17 @@ asmlinkage void __sched schedule(void)
        unsigned long long now;
        unsigned long run_time;
        int cpu;
-#ifdef CONFIG_VSERVER_HARDCPU          
-       struct vx_info *vxi;
-       int maxidle = -HZ;
-#endif
 
-       /*
+
+       /*
         * If a crash dump is in progress, the other CPUs
         * need to wait until it completes.
         * NB: this code is optimized away for kernels without
         * dumping enabled.
         */
-       if (unlikely(dump_oncpu))
-               goto dump_scheduling_disabled;
+        if (unlikely(dump_oncpu))
+                goto dump_scheduling_disabled;
 
-       //WARN_ON(system_state == SYSTEM_BOOTING);
        /*
         * Test if we are atomic.  Since do_exit() needs to call into
         * schedule() atomically, we ignore that path for now.
@@ -2837,19 +2634,8 @@ need_resched:
 
        spin_lock_irq(&rq->lock);
 
-#ifdef CONFIG_CKRM_CPU_SCHEDULE
-       if (prev != rq->idle) {
-               unsigned long long run = now - prev->timestamp;
-               ckrm_lrq_t * lrq = get_task_lrq(prev);
-
-               lrq->lrq_load -= task_load(prev);
-               cpu_demand_event(&prev->demand_stat,CPU_DEMAND_DESCHEDULE,run);
-               lrq->lrq_load += task_load(prev);
+       ckrm_account_task(rq,prev,now);
 
-               cpu_demand_event(get_task_lrq_stat(prev),CPU_DEMAND_DESCHEDULE,run);
-               update_local_cvt(prev, run);
-       }
-#endif
        /*
         * if entering off of a kernel preemption go straight
         * to picking the next task.
@@ -2865,8 +2651,9 @@ need_resched:
        }
 
        cpu = smp_processor_id();
+
 #ifdef CONFIG_VSERVER_HARDCPU          
-       if (!list_empty(&rq->hold_queue)) {
+       if (!list_empty(&rq->hold_queue)) {
                struct list_head *l, *n;
                int ret;
 
@@ -2875,7 +2662,7 @@ need_resched:
                        next = list_entry(l, task_t, run_list);
                        if (vxi == next->vx_info)
                                continue;
-
+                       
                        vxi = next->vx_info;
                        ret = vx_tokens_recalc(vxi);
                        // tokens = vx_tokens_avail(next);
@@ -2885,51 +2672,43 @@ need_resched:
                                next->state &= ~TASK_ONHOLD;
                                recalc_task_prio(next, now);
                                __activate_task(next, rq);
-                               // printk("··· unhold %p\n", next);
+                               // printk("×·· unhold %p\n", next);
                                break;
                        }
                        if ((ret < 0) && (maxidle < ret))
                                maxidle = ret;
-               }       
+               }
        }
-       rq->idle_tokens = -maxidle;
-
-pick_next:
-#endif
-       if (unlikely(!rq->nr_running)) {
-               idle_balance(cpu, rq);
-                if (!rq->nr_running) {
-                        next = rq->idle;
-#ifdef CONFIG_CKRM_CPU_SCHEDULE
-                        rq->expired_timestamp = 0;
+       rq->idle_tokens = -maxidle;
+       
+ pick_next:
 #endif
 #endif
-                        wake_sleeping_dependent(cpu, rq);
-                        goto switch_tasks;
-                }
+       next = rq_get_next_task(rq,cpu);
+       if (unlikely(next == NULL)) {
+               next = rq->idle;
+               goto switch_tasks;
        }
 
-       next = rq_get_next_task(rq);
-
        if (dependent_sleeper(cpu, rq, next)) {
                next = rq->idle;
                goto switch_tasks;
        }
 
 #ifdef CONFIG_VSERVER_HARDCPU          
-       vxi = next->vx_info;
-       if (vxi && __vx_flags(vxi->vx_flags,
-               VXF_SCHED_PAUSE|VXF_SCHED_HARD, 0)) {
-               int ret = vx_tokens_recalc(vxi);
-
-               if (unlikely(ret <= 0)) {
-                       if (ret && (rq->idle_tokens > -ret))
-                               rq->idle_tokens = -ret;
-                       deactivate_task(next, rq);
-                       list_add_tail(&next->run_list, &rq->hold_queue);
-                       next->state |= TASK_ONHOLD;                     
-                       goto pick_next;
-               }
-       }
+       vxi = next->vx_info;
+       if (vxi && __vx_flags(vxi->vx_flags,
+                             VXF_SCHED_PAUSE|VXF_SCHED_HARD, 0)) {
+               int ret = vx_tokens_recalc(vxi);
+               
+               if (unlikely(ret <= 0)) {
+                       if (ret && (rq->idle_tokens > -ret))
+                               rq->idle_tokens = -ret;
+                       deactivate_task(next, rq);
+                       list_add_tail(&next->run_list, &rq->hold_queue);
+                       next->state |= TASK_ONHOLD;                     
+                       goto pick_next;
+               }
+       }
 #endif
 
        if (!rt_task(next) && next->activated > 0) {
@@ -2980,15 +2759,16 @@ switch_tasks:
        if (test_thread_flag(TIF_NEED_RESCHED))
                goto need_resched;
 
-       return;
-
+       
+       return;
+       
  dump_scheduling_disabled:
-       /* allow scheduling only if this is the dumping cpu */
-       if (dump_oncpu != smp_processor_id()+1) {
-               while (dump_oncpu)
-                       cpu_relax();
-       }
-       return;
+       /* allow scheduling only if this is the dumping cpu */
+       if (dump_oncpu != smp_processor_id()+1) {
+               while (dump_oncpu)
+                       cpu_relax();
+       }
+       return;
 }
 
 EXPORT_SYMBOL(schedule);
@@ -3175,11 +2955,11 @@ EXPORT_SYMBOL(wait_for_completion);
        spin_unlock_irqrestore(&q->lock, flags);
 
 #define SLEEP_ON_BKLCHECK                              \
-       if (unlikely(!kernel_locked()) &&               \
-           sleep_on_bkl_warnings < 10) {               \
-               sleep_on_bkl_warnings++;                \
-               WARN_ON(1);                             \
-       }
+       if (unlikely(!kernel_locked()) &&               \
+           sleep_on_bkl_warnings < 10) {               \
+               sleep_on_bkl_warnings++;                \
+               WARN_ON(1);                             \
+       }
 
 static int sleep_on_bkl_warnings;
 
@@ -3202,7 +2982,7 @@ long fastcall __sched interruptible_sleep_on_timeout(wait_queue_head_t *q, long
 {
        SLEEP_ON_VAR
 
-       SLEEP_ON_BKLCHECK
+        SLEEP_ON_BKLCHECK
 
        current->state = TASK_INTERRUPTIBLE;
 
@@ -3215,11 +2995,26 @@ long fastcall __sched interruptible_sleep_on_timeout(wait_queue_head_t *q, long
 
 EXPORT_SYMBOL(interruptible_sleep_on_timeout);
 
+void fastcall __sched sleep_on(wait_queue_head_t *q)
+{
+       SLEEP_ON_VAR
+
+        SLEEP_ON_BKLCHECK
+
+       current->state = TASK_UNINTERRUPTIBLE;
+
+       SLEEP_ON_HEAD
+       schedule();
+       SLEEP_ON_TAIL
+}
+
+EXPORT_SYMBOL(sleep_on);
+
 long fastcall __sched sleep_on_timeout(wait_queue_head_t *q, long timeout)
 {
        SLEEP_ON_VAR
 
-       SLEEP_ON_BKLCHECK
+        SLEEP_ON_BKLCHECK
 
        current->state = TASK_UNINTERRUPTIBLE;
 
@@ -3346,7 +3141,6 @@ int task_nice(const task_t *p)
 {
        return TASK_NICE(p);
 }
-
 EXPORT_SYMBOL(task_nice);
 
 /**
@@ -3969,8 +3763,6 @@ void show_state(void)
        read_unlock(&tasklist_lock);
 }
 
-EXPORT_SYMBOL_GPL(show_state);
-
 void __devinit init_idle(task_t *idle, int cpu)
 {
        runqueue_t *idle_rq = cpu_rq(cpu), *rq = cpu_rq(task_cpu(idle));
@@ -4657,13 +4449,12 @@ void __init sched_init(void)
 
                rq->active = rq->arrays;
                rq->expired = rq->arrays + 1;
+               rq->best_expired_prio = MAX_PRIO;
 #else
                rq = cpu_rq(i);
                spin_lock_init(&rq->lock);
 #endif
 
-               rq->best_expired_prio = MAX_PRIO;
-
 #ifdef CONFIG_SMP
                rq->sd = &sched_domain_init;
                rq->cpu_load = 0;
@@ -4676,7 +4467,7 @@ void __init sched_init(void)
                INIT_LIST_HEAD(&rq->migration_queue);
 #endif
 #ifdef CONFIG_VSERVER_HARDCPU          
-               INIT_LIST_HEAD(&rq->hold_queue);
+               INIT_LIST_HEAD(&rq->hold_queue);
 #endif
                atomic_set(&rq->nr_iowait, 0);
        }
@@ -4712,15 +4503,15 @@ void __might_sleep(char *file, int line, int atomic_depth)
 #ifndef CONFIG_PREEMPT
        atomic_depth = 0;
 #endif
-       if (((in_atomic() != atomic_depth) || irqs_disabled()) &&
+       if ((in_atomic() || irqs_disabled()) &&
            system_state == SYSTEM_RUNNING) {
                if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy)
                        return;
                prev_jiffy = jiffies;
                printk(KERN_ERR "Debug: sleeping function called from invalid"
                                " context at %s:%d\n", file, line);
-               printk("in_atomic():%d[expected: %d], irqs_disabled():%d\n",
-                       in_atomic(), atomic_depth, irqs_disabled());
+               printk("in_atomic():%d, irqs_disabled():%d\n",
+                       in_atomic(), irqs_disabled());
                dump_stack();
        }
 #endif
@@ -4783,6 +4574,20 @@ EXPORT_SYMBOL(task_running_sys);
 #endif
 
 #ifdef CONFIG_CKRM_CPU_SCHEDULE
+
+/********************************************************************
+ *
+ *  CKRM Scheduler additions
+ * 
+ *  (a) helper functions
+ *  (b) load balancing code
+ *
+ *  These are required here to avoid having to externalize many
+ *  of the definitions in sched.c
+ *
+ * 
+ ********************************************************************/
+
 /**
  * return the classqueue object of a certain processor
  */
@@ -4811,4 +4616,559 @@ void _ckrm_cpu_change_class(task_t *tsk, struct ckrm_cpu_class *newcls)
 
        task_rq_unlock(rq,&flags);
 }
+
+/**
+ * get_min_cvt_locking  - get the mininum cvt on a particular cpu under rqlock
+ */
+
+CVT_t get_min_cvt(int cpu);
+
+CVT_t get_min_cvt_locking(int cpu)
+{
+       CVT_t cvt;
+       struct runqueue *rq = cpu_rq(cpu);
+       spin_lock(&rq->lock);
+       cvt = get_min_cvt(cpu);
+       spin_unlock(&rq->lock);
+       return cvt;
+}
+
+ckrm_lrq_t *rq_get_dflt_lrq(int cpu)
+{
+       return &(cpu_rq(cpu)->dflt_lrq);
+}
+
+#ifdef CONFIG_SMP
+
+/**************  CKRM Load Balancing code ************************/
+
+static inline int ckrm_preferred_task(task_t *tmp,long min, long max, 
+                                     int phase, enum idle_type idle)
+{
+       long pressure = task_load(tmp);
+       
+       if (pressure > max) 
+               return 0;
+
+       if ((idle == NOT_IDLE) && ! phase && (pressure <= min))
+               return 0;
+       return 1;
+}
+
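
ckrm_preferred_task() above implements a two-pass filter on per-task load "pressure": candidates above pressure_max are always skipped, and on the first pass of a busy-CPU (NOT_IDLE) balance anything at or below pressure_min is skipped as well, so medium-sized tasks migrate first. A minimal stand-alone version of the same predicate, with idle_type reduced to an int flag:

#include <stdio.h>

#define NOT_IDLE 0   /* stands in for enum idle_type's "busy CPU" case */

/* Return 1 if a task with this pressure should be migrated in this phase. */
static int preferred_task(long pressure, long min, long max, int phase, int idle)
{
    if (pressure > max)
        return 0;                          /* would overshoot the balance */
    if (idle == NOT_IDLE && !phase && pressure <= min)
        return 0;                          /* too small to bother on pass 0 */
    return 1;
}

int main(void)
{
    /* pass 0 skips a tiny task, pass 1 accepts it */
    printf("%d %d\n",
           preferred_task(1, 5, 100, 0, NOT_IDLE),
           preferred_task(1, 5, 100, 1, NOT_IDLE));
    return 0;
}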
+/*
+ * move tasks for a specific local class
+ * return number of tasks pulled
+ */
+static inline int ckrm_cls_move_tasks(ckrm_lrq_t* src_lrq,ckrm_lrq_t*dst_lrq,
+                                     runqueue_t *this_rq,
+                                     runqueue_t *busiest,
+                                     struct sched_domain *sd,
+                                     int this_cpu,
+                                     enum idle_type idle,
+                                     long* pressure_imbalance) 
+{
+       prio_array_t *array, *dst_array;
+       struct list_head *head, *curr;
+       task_t *tmp;
+       int idx;
+       int pulled = 0;
+       int phase = -1;
+       long pressure_min, pressure_max;
+       /*hzheng: magic : 90% balance is enough*/
+       long balance_min = *pressure_imbalance / 10; 
+/*
+ * we don't want to migrate tasks that will reverse the balance
+ *     or the tasks that make too small difference
+ */
+#define CKRM_BALANCE_MAX_RATIO 100
+#define CKRM_BALANCE_MIN_RATIO 1
+ start:
+       phase ++;
+       /*
+        * We first consider expired tasks. Those will likely not be
+        * executed in the near future, and they are most likely to
+        * be cache-cold, thus switching CPUs has the least effect
+        * on them.
+        */
+       if (src_lrq->expired->nr_active) {
+               array = src_lrq->expired;
+               dst_array = dst_lrq->expired;
+       } else {
+               array = src_lrq->active;
+               dst_array = dst_lrq->active;
+       }
+       
+ new_array:
+       /* Start searching at priority 0: */
+       idx = 0;
+ skip_bitmap:
+       if (!idx)
+               idx = sched_find_first_bit(array->bitmap);
+       else
+               idx = find_next_bit(array->bitmap, MAX_PRIO, idx);
+       if (idx >= MAX_PRIO) {
+               if (array == src_lrq->expired && src_lrq->active->nr_active) {
+                       array = src_lrq->active;
+                       dst_array = dst_lrq->active;
+                       goto new_array;
+               }
+               if ((! phase) && (! pulled) && (idle != IDLE))
+                       goto start; //try again
+               else 
+                       goto out; //finished search for this lrq
+       }
+       
+       head = array->queue + idx;
+       curr = head->prev;
+ skip_queue:
+       tmp = list_entry(curr, task_t, run_list);
+       
+       curr = curr->prev;
+       
+       if (!can_migrate_task(tmp, busiest, this_cpu, sd, idle)) {
+               if (curr != head)
+                       goto skip_queue;
+               idx++;
+               goto skip_bitmap;
+       }
+
+       pressure_min = *pressure_imbalance * CKRM_BALANCE_MIN_RATIO/100;
+       pressure_max = *pressure_imbalance * CKRM_BALANCE_MAX_RATIO/100;
+       /*
+        * skip the tasks that will reverse the balance too much
+        */
+       if (ckrm_preferred_task(tmp,pressure_min,pressure_max,phase,idle)) {
+               *pressure_imbalance -= task_load(tmp);
+               pull_task(busiest, array, tmp, 
+                         this_rq, dst_array, this_cpu);
+               pulled++;
+
+               if (*pressure_imbalance <= balance_min)
+                       goto out;
+       }
+               
+       if (curr != head)
+               goto skip_queue;
+       idx++;
+       goto skip_bitmap;
+ out:         
+       return pulled;
+}
+
+static inline long ckrm_rq_imbalance(runqueue_t *this_rq,runqueue_t *dst_rq)
+{
+       long imbalance;
+       /*
+        * make sure after balance, imbalance' > - imbalance/2
+        * we don't want the imbalance be reversed too much
+        */
+       imbalance = ckrm_get_pressure(rq_ckrm_load(dst_rq),0) 
+               - ckrm_get_pressure(rq_ckrm_load(this_rq),1);
+       imbalance /= 2;
+       return imbalance;
+}
+
+/*
+ * try to balance the two runqueues
+ *
+ * Called with both runqueues locked.
+ * if move_tasks is called, it will try to move at least one task over
+ */
+static int ckrm_move_tasks(runqueue_t *this_rq, int this_cpu, 
+                          runqueue_t *busiest,
+                          unsigned long max_nr_move, struct sched_domain *sd,
+                          enum idle_type idle)
+{
+       struct ckrm_cpu_class *clsptr,*vip_cls = NULL;
+       ckrm_lrq_t* src_lrq,*dst_lrq;
+       long pressure_imbalance, pressure_imbalance_old;
+       int src_cpu = task_cpu(busiest->curr);
+       struct list_head *list;
+       int pulled = 0;
+       long imbalance;
+
+       imbalance =  ckrm_rq_imbalance(this_rq,busiest);
+
+       if ((idle == NOT_IDLE && imbalance <= 0) || busiest->nr_running <= 1)
+               goto out;
+
+       //try to find the vip class
+        list_for_each_entry(clsptr,&active_cpu_classes,links) {
+               src_lrq = get_ckrm_lrq(clsptr,src_cpu);
+
+               if (! lrq_nr_running(src_lrq))
+                       continue;
+
+               if (! vip_cls || cpu_class_weight(vip_cls) < cpu_class_weight(clsptr) )  
+                       {
+                               vip_cls = clsptr;
+                       }
+       }
+
+       /*
+        * do search from the most significant class
+        * hopefully, less tasks will be migrated this way
+        */
+       clsptr = vip_cls;
+
+ move_class:
+       if (! clsptr)
+               goto out;
+       
+
+       src_lrq = get_ckrm_lrq(clsptr,src_cpu);
+       if (! lrq_nr_running(src_lrq))
+               goto other_class;
+       
+       dst_lrq = get_ckrm_lrq(clsptr,this_cpu);
+
+       //how much pressure for this class should be transferred
+       pressure_imbalance = (src_lrq->lrq_load * imbalance)/WEIGHT_TO_SHARE(src_lrq->local_weight);
+       if (pulled && ! pressure_imbalance) 
+               goto other_class;
+       
+       pressure_imbalance_old = pressure_imbalance;
+       
+       //move tasks
+       pulled += 
+               ckrm_cls_move_tasks(src_lrq,dst_lrq,
+                                   this_rq,
+                                   busiest,
+                                   sd,this_cpu,idle,
+                                   &pressure_imbalance);
+
+       /* 
+        * hzheng: 2 is another magic number
+        * stop balancing if the imbalance is less than 25% of the orig
+        */
+       if (pressure_imbalance <= (pressure_imbalance_old >> 2))
+               goto out;
+               
+       //update imbalance
+       imbalance *= pressure_imbalance / pressure_imbalance_old;
+ other_class:
+       //who is next?
+       list = clsptr->links.next;
+       if (list == &active_cpu_classes)
+               list = list->next;
+       clsptr = list_entry(list, typeof(*clsptr), links);
+       if (clsptr != vip_cls)
+               goto move_class;
+ out:
+       return pulled;
+}
+
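
ckrm_move_tasks() above converts the (already halved, see ckrm_rq_imbalance) runqueue pressure gap into a per-class budget proportional to each class's load, walks the classes starting from the heaviest-weighted one, and stops once the outstanding budget has fallen to roughly a quarter of where it started (the ">> 2" magic). The schematic model below uses invented loads and a fixed "pressure per migrated task" purely to make that bookkeeping visible:

#include <stdio.h>

int main(void)
{
    long imbalance = (1000 - 400) / 2;      /* halve the gap so it cannot be reversed */
    long class_share[3] = { 60, 30, 10 };   /* percent of the load, heaviest class first */

    for (int c = 0; c < 3; c++) {
        long budget = imbalance * class_share[c] / 100;
        long left   = budget;

        while (left >= 40)                  /* pretend each migrated task moves 40 units */
            left -= 40;

        printf("class %d: budget %ld, left %ld\n", c, budget, left);

        if (budget && left <= budget / 4)
            break;                          /* balanced "enough": stop scanning classes */
    }
    return 0;
}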
+/**
+ * ckrm_check_balance - is load balancing necessary?
+ * return 0 if load balancing is not necessary
+ * otherwise return the average load of the system
+ * also, update nr_group
+ *
+ * heuristics: 
+ *   no load balancing if its load is over the average
+ *   no load balancing if its load is far more than the min
+ * task:
+ *   read the status of all the runqueues
+ */
+static unsigned long ckrm_check_balance(struct sched_domain *sd, int this_cpu,
+                                            enum idle_type idle, int* nr_group)
+{
+       struct sched_group *group = sd->groups;
+       unsigned long min_load, max_load, avg_load;
+       unsigned long total_load, this_load, total_pwr;
+
+       max_load = this_load = total_load = total_pwr = 0;
+       min_load = 0xFFFFFFFF;
+       *nr_group = 0;
+
+       do {
+               cpumask_t tmp;
+               unsigned long load;
+               int local_group;
+               int i, nr_cpus = 0;
+
+               /* Tally up the load of all CPUs in the group */
+               cpus_and(tmp, group->cpumask, cpu_online_map);
+               if (unlikely(cpus_empty(tmp)))
+                       goto nextgroup;
+
+               avg_load = 0;
+               local_group = cpu_isset(this_cpu, group->cpumask);
+
+               for_each_cpu_mask(i, tmp) {
+                       load = ckrm_get_pressure(rq_ckrm_load(cpu_rq(i)),local_group);
+                       nr_cpus++;
+                       avg_load += load;
+               }
+
+               if (!nr_cpus)
+                       goto nextgroup;
+
+               total_load += avg_load;
+               total_pwr += group->cpu_power;
+
+               /* Adjust by relative CPU power of the group */
+               avg_load = (avg_load * SCHED_LOAD_SCALE) / group->cpu_power;
+
+               if (local_group) {
+                       this_load = avg_load;
+                       goto nextgroup;
+               } else if (avg_load > max_load) {
+                       max_load = avg_load;
+               }      
+               if (avg_load < min_load) {
+                       min_load = avg_load;
+               }
+nextgroup:
+               group = group->next;
+               *nr_group = *nr_group + 1;
+       } while (group != sd->groups);
+
+       if (!max_load || this_load >= max_load)
+               goto out_balanced;
+
+       avg_load = (SCHED_LOAD_SCALE * total_load) / total_pwr;
+
+       /* hzheng: debugging: 105 is a magic number
+        * 100*max_load <= sd->imbalance_pct*this_load)
+        * should use imbalance_pct instead
+        */
+       if (this_load > avg_load 
+           || 100*max_load < 105*this_load
+           || 100*min_load < 70*this_load
+           )
+               goto out_balanced;
+
+       return avg_load;
+ out_balanced:
+       return 0;
+}
+
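
ckrm_check_balance() above declines to balance when this CPU's pressure already exceeds the average, when the busiest group is within about 5% of it, or when the least loaded group is more than about 30% below it (the 105/70 magic numbers the comment itself flags as debugging values). The same decision, lifted out as a stand-alone check with the average supplied by the caller:

#include <stdio.h>

/* Returns 0 when no balancing is warranted, mirroring the 105/70 heuristics. */
static unsigned long should_balance(unsigned long this_load, unsigned long max_load,
                                    unsigned long min_load, unsigned long avg_load)
{
    if (!max_load || this_load >= max_load)
        return 0;
    if (this_load > avg_load ||
        100 * max_load < 105 * this_load ||
        100 * min_load < 70 * this_load)
        return 0;
    return avg_load;            /* non-zero average: go pick a busy queue */
}

int main(void)
{
    printf("%lu\n", should_balance(100, 300, 90, 180));   /* clearly unbalanced */
    printf("%lu\n", should_balance(200, 205, 195, 200));  /* within 5%: stay put */
    return 0;
}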
+/**
+ * any group that has above average load is considered busy
+ * find the busiest queue from any of busy group
+ */
+static runqueue_t *
+ckrm_find_busy_queue(struct sched_domain *sd, int this_cpu,
+                    unsigned long avg_load, enum idle_type idle,
+                    int nr_group)
+{
+       struct sched_group *group;
+       runqueue_t * busiest=NULL;
+       unsigned long rand;
+       
+       group = sd->groups;
+       rand = get_ckrm_rand(nr_group);
+       nr_group = 0;
+
+       do {
+               unsigned long load,total_load,max_load;
+               cpumask_t tmp;
+               int i;
+               runqueue_t * grp_busiest;
+
+               cpus_and(tmp, group->cpumask, cpu_online_map);
+               if (unlikely(cpus_empty(tmp)))
+                       goto find_nextgroup;
+
+               total_load = 0;
+               max_load = 0;
+               grp_busiest = NULL;
+               for_each_cpu_mask(i, tmp) {
+                       load = ckrm_get_pressure(rq_ckrm_load(cpu_rq(i)),0);
+                       total_load += load;
+                       if (load > max_load) {
+                               max_load = load;
+                               grp_busiest = cpu_rq(i);
+                       }                               
+               }
+
+               total_load = (total_load * SCHED_LOAD_SCALE) / group->cpu_power;
+               if (total_load > avg_load) {
+                       busiest = grp_busiest;
+                       if (nr_group >= rand)
+                               break;
+               }
+       find_nextgroup:         
+               group = group->next;
+               nr_group ++;
+       } while (group != sd->groups);
+
+       return busiest;
+}
+
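
ckrm_find_busy_queue() above treats every group whose power-adjusted pressure is above the average as busy, and uses a random count from get_ckrm_rand() so that concurrent balancers do not all raid the same victim. The toy version below collapses groups to single CPUs and takes the random index as a parameter, but keeps the "remember the last busy candidate, stop once we have walked past rand of them" shape:

#include <stdio.h>

/* Return the index of the busy entry reached at or after position rand
 * (or the last busy entry seen); -1 means nothing is above the average. */
static int pick_busy_cpu(const long load[], int ncpu, long avg, int rand)
{
    int pick = -1;

    for (int i = 0; i < ncpu; i++) {
        if (load[i] > avg) {
            pick = i;           /* remember the most recent busy candidate */
            if (i >= rand)
                break;          /* walked far enough: spread the victims around */
        }
    }
    return pick;
}

int main(void)
{
    long load[5] = { 10, 90, 40, 120, 70 };

    printf("%d %d\n", pick_busy_cpu(load, 5, 60, 0), pick_busy_cpu(load, 5, 60, 3));
    return 0;
}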
+/**
+ * load_balance - pressure based load balancing algorithm used by ckrm
+ */
+static int ckrm_load_balance_locked(int this_cpu, runqueue_t *this_rq,
+                                   struct sched_domain *sd, 
+                                   enum idle_type idle)
+{
+       runqueue_t *busiest;
+       unsigned long avg_load;
+       int nr_moved,nr_group;
+
+       avg_load = ckrm_check_balance(sd, this_cpu, idle, &nr_group);
+       if (! avg_load)
+               goto out_balanced;
+
+       busiest = ckrm_find_busy_queue(sd,this_cpu,avg_load,idle,nr_group);
+       if (! busiest)
+               goto out_balanced;
+       /*
+        * This should be "impossible", but since load
+        * balancing is inherently racy and statistical,
+        * it could happen in theory.
+        */
+       if (unlikely(busiest == this_rq)) {
+               WARN_ON(1);
+               goto out_balanced;
+       }
+
+       nr_moved = 0;
+       if (busiest->nr_running > 1) {
+               /*
+                * Attempt to move tasks. If find_busiest_group has found
+                * an imbalance but busiest->nr_running <= 1, the group is
+                * still unbalanced. nr_moved simply stays zero, so it is
+                * correctly treated as an imbalance.
+                */
+               double_lock_balance(this_rq, busiest);
+               nr_moved = ckrm_move_tasks(this_rq, this_cpu, busiest,
+                                          0,sd, idle);         
+               spin_unlock(&busiest->lock);
+               if (nr_moved) {
+                       adjust_local_weight();
+               }
+       }
+
+       if (!nr_moved) 
+               sd->nr_balance_failed ++;
+       else
+               sd->nr_balance_failed  = 0;             
+
+       /* We were unbalanced, so reset the balancing interval */
+       sd->balance_interval = sd->min_interval;
+
+       return nr_moved;
+
+out_balanced:
+       /* tune up the balancing interval */
+       if (sd->balance_interval < sd->max_interval)
+               sd->balance_interval *= 2;
+
+       return 0;
+}
+
+static inline int ckrm_load_balance(int this_cpu, runqueue_t *this_rq,
+                                   struct sched_domain *sd, 
+                                   enum idle_type idle)
+{
+       int ret;
+
+       if (ckrm_rq_cpu_disabled(this_rq)) 
+               return -1;
+       //spin_lock(&this_rq->lock);
+       read_lock(&class_list_lock);
+       ret = ckrm_load_balance_locked(this_cpu,this_rq,sd,idle);
+       // ret = ckrm_load_balance_locked(this_cpu,this_rq,sd,NEWLY_IDLE);
+       read_unlock(&class_list_lock);
+       //spin_unlock(&this_rq->lock);
+       return ret;
+}
+
+#endif   // CONFIG_SMP
+
+
+void ckrm_cpu_class_queue_update(int on)
+{
+       /* This is called when the mode changes from disabled
+        * to enabled (on=1) or vice versa (on=0).
+        * We make sure that all classqueues on all cpus
+        * either have the default class enqueued (on=1) or
+        * all classes dequeued (on=0).
+        * If this is not done, a race condition will persist
+        * when flipping ckrm_sched_mode, and rq_get_next_task
+        * would need more complicated code to cope with
+        * knowing of runnable tasks while finding no enqueued
+        * class.
+        */
+
+       int i;
+       runqueue_t *rq;
+       ckrm_lrq_t *lrq;
+       struct ckrm_cpu_class *clsptr;
+
+       if (on) {       
+               BUG_ON(ckrm_cpu_enabled());
+               for_each_cpu(i) {
+                       rq = cpu_rq(i);
+                       BUG_ON(ckrm_rq_cpu_enabled(rq));
+                       lrq = &rq->dflt_lrq;
+                       spin_lock(&rq->lock);
+
+                       BUG_ON(cls_in_classqueue(&lrq->classqueue_linkobj));
+
+                       classqueue_init(&rq->classqueue,1);
+                       lrq->top_priority = find_first_bit(lrq->active->bitmap,
+                                                          MAX_PRIO),
+                       classqueue_enqueue(lrq->classqueue, 
+                                          &lrq->classqueue_linkobj, 0);
+                       spin_unlock(&rq->lock);
+#if 0
+                       printk("UPDATE(%d) run=%lu:%d:%d %d:%d->%d\n", i,
+                               rq->nr_running,lrq->active->nr_active,
+                               lrq->expired->nr_active,
+                               find_first_bit(lrq->active->bitmap,MAX_PRIO),
+                               find_first_bit(lrq->expired->bitmap,MAX_PRIO),
+                               lrq->top_priority);
 #endif
+               }
+       } else {
+               for_each_cpu(i) {
+                       rq = cpu_rq(i);
+                       spin_lock(&rq->lock);
+
+                       /* walk through all classes and make sure they
+                        * are not enqueued
+                        */
+                       write_lock(&class_list_lock);
+                       list_for_each_entry(clsptr,&active_cpu_classes,links) {
+                               lrq = get_ckrm_lrq(clsptr,i);
+                               BUG_ON((lrq != &rq->dflt_lrq) && lrq_nr_running(lrq));  // must be empty
+                               if (cls_in_classqueue(&lrq->classqueue_linkobj)) 
+                                       classqueue_dequeue(lrq->classqueue,
+                                                       &lrq->classqueue_linkobj);
+                       }
+                       rq->classqueue.enabled = 0;
+                       write_unlock(&class_list_lock);
+                       spin_unlock(&rq->lock);
+               }
+       }
+}
+
+/*
+ * callback when a class is getting deleted
+ * need to remove it from the class runqueue. see (class_queue_update)
+ */
+
+void ckrm_cpu_class_queue_delete_sync(struct ckrm_cpu_class *clsptr)
+{
+       int i;
+       
+       for_each_cpu(i) {
+               runqueue_t *rq = cpu_rq(i);
+               ckrm_lrq_t *lrq = get_ckrm_lrq(clsptr,i);
+
+               spin_lock(&rq->lock);
+               write_lock(&class_list_lock);
+               BUG_ON(lrq_nr_running(lrq));  // must be empty
+               if (cls_in_classqueue(&lrq->classqueue_linkobj)) 
+                       classqueue_dequeue(lrq->classqueue,
+                                          &lrq->classqueue_linkobj);
+               write_unlock(&class_list_lock);
+               spin_unlock(&rq->lock);
+       }
+}
+
+#endif  // CONFIG_CKRM_CPU_SCHEDULE
index c69f6ed..6e8b073 100644 (file)
@@ -18,6 +18,8 @@
 #include <linux/init.h>
 #include <linux/highuid.h>
 #include <linux/fs.h>
+#include <linux/kernel.h>
+#include <linux/kexec.h>
 #include <linux/workqueue.h>
 #include <linux/device.h>
 #include <linux/times.h>
@@ -511,6 +513,25 @@ asmlinkage long sys_reboot(int magic1, int magic2, unsigned int cmd, void __user
                machine_restart(buffer);
                break;
 
+#ifdef CONFIG_KEXEC
+       case LINUX_REBOOT_CMD_KEXEC:
+       {
+               struct kimage *image;
+               image = xchg(&kexec_image, 0);
+               if (!image) {
+                       unlock_kernel();
+                       return -EINVAL;
+               }
+               notifier_call_chain(&reboot_notifier_list, SYS_RESTART, NULL);
+               system_state = SYSTEM_RESTART;
+               device_shutdown();
+               system_state = SYSTEM_BOOTING;
+               printk(KERN_EMERG "Starting new kernel\n");
+               machine_shutdown();
+               machine_kexec(image);
+               break;
+       }
+#endif
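
The LINUX_REBOOT_CMD_KEXEC case above only proceeds when an image has already been staged in kexec_image (otherwise it returns -EINVAL); it then runs the reboot notifiers, shuts devices down and jumps into the new kernel via machine_kexec(). From user space the jump is requested through the reboot syscall, roughly as sketched below. This assumes a kernel built with CONFIG_KEXEC, the LINUX_REBOOT_CMD_KEXEC definition added to include/linux/reboot.h by this same commit, CAP_SYS_BOOT privileges, and an image loaded beforehand by a kexec-style loader; it is a hedged illustration, not a replacement for the real tooling.

#include <stdio.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <linux/reboot.h>

int main(void)
{
    /* Ask the running kernel to switch to the previously loaded kexec image.
     * Fails with EINVAL if nothing has been staged in kexec_image. */
    long ret = syscall(SYS_reboot, LINUX_REBOOT_MAGIC1, LINUX_REBOOT_MAGIC2,
                       LINUX_REBOOT_CMD_KEXEC, NULL);
    if (ret != 0)
        perror("reboot(LINUX_REBOOT_CMD_KEXEC)");
    return ret ? 1 : 0;
}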
 #ifdef CONFIG_SOFTWARE_SUSPEND
        case LINUX_REBOOT_CMD_SW_SUSPEND:
                {
diff --git a/lib/.cvsignore b/lib/.cvsignore
new file mode 100644 (file)
index 0000000..30d3818
--- /dev/null
@@ -0,0 +1,2 @@
+crc32table.h
+gen_crc32table
index b58141e..c4bae8c 100644 (file)
@@ -628,5 +628,50 @@ config IP_NF_MATCH_REALM
          If you want to compile it as a module, say M here and read
          Documentation/modules.txt.  If unsure, say `N'.
 
+config IP_NF_CT_ACCT
+       bool "Connection tracking flow accounting"
+       depends on IP_NF_CONNTRACK
+
+config IP_NF_CT_PROTO_GRE
+       tristate  ' GRE protocol support'
+       depends on IP_NF_CONNTRACK
+       help
+         This module adds generic support for connection tracking and NAT of the
+         GRE protocol (RFC1701, RFC2784).  Please note that this will only work
+         with GRE connections using the key field of the GRE header.
+       
+         You will need GRE support to enable PPTP support.
+       
+         If you want to compile it as a module, say `M' here and read
+         Documentation/modules.txt.  If unsure, say `N'.
+
+config IP_NF_PPTP
+       tristate  'PPTP protocol support'
+       depends on IP_NF_CT_PROTO_GRE
+       help
+         This module adds support for PPTP (Point to Point Tunnelling Protocol, 
+         RFC2637) connection tracking and NAT.
+       
+         If you are running PPTP sessions over a stateful firewall or NAT box,
+         you may want to enable this feature.  
+       
+         Please note that not all PPTP modes of operation are supported yet.
+         For more info, read top of the file net/ipv4/netfilter/ip_conntrack_pptp.c
+       
+         If you want to compile it as a module, say M here and read
+         Documentation/modules.txt.  If unsure, say `N'.
+
+config IP_NF_NAT_PPTP
+       tristate
+       depends on IP_NF_NAT!=n && IP_NF_PPTP!=n
+       default IP_NF_NAT if IP_NF_PPTP=y
+       default m if IP_NF_PPTP=m
+
+config IP_NF_NAT_PROTO_GRE
+       tristate
+       depends on IP_NF_NAT!=n && IP_NF_CT_PROTO_GRE!=n
+       default IP_NF_NAT if IP_NF_CT_PROTO_GRE=y
+       default m if IP_NF_CT_PROTO_GRE=m
+
 endmenu
 
index bdb23fd..f54887b 100644 (file)
@@ -19,17 +19,25 @@ ipchains-objs               := $(ip_nf_compat-objs) ipchains_core.o
 # connection tracking
 obj-$(CONFIG_IP_NF_CONNTRACK) += ip_conntrack.o
 
+# connection tracking protocol helpers
+obj-$(CONFIG_IP_NF_CT_PROTO_GRE) += ip_conntrack_proto_gre.o
+
+# NAT protocol helpers
+obj-$(CONFIG_IP_NF_NAT_PROTO_GRE) += ip_nat_proto_gre.o
+
 # connection tracking helpers
 obj-$(CONFIG_IP_NF_AMANDA) += ip_conntrack_amanda.o
 obj-$(CONFIG_IP_NF_TFTP) += ip_conntrack_tftp.o
 obj-$(CONFIG_IP_NF_FTP) += ip_conntrack_ftp.o
 obj-$(CONFIG_IP_NF_IRC) += ip_conntrack_irc.o
+obj-$(CONFIG_IP_NF_PPTP) += ip_conntrack_pptp.o
 
 # NAT helpers 
 obj-$(CONFIG_IP_NF_NAT_AMANDA) += ip_nat_amanda.o
 obj-$(CONFIG_IP_NF_NAT_TFTP) += ip_nat_tftp.o
 obj-$(CONFIG_IP_NF_NAT_FTP) += ip_nat_ftp.o
 obj-$(CONFIG_IP_NF_NAT_IRC) += ip_nat_irc.o
+obj-$(CONFIG_IP_NF_NAT_PPTP) += ip_nat_pptp.o
 
 # generic IP tables 
 obj-$(CONFIG_IP_NF_IPTABLES) += ip_tables.o
index 4e8f4d8..40ed447 100644 (file)
@@ -58,7 +58,7 @@ static int help(struct sk_buff *skb,
 
        /* increase the UDP timeout of the master connection as replies from
         * Amanda clients to the server can be quite delayed */
-       ip_ct_refresh(ct, master_timeout * HZ);
+       ip_ct_refresh_acct(ct, ctinfo, NULL, master_timeout * HZ);
 
        /* No data? */
        dataoff = skb->nh.iph->ihl*4 + sizeof(struct udphdr);
index 05fbb43..757af68 100644 (file)
@@ -143,6 +143,7 @@ get_tuple(const struct iphdr *iph,
        tuple->src.ip = iph->saddr;
        tuple->dst.ip = iph->daddr;
        tuple->dst.protonum = iph->protocol;
+       tuple->src.u.all = tuple->dst.u.all = 0;
 
        return protocol->pkt_to_tuple(skb, dataoff, tuple);
 }
@@ -156,6 +157,8 @@ invert_tuple(struct ip_conntrack_tuple *inverse,
        inverse->dst.ip = orig->src.ip;
        inverse->dst.protonum = orig->dst.protonum;
 
+       inverse->src.u.all = inverse->dst.u.all = 0;
+
        return protocol->invert_tuple(inverse, orig);
 }
 
@@ -976,8 +979,8 @@ int ip_conntrack_expect_related(struct ip_conntrack_expect *expect,
         * so there is no need to use the tuple lock too */
 
        DEBUGP("ip_conntrack_expect_related %p\n", related_to);
-       DEBUGP("tuple: "); DUMP_TUPLE(&expect->tuple);
-       DEBUGP("mask:  "); DUMP_TUPLE(&expect->mask);
+       DEBUGP("tuple: "); DUMP_TUPLE_RAW(&expect->tuple);
+       DEBUGP("mask:  "); DUMP_TUPLE_RAW(&expect->mask);
 
        old = LIST_FIND(&ip_conntrack_expect_list, resent_expect,
                        struct ip_conntrack_expect *, &expect->tuple, 
@@ -1070,15 +1073,14 @@ int ip_conntrack_change_expect(struct ip_conntrack_expect *expect,
 
        MUST_BE_READ_LOCKED(&ip_conntrack_lock);
        WRITE_LOCK(&ip_conntrack_expect_tuple_lock);
-
        DEBUGP("change_expect:\n");
-       DEBUGP("exp tuple: "); DUMP_TUPLE(&expect->tuple);
-       DEBUGP("exp mask:  "); DUMP_TUPLE(&expect->mask);
-       DEBUGP("newtuple:  "); DUMP_TUPLE(newtuple);
+       DEBUGP("exp tuple: "); DUMP_TUPLE_RAW(&expect->tuple);
+       DEBUGP("exp mask:  "); DUMP_TUPLE_RAW(&expect->mask);
+       DEBUGP("newtuple:  "); DUMP_TUPLE_RAW(newtuple);
        if (expect->ct_tuple.dst.protonum == 0) {
                /* Never seen before */
                DEBUGP("change expect: never seen before\n");
-               if (!ip_ct_tuple_equal(&expect->tuple, newtuple) 
+               if (!ip_ct_tuple_mask_cmp(&expect->tuple, newtuple, &expect->mask)
                    && LIST_FIND(&ip_conntrack_expect_list, expect_clash,
                                 struct ip_conntrack_expect *, newtuple, &expect->mask)) {
                        /* Force NAT to find an unused tuple */
@@ -1166,21 +1168,39 @@ void ip_conntrack_helper_unregister(struct ip_conntrack_helper *me)
        synchronize_net();
 }
 
-/* Refresh conntrack for this many jiffies. */
-void ip_ct_refresh(struct ip_conntrack *ct, unsigned long extra_jiffies)
+static inline void ct_add_counters(struct ip_conntrack *ct,
+                                  enum ip_conntrack_info ctinfo,
+                                  const struct sk_buff *skb)
+{
+#ifdef CONFIG_IP_NF_CT_ACCT
+       if (skb) {
+               ct->counters[CTINFO2DIR(ctinfo)].packets++;
+               ct->counters[CTINFO2DIR(ctinfo)].bytes += 
+                                       ntohs(skb->nh.iph->tot_len);
+       }
+#endif
+}
+
+/* Refresh conntrack for this many jiffies and do accounting (if skb != NULL) */
+void ip_ct_refresh_acct(struct ip_conntrack *ct, 
+                       enum ip_conntrack_info ctinfo,
+                       const struct sk_buff *skb,
+                       unsigned long extra_jiffies)
 {
        IP_NF_ASSERT(ct->timeout.data == (unsigned long)ct);
 
        /* If not in hash table, timer will not be active yet */
-       if (!is_confirmed(ct))
+       if (!is_confirmed(ct)) {
                ct->timeout.expires = extra_jiffies;
-       else {
+               ct_add_counters(ct, ctinfo, skb);
+       } else {
                WRITE_LOCK(&ip_conntrack_lock);
                /* Need del_timer for race avoidance (may already be dying). */
                if (del_timer(&ct->timeout)) {
                        ct->timeout.expires = jiffies + extra_jiffies;
                        add_timer(&ct->timeout);
                }
+               ct_add_counters(ct, ctinfo, skb);
                WRITE_UNLOCK(&ip_conntrack_lock);
        }
 }
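
(Illustration, not part of the patch.) The hunks above replace ip_ct_refresh() with the accounting-aware ip_ct_refresh_acct(), which also charges the packet to ct->counters[CTINFO2DIR(ctinfo)] when CONFIG_IP_NF_CT_ACCT is set. A minimal sketch of the call pattern the per-protocol handlers below are converted to; my_proto_timeout is a hypothetical timeout value:

	/* Sketch only: mirrors the converted protocol handlers in this commit. */
	static unsigned long my_proto_timeout = 30 * HZ;	/* hypothetical value */

	static int my_proto_packet(struct ip_conntrack *ct,
				   const struct sk_buff *skb,
				   enum ip_conntrack_info ctinfo)
	{
		/* Rearms ct->timeout and, under CONFIG_IP_NF_CT_ACCT, bumps
		 * ct->counters[CTINFO2DIR(ctinfo)].packets and .bytes. */
		ip_ct_refresh_acct(ct, ctinfo, skb, my_proto_timeout);
		return NF_ACCEPT;
	}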
diff --git a/net/ipv4/netfilter/ip_conntrack_proto_generic.c b/net/ipv4/netfilter/ip_conntrack_proto_generic.c
index 0df558a..6a7db77 100644 (file)
@@ -50,9 +50,9 @@ static unsigned int generic_print_conntrack(char *buffer,
 /* Returns verdict for packet, or -1 for invalid. */
 static int packet(struct ip_conntrack *conntrack,
                  const struct sk_buff *skb,
-                 enum ip_conntrack_info conntrackinfo)
+                 enum ip_conntrack_info ctinfo)
 {
-       ip_ct_refresh(conntrack, ip_ct_generic_timeout);
+       ip_ct_refresh_acct(conntrack, ctinfo, skb, ip_ct_generic_timeout);
        return NF_ACCEPT;
 }
 
diff --git a/net/ipv4/netfilter/ip_conntrack_proto_gre.c b/net/ipv4/netfilter/ip_conntrack_proto_gre.c
index 013f759..edccfe8 100644 (file)
@@ -130,13 +130,6 @@ int ip_ct_gre_keymap_add(struct ip_conntrack_expect *exp,
 void ip_ct_gre_keymap_change(struct ip_ct_gre_keymap *km,
                             struct ip_conntrack_tuple *t)
 {
-        if (!km)
-        {
-                printk(KERN_WARNING
-                        "NULL GRE conntrack keymap change requested\n");
-                return;
-        }
-
        DEBUGP("changing entry %p to: ", km);
        DUMP_TUPLE_GRE(t);
 
        DEBUGP("changing entry %p to: ", km);
        DUMP_TUPLE_GRE(t);
 
@@ -188,8 +181,7 @@ static int gre_pkt_to_tuple(const struct sk_buff *skb,
        u_int32_t srckey;
 
        grehdr = skb_header_pointer(skb, dataoff, sizeof(_grehdr), &_grehdr);
-       /* PPTP header is variable length, only need up to the call_id field */
-       pgrehdr = skb_header_pointer(skb, dataoff, 8, &_pgrehdr);
+       pgrehdr = skb_header_pointer(skb, dataoff, sizeof(_pgrehdr), &_pgrehdr);
 
        if (!grehdr || !pgrehdr)
                return 0;
@@ -219,11 +211,11 @@ static int gre_pkt_to_tuple(const struct sk_buff *skb,
 
        srckey = gre_keymap_lookup(tuple);
 
-       tuple->src.u.gre.key = srckey;
 #if 0
        DEBUGP("found src key %x for tuple ", ntohl(srckey));
        DUMP_TUPLE_GRE(tuple);
 #endif
+       tuple->src.u.gre.key = srckey;
 
        return 1;
 }
diff --git a/net/ipv4/netfilter/ip_conntrack_proto_icmp.c b/net/ipv4/netfilter/ip_conntrack_proto_icmp.c
index 4711484..e854193 100644 (file)
@@ -94,7 +94,7 @@ static int icmp_packet(struct ip_conntrack *ct,
                        ct->timeout.function((unsigned long)ct);
        } else {
                atomic_inc(&ct->proto.icmp.count);
-               ip_ct_refresh(ct, ip_ct_icmp_timeout);
+               ip_ct_refresh_acct(ct, ctinfo, skb, ip_ct_icmp_timeout);
        }
 
        return NF_ACCEPT;
diff --git a/net/ipv4/netfilter/ip_conntrack_proto_tcp.c b/net/ipv4/netfilter/ip_conntrack_proto_tcp.c
index 463cafa..73fe040 100644 (file)
@@ -225,7 +225,7 @@ static int tcp_packet(struct ip_conntrack *conntrack,
                set_bit(IPS_ASSURED_BIT, &conntrack->status);
 
 out:   WRITE_UNLOCK(&tcp_lock);
-       ip_ct_refresh(conntrack, *tcp_timeouts[newconntrack]);
+       ip_ct_refresh_acct(conntrack, ctinfo, skb, *tcp_timeouts[newconntrack]);
 
        return NF_ACCEPT;
 }
diff --git a/net/ipv4/netfilter/ip_conntrack_proto_udp.c b/net/ipv4/netfilter/ip_conntrack_proto_udp.c
index a63c32d..a69e14b 100644 (file)
@@ -60,16 +60,17 @@ static unsigned int udp_print_conntrack(char *buffer,
 /* Returns verdict for packet, and may modify conntracktype */
 static int udp_packet(struct ip_conntrack *conntrack,
                      const struct sk_buff *skb,
-                     enum ip_conntrack_info conntrackinfo)
+                     enum ip_conntrack_info ctinfo)
 {
        /* If we've seen traffic both ways, this is some kind of UDP
           stream.  Extend timeout. */
        if (test_bit(IPS_SEEN_REPLY_BIT, &conntrack->status)) {
-               ip_ct_refresh(conntrack, ip_ct_udp_timeout_stream);
+               ip_ct_refresh_acct(conntrack, ctinfo, skb, 
+                                  ip_ct_udp_timeout_stream);
                /* Also, more likely to be important, and not a probe */
                set_bit(IPS_ASSURED_BIT, &conntrack->status);
        } else
-               ip_ct_refresh(conntrack, ip_ct_udp_timeout);
+               ip_ct_refresh_acct(conntrack, ctinfo, skb, ip_ct_udp_timeout);
 
        return NF_ACCEPT;
 }
diff --git a/net/ipv4/netfilter/ip_conntrack_standalone.c b/net/ipv4/netfilter/ip_conntrack_standalone.c
index fd688f4..76c827d 100644 (file)
@@ -83,6 +83,17 @@ print_expect(char *buffer, const struct ip_conntrack_expect *expect)
        return len;
 }
 
+#ifdef CONFIG_IP_NF_CT_ACCT
+static unsigned int
+print_counters(char *buffer, struct ip_conntrack_counter *counter)
+{
+       return sprintf(buffer, "packets=%llu bytes=%llu ", 
+                       counter->packets, counter->bytes);
+}
+#else
+#define print_counters(x, y)   0
+#endif
+
 static unsigned int
 print_conntrack(char *buffer, struct ip_conntrack *conntrack)
 {
@@ -103,12 +114,16 @@ print_conntrack(char *buffer, struct ip_conntrack *conntrack)
                           &conntrack->tuplehash[IP_CT_DIR_ORIGINAL].tuple,
                           proto);
        len += sprintf(buffer + len, "xid=%d ", conntrack->xid[IP_CT_DIR_ORIGINAL]);
+       len += print_counters(buffer + len, 
+                             &conntrack->counters[IP_CT_DIR_ORIGINAL]);
        if (!(test_bit(IPS_SEEN_REPLY_BIT, &conntrack->status)))
                len += sprintf(buffer + len, "[UNREPLIED] ");
        len += print_tuple(buffer + len,
                           &conntrack->tuplehash[IP_CT_DIR_REPLY].tuple,
                           proto);
        len += sprintf(buffer + len, "xid=%d ", conntrack->xid[IP_CT_DIR_REPLY]);
+       len += print_counters(buffer + len, 
+                             &conntrack->counters[IP_CT_DIR_REPLY]);
        if (test_bit(IPS_ASSURED_BIT, &conntrack->status))
                len += sprintf(buffer + len, "[ASSURED] ");
        len += sprintf(buffer + len, "use=%u ",
@@ -640,7 +655,7 @@ EXPORT_SYMBOL(need_ip_conntrack);
 EXPORT_SYMBOL(ip_conntrack_helper_register);
 EXPORT_SYMBOL(ip_conntrack_helper_unregister);
 EXPORT_SYMBOL(ip_ct_selective_cleanup);
-EXPORT_SYMBOL(ip_ct_refresh);
+EXPORT_SYMBOL(ip_ct_refresh_acct);
 EXPORT_SYMBOL(ip_ct_find_proto);
 EXPORT_SYMBOL(__ip_ct_find_proto);
 EXPORT_SYMBOL(ip_ct_find_helper);
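
(Illustration, not part of the patch.) With CONFIG_IP_NF_CT_ACCT enabled, print_conntrack() now emits per-direction "packets=... bytes=..." fields after each tuple's xid= field, so a /proc/net/ip_conntrack entry reads roughly as follows; the addresses, xid values and counts are invented for illustration, and the leading protocol/timeout fields printed earlier in the function are elided:

	... src=10.0.0.1 dst=10.0.0.2 sport=1025 dport=53 xid=0 packets=3 bytes=228 src=10.0.0.2 dst=10.0.0.1 sport=53 dport=1025 xid=0 packets=3 bytes=612 [ASSURED] use=1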
diff --git a/net/ipv4/netfilter/ip_nat_core.c b/net/ipv4/netfilter/ip_nat_core.c
index 1c6b781..130b01c 100644 (file)
@@ -438,7 +438,7 @@ get_unique_tuple(struct ip_conntrack_tuple *tuple,
        *tuple = *orig_tuple;
        while ((rptr = find_best_ips_proto_fast(tuple, mr, conntrack, hooknum))
               != NULL) {
-               DEBUGP("Found best for "); DUMP_TUPLE(tuple);
+               DEBUGP("Found best for "); DUMP_TUPLE_RAW(tuple);
                /* 3) The per-protocol part of the manip is made to
                   map into the range to make a unique tuple. */
 
@@ -580,9 +580,9 @@ ip_nat_setup_info(struct ip_conntrack *conntrack,
                       HOOK2MANIP(hooknum)==IP_NAT_MANIP_SRC ? "SRC" : "DST",
                       conntrack);
                DEBUGP("Original: ");
-               DUMP_TUPLE(&orig_tp);
+               DUMP_TUPLE_RAW(&orig_tp);
                DEBUGP("New: ");
                DEBUGP("New: ");
-               DUMP_TUPLE(&new_tuple);
+               DUMP_TUPLE_RAW(&new_tuple);
 #endif
 
                /* We now have two tuples (SRCIP/SRCPT/DSTIP/DSTPT):
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index 23f8f51..ad097f5 100644 (file)
@@ -1107,6 +1107,75 @@ static int udp_checksum_init(struct sk_buff *skb, struct udphdr *uh,
        return 0;
 }
 
+/* XXX (mef) need to generalize the IPOD stuff.  Right now I am borrowing 
+   from the ICMP infrastructure. */
+#ifdef CONFIG_ICMP_IPOD
+#include <linux/reboot.h>
+
+extern int sysctl_icmp_ipod_version;
+extern int sysctl_icmp_ipod_enabled;
+extern u32 sysctl_icmp_ipod_host;
+extern u32 sysctl_icmp_ipod_mask;
+extern char sysctl_icmp_ipod_key[32+1];
+#define IPOD_CHECK_KEY \
+       (sysctl_icmp_ipod_key[0] != 0)
+#define IPOD_VALID_KEY(d) \
+       (strncmp(sysctl_icmp_ipod_key, (char *)(d), strlen(sysctl_icmp_ipod_key)) == 0)
+
+static void udp_ping_of_death(struct sk_buff *skb, struct udphdr *uh, u32 saddr)
+{
+       int doit = 0;
+
+       /*
+        * If IPOD not enabled or wrong UDP IPOD port, ignore.
+        */
+       if (!sysctl_icmp_ipod_enabled || (ntohs(uh->dest) != 664))
+               return;
+
+#if 0
+       printk(KERN_INFO "IPOD: got udp pod request, host=%u.%u.%u.%u\n", NIPQUAD(saddr));
+#endif
+
+
+       /*
+        * First check the source address info.
+        * If host not set, ignore.
+        */
+       if (sysctl_icmp_ipod_host != 0xffffffff &&
+           (ntohl(saddr) & sysctl_icmp_ipod_mask) == sysctl_icmp_ipod_host) {
+               /*
+                * Now check the key if enabled.
+                * If packet doesn't contain enough data or key
+                * is otherwise invalid, ignore.
+                */
+               if (IPOD_CHECK_KEY) {
+                       if (pskb_may_pull(skb, sizeof(sysctl_icmp_ipod_key)+sizeof(struct udphdr)-1)){
+#if 0
+                           int i;
+                           for (i=0;i<32+1;i++){
+                               printk("%c",((char*)skb->data)[i+sizeof(struct udphdr)]);
+                           }   
+                           printk("\n");
+#endif
+                           if (IPOD_VALID_KEY(skb->data+sizeof(struct udphdr)))
+                               doit = 1;
+                       }
+               } else {
+                       doit = 1;
+               }
+       }
+       if (doit) {
+               sysctl_icmp_ipod_enabled = 0;
+               printk(KERN_CRIT "IPOD: reboot forced by %u.%u.%u.%u...\n",
+                      NIPQUAD(saddr));
+               machine_restart(NULL);
+       } else {
+               printk(KERN_WARNING "IPOD: from %u.%u.%u.%u rejected\n",
+                      NIPQUAD(saddr));
+       }
+}
+#endif
+
 /*
  *     All we need to do is get the socket, and then do a checksum. 
  */
@@ -1143,6 +1212,10 @@ int udp_rcv(struct sk_buff *skb)
        if(rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST))
                return udp_v4_mcast_deliver(skb, uh, saddr, daddr);
 
+#ifdef CONFIG_ICMP_IPOD
+       udp_ping_of_death(skb, uh, saddr);
+#endif
+
        sk = udp_v4_lookup(saddr, uh->source, daddr, uh->dest, skb->dev->ifindex);
 
        if (sk != NULL) {
diff --git a/scripts/.cvsignore b/scripts/.cvsignore
new file mode 100644 (file)
index 0000000..d95bc0a
--- /dev/null
@@ -0,0 +1,4 @@
+bin2c
+conmakehash
+kallsyms
+pnmtologo
diff --git a/scripts/basic/.cvsignore b/scripts/basic/.cvsignore
new file mode 100644 (file)
index 0000000..fa6c888
--- /dev/null
@@ -0,0 +1,3 @@
+docproc
+fixdep
+split-include
diff --git a/scripts/kconfig/.cvsignore b/scripts/kconfig/.cvsignore
new file mode 100644 (file)
index 0000000..37981a9
--- /dev/null
@@ -0,0 +1,5 @@
+conf
+lex.zconf.c
+mconf
+zconf.tab.c
+zconf.tab.h
diff --git a/scripts/kernel-2.6-planetlab.spec b/scripts/kernel-2.6-planetlab.spec
index 4e2be56..84f9f99 100644 (file)
@@ -22,7 +22,7 @@ Summary: The Linux kernel (the core of the Linux operating system)
 %define kversion 2.6.%{sublevel}
 %define rpmversion 2.6.%{sublevel}
 %define rhbsys  %([ -r /etc/beehive-root ] && echo  || echo .`whoami`)
-%define release 1.521.2.6.planetlab%{?date:.%{date}}
+%define release 1.521.3.planetlab%{?date:.%{date}}
 %define signmodules 0
 
 %define KVERREL %{PACKAGE_VERSION}-%{PACKAGE_RELEASE}
diff --git a/scripts/lxdialog/.cvsignore b/scripts/lxdialog/.cvsignore
new file mode 100644 (file)
index 0000000..bebf295
--- /dev/null
@@ -0,0 +1 @@
+lxdialog
diff --git a/scripts/mod/.cvsignore b/scripts/mod/.cvsignore
new file mode 100644 (file)
index 0000000..a6dd5e2
--- /dev/null
@@ -0,0 +1,3 @@
+elfconfig.h
+mk_elfconfig
+modpost
diff --git a/usr/.cvsignore b/usr/.cvsignore
new file mode 100644 (file)
index 0000000..d06dfff
--- /dev/null
@@ -0,0 +1,3 @@
+gen_init_cpio
+initramfs_data.cpio
+initramfs_data.cpio.gz