From: Planet-Lab Support
Date: Fri, 21 Jan 2005 03:34:32 +0000 (+0000)
Subject: This commit was manufactured by cvs2svn to create tag
X-Git-Tag: after-ckrm_E16-cpu-controller-v9rc1^0
X-Git-Url: http://git.onelab.eu/?a=commitdiff_plain;h=14f387ae37713b1527d145daf89a1ce1581b5ad0;hp=a91482bdcc2e0f6035702e46f1b99043a0893346;p=linux-2.6.git

This commit was manufactured by cvs2svn to create tag
'after-ckrm_E16-cpu-controller-v9rc1'.
---

diff --git a/.cvsignore b/.cvsignore
new file mode 100644
index 000000000..5e7d07457
--- /dev/null
+++ b/.cvsignore
@@ -0,0 +1,13 @@
+.config
+.tmp_System.map
+.tmp_kallsyms1.S
+.tmp_kallsyms2.S
+.tmp_kallsyms3.S
+.tmp_versions
+.tmp_vmlinux1
+.tmp_vmlinux2
+.tmp_vmlinux3
+.version
+Module.symvers
+System.map
+vmlinux
diff --git a/Documentation/ckrm/cpusched b/Documentation/ckrm/cpusched
new file mode 100644
index 000000000..01f7f232a
--- /dev/null
+++ b/Documentation/ckrm/cpusched
@@ -0,0 +1,86 @@
+CKRM CPU Scheduling
+===================
+
+Overview
+--------
+
+In CKRM, cpu scheduling is based on a two-level scheduling decision.
+Every time a new task is to be selected, the scheduler first determines
+which class to run next and then schedules the next task in the
+selected class.
+
+The scheduling within a class is performed using the default Linux
+O(1) scheduler.
+
+The class scheduler also follows the O(1) principle and works as
+follows:
+
+Each class maintains a local runqueue per cpu, or lrq for short. The
+existing O(1) scheduler is used to schedule within an lrq.
+
+Weights are assigned to each lrq that mirror the effective shares of
+that class. Every time a task executes, its weighted cycles are
+charged against its class. Classes thus progress in a time domain
+called cumulative virtual time (CVT). In essence, the class with the
+smallest CVT is selected next. Provisions are made to preserve
+interactivity and to avoid starvation of classes that have been
+sleeping for longer periods.
+
+Load balancing across an SMP system is performed by balancing the load
+of each class across CPUs such that the CPUs carry equal load and,
+across the whole system, each class maintains its share.
+
+Because CKRM uses a class hierarchy, cycles that are unused by a class
+are redistributed among its busy siblings.
+
+Enabling the CKRM CPU scheduler
+-------------------------------
+
+The scheduler is integrated into the Linux scheduler and therefore
+cannot be loaded dynamically like other CKRM schedulers.
+
+However, it can be selected at boot time or dynamically at run time.
+
+The boot options "ckrmcpu" or "nockrmcpu" enable or disable the CKRM
+cpu scheduler at boot time. Currently the scheduler is disabled by
+default.
+
+# cat /rcfs/taskclass/config
+
+"res=cpu,mode=enabled" indicates that the CKRM cpu scheduler is
+enabled.
+
+"res=cpu,mode=disabled" indicates that the CKRM cpu scheduler is
+disabled.
+
+The same strings can also be used to change the scheduling mode
+dynamically at runtime. For example, to activate the scheduler:
+
+# echo "res=cpu,mode=enabled" > /rcfs/taskclass/config
+
+# cat /rcfs/taskclass/*/stats
+
+The cpu portion of the stats output is shown as:
+
+ "cpu-usage(2,10,60)= 290 340 510"
+
+The three numbers represent the load over the last 2, 10 and 60
+seconds, with a base of 1000 (1000 = 100%).
+Hence the usage here is 29.0%, 34.0% and 51.0% respectively.
+
+For debugging purposes additional information can be printed out, but
+that format should not be relied upon.
+
+Use `echo "res=cpu,usage_detail=3"` for the highest level of detail on
+usage. Please consult the source code for the specifics.
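+
+For example, to request the highest level of usage detail and then read
+back the per-class statistics (a sketch only; the redirect target
+/rcfs/taskclass/config used here is an assumption, mirroring the mode
+examples above):
+
+# echo "res=cpu,usage_detail=3" > /rcfs/taskclass/config
+# cat /rcfs/taskclass/*/stats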
+
+Assigning shares
+----------------
+
+Share assignment follows the general approach described under
+ckrm_basics.
+
+# echo "res=cpu,guarantee=val" > shares
+
+sets the minimum guarantee of a class.
+
+
+
diff --git a/MAINTAINERS b/MAINTAINERS
index c8c25df43..523f115fb 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -1226,6 +1226,17 @@ W: http://nfs.sourceforge.net/
 W: http://www.cse.unsw.edu.au/~neilb/patches/linux-devel/
 S: Maintained
 
+KEXEC
+P: Eric Biederman
+P: Randy Dunlap
+M: ebiederm@xmission.com
+M: rddunlap@osdl.org
+W: http://www.xmission.com/~ebiederm/files/kexec/
+W: http://developer.osdl.org/rddunlap/kexec/
+L: linux-kernel@vger.kernel.org
+L: fastboot@osdl.org
+S: Maintained
+
 LANMEDIA WAN CARD DRIVER
 P: Andrew Stanley-Jones
 M: asj@lanmedia.com
diff --git a/Makefile b/Makefile
index 4d94580e0..c57684382 100644
--- a/Makefile
+++ b/Makefile
@@ -1,7 +1,7 @@
 VERSION = 2
 PATCHLEVEL = 6
 SUBLEVEL = 8
-EXTRAVERSION = -1.521.2.5.planetlab
+EXTRAVERSION = -1.521.3.planetlab
 NAME=Zonked Quokka
 
 # *DOCUMENTATION*
@@ -453,6 +453,10 @@ ifndef CONFIG_FRAME_POINTER
 CFLAGS += -fomit-frame-pointer
 endif
 
+ifdef CONFIG_X86_STACK_CHECK
+CFLAGS += -p
+endif
+
 ifdef CONFIG_DEBUG_INFO
 CFLAGS += -g
 endif
diff --git a/arch/i386/Kconfig b/arch/i386/Kconfig
index 15b003b50..3a3ba7fec 100644
--- a/arch/i386/Kconfig
+++ b/arch/i386/Kconfig
@@ -926,6 +926,74 @@ config REGPARM
 	generate incorrect output with certain kernel constructs when
 	-mregparm=3 is used.
 
+config IRQSTACKS
+	bool "Use separate IRQ stacks"
+	help
+	  If you say Y here, the kernel will use a separate IRQ stack on each
+	  cpu to handle interrupts.
+
+config STACK_SIZE_SHIFT
+	int "Kernel stack size (12 => 4KB, 13 => 8KB, 14 => 16KB)"
+	range 12 14
+	default 12 if IRQSTACKS
+	default 13
+	help
+	  Select the kernel stack size. 4KB stacks are best as they let
+	  the system scale further. Use 8KB stacks if you have an
+	  experimental kernel where a stack overflow with a 4KB stack
+	  might occur. Use 16KB stacks if you want to safely support
+	  Windows device drivers using either Linuxant or ndiswrapper.
+
+config STACK_WARN
+	int "Print stack trace when stack grows beyond specified bytes"
+	default 4096 if IRQSTACKS
+	default 4096
+	help
+	  The kernel will print a stack trace when the current stack exceeds
+	  the specified size.
+
+config X86_STACK_CHECK
+	bool "Check for stack overflows"
+	default n
+	help
+	  Say Y here to have the kernel attempt to detect when the per-task
+	  kernel stack overflows.
+
+	  Some older versions of gcc don't handle the -p option correctly.
+	  Kernprof is affected by the same problem, which is described here:
+	  http://oss.sgi.com/projects/kernprof/faq.html#Q9
+
+	  Basically, if you get oopses in __free_pages_ok during boot when
+	  you have this turned on, you need to fix gcc. The Redhat 2.96
+	  version and gcc-3.x seem to work.
+
+	  If you are not debugging a stack overflow problem, say N.
+
+config STACK_PANIC
+	int "Panic when the stack comes within specified bytes of the stack limit"
+	depends on X86_STACK_CHECK
+	default 512 if IRQSTACKS
+	default 512
+	help
+	  Panic if the stack grows to within the specified number of bytes
+	  of the stack limit.
+
+config KEXEC
+	bool "kexec system call (EXPERIMENTAL)"
+	depends on EXPERIMENTAL
+	help
+	  kexec is a system call that implements the ability to shut down your
+	  current kernel and to start another kernel. It is like a reboot,
+	  but it is independent of the system firmware. And like a reboot
+	  you can start any kernel with it, not just Linux.
+
+	  The name comes from the similarity to the exec system call.
+ + It is an ongoing process to be certain the hardware in a machine + is properly shutdown, so do not be surprised if this code does not + initially work for you. It may help to enable device hotplugging + support. As of this writing the exact hardware interface is + strongly in flux, so no good recommendation can be made. + endmenu diff --git a/arch/i386/boot/.cvsignore b/arch/i386/boot/.cvsignore new file mode 100644 index 000000000..2d8a3afa4 --- /dev/null +++ b/arch/i386/boot/.cvsignore @@ -0,0 +1,4 @@ +bootsect +bzImage +setup +vmlinux.bin diff --git a/arch/i386/boot/compressed/.cvsignore b/arch/i386/boot/compressed/.cvsignore new file mode 100644 index 000000000..96b1b0022 --- /dev/null +++ b/arch/i386/boot/compressed/.cvsignore @@ -0,0 +1,3 @@ +vmlinux +vmlinux.bin +vmlinux.bin.gz diff --git a/arch/i386/boot/compressed/misc.c b/arch/i386/boot/compressed/misc.c index fa6704523..874568330 100644 --- a/arch/i386/boot/compressed/misc.c +++ b/arch/i386/boot/compressed/misc.c @@ -380,3 +380,6 @@ asmlinkage int decompress_kernel(struct moveparams *mv, void *rmode) if (high_loaded) close_output_buffer_if_we_run_high(mv); return high_loaded; } + +/* We don't actually check for stack overflows this early. */ +__asm__(".globl mcount ; mcount: ret\n"); diff --git a/arch/i386/boot/tools/.cvsignore b/arch/i386/boot/tools/.cvsignore new file mode 100644 index 000000000..378eac25d --- /dev/null +++ b/arch/i386/boot/tools/.cvsignore @@ -0,0 +1 @@ +build diff --git a/arch/i386/defconfig b/arch/i386/defconfig index aed3bc298..ed2bbb54d 100644 --- a/arch/i386/defconfig +++ b/arch/i386/defconfig @@ -1221,7 +1221,7 @@ CONFIG_OPROFILE=y CONFIG_EARLY_PRINTK=y CONFIG_DEBUG_SPINLOCK_SLEEP=y # CONFIG_FRAME_POINTER is not set -CONFIG_4KSTACKS=y +# CONFIG_4KSTACKS is not set CONFIG_X86_FIND_SMP_CONFIG=y CONFIG_X86_MPPARSE=y diff --git a/arch/i386/kernel/.cvsignore b/arch/i386/kernel/.cvsignore new file mode 100644 index 000000000..21c28761b --- /dev/null +++ b/arch/i386/kernel/.cvsignore @@ -0,0 +1,2 @@ +asm-offsets.s +vmlinux.lds.s diff --git a/arch/i386/kernel/Makefile b/arch/i386/kernel/Makefile index a056d5068..ab1ef80d1 100644 --- a/arch/i386/kernel/Makefile +++ b/arch/i386/kernel/Makefile @@ -23,6 +23,7 @@ obj-$(CONFIG_X86_TRAMPOLINE) += trampoline.o obj-$(CONFIG_X86_MPPARSE) += mpparse.o obj-$(CONFIG_X86_LOCAL_APIC) += apic.o nmi.o obj-$(CONFIG_X86_IO_APIC) += io_apic.o +obj-$(CONFIG_KEXEC) += machine_kexec.o relocate_kernel.o obj-$(CONFIG_X86_NUMAQ) += numaq.o obj-$(CONFIG_X86_SUMMIT_NUMA) += summit.o obj-$(CONFIG_MODULES) += module.o diff --git a/arch/i386/kernel/apic.c b/arch/i386/kernel/apic.c index ecf2b632f..eb4d41628 100644 --- a/arch/i386/kernel/apic.c +++ b/arch/i386/kernel/apic.c @@ -193,6 +193,36 @@ void disconnect_bsp_APIC(void) outb(0x70, 0x22); outb(0x00, 0x23); } + else { + /* Go back to Virtual Wire compatibility mode */ + unsigned long value; + + /* For the spurious interrupt use vector F, and enable it */ + value = apic_read(APIC_SPIV); + value &= ~APIC_VECTOR_MASK; + value |= APIC_SPIV_APIC_ENABLED; + value |= 0xf; + apic_write_around(APIC_SPIV, value); + + /* For LVT0 make it edge triggered, active high, external and enabled */ + value = apic_read(APIC_LVT0); + value &= ~(APIC_MODE_MASK | APIC_SEND_PENDING | + APIC_INPUT_POLARITY | APIC_LVT_REMOTE_IRR | + APIC_LVT_LEVEL_TRIGGER | APIC_LVT_MASKED ); + value |= APIC_LVT_REMOTE_IRR | APIC_SEND_PENDING; + value = SET_APIC_DELIVERY_MODE(value, APIC_MODE_EXINT); + apic_write_around(APIC_LVT0, value); + + /* For LVT1 make it edge triggered, 
active high, nmi and enabled */ + value = apic_read(APIC_LVT1); + value &= ~( + APIC_MODE_MASK | APIC_SEND_PENDING | + APIC_INPUT_POLARITY | APIC_LVT_REMOTE_IRR | + APIC_LVT_LEVEL_TRIGGER | APIC_LVT_MASKED); + value |= APIC_LVT_REMOTE_IRR | APIC_SEND_PENDING; + value = SET_APIC_DELIVERY_MODE(value, APIC_MODE_NMI); + apic_write_around(APIC_LVT1, value); + } } void disable_local_APIC(void) diff --git a/arch/i386/kernel/asm-offsets.c b/arch/i386/kernel/asm-offsets.c index 43943f871..b03f579a6 100644 --- a/arch/i386/kernel/asm-offsets.c +++ b/arch/i386/kernel/asm-offsets.c @@ -7,11 +7,11 @@ #include #include #include +#include #include #include "sigframe.h" #include #include -#include #define DEFINE(sym, val) \ asm volatile("\n->" #sym " %0 " #val : : "i" (val)) diff --git a/arch/i386/kernel/entry.S b/arch/i386/kernel/entry.S index 3ac74183c..dfbade1b9 100644 --- a/arch/i386/kernel/entry.S +++ b/arch/i386/kernel/entry.S @@ -1029,8 +1029,55 @@ ENTRY(sys_call_table) .long sys_mq_timedreceive /* 280 */ .long sys_mq_notify .long sys_mq_getsetattr - .long sys_ni_syscall /* reserved for kexec */ + .long sys_kexec_load .long sys_ioprio_set .long sys_ioprio_get /* 285 */ syscall_table_size=(.-sys_call_table) + +#ifdef CONFIG_X86_STACK_CHECK +.data +.globl stack_overflowed +stack_overflowed: + .long 0 +.text + +ENTRY(mcount) +#warning stack check enabled + push %eax + movl $(THREAD_SIZE - 1),%eax + andl %esp,%eax + cmpl $STACK_WARN,%eax + jle 1f +2: + popl %eax + ret +1: + /* prevent infinite recursion from call to mcount from the + * stack_overflow function. Need to revisit this code for + * SMP based systems. + */ + lock; btsl $0,stack_overflowed + jc 2b + + /* prepare to jmp to stack_overflow directly, as if it were + * called directly by the caller of mcount. + */ + pushl %ebp + pushl %ebx + pushl %esi + pushl %edi + + call stack_overflow + /* Note that stack_overflow() will clear the stack_overflowed + * variable. + */ + + popl %edi + popl %esi + popl %ebx + popl %ebp + + popl %eax + ret +#endif diff --git a/arch/i386/kernel/i386_ksyms.c b/arch/i386/kernel/i386_ksyms.c index 5a50c536d..584982c3e 100644 --- a/arch/i386/kernel/i386_ksyms.c +++ b/arch/i386/kernel/i386_ksyms.c @@ -188,6 +188,12 @@ EXPORT_SYMBOL(atomic_dec_and_lock); EXPORT_SYMBOL(__PAGE_KERNEL); +#ifdef CONFIG_X86_STACK_CHECK +extern void mcount(void); +EXPORT_SYMBOL(mcount); +#endif + + #ifdef CONFIG_HIGHMEM EXPORT_SYMBOL(kmap); EXPORT_SYMBOL(kunmap); diff --git a/arch/i386/kernel/i8259.c b/arch/i386/kernel/i8259.c index 97653d20f..7141d27ec 100644 --- a/arch/i386/kernel/i8259.c +++ b/arch/i386/kernel/i8259.c @@ -244,9 +244,21 @@ static int i8259A_resume(struct sys_device *dev) return 0; } +static int i8259A_shutdown(struct sys_device *dev) +{ + /* Put the i8259A into a quiescent state that + * the kernel initialization code can get it + * out of. 
+ */ + outb(0xff, 0x21); /* mask all of 8259A-1 */ + outb(0xff, 0xA1); /* mask all of 8259A-1 */ + return 0; +} + static struct sysdev_class i8259_sysdev_class = { set_kset_name("i8259"), .resume = i8259A_resume, + .shutdown = i8259A_shutdown, }; static struct sys_device device_i8259A = { diff --git a/arch/i386/kernel/init_task.c b/arch/i386/kernel/init_task.c index 7422d73ee..30cfd4085 100644 --- a/arch/i386/kernel/init_task.c +++ b/arch/i386/kernel/init_task.c @@ -29,6 +29,13 @@ union thread_union init_thread_union __attribute__((__section__(".data.init_task"))) = { INIT_THREAD_INFO(init_task, init_thread_union) }; +#ifdef CONFIG_X86_STACK_CHECK +union thread_union stack_overflow_stack + __attribute__((__section__(".data.init_task"))) = + { INIT_THREAD_INFO(init_task, stack_overflow_stack) }; +#endif + + /* * Initial task structure. * diff --git a/arch/i386/kernel/io_apic.c b/arch/i386/kernel/io_apic.c index 39af35d19..f600e6799 100644 --- a/arch/i386/kernel/io_apic.c +++ b/arch/i386/kernel/io_apic.c @@ -1604,11 +1604,42 @@ static void __init enable_IO_APIC(void) */ void disable_IO_APIC(void) { + int pin; /* * Clear the IO-APIC before rebooting: */ clear_IO_APIC(); + /* + * If the i82559 is routed through an IOAPIC + * Put that IOAPIC in virtual wire mode + * so legacy interrups can be delivered. + */ + pin = find_isa_irq_pin(0, mp_ExtINT); + if (pin != -1) { + struct IO_APIC_route_entry entry; + unsigned long flags; + + memset(&entry, 0, sizeof(entry)); + entry.mask = 0; /* Enabled */ + entry.trigger = 0; /* Edge */ + entry.irr = 0; + entry.polarity = 0; /* High */ + entry.delivery_status = 0; + entry.dest_mode = 0; /* Physical */ + entry.delivery_mode = 7; /* ExtInt */ + entry.vector = 0; + entry.dest.physical.physical_dest = 0; + + + /* + * Add it to the IO-APIC irq-routing table: + */ + spin_lock_irqsave(&ioapic_lock, flags); + io_apic_write(0, 0x11+2*pin, *(((int *)&entry)+1)); + io_apic_write(0, 0x10+2*pin, *(((int *)&entry)+0)); + spin_unlock_irqrestore(&ioapic_lock, flags); + } disconnect_bsp_APIC(); } diff --git a/arch/i386/kernel/irq.c b/arch/i386/kernel/irq.c index 22f7fc771..1c8bedaeb 100644 --- a/arch/i386/kernel/irq.c +++ b/arch/i386/kernel/irq.c @@ -76,8 +76,10 @@ static void register_irq_proc (unsigned int irq); /* * per-CPU IRQ handling stacks */ +#ifdef CONFIG_IRQSTACKS union irq_ctx *hardirq_ctx[NR_CPUS]; union irq_ctx *softirq_ctx[NR_CPUS]; +#endif /* * Special irq handlers. @@ -220,6 +222,9 @@ asmlinkage int handle_IRQ_event(unsigned int irq, int status = 1; /* Force the "do bottom halves" bit */ int retval = 0; + if (!(action->flags & SA_INTERRUPT)) + local_irq_enable(); + do { status |= action->flags; retval |= action->handler(irq, action->dev_id, regs); @@ -489,10 +494,12 @@ asmlinkage unsigned int do_IRQ(struct pt_regs regs) u32 *isp; union irq_ctx * curctx; union irq_ctx * irqctx; - +#ifdef CONFIG_IRQSTACKS curctx = (union irq_ctx *) current_thread_info(); irqctx = hardirq_ctx[smp_processor_id()]; - +#else + curctx = irqctx = (union irq_ctx *)0; +#endif spin_unlock(&desc->lock); /* @@ -536,7 +543,6 @@ asmlinkage unsigned int do_IRQ(struct pt_regs regs) break; desc->status &= ~IRQ_PENDING; } - desc->status &= ~IRQ_INPROGRESS; out: @@ -1095,6 +1101,7 @@ void init_irq_proc (void) } +#ifdef CONFIG_IRQSTACKS /* * These should really be __section__(".bss.page_aligned") as well, but * gcc's 3.0 and earlier don't handle that correctly. 
@@ -1174,3 +1181,4 @@ asmlinkage void do_softirq(void) } EXPORT_SYMBOL(do_softirq); +#endif diff --git a/arch/i386/kernel/machine_kexec.c b/arch/i386/kernel/machine_kexec.c new file mode 100644 index 000000000..3a9e878f8 --- /dev/null +++ b/arch/i386/kernel/machine_kexec.c @@ -0,0 +1,208 @@ +/* + * machine_kexec.c - handle transition of Linux booting another kernel + * Copyright (C) 2002-2004 Eric Biederman + * + * This source code is licensed under the GNU General Public License, + * Version 2. See the file COPYING for more details. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +static inline unsigned long read_cr3(void) +{ + unsigned long cr3; + asm volatile("movl %%cr3,%0": "=r"(cr3)); + return cr3; +} + +#define PAGE_ALIGNED __attribute__ ((__aligned__(PAGE_SIZE))) + +#define L0_ATTR (_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | _PAGE_DIRTY) +#define L1_ATTR (_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | _PAGE_DIRTY) +#define L2_ATTR (_PAGE_PRESENT) + +#define LEVEL0_SIZE (1UL << 12UL) + +#ifndef CONFIG_X86_PAE +#define LEVEL1_SIZE (1UL << 22UL) +static u32 pgtable_level1[1024] PAGE_ALIGNED; + +static void identity_map_page(unsigned long address) +{ + unsigned long level1_index, level2_index; + u32 *pgtable_level2; + + /* Find the current page table */ + pgtable_level2 = __va(read_cr3()); + + /* Find the indexes of the physical address to identity map */ + level1_index = (address % LEVEL1_SIZE)/LEVEL0_SIZE; + level2_index = address / LEVEL1_SIZE; + + /* Identity map the page table entry */ + pgtable_level1[level1_index] = address | L0_ATTR; + pgtable_level2[level2_index] = __pa(pgtable_level1) | L1_ATTR; + + /* Flush the tlb so the new mapping takes effect. + * Global tlb entries are not flushed but that is not an issue. + */ + load_cr3(pgtable_level2); +} + +#else +#define LEVEL1_SIZE (1UL << 21UL) +#define LEVEL2_SIZE (1UL << 30UL) +static u64 pgtable_level1[512] PAGE_ALIGNED; +static u64 pgtable_level2[512] PAGE_ALIGNED; + +static void identity_map_page(unsigned long address) +{ + unsigned long level1_index, level2_index, level3_index; + u64 *pgtable_level3; + + /* Find the current page table */ + pgtable_level3 = __va(read_cr3()); + + /* Find the indexes of the physical address to identity map */ + level1_index = (address % LEVEL1_SIZE)/LEVEL0_SIZE; + level2_index = (address % LEVEL2_SIZE)/LEVEL1_SIZE; + level3_index = address / LEVEL2_SIZE; + + /* Identity map the page table entry */ + pgtable_level1[level1_index] = address | L0_ATTR; + pgtable_level2[level2_index] = __pa(pgtable_level1) | L1_ATTR; + set_64bit(&pgtable_level3[level3_index], __pa(pgtable_level2) | L2_ATTR); + + /* Flush the tlb so the new mapping takes effect. + * Global tlb entries are not flushed but that is not an issue. 
+ */ + load_cr3(pgtable_level3); +} +#endif + + +static void set_idt(void *newidt, __u16 limit) +{ + unsigned char curidt[6]; + + /* ia32 supports unaliged loads & stores */ + (*(__u16 *)(curidt)) = limit; + (*(__u32 *)(curidt +2)) = (unsigned long)(newidt); + + __asm__ __volatile__ ( + "lidt %0\n" + : "=m" (curidt) + ); +}; + + +static void set_gdt(void *newgdt, __u16 limit) +{ + unsigned char curgdt[6]; + + /* ia32 supports unaligned loads & stores */ + (*(__u16 *)(curgdt)) = limit; + (*(__u32 *)(curgdt +2)) = (unsigned long)(newgdt); + + __asm__ __volatile__ ( + "lgdt %0\n" + : "=m" (curgdt) + ); +}; + +static void load_segments(void) +{ +#define __STR(X) #X +#define STR(X) __STR(X) + + __asm__ __volatile__ ( + "\tljmp $"STR(__KERNEL_CS)",$1f\n" + "\t1:\n" + "\tmovl $"STR(__KERNEL_DS)",%eax\n" + "\tmovl %eax,%ds\n" + "\tmovl %eax,%es\n" + "\tmovl %eax,%fs\n" + "\tmovl %eax,%gs\n" + "\tmovl %eax,%ss\n" + ); +#undef STR +#undef __STR +} + +typedef asmlinkage void (*relocate_new_kernel_t)( + unsigned long indirection_page, unsigned long reboot_code_buffer, + unsigned long start_address, unsigned int has_pae); + +const extern unsigned char relocate_new_kernel[]; +extern void relocate_new_kernel_end(void); +const extern unsigned int relocate_new_kernel_size; + +/* + * Do what every setup is needed on image and the + * reboot code buffer to allow us to avoid allocations + * later. Currently nothing. + */ +int machine_kexec_prepare(struct kimage *image) +{ + return 0; +} + +void machine_kexec_cleanup(struct kimage *image) +{ +} + +/* + * Do not allocate memory (or fail in any way) in machine_kexec(). + * We are past the point of no return, committed to rebooting now. + */ +void machine_kexec(struct kimage *image) +{ + unsigned long indirection_page; + unsigned long reboot_code_buffer; + relocate_new_kernel_t rnk; + + /* Interrupts aren't acceptable while we reboot */ + local_irq_disable(); + + /* Compute some offsets */ + reboot_code_buffer = page_to_pfn(image->control_code_page) << PAGE_SHIFT; + indirection_page = image->head & PAGE_MASK; + + /* Set up an identity mapping for the reboot_code_buffer */ + identity_map_page(reboot_code_buffer); + + /* copy it out */ + memcpy((void *)reboot_code_buffer, relocate_new_kernel, relocate_new_kernel_size); + + /* The segment registers are funny things, they are + * automatically loaded from a table, in memory wherever you + * set them to a specific selector, but this table is never + * accessed again you set the segment to a different selector. + * + * The more common model is are caches where the behide + * the scenes work is done, but is also dropped at arbitrary + * times. + * + * I take advantage of this here by force loading the + * segments, before I zap the gdt with an invalid value. + */ + load_segments(); + /* The gdt & idt are now invalid. + * If you want to load them you must set up your own idt & gdt. 
+ */ + set_gdt(phys_to_virt(0),0); + set_idt(phys_to_virt(0),0); + + /* now call it */ + rnk = (relocate_new_kernel_t) reboot_code_buffer; + (*rnk)(indirection_page, reboot_code_buffer, image->start, cpu_has_pae); +} diff --git a/arch/i386/kernel/process.c b/arch/i386/kernel/process.c index 3093d1fc6..e8a01f2b5 100644 --- a/arch/i386/kernel/process.c +++ b/arch/i386/kernel/process.c @@ -219,6 +219,32 @@ static int __init idle_setup (char *str) __setup("idle=", idle_setup); +void stack_overflow(void) +{ + extern unsigned long stack_overflowed; + unsigned long esp = current_stack_pointer(); + int panicing = ((esp&(THREAD_SIZE-1)) <= STACK_PANIC); + + oops_in_progress = 1; + printk( "esp: 0x%lx masked: 0x%lx STACK_PANIC:0x%lx %d %d\n", + esp, (esp&(THREAD_SIZE-1)), STACK_PANIC, + (((esp&(THREAD_SIZE-1)) <= STACK_PANIC)), panicing); + show_trace(current,(void*)esp); + + if (panicing) + panic("stack overflow\n"); + + oops_in_progress = 0; + + /* Just let it happen once per task, as otherwise it goes nuts + * in printing stack traces. This means that I need to dump + * the stack_overflowed boolean into the task or thread_info + * structure. For now just turn it off all together. + */ + + /* stack_overflowed = 0; */ +} + void show_regs(struct pt_regs * regs) { unsigned long cr0 = 0L, cr2 = 0L, cr3 = 0L, cr4 = 0L; diff --git a/arch/i386/kernel/reboot.c b/arch/i386/kernel/reboot.c index e8d5cd3ab..85e89f94b 100644 --- a/arch/i386/kernel/reboot.c +++ b/arch/i386/kernel/reboot.c @@ -23,7 +23,6 @@ static int reboot_mode; int reboot_thru_bios; #ifdef CONFIG_SMP -int reboot_smp = 0; static int reboot_cpu = -1; /* shamelessly grabbed from lib/vsprintf.c for readability */ #define is_digit(c) ((c) >= '0' && (c) <= '9') @@ -85,33 +84,9 @@ static int __init set_bios_reboot(struct dmi_system_id *d) return 0; } -/* - * Some machines require the "reboot=s" commandline option, this quirk makes that automatic. - */ -static int __init set_smp_reboot(struct dmi_system_id *d) -{ -#ifdef CONFIG_SMP - if (!reboot_smp) { - reboot_smp = 1; - printk(KERN_INFO "%s series board detected. Selecting SMP-method for reboots.\n", d->ident); - } -#endif - return 0; -} - -/* - * Some machines require the "reboot=b,s" commandline option, this quirk makes that automatic. - */ -static int __init set_smp_bios_reboot(struct dmi_system_id *d) -{ - set_smp_reboot(d); - set_bios_reboot(d); - return 0; -} - static struct dmi_system_id __initdata reboot_dmi_table[] = { { /* Handle problems with rebooting on Dell 1300's */ - .callback = set_smp_bios_reboot, + .callback = set_bios_reboot, .ident = "Dell PowerEdge 1300", .matches = { DMI_MATCH(DMI_SYS_VENDOR, "Dell Computer Corporation"), @@ -294,41 +269,32 @@ void machine_real_restart(unsigned char *code, int length) : "i" ((void *) (0x1000 - sizeof (real_mode_switch) - 100))); } -void machine_restart(char * __unused) +void machine_shutdown(void) { #ifdef CONFIG_SMP - int cpuid; - - cpuid = GET_APIC_ID(apic_read(APIC_ID)); - - if (reboot_smp) { - - /* check to see if reboot_cpu is valid - if its not, default to the BSP */ - if ((reboot_cpu == -1) || - (reboot_cpu > (NR_CPUS -1)) || - !physid_isset(cpuid, phys_cpu_present_map)) - reboot_cpu = boot_cpu_physical_apicid; - - reboot_smp = 0; /* use this as a flag to only go through this once*/ - /* re-run this function on the other CPUs - it will fall though this section since we have - cleared reboot_smp, and do the reboot if it is the - correct CPU, otherwise it halts. 
*/ - if (reboot_cpu != cpuid) - smp_call_function((void *)machine_restart , NULL, 1, 0); + int reboot_cpu_id; + + /* The boot cpu is always logical cpu 0 */ + reboot_cpu_id = 0; + + /* See if there has been given a command line override */ + if ((reboot_cpu_id != -1) && (reboot_cpu < NR_CPUS) && + cpu_isset(reboot_cpu, cpu_online_map)) { + reboot_cpu_id = reboot_cpu; } - /* if reboot_cpu is still -1, then we want a tradional reboot, - and if we are not running on the reboot_cpu,, halt */ - if ((reboot_cpu != -1) && (cpuid != reboot_cpu)) { - for (;;) - __asm__ __volatile__ ("hlt"); + /* Make certain the cpu I'm rebooting on is online */ + if (!cpu_isset(reboot_cpu_id, cpu_online_map)) { + reboot_cpu_id = smp_processor_id(); } - /* - * Stop all CPUs and turn off local APICs and the IO-APIC, so - * other OSs see a clean IRQ state. + + /* Make certain I only run on the appropriate processor */ + set_cpus_allowed(current, cpumask_of_cpu(reboot_cpu_id)); + + /* O.K. Now that I'm on the appropriate processor, stop + * all of the others, and disable their local APICs. */ + if (!netdump_mode) smp_send_stop(); #elif defined(CONFIG_X86_LOCAL_APIC) @@ -341,6 +307,11 @@ void machine_restart(char * __unused) #ifdef CONFIG_X86_IO_APIC disable_IO_APIC(); #endif +} + +void machine_restart(char * __unused) +{ + machine_shutdown(); if (!reboot_thru_bios) { if (efi_enabled) { diff --git a/arch/i386/kernel/relocate_kernel.S b/arch/i386/kernel/relocate_kernel.S new file mode 100644 index 000000000..54be4c2ae --- /dev/null +++ b/arch/i386/kernel/relocate_kernel.S @@ -0,0 +1,118 @@ +/* + * relocate_kernel.S - put the kernel image in place to boot + * Copyright (C) 2002-2004 Eric Biederman + * + * This source code is licensed under the GNU General Public License, + * Version 2. See the file COPYING for more details. + */ + +#include + + /* + * Must be relocatable PIC code callable as a C function, that once + * it starts can not use the previous processes stack. + */ + .globl relocate_new_kernel +relocate_new_kernel: + /* read the arguments and say goodbye to the stack */ + movl 4(%esp), %ebx /* indirection_page */ + movl 8(%esp), %ebp /* reboot_code_buffer */ + movl 12(%esp), %edx /* start address */ + movl 16(%esp), %ecx /* cpu_has_pae */ + + /* zero out flags, and disable interrupts */ + pushl $0 + popfl + + /* set a new stack at the bottom of our page... */ + lea 4096(%ebp), %esp + + /* store the parameters back on the stack */ + pushl %edx /* store the start address */ + + /* Set cr0 to a known state: + * 31 0 == Paging disabled + * 18 0 == Alignment check disabled + * 16 0 == Write protect disabled + * 3 0 == No task switch + * 2 0 == Don't do FP software emulation. + * 0 1 == Proctected mode enabled + */ + movl %cr0, %eax + andl $~((1<<31)|(1<<18)|(1<<16)|(1<<3)|(1<<2)), %eax + orl $(1<<0), %eax + movl %eax, %cr0 + + /* clear cr4 if applicable */ + testl %ecx, %ecx + jz 1f + /* Set cr4 to a known state: + * Setting everything to zero seems safe. + */ + movl %cr4, %eax + andl $0, %eax + movl %eax, %cr4 + + jmp 1f +1: + + /* Flush the TLB (needed?) 
*/ + xorl %eax, %eax + movl %eax, %cr3 + + /* Do the copies */ + cld +0: /* top, read another word for the indirection page */ + movl %ebx, %ecx + movl (%ebx), %ecx + addl $4, %ebx + testl $0x1, %ecx /* is it a destination page */ + jz 1f + movl %ecx, %edi + andl $0xfffff000, %edi + jmp 0b +1: + testl $0x2, %ecx /* is it an indirection page */ + jz 1f + movl %ecx, %ebx + andl $0xfffff000, %ebx + jmp 0b +1: + testl $0x4, %ecx /* is it the done indicator */ + jz 1f + jmp 2f +1: + testl $0x8, %ecx /* is it the source indicator */ + jz 0b /* Ignore it otherwise */ + movl %ecx, %esi /* For every source page do a copy */ + andl $0xfffff000, %esi + + movl $1024, %ecx + rep ; movsl + jmp 0b + +2: + + /* To be certain of avoiding problems with self-modifying code + * I need to execute a serializing instruction here. + * So I flush the TLB, it's handy, and not processor dependent. + */ + xorl %eax, %eax + movl %eax, %cr3 + + /* set all of the registers to known values */ + /* leave %esp alone */ + + xorl %eax, %eax + xorl %ebx, %ebx + xorl %ecx, %ecx + xorl %edx, %edx + xorl %esi, %esi + xorl %edi, %edi + xorl %ebp, %ebp + ret +relocate_new_kernel_end: + + .globl relocate_new_kernel_size +relocate_new_kernel_size: + .long relocate_new_kernel_end - relocate_new_kernel diff --git a/configs/kernel-2.6.8-i686-planetlab.config b/configs/kernel-2.6.8-i686-planetlab.config index ea66387e5..8cc762f56 100644 --- a/configs/kernel-2.6.8-i686-planetlab.config +++ b/configs/kernel-2.6.8-i686-planetlab.config @@ -30,8 +30,9 @@ CONFIG_RCFS_FS=y CONFIG_CKRM_TYPE_TASKCLASS=y CONFIG_CKRM_RES_NUMTASKS=y CONFIG_CKRM_CPU_SCHEDULE=y -CONFIG_CKRM_RES_BLKIO=y +# CONFIG_CKRM_RES_BLKIO is not set # CONFIG_CKRM_RES_MEM is not set +CONFIG_CKRM_CPU_SCHEDULE_AT_BOOT=y # CONFIG_CKRM_TYPE_SOCKETCLASS is not set CONFIG_CKRM_RBCE=y CONFIG_SYSCTL=y @@ -140,6 +141,12 @@ CONFIG_HIGHPTE=y # CONFIG_MATH_EMULATION is not set CONFIG_MTRR=y CONFIG_REGPARM=y +CONFIG_IRQSTACKS=y +CONFIG_STACK_SIZE_SHIFT=13 +CONFIG_STACK_WARN=4000 +CONFIG_X86_STACK_CHECK=y +CONFIG_STACK_PANIC=512 +CONFIG_KEXEC=y # # Power management options (ACPI, APM) @@ -211,7 +218,7 @@ CONFIG_PREVENT_FIRMWARE_BUILD=y # # Block devices # -# CONFIG_BLK_DEV_FD is not set +CONFIG_BLK_DEV_FD=m # CONFIG_BLK_DEV_XD is not set CONFIG_BLK_CPQ_DA=m CONFIG_BLK_CPQ_CISS_DA=m diff --git a/drivers/block/cfq-iosched-orig.c b/drivers/block/cfq-iosched-orig.c deleted file mode 100644 index 977d32ddd..000000000 --- a/drivers/block/cfq-iosched-orig.c +++ /dev/null @@ -1,706 +0,0 @@ -/* - * linux/drivers/block/cfq-iosched.c - * - * CFQ, or complete fairness queueing, disk scheduler. - * - * Based on ideas from a previously unfinished io - * scheduler (round robin per-process disk scheduling) and Andrea Arcangeli. 
- * - * Copyright (C) 2003 Jens Axboe - */ -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -/* - * tunables - */ -static int cfq_quantum = 4; -static int cfq_queued = 8; - -#define CFQ_QHASH_SHIFT 6 -#define CFQ_QHASH_ENTRIES (1 << CFQ_QHASH_SHIFT) -#define list_entry_qhash(entry) list_entry((entry), struct cfq_queue, cfq_hash) - -#define CFQ_MHASH_SHIFT 8 -#define CFQ_MHASH_BLOCK(sec) ((sec) >> 3) -#define CFQ_MHASH_ENTRIES (1 << CFQ_MHASH_SHIFT) -#define CFQ_MHASH_FN(sec) (hash_long(CFQ_MHASH_BLOCK((sec)),CFQ_MHASH_SHIFT)) -#define ON_MHASH(crq) !list_empty(&(crq)->hash) -#define rq_hash_key(rq) ((rq)->sector + (rq)->nr_sectors) -#define list_entry_hash(ptr) list_entry((ptr), struct cfq_rq, hash) - -#define list_entry_cfqq(ptr) list_entry((ptr), struct cfq_queue, cfq_list) - -#define RQ_DATA(rq) ((struct cfq_rq *) (rq)->elevator_private) - -static kmem_cache_t *crq_pool; -static kmem_cache_t *cfq_pool; -static mempool_t *cfq_mpool; - -struct cfq_data { - struct list_head rr_list; - struct list_head *dispatch; - struct list_head *cfq_hash; - - struct list_head *crq_hash; - - unsigned int busy_queues; - unsigned int max_queued; - - mempool_t *crq_pool; -}; - -struct cfq_queue { - struct list_head cfq_hash; - struct list_head cfq_list; - struct rb_root sort_list; - int pid; - int queued[2]; -#if 0 - /* - * with a simple addition like this, we can do io priorities. almost. - * does need a split request free list, too. - */ - int io_prio -#endif -}; - -struct cfq_rq { - struct rb_node rb_node; - sector_t rb_key; - - struct request *request; - - struct cfq_queue *cfq_queue; - - struct list_head hash; -}; - -static void cfq_put_queue(struct cfq_data *cfqd, struct cfq_queue *cfqq); -static struct cfq_queue *cfq_find_cfq_hash(struct cfq_data *cfqd, int pid); -static void cfq_dispatch_sort(struct list_head *head, struct cfq_rq *crq); - -/* - * lots of deadline iosched dupes, can be abstracted later... 
- */ -static inline void __cfq_del_crq_hash(struct cfq_rq *crq) -{ - list_del_init(&crq->hash); -} - -static inline void cfq_del_crq_hash(struct cfq_rq *crq) -{ - if (ON_MHASH(crq)) - __cfq_del_crq_hash(crq); -} - -static void cfq_remove_merge_hints(request_queue_t *q, struct cfq_rq *crq) -{ - cfq_del_crq_hash(crq); - - if (q->last_merge == crq->request) - q->last_merge = NULL; -} - -static inline void cfq_add_crq_hash(struct cfq_data *cfqd, struct cfq_rq *crq) -{ - struct request *rq = crq->request; - - BUG_ON(ON_MHASH(crq)); - - list_add(&crq->hash, &cfqd->crq_hash[CFQ_MHASH_FN(rq_hash_key(rq))]); -} - -static struct request *cfq_find_rq_hash(struct cfq_data *cfqd, sector_t offset) -{ - struct list_head *hash_list = &cfqd->crq_hash[CFQ_MHASH_FN(offset)]; - struct list_head *entry, *next = hash_list->next; - - while ((entry = next) != hash_list) { - struct cfq_rq *crq = list_entry_hash(entry); - struct request *__rq = crq->request; - - next = entry->next; - - BUG_ON(!ON_MHASH(crq)); - - if (!rq_mergeable(__rq)) { - __cfq_del_crq_hash(crq); - continue; - } - - if (rq_hash_key(__rq) == offset) - return __rq; - } - - return NULL; -} - -/* - * rb tree support functions - */ -#define RB_NONE (2) -#define RB_EMPTY(node) ((node)->rb_node == NULL) -#define RB_CLEAR(node) ((node)->rb_color = RB_NONE) -#define RB_CLEAR_ROOT(root) ((root)->rb_node = NULL) -#define ON_RB(node) ((node)->rb_color != RB_NONE) -#define rb_entry_crq(node) rb_entry((node), struct cfq_rq, rb_node) -#define rq_rb_key(rq) (rq)->sector - -static inline void cfq_del_crq_rb(struct cfq_queue *cfqq, struct cfq_rq *crq) -{ - if (ON_RB(&crq->rb_node)) { - cfqq->queued[rq_data_dir(crq->request)]--; - rb_erase(&crq->rb_node, &cfqq->sort_list); - crq->cfq_queue = NULL; - } -} - -static struct cfq_rq * -__cfq_add_crq_rb(struct cfq_queue *cfqq, struct cfq_rq *crq) -{ - struct rb_node **p = &cfqq->sort_list.rb_node; - struct rb_node *parent = NULL; - struct cfq_rq *__crq; - - while (*p) { - parent = *p; - __crq = rb_entry_crq(parent); - - if (crq->rb_key < __crq->rb_key) - p = &(*p)->rb_left; - else if (crq->rb_key > __crq->rb_key) - p = &(*p)->rb_right; - else - return __crq; - } - - rb_link_node(&crq->rb_node, parent, p); - return 0; -} - -static void -cfq_add_crq_rb(struct cfq_data *cfqd, struct cfq_queue *cfqq,struct cfq_rq *crq) -{ - struct request *rq = crq->request; - struct cfq_rq *__alias; - - crq->rb_key = rq_rb_key(rq); - cfqq->queued[rq_data_dir(rq)]++; -retry: - __alias = __cfq_add_crq_rb(cfqq, crq); - if (!__alias) { - rb_insert_color(&crq->rb_node, &cfqq->sort_list); - crq->cfq_queue = cfqq; - return; - } - - cfq_del_crq_rb(cfqq, __alias); - cfq_dispatch_sort(cfqd->dispatch, __alias); - goto retry; -} - -static struct request * -cfq_find_rq_rb(struct cfq_data *cfqd, sector_t sector) -{ - struct cfq_queue *cfqq = cfq_find_cfq_hash(cfqd, current->tgid); - struct rb_node *n; - - if (!cfqq) - goto out; - - n = cfqq->sort_list.rb_node; - while (n) { - struct cfq_rq *crq = rb_entry_crq(n); - - if (sector < crq->rb_key) - n = n->rb_left; - else if (sector > crq->rb_key) - n = n->rb_right; - else - return crq->request; - } - -out: - return NULL; -} - -static void cfq_remove_request(request_queue_t *q, struct request *rq) -{ - struct cfq_data *cfqd = q->elevator.elevator_data; - struct cfq_rq *crq = RQ_DATA(rq); - - if (crq) { - struct cfq_queue *cfqq = crq->cfq_queue; - - cfq_remove_merge_hints(q, crq); - list_del_init(&rq->queuelist); - - if (cfqq) { - cfq_del_crq_rb(cfqq, crq); - - if (RB_EMPTY(&cfqq->sort_list)) - 
cfq_put_queue(cfqd, cfqq); - } - } -} - -static int -cfq_merge(request_queue_t *q, struct request **req, struct bio *bio) -{ - struct cfq_data *cfqd = q->elevator.elevator_data; - struct request *__rq; - int ret; - - ret = elv_try_last_merge(q, bio); - if (ret != ELEVATOR_NO_MERGE) { - __rq = q->last_merge; - goto out_insert; - } - - __rq = cfq_find_rq_hash(cfqd, bio->bi_sector); - if (__rq) { - BUG_ON(__rq->sector + __rq->nr_sectors != bio->bi_sector); - - if (elv_rq_merge_ok(__rq, bio)) { - ret = ELEVATOR_BACK_MERGE; - goto out; - } - } - - __rq = cfq_find_rq_rb(cfqd, bio->bi_sector + bio_sectors(bio)); - if (__rq) { - if (elv_rq_merge_ok(__rq, bio)) { - ret = ELEVATOR_FRONT_MERGE; - goto out; - } - } - - return ELEVATOR_NO_MERGE; -out: - q->last_merge = __rq; -out_insert: - *req = __rq; - return ret; -} - -static void cfq_merged_request(request_queue_t *q, struct request *req) -{ - struct cfq_data *cfqd = q->elevator.elevator_data; - struct cfq_rq *crq = RQ_DATA(req); - - cfq_del_crq_hash(crq); - cfq_add_crq_hash(cfqd, crq); - - if (ON_RB(&crq->rb_node) && (rq_rb_key(req) != crq->rb_key)) { - struct cfq_queue *cfqq = crq->cfq_queue; - - cfq_del_crq_rb(cfqq, crq); - cfq_add_crq_rb(cfqd, cfqq, crq); - } - - q->last_merge = req; -} - -static void -cfq_merged_requests(request_queue_t *q, struct request *req, - struct request *next) -{ - cfq_merged_request(q, req); - cfq_remove_request(q, next); -} - -static void cfq_dispatch_sort(struct list_head *head, struct cfq_rq *crq) -{ - struct list_head *entry = head; - struct request *__rq; - - if (!list_empty(head)) { - __rq = list_entry_rq(head->next); - - if (crq->request->sector < __rq->sector) { - entry = head->prev; - goto link; - } - } - - while ((entry = entry->prev) != head) { - __rq = list_entry_rq(entry); - - if (crq->request->sector <= __rq->sector) - break; - } - -link: - list_add_tail(&crq->request->queuelist, entry); -} - -static inline void -__cfq_dispatch_requests(request_queue_t *q, struct cfq_data *cfqd, - struct cfq_queue *cfqq) -{ - struct cfq_rq *crq = rb_entry_crq(rb_first(&cfqq->sort_list)); - - cfq_del_crq_rb(cfqq, crq); - cfq_remove_merge_hints(q, crq); - cfq_dispatch_sort(cfqd->dispatch, crq); -} - -static int cfq_dispatch_requests(request_queue_t *q, struct cfq_data *cfqd) -{ - struct cfq_queue *cfqq; - struct list_head *entry, *tmp; - int ret, queued, good_queues; - - if (list_empty(&cfqd->rr_list)) - return 0; - - queued = ret = 0; -restart: - good_queues = 0; - list_for_each_safe(entry, tmp, &cfqd->rr_list) { - cfqq = list_entry_cfqq(cfqd->rr_list.next); - - BUG_ON(RB_EMPTY(&cfqq->sort_list)); - - __cfq_dispatch_requests(q, cfqd, cfqq); - - if (RB_EMPTY(&cfqq->sort_list)) - cfq_put_queue(cfqd, cfqq); - else - good_queues++; - - queued++; - ret = 1; - } - - if ((queued < cfq_quantum) && good_queues) - goto restart; - - return ret; -} - -static struct request *cfq_next_request(request_queue_t *q) -{ - struct cfq_data *cfqd = q->elevator.elevator_data; - struct request *rq; - - if (!list_empty(cfqd->dispatch)) { - struct cfq_rq *crq; -dispatch: - rq = list_entry_rq(cfqd->dispatch->next); - - crq = RQ_DATA(rq); - if (crq) - cfq_remove_merge_hints(q, crq); - - return rq; - } - - if (cfq_dispatch_requests(q, cfqd)) - goto dispatch; - - return NULL; -} - -static inline struct cfq_queue * -__cfq_find_cfq_hash(struct cfq_data *cfqd, int pid, const int hashval) -{ - struct list_head *hash_list = &cfqd->cfq_hash[hashval]; - struct list_head *entry; - - list_for_each(entry, hash_list) { - struct cfq_queue *__cfqq = 
list_entry_qhash(entry); - - if (__cfqq->pid == pid) - return __cfqq; - } - - return NULL; -} - -static struct cfq_queue *cfq_find_cfq_hash(struct cfq_data *cfqd, int pid) -{ - const int hashval = hash_long(current->tgid, CFQ_QHASH_SHIFT); - - return __cfq_find_cfq_hash(cfqd, pid, hashval); -} - -static void cfq_put_queue(struct cfq_data *cfqd, struct cfq_queue *cfqq) -{ - cfqd->busy_queues--; - list_del(&cfqq->cfq_list); - list_del(&cfqq->cfq_hash); - mempool_free(cfqq, cfq_mpool); -} - -static struct cfq_queue *cfq_get_queue(struct cfq_data *cfqd, int pid) -{ - const int hashval = hash_long(current->tgid, CFQ_QHASH_SHIFT); - struct cfq_queue *cfqq = __cfq_find_cfq_hash(cfqd, pid, hashval); - - if (!cfqq) { - cfqq = mempool_alloc(cfq_mpool, GFP_NOIO); - - INIT_LIST_HEAD(&cfqq->cfq_hash); - INIT_LIST_HEAD(&cfqq->cfq_list); - RB_CLEAR_ROOT(&cfqq->sort_list); - - cfqq->pid = pid; - cfqq->queued[0] = cfqq->queued[1] = 0; - list_add(&cfqq->cfq_hash, &cfqd->cfq_hash[hashval]); - } - - return cfqq; -} - -static void cfq_enqueue(struct cfq_data *cfqd, struct cfq_rq *crq) -{ - struct cfq_queue *cfqq; - - cfqq = cfq_get_queue(cfqd, current->tgid); - - cfq_add_crq_rb(cfqd, cfqq, crq); - - if (list_empty(&cfqq->cfq_list)) { - list_add(&cfqq->cfq_list, &cfqd->rr_list); - cfqd->busy_queues++; - } -} - -static void -cfq_insert_request(request_queue_t *q, struct request *rq, int where) -{ - struct cfq_data *cfqd = q->elevator.elevator_data; - struct cfq_rq *crq = RQ_DATA(rq); - - switch (where) { - case ELEVATOR_INSERT_BACK: - while (cfq_dispatch_requests(q, cfqd)) - ; - list_add_tail(&rq->queuelist, cfqd->dispatch); - break; - case ELEVATOR_INSERT_FRONT: - list_add(&rq->queuelist, cfqd->dispatch); - break; - case ELEVATOR_INSERT_SORT: - BUG_ON(!blk_fs_request(rq)); - cfq_enqueue(cfqd, crq); - break; - default: - printk("%s: bad insert point %d\n", __FUNCTION__,where); - return; - } - - if (rq_mergeable(rq)) { - cfq_add_crq_hash(cfqd, crq); - - if (!q->last_merge) - q->last_merge = rq; - } -} - -static int cfq_queue_empty(request_queue_t *q) -{ - struct cfq_data *cfqd = q->elevator.elevator_data; - - if (list_empty(cfqd->dispatch) && list_empty(&cfqd->rr_list)) - return 1; - - return 0; -} - -static struct request * -cfq_former_request(request_queue_t *q, struct request *rq) -{ - struct cfq_rq *crq = RQ_DATA(rq); - struct rb_node *rbprev = rb_prev(&crq->rb_node); - - if (rbprev) - return rb_entry_crq(rbprev)->request; - - return NULL; -} - -static struct request * -cfq_latter_request(request_queue_t *q, struct request *rq) -{ - struct cfq_rq *crq = RQ_DATA(rq); - struct rb_node *rbnext = rb_next(&crq->rb_node); - - if (rbnext) - return rb_entry_crq(rbnext)->request; - - return NULL; -} - -static int cfq_may_queue(request_queue_t *q, int rw) -{ - struct cfq_data *cfqd = q->elevator.elevator_data; - struct cfq_queue *cfqq; - int ret = 1; - - if (!cfqd->busy_queues) - goto out; - - cfqq = cfq_find_cfq_hash(cfqd, current->tgid); - if (cfqq) { - int limit = (q->nr_requests - cfq_queued) / cfqd->busy_queues; - - if (limit < 3) - limit = 3; - else if (limit > cfqd->max_queued) - limit = cfqd->max_queued; - - if (cfqq->queued[rw] > limit) - ret = 0; - } -out: - return ret; -} - -static void cfq_put_request(request_queue_t *q, struct request *rq) -{ - struct cfq_data *cfqd = q->elevator.elevator_data; - struct cfq_rq *crq = RQ_DATA(rq); - - if (crq) { - BUG_ON(q->last_merge == rq); - BUG_ON(ON_MHASH(crq)); - - mempool_free(crq, cfqd->crq_pool); - rq->elevator_private = NULL; - } -} - -static int 
cfq_set_request(request_queue_t *q, struct request *rq, int gfp_mask) -{ - struct cfq_data *cfqd = q->elevator.elevator_data; - struct cfq_rq *crq = mempool_alloc(cfqd->crq_pool, gfp_mask); - - if (crq) { - RB_CLEAR(&crq->rb_node); - crq->request = rq; - crq->cfq_queue = NULL; - INIT_LIST_HEAD(&crq->hash); - rq->elevator_private = crq; - return 0; - } - - return 1; -} - -static void cfq_exit(request_queue_t *q, elevator_t *e) -{ - struct cfq_data *cfqd = e->elevator_data; - - e->elevator_data = NULL; - mempool_destroy(cfqd->crq_pool); - kfree(cfqd->crq_hash); - kfree(cfqd->cfq_hash); - kfree(cfqd); -} - -static int cfq_init(request_queue_t *q, elevator_t *e) -{ - struct cfq_data *cfqd; - int i; - - cfqd = kmalloc(sizeof(*cfqd), GFP_KERNEL); - if (!cfqd) - return -ENOMEM; - - memset(cfqd, 0, sizeof(*cfqd)); - INIT_LIST_HEAD(&cfqd->rr_list); - - cfqd->crq_hash = kmalloc(sizeof(struct list_head) * CFQ_MHASH_ENTRIES, GFP_KERNEL); - if (!cfqd->crq_hash) - goto out_crqhash; - - cfqd->cfq_hash = kmalloc(sizeof(struct list_head) * CFQ_QHASH_ENTRIES, GFP_KERNEL); - if (!cfqd->cfq_hash) - goto out_cfqhash; - - cfqd->crq_pool = mempool_create(BLKDEV_MIN_RQ, mempool_alloc_slab, mempool_free_slab, crq_pool); - if (!cfqd->crq_pool) - goto out_crqpool; - - for (i = 0; i < CFQ_MHASH_ENTRIES; i++) - INIT_LIST_HEAD(&cfqd->crq_hash[i]); - for (i = 0; i < CFQ_QHASH_ENTRIES; i++) - INIT_LIST_HEAD(&cfqd->cfq_hash[i]); - - cfqd->dispatch = &q->queue_head; - e->elevator_data = cfqd; - - /* - * just set it to some high value, we want anyone to be able to queue - * some requests. fairness is handled differently - */ - cfqd->max_queued = q->nr_requests; - q->nr_requests = 8192; - - return 0; -out_crqpool: - kfree(cfqd->cfq_hash); -out_cfqhash: - kfree(cfqd->crq_hash); -out_crqhash: - kfree(cfqd); - return -ENOMEM; -} - -static int __init cfq_slab_setup(void) -{ - crq_pool = kmem_cache_create("crq_pool", sizeof(struct cfq_rq), 0, 0, - NULL, NULL); - - if (!crq_pool) - panic("cfq_iosched: can't init crq pool\n"); - - cfq_pool = kmem_cache_create("cfq_pool", sizeof(struct cfq_queue), 0, 0, - NULL, NULL); - - if (!cfq_pool) - panic("cfq_iosched: can't init cfq pool\n"); - - cfq_mpool = mempool_create(64, mempool_alloc_slab, mempool_free_slab, cfq_pool); - - if (!cfq_mpool) - panic("cfq_iosched: can't init cfq mpool\n"); - - return 0; -} - -subsys_initcall(cfq_slab_setup); - -elevator_t iosched_cfq = { - .elevator_name = "cfq", - .elevator_merge_fn = cfq_merge, - .elevator_merged_fn = cfq_merged_request, - .elevator_merge_req_fn = cfq_merged_requests, - .elevator_next_req_fn = cfq_next_request, - .elevator_add_req_fn = cfq_insert_request, - .elevator_remove_req_fn = cfq_remove_request, - .elevator_queue_empty_fn = cfq_queue_empty, - .elevator_former_req_fn = cfq_former_request, - .elevator_latter_req_fn = cfq_latter_request, - .elevator_set_req_fn = cfq_set_request, - .elevator_put_req_fn = cfq_put_request, - .elevator_may_queue_fn = cfq_may_queue, - .elevator_init_fn = cfq_init, - .elevator_exit_fn = cfq_exit, -}; - -EXPORT_SYMBOL(iosched_cfq); diff --git a/drivers/block/cfq-iosched.c b/drivers/block/cfq-iosched.c index 7b45a805d..70d66c5c9 100644 --- a/drivers/block/cfq-iosched.c +++ b/drivers/block/cfq-iosched.c @@ -39,8 +39,6 @@ #error Cannot support this many io priority levels #endif -#define LIMIT_DEBUG 1 - /* * tunables */ @@ -52,6 +50,10 @@ static int cfq_queued = 4; static int cfq_grace_rt = HZ / 100 ?: 1; static int cfq_grace_idle = HZ / 10; +#define CFQ_EPOCH 1000000000 +#define CFQ_SECTORATE 1000 +#define 
CFQ_HMAX_PCT 80 + #define CFQ_QHASH_SHIFT 6 #define CFQ_QHASH_ENTRIES (1 << CFQ_QHASH_SHIFT) #define list_entry_qhash(entry) hlist_entry((entry), struct cfq_queue, cfq_hash) @@ -69,13 +71,6 @@ static int cfq_grace_idle = HZ / 10; #define cfq_account_io(crq) \ ((crq)->ioprio != IOPRIO_IDLE && (crq)->ioprio != IOPRIO_RT) -/* define to be 50 ms for now; make tunable later */ -#define CFQ_EPOCH 50000 -/* Needs to be made tunable right away, in MiB/s */ -#define CFQ_DISKBW 10 -/* Temporary global limit, as percent of available b/w, for each "class" */ -#define CFQ_TEMPLIM 10 - /* * defines how we distribute bandwidth (can be tgid, uid, etc) */ @@ -87,18 +82,22 @@ static int cfq_grace_idle = HZ / 10; */ #if defined(CONFIG_CKRM_RES_BLKIO) || defined(CONFIG_CKRM_RES_BLKIO_MODULE) -extern inline void *cki_hash_key(struct task_struct *tsk); -extern inline int cki_ioprio(struct task_struct *tsk); -#define cfq_hash_key(current) ((int)cki_hash_key((current))) -#define cfq_ioprio(current) (cki_ioprio((current))) +extern void *cki_hash_key(struct task_struct *tsk); +extern int cki_ioprio(struct task_struct *tsk); +extern void *cki_cfqpriv(struct task_struct *tsk); + +#define cfq_hash_key(tsk) ((int)cki_hash_key((tsk))) +#define cfq_ioprio(tsk) (cki_ioprio((tsk))) +#define cfq_cfqpriv(cfqd,tsk) (cki_cfqpriv((tsk))) #else -#define cfq_hash_key(current) ((current)->tgid) +#define cfq_hash_key(tsk) ((tsk)->tgid) +#define cfq_cfqpriv(cfqd,tsk) (&(((cfqd)->cid[(tsk)->ioprio]).cfqpriv)) /* * move to io_context */ -#define cfq_ioprio(current) ((current)->ioprio) +#define cfq_ioprio(tsk) ((tsk)->ioprio) #endif #define CFQ_WAIT_RT 0 @@ -125,16 +124,12 @@ struct io_prio_data { atomic_t cum_sectors_in,cum_sectors_out; atomic_t cum_queues_in,cum_queues_out; -#ifdef LIMIT_DEBUG - int nskip; - unsigned long navsec; - unsigned long csectorate; - unsigned long lsectorate; -#endif + cfqlim_t cfqpriv; /* data for enforcing limits */ struct list_head prio_list; int last_rq; int last_sectors; + }; /* @@ -179,8 +174,9 @@ struct cfq_data { unsigned int cfq_grace_rt; unsigned int cfq_grace_idle; - unsigned long cfq_epoch; /* duration for limit enforcement */ - unsigned long cfq_epochsectors; /* max sectors dispatchable/epoch */ + unsigned int cfq_epoch; + unsigned int cfq_hmax_pct; + unsigned int cfq_qsectorate; }; /* @@ -194,14 +190,34 @@ struct cfq_queue { int queued[2]; int ioprio; + /* limit related settings/stats obtained + either from io_prio_data or ckrm I/O class + */ + struct cfqlim *cfqpriv; + + u64 epstart; /* current epoch's starting timestamp (ns) */ + u64 epsector[2]; /* Total sectors dispatched in [0] previous + * and [1] current epoch + */ + unsigned long avsec; /* avg sectors dispatched/epoch */ - unsigned long long lastime; /* timestamp of last request served */ - unsigned long sectorate; /* limit for sectors served/epoch */ +// unsigned long long lastime; /* timestamp of last request served */ +// unsigned long sectorate; /* limit for sectors served/epoch */ int skipped; /* queue skipped at last dispatch ? 
*/ + + /* Per queue timer to suspend/resume queue from processing */ + struct timer_list timer; + unsigned long wait_end; + unsigned long flags; + struct work_struct work; + + struct cfq_data *cfqd; }; + + /* - * per-request structure + * Per-request structure */ struct cfq_rq { struct cfq_queue *cfq_queue; @@ -516,69 +532,101 @@ link: list_add_tail(&crq->request->queuelist, entry); } -/* - * remove from io scheduler core and put on dispatch list for service - */ +struct cfq_queue *dcfqq; +u64 dtmp; + + + +/* Over how many ns is sectorate defined */ +#define NS4SCALE (100000000) + static inline int -__cfq_dispatch_requests(request_queue_t *q, struct cfq_data *cfqd, - struct cfq_queue *cfqq) +__cfq_check_limit(struct cfq_data *cfqd,struct cfq_queue *cfqq, int dontskip) { struct cfq_rq *crq; - unsigned long long ts, gap; - unsigned long newavsec; + unsigned long long ts, gap, epoch, tmp; + unsigned long newavsec, sectorate; crq = rb_entry_crq(rb_first(&cfqq->sort_list)); -#if 1 - /* Determine if queue should be skipped for being overshare */ ts = sched_clock(); - gap = ts - cfqq->lastime; -#ifdef LIMIT_DEBUG - cfqq->sectorate = (cfqd->cfq_epochsectors - * CFQ_TEMPLIM)/100; - -#endif - if ((gap >= cfqd->cfq_epoch) || (gap < 0)) { - cfqq->avsec = crq->nr_sectors ; - cfqq->lastime = ts; + gap = ts - cfqq->epstart; + epoch = cfqd->cfq_epoch; + + sectorate = atomic_read(&cfqq->cfqpriv->sectorate); +// sectorate = atomic_read(&(cfqd->cid[crq->ioprio].sectorate)); + + dcfqq = cfqq; + + if ((gap >= epoch) || (gap < 0)) { + + if (gap >= (epoch << 1)) { + cfqq->epsector[0] = 0; + cfqq->epstart = ts ; + } else { + cfqq->epsector[0] = cfqq->epsector[1]; + cfqq->epstart += epoch; + } + cfqq->epsector[1] = 0; + gap = ts - cfqq->epstart; + + tmp = (cfqq->epsector[0] + crq->nr_sectors) * NS4SCALE; + do_div(tmp,epoch+gap); + + cfqq->avsec = (unsigned long)tmp; + cfqq->skipped = 0; + cfqq->epsector[1] += crq->nr_sectors; + + cfqq->cfqpriv->navsec = cfqq->avsec; + cfqq->cfqpriv->sec[0] = cfqq->epsector[0]; + cfqq->cfqpriv->sec[1] = cfqq->epsector[1]; + cfqq->cfqpriv->timedout++; + /* + cfqd->cid[crq->ioprio].navsec = cfqq->avsec; + cfqd->cid[crq->ioprio].sec[0] = cfqq->epsector[0]; + cfqd->cid[crq->ioprio].sec[1] = cfqq->epsector[1]; + cfqd->cid[crq->ioprio].timedout++; + */ + return 0; } else { - u64 tmp; - /* Age old average and accumalate request to be served */ - -// tmp = (u64) (cfqq->avsec * gap) ; -// do_div(tmp, cfqd->cfq_epoch); - newavsec = (unsigned long)(cfqq->avsec >> 1) + crq->nr_sectors; -// if (crq->ioprio >= 0 && crq->ioprio <= 20) -// cfqd->cid[crq->ioprio].lsectorate = newavsec; -// atomic_set(&(cfqd->cid[crq->ioprio].lsectorate), -// newavsec); - - if ((newavsec < cfqq->sectorate) || cfqq->skipped) { + + tmp = (cfqq->epsector[0] + cfqq->epsector[1] + crq->nr_sectors) + * NS4SCALE; + do_div(tmp,epoch+gap); + + newavsec = (unsigned long)tmp; + if ((newavsec < sectorate) || dontskip) { cfqq->avsec = newavsec ; - cfqq->lastime = ts; cfqq->skipped = 0; + cfqq->epsector[1] += crq->nr_sectors; + cfqq->cfqpriv->navsec = cfqq->avsec; + cfqq->cfqpriv->sec[1] = cfqq->epsector[1]; + /* + cfqd->cid[crq->ioprio].navsec = cfqq->avsec; + cfqd->cid[crq->ioprio].sec[1] = cfqq->epsector[1]; + */ } else { - /* queue over share ; skip once */ cfqq->skipped = 1; -#ifdef LIMIT_DEBUG -// atomic_inc(&(cfqd->cid[crq->ioprio].nskip)); -// if (crq->ioprio >= 0 && crq->ioprio <= 20) -// cfqd->cid[crq->ioprio].nskip++; -#endif - return 0; + /* pause q's processing till avsec drops to + cfq_hmax_pct % of its value */ + 
tmp = (epoch+gap) * (100-cfqd->cfq_hmax_pct); + do_div(tmp,1000000*cfqd->cfq_hmax_pct); + cfqq->wait_end = jiffies+msecs_to_jiffies(tmp); } - } -#endif + } +} -#ifdef LIMIT_DEBUG -// if (crq->ioprio >= 0 && crq->ioprio <= 20) { -// cfqd->cid[crq->ioprio].navsec = cfqq->avsec; -// cfqd->cid[crq->ioprio].csectorate = cfqq->sectorate; -// } +/* + * remove from io scheduler core and put on dispatch list for service + */ +static inline int +__cfq_dispatch_requests(request_queue_t *q, struct cfq_data *cfqd, + struct cfq_queue *cfqq) +{ + struct cfq_rq *crq; + + crq = rb_entry_crq(rb_first(&cfqq->sort_list)); -// atomic_set(&(cfqd->cid[crq->ioprio].navsec),cfqq->avsec); -// atomic_set(&(cfqd->cid[crq->ioprio].csectorate),cfqq->sectorate); -#endif cfq_dispatch_sort(cfqd, cfqq, crq); /* @@ -593,44 +641,83 @@ cfq_dispatch_requests(request_queue_t *q, int prio, int max_rq, int max_sectors) { struct cfq_data *cfqd = q->elevator.elevator_data; struct list_head *plist = &cfqd->cid[prio].rr_list; + struct cfq_queue *cfqq; struct list_head *entry, *nxt; int q_rq, q_io; - int ret ; + int first_round,busy_queues,busy_unlimited; + /* * for each queue at this prio level, dispatch a request */ q_rq = q_io = 0; + first_round=1; + restart: + busy_unlimited = 0; + busy_queues = 0; list_for_each_safe(entry, nxt, plist) { - struct cfq_queue *cfqq = list_entry_cfqq(entry); + cfqq = list_entry_cfqq(entry); BUG_ON(RB_EMPTY(&cfqq->sort_list)); + busy_queues++; - ret = __cfq_dispatch_requests(q, cfqd, cfqq); - if (ret <= 0) { - continue; /* skip queue */ - /* can optimize more by moving q to end of plist ? */ + + if (first_round || busy_unlimited) + __cfq_check_limit(cfqd,cfqq,0); + else + __cfq_check_limit(cfqd,cfqq,1); + + if (cfqq->skipped) { + cfqq->cfqpriv->nskip++; + /* cfqd->cid[prio].nskip++; */ + busy_queues--; + if (time_before(jiffies, cfqq->wait_end)) { + list_del(&cfqq->cfq_list); + mod_timer(&cfqq->timer,cfqq->wait_end); + } + continue; } - q_io += ret ; - q_rq++ ; + busy_unlimited++; + + q_io += __cfq_dispatch_requests(q, cfqd, cfqq); + q_rq++; - if (RB_EMPTY(&cfqq->sort_list)) + if (RB_EMPTY(&cfqq->sort_list)) { + busy_unlimited--; + busy_queues--; cfq_put_queue(cfqd, cfqq); - /* - * if we hit the queue limit, put the string of serviced - * queues at the back of the pending list - */ + } + if (q_io >= max_sectors || q_rq >= max_rq) { +#if 0 struct list_head *prv = nxt->prev; if (prv != plist) { list_del(plist); list_add(plist, prv); } +#endif break; } } + if ((q_io < max_sectors) && (q_rq < max_rq) && + (busy_queues || first_round)) + { + first_round = 0; + goto restart; + } else { + /* + * if we hit the queue limit, put the string of serviced + * queues at the back of the pending list + */ + struct list_head *prv = nxt->prev; + if (prv != plist) { + list_del(plist); + list_add(plist, prv); + } + } + cfqd->cid[prio].last_rq = q_rq; cfqd->cid[prio].last_sectors = q_io; return q_rq; @@ -806,6 +893,29 @@ static void cfq_put_queue(struct cfq_data *cfqd, struct cfq_queue *cfqq) mempool_free(cfqq, cfq_mpool); } +static void cfq_pauseq_timer(unsigned long data) +{ + struct cfq_queue *cfqq = (struct cfq_queue *) data; + kblockd_schedule_work(&cfqq->work); +} + +static void cfq_pauseq_work(void *data) +{ + struct cfq_queue *cfqq = (struct cfq_queue *) data; + struct cfq_data *cfqd = cfqq->cfqd; + request_queue_t *q = cfqd->queue; + unsigned long flags; + + spin_lock_irqsave(q->queue_lock, flags); + list_add_tail(&cfqq->cfq_list,&cfqd->cid[cfqq->ioprio].rr_list); + cfqq->skipped = 0; + if (cfq_next_request(q)) + 
q->request_fn(q); + spin_unlock_irqrestore(q->queue_lock, flags); + + //del_timer(&cfqq->timer); +} + static struct cfq_queue *__cfq_get_queue(struct cfq_data *cfqd, int hashkey, int gfp_mask) { @@ -833,9 +943,22 @@ retry: INIT_LIST_HEAD(&cfqq->cfq_list); cfqq->hash_key = cfq_hash_key(current); cfqq->ioprio = cfq_ioprio(current); - cfqq->avsec = 0 ; - cfqq->lastime = sched_clock(); - cfqq->sectorate = (cfqd->cfq_epochsectors * CFQ_TEMPLIM)/100; + + cfqq->cfqpriv = cfq_cfqpriv(cfqd,current); + if (!cfqq->cfqpriv) + cfqq->cfqpriv = &((cfqd->cid[cfqq->ioprio]).cfqpriv); + + cfqq->epstart = sched_clock(); + /* epsector, avsec, skipped initialized to zero by memset */ + + init_timer(&cfqq->timer); + cfqq->timer.function = cfq_pauseq_timer; + cfqq->timer.data = (unsigned long) cfqq; + + INIT_WORK(&cfqq->work, cfq_pauseq_work, cfqq); + + cfqq->cfqd = cfqd ; + hlist_add_head(&cfqq->cfq_hash, &cfqd->cfq_hash[hashval]); } @@ -1132,6 +1255,8 @@ static void cfq_exit(request_queue_t *q, elevator_t *e) kfree(cfqd); } + + static void cfq_timer(unsigned long data) { struct cfq_data *cfqd = (struct cfq_data *) data; @@ -1182,12 +1307,12 @@ static int cfq_init(request_queue_t *q, elevator_t *e) atomic_set(&cid->cum_sectors_out,0); atomic_set(&cid->cum_queues_in,0); atomic_set(&cid->cum_queues_out,0); -#if 0 - atomic_set(&cid->nskip,0); - atomic_set(&cid->navsec,0); - atomic_set(&cid->csectorate,0); - atomic_set(&cid->lsectorate,0); -#endif + + + atomic_set(&((cid->cfqpriv).sectorate),CFQ_SECTORATE); + (cid->cfqpriv).nskip = 0; + (cid->cfqpriv).navsec = 0; + (cid->cfqpriv).timedout = 0; } cfqd->crq_hash = kmalloc(sizeof(struct hlist_head) * CFQ_MHASH_ENTRIES, @@ -1217,6 +1342,9 @@ static int cfq_init(request_queue_t *q, elevator_t *e) cfqd->cfq_idle_quantum_io = cfq_idle_quantum_io; cfqd->cfq_grace_rt = cfq_grace_rt; cfqd->cfq_grace_idle = cfq_grace_idle; + + cfqd->cfq_epoch = CFQ_EPOCH; + cfqd->cfq_hmax_pct = CFQ_HMAX_PCT; q->nr_requests <<= 2; @@ -1224,14 +1352,6 @@ static int cfq_init(request_queue_t *q, elevator_t *e) e->elevator_data = cfqd; cfqd->queue = q; - cfqd->cfq_epoch = CFQ_EPOCH; - if (q->hardsect_size) - cfqd->cfq_epochsectors = ((CFQ_DISKBW * 1000000)/ - q->hardsect_size)* (1000000 / CFQ_EPOCH); - else - cfqd->cfq_epochsectors = ((CFQ_DISKBW * 1000000)/512) - * (1000000 / CFQ_EPOCH) ; - return 0; out_crqpool: kfree(cfqd->cfq_hash); @@ -1302,6 +1422,8 @@ SHOW_FUNCTION(cfq_idle_quantum_io_show, cfqd->cfq_idle_quantum_io); SHOW_FUNCTION(cfq_queued_show, cfqd->cfq_queued); SHOW_FUNCTION(cfq_grace_rt_show, cfqd->cfq_grace_rt); SHOW_FUNCTION(cfq_grace_idle_show, cfqd->cfq_grace_idle); +SHOW_FUNCTION(cfq_epoch_show, cfqd->cfq_epoch); +SHOW_FUNCTION(cfq_hmax_pct_show, cfqd->cfq_hmax_pct); #undef SHOW_FUNCTION #define STORE_FUNCTION(__FUNC, __PTR, MIN, MAX) \ @@ -1321,63 +1443,38 @@ STORE_FUNCTION(cfq_idle_quantum_io_store, &cfqd->cfq_idle_quantum_io, 4, INT_MAX STORE_FUNCTION(cfq_queued_store, &cfqd->cfq_queued, 1, INT_MAX); STORE_FUNCTION(cfq_grace_rt_store, &cfqd->cfq_grace_rt, 0, INT_MAX); STORE_FUNCTION(cfq_grace_idle_store, &cfqd->cfq_grace_idle, 0, INT_MAX); +STORE_FUNCTION(cfq_epoch_store, &cfqd->cfq_epoch, 0, INT_MAX); +STORE_FUNCTION(cfq_hmax_pct_store, &cfqd->cfq_hmax_pct, 1, 100); #undef STORE_FUNCTION -static ssize_t cfq_epoch_show(struct cfq_data *cfqd, char *page) -{ - return sprintf(page, "%lu\n", cfqd->cfq_epoch); -} - -static ssize_t cfq_epoch_store(struct cfq_data *cfqd, const char *page, size_t count) -{ - char *p = (char *) page; - cfqd->cfq_epoch = simple_strtoul(p, &p, 10); - 
return count; -} - -static ssize_t cfq_epochsectors_show(struct cfq_data *cfqd, char *page) -{ - return sprintf(page, "%lu\n", cfqd->cfq_epochsectors); -} - -static ssize_t -cfq_epochsectors_store(struct cfq_data *cfqd, const char *page, size_t count) -{ - char *p = (char *) page; - cfqd->cfq_epochsectors = simple_strtoul(p, &p, 10); - return count; -} - /* Additional entries to get priority level data */ static ssize_t cfq_prio_show(struct cfq_data *cfqd, char *page, unsigned int priolvl) { - int r1,r2,s1,s2,q1,q2; + //int r1,r2,s1,s2,q1,q2; if (!(priolvl >= IOPRIO_IDLE && priolvl <= IOPRIO_RT)) return 0; + /* r1 = (int)atomic_read(&(cfqd->cid[priolvl].cum_rq_in)); r2 = (int)atomic_read(&(cfqd->cid[priolvl].cum_rq_out)); s1 = (int)atomic_read(&(cfqd->cid[priolvl].cum_sectors_in)); s2 = (int)atomic_read(&(cfqd->cid[priolvl].cum_sectors_out)); q1 = (int)atomic_read(&(cfqd->cid[priolvl].cum_queues_in)); q2 = (int)atomic_read(&(cfqd->cid[priolvl].cum_queues_out)); - - return sprintf(page,"skip %d avsec %lu rate %lu new %lu" - "rq (%d,%d) sec (%d,%d) q (%d,%d)\n", - cfqd->cid[priolvl].nskip, - cfqd->cid[priolvl].navsec, - cfqd->cid[priolvl].csectorate, - cfqd->cid[priolvl].lsectorate, -// atomic_read(&cfqd->cid[priolvl].nskip), -// atomic_read(&cfqd->cid[priolvl].navsec), -// atomic_read(&cfqd->cid[priolvl].csectorate), -// atomic_read(&cfqd->cid[priolvl].lsectorate), - r1,r2, - s1,s2, - q1,q2); + */ + + return sprintf(page,"skip %d timdout %d avsec %lu rate %ld " + " sec0 %lu sec1 %lu\n", + cfqd->cid[priolvl].cfqpriv.nskip, + cfqd->cid[priolvl].cfqpriv.timedout, + cfqd->cid[priolvl].cfqpriv.navsec, + atomic_read(&(cfqd->cid[priolvl].cfqpriv.sectorate)), + (unsigned long)cfqd->cid[priolvl].cfqpriv.sec[0], + (unsigned long)cfqd->cid[priolvl].cfqpriv.sec[1]); + } #define SHOW_PRIO_DATA(__PRIOLVL) \ @@ -1411,12 +1508,25 @@ SHOW_PRIO_DATA(20); static ssize_t cfq_prio_store(struct cfq_data *cfqd, const char *page, size_t count, int priolvl) { + + char *p = (char *) page; + int val; + + val = (int) simple_strtoul(p, &p, 10); + + atomic_set(&(cfqd->cid[priolvl].cfqpriv.sectorate),val); + cfqd->cid[priolvl].cfqpriv.nskip = 0; + cfqd->cid[priolvl].cfqpriv.navsec = 0; + cfqd->cid[priolvl].cfqpriv.timedout = 0; + +#if 0 atomic_set(&(cfqd->cid[priolvl].cum_rq_in),0); atomic_set(&(cfqd->cid[priolvl].cum_rq_out),0); atomic_set(&(cfqd->cid[priolvl].cum_sectors_in),0); atomic_set(&(cfqd->cid[priolvl].cum_sectors_out),0); atomic_set(&(cfqd->cid[priolvl].cum_queues_in),0); atomic_set(&(cfqd->cid[priolvl].cum_queues_out),0); +#endif return count; } @@ -1491,10 +1601,10 @@ static struct cfq_fs_entry cfq_epoch_entry = { .show = cfq_epoch_show, .store = cfq_epoch_store, }; -static struct cfq_fs_entry cfq_epochsectors_entry = { - .attr = {.name = "epochsectors", .mode = S_IRUGO | S_IWUSR }, - .show = cfq_epochsectors_show, - .store = cfq_epochsectors_store, +static struct cfq_fs_entry cfq_hmax_pct_entry = { + .attr = {.name = "hmaxpct", .mode = S_IRUGO | S_IWUSR }, + .show = cfq_hmax_pct_show, + .store = cfq_hmax_pct_store, }; #define P_0_STR "p0" @@ -1558,7 +1668,7 @@ static struct attribute *default_attrs[] = { &cfq_grace_rt_entry.attr, &cfq_grace_idle_entry.attr, &cfq_epoch_entry.attr, - &cfq_epochsectors_entry.attr, + &cfq_hmax_pct_entry.attr, &cfq_prio_0_entry.attr, &cfq_prio_1_entry.attr, &cfq_prio_2_entry.attr, diff --git a/drivers/block/ckrm-io.c b/drivers/block/ckrm-io.c index 7edfce727..89910268f 100644 --- a/drivers/block/ckrm-io.c +++ b/drivers/block/ckrm-io.c @@ -35,14 +35,11 @@ #include #include -/* 
Tie to cfq priorities */ -#define CKI_IOPRIO_NORM IOPRIO_NORM +/* sectorate == 512 byte sectors served in CFQ_EPOCH ns*/ -/* Divisor to get fraction of bandwidth represented by an IOPRIO value */ -/* FIXME: Will not work if IOPRIO_NR > 100 */ -#define CKI_IOPRIO_DIV (IOPRIO_NR-1) -/* Minimum ioprio value to be assigned to a class */ -#define CKI_IOPRIO_MIN 1 +/* CKI_ROOTSECTORATE needs to be made configurable from outside */ +#define CKI_ROOTSECTORATE 100000 +#define CKI_MINSECTORATE 100 #define CKI_IOUSAGE_UNIT 512 @@ -52,7 +49,12 @@ typedef struct ckrm_io_stats{ unsigned long blksz; /* size of bandwidth unit */ atomic_t blkrd; /* read units submitted to DD */ atomic_t blkwr; /* write units submitted to DD */ - + + int nskip; /* # times q skipped */ + unsigned long navsec; /* avg sectors serviced */ + int timedout; /* # times gap > epoch */ + u64 sec[2]; /* sectors serviced in + prev & curr epochs */ } cki_stats_t; /* per class I/O statistics */ /* Note @@ -75,8 +77,12 @@ typedef struct ckrm_io_class { * in local units. */ + cfqlim_t cfqpriv; /* Data common with cfq priolvl's */ + + int cnt_guarantee; /* Allocation as parent */ int cnt_unused; /* Allocation to default subclass */ + int cnt_limit; /* Statistics, for class and default subclass */ cki_stats_t stats; @@ -85,19 +91,16 @@ typedef struct ckrm_io_class { } cki_icls_t; - /* Internal functions */ static inline void cki_reset_stats(cki_stats_t *usg); static inline void init_icls_one(cki_icls_t *icls); -static inline int cki_div(int *a, int b, int c); -//static inline int cki_recalc(cki_icls_t *icls, int rel2abs); static void cki_recalc_propagate(cki_icls_t *res, cki_icls_t *parres); /* External functions e.g. interface to ioscheduler */ void *cki_tsk_icls (struct task_struct *tsk); int cki_tsk_ioprio (struct task_struct *tsk); -extern void cki_cfq_set(icls_tsk_t tskicls, icls_ioprio_t tskioprio); +extern void cki_cfq_set(icls_tsk_t tskicls, icls_ioprio_t tskioprio, icls_tsk_t tskcfqpriv); /* CKRM Resource Controller API functions */ static void * cki_alloc(struct ckrm_core_class *this, @@ -139,45 +142,27 @@ static inline void init_icls_stats(cki_icls_t *icls) static inline void init_icls_one(cki_icls_t *icls) { - // Assign zero as initial guarantee otherwise creations - // could fail due to inadequate share - - //icls->shares.my_guarantee = - // (CKI_IOPRIO_MIN * CKRM_SHARE_DFLT_TOTAL_GUARANTEE) / - // CKI_IOPRIO_DIV ; - icls->shares.my_guarantee = 0; - icls->shares.my_limit = CKRM_SHARE_DFLT_TOTAL_GUARANTEE; - icls->shares.total_guarantee = CKRM_SHARE_DFLT_TOTAL_GUARANTEE; - icls->shares.max_limit = CKRM_SHARE_DFLT_TOTAL_GUARANTEE; + /* Zero initial guarantee for scalable creation of + multiple classes */ - icls->shares.unused_guarantee = icls->shares.total_guarantee - - icls->shares.my_guarantee; - icls->shares.cur_max_limit = CKRM_SHARE_DFLT_TOTAL_GUARANTEE; - - - icls->cnt_guarantee = icls->cnt_unused = IOPRIO_IDLE; + /* Try out a new set */ + + icls->shares.my_guarantee = CKRM_SHARE_DONTCARE; + icls->shares.my_limit = CKRM_SHARE_DONTCARE; + icls->shares.total_guarantee = CKRM_SHARE_DFLT_TOTAL_GUARANTEE; + icls->shares.max_limit = CKRM_SHARE_DFLT_MAX_LIMIT; + icls->shares.unused_guarantee = icls->shares.total_guarantee; + icls->shares.cur_max_limit = 0; - //Same rationale icls->ioprio = CKI_IOPRIO_MIN; - //IOPRIO_IDLE equivalence to zero my_guarantee (set above) relies - //on former being zero. 
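+	/* Absolute values (and hence the class' sectorate) are not known
+	 * until shares are set and cki_recalc_propagate() runs, so leave
+	 * them as DONTCARE here. */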
+ icls->cnt_guarantee = CKRM_SHARE_DONTCARE; + icls->cnt_unused = CKRM_SHARE_DONTCARE; + icls->cnt_limit = CKRM_SHARE_DONTCARE; init_icls_stats(icls); } - -static inline int cki_div(int *a, int b, int c) -{ - u64 temp = (u64) b * c ; - do_div(temp,CKI_IOPRIO_DIV); - *a = (int) temp; - - return 0; -} - - -/* Recalculate absolute shares from relative (rel2abs=1) - * or vice versa (rel2abs=0) - * Caller should have a lock on icls +/* Recalculate absolute shares from relative + * Caller should hold a lock on icls */ static void cki_recalc_propagate(cki_icls_t *res, cki_icls_t *parres) @@ -186,17 +171,17 @@ static void cki_recalc_propagate(cki_icls_t *res, cki_icls_t *parres) ckrm_core_class_t *child = NULL; cki_icls_t *childres; int resid = cki_rcbs.resid; + u64 temp; if (parres) { struct ckrm_shares *par = &parres->shares; struct ckrm_shares *self = &res->shares; - if (parres->cnt_guarantee == CKRM_SHARE_DONTCARE) { res->cnt_guarantee = CKRM_SHARE_DONTCARE; } else if (par->total_guarantee) { - u64 temp = (u64) self->my_guarantee * + temp = (u64) self->my_guarantee * parres->cnt_guarantee; do_div(temp, par->total_guarantee); res->cnt_guarantee = (int) temp; @@ -204,16 +189,36 @@ static void cki_recalc_propagate(cki_icls_t *res, cki_icls_t *parres) res->cnt_guarantee = 0; } + + if (parres->cnt_limit == CKRM_SHARE_DONTCARE) { + res->cnt_limit = CKRM_SHARE_DONTCARE; + atomic_set(&res->cfqpriv.sectorate,CKI_MINSECTORATE); + } else { + if (par->max_limit) { + temp = (u64) self->my_limit * + parres->cnt_limit; + do_div(temp, par->max_limit); + res->cnt_limit = (int) temp; + } else { + res->cnt_limit = 0; + } + atomic_set(&res->cfqpriv.sectorate,res->cnt_limit); + } + if (res->cnt_guarantee == CKRM_SHARE_DONTCARE) { res->cnt_unused = CKRM_SHARE_DONTCARE; - } else if (self->total_guarantee) { - u64 temp = (u64) self->unused_guarantee * - res->cnt_guarantee; - do_div(temp, self->total_guarantee); - res->cnt_unused = (int) temp; } else { - res->cnt_unused = 0; + if (self->total_guarantee) { + temp = (u64) self->unused_guarantee * + res->cnt_guarantee; + do_div(temp, self->total_guarantee); + res->cnt_unused = (int) temp; + } else { + res->cnt_unused = 0; + } + } + } // propagate to children ckrm_lock_hier(res->core); @@ -228,50 +233,6 @@ static void cki_recalc_propagate(cki_icls_t *res, cki_icls_t *parres) ckrm_unlock_hier(res->core); } -#if 0 -static inline int cki_recalc(cki_icls_t *icls, int rel2abs) -{ - u64 temp; - - if (icls->parent == NULL) { - /* Root, as parent, always gets all */ - - temp = icls->shares.my_guarantee * (IOPRIO_NR-1); - do_div(temp, icls->shares.total_guarantee); - - icls->total = IOPRIO_NR-1; - icls->ioprio = temp ; - icls->unused = icls->total - icls->ioprio; -// icls->unused = (IOPRIO_NR-1)-icls->ioprio; - - } else { - cki_icls_t *parres; - int partot ; - - parres = ckrm_get_res_class(icls->parent, - cki_rcbs.resid, - cki_icls_t); - if (!parres) { - printk(KERN_ERR "cki_recalc: error getting " - "resclass from core \n"); - return -EINVAL; - } - - - temp = (icls->shares.my_guarantee * - parres->total); - do_div(temp, parres->shares.total_guarantee); - - icls->ioprio = temp; - icls->unused = 0; - - } - - return 0; - -} -#endif - void *cki_tsk_icls(struct task_struct *tsk) { return (void *) ckrm_get_res_class(class_core(tsk->taskclass), @@ -279,12 +240,19 @@ void *cki_tsk_icls(struct task_struct *tsk) } int cki_tsk_ioprio(struct task_struct *tsk) +{ + /* Don't use I/O priorities for now */ + return IOPRIO_NORM; +} + +void *cki_tsk_cfqpriv(struct task_struct *tsk) { cki_icls_t 
*icls = ckrm_get_res_class(class_core(tsk->taskclass), cki_rcbs.resid, cki_icls_t); - return icls->cnt_unused; + return (void *)&(icls->cfqpriv); } + static void *cki_alloc(struct ckrm_core_class *core, struct ckrm_core_class *parent) { @@ -301,43 +269,13 @@ static void *cki_alloc(struct ckrm_core_class *core, icls->parent = parent; icls->shares_lock = SPIN_LOCK_UNLOCKED; - if (parent == NULL) { - - /* Root class gets same as "normal" CFQ priorities to - * retain compatibility of behaviour in the absence of - * other classes - */ - - icls->cnt_guarantee = icls->cnt_unused = IOPRIO_NR-1; - - /* Default gets normal, not minimum */ - //icls->unused = IOPRIO_NORM; - //icls->unused = icls->guarantee-icls->myguarantee; - //icls->limit = icls->mylimit = IOPRIO_NR; - - /* Compute shares in abstract units */ - icls->shares.total_guarantee = CKRM_SHARE_DFLT_TOTAL_GUARANTEE; - - // my_guarantee for root is meaningless. Set to default - icls->shares.my_guarantee = CKRM_SHARE_DFLT_TOTAL_GUARANTEE; + init_icls_one(icls); - icls->shares.unused_guarantee = - CKRM_SHARE_DFLT_TOTAL_GUARANTEE; - - //temp = (u64) icls->cnt_unused * icls->shares.total_guarantee; - //do_div(temp, CKI_IOPRIO_DIV); - // temp now has root's default's share - //icls->shares.unused_guarantee = - // icls->shares.total_guarantee - temp; - - icls->shares.my_limit = CKRM_SHARE_DFLT_TOTAL_GUARANTEE; - icls->shares.max_limit = CKRM_SHARE_DFLT_TOTAL_GUARANTEE; - icls->shares.cur_max_limit = CKRM_SHARE_DFLT_TOTAL_GUARANTEE; - - } else { - init_icls_one(icls); - /* No propagation to parent needed if icls' - initial share is zero */ + if (parent == NULL) { + icls->cnt_guarantee = CKI_ROOTSECTORATE; + icls->cnt_unused = CKI_ROOTSECTORATE; + icls->cnt_limit = CKI_ROOTSECTORATE; + atomic_set(&(icls->cfqpriv.sectorate),icls->cnt_limit); } try_module_get(THIS_MODULE); return icls; @@ -345,7 +283,10 @@ static void *cki_alloc(struct ckrm_core_class *core, static void cki_free(void *res) { - cki_icls_t *icls = res, *parres; + cki_icls_t *icls = res, *parres, *childres; + ckrm_core_class_t *child = NULL; + int maxlimit, resid = cki_rcbs.resid; + if (!res) return; @@ -361,9 +302,7 @@ static void cki_free(void *res) * */ - parres = ckrm_get_res_class(icls->parent, - cki_rcbs.resid, - cki_icls_t); + parres = ckrm_get_res_class(icls->parent, resid, cki_icls_t); if (!parres) { printk(KERN_ERR "cki_free: error getting " "resclass from core \n"); @@ -372,8 +311,23 @@ static void cki_free(void *res) /* Update parent's shares */ spin_lock(&parres->shares_lock); + child_guarantee_changed(&parres->shares, icls->shares.my_guarantee, 0); parres->cnt_unused += icls->cnt_guarantee; + + // run thru parent's children and get the new max_limit of the parent + ckrm_lock_hier(parres->core); + maxlimit = 0; + while ((child = ckrm_get_next_child(parres->core, child)) != NULL) { + childres = ckrm_get_res_class(child, resid, cki_icls_t); + if (maxlimit < childres->shares.my_limit) { + maxlimit = childres->shares.my_limit; + } + } + ckrm_unlock_hier(parres->core); + if (parres->shares.cur_max_limit < maxlimit) { + parres->shares.cur_max_limit = maxlimit; + } spin_unlock(&parres->shares_lock); kfree(res); @@ -388,26 +342,15 @@ static int cki_setshare(void *res, struct ckrm_shares *new) struct ckrm_shares *cur, *par; int rc = -EINVAL, resid = cki_rcbs.resid; - if (!icls) { - printk(KERN_ERR "No class\n"); + if (!icls) return rc; - } cur = &icls->shares; - - /* limits not supported */ - if ((new->max_limit != CKRM_SHARE_UNCHANGED) - || (new->my_limit != CKRM_SHARE_UNCHANGED)) { 
- printk(KERN_ERR "limits not supported\n"); - return -EINVAL; - } - if (icls->parent) { parres = ckrm_get_res_class(icls->parent, resid, cki_icls_t); if (!parres) { - printk(KERN_ERR "cki_setshare: error getting " - "resclass from core \n"); + pr_debug("cki_setshare: invalid resclass\n"); return -EINVAL; } spin_lock(&parres->shares_lock); @@ -420,10 +363,8 @@ static int cki_setshare(void *res, struct ckrm_shares *new) } rc = set_shares(new, cur, par); - printk(KERN_ERR "rc from set_shares %d\n", rc); if ((!rc) && parres) { - if (parres->cnt_guarantee == CKRM_SHARE_DONTCARE) { parres->cnt_unused = CKRM_SHARE_DONTCARE; } else if (par->total_guarantee) { @@ -435,17 +376,6 @@ static int cki_setshare(void *res, struct ckrm_shares *new) parres->cnt_unused = 0; } cki_recalc_propagate(res, parres); - -#if 0 - int old = icls->ioprio; - - rc = cki_recalc(icls,0); - - if (!rc && parres) { - int raise_tot = icls->ioprio - old ; - parres->unused -= raise_tot ; - } -#endif } spin_unlock(&icls->shares_lock); if (icls->parent) { @@ -471,15 +401,15 @@ static int cki_getstats(void *res, struct seq_file *sfile) if (!icls) return -EINVAL; -/* - seq_printf(sfile, "%d my_read\n",atomic_read(&icls->mystats.blkrd)); - seq_printf(sfile, "%d my_write\n",atomic_read(&icls->mystats.blkwr)); - seq_printf(sfile, "%d total_read\n",atomic_read(&icls->stats.blkrd)); - seq_printf(sfile, "%d total_write\n",atomic_read(&icls->stats.blkwr)); -*/ - - seq_printf(sfile, "%d total ioprio\n",icls->cnt_guarantee); - seq_printf(sfile, "%d unused/default ioprio\n",icls->cnt_unused); + seq_printf(sfile, "abs limit %d\n",icls->cnt_limit); + seq_printf(sfile, "skip %d timdout %d avsec %lu rate %ld " + " sec0 %ld sec1 %ld\n", + icls->cfqpriv.nskip, + icls->cfqpriv.timedout, + icls->cfqpriv.navsec, + atomic_read(&(icls->cfqpriv.sectorate)), + (unsigned long)icls->cfqpriv.sec[0], + (unsigned long)icls->cfqpriv.sec[1]); return 0; } @@ -554,7 +484,7 @@ int __init cki_init(void) resid = ckrm_register_res_ctlr(clstype, &cki_rcbs); if (resid != -1) { cki_rcbs.classtype = clstype; - cki_cfq_set(cki_tsk_icls,cki_tsk_ioprio); + cki_cfq_set(cki_tsk_icls,cki_tsk_ioprio,cki_tsk_cfqpriv); } } @@ -566,7 +496,7 @@ void __exit cki_exit(void) ckrm_unregister_res_ctlr(&cki_rcbs); cki_rcbs.resid = -1; cki_rcbs.classtype = NULL; - cki_cfq_set(NULL,NULL); + cki_cfq_set(NULL,NULL,NULL); } module_init(cki_init) diff --git a/drivers/block/ckrm-iostub.c b/drivers/block/ckrm-iostub.c index c325d8e8d..f4012545b 100644 --- a/drivers/block/ckrm-iostub.c +++ b/drivers/block/ckrm-iostub.c @@ -25,13 +25,14 @@ static spinlock_t stub_lock = SPIN_LOCK_UNLOCKED; static icls_tsk_t tskiclstub; static icls_ioprio_t tskiopriostub; +static icls_tsk_t tskcfqprivstub; - -void cki_cfq_set(icls_tsk_t tskicls, icls_ioprio_t tskioprio) +void cki_cfq_set(icls_tsk_t tskicls, icls_ioprio_t tskioprio, icls_tsk_t tskcfqpriv) { spin_lock(&stub_lock); tskiclstub = tskicls; tskiopriostub = tskioprio; + tskcfqprivstub = tskcfqpriv; spin_unlock(&stub_lock); } @@ -59,6 +60,19 @@ int cki_ioprio(struct task_struct *tsk) return ret; } +void *cki_cfqpriv(struct task_struct *tsk) +{ + void *ret; + spin_lock(&stub_lock); + if (tskiclstub) + ret = (*tskcfqprivstub)(tsk); + else + ret = NULL; + spin_unlock(&stub_lock); + return ret; +} + EXPORT_SYMBOL(cki_cfq_set); EXPORT_SYMBOL(cki_hash_key); EXPORT_SYMBOL(cki_ioprio); +EXPORT_SYMBOL(cki_cfqpriv); diff --git a/drivers/char/.cvsignore b/drivers/char/.cvsignore new file mode 100644 index 000000000..83683a2d8 --- /dev/null +++ b/drivers/char/.cvsignore 
@@ -0,0 +1,2 @@ +consolemap_deftbl.c +defkeymap.c diff --git a/drivers/pci/.cvsignore b/drivers/pci/.cvsignore new file mode 100644 index 000000000..d5b21d9ee --- /dev/null +++ b/drivers/pci/.cvsignore @@ -0,0 +1,3 @@ +classlist.h +devlist.h +gen-devlist diff --git a/drivers/scsi/aic7xxx/.cvsignore b/drivers/scsi/aic7xxx/.cvsignore new file mode 100644 index 000000000..a1a7fcd04 --- /dev/null +++ b/drivers/scsi/aic7xxx/.cvsignore @@ -0,0 +1,4 @@ +aic79xx_reg.h +aic79xx_seq.h +aic7xxx_reg.h +aic7xxx_seq.h diff --git a/fs/aio.c b/fs/aio.c index 9e7b5928e..2335a0756 100644 --- a/fs/aio.c +++ b/fs/aio.c @@ -543,7 +543,7 @@ struct kioctx *lookup_ioctx(unsigned long ctx_id) return ioctx; } -static void use_mm(struct mm_struct *mm) +void use_mm(struct mm_struct *mm) { struct mm_struct *active_mm; diff --git a/include/.cvsignore b/include/.cvsignore new file mode 100644 index 000000000..04204c7c9 --- /dev/null +++ b/include/.cvsignore @@ -0,0 +1 @@ +config diff --git a/include/asm-i386/.cvsignore b/include/asm-i386/.cvsignore new file mode 100644 index 000000000..4ec57ad5b --- /dev/null +++ b/include/asm-i386/.cvsignore @@ -0,0 +1 @@ +asm_offsets.h diff --git a/include/asm-i386/apicdef.h b/include/asm-i386/apicdef.h index c689554ad..9513dd889 100644 --- a/include/asm-i386/apicdef.h +++ b/include/asm-i386/apicdef.h @@ -86,6 +86,7 @@ #define APIC_LVT_REMOTE_IRR (1<<14) #define APIC_INPUT_POLARITY (1<<13) #define APIC_SEND_PENDING (1<<12) +#define APIC_MODE_MASK 0x700 #define GET_APIC_DELIVERY_MODE(x) (((x)>>8)&0x7) #define SET_APIC_DELIVERY_MODE(x,y) (((x)&~0x700)|((y)<<8)) #define APIC_MODE_FIXED 0x0 diff --git a/include/asm-i386/irq.h b/include/asm-i386/irq.h index d1a4dd68f..43917d930 100644 --- a/include/asm-i386/irq.h +++ b/include/asm-i386/irq.h @@ -39,6 +39,7 @@ union irq_ctx { u32 stack[THREAD_SIZE/sizeof(u32)]; }; +#ifdef CONFIG_IRQSTACKS extern union irq_ctx *hardirq_ctx[NR_CPUS]; extern union irq_ctx *softirq_ctx[NR_CPUS]; @@ -46,6 +47,10 @@ extern void irq_ctx_init(int cpu); #define __ARCH_HAS_DO_SOFTIRQ +#else +#define irq_ctx_init(cpu) do { ; } while (0) +#endif + struct irqaction; struct pt_regs; asmlinkage int handle_IRQ_event(unsigned int, struct pt_regs *, diff --git a/include/asm-i386/kexec.h b/include/asm-i386/kexec.h new file mode 100644 index 000000000..eb8fd9868 --- /dev/null +++ b/include/asm-i386/kexec.h @@ -0,0 +1,25 @@ +#ifndef _I386_KEXEC_H +#define _I386_KEXEC_H + +#include + +/* + * KEXEC_SOURCE_MEMORY_LIMIT maximum page get_free_page can return. + * I.e. Maximum page that is mapped directly into kernel memory, + * and kmap is not required. + * + * Someone correct me if FIXADDR_START - PAGEOFFSET is not the correct + * calculation for the amount of memory directly mappable into the + * kernel memory space. 
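+ *
+ * As defined below for i386, the source and destination limits are
+ * left unrestricted (-1UL); only the control code buffer is capped,
+ * at TASK_SIZE.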
+ */ + +/* Maximum physical address we can use pages from */ +#define KEXEC_SOURCE_MEMORY_LIMIT (-1UL) +/* Maximum address we can reach in physical address mode */ +#define KEXEC_DESTINATION_MEMORY_LIMIT (-1UL) +/* Maximum address we can use for the control code buffer */ +#define KEXEC_CONTROL_MEMORY_LIMIT TASK_SIZE + +#define KEXEC_CONTROL_CODE_SIZE 4096 + +#endif /* _I386_KEXEC_H */ diff --git a/include/asm-i386/module.h b/include/asm-i386/module.h index 614d05f27..263c6f752 100644 --- a/include/asm-i386/module.h +++ b/include/asm-i386/module.h @@ -60,7 +60,19 @@ struct mod_arch_specific #define MODULE_REGPARM "" #endif +#if (CONFIG_STACK_SIZE_SHIFT < 12) +#define MODULE_STACKSIZE "TINYSTACKS " +#elif (CONFIG_STACK_SIZE_SHIFT == 12) #define MODULE_STACKSIZE "4KSTACKS " +#elif (CONFIG_STACK_SIZE_SHIFT == 13) +#define MODULE_STACKSIZE "8KSTACKS " +#elif (CONFIG_STACK_SIZE_SHIFT == 14) +#define MODULE_STACKSIZE "16KSTACKS " +#elif (CONFIG_STACK_SIZE_SHIFT > 14) +#define MODULE_STACKSIZE "HUGESTACKS " +#else +#define MODULE_STACKSIZE "" +#endif #define MODULE_ARCH_VERMAGIC MODULE_PROC_FAMILY MODULE_REGPARM MODULE_STACKSIZE diff --git a/include/asm-i386/processor.h b/include/asm-i386/processor.h index cd8708b42..3651a3bb0 100644 --- a/include/asm-i386/processor.h +++ b/include/asm-i386/processor.h @@ -400,10 +400,10 @@ struct tss_struct { #define ARCH_MIN_TASKALIGN 16 - -#define STACK_PAGE_COUNT (4096/PAGE_SIZE) - - +#if ((1< -#define CLASSQUEUE_SIZE 1024 // acb: changed from 128 -//#define CLASSQUEUE_SIZE 128 +#warning mef: is classqueue_size big enough for PlanetLab +#define CLASSQUEUE_SIZE_SHIFT 7 +#define CLASSQUEUE_SIZE ( 1 << CLASSQUEUE_SIZE_SHIFT ) #define CQ_BITMAP_SIZE ((((CLASSQUEUE_SIZE+1+7)/8)+sizeof(long)-1)/sizeof(long)) /** * struct cq_prio_array: duplicates prio_array defined in sched.c - * - * I duplicate this data structure to make ckrm_classqueue implementation more modular */ struct cq_prio_array { int nr_active; @@ -49,42 +58,50 @@ struct cq_prio_array { * @base: base priority * @base_offset: index in array for the base * - * classqueue can be thought of as runqueue of classes (instead of runqueue of tasks) - * as task runqueue, each processor has a classqueue - * a class enters the classqueue when the first task in this class local runqueue shows up - * a class enters the classqueue when the last task in the local runqueue leaves - * class local runqueues are ordered based their priority - * - * status: - * hzheng: is 32bit base long enough? + * classqueue can be thought of as runqueue of lrq's (per cpu object of + * a CKRM class as task runqueue (instead of runqueue of tasks) + * - a class's local lrq is enqueued into the local classqueue when a + * first task is enqueued lrq. + * - a class's local lrq is removed from the local classqueue when the + * last task is dequeued from the lrq. 
+ * - lrq's are ordered based on their priority (determined elsewhere) + * ( CKRM: caculated based on it's progress (cvt) and urgency (top_priority) */ + struct classqueue_struct { - struct cq_prio_array array; + int enabled; // support dynamic on/off unsigned long base; unsigned long base_offset; + struct cq_prio_array array; }; /** - * struct cq_node_struct - the link object between class local runqueue and classqueue + * struct cq_node_struct: + * - the link object between class local runqueue and classqueue * @list: links the class local runqueue to classqueue - * @prio: class priority, which is caculated based on it's progress (cvt) and urgency (top_priority) + * @prio: class priority * @index: real index into the classqueue array, calculated based on priority - * - * NOTE: make sure list is empty when it's not in classqueue */ struct cq_node_struct { struct list_head list; int prio; int index; + /* + * set when the class jump out of the class queue window + * class with this value set should be repositioned whenever classqueue slides window + * real_prio is valid when need_repos is set + */ + int real_prio; + int need_repos; }; typedef struct cq_node_struct cq_node_t; -typedef unsigned long long CVT_t; // cummulative virtual time - static inline void cq_node_init(cq_node_t * node) { node->prio = 0; node->index = -1; + node->real_prio = 0; + node->need_repos = 0; INIT_LIST_HEAD(&node->list); } @@ -95,23 +112,18 @@ static inline int cls_in_classqueue(cq_node_t * node) } /*initialize the data structure*/ -int classqueue_init(struct classqueue_struct *cq); +int classqueue_init(struct classqueue_struct *cq, int enabled); -/*add the class to classqueue*/ -void classqueue_enqueue(struct classqueue_struct *cq, cq_node_t * node, int prio); +/*add the class to classqueue at given priority */ +void classqueue_enqueue(struct classqueue_struct *cq, + cq_node_t * node, int prio); -/** - * classqueue_dequeue - remove the class from classqueue - * - * internal: - * called when the last task is removed from the queue - * checked on load balancing and schedule - * hzheng: why don't I call it on class_dequeue_task? 
- */ +/*remove the class from classqueue */ void classqueue_dequeue(struct classqueue_struct *cq, cq_node_t * node); /*change the position of the class in classqueue*/ -void classqueue_update_prio(struct classqueue_struct *cq, cq_node_t * node, int new_prio); +void classqueue_update_prio(struct classqueue_struct *cq, + cq_node_t * node, int new_prio); /*return the first class in classqueue*/ cq_node_t *classqueue_get_head(struct classqueue_struct *cq); @@ -122,7 +134,8 @@ void classqueue_update_base(struct classqueue_struct *cq); /** * class_compare_prio: compare the priority of this two nodes */ -static inline int class_compare_prio(struct cq_node_struct* node1, struct cq_node_struct* node2) +static inline int class_compare_prio(struct cq_node_struct* node1, + struct cq_node_struct* node2) { return ( node1->prio - node2->prio); } diff --git a/include/linux/ckrm_rc.h b/include/linux/ckrm_rc.h index 1bf2d07b5..a134dbc0d 100644 --- a/include/linux/ckrm_rc.h +++ b/include/linux/ckrm_rc.h @@ -113,7 +113,6 @@ typedef struct ckrm_res_ctlr { #define CKRM_MAX_TYPENAME_LEN 32 typedef struct ckrm_classtype { - /* Hubertus: Rearrange slots later for cache friendliness */ /* resource controllers */ spinlock_t res_ctlrs_lock; // protect res ctlr related data @@ -238,27 +237,6 @@ extern int ckrm_init_core_class(struct ckrm_classtype *clstype, struct ckrm_core_class *parent, const char *name); extern int ckrm_release_core_class(struct ckrm_core_class *); -// Hubertus .. can disappear after cls del debugging -extern struct ckrm_res_ctlr *ckrm_resctlr_lookup(struct ckrm_classtype *type, - const char *resname); - -#if 0 - -// Hubertus ... need to straighten out all these I don't think we will even -// call this or are we - -/* interface to the RCFS filesystem */ -extern struct ckrm_core_class *ckrm_alloc_core_class(struct ckrm_core_class *, - const char *, int); - -// Reclassify the given pid to the given core class by force -extern void ckrm_forced_reclassify_pid(int, struct ckrm_core_class *); - -// Reclassify the given net_struct to the given core class by force -extern void ckrm_forced_reclassify_laq(struct ckrm_net_struct *, - struct ckrm_core_class *); - -#endif extern void ckrm_lock_hier(struct ckrm_core_class *); extern void ckrm_unlock_hier(struct ckrm_core_class *); @@ -290,12 +268,6 @@ extern int ckrm_class_set_shares(struct ckrm_core_class *core, extern int ckrm_class_reset_stats(struct ckrm_core_class *core, const char *resname, const char *unused); -#if 0 -extern void ckrm_ns_hold(struct ckrm_net_struct *); -extern void ckrm_ns_put(struct ckrm_net_struct *); -extern void *ckrm_set_rootcore_byname(char *, void *); -#endif - static inline void ckrm_core_grab(struct ckrm_core_class *core) { if (core) @@ -329,7 +301,6 @@ static inline unsigned int ckrm_is_core_valid(ckrm_core_class_t * core) ) extern struct ckrm_classtype *ckrm_classtypes[]; -/* should provide a different interface */ /*----------------------------------------------------------------------------- * CKRM event callback specification for the classtypes or resource controllers diff --git a/include/linux/ckrm_sched.h b/include/linux/ckrm_sched.h index 3611c2d3e..dc00aeaa0 100644 --- a/include/linux/ckrm_sched.h +++ b/include/linux/ckrm_sched.h @@ -3,8 +3,6 @@ * Copyright (C) Haoqiang Zheng, IBM Corp. 2004 * Copyright (C) Hubertus Franke, IBM Corp. 
2004 * - * Latest version, more details at http://ckrm.sf.net - * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or @@ -12,6 +10,17 @@ * */ +/* + * Overview: + * --------- + * + * Please read Documentation/ckrm/cpu_sched for a general overview of + * how the O(1) CKRM scheduler. + * + * ckrm_sched.h provides the definition for the per class local runqueue. + * + */ + #ifndef _CKRM_SCHED_H #define _CKRM_SCHED_H @@ -27,18 +36,31 @@ struct prio_array { struct list_head queue[MAX_PRIO]; }; -#ifdef CONFIG_CKRM_CPU_SCHEDULE -#define rq_active(p,rq) (get_task_lrq(p)->active) -#define rq_expired(p,rq) (get_task_lrq(p)->expired) -int __init init_ckrm_sched_res(void); -#else + +#ifndef CONFIG_CKRM_CPU_SCHEDULE + #define rq_active(p,rq) (rq->active) #define rq_expired(p,rq) (rq->expired) static inline void init_ckrm_sched_res(void) {} static inline int ckrm_cpu_monitor_init(void) {return 0;} -#endif //CONFIG_CKRM_CPU_SCHEDULE -#ifdef CONFIG_CKRM_CPU_SCHEDULE +#else + +#define rq_active(p,rq) (get_task_lrq(p)->active) +#define rq_expired(p,rq) (get_task_lrq(p)->expired) + +enum ckrm_sched_mode { + CKRM_SCHED_MODE_DISABLED, /* always use default linux scheduling */ + /* effectively disables the ckrm scheduler */ + CKRM_SCHED_MODE_ENABLED /* always uses ckrm scheduling behavior */ +}; + +extern unsigned int ckrm_sched_mode; /* true internal sched_mode (DIS/EN ABLED) */ + +int __init init_ckrm_sched_res(void); + +typedef unsigned long long CVT_t; // cummulative virtual time + struct ckrm_runqueue { cq_node_t classqueue_linkobj; /*links in classqueue */ struct ckrm_cpu_class *cpu_class; // class it belongs to @@ -52,6 +74,7 @@ struct ckrm_runqueue { reset to jiffies if expires */ unsigned long expired_timestamp; + int best_expired_prio; /* * highest priority of tasks in active @@ -62,23 +85,38 @@ struct ckrm_runqueue { CVT_t local_cvt; unsigned long lrq_load; - int local_weight; + /* Three different weights are distinguished: + * local_weight, skewed_weight, over_weight: + * + * - local_weight: main weight to drive CVT progression + * - over_weight: weight to reduce savings when over its guarantee + * - skewed_weight: weight to use when local_weight to small + * avoids starvation problems. 
+ */ + int local_weight; + int over_weight; + int skewed_weight; /* - * unused CPU time accumulated while thoe class + * unused CPU time accumulated while the class * is inactive goes to savings * * initialized to be 0 * a class can't accumulate more than SAVING_THRESHOLD of savings */ - unsigned long long savings; + CVT_t savings; unsigned long magic; //for debugging -}; +} ____cacheline_aligned_in_smp; + +#define CKRM_LRQ_MAGIC (0xACDC0702) typedef struct ckrm_runqueue ckrm_lrq_t; +#define ckrm_cpu_disabled() (ckrm_sched_mode == CKRM_SCHED_MODE_DISABLED) +#define ckrm_cpu_enabled() (ckrm_sched_mode == CKRM_SCHED_MODE_ENABLED) + /** * ckrm_cpu_class_stat - cpu usage statistics maintained for each class * @@ -103,24 +141,31 @@ struct ckrm_cpu_class_stat { */ int eshare; int meshare; + + /* a boolean indicates if the class has savings or not */ + int has_savings; + + /* + * a temporary value used by reorder_surplus_queue + */ + int demand_per_share; }; #define CKRM_CPU_CLASS_MAGIC 0x7af2abe3 -#define USAGE_SAMPLE_FREQ HZ //sample every 1 seconds -#define NS_PER_SAMPLE (USAGE_SAMPLE_FREQ*(NSEC_PER_SEC/HZ)) -#define USAGE_WINDOW_SIZE 60 //keep the last 60 sample +#define USAGE_SAMPLE_FREQ (HZ) //sample every 1 seconds +#define USAGE_MAX_HISTORY (60) // keep the last 60 usage samples +#define NS_PER_SAMPLE (USAGE_SAMPLE_FREQ*(NSEC_PER_SEC/HZ)) struct ckrm_usage { - unsigned long samples[USAGE_WINDOW_SIZE]; //record usages - unsigned long sample_pointer; //pointer for the sliding window - unsigned long long last_ns; //ns for last sample - long long last_sample_jiffies; //in number of jiffies + unsigned long samples[USAGE_MAX_HISTORY]; //record usages + unsigned long sample_pointer; // pointer for the sliding window + unsigned long long last_ns; // ns for last sample + long long last_sample_jiffies; // in number of jiffies }; /* - * manages the class status - * there should be only one instance of this object for each class in the whole system + * CPU controller object allocated for each CLASS */ struct ckrm_cpu_class { struct ckrm_core_class *core; @@ -129,12 +174,16 @@ struct ckrm_cpu_class { spinlock_t cnt_lock; // always grab parent's lock first and then child's struct ckrm_cpu_class_stat stat; struct list_head links; // for linking up in cpu classes - ckrm_lrq_t local_queues[NR_CPUS]; // runqueues + struct list_head surplus_queue; //used for surplus allocation + ckrm_lrq_t* local_queues[NR_CPUS]; // runqueues struct ckrm_usage usage; unsigned long magic; //for debugging +#ifdef __SIMULATOR__ + int class_id; +#endif }; -#define cpu_class_weight(cls) (cls->stat.meshare) +#define cpu_class_weight(cls) (SHARE_TO_WEIGHT(cls->stat.meshare)) #define local_class_weight(lrq) (lrq->local_weight) static inline int valid_cpu_class(struct ckrm_cpu_class * cls) @@ -150,7 +199,7 @@ static inline void ckrm_usage_init(struct ckrm_usage* usage) { int i; - for (i=0; i < USAGE_WINDOW_SIZE; i++) + for (i=0; i < USAGE_MAX_HISTORY; i++) usage->samples[i] = 0; usage->sample_pointer = 0; usage->last_ns = 0; @@ -188,49 +237,21 @@ static inline void ckrm_sample_usage(struct ckrm_cpu_class* clsptr) // printk("sample = %llu jiffies=%lu \n",cur_sample, jiffies); usage->sample_pointer ++; - if (usage->sample_pointer >= USAGE_WINDOW_SIZE) + if (usage->sample_pointer >= USAGE_MAX_HISTORY) usage->sample_pointer = 0; } -//duration is specified in number of jiffies -//return the usage in percentage -static inline int get_ckrm_usage(struct ckrm_cpu_class* clsptr, int duration) -{ - int nr_samples = 
duration/USAGE_SAMPLE_FREQ?:1; - struct ckrm_usage* usage = &clsptr->usage; - unsigned long long total = 0; - int i, idx; - - if (nr_samples > USAGE_WINDOW_SIZE) - nr_samples = USAGE_WINDOW_SIZE; - - idx = usage->sample_pointer; - for (i = 0; i< nr_samples; i++) { - if (! idx) - idx = USAGE_WINDOW_SIZE; - idx --; - total += usage->samples[idx]; - } - total *= 100; - do_div(total,nr_samples); - do_div(total,NS_PER_SAMPLE); - do_div(total,cpus_weight(cpu_online_map)); - return total; -} - - #define lrq_nr_running(lrq) \ (lrq->active->nr_active + lrq->expired->nr_active) -static inline ckrm_lrq_t * -get_ckrm_lrq(struct ckrm_cpu_class*cls, int cpu) +static inline ckrm_lrq_t *get_ckrm_lrq(struct ckrm_cpu_class*cls, int cpu) { - return &(cls->local_queues[cpu]); + return cls->local_queues[cpu]; } static inline ckrm_lrq_t *get_task_lrq(struct task_struct *p) { - return &(p->cpu_class->local_queues[task_cpu(p)]); + return p->cpu_class->local_queues[task_cpu(p)]; } #define task_list_entry(list) list_entry(list,struct task_struct,run_list) @@ -247,16 +268,16 @@ void init_cpu_classes(void); void init_cpu_class(struct ckrm_cpu_class *cls,ckrm_shares_t* shares); void ckrm_cpu_change_class(void *task, void *old, void *new); - #define CPU_DEMAND_ENQUEUE 0 #define CPU_DEMAND_DEQUEUE 1 #define CPU_DEMAND_DESCHEDULE 2 #define CPU_DEMAND_INIT 3 /*functions exported by ckrm_cpu_monitor.c*/ +int update_effectives(void); void ckrm_cpu_monitor(int check_min); int ckrm_cpu_monitor_init(void); -void ckrm_cpu_stat_init(struct ckrm_cpu_class_stat *stat); +void ckrm_cpu_stat_init(struct ckrm_cpu_class_stat *stat, int eshares); void cpu_demand_event(struct ckrm_cpu_demand_stat* local_stat, int event, unsigned long long len); void adjust_local_weight(void); @@ -290,61 +311,53 @@ void adjust_local_weight(void); * *******************************************************************/ -#define CLASS_QUANTIZER 16 //shift from ns to increase class bonus -#define PRIORITY_QUANTIZER 2 //controls how much a high prio task can borrow - -#define CKRM_SHARE_ACCURACY 13 -#define NSEC_PER_MS 1000000 -#define NSEC_PER_JIFFIES (NSEC_PER_SEC/HZ) - - -#define MAX_SAVINGS_ABSOLUTE (10LLU*NSEC_PER_SEC) // 10 seconds - -#define CVT_UPDATE_TICK ((HZ/2)?:1) - -// ABSOLUTE_CKRM_TUNING determines whether classes can make up -// lost time in absolute time or in relative values - -#define ABSOLUTE_CKRM_TUNING // preferred due to more predictable behavior - -#ifdef ABSOLUTE_CKRM_TUNING - -#define MAX_SAVINGS MAX_SAVINGS_ABSOLUTE -//an absolute bonus of 200ms for classes when reactivated -#define INTERACTIVE_BONUS(lrq) ((200*NSEC_PER_MS)/local_class_weight(lrq)) -#define SAVINGS_LEAK_SPEED (CVT_UPDATE_TICK/10*NSEC_PER_JIFFIES) - -#define scale_cvt(val,lrq) ((val)*local_class_weight(lrq)) -#define unscale_cvt(val,lrq) (do_div(val,local_class_weight(lrq))) - -#else - -#define MAX_SAVINGS (MAX_SAVINGS_ABSOLUTE >> CKRM_SHARE_ACCURACY) /* - * to improve system responsiveness - * an inactive class is put a little bit ahead of the current class when it wakes up - * the amount is set in normalized term to simplify the calculation - * for class with 100% share, it can be 2s ahead - * while for class with 10% share, it can be 200ms ahead + * The class priority is biasd toward classes with high priority tasks. + * But we need to prevent this bias from starving other classes. + * If a class has nice value of -20, how much it can starve the default class? 
+ * priority bonus = (120-100) >> PRIORITY_QUANTIZER, + * if PRIORITY_QUANTIZER = 2, then it's 5 steps ahead + * A class without bonus thus can't get to run until: + * bonus * CKRM_MAX_WEIGHT * CVT_INC_PERSHARE = (120-100) >> PRIORITY_QUANTIZER + * (1 << CKRM_WEIGHT_SHIFT) + * (1 << CLASS_QUANTIZER) +*/ + +/* + * CKRM_WEIGHT_SHIFT and CLASS_QUANTIZER control how much a class with + * high priority task can starve a normal priority class, so it should + * be constant CLASS_QUANTIZER should not be too small otherwise we + * don't have enough bins in the classqueue. + * The ideal value of CLASS_QUANTIZER is 20, but a little smaller is acceptable */ -#define INTERACTIVE_BONUS(lrq) (2*NSEC_PER_MS) -/* - * normalized savings can't be more than MAX_NORMALIZED_SAVINGS - * based on the current configuration - * this means that a class with share 100% will accumulate 10s at most - * while a class with 1% of the share can only accumulate 100ms +#define CLASS_QUANTIZER (18)// shift from ns to increase class bonus +#define PRIORITY_QUANTIZER (2) // how much a high prio task can borrow +#define CKRM_WEIGHT_SHIFT (8) // 1/2^x == finest weight granularity +#define CKRM_MAX_WEIGHT (1<> CKRM_SHARE_ACCURACY) +#define SHARE_TO_WEIGHT(x) ((x) >> (CKRM_SHARE_SHIFT - CKRM_WEIGHT_SHIFT)) +#define WEIGHT_TO_SHARE(x) ((x) << (CKRM_SHARE_SHIFT - CKRM_WEIGHT_SHIFT)) -#define scale_cvt(val,lrq) (val) -#define unscale_cvt(val,lrq) (val) +/* Other constants */ -#endif +#define NSEC_PER_MS (1000000) +#define NSEC_PER_JIFFIES (NSEC_PER_SEC/HZ) +#define MAX_SAVINGS_ABSOLUTE (4LLU*NSEC_PER_SEC) // 4 seconds +#define CVT_UPDATE_TICK ((HZ/2)?:1) +#define MAX_SAVINGS MAX_SAVINGS_ABSOLUTE +#define SAVINGS_LEAK_SPEED (CVT_UPDATE_TICK/10*NSEC_PER_JIFFIES) /** * get_effective_prio: return the effective priority of a class local queue @@ -361,6 +374,7 @@ static inline int get_effective_prio(ckrm_lrq_t * lrq) int prio; prio = lrq->local_cvt >> CLASS_QUANTIZER; // cumulative usage +#define URGENCY_SUPPORT 1 #ifndef URGENCY_SUPPORT #warning "ACB removing urgency calculation from get_effective_prio" #else @@ -414,84 +428,11 @@ static inline unsigned long task_load(struct task_struct* p) } /* - * runqueue load is the local_weight of all the classes on this cpu - * must be called with class_list_lock held + * moved to ckrm_sched.c + * but may need to make it static inline to improve performance */ -static inline unsigned long ckrm_cpu_load(int cpu) -{ - struct ckrm_cpu_class *clsptr; - ckrm_lrq_t* lrq; - struct ckrm_cpu_demand_stat* l_stat; - int total_load = 0; - int load; - - list_for_each_entry(clsptr,&active_cpu_classes,links) { - lrq = get_ckrm_lrq(clsptr,cpu); - l_stat = get_cls_local_stat(clsptr,cpu); - load = lrq->local_weight; - if (l_stat->cpu_demand < load) - load = l_stat->cpu_demand; - total_load += load; - } - return total_load; -} - -static inline void class_enqueue_task(struct task_struct *p, - prio_array_t * array) -{ - ckrm_lrq_t *lrq; - int effective_prio; - - lrq = get_task_lrq(p); - - cpu_demand_event(&p->demand_stat,CPU_DEMAND_ENQUEUE,0); - lrq->lrq_load += task_load(p); - - if ((p->prio < lrq->top_priority) && (array == lrq->active)) - set_top_priority(lrq, p->prio); - - if (! 
cls_in_classqueue(&lrq->classqueue_linkobj)) { - cpu_demand_event(get_task_lrq_stat(p),CPU_DEMAND_ENQUEUE,0); - effective_prio = get_effective_prio(lrq); - classqueue_enqueue(lrq->classqueue, &lrq->classqueue_linkobj, effective_prio); - } - -} - -static inline void class_dequeue_task(struct task_struct *p, - prio_array_t * array) -{ - ckrm_lrq_t *lrq = get_task_lrq(p); - unsigned long load = task_load(p); - - BUG_ON(lrq->lrq_load < load); - lrq->lrq_load -= load; - - cpu_demand_event(&p->demand_stat,CPU_DEMAND_DEQUEUE,0); - - if ((array == lrq->active) && (p->prio == lrq->top_priority) - && list_empty(&(array->queue[p->prio]))) - set_top_priority(lrq, - find_next_bit(array->bitmap, MAX_PRIO, - p->prio)); -} - -/* - * called after a task is switched out. Update the local cvt accounting - * we need to stick with long instead of long long due to nonexistent 64-bit division - */ -static inline void update_local_cvt(struct task_struct *p, unsigned long nsec) -{ - ckrm_lrq_t * lrq = get_task_lrq(p); - - unsigned long cvt_inc = nsec / local_class_weight(lrq); - - lrq->local_cvt += cvt_inc; - lrq->uncounted_ns += nsec; - - update_class_priority(lrq); -} - +void update_local_cvt(struct task_struct *p, unsigned long nsec); + static inline int class_preempts_curr(struct task_struct * p, struct task_struct* curr) { struct cq_node_struct* node1 = &(get_task_lrq(p)->classqueue_linkobj); @@ -518,11 +459,14 @@ static inline int get_ckrm_rand(unsigned long val) return rand; } -void update_class_cputime(int this_cpu); +void update_class_cputime(int this_cpu, int idle); /**********************************************/ /* PID_LOAD_BALANCING */ /**********************************************/ + +#define CPU_PID_CTRL_TICK 32 + struct ckrm_load_struct { unsigned long load_p; /*propotional*/ unsigned long load_i; /*integral */ @@ -538,26 +482,12 @@ static inline void ckrm_load_init(ckrm_load_t* ckrm_load) { } void ckrm_load_sample(ckrm_load_t* ckrm_load,int cpu); -long pid_get_pressure(ckrm_load_t* ckrm_load, int local_group); +long ckrm_get_pressure(ckrm_load_t* ckrm_load, int local_group); #define rq_ckrm_load(rq) (&((rq)->ckrm_load)) -static inline void ckrm_sched_tick(unsigned long j,int this_cpu,struct ckrm_load_struct* ckrm_load) -{ - read_lock(&class_list_lock); - -#ifdef CONFIG_SMP - ckrm_load_sample(ckrm_load,this_cpu); -#endif - if (! (j % CVT_UPDATE_TICK)) { - // printk("ckrm_sched j=%lu\n",j); - classqueue_update_base(get_cpu_classqueue(this_cpu)); - update_class_cputime(this_cpu); - } +#endif /*CONFIG_CKRM_CPU_SCHEDULE */ - read_unlock(&class_list_lock); -} +#endif -#endif //CONFIG_CKRM_CPU_SCHEDULE -#endif diff --git a/include/linux/ckrm_tc.h b/include/linux/ckrm_tc.h index 5650dd3c3..0caa797e7 100644 --- a/include/linux/ckrm_tc.h +++ b/include/linux/ckrm_tc.h @@ -1,3 +1,17 @@ +/* include/linux/ckrm_tc.h - general definitions for the CKRM TaskClass + * + * Copyright (C) Hubertus Franke, IBM Corp. 2004 + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. 
+ * + */ + +#ifndef _CKRM_TC_H +#define _CKRM_TC_H + #include #define TASK_CLASS_TYPE_NAME "taskclass" @@ -11,3 +25,5 @@ typedef struct ckrm_task_class { #define TC_MF_IDX 0 extern int ckrm_forced_reclassify_pid(int pid, struct ckrm_task_class *cls); + +#endif // _CKRM_TC_H diff --git a/include/linux/fs.h b/include/linux/fs.h index ece31a727..11067b72d 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1603,6 +1603,15 @@ static inline void free_secdata(void *secdata) asmlinkage int sys_ioprio_set(int ioprio); asmlinkage int sys_ioprio_get(void); +/* common structure for cfq & ckrm I/O controller */ +typedef struct cfqlim { + int nskip; + unsigned long navsec; + int timedout; + atomic_t sectorate; + u64 sec[2]; +} cfqlim_t ; + #endif /* __KERNEL__ */ #endif /* _LINUX_FS_H */ diff --git a/include/linux/kexec.h b/include/linux/kexec.h new file mode 100644 index 000000000..8bd6c6b91 --- /dev/null +++ b/include/linux/kexec.h @@ -0,0 +1,56 @@ +#ifndef LINUX_KEXEC_H +#define LINUX_KEXEC_H + +#ifdef CONFIG_KEXEC +#include +#include +#include + +/* + * This structure is used to hold the arguments that are used when loading + * kernel binaries. + */ + +typedef unsigned long kimage_entry_t; +#define IND_DESTINATION 0x1 +#define IND_INDIRECTION 0x2 +#define IND_DONE 0x4 +#define IND_SOURCE 0x8 + +#define KEXEC_SEGMENT_MAX 8 +struct kexec_segment { + void *buf; + size_t bufsz; + void *mem; + size_t memsz; +}; + +struct kimage { + kimage_entry_t head; + kimage_entry_t *entry; + kimage_entry_t *last_entry; + + unsigned long destination; + + unsigned long start; + struct page *control_code_page; + + unsigned long nr_segments; + struct kexec_segment segment[KEXEC_SEGMENT_MAX]; + + struct list_head control_pages; + struct list_head dest_pages; + struct list_head unuseable_pages; +}; + + +/* kexec interface functions */ +extern void machine_kexec(struct kimage *image); +extern int machine_kexec_prepare(struct kimage *image); +extern void machine_kexec_cleanup(struct kimage *image); +extern asmlinkage long sys_kexec(unsigned long entry, long nr_segments, + struct kexec_segment *segments); +extern struct page *kimage_alloc_control_pages(struct kimage *image, unsigned int order); +extern struct kimage *kexec_image; +#endif +#endif /* LINUX_KEXEC_H */ diff --git a/include/linux/mm.h b/include/linux/mm.h index 3fb18934a..83c64bb32 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -581,7 +581,7 @@ int clear_page_dirty_for_io(struct page *page); */ typedef int (*shrinker_t)(int nr_to_scan, unsigned int gfp_mask); -extern long do_mprotect(struct mm_struct *mm, unsigned long start, +asmlinkage long do_mprotect(struct mm_struct *mm, unsigned long start, size_t len, unsigned long prot); /* diff --git a/include/linux/netfilter_ipv4/ip_conntrack.h b/include/linux/netfilter_ipv4/ip_conntrack.h index a325de54c..f2ded1156 100644 --- a/include/linux/netfilter_ipv4/ip_conntrack.h +++ b/include/linux/netfilter_ipv4/ip_conntrack.h @@ -52,19 +52,23 @@ enum ip_conntrack_status { #include #include +#include /* per conntrack: protocol private data */ union ip_conntrack_proto { /* insert conntrack proto private data here */ + struct ip_ct_gre gre; struct ip_ct_tcp tcp; struct ip_ct_icmp icmp; }; union ip_conntrack_expect_proto { /* insert expect proto private data here */ + struct ip_ct_gre_expect gre; }; /* Add protocol helper include file here */ +#include #include #include #include @@ -72,6 +76,7 @@ union ip_conntrack_expect_proto { /* per expectation: application helper private data */ union 
ip_conntrack_expect_help { /* insert conntrack helper private data (expect) here */ + struct ip_ct_pptp_expect exp_pptp_info; struct ip_ct_amanda_expect exp_amanda_info; struct ip_ct_ftp_expect exp_ftp_info; struct ip_ct_irc_expect exp_irc_info; @@ -86,16 +91,19 @@ union ip_conntrack_expect_help { /* per conntrack: application helper private data */ union ip_conntrack_help { /* insert conntrack helper private data (master) here */ + struct ip_ct_pptp_master ct_pptp_info; struct ip_ct_ftp_master ct_ftp_info; struct ip_ct_irc_master ct_irc_info; }; #ifdef CONFIG_IP_NF_NAT_NEEDED #include +#include /* per conntrack: nat application helper private data */ union ip_conntrack_nat_help { /* insert nat helper private data here */ + struct ip_nat_pptp nat_pptp_info; }; #endif @@ -157,6 +165,12 @@ struct ip_conntrack_expect union ip_conntrack_expect_help help; }; +struct ip_conntrack_counter +{ + u_int64_t packets; + u_int64_t bytes; +}; + struct ip_conntrack_helper; struct ip_conntrack @@ -174,6 +188,11 @@ struct ip_conntrack /* Timer function; drops refcnt when it goes off. */ struct timer_list timeout; +#ifdef CONFIG_IP_NF_CT_ACCT + /* Accounting Information (same cache line as other written members) */ + struct ip_conntrack_counter counters[IP_CT_DIR_MAX]; +#endif + /* If we're expecting another related connection, this will be in expected linked list */ struct list_head sibling_list; @@ -249,8 +268,10 @@ extern int invert_tuplepr(struct ip_conntrack_tuple *inverse, const struct ip_conntrack_tuple *orig); /* Refresh conntrack for this many jiffies */ -extern void ip_ct_refresh(struct ip_conntrack *ct, - unsigned long extra_jiffies); +extern void ip_ct_refresh_acct(struct ip_conntrack *ct, + enum ip_conntrack_info ctinfo, + const struct sk_buff *skb, + unsigned long extra_jiffies); /* These are for NAT. Icky. */ /* Call me when a conntrack is destroyed. */ diff --git a/include/linux/netfilter_ipv4/ip_conntrack_tuple.h b/include/linux/netfilter_ipv4/ip_conntrack_tuple.h index 1e7691189..d2bd0be99 100644 --- a/include/linux/netfilter_ipv4/ip_conntrack_tuple.h +++ b/include/linux/netfilter_ipv4/ip_conntrack_tuple.h @@ -14,7 +14,7 @@ union ip_conntrack_manip_proto { /* Add other protocols here. */ - u_int16_t all; + u_int32_t all; struct { u_int16_t port; @@ -25,6 +25,9 @@ union ip_conntrack_manip_proto struct { u_int16_t id; } icmp; + struct { + u_int32_t key; + } gre; }; /* The manipulable part of the tuple. */ @@ -44,7 +47,7 @@ struct ip_conntrack_tuple u_int32_t ip; union { /* Add other protocols here. */ - u_int16_t all; + u_int32_t all; struct { u_int16_t port; @@ -55,6 +58,9 @@ struct ip_conntrack_tuple struct { u_int8_t type, code; } icmp; + struct { + u_int32_t key; + } gre; } u; /* The protocol. */ @@ -80,10 +86,16 @@ enum ip_conntrack_dir #ifdef __KERNEL__ #define DUMP_TUPLE(tp) \ -DEBUGP("tuple %p: %u %u.%u.%u.%u:%hu -> %u.%u.%u.%u:%hu\n", \ +DEBUGP("tuple %p: %u %u.%u.%u.%u:%u -> %u.%u.%u.%u:%u\n", \ (tp), (tp)->dst.protonum, \ - NIPQUAD((tp)->src.ip), ntohs((tp)->src.u.all), \ - NIPQUAD((tp)->dst.ip), ntohs((tp)->dst.u.all)) + NIPQUAD((tp)->src.ip), ntohl((tp)->src.u.all), \ + NIPQUAD((tp)->dst.ip), ntohl((tp)->dst.u.all)) + +#define DUMP_TUPLE_RAW(x) \ + DEBUGP("tuple %p: %u %u.%u.%u.%u:0x%08x -> %u.%u.%u.%u:0x%08x\n",\ + (x), (x)->dst.protonum, \ + NIPQUAD((x)->src.ip), ntohl((x)->src.u.all), \ + NIPQUAD((x)->dst.ip), ntohl((x)->dst.u.all)) #define CTINFO2DIR(ctinfo) ((ctinfo) >= IP_CT_IS_REPLY ? 
IP_CT_DIR_REPLY : IP_CT_DIR_ORIGINAL) diff --git a/include/linux/reboot.h b/include/linux/reboot.h index d60fafc8b..5460e94a1 100644 --- a/include/linux/reboot.h +++ b/include/linux/reboot.h @@ -51,6 +51,8 @@ extern void machine_restart(char *cmd); extern void machine_halt(void); extern void machine_power_off(void); +extern void machine_shutdown(void); + #endif #endif /* _LINUX_REBOOT_H */ diff --git a/include/linux/sched.h b/include/linux/sched.h index dd5005295..eda93cb65 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -607,7 +607,6 @@ struct task_struct { spinlock_t ckrm_tsklock; void *ce_data; #ifdef CONFIG_CKRM_TYPE_TASKCLASS - // .. Hubertus should change to CONFIG_CKRM_TYPE_TASKCLASS struct ckrm_task_class *taskclass; struct list_head taskclass_link; #ifdef CONFIG_CKRM_CPU_SCHEDULE diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 111bb7367..5156e432d 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -1106,6 +1106,20 @@ extern void skb_copy_and_csum_dev(const struct sk_buff *skb, u8 *to); extern void skb_split(struct sk_buff *skb, struct sk_buff *skb1, const u32 len); +static inline void *skb_header_pointer(const struct sk_buff *skb, int offset, + int len, void *buffer) +{ + int hlen = skb_headlen(skb); + + if (offset + len <= hlen) + return skb->data + offset; + + if (skb_copy_bits(skb, offset, buffer, len) < 0) + return NULL; + + return buffer; +} + extern void skb_init(void); extern void skb_add_mtu(int mtu); diff --git a/init/Kconfig b/init/Kconfig index 64ca2fcb7..5d28bb7df 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -214,6 +214,18 @@ config CKRM_MEM_LRUORDER_CHANGE Changing this to yes reduces the checking overhead but violates the approximate LRU order that is maintained by the paging subsystem. +config CKRM_CPU_SCHEDULE_AT_BOOT + bool "Turn on at boot time" + depends on CKRM_CPU_SCHEDULE + default n + help + Enable CKRM CPU Scheduler at boot time. Otherwise + it can be turned on dynamically at runtime. If not + turned on the default Linux Scheduler behavior + will be obtained. 
+ + Say N if unsure, Y to use this feature + config CKRM_TYPE_SOCKETCLASS bool "Class Manager for socket groups" depends on CKRM diff --git a/kernel/.cvsignore b/kernel/.cvsignore new file mode 100644 index 000000000..21426e906 --- /dev/null +++ b/kernel/.cvsignore @@ -0,0 +1,2 @@ +config_data.gz +config_data.h diff --git a/kernel/Makefile b/kernel/Makefile index ec5001052..455ec1eae 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -23,6 +23,7 @@ obj-$(CONFIG_MODULE_SIG) += module-verify.o obj-$(CONFIG_KALLSYMS) += kallsyms.o obj-$(CONFIG_PM) += power/ obj-$(CONFIG_BSD_PROCESS_ACCT) += acct.o +obj-$(CONFIG_KEXEC) += kexec.o obj-$(CONFIG_COMPAT) += compat.o obj-$(CONFIG_IKCONFIG) += configs.o obj-$(CONFIG_IKCONFIG_PROC) += configs.o diff --git a/kernel/ckrm/Makefile b/kernel/ckrm/Makefile index b32530977..4956dcb3a 100644 --- a/kernel/ckrm/Makefile +++ b/kernel/ckrm/Makefile @@ -8,6 +8,6 @@ endif obj-$(CONFIG_CKRM_TYPE_TASKCLASS) += ckrm_tc.o obj-$(CONFIG_CKRM_RES_NUMTASKS) += ckrm_numtasks.o obj-$(CONFIG_CKRM_TYPE_SOCKETCLASS) += ckrm_sockc.o - obj-$(CONFIG_CKRM_RES_LISTENAQ) += ckrm_laq.o + obj-$(CONFIG_CKRM_RES_LISTENAQ) += ckrm_listenaq.o obj-$(CONFIG_CKRM_CPU_SCHEDULE) += ckrm_cpu_class.o ckrm_cpu_monitor.o obj-$(CONFIG_CKRM_RES_MEM) += ckrm_mem.o diff --git a/kernel/ckrm/ckrm.c b/kernel/ckrm/ckrm.c index f1cfb268c..e732fdf53 100644 --- a/kernel/ckrm/ckrm.c +++ b/kernel/ckrm/ckrm.c @@ -82,6 +82,7 @@ inline unsigned int is_res_regd(struct ckrm_classtype *clstype, int resid) ); } +static struct ckrm_res_ctlr *ckrm_resctlr_lookup(struct ckrm_classtype *clstype, const char *resname) { @@ -101,10 +102,8 @@ struct ckrm_res_ctlr *ckrm_resctlr_lookup(struct ckrm_classtype *clstype, return NULL; } -EXPORT_SYMBOL(ckrm_resctlr_lookup); - /* given a classname return the class handle and its classtype*/ -void *ckrm_classobj(char *classname, int *classTypeID) +void *ckrm_classobj(const char *classname, int *classTypeID) { int i; @@ -864,7 +863,10 @@ int ckrm_class_show_shares(struct ckrm_core_class *core, struct seq_file *seq) atomic_inc(&clstype->nr_resusers[i]); rcbs = clstype->res_ctlrs[i]; if (rcbs && rcbs->get_share_values) { - (*rcbs->get_share_values) (core->res_class[i], &shares); + int rc = (*rcbs->get_share_values)(core->res_class[i], + &shares); + if (rc == -ENOSYS) + continue; seq_printf(seq,"res=%s,guarantee=%d,limit=%d," "total_guarantee=%d,max_limit=%d\n", rcbs->res_name, shares.my_guarantee, diff --git a/kernel/ckrm/ckrm_cpu_class.c b/kernel/ckrm/ckrm_cpu_class.c index 917875b18..1bf482f21 100644 --- a/kernel/ckrm/ckrm_cpu_class.c +++ b/kernel/ckrm/ckrm_cpu_class.c @@ -22,9 +22,35 @@ #include #include #include +#include + +#define CPU_CTRL_NAME "cpu" struct ckrm_res_ctlr cpu_rcbs; +#define CKRM_CPU_USAGE_DETAIL_MAX 3 +static int usage_detail = 3; /* 0: show usage + * 1: show settings + * 2: show effectives + * 3: show per runqueue stats + */ + +static int ckrm_cpu_set_mode(enum ckrm_sched_mode mode); + +/* + * update effective share setting after: + * -- remove class + * -- change class share + * we don't need to call update_effectives() when add new class since + * the defaults grt of new class is 0 + * CAUTION: might need a lock here + */ +static inline void update_class_effectives(void) +{ + // update_effectives(); + ckrm_cpu_monitor(0); +} + /** * insert_cpu_class - insert a class to active_cpu_class list * @@ -38,49 +64,81 @@ static inline void insert_cpu_class(struct ckrm_cpu_class *cls) /* * initialize a class object and its local queues */ + +CVT_t 
get_min_cvt_locking(int cpu); +ckrm_lrq_t *rq_get_dflt_lrq(int cpu); + +static void init_cpu_class_lrq(struct ckrm_cpu_class *cls, + int cpu, int isdflt) +{ + int j,k; + ckrm_lrq_t *queue = cls->local_queues[cpu]; + + queue->active = queue->arrays; + queue->expired = queue->arrays+1; + + for (j = 0; j < 2; j++) { + prio_array_t *array = queue->arrays + j; + for (k = 0; k < MAX_PRIO; k++) { + INIT_LIST_HEAD(array->queue + k); + __clear_bit(k, array->bitmap); + } + // delimiter for bitsearch + __set_bit(MAX_PRIO, array->bitmap); + array->nr_active = 0; + } + + queue->expired_timestamp = 0; + queue->best_expired_prio = MAX_PRIO; + + queue->cpu_class = cls; + queue->classqueue = get_cpu_classqueue(cpu); + queue->top_priority = MAX_PRIO; + cq_node_init(&queue->classqueue_linkobj); + queue->local_cvt = isdflt ? 0 : get_min_cvt_locking(cpu); + queue->lrq_load = 0; + queue->local_weight = cpu_class_weight(cls); + if (queue->local_weight == 0) + queue->local_weight = 1; + queue->over_weight = 0; + queue->skewed_weight = CKRM_MAX_WEIGHT/2; /*otherwise class might starve on start*/ + queue->uncounted_ns = 0; + queue->savings = 0; + queue->magic = CKRM_LRQ_MAGIC; +} + void init_cpu_class(struct ckrm_cpu_class *cls,ckrm_shares_t* shares) { - int i,j,k; - prio_array_t *array; - ckrm_lrq_t* queue; + int i; + int isdflt; + struct ckrm_cpu_class *dfltcls; + + dfltcls = get_default_cpu_class(); + + isdflt = (cls==dfltcls); cls->shares = *shares; cls->cnt_lock = SPIN_LOCK_UNLOCKED; - ckrm_cpu_stat_init(&cls->stat); + ckrm_cpu_stat_init(&cls->stat,isdflt ? CKRM_SHARE_MAX : 1); ckrm_usage_init(&cls->usage); cls->magic = CKRM_CPU_CLASS_MAGIC; - for (i = 0 ; i < NR_CPUS ; i++) { - queue = &cls->local_queues[i]; - queue->active = queue->arrays; - queue->expired = queue->arrays+1; - - for (j = 0; j < 2; j++) { - array = queue->arrays + j; - for (k = 0; k < MAX_PRIO; k++) { - INIT_LIST_HEAD(array->queue + k); - __clear_bit(k, array->bitmap); - } - // delimiter for bitsearch - __set_bit(MAX_PRIO, array->bitmap); - array->nr_active = 0; + memset(cls->local_queues,0,NR_CPUS*sizeof(ckrm_lrq_t*)); + + if (isdflt) { + for (i=0; i< NR_CPUS; i++) { + cls->local_queues[i] = rq_get_dflt_lrq(i); + init_cpu_class_lrq(cls,i,1); + } + } else { + for_each_cpu(i) { + cls->local_queues[i] = kmalloc(sizeof(ckrm_lrq_t), + GFP_KERNEL); + BUG_ON(cls->local_queues[i]==NULL); + init_cpu_class_lrq(cls,i,0); } - - queue->expired_timestamp = 0; - - queue->cpu_class = cls; - queue->classqueue = get_cpu_classqueue(i); - queue->top_priority = MAX_PRIO; - cq_node_init(&queue->classqueue_linkobj); - queue->local_cvt = 0; - queue->lrq_load = 0; - queue->local_weight = cpu_class_weight(cls); - queue->uncounted_ns = 0; - queue->savings = 0; - queue->magic = 0x43FF43D7; } - // add to class list write_lock(&class_list_lock); insert_cpu_class(cls); write_unlock(&class_list_lock); @@ -100,14 +158,14 @@ struct ckrm_cpu_class * ckrm_get_cpu_class(struct ckrm_core_class *core) { struct ckrm_cpu_class * cls; cls = ckrm_get_res_class(core, cpu_rcbs.resid, struct ckrm_cpu_class); - if (valid_cpu_class(cls)) - return cls; + if (valid_cpu_class(cls)) + return (ckrm_cpu_enabled() ? 
cls : get_default_cpu_class()); else return NULL; } - -void* ckrm_alloc_cpu_class(struct ckrm_core_class *core, struct ckrm_core_class *parent) +void* ckrm_alloc_cpu_class(struct ckrm_core_class *core, + struct ckrm_core_class *parent) { struct ckrm_cpu_class *cls; @@ -128,7 +186,7 @@ void* ckrm_alloc_cpu_class(struct ckrm_core_class *core, struct ckrm_core_class set_default_share(&shares); init_cpu_class(cls,&shares); cls->core = core; - cls->parent = parent; + cls->parent = parent; } } else printk(KERN_ERR"alloc_cpu_class failed\n"); @@ -136,15 +194,14 @@ void* ckrm_alloc_cpu_class(struct ckrm_core_class *core, struct ckrm_core_class return cls; } -/* - * hzheng: this is not a stable implementation - * need to check race condition issue here - */ +void ckrm_cpu_class_queue_delete_sync(struct ckrm_cpu_class *clsptr); + static void ckrm_free_cpu_class(void *my_res) { struct ckrm_cpu_class *cls = my_res, *parres, *childres; ckrm_core_class_t *child = NULL; int maxlimit; + int i; if (!cls) return; @@ -179,10 +236,19 @@ static void ckrm_free_cpu_class(void *my_res) list_del(&cls->links); write_unlock(&class_list_lock); + ckrm_cpu_class_queue_delete_sync(cls); + + for_each_cpu(i) { + ckrm_lrq_t *lrq = get_ckrm_lrq(cls,i); + if (!lrq) continue; + lrq->magic = -99; + kfree(lrq); + } kfree(cls); - //call ckrm_cpu_monitor after class removed - ckrm_cpu_monitor(0); + //call ckrm_cpu_monitor after class is removed + if (ckrm_cpu_enabled()) + update_class_effectives(); } /* @@ -194,8 +260,12 @@ int ckrm_cpu_set_share(void *my_res, struct ckrm_shares *new_share) struct ckrm_shares *cur = &cls->shares, *par; int rc = -EINVAL; - if (!cls) - return rc; + if (ckrm_cpu_disabled()) + return -ENOSYS; + if (!cls) + return rc; + if (new_share->total_guarantee > CKRM_SHARE_MAX) + return -E2BIG; if (cls->parent) { parres = ckrm_get_cpu_class(cls->parent); @@ -215,7 +285,7 @@ int ckrm_cpu_set_share(void *my_res, struct ckrm_shares *new_share) new_share->my_guarantee = 0; rc = set_shares(new_share, cur, par); - if (cur->my_limit == CKRM_SHARE_DONTCARE) + if (!rc && cur->my_limit == CKRM_SHARE_DONTCARE) cur->my_limit = cur->max_limit; @@ -225,7 +295,7 @@ int ckrm_cpu_set_share(void *my_res, struct ckrm_shares *new_share) } //call ckrm_cpu_monitor after changes are changed - ckrm_cpu_monitor(0); + update_class_effectives(); return rc; } @@ -235,22 +305,90 @@ static int ckrm_cpu_get_share(void *my_res, { struct ckrm_cpu_class *cls = my_res; - if (!cls) + if (ckrm_cpu_disabled()) + return -ENOSYS; + if (!cls) return -EINVAL; + *shares = cls->shares; return 0; } +/* + * get_ckrm_usage(): + * obtain a sequence of usage informations + * returns number of usages reported. + * + * report IN: specifies the sequence of jiffies for which to report + * must be ordered (smallest first) + * OUT: returns the usage in each field + * + */ + + +int ckrm_cpu_get_usage(struct ckrm_cpu_class* clsptr, + int num, ulong report[]) +{ + struct ckrm_usage* usage = &clsptr->usage; + unsigned long long total = 0; + int i, idx, cur, num_ofs; + + num_ofs = cur = i = 0; + idx = usage->sample_pointer; + + for ( num_ofs = 0; num_ofs < num ; num_ofs++ ) { + int nr_samples; + int duration = report[num_ofs]; + unsigned long long totval = 0; + + nr_samples = duration/USAGE_SAMPLE_FREQ?:1; + + if (nr_samples > USAGE_MAX_HISTORY) + nr_samples = USAGE_MAX_HISTORY; + + for ( ; i< nr_samples; i++) { + if (! 
idx) + idx = USAGE_MAX_HISTORY; + idx --; + total += usage->samples[idx]; + } + totval = total * 1000; + do_div(totval,NS_PER_SAMPLE); + do_div(totval,nr_samples * cpus_weight(cpu_online_map)); + report[num_ofs] = totval; + } + + return num; +} + int ckrm_cpu_get_stats(void *my_res, struct seq_file * sfile) { struct ckrm_cpu_class *cls = my_res; struct ckrm_cpu_class_stat* stat = &cls->stat; ckrm_lrq_t* lrq; int i; + ulong usage[3] = { 2*HZ, 10*HZ, 60*HZ }; - if (!cls) + if (!cls || ckrm_cpu_disabled()) return -EINVAL; + ckrm_cpu_get_usage(cls,3,usage); + + /* this will after full stabilization become the only cpu usage stats + */ + + seq_printf(sfile, "cpu-usage(2,10,60)= %lu %lu %lu\n", + usage[0],usage[1],usage[2]); + + if (usage_detail < 1) + return 0; + + /* the extended statistics we can decide whether we want to make the + * additional statistics available over config options + * eitherway they should be reported in a more concised form + * during stabilization, this is OK + */ + seq_printf(sfile, "-------- CPU Class Status Start---------\n"); seq_printf(sfile, "Share:\n\tgrt= %d limit= %d total_grt= %d max_limit= %d\n", cls->shares.my_guarantee, @@ -261,26 +399,35 @@ int ckrm_cpu_get_stats(void *my_res, struct seq_file * sfile) cls->shares.unused_guarantee, cls->shares.cur_max_limit); + if (usage_detail < 2) + goto out; + seq_printf(sfile, "Effective:\n\tegrt= %d\n",stat->egrt); seq_printf(sfile, "\tmegrt= %d\n",stat->megrt); seq_printf(sfile, "\tehl= %d\n",stat->ehl); seq_printf(sfile, "\tmehl= %d\n",stat->mehl); seq_printf(sfile, "\teshare= %d\n",stat->eshare); - seq_printf(sfile, "\tmeshare= %d\n",cpu_class_weight(cls)); + seq_printf(sfile, "\tmeshare= %d\n",stat->meshare); seq_printf(sfile, "\tmax_demand= %lu\n",stat->max_demand); seq_printf(sfile, "\ttotal_ns= %llu\n",stat->total_ns); - seq_printf(sfile, "\tusage(2,10,60)= %d %d %d\n", - get_ckrm_usage(cls,2*HZ), - get_ckrm_usage(cls,10*HZ), - get_ckrm_usage(cls,60*HZ) - ); + seq_printf(sfile, "\tusage(2,10,60)= %lu %lu %lu\n", + usage[0],usage[1],usage[2]); + + if (usage_detail < 3) + goto out; + + /* provide per run queue information */ for_each_online_cpu(i) { lrq = get_ckrm_lrq(cls,i); - seq_printf(sfile, "\tlrq %d demand= %lu weight= %d lrq_load= %lu cvt= %llu sav= %llu\n",i,stat->local_stats[i].cpu_demand,local_class_weight(lrq),lrq->lrq_load,lrq->local_cvt,lrq->savings); + seq_printf(sfile, "\tlrq %d demand= %lu weight= %d " + "lrq_load= %lu cvt= %llu sav= %llu\n", + i,stat->local_stats[i].cpu_demand, + local_class_weight(lrq),lrq->lrq_load, + lrq->local_cvt,lrq->savings); } +out: seq_printf(sfile, "-------- CPU Class Status END ---------\n"); - return 0; } @@ -296,10 +443,34 @@ void ckrm_cpu_change_class(void *task, void *old, void *new) if (!task || ! 
old || !new) return; + if (ckrm_cpu_disabled()) + newcls = get_default_cpu_class(); _ckrm_cpu_change_class(tsk,newcls); } -/*dummy function, not used*/ +enum config_token_t { + config_usage_detail, /* define usage level */ + config_disable, /* always use default linux scheduling */ + /* effectively disables the ckrm scheduler */ + config_enable, /* always uses ckrm scheduling behavior */ + config_err /* parsing error */ +}; + +#define CKRM_SCHED_MODE_DISABLED_STR "disabled" +#define CKRM_SCHED_MODE_ENABLED_STR "enabled" + +static char *ckrm_sched_mode_str[] = { + CKRM_SCHED_MODE_DISABLED_STR, + CKRM_SCHED_MODE_ENABLED_STR +}; + +static match_table_t config_tokens = { + { config_disable, "mode="CKRM_SCHED_MODE_DISABLED_STR }, + { config_enable, "mode="CKRM_SCHED_MODE_ENABLED_STR }, + { config_usage_detail, "usage_detail=%u" }, + { config_err, NULL } +}; + static int ckrm_cpu_show_config(void *my_res, struct seq_file *sfile) { struct ckrm_cpu_class *cls = my_res; @@ -307,23 +478,61 @@ static int ckrm_cpu_show_config(void *my_res, struct seq_file *sfile) if (!cls) return -EINVAL; - seq_printf(sfile, "cls=%s,parameter=somevalue\n","ckrm_cpu class"); + seq_printf(sfile, "res=%s,mode=%s", + CPU_CTRL_NAME,ckrm_sched_mode_str[ckrm_sched_mode]); + if (!ckrm_cpu_disabled()) /* enabled || mixed */ + seq_printf(sfile, ",usage_detail=%u",usage_detail); + seq_printf(sfile,"\n"); return 0; } -/*dummy function, not used*/ static int ckrm_cpu_set_config(void *my_res, const char *cfgstr) { struct ckrm_cpu_class *cls = my_res; + char *p; + char **cfgstr_p = (char**)&cfgstr; + substring_t args[MAX_OPT_ARGS]; + int option,rc; + enum ckrm_sched_mode new_sched_mode; if (!cls) return -EINVAL; - printk(KERN_DEBUG "ckrm_cpu config='%s'\n",cfgstr); - return 0; + + new_sched_mode = ckrm_sched_mode; + rc = 0; + + while ((p = strsep(cfgstr_p, ",")) != NULL) { + int token; + if (!*p) + continue; + + token = match_token(p, config_tokens, args); + switch (token) { + case config_usage_detail: + if (ckrm_cpu_disabled() || + (match_int(&args[0], &option)) || + (option > CKRM_CPU_USAGE_DETAIL_MAX)) + { + return -EINVAL; + } + usage_detail = option; + break; + case config_disable: + new_sched_mode = CKRM_SCHED_MODE_DISABLED; + break; + case config_enable: + new_sched_mode = CKRM_SCHED_MODE_ENABLED; + break; + case config_err: + return -EINVAL; + } + } + rc = ckrm_cpu_set_mode(new_sched_mode); + return rc; } struct ckrm_res_ctlr cpu_rcbs = { - .res_name = "cpu", + .res_name = CPU_CTRL_NAME, .res_hdepth = 1, .resid = -1, .res_alloc = ckrm_alloc_cpu_class, @@ -364,14 +573,69 @@ void init_cpu_classes(void) //init classqueues for each processor for (i=0; i < NR_CPUS; i++) - classqueue_init(get_cpu_classqueue(i)); + classqueue_init(get_cpu_classqueue(i),ckrm_cpu_enabled()); - /* - * hzheng: initialize the default cpu class - * required for E14/E15 since ckrm_init is called after sched_init - */ ckrm_alloc_cpu_class(NULL,NULL); } +void ckrm_cpu_class_queue_update(int on); +void ckrm_cpu_start_monitor(void); +void ckrm_cpu_kill_monitor(void); + +static int ckrm_cpu_set_mode(enum ckrm_sched_mode mode) +{ + struct task_struct *proc, *tsk; + struct ckrm_cpu_class *new_cls = NULL; + int i; + + if (mode == ckrm_sched_mode) + return 0; + + printk("ckrm_cpu_set_mode from <%s> to <%s> pid=%d\n", + ckrm_sched_mode_str[ckrm_sched_mode], + ckrm_sched_mode_str[mode], + current->pid); + + if (mode == CKRM_SCHED_MODE_DISABLED) { + ckrm_cpu_kill_monitor(); + new_cls = get_default_cpu_class(); + } else { + ckrm_cpu_class_queue_update(1); + } + + /* 
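Flipping the scheduling mode at run time means every existing task has to be moved to the matching class (the default class when disabling, the task's own CPU class when re-enabling), so we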
run twice through the list to catch everyone, + * current and transient once + */ + + read_lock(&tasklist_lock); + + ckrm_sched_mode = mode; + /* we have to run through the list twice + * first catch all existing tasks + * and then deal with some potential race condition + */ + for ( i=2 ; i-- ; ) { + /* lock class_list_lock ? */ + + do_each_thread(proc, tsk) { + if (mode == CKRM_SCHED_MODE_ENABLED) { + new_cls = ckrm_get_res_class(class_core(tsk->taskclass), + cpu_rcbs.resid, + struct ckrm_cpu_class); + } + _ckrm_cpu_change_class(tsk,new_cls); + } while_each_thread(proc, tsk); + } + read_unlock(&tasklist_lock); + + if (mode == CKRM_SCHED_MODE_DISABLED) + ckrm_cpu_class_queue_update(0); + else + ckrm_cpu_start_monitor(); + return 0; +} EXPORT_SYMBOL(ckrm_get_cpu_class); + + + diff --git a/kernel/ckrm/ckrm_cpu_monitor.c b/kernel/ckrm/ckrm_cpu_monitor.c index d8c199a20..d8d6bd307 100644 --- a/kernel/ckrm/ckrm_cpu_monitor.c +++ b/kernel/ckrm/ckrm_cpu_monitor.c @@ -28,21 +28,30 @@ #include #include +// #define CONFIG_CKRM_SUPPORT_MAXLIMITS + #define CPU_MONITOR_INTERVAL (HZ) /*how often do we adjust the shares*/ -#define CKRM_SHARE_MAX (1<shares.unused_guarantee; +} + static inline int get_soft_limit(struct ckrm_cpu_class *cls) { return cls->shares.my_limit; @@ -63,6 +72,57 @@ static inline int get_myhard_limit(struct ckrm_cpu_class *cls) return cls->shares.total_guarantee; } +static inline void set_eshare(struct ckrm_cpu_class_stat *stat, + int new_share) +{ + if (!new_share) + new_share = 1; + + BUG_ON(new_share < 0); + stat->eshare = new_share; +} + +static inline void set_meshare(struct ckrm_cpu_class_stat *stat, + int new_share) +{ + if (!new_share) + new_share = 1; + + BUG_ON(new_share < 0); + stat->meshare = new_share; +} + +/** + *get_self_cpu_demand - get cpu demand of the class itself (excluding children) + * + * self_cpu_demand = sum(cpu demand of all local queues) + */ +static inline unsigned long get_self_cpu_demand(struct ckrm_cpu_class_stat *stat) +{ + int cpu_demand = 0; + int i; + int cpuonline = 0; + + for_each_online_cpu(i) { + cpu_demand_check_sleep(stat,i); + cpu_demand += stat->local_stats[i].cpu_demand; + cpuonline ++; + } + + return (cpu_demand/cpuonline); +} + +/* + * my max demand = min(cpu_demand, my effective hard limit) + */ +static inline unsigned long get_mmax_demand(struct ckrm_cpu_class_stat* stat) +{ + unsigned long mmax_demand = get_self_cpu_demand(stat); + if (mmax_demand > stat->mehl) + mmax_demand = stat->mehl; + + return mmax_demand; +} static inline void cpu_demand_stat_init(struct ckrm_cpu_demand_stat* local_stat, int type) { @@ -85,7 +145,7 @@ static inline void cpu_demand_stat_init(struct ckrm_cpu_demand_stat* local_stat, } } -void ckrm_cpu_stat_init(struct ckrm_cpu_class_stat *stat) +void ckrm_cpu_stat_init(struct ckrm_cpu_class_stat *stat, int eshares) { int i; @@ -93,7 +153,7 @@ void ckrm_cpu_stat_init(struct ckrm_cpu_class_stat *stat) stat->total_ns = 0; stat->max_demand = 0; - for (i=0; i< NR_CPUS; i++) { + for (i=0; ilocal_stats[i],CPU_DEMAND_TP_CLASS); } @@ -102,10 +162,517 @@ void ckrm_cpu_stat_init(struct ckrm_cpu_class_stat *stat) stat->ehl = CKRM_SHARE_MAX; /*default: no limit*/ stat->mehl = CKRM_SHARE_MAX; /*default: no limit */ - stat->eshare = CKRM_SHARE_MAX; - stat->meshare = CKRM_SHARE_MAX; + stat->eshare = eshares; + stat->meshare = eshares; + + stat->has_savings = 0; + stat->demand_per_share = 0; + +} + +#if 0 // keep handy for debugging if necessary +void ckrm_cpu_class_dump(struct ckrm_cpu_class *clsptr,int num) +{ + struct 
ckrm_cpu_class_stat* stat = &clsptr->stat; + printk("%d> %p[%d] mg=%d lim=%d tg=%d maxlim=%d ug=%d\n",num, + clsptr, (clsptr == get_default_cpu_class()), + clsptr->shares.my_guarantee, + clsptr->shares.my_limit, + clsptr->shares.total_guarantee, + clsptr->shares.max_limit, + clsptr->shares.unused_guarantee); + printk(" egrt=%d megrt=%d ehl=%d mehl=%d esh=%d mesh=%d\n", + stat->egrt,stat->megrt,stat->ehl,stat->mehl, + stat->eshare,stat->meshare); +} +#endif + +/**********************************************/ +/* surplus allocation */ +/**********************************************/ + +/* + * surplus = egrt - demand + * if surplus < 0, surplus = 0 + */ +static inline int get_node_surplus(struct ckrm_cpu_class *cls) +{ + int surplus = cls->stat.egrt - cls->stat.max_demand; + + if (surplus < 0) + surplus = 0; + + return surplus; +} + +/* + * consume savings in advance because this class give surplus to others + * this is a quick hack, should be integrated with balance_savings() + */ +static inline void consumed_surplus_savings(struct ckrm_cpu_class *clsptr, + int savings_consumed) +{ + long long total_savings; + ckrm_lrq_t* lrq; + int i; + int cpu_online = 0; + + total_savings = 0; + for_each_online_cpu(i) { + lrq = get_ckrm_lrq(clsptr,i); + total_savings += lrq->savings; + cpu_online ++; + } + + total_savings -= savings_consumed; + if (total_savings < 0) + total_savings = 0; + + //get the average savings + do_div(total_savings,cpu_online); + for_each_online_cpu(i) { + lrq = get_ckrm_lrq(clsptr,i); + lrq->savings = total_savings; + } +} + +static inline int get_my_node_surplus(struct ckrm_cpu_class *cls) +{ + int surplus = cls->stat.megrt - get_mmax_demand(&cls->stat); + int savings_consumed; + + if (surplus < 0) + surplus = 0; + + /* + * a quick hack about the hierarchy savings distribution + * may not be the right way to do + * + * since this node give its surplus to other nodes, + * it's savings should be consumed + * suppose CPU_MONITOR_INTERVAL = (HZ) + * savings_consumed is roughly how much savings will be consumed for the next second + */ + if (surplus) { + savings_consumed = surplus * HZ * (NSEC_PER_MS >> CKRM_SHARE_SHIFT); + consumed_surplus_savings(cls, savings_consumed) ; + } + + return surplus; +} + +/* + * all the class in the queue consume the surplus in order + * each class consume the amount propotional to its egrt + */ +static int consume_surplus_in_order(struct list_head* queue, + struct ckrm_cpu_class *p_cls, + int total_surplus) +{ + int total_grt = 0; + struct ckrm_cpu_class *clsptr; + + /* + * get total_grt of the classes in the queue + * total_grt can be maintained instead of re-calcuated each time + */ + list_for_each_entry(clsptr,queue,surplus_queue) { + if (unlikely(clsptr == p_cls)) + total_grt += clsptr->stat.megrt; + else + total_grt += clsptr->stat.egrt; + } + + if (! total_grt) + goto consume_out; + + //allocate in order + list_for_each_entry(clsptr,queue,surplus_queue) { + int surplus_per_share; + int consumed, my_grt; + + BUG_ON(! 
total_grt); + surplus_per_share = + (total_surplus << CKRM_SHARE_SHIFT) / total_grt; + + if (surplus_per_share <= 0) + break; + + if (unlikely(clsptr == p_cls)) //self_node consuming + my_grt = clsptr->stat.megrt; + else + my_grt = clsptr->stat.egrt; + + BUG_ON(clsptr->stat.demand_per_share <= 0); + + if (clsptr->stat.demand_per_share < surplus_per_share) + surplus_per_share = clsptr->stat.demand_per_share; + + consumed = surplus_per_share * my_grt; + consumed >>= CKRM_SHARE_SHIFT; + total_surplus -= consumed; + BUG_ON(total_surplus < 0); + total_grt -= my_grt; + + if (unlikely(clsptr == p_cls)) + set_meshare(&clsptr->stat,clsptr->stat.meshare + consumed); + else + set_eshare(&clsptr->stat,clsptr->stat.eshare + consumed); + } + consume_out: + if (total_surplus <= 1) //if total_suplus too small, no need to allocate again + total_surplus = 0; + return total_surplus; +} + +/* + * link all the children of parent and the parent itself using their surplus_queue field + * link the whole queue using src_queue + * if anything wrong return -1 + */ +static int get_class_surplus_queue(struct ckrm_core_class *parent, + struct list_head* src_queue) +{ + struct ckrm_core_class *child_core = NULL; + struct ckrm_cpu_class *p_cls,*c_cls; + int ret = -1; + + p_cls = ckrm_get_cpu_class(parent); + if (! p_cls) + goto link_out; + + INIT_LIST_HEAD(src_queue); + + //add the parent node itself + list_add(&p_cls->surplus_queue,src_queue); + do { + child_core = ckrm_get_next_child(parent, child_core); + if (child_core) { + c_cls = ckrm_get_cpu_class(child_core); + if (! c_cls) + goto link_out; + list_add(&c_cls->surplus_queue,src_queue); + } + } while (child_core); + + ret = 0; + + link_out: + return ret; +} + +/* + * insert the class to queue based on stat->demand_per_share + * status: tested + */ +static void insert_surplus_queue(struct list_head* queue, struct ckrm_cpu_class *clsptr) +{ + struct ckrm_cpu_class *cur_cls = NULL; + int end_of_queue = 1; + + list_for_each_entry(cur_cls,queue,surplus_queue) { + if (cur_cls->stat.demand_per_share >= clsptr->stat.demand_per_share) { + end_of_queue = 0; + break; + } + } + + //insert the clsptr + if (! cur_cls || end_of_queue) + list_add_tail(&clsptr->surplus_queue,queue); + else + list_add_tail(&clsptr->surplus_queue,&cur_cls->surplus_queue); +} + +/* + * copy all classes in src_queue to dst_queue, + * reorder the classes based on their normalized demand + * if a class already saturate (eshare >= demand), also remove it from src_queue + * return the total guarantee of the selected classes + * + * @src_queue: source queue + * @dst_queue: destination queue + * @check_sl: check soft limit + * @check_savings: only class has savings should be considered + */ + +static unsigned long reorder_surplus_queue(struct list_head* src_queue, + struct list_head* dst_queue, + int check_sl, int check_savings, + struct ckrm_cpu_class *p_cls) +{ + struct ckrm_cpu_class *clsptr, *tmp; + + INIT_LIST_HEAD(dst_queue); + + list_for_each_entry_safe(clsptr,tmp,src_queue,surplus_queue) { + struct ckrm_cpu_class_stat* stat = &clsptr->stat; + int inc_limit; + int max_demand, eshare, esl,grt; + + if (unlikely(clsptr == p_cls)) { + max_demand = get_mmax_demand(stat); + eshare = stat->meshare; + esl = get_mysoft_limit(clsptr); + grt = stat->megrt; + } else { + max_demand = stat->max_demand; + eshare = stat->eshare; + esl = get_soft_limit(clsptr); + grt = stat->egrt; + } + + //hard limit and demand limit + inc_limit = max_demand - eshare; + + //no additional share needed + if (inc_limit <= 0 || ! 
grt) { + list_del(&clsptr->surplus_queue); + continue; + } + + //or no more savings + if (check_savings && ! stat->has_savings) + continue; + + //check soft limit + if (check_sl) { + int soft_limit; + + soft_limit = p_cls->stat.eshare * esl + / p_cls->shares.total_guarantee; + + if (soft_limit < max_demand) + inc_limit = soft_limit - eshare; + if ( inc_limit <= 0) /* can turn negative */ + continue; + } + + BUG_ON(! grt); + //get the stat->demand_per_share + stat->demand_per_share = + (inc_limit << CKRM_SHARE_SHIFT) / grt; + + list_del_init(&clsptr->surplus_queue); + //insert the class to the queue + insert_surplus_queue(dst_queue,clsptr); + } + return 0; +} + +/* + * get all the surplus that should be reallocated to the children + */ +static inline int get_total_surplus(struct ckrm_cpu_class *p_cls, + struct ckrm_core_class *parent) +{ + struct ckrm_cpu_class *c_cls; + int total_surplus; + struct ckrm_core_class *child_core = NULL; + + //additional share assigned to this sub node from parent + total_surplus = p_cls->stat.eshare - p_cls->stat.egrt; + BUG_ON(total_surplus < 0); + + //surplus of this node + total_surplus += get_my_node_surplus(p_cls); + do { + child_core = ckrm_get_next_child(parent, child_core); + if (child_core) { + c_cls = ckrm_get_cpu_class(child_core); + if (! c_cls) { + total_surplus = 0; + break; + } + + total_surplus += get_node_surplus(c_cls); + } + } while (child_core); + + return total_surplus; +} +/** + * alloc_surplus_node: re-allocate the shares for a single level + * @parent: parent node + * return the remaining surplus + * + * The surplus reallocation policy is like below. + * -- the classes that have eshare >= demand don't need any additional share. + * So they don't participate the surplus allocation. + * -- all the other classes received share in this order: + * 1. has savings, not over soft limit + * 2. has savings, but over soft limit + * 3. no savings, not over soft limit + * 4. no savings, over soft limit + * + * In each of the 4 levels above, classes get surplus propotionally to its guarantee + */ +static int alloc_surplus_node(struct ckrm_core_class *parent) +{ + struct ckrm_cpu_class *p_cls; + int total_surplus; + int ret = -1; + struct list_head src_queue, dst_queue; + + p_cls = ckrm_get_cpu_class(parent); + if (! p_cls) //safty check + goto realloc_out; + + ret = 0; + total_surplus = get_total_surplus(p_cls,parent); + + if (! total_surplus) //no surplus to be allocated + goto realloc_out; + + /* + * first round, allocated to tasks with savings, check_sl + */ + get_class_surplus_queue(parent,&src_queue); + reorder_surplus_queue(&src_queue, &dst_queue, 1, 1,p_cls); + if (! list_empty(&dst_queue)) { + total_surplus = consume_surplus_in_order(&dst_queue,p_cls,total_surplus); + if (! total_surplus) + goto realloc_out; + } + + /* + * second round, check savings, but no check_sl + */ + //merge the src_queue and dst_queue and reorder + list_splice(&dst_queue, &src_queue); + reorder_surplus_queue(&src_queue, &dst_queue, 0, 1,p_cls); + if (! list_empty(&dst_queue)) { + total_surplus = consume_surplus_in_order(&dst_queue,p_cls,total_surplus); + if (! total_surplus) + goto realloc_out; + } + + /* + * third round, no check savings, but check_sl + */ + //merge the src_queue and dst_queue and reorder + list_splice(&dst_queue, &src_queue); + reorder_surplus_queue(&src_queue, &dst_queue, 1, 0,p_cls); + if (! list_empty(&dst_queue)) { + total_surplus = consume_surplus_in_order(&dst_queue,p_cls,total_surplus); + if (! 
total_surplus) + goto realloc_out; + } + /* + * fourth round, no check savings, no check_sl + */ + //merge the src_queue and dst_queue and reorder + list_splice(&dst_queue, &src_queue); + reorder_surplus_queue(&src_queue, &dst_queue, 0, 0,p_cls); + if (! list_empty(&dst_queue)) + total_surplus = consume_surplus_in_order(&dst_queue,p_cls,total_surplus); + + realloc_out: + return ret; +} + +/* + * return true if the class total savings > MIN_SAVINGS + */ +static int balance_local_savings(struct ckrm_cpu_class *clsptr, int cpu_online) +{ + unsigned long long total_savings; + ckrm_lrq_t* lrq; + int i; +#define CLASS_MIN_SAVINGS (10 * NSEC_PER_MS) + + total_savings = 0; + for_each_online_cpu(i) { + lrq = get_ckrm_lrq(clsptr,i); + total_savings += lrq->savings; + } + + if (total_savings < CLASS_MIN_SAVINGS) + return 0; + + //get the average savings + do_div(total_savings,cpu_online); + for_each_online_cpu(i) { + lrq = get_ckrm_lrq(clsptr,i); + lrq->savings = total_savings; + } + + /* + * hzheng: this is another quick hack + * only say I have savings when this node has more demand + * ignoring the requirement of child classes + */ + if (clsptr->stat.megrt < get_mmax_demand(&clsptr->stat)) + return 1; + else + return 0; +} + +/* + * check savings status + * set has_savings field if the class or its sub class has savings + */ +static void check_savings_status(struct ckrm_core_class *root_core) +{ + struct ckrm_cpu_class *clsptr; + int cpu_online; + + cpu_online = cpus_weight(cpu_online_map); + + //class status: demand, share,total_ns prio, index + list_for_each_entry(clsptr,&active_cpu_classes,links) + clsptr->stat.has_savings = balance_local_savings(clsptr,cpu_online); +} + +/** + * alloc_surplus - reallocate unused shares + * + * class A's usused share should be allocated to its siblings + * the re-allocation goes downward from the top + */ +int alloc_surplus(struct ckrm_core_class *root_core) +{ + struct ckrm_core_class *cur_core, *child_core; + // struct ckrm_cpu_class *cls; + int ret = -1; + + check_savings_status(root_core); + + /*initialize*/ + cur_core = root_core; + child_core = NULL; + // cls = ckrm_get_cpu_class(cur_core); + + /*the ckrm idle tasks get all what's remaining*/ + /*hzheng: uncomment the following like for hard limit support */ + // update_ckrm_idle(CKRM_SHARE_MAX - cls->stat.max_demand); + + repeat: + //check exit + if (!cur_core) + return 0; + + //visit this node only once + if (! 
child_core) + if ( alloc_surplus_node(cur_core) < 0 ) + return ret; + + //next child + child_core = ckrm_get_next_child(cur_core, child_core); + if (child_core) { + //go down + cur_core = child_core; + child_core = NULL; + goto repeat; + } else { //no more child, go back + child_core = cur_core; + cur_core = child_core->hnode.parent; + } + goto repeat; } + + /**********************************************/ /* cpu demand */ /**********************************************/ @@ -134,27 +701,29 @@ void ckrm_cpu_stat_init(struct ckrm_cpu_class_stat *stat) * how often should we recalculate the cpu demand * the number is in ns */ -static inline void update_cpu_demand_stat(struct ckrm_cpu_demand_stat* local_stat,int state, unsigned long long len) +static inline void update_cpu_demand_stat(struct ckrm_cpu_demand_stat* local_stat, + int state, unsigned long long len) { local_stat->total += len; if (state == CKRM_CPU_DEMAND_RUN) local_stat->run += len; if (local_stat->total >= local_stat->recalc_interval) { - local_stat->total >>= CKRM_SHARE_ACCURACY; - if (unlikely(local_stat->run > 0xFFFFFFFF)) - local_stat->run = 0xFFFFFFFF; + local_stat->total >>= CKRM_SHARE_SHIFT; + if (unlikely(local_stat->run > ULONG_MAX)) + local_stat->run = ULONG_MAX; - if (local_stat->total > 0xFFFFFFFF) - local_stat->total = 0xFFFFFFFF; + if (unlikely(local_stat->total > ULONG_MAX)) + local_stat->total = ULONG_MAX; do_div(local_stat->run,(unsigned long)local_stat->total); - if (local_stat->total > 0xFFFFFFFF) //happens after very long sleep + if (unlikely(local_stat->total > ULONG_MAX)) { + //happens after very long sleep local_stat->cpu_demand = local_stat->run; - else { - local_stat->cpu_demand += local_stat->run; - local_stat->cpu_demand >>= 1; + } else { + local_stat->cpu_demand = + (local_stat->cpu_demand + local_stat->run) >> 1; } local_stat->total = 0; local_stat->run = 0; @@ -190,57 +759,25 @@ void cpu_demand_event(struct ckrm_cpu_demand_stat* local_stat, int event, unsign break; default: BUG(); - } -} - -/** - * check all the class local queue - * - * to deal with excessive long run/sleep state - * -- whenever the the ckrm_cpu_monitor is called, check if the class is in sleep state, if yes, then update sleep record - */ -static inline void cpu_demand_check_sleep(struct ckrm_cpu_class_stat *stat, int cpu) -{ - struct ckrm_cpu_demand_stat * local_stat = &stat->local_stats[cpu]; - unsigned long long sleep,now; - if (local_stat->last_sleep) { - now = sched_clock(); - sleep = now - local_stat->last_sleep; - local_stat->last_sleep = now; - update_cpu_demand_stat(local_stat,CKRM_CPU_DEMAND_SLEEP,sleep); - } -} - -/** - *get_self_cpu_demand - get cpu demand of the class itself (excluding children) - * - * self_cpu_demand = sum(cpu demand of all local queues) - */ -static inline unsigned long get_self_cpu_demand(struct ckrm_cpu_class_stat *stat) -{ - int cpu_demand = 0; - int i; - int cpuonline = 0; - - for_each_online_cpu(i) { - cpu_demand_check_sleep(stat,i); - cpu_demand += stat->local_stats[i].cpu_demand; - cpuonline ++; - } - - return (cpu_demand/cpuonline); + } } -/* - * my max demand = min(cpu_demand, my effective hard limit) +/** + * check all the class local queue + * + * to deal with excessive long run/sleep state + * -- whenever the the ckrm_cpu_monitor is called, check if the class is in sleep state, if yes, then update sleep record */ -static inline unsigned long get_mmax_demand(struct ckrm_cpu_class_stat* stat) +void cpu_demand_check_sleep(struct ckrm_cpu_class_stat *stat, int cpu) { - unsigned long 
mmax_demand = get_self_cpu_demand(stat); - if (mmax_demand > stat->mehl) - mmax_demand = stat->mehl; - - return mmax_demand; + struct ckrm_cpu_demand_stat * local_stat = &stat->local_stats[cpu]; + unsigned long long sleep,now; + if (local_stat->last_sleep) { + now = sched_clock(); + sleep = now - local_stat->last_sleep; + local_stat->last_sleep = now; + update_cpu_demand_stat(local_stat,CKRM_CPU_DEMAND_SLEEP,sleep); + } } /** @@ -301,26 +838,6 @@ static int update_max_demand(struct ckrm_core_class *root_core) /**********************************************/ /* effective guarantee & limit */ /**********************************************/ -static inline void set_eshare(struct ckrm_cpu_class_stat *stat, - int new_share) -{ - if (!new_share) - new_share = 1; - - BUG_ON(new_share < 0); - stat->eshare = new_share; -} - -static inline void set_meshare(struct ckrm_cpu_class_stat *stat, - int new_share) -{ - if (!new_share) - new_share = 1; - - BUG_ON(new_share < 0); - stat->meshare = new_share; -} - /** *update_child_effective - update egrt, ehl, mehl for all children of parent *@parent: the parent node @@ -346,7 +863,7 @@ static int update_child_effective(struct ckrm_core_class *parent) p_cls->stat.egrt * c_cls->shares.my_guarantee / p_cls->shares.total_guarantee; - c_cls->stat.megrt = c_cls->stat.egrt * c_cls->shares.unused_guarantee + c_cls->stat.megrt = c_cls->stat.egrt * get_my_grt(c_cls) / c_cls->shares.total_guarantee; c_cls->stat.ehl = @@ -372,8 +889,9 @@ static int update_child_effective(struct ckrm_core_class *parent) * * return -1 if anything wrong happened (eg: the structure changed during the process) */ -static int update_effectives(struct ckrm_core_class *root_core) +int update_effectives(void) { + struct ckrm_core_class *root_core = get_default_cpu_class()->core; struct ckrm_core_class *cur_core, *child_core; struct ckrm_cpu_class *cls; int ret = -1; @@ -384,7 +902,7 @@ static int update_effectives(struct ckrm_core_class *root_core) //initialize the effectives for root cls->stat.egrt = CKRM_SHARE_MAX; /*egrt of the root is always 100% */ - cls->stat.megrt = cls->stat.egrt * cls->shares.unused_guarantee + cls->stat.megrt = cls->stat.egrt * get_my_grt(cls) / cls->shares.total_guarantee; cls->stat.ehl = CKRM_SHARE_MAX * get_hard_limit(cls) / cls->shares.total_guarantee; @@ -418,288 +936,11 @@ static int update_effectives(struct ckrm_core_class *root_core) } /**********************************************/ -/* surplus allocation */ +/* CKRM Idle Tasks */ /**********************************************/ -/* - * surplus = egrt - demand - * if surplus < 0, surplus = 0 - */ -static inline int get_node_surplus(struct ckrm_cpu_class *cls) -{ - int surplus = cls->stat.egrt - cls->stat.max_demand; - - if (surplus < 0) - surplus = 0; - - return surplus; -} - -static inline int get_my_node_surplus(struct ckrm_cpu_class *cls) -{ - int surplus = cls->stat.megrt - get_mmax_demand(&cls->stat); - - if (surplus < 0) - surplus = 0; - - return surplus; -} - -/** - * consume_surplus: decides how much surplus a node can consume - * @ckeck_sl: if check_sl is set, then check soft_limitx - * return how much consumed - * - * implements all the CKRM Scheduling Requirement - * assume c_cls is valid - */ -static inline int consume_surplus(int surplus, - struct ckrm_cpu_class *c_cls, - struct ckrm_cpu_class *p_cls, - int check_sl - ) -{ - int consumed = 0; - int inc_limit; - int total_grt = p_cls->shares.total_guarantee; - - BUG_ON(surplus < 0); - - /*can't consume more than demand or hard limit*/ - if 
(c_cls->stat.eshare >= c_cls->stat.max_demand) - goto out; - - //the surplus allocation is propotional to grt - consumed = - surplus * c_cls->shares.my_guarantee / total_grt; - - if (! consumed) //no more share - goto out; - - //hard limit and demand limit - inc_limit = c_cls->stat.max_demand - c_cls->stat.eshare; - - if (check_sl) { - int esl = p_cls->stat.eshare * get_soft_limit(c_cls) - /total_grt; - if (esl < c_cls->stat.max_demand) - inc_limit = esl - c_cls->stat.eshare; - } - - if (consumed > inc_limit) - consumed = inc_limit; - - BUG_ON(consumed < 0); - out: - return consumed; -} - -/* - * how much a node can consume for itself? - */ -static inline int consume_self_surplus(int surplus, - struct ckrm_cpu_class *p_cls, - int check_sl - ) -{ - int consumed = 0; - int inc_limit; - int total_grt = p_cls->shares.total_guarantee; - int max_demand = get_mmax_demand(&p_cls->stat); - - BUG_ON(surplus < 0); - - /*can't consume more than demand or hard limit*/ - if (p_cls->stat.meshare >= max_demand) - goto out; - - //the surplus allocation is propotional to grt - consumed = - surplus * p_cls->shares.unused_guarantee / total_grt; - - if (! consumed) //no more share - goto out; - - //hard limit and demand limit - inc_limit = max_demand - p_cls->stat.meshare; - - if (check_sl) { - int mesl = p_cls->stat.eshare * get_mysoft_limit(p_cls) - /total_grt; - if (mesl < max_demand) - inc_limit = mesl - p_cls->stat.meshare; - } - - if (consumed > inc_limit) - consumed = inc_limit; - - BUG_ON(consumed < 0); - out: - return consumed; -} - - -/* - * allocate surplus to all its children and also its default class - */ -static int alloc_surplus_single_round( - int surplus, - struct ckrm_core_class *parent, - struct ckrm_cpu_class *p_cls, - int check_sl) -{ - struct ckrm_cpu_class *c_cls; - struct ckrm_core_class *child_core = NULL; - int total_consumed = 0,consumed; - - //first allocate to the default class - consumed = - consume_self_surplus(surplus,p_cls,check_sl); - - if (consumed > 0) { - set_meshare(&p_cls->stat,p_cls->stat.meshare + consumed); - total_consumed += consumed; - } - - do { - child_core = ckrm_get_next_child(parent, child_core); - if (child_core) { - c_cls = ckrm_get_cpu_class(child_core); - if (! c_cls) - return -1; - - consumed = - consume_surplus(surplus, c_cls, - p_cls,check_sl); - if (consumed > 0) { - set_eshare(&c_cls->stat,c_cls->stat.eshare + consumed); - total_consumed += consumed; - } - } - } while (child_core); - - return total_consumed; -} - -/** - * alloc_surplus_node: re-allocate the shares for children under parent - * @parent: parent node - * return the remaining surplus - * - * task: - * 1. get total surplus - * 2. allocate surplus - * 3. set the effective_share of each node - */ -static int alloc_surplus_node(struct ckrm_core_class *parent) -{ - struct ckrm_cpu_class *p_cls,*c_cls; - int total_surplus,consumed; - int check_sl; - int ret = -1; - struct ckrm_core_class *child_core = NULL; - - p_cls = ckrm_get_cpu_class(parent); - if (! p_cls) - goto realloc_out; - - /* - * get total surplus - */ - total_surplus = p_cls->stat.eshare - p_cls->stat.egrt; - BUG_ON(total_surplus < 0); - total_surplus += get_my_node_surplus(p_cls); - - do { - child_core = ckrm_get_next_child(parent, child_core); - if (child_core) { - c_cls = ckrm_get_cpu_class(child_core); - if (! c_cls) - goto realloc_out; - - total_surplus += get_node_surplus(c_cls); - } - } while (child_core); - - - if (! 
total_surplus) { - ret = 0; - goto realloc_out; - } - - /* - * distributing the surplus - * first with the check_sl enabled - * once all the tasks has research the soft limit, disable check_sl and try again - */ - - check_sl = 1; - do { - consumed = alloc_surplus_single_round(total_surplus,parent,p_cls,check_sl); - if (consumed < 0) //something is wrong - goto realloc_out; - - if (! consumed) - check_sl = 0; - else - total_surplus -= consumed; - - } while ((total_surplus > 0) && (consumed || check_sl) ); - - ret = 0; - - realloc_out: - return ret; -} - -/** - * alloc_surplus - reallocate unused shares - * - * class A's usused share should be allocated to its siblings - * the re-allocation goes downward from the top - */ -static int alloc_surplus(struct ckrm_core_class *root_core) -{ - struct ckrm_core_class *cur_core, *child_core; - // struct ckrm_cpu_class *cls; - int ret = -1; - - /*initialize*/ - cur_core = root_core; - child_core = NULL; - // cls = ckrm_get_cpu_class(cur_core); - - /*the ckrm idle tasks get all what's remaining*/ - /*hzheng: uncomment the following like for hard limit support */ - // update_ckrm_idle(CKRM_SHARE_MAX - cls->stat.max_demand); - - repeat: - //check exit - if (!cur_core) - return 0; - - //visit this node only once - if (! child_core) - if ( alloc_surplus_node(cur_core) < 0 ) - return ret; - - //next child - child_core = ckrm_get_next_child(cur_core, child_core); - if (child_core) { - //go down - cur_core = child_core; - child_core = NULL; - goto repeat; - } else { //no more child, go back - child_core = cur_core; - cur_core = child_core->hnode.parent; - } - goto repeat; -} +#ifdef CONFIG_CKRM_SUPPORT_MAXLIMITS -/**********************************************/ -/* CKRM Idle Tasks */ -/**********************************************/ struct ckrm_cpu_class ckrm_idle_class_obj, *ckrm_idle_class; struct task_struct* ckrm_idle_tasks[NR_CPUS]; @@ -710,7 +951,7 @@ static inline int get_nr_idle(unsigned long surplus) int nr_idle = 0; nr_idle = surplus * cpu_online; - nr_idle >>= CKRM_SHARE_ACCURACY; + nr_idle >>= CKRM_SHARE_SHIFT; if (surplus) nr_idle ++; @@ -722,7 +963,8 @@ static inline int get_nr_idle(unsigned long surplus) } /** - * update_ckrm_idle: update the status of the idle class according to the new surplus + * update_ckrm_idle: update the status of the idle class according + * to the new surplus * surplus: new system surplus * * Task: @@ -816,6 +1058,20 @@ void ckrm_start_ckrm_idle(void) } } +void ckrm_stop_ckrm_idle(void) +{ + BUG_ON(1); // not yet implemented +} + +#else + +static inline void ckrm_start_ckrm_idle(void) { }; +static inline void ckrm_stop_ckrm_idle(void) { }; +static inline void update_ckrm_idle(unsigned long surplus) { }; + +#endif + + /**********************************************/ /* Local Weight */ /**********************************************/ @@ -831,8 +1087,19 @@ static void adjust_lrq_weight(struct ckrm_cpu_class *clsptr, int cpu_online) int i; unsigned long class_weight; unsigned long long lw; - - //get total pressure + struct ckrm_cpu_class_stat *stat; + unsigned long oweight; + unsigned long skewed_limit; + /* + * if a local queue gets less than 1/SKEWED_SHARE_RATIO of the eshare + * then we set the skewed_share + */ +#define SKEWED_SHARE_RATIO 8 +#define SKEWED_WEIGHT_MIN 3 + + /* get total pressure of the class, if there is not pressure (.. 
class is + * idle, then leave the weights as is + */ for_each_online_cpu(i) { lrq = get_ckrm_lrq(clsptr,i); total_pressure += lrq->lrq_load; @@ -841,32 +1108,61 @@ static void adjust_lrq_weight(struct ckrm_cpu_class *clsptr, int cpu_online) if (! total_pressure) return; + stat = &clsptr->stat; + class_weight = cpu_class_weight(clsptr) * cpu_online; + /* calculate or skewed limit weight */ + skewed_limit = SHARE_TO_WEIGHT(stat->meshare/SKEWED_SHARE_RATIO); + if (skewed_limit < SKEWED_WEIGHT_MIN) + skewed_limit = SKEWED_WEIGHT_MIN; + + /* calculate over_weight */ + BUG_ON(stat->meshare < stat->megrt); + oweight = ((stat->meshare - stat->megrt) << CKRM_SHARE_SHIFT) / stat->meshare; + oweight = SHARE_TO_WEIGHT(oweight); + /* * update weight for each cpu, minimun is 1 */ for_each_online_cpu(i) { lrq = get_ckrm_lrq(clsptr,i); - if (! lrq->lrq_load) - /*give idle class a high share to boost interactiveness */ + lrq->over_weight = oweight; + if (! lrq->lrq_load) { + /* give idle class a high share to boost + * interactiveness + */ lw = cpu_class_weight(clsptr); - else { - lw = lrq->lrq_load * class_weight; + if (unlikely(lw==0)) + lw = 1; + } else { + lw = lrq->lrq_load; + lw *= class_weight; do_div(lw,total_pressure); - if (!lw) + if (unlikely(lw==0)) lw = 1; - else if (lw > CKRM_SHARE_MAX) - lw = CKRM_SHARE_MAX; - } - + else if (unlikely(lw > CKRM_MAX_WEIGHT)) + lw = CKRM_MAX_WEIGHT; + } + BUG_ON(lw > CKRM_MAX_WEIGHT); + + /* + * set is_skewed and local_weight in proper order + * to avoid race condition + */ lrq->local_weight = lw; + if (lw < skewed_limit) + lrq->skewed_weight = skewed_limit; + else + lrq->skewed_weight = 0; + BUG_ON((local_class_weight(lrq) == 1) && (! lrq->skewed_weight)); } } /* * assume called with class_list_lock read lock held */ + void adjust_local_weight(void) { static spinlock_t lock = SPIN_LOCK_UNLOCKED; @@ -904,9 +1200,11 @@ void ckrm_cpu_monitor(int check_min) static unsigned long long last_check = 0; struct ckrm_core_class *root_core = get_default_cpu_class()->core; unsigned long long now; -#define MIN_CPU_MONITOR_INTERVAL 100000000UL + int loc; + +#define MIN_CPU_MONITOR_INTERVAL (100*1000*1000) /* 100 MSEC */ - if (!root_core) + if (ckrm_cpu_disabled() || !root_core) return; //do nothing if someone already holding the lock @@ -918,29 +1216,37 @@ void ckrm_cpu_monitor(int check_min) now = sched_clock(); //consecutive check should be at least 100ms apart - if (check_min && ((now - last_check) < MIN_CPU_MONITOR_INTERVAL)) - goto outunlock; + if (check_min && (now - last_check < MIN_CPU_MONITOR_INTERVAL)) + goto outunlock_np; last_check = now; - if (update_effectives(root_core) != 0) + if (update_effectives() != 0) { + loc = 0; goto outunlock; + } - if (update_max_demand(root_core) != 0) + if (update_max_demand(root_core) != 0) { + loc = 1; goto outunlock; + } -#ifndef ALLOC_SURPLUS_SUPPORT -#warning "MEF taking out alloc_surplus" -#else - if (alloc_surplus(root_core) != 0) +#warning mef: alloc_surplus call back in system; + if (alloc_surplus(root_core) != 0) { + loc = 2; goto outunlock; -#endif + } adjust_local_weight(); - outunlock: + outunlock_np: read_unlock(&class_list_lock); spin_unlock(&lock); + return; + + outunlock: + printk("ckrm_cpu_monitor(%d) exits prematurely cause=%d\n",check_min,loc); + goto outunlock_np; } /*****************************************************/ @@ -952,6 +1258,8 @@ static int thread_exit = 0; static int ckrm_cpu_monitord(void *nothing) { daemonize("ckrm_cpu_ctrld"); + printk("cpu_monitord started\n"); + thread_exit = 0; for (;;) { 
/*sleep for sometime before next try*/ set_current_state(TASK_INTERRUPTIBLE); @@ -967,15 +1275,19 @@ static int ckrm_cpu_monitord(void *nothing) return 0; } -void ckrm_start_monitor(void) +void ckrm_cpu_start_monitor(void) { + if (cpu_monitor_pid != -1) { + /* already started ... */ + return; + } cpu_monitor_pid = kernel_thread(ckrm_cpu_monitord, 0, CLONE_KERNEL); if (cpu_monitor_pid < 0) { printk(KERN_DEBUG "ckrm_cpu_monitord for failed\n"); } } -void ckrm_kill_monitor(void) +void ckrm_cpu_kill_monitor(void) { printk(KERN_DEBUG "killing process %d\n", cpu_monitor_pid); if (cpu_monitor_pid > 0) { @@ -987,22 +1299,12 @@ void ckrm_kill_monitor(void) } } -int ckrm_cpu_monitor_init(void) +static int __init ckrm_cpu_init_monitor(void) { - ckrm_start_monitor(); - /*hzheng: uncomment the following like for hard limit support */ - // ckrm_start_ckrm_idle(); + if (ckrm_cpu_enabled()) + ckrm_cpu_start_monitor(); return 0; } -void ckrm_cpu_monitor_exit(void) -{ - ckrm_kill_monitor(); -} - -module_init(ckrm_cpu_monitor_init); -module_exit(ckrm_cpu_monitor_exit); +__initcall(ckrm_cpu_init_monitor); -MODULE_AUTHOR("Haoqiang Zheng "); -MODULE_DESCRIPTION("Hierarchical CKRM CPU Resource Monitor"); -MODULE_LICENSE("GPL"); diff --git a/kernel/ckrm/ckrm_laq.c b/kernel/ckrm/ckrm_laq.c deleted file mode 100644 index b64205a06..000000000 --- a/kernel/ckrm/ckrm_laq.c +++ /dev/null @@ -1,495 +0,0 @@ -/* ckrm_socketaq.c - accept queue resource controller - * - * Copyright (C) Vivek Kashyap, IBM Corp. 2004 - * - * Latest version, more details at http://ckrm.sf.net - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - */ - -/* Changes - * Initial version - */ - -/* Code Description: TBD - * - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include - -#define hnode_2_core(ptr) \ - ((ptr) ? 
container_of(ptr, struct ckrm_core_class, hnode) : NULL) - -#define CKRM_SAQ_MAX_DEPTH 3 // 0 => /rcfs - // 1 => socket_aq - // 2 => socket_aq/listen_class - // 3 => socket_aq/listen_class/accept_queues - // 4 => Not allowed - -typedef struct ckrm_laq_res { - spinlock_t reslock; - atomic_t refcnt; - struct ckrm_shares shares; - struct ckrm_core_class *core; - struct ckrm_core_class *pcore; - int my_depth; - int my_id; - unsigned int min_ratio; -} ckrm_laq_res_t; - -static int my_resid = -1; - -extern struct ckrm_core_class *rcfs_create_under_netroot(char *, int, int); -extern struct ckrm_core_class *rcfs_make_core(struct dentry *, - struct ckrm_core_class *); - -void laq_res_hold(struct ckrm_laq_res *res) -{ - atomic_inc(&res->refcnt); - return; -} - -void laq_res_put(struct ckrm_laq_res *res) -{ - if (atomic_dec_and_test(&res->refcnt)) - kfree(res); - return; -} - -/* Initialize rescls values - */ -static void laq_res_initcls(void *my_res) -{ - ckrm_laq_res_t *res = my_res; - - res->shares.my_guarantee = CKRM_SHARE_DONTCARE; - res->shares.my_limit = CKRM_SHARE_DONTCARE; - res->shares.total_guarantee = CKRM_SHARE_DFLT_TOTAL_GUARANTEE; - res->shares.max_limit = CKRM_SHARE_DFLT_MAX_LIMIT; - res->shares.unused_guarantee = CKRM_SHARE_DFLT_TOTAL_GUARANTEE; - res->shares.cur_max_limit = 0; -} - -static int atoi(char *s) -{ - int k = 0; - while (*s) - k = *s++ - '0' + (k * 10); - return k; -} - -static char *laq_get_name(struct ckrm_core_class *c) -{ - char *p = (char *)c->name; - - while (*p) - p++; - while (*p != '/' && p != c->name) - p--; - - return ++p; -} - -static void *laq_res_alloc(struct ckrm_core_class *core, - struct ckrm_core_class *parent) -{ - ckrm_laq_res_t *res, *pres; - int pdepth; - - if (parent) - pres = ckrm_get_res_class(parent, my_resid, ckrm_laq_res_t); - else - pres = NULL; - - if (core == core->classtype->default_class) - pdepth = 1; - else { - if (!parent) - return NULL; - pdepth = 1 + pres->my_depth; - } - - res = kmalloc(sizeof(ckrm_laq_res_t), GFP_ATOMIC); - if (res) { - memset(res, 0, sizeof(res)); - spin_lock_init(&res->reslock); - laq_res_hold(res); - res->my_depth = pdepth; - if (pdepth == 2) // listen class - res->my_id = 0; - else if (pdepth == 3) - res->my_id = atoi(laq_get_name(core)); - res->core = core; - res->pcore = parent; - - // rescls in place, now initialize contents other than - // hierarchy pointers - laq_res_initcls(res); // acts as initialising value - } - - return res; -} - -static void laq_res_free(void *my_res) -{ - ckrm_laq_res_t *res = (ckrm_laq_res_t *) my_res; - ckrm_laq_res_t *parent; - - if (!res) - return; - - if (res->my_depth != 3) { - kfree(res); - return; - } - - parent = ckrm_get_res_class(res->pcore, my_resid, ckrm_laq_res_t); - if (!parent) // Should never happen - return; - - spin_lock(&parent->reslock); - spin_lock(&res->reslock); - - // return child's guarantee to parent node - // Limits have no meaning for accept queue control - child_guarantee_changed(&parent->shares, res->shares.my_guarantee, 0); - - spin_unlock(&res->reslock); - laq_res_put(res); - spin_unlock(&parent->reslock); - return; -} - -/************************************************************************** - * SHARES *** - **************************************************************************/ - -void laq_set_aq_value(struct ckrm_net_struct *ns, unsigned int *aq_ratio) -{ - int i; - struct tcp_opt *tp; - - tp = tcp_sk(ns->ns_sk); - for (i = 0; i < NUM_ACCEPT_QUEUES; i++) - tp->acceptq[i].aq_ratio = aq_ratio[i]; - return; -} -void 
laq_set_aq_values(ckrm_laq_res_t * parent, unsigned int *aq_ratio) -{ - - struct ckrm_net_struct *ns; - struct ckrm_core_class *core = parent->core; - - class_lock(core); - list_for_each_entry(ns, &core->objlist, ckrm_link) { - laq_set_aq_value(ns, aq_ratio); - } - class_unlock(core); - return; -} - -static void calculate_aq_ratios(ckrm_laq_res_t * res, unsigned int *aq_ratio) -{ - struct ckrm_hnode *chnode; - ckrm_laq_res_t *child; - unsigned int min; - int i; - - min = aq_ratio[0] = (unsigned int)res->shares.unused_guarantee; - - list_for_each_entry(chnode, &res->core->hnode.children, siblings) { - child = hnode_2_core(chnode)->res_class[my_resid]; - - aq_ratio[child->my_id] = - (unsigned int)child->shares.my_guarantee; - if (aq_ratio[child->my_id] == CKRM_SHARE_DONTCARE) - aq_ratio[child->my_id] = 0; - if (aq_ratio[child->my_id] && - ((unsigned int)aq_ratio[child->my_id] < min)) - min = (unsigned int)child->shares.my_guarantee; - } - - if (min == 0) { - min = 1; - // default takes all if nothing specified - aq_ratio[0] = 1; - } - res->min_ratio = min; - - for (i = 0; i < NUM_ACCEPT_QUEUES; i++) - aq_ratio[i] = aq_ratio[i] / min; -} - -static int laq_set_share_values(void *my_res, struct ckrm_shares *shares) -{ - ckrm_laq_res_t *res = my_res; - ckrm_laq_res_t *parent; - unsigned int aq_ratio[NUM_ACCEPT_QUEUES]; - int rc = 0; - - if (!res) - return -EINVAL; - - if (!res->pcore) { - // something is badly wrong - printk(KERN_ERR "socketaq internal inconsistency\n"); - return -EBADF; - } - - parent = ckrm_get_res_class(res->pcore, my_resid, ckrm_laq_res_t); - if (!parent) // socketclass does not have a share interface - return -EINVAL; - - // Ensure that we ignore limit values - shares->my_limit = CKRM_SHARE_DONTCARE; - shares->max_limit = CKRM_SHARE_UNCHANGED; - - if (res->my_depth == 0) { - printk(KERN_ERR "socketaq bad entry\n"); - return -EBADF; - } else if (res->my_depth == 1) { - // can't be written to. This is an internal default. - return -EINVAL; - } else if (res->my_depth == 2) { - //nothin to inherit - if (!shares->total_guarantee) { - return -EINVAL; - } - parent = res; - shares->my_guarantee = CKRM_SHARE_DONTCARE; - } else if (res->my_depth == 3) { - // accept queue itself. - shares->total_guarantee = CKRM_SHARE_UNCHANGED; - } - - ckrm_lock_hier(parent->pcore); - spin_lock(&parent->reslock); - rc = set_shares(shares, &res->shares, - (parent == res) ? 
NULL : &parent->shares); - if (rc) { - spin_unlock(&res->reslock); - ckrm_unlock_hier(res->pcore); - return rc; - } - calculate_aq_ratios(parent, aq_ratio); - laq_set_aq_values(parent, aq_ratio); - spin_unlock(&parent->reslock); - ckrm_unlock_hier(parent->pcore); - - return rc; -} - -static int laq_get_share_values(void *my_res, struct ckrm_shares *shares) -{ - ckrm_laq_res_t *res = my_res; - - if (!res) - return -EINVAL; - *shares = res->shares; - return 0; -} - -/************************************************************************** - * STATS *** - **************************************************************************/ - -void -laq_print_aq_stats(struct seq_file *sfile, struct tcp_acceptq_info *taq, int i) -{ - seq_printf(sfile, "Class %d connections:\n\taccepted: %u\n\t" - "queued: %u\n\twait_time: %u\n", - i, taq->acceptq_count, taq->acceptq_qcount, - jiffies_to_msecs(taq->acceptq_wait_time)); - - if (i) - return; - - for (i = 1; i < NUM_ACCEPT_QUEUES; i++) { - taq[0].acceptq_wait_time += taq[i].acceptq_wait_time; - taq[0].acceptq_qcount += taq[i].acceptq_qcount; - taq[0].acceptq_count += taq[i].acceptq_count; - } - - seq_printf(sfile, "Totals :\n\taccepted: %u\n\t" - "queued: %u\n\twait_time: %u\n", - taq->acceptq_count, taq->acceptq_qcount, - jiffies_to_msecs(taq->acceptq_wait_time)); - - return; -} - -void -laq_get_aq_stats(ckrm_laq_res_t * pres, ckrm_laq_res_t * mres, - struct tcp_acceptq_info *taq) -{ - struct ckrm_net_struct *ns; - struct ckrm_core_class *core = pres->core; - struct tcp_opt *tp; - int a = mres->my_id; - int z; - - if (a == 0) - z = NUM_ACCEPT_QUEUES; - else - z = a + 1; - - // XXX Instead of holding a class_lock introduce a rw - // lock to be write locked by listen callbacks and read locked here. - // - VK - class_lock(pres->core); - list_for_each_entry(ns, &core->objlist, ckrm_link) { - tp = tcp_sk(ns->ns_sk); - for (; a < z; a++) { - taq->acceptq_wait_time += tp->acceptq[a].aq_wait_time; - taq->acceptq_qcount += tp->acceptq[a].aq_qcount; - taq->acceptq_count += tp->acceptq[a].aq_count; - taq++; - } - } - class_unlock(pres->core); -} - -static int laq_get_stats(void *my_res, struct seq_file *sfile) -{ - ckrm_laq_res_t *res = my_res; - ckrm_laq_res_t *parent; - struct tcp_acceptq_info taq[NUM_ACCEPT_QUEUES]; - int rc = 0; - - if (!res) - return -EINVAL; - - if (!res->pcore) { - // something is badly wrong - printk(KERN_ERR "socketaq internal inconsistency\n"); - return -EBADF; - } - - parent = ckrm_get_res_class(res->pcore, my_resid, ckrm_laq_res_t); - if (!parent) { // socketclass does not have a stat interface - printk(KERN_ERR "socketaq internal fs inconsistency\n"); - return -EINVAL; - } - - memset(taq, 0, sizeof(struct tcp_acceptq_info) * NUM_ACCEPT_QUEUES); - - switch (res->my_depth) { - - default: - case 0: - printk(KERN_ERR "socket class bad entry\n"); - rc = -EBADF; - break; - - case 1: // can't be read from. this is internal default. - // return -EINVAL - rc = -EINVAL; - break; - - case 2: // return the default and total - ckrm_lock_hier(res->core); // block any deletes - laq_get_aq_stats(res, res, &taq[0]); - laq_print_aq_stats(sfile, &taq[0], 0); - ckrm_unlock_hier(res->core); // block any deletes - break; - - case 3: - ckrm_lock_hier(parent->core); // block any deletes - laq_get_aq_stats(parent, res, &taq[res->my_id]); - laq_print_aq_stats(sfile, &taq[res->my_id], res->my_id); - ckrm_unlock_hier(parent->core); // block any deletes - break; - } - - return rc; -} - -/* - * The network connection is reclassified to this class. Update its shares. 
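For illustration, the share-to-ratio mapping applied when shares change (calculate_aq_ratios() above, consumed by laq_set_aq_value() and laq_change_resclass() below) can be sketched in isolation. The guarantee values here are hypothetical; in the kernel they come from ckrm_shares and the result is written into tp->acceptq[i].aq_ratio:

    /* standalone sketch with made-up guarantees; mirrors calculate_aq_ratios() */
    static void sketch_aq_ratios(void)
    {
            /* [0] stands for the listen class's unused_guarantee,
             * [1..3] for three accept classes' my_guarantee */
            unsigned int guarantee[4] = { 40, 30, 20, 10 };
            unsigned int ratio[4];
            unsigned int min = guarantee[0];
            int i;

            for (i = 1; i < 4; i++)
                    if (guarantee[i] && guarantee[i] < min)
                            min = guarantee[i];
            if (!min)
                    min = 1;        /* default takes all if nothing specified */
            for (i = 0; i < 4; i++)
                    ratio[i] = guarantee[i] / min;  /* -> 4, 3, 2, 1 */
    }

With these hypothetical numbers the default accept queue is weighted 4 and the three accept classes 3, 2 and 1, so accepted connections are serviced in roughly that proportion.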
- * The socket lock is held. - */ -static void laq_change_resclass(void *n, void *old, void *r) -{ - struct ckrm_net_struct *ns = (struct ckrm_net_struct *)n; - struct ckrm_laq_res *res = (struct ckrm_laq_res *)r; - unsigned int aq_ratio[NUM_ACCEPT_QUEUES]; - - if (res->my_depth != 2) - return; - - // a change to my_depth == 3 ie. the accept classes cannot happen. - // there is no target file - if (res->my_depth == 2) { // it is one of the socket classes - ckrm_lock_hier(res->pcore); - // share rule: hold parent resource lock. then self. - // However, since my_depth == 1 is a generic class it is not - // needed here. Self lock is enough. - spin_lock(&res->reslock); - calculate_aq_ratios(res, aq_ratio); - class_lock(res->pcore); - laq_set_aq_value(ns, aq_ratio); - class_unlock(res->pcore); - spin_unlock(&res->reslock); - ckrm_unlock_hier(res->pcore); - } - - return; -} - -struct ckrm_res_ctlr laq_rcbs = { - .res_name = "laq", - .resid = -1, // dynamically assigned - .res_alloc = laq_res_alloc, - .res_free = laq_res_free, - .set_share_values = laq_set_share_values, - .get_share_values = laq_get_share_values, - .get_stats = laq_get_stats, - .change_resclass = laq_change_resclass, - //.res_initcls = laq_res_initcls, //HUBERTUS: unnecessary !! -}; - -int __init init_ckrm_laq_res(void) -{ - struct ckrm_classtype *clstype; - int resid; - - clstype = ckrm_find_classtype_by_name("socketclass"); - if (clstype == NULL) { - printk(KERN_INFO " Unknown ckrm classtype"); - return -ENOENT; - } - - if (my_resid == -1) { - resid = ckrm_register_res_ctlr(clstype, &laq_rcbs); - if (resid >= 0) - my_resid = resid; - printk(KERN_DEBUG "........init_ckrm_listen_aq_res -> %d\n", my_resid); - } - return 0; - -} - -void __exit exit_ckrm_laq_res(void) -{ - ckrm_unregister_res_ctlr(&laq_rcbs); - my_resid = -1; -} - -module_init(init_ckrm_laq_res) - module_exit(exit_ckrm_laq_res) - - MODULE_LICENSE("GPL"); diff --git a/kernel/ckrm/ckrm_listenaq.c b/kernel/ckrm/ckrm_listenaq.c index 0fe858633..103e3f957 100644 --- a/kernel/ckrm/ckrm_listenaq.c +++ b/kernel/ckrm/ckrm_listenaq.c @@ -1,4 +1,4 @@ -/* ckrm_socketaq.c - accept queue resource controller +/* ckrm_listenaq.c - accept queue resource controller * * Copyright (C) Vivek Kashyap, IBM Corp. 
2004 * @@ -251,7 +251,7 @@ static int laq_set_share_values(void *my_res, struct ckrm_shares *shares) } parent = ckrm_get_res_class(res->pcore, my_resid, ckrm_laq_res_t); - if (!parent) // socket_class does not have a share interface + if (!parent) // socketclass does not have a share interface return -EINVAL; // Ensure that we ignore limit values @@ -380,7 +380,7 @@ static int laq_get_stats(void *my_res, struct seq_file *sfile) } parent = ckrm_get_res_class(res->pcore, my_resid, ckrm_laq_res_t); - if (!parent) { // socket_class does not have a stat interface + if (!parent) { // socketclass does not have a stat interface printk(KERN_ERR "socketaq internal fs inconsistency\n"); return -EINVAL; } @@ -451,7 +451,7 @@ static void laq_change_resclass(void *n, void *old, void *r) } struct ckrm_res_ctlr laq_rcbs = { - .res_name = "laq", + .res_name = "listenaq", .resid = -1, // dynamically assigned .res_alloc = laq_res_alloc, .res_free = laq_res_free, @@ -467,9 +467,9 @@ int __init init_ckrm_laq_res(void) struct ckrm_classtype *clstype; int resid; - clstype = ckrm_find_classtype_by_name("socket_class"); + clstype = ckrm_find_classtype_by_name("socketclass"); if (clstype == NULL) { - printk(KERN_INFO " Unknown ckrm classtype"); + printk(KERN_INFO " Unknown ckrm classtype"); return -ENOENT; } diff --git a/kernel/ckrm/rbce/rbcemod.c b/kernel/ckrm/rbce/rbcemod.c index 555ba0a4e..143b259e8 100644 --- a/kernel/ckrm/rbce/rbcemod.c +++ b/kernel/ckrm/rbce/rbcemod.c @@ -422,7 +422,7 @@ static struct rbce_class *create_rbce_class(const char *classname, return cls; } -static struct rbce_class *get_class(char *classname, int *classtype) +static struct rbce_class *get_class(const char *classname, int *classtype) { struct rbce_class *cls; void *classobj; diff --git a/kernel/ckrm_classqueue.c b/kernel/ckrm_classqueue.c index 0400844a3..fd7f8a2b4 100644 --- a/kernel/ckrm_classqueue.c +++ b/kernel/ckrm_classqueue.c @@ -27,14 +27,19 @@ #include #define cq_nr_member(cq) (cq->array.nr_active) +#define CLASSQUEUE_MASK (CLASSQUEUE_SIZE - 1) /** - * get_index - translate the logical priority to the real index in the queue + * get_node_index - + * translate the logical priority to the real index in the queue * * validate the position * a valid prio is [cq->base,cq->base + size -1] + * check whether node is supposed to be enqeued beyond above window and + * if so set the need_repos flag */ -static inline unsigned long get_index(struct classqueue_struct *cq, int *prio) +static inline unsigned long get_node_index(struct classqueue_struct *cq, + cq_node_t * node) { unsigned long index; int max_prio; @@ -43,22 +48,24 @@ static inline unsigned long get_index(struct classqueue_struct *cq, int *prio) return 0; max_prio = cq->base + (CLASSQUEUE_SIZE - 1); - if (*prio > max_prio) - *prio = max_prio; - if (*prio < cq->base) - *prio = cq->base; + if (unlikely(node->prio > max_prio)) { + node->real_prio = node->prio; + node->prio = max_prio; + node->need_repos = 1; + } else + node->need_repos = 0; - index = (cq->base_offset + (*prio - cq->base)) ; - if (index >= CLASSQUEUE_SIZE) - index -= CLASSQUEUE_SIZE; + if (unlikely(node->prio < cq->base)) + node->prio = cq->base; - return index; + index = (cq->base_offset + (node->prio - cq->base)) ; + return ( index & CLASSQUEUE_MASK ); // ensure its in limits } /** * initialize a class queue object */ -int classqueue_init(struct classqueue_struct *cq) +int classqueue_init(struct classqueue_struct *cq, int enabled) { int i; struct cq_prio_array *array; @@ -73,7 +80,8 @@ int 
classqueue_init(struct classqueue_struct *cq) array->nr_active = 0; cq->base = 0; - cq->base_offset = -1; //not valid yet + cq->base_offset = 0; + cq->enabled = enabled; return 0; } @@ -87,8 +95,8 @@ void classqueue_enqueue(struct classqueue_struct *cq, int index; //get real index - if (cq_nr_member(cq)) { - index = get_index(cq, &prio); + if (cq_nr_member(cq)) { + index = get_node_index(cq, node); } else { //the first one cq->base = prio; cq->base_offset = 0; @@ -123,8 +131,8 @@ void classqueue_update_prio(struct classqueue_struct *cq, if (! cls_in_classqueue(node)) return; - index = get_index(cq, &new_pos); node->prio = new_pos; + index = get_node_index(cq, node); //remove from the original position list_del_init(&(node->list)); @@ -137,10 +145,32 @@ void classqueue_update_prio(struct classqueue_struct *cq, node->index = index; } + +static inline void __classqueue_update_base(struct classqueue_struct *cq, + int new_base) +{ + int max_prio; + if (unlikely(new_base <= cq->base)) // base will never move back + return; + if (unlikely(!cq_nr_member(cq))) { + cq->base_offset = 0; + cq->base = new_base; // is this necessary ?? + return; + } + + max_prio = cq->base + (CLASSQUEUE_SIZE - 1); + if (unlikely(new_base > max_prio)) + new_base = max_prio; + + cq->base_offset = (cq->base_offset + (new_base - cq->base)) & CLASSQUEUE_MASK; + cq->base = new_base; +} + /** *classqueue_get_min_prio: return the priority of the last node in queue * * this function can be called without runqueue lock held + * return 0 if there's nothing in the queue */ static inline int classqueue_get_min_prio(struct classqueue_struct *cq) { @@ -171,9 +201,13 @@ static inline int classqueue_get_min_prio(struct classqueue_struct *cq) */ cq_node_t *classqueue_get_head(struct classqueue_struct *cq) { - cq_node_t *result = NULL; + cq_node_t *node; int pos; + int index; + int new_base; +search_again: + node = NULL; /* * search over the bitmap to get the first class in the queue */ @@ -183,10 +217,38 @@ cq_node_t *classqueue_get_head(struct classqueue_struct *cq) pos = find_first_bit(cq->array.bitmap, CLASSQUEUE_SIZE); if (pos < CLASSQUEUE_SIZE) { - BUG_ON(list_empty(&cq->array.queue[pos])); - result = list_entry(cq->array.queue[pos].next, cq_node_t, list); + //BUG_ON(list_empty(&cq->array.queue[pos])); + node = list_entry(cq->array.queue[pos].next, cq_node_t, list); } - return result; + + //check if the node need to be repositioned + if (likely(! node || ! node->need_repos)) + return node; + + // We need to reposition this node in the class queue + // BUG_ON(node->prio == node->real_prio); + + //remove from the original position + list_del_init(&(node->list)); + if (list_empty(&cq->array.queue[node->index])) + __clear_bit(node->index, cq->array.bitmap); + + new_base = classqueue_get_min_prio(cq); + node->prio = node->real_prio; + + if (! new_base) + new_base = node->real_prio; + else if (node->real_prio < new_base) + new_base = node->real_prio; + __classqueue_update_base(cq,new_base); + + index = get_node_index(cq, node); + //add to new positon, round robin for classes with same priority + list_add_tail(&(node->list), &cq->array.queue[index]); + __set_bit(index, cq->array.bitmap); + node->index = index; + + goto search_again; } /** @@ -198,14 +260,11 @@ void classqueue_update_base(struct classqueue_struct *cq) int new_base; if (! 
cq_nr_member(cq)) { - cq->base_offset = -1; //not defined + cq->base = 0; + cq->base_offset = 0; return; } new_base = classqueue_get_min_prio(cq); - - if (new_base > cq->base) { - cq->base_offset = get_index(cq, &new_base); - cq->base = new_base; - } + __classqueue_update_base(cq,new_base); } diff --git a/kernel/ckrm_sched.c b/kernel/ckrm_sched.c index 5142b2eaa..26ffc69d8 100644 --- a/kernel/ckrm_sched.c +++ b/kernel/ckrm_sched.c @@ -20,6 +20,28 @@ LIST_HEAD(active_cpu_classes); // list of active cpu classes; anchor struct ckrm_cpu_class default_cpu_class_obj; +unsigned int ckrm_sched_mode __cacheline_aligned_in_smp = +#ifdef CONFIG_CKRM_CPU_SCHEDULE_AT_BOOT + CKRM_SCHED_MODE_ENABLED; +#else + CKRM_SCHED_MODE_DISABLED; +#endif + +static int __init ckrm_cpu_enabled_setup(char *str) +{ + ckrm_sched_mode = CKRM_SCHED_MODE_ENABLED; + return 1; +} + +static int __init ckrm_cpu_disabled_setup(char *str) +{ + ckrm_sched_mode = CKRM_SCHED_MODE_DISABLED; + return 1; +} + +__setup("ckrmcpu", ckrm_cpu_enabled_setup); +__setup("nockrmcpu",ckrm_cpu_disabled_setup); + struct ckrm_cpu_class * get_default_cpu_class(void) { return (&default_cpu_class_obj); } @@ -28,7 +50,10 @@ struct ckrm_cpu_class * get_default_cpu_class(void) { /* CVT Management */ /*******************************************************/ -static inline void check_inactive_class(ckrm_lrq_t * lrq,CVT_t cur_cvt) +//an absolute bonus of 200ms for classes when reactivated +#define INTERACTIVE_BONUS(lrq) ((200*NSEC_PER_MS)/local_class_weight(lrq)) + +static void check_inactive_class(ckrm_lrq_t * lrq,CVT_t cur_cvt) { CVT_t min_cvt; CVT_t bonus; @@ -37,6 +62,7 @@ static inline void check_inactive_class(ckrm_lrq_t * lrq,CVT_t cur_cvt) if (unlikely(! cur_cvt)) return; +#define INTERACTIVE_BONUS_SUPPORT 1 #ifndef INTERACTIVE_BONUS_SUPPORT #warning "ACB taking out interactive bonus calculation" bonus = 0; @@ -50,51 +76,40 @@ static inline void check_inactive_class(ckrm_lrq_t * lrq,CVT_t cur_cvt) #endif //cvt can't be negative - if (cur_cvt > bonus) + if (likely(cur_cvt > bonus)) min_cvt = cur_cvt - bonus; else min_cvt = 0; - - if (lrq->local_cvt < min_cvt) { + + if (lrq->local_cvt < min_cvt) { + // if (lrq->local_cvt < min_cvt && ! 
lrq_nr_running(lrq)) { CVT_t lost_cvt; - lost_cvt = scale_cvt(min_cvt - lrq->local_cvt,lrq); + if (unlikely(lrq->local_cvt == 0)) { + lrq->local_cvt = cur_cvt; + return; + } + lost_cvt = min_cvt - lrq->local_cvt; + lost_cvt *= local_class_weight(lrq); lrq->local_cvt = min_cvt; + BUG_ON(lost_cvt < 0); /* add what the class lost to its savings*/ - lrq->savings += lost_cvt; +#if 1 /*zhq debugging*/ + lrq->savings += lost_cvt; +#endif if (lrq->savings > MAX_SAVINGS) lrq->savings = MAX_SAVINGS; - } else if (lrq->savings) { - /* - *if a class saving and falling behind - * then start to use it saving in a leaking bucket way - */ - CVT_t savings_used; - - savings_used = scale_cvt((lrq->local_cvt - min_cvt),lrq); - if (savings_used > lrq->savings) - savings_used = lrq->savings; - - if (savings_used > SAVINGS_LEAK_SPEED) - savings_used = SAVINGS_LEAK_SPEED; - - BUG_ON(lrq->savings < savings_used); - lrq->savings -= savings_used; - unscale_cvt(savings_used,lrq); - BUG_ON(lrq->local_cvt < savings_used); -#ifndef CVT_SAVINGS_SUPPORT -#warning "ACB taking out cvt saving" -#else - lrq->local_cvt -= savings_used; +#if 0 /* zhq debugging*/ + printk("lrq= %x savings: %llu lost= %llu\n",(int)lrq,lrq->savings,lost_cvt); #endif - } + } } /* * return the max_cvt of all the classes */ -static inline CVT_t get_max_cvt(int this_cpu) +CVT_t get_max_cvt(int this_cpu) { struct ckrm_cpu_class *clsptr; ckrm_lrq_t * lrq; @@ -102,7 +117,6 @@ static inline CVT_t get_max_cvt(int this_cpu) max_cvt = 0; - /*update class time, at the same time get max_cvt */ list_for_each_entry(clsptr, &active_cpu_classes, links) { lrq = get_ckrm_lrq(clsptr, this_cpu); if (lrq->local_cvt > max_cvt) @@ -112,6 +126,23 @@ static inline CVT_t get_max_cvt(int this_cpu) return max_cvt; } +CVT_t get_min_cvt(int this_cpu) +{ + struct ckrm_cpu_class *clsptr; + ckrm_lrq_t * lrq; + CVT_t max_cvt; + + max_cvt = 0xFFFFFFFFFFFFFLLU; + + list_for_each_entry(clsptr, &active_cpu_classes, links) { + lrq = get_ckrm_lrq(clsptr, this_cpu); + if (lrq->local_cvt < max_cvt) + max_cvt = lrq->local_cvt; + } + + return max_cvt; +} + /** * update_class_cputime - updates cvt of inactive classes * -- an inactive class shouldn't starve others when it comes back @@ -120,7 +151,7 @@ static inline CVT_t get_max_cvt(int this_cpu) * * class_list_lock must have been acquired */ -void update_class_cputime(int this_cpu) +void update_class_cputime(int this_cpu, int idle) { struct ckrm_cpu_class *clsptr; ckrm_lrq_t * lrq; @@ -178,24 +209,45 @@ void update_class_cputime(int this_cpu) /*******************************************************/ /* PID load balancing stuff */ /*******************************************************/ -#define PID_SAMPLE_T 32 #define PID_KP 20 #define PID_KI 60 #define PID_KD 20 +/* + * runqueue load is the local_weight of all the classes on this cpu + * must be called with class_list_lock held + */ +static unsigned long ckrm_cpu_load(int cpu) +{ + struct ckrm_cpu_class *clsptr; + ckrm_lrq_t* lrq; + struct ckrm_cpu_demand_stat* l_stat; + int total_load = 0; + int load; + + list_for_each_entry(clsptr,&active_cpu_classes,links) { + lrq = get_ckrm_lrq(clsptr,cpu); + l_stat = get_cls_local_stat(clsptr,cpu); + + load = WEIGHT_TO_SHARE(lrq->local_weight); + + if (l_stat->cpu_demand < load) + load = l_stat->cpu_demand; + total_load += load; + } + return total_load; +} + + /** * sample pid load periodically */ + void ckrm_load_sample(ckrm_load_t* pid,int cpu) { long load; long err; - if (jiffies % PID_SAMPLE_T) - return; - - adjust_local_weight(); - load = 
ckrm_cpu_load(cpu); err = load - pid->load_p; pid->load_d = err; @@ -205,7 +257,7 @@ void ckrm_load_sample(ckrm_load_t* pid,int cpu) pid->load_i /= 10; } -long pid_get_pressure(ckrm_load_t* ckrm_load, int local_group) +long ckrm_get_pressure(ckrm_load_t* ckrm_load, int local_group) { long pressure; pressure = ckrm_load->load_p * PID_KP; @@ -214,3 +266,58 @@ long pid_get_pressure(ckrm_load_t* ckrm_load, int local_group) pressure /= 100; return pressure; } + +/* + * called after a task is switched out. Update the local cvt accounting + * we need to stick with long instead of long long due to nonexistent + * 64-bit division + */ +void update_local_cvt(struct task_struct *p, unsigned long nsec) +{ + ckrm_lrq_t * lrq = get_task_lrq(p); + unsigned long cvt_inc; + + /* + * consume from savings if eshare is larger than egrt + */ + if (lrq->savings && lrq->over_weight) { + unsigned long savings_used; + + savings_used = nsec; + savings_used >>= CKRM_WEIGHT_SHIFT; + savings_used *= lrq->over_weight; + if (savings_used > lrq->savings) + savings_used = lrq->savings; + lrq->savings -= savings_used; + } + + //BUG_ON(local_class_weight(lrq) == 0); + cvt_inc = nsec / local_class_weight(lrq); + + /* + * For a certain processor, CKRM allocates CPU time propotional + * to the class's local_weight. So once a class consumed nsec, + * it will wait for X (nsec) for its next turn. + * + * X is calculated based on the following fomular + * nsec / local_weight < X / (CKRM_MAX_WEIGHT - local_weight) + * if local_weight is small, then approximated as + * nsec / local_weight < X / (CKRM_MAX_WEIGHT) + */ +#define CVT_STARVATION_LIMIT (200LL*NSEC_PER_MS) +#define CVT_STARVATION_INC_LIMIT (CVT_STARVATION_LIMIT >> CKRM_WEIGHT_SHIFT) + + if (unlikely(lrq->skewed_weight)) { + unsigned long long starvation_limit = CVT_STARVATION_INC_LIMIT; + + starvation_limit *= local_class_weight(lrq); + if (unlikely(cvt_inc > starvation_limit)) + cvt_inc = nsec / lrq->skewed_weight; + } + + /* now update the CVT accounting */ + + lrq->local_cvt += cvt_inc; + lrq->uncounted_ns += nsec; + update_class_priority(lrq); +} diff --git a/kernel/kexec.c b/kernel/kexec.c new file mode 100644 index 000000000..b59023fbf --- /dev/null +++ b/kernel/kexec.c @@ -0,0 +1,640 @@ +/* + * kexec.c - kexec system call + * Copyright (C) 2002-2004 Eric Biederman + * + * This source code is licensed under the GNU General Public License, + * Version 2. See the file COPYING for more details. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* + * When kexec transitions to the new kernel there is a one-to-one + * mapping between physical and virtual addresses. On processors + * where you can disable the MMU this is trivial, and easy. For + * others it is still a simple predictable page table to setup. + * + * In that environment kexec copies the new kernel to its final + * resting place. This means I can only support memory whose + * physical address can fit in an unsigned long. In particular + * addresses where (pfn << PAGE_SHIFT) > ULONG_MAX cannot be handled. + * If the assembly stub has more restrictive requirements + * KEXEC_SOURCE_MEMORY_LIMIT and KEXEC_DEST_MEMORY_LIMIT can be + * defined more restrictively in . + * + * The code for the transition from the current kernel to the + * the new kernel is placed in the control_code_buffer, whose size + * is given by KEXEC_CONTROL_CODE_SIZE. 
In the best case only a single + * page of memory is necessary, but some architectures require more. + * Because this memory must be identity mapped in the transition from + * virtual to physical addresses it must live in the range + * 0 - TASK_SIZE, as only the user space mappings are arbitrarily + * modifiable. + * + * The assembly stub in the control code buffer is passed a linked list + * of descriptor pages detailing the source pages of the new kernel, + * and the destination addresses of those source pages. As this data + * structure is not used in the context of the current OS, it must + * be self-contained. + * + * The code has been made to work with highmem pages and will use a + * destination page in its final resting place (if it happens + * to allocate it). The end product of this is that most of the + * physical address space, and most of RAM can be used. + * + * Future directions include: + * - allocating a page table with the control code buffer identity + * mapped, to simplify machine_kexec and make kexec_on_panic more + * reliable. + */ + +/* + * KIMAGE_NO_DEST is an impossible destination address..., for + * allocating pages whose destination address we do not care about. + */ +#define KIMAGE_NO_DEST (-1UL) + +static int kimage_is_destination_range( + struct kimage *image, unsigned long start, unsigned long end); +static struct page *kimage_alloc_page(struct kimage *image, unsigned int gfp_mask, unsigned long dest); + + +static int kimage_alloc(struct kimage **rimage, + unsigned long nr_segments, struct kexec_segment *segments) +{ + int result; + struct kimage *image; + size_t segment_bytes; + unsigned long i; + + /* Allocate a controlling structure */ + result = -ENOMEM; + image = kmalloc(sizeof(*image), GFP_KERNEL); + if (!image) { + goto out; + } + memset(image, 0, sizeof(*image)); + image->head = 0; + image->entry = &image->head; + image->last_entry = &image->head; + + /* Initialize the list of control pages */ + INIT_LIST_HEAD(&image->control_pages); + + /* Initialize the list of destination pages */ + INIT_LIST_HEAD(&image->dest_pages); + + /* Initialize the list of unuseable pages */ + INIT_LIST_HEAD(&image->unuseable_pages); + + /* Read in the segments */ + image->nr_segments = nr_segments; + segment_bytes = nr_segments * sizeof*segments; + result = copy_from_user(image->segment, segments, segment_bytes); + if (result) + goto out; + + /* + * Verify we have good destination addresses. The caller is + * responsible for making certain we don't attempt to load + * the new image into invalid or reserved areas of RAM. This + * just verifies it is an address we can use. + */ + result = -EADDRNOTAVAIL; + for (i = 0; i < nr_segments; i++) { + unsigned long mend; + mend = ((unsigned long)(image->segment[i].mem)) + + image->segment[i].memsz; + if (mend >= KEXEC_DESTINATION_MEMORY_LIMIT) + goto out; + } + + /* + * Find a location for the control code buffer, and add it + * the vector of segments so that it's pages will also be + * counted as destination pages. 
+ */ + result = -ENOMEM; + image->control_code_page = kimage_alloc_control_pages(image, + get_order(KEXEC_CONTROL_CODE_SIZE)); + if (!image->control_code_page) { + printk(KERN_ERR "Could not allocate control_code_buffer\n"); + goto out; + } + + result = 0; + out: + if (result == 0) { + *rimage = image; + } else { + kfree(image); + } + return result; +} + +static int kimage_is_destination_range( + struct kimage *image, unsigned long start, unsigned long end) +{ + unsigned long i; + + for (i = 0; i < image->nr_segments; i++) { + unsigned long mstart, mend; + mstart = (unsigned long)image->segment[i].mem; + mend = mstart + image->segment[i].memsz; + if ((end > mstart) && (start < mend)) { + return 1; + } + } + return 0; +} + +static struct page *kimage_alloc_pages(unsigned int gfp_mask, unsigned int order) +{ + struct page *pages; + pages = alloc_pages(gfp_mask, order); + if (pages) { + unsigned int count, i; + pages->mapping = NULL; + pages->private = order; + count = 1 << order; + for(i = 0; i < count; i++) { + SetPageReserved(pages + i); + } + } + return pages; +} + +static void kimage_free_pages(struct page *page) +{ + unsigned int order, count, i; + order = page->private; + count = 1 << order; + for(i = 0; i < count; i++) { + ClearPageReserved(page + i); + } + __free_pages(page, order); +} + +static void kimage_free_page_list(struct list_head *list) +{ + struct list_head *pos, *next; + list_for_each_safe(pos, next, list) { + struct page *page; + + page = list_entry(pos, struct page, lru); + list_del(&page->lru); + + kimage_free_pages(page); + } +} + +struct page *kimage_alloc_control_pages(struct kimage *image, unsigned int order) +{ + /* Control pages are special, they are the intermediaries + * that are needed while we copy the rest of the pages + * to their final resting place. As such they must + * not conflict with either the destination addresses + * or memory the kernel is already using. + * + * The only case where we really need more than one of + * these are for architectures where we cannot disable + * the MMU and must instead generate an identity mapped + * page table for all of the memory. + * + * At worst this runs in O(N) of the image size. + */ + struct list_head extra_pages; + struct page *pages; + unsigned int count; + + count = 1 << order; + INIT_LIST_HEAD(&extra_pages); + + /* Loop while I can allocate a page and the page allocated + * is a destination page. + */ + do { + unsigned long pfn, epfn, addr, eaddr; + pages = kimage_alloc_pages(GFP_KERNEL, order); + if (!pages) + break; + pfn = page_to_pfn(pages); + epfn = pfn + count; + addr = pfn << PAGE_SHIFT; + eaddr = epfn << PAGE_SHIFT; + if ((epfn >= (KEXEC_CONTROL_MEMORY_LIMIT >> PAGE_SHIFT)) || + kimage_is_destination_range(image, addr, eaddr)) + { + list_add(&pages->lru, &extra_pages); + pages = NULL; + } + } while(!pages); + if (pages) { + /* Remember the allocated page... */ + list_add(&pages->lru, &image->control_pages); + + /* Because the page is already in it's destination + * location we will never allocate another page at + * that address. Therefore kimage_alloc_pages + * will not return it (again) and we don't need + * to give it an entry in image->segment[]. + */ + } + /* Deal with the destination pages I have inadvertently allocated. + * + * Ideally I would convert multi-page allocations into single + * page allocations, and add everyting to image->dest_pages. + * + * For now it is simpler to just free the pages. 
+ */ + kimage_free_page_list(&extra_pages); + return pages; + +} + +static int kimage_add_entry(struct kimage *image, kimage_entry_t entry) +{ + if (*image->entry != 0) { + image->entry++; + } + if (image->entry == image->last_entry) { + kimage_entry_t *ind_page; + struct page *page; + page = kimage_alloc_page(image, GFP_KERNEL, KIMAGE_NO_DEST); + if (!page) { + return -ENOMEM; + } + ind_page = page_address(page); + *image->entry = virt_to_phys(ind_page) | IND_INDIRECTION; + image->entry = ind_page; + image->last_entry = + ind_page + ((PAGE_SIZE/sizeof(kimage_entry_t)) - 1); + } + *image->entry = entry; + image->entry++; + *image->entry = 0; + return 0; +} + +static int kimage_set_destination( + struct kimage *image, unsigned long destination) +{ + int result; + + destination &= PAGE_MASK; + result = kimage_add_entry(image, destination | IND_DESTINATION); + if (result == 0) { + image->destination = destination; + } + return result; +} + + +static int kimage_add_page(struct kimage *image, unsigned long page) +{ + int result; + + page &= PAGE_MASK; + result = kimage_add_entry(image, page | IND_SOURCE); + if (result == 0) { + image->destination += PAGE_SIZE; + } + return result; +} + + +static void kimage_free_extra_pages(struct kimage *image) +{ + /* Walk through and free any extra destination pages I may have */ + kimage_free_page_list(&image->dest_pages); + + /* Walk through and free any unuseable pages I have cached */ + kimage_free_page_list(&image->unuseable_pages); + +} +static int kimage_terminate(struct kimage *image) +{ + int result; + + result = kimage_add_entry(image, IND_DONE); + if (result == 0) { + /* Point at the terminating element */ + image->entry--; + kimage_free_extra_pages(image); + } + return result; +} + +#define for_each_kimage_entry(image, ptr, entry) \ + for (ptr = &image->head; (entry = *ptr) && !(entry & IND_DONE); \ + ptr = (entry & IND_INDIRECTION)? \ + phys_to_virt((entry & PAGE_MASK)): ptr +1) + +static void kimage_free_entry(kimage_entry_t entry) +{ + struct page *page; + + page = pfn_to_page(entry >> PAGE_SHIFT); + kimage_free_pages(page); +} + +static void kimage_free(struct kimage *image) +{ + kimage_entry_t *ptr, entry; + kimage_entry_t ind = 0; + + if (!image) + return; + kimage_free_extra_pages(image); + for_each_kimage_entry(image, ptr, entry) { + if (entry & IND_INDIRECTION) { + /* Free the previous indirection page */ + if (ind & IND_INDIRECTION) { + kimage_free_entry(ind); + } + /* Save this indirection page until we are + * done with it. + */ + ind = entry; + } + else if (entry & IND_SOURCE) { + kimage_free_entry(entry); + } + } + /* Free the final indirection page */ + if (ind & IND_INDIRECTION) { + kimage_free_entry(ind); + } + + /* Handle any machine specific cleanup */ + machine_kexec_cleanup(image); + + /* Free the kexec control pages... 
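The entry list built by kimage_set_destination(), kimage_add_page() and kimage_terminate() above is simply a sequence of page-aligned physical addresses with flag bits in the low bits. A conceptual sketch of the list for a small load follows; the addresses are invented, and the real list lives in kernel-allocated pages chained via IND_INDIRECTION rather than in a static array:

    /* hypothetical entry list for a two-page segment destined for 1 MB */
    kimage_entry_t sketch[] = {
            0x00100000 | IND_DESTINATION, /* following sources land at 1 MB upward */
            0x07a41000 | IND_SOURCE,      /* page currently holding data for 0x00100000 */
            0x07a42000 | IND_SOURCE,      /* page currently holding data for 0x00101000 */
            /* an entry of the form <page> | IND_INDIRECTION continues the list
             * in a fresh page whenever the current one fills up */
            IND_DONE,                     /* appended by kimage_terminate() */
    };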
*/ + kimage_free_page_list(&image->control_pages); + kfree(image); +} + +static kimage_entry_t *kimage_dst_used(struct kimage *image, unsigned long page) +{ + kimage_entry_t *ptr, entry; + unsigned long destination = 0; + + for_each_kimage_entry(image, ptr, entry) { + if (entry & IND_DESTINATION) { + destination = entry & PAGE_MASK; + } + else if (entry & IND_SOURCE) { + if (page == destination) { + return ptr; + } + destination += PAGE_SIZE; + } + } + return 0; +} + +static struct page *kimage_alloc_page(struct kimage *image, unsigned int gfp_mask, unsigned long destination) +{ + /* + * Here we implement safeguards to ensure that a source page + * is not copied to its destination page before the data on + * the destination page is no longer useful. + * + * To do this we maintain the invariant that a source page is + * either its own destination page, or it is not a + * destination page at all. + * + * That is slightly stronger than required, but the proof + * that no problems will not occur is trivial, and the + * implementation is simply to verify. + * + * When allocating all pages normally this algorithm will run + * in O(N) time, but in the worst case it will run in O(N^2) + * time. If the runtime is a problem the data structures can + * be fixed. + */ + struct page *page; + unsigned long addr; + + /* + * Walk through the list of destination pages, and see if I + * have a match. + */ + list_for_each_entry(page, &image->dest_pages, lru) { + addr = page_to_pfn(page) << PAGE_SHIFT; + if (addr == destination) { + list_del(&page->lru); + return page; + } + } + page = NULL; + while (1) { + kimage_entry_t *old; + + /* Allocate a page, if we run out of memory give up */ + page = kimage_alloc_pages(gfp_mask, 0); + if (!page) { + return 0; + } + /* If the page cannot be used file it away */ + if (page_to_pfn(page) > (KEXEC_SOURCE_MEMORY_LIMIT >> PAGE_SHIFT)) { + list_add(&page->lru, &image->unuseable_pages); + continue; + } + addr = page_to_pfn(page) << PAGE_SHIFT; + + /* If it is the destination page we want use it */ + if (addr == destination) + break; + + /* If the page is not a destination page use it */ + if (!kimage_is_destination_range(image, addr, addr + PAGE_SIZE)) + break; + + /* + * I know that the page is someones destination page. + * See if there is already a source page for this + * destination page. And if so swap the source pages. + */ + old = kimage_dst_used(image, addr); + if (old) { + /* If so move it */ + unsigned long old_addr; + struct page *old_page; + + old_addr = *old & PAGE_MASK; + old_page = pfn_to_page(old_addr >> PAGE_SHIFT); + copy_highpage(page, old_page); + *old = addr | (*old & ~PAGE_MASK); + + /* The old page I have found cannot be a + * destination page, so return it. + */ + addr = old_addr; + page = old_page; + break; + } + else { + /* Place the page on the destination list I + * will use it later. 
+ */ + list_add(&page->lru, &image->dest_pages); + } + } + return page; +} + +static int kimage_load_segment(struct kimage *image, + struct kexec_segment *segment) +{ + unsigned long mstart; + int result; + unsigned long offset; + unsigned long offset_end; + unsigned char *buf; + + result = 0; + buf = segment->buf; + mstart = (unsigned long)segment->mem; + + offset_end = segment->memsz; + + result = kimage_set_destination(image, mstart); + if (result < 0) { + goto out; + } + for (offset = 0; offset < segment->memsz; offset += PAGE_SIZE) { + struct page *page; + char *ptr; + size_t size, leader; + page = kimage_alloc_page(image, GFP_HIGHUSER, mstart + offset); + if (page == 0) { + result = -ENOMEM; + goto out; + } + result = kimage_add_page(image, page_to_pfn(page) << PAGE_SHIFT); + if (result < 0) { + goto out; + } + ptr = kmap(page); + if (segment->bufsz < offset) { + /* We are past the end zero the whole page */ + memset(ptr, 0, PAGE_SIZE); + kunmap(page); + continue; + } + size = PAGE_SIZE; + leader = 0; + if ((offset == 0)) { + leader = mstart & ~PAGE_MASK; + } + if (leader) { + /* We are on the first page zero the unused portion */ + memset(ptr, 0, leader); + size -= leader; + ptr += leader; + } + if (size > (segment->bufsz - offset)) { + size = segment->bufsz - offset; + } + if (size < (PAGE_SIZE - leader)) { + /* zero the trailing part of the page */ + memset(ptr + size, 0, (PAGE_SIZE - leader) - size); + } + result = copy_from_user(ptr, buf + offset, size); + kunmap(page); + if (result) { + result = (result < 0) ? result : -EIO; + goto out; + } + } + out: + return result; +} + +/* + * Exec Kernel system call: for obvious reasons only root may call it. + * + * This call breaks up into three pieces. + * - A generic part which loads the new kernel from the current + * address space, and very carefully places the data in the + * allocated pages. + * + * - A generic part that interacts with the kernel and tells all of + * the devices to shut down. Preventing on-going dmas, and placing + * the devices in a consistent state so a later kernel can + * reinitialize them. + * + * - A machine specific part that includes the syscall number + * and the copies the image to it's final destination. And + * jumps into the image at entry. + * + * kexec does not sync, or unmount filesystems so if you need + * that to happen you need to do that yourself. + */ +struct kimage *kexec_image = NULL; + +asmlinkage long sys_kexec_load(unsigned long entry, unsigned long nr_segments, + struct kexec_segment *segments, unsigned long flags) +{ + struct kimage *image; + int result; + + /* We only trust the superuser with rebooting the system. */ + if (!capable(CAP_SYS_BOOT)) + return -EPERM; + + /* + * In case we need just a little bit of special behavior for + * reboot on panic. 
+ */ + if (flags != 0) + return -EINVAL; + + if (nr_segments > KEXEC_SEGMENT_MAX) + return -EINVAL; + + image = NULL; + result = 0; + + if (nr_segments > 0) { + unsigned long i; + result = kimage_alloc(&image, nr_segments, segments); + if (result) { + goto out; + } + result = machine_kexec_prepare(image); + if (result) { + goto out; + } + image->start = entry; + for (i = 0; i < nr_segments; i++) { + result = kimage_load_segment(image, &image->segment[i]); + if (result) { + goto out; + } + } + result = kimage_terminate(image); + if (result) { + goto out; + } + } + + image = xchg(&kexec_image, image); + + out: + kimage_free(image); + return result; +} diff --git a/kernel/sched.c b/kernel/sched.c index 20b09215e..42af615a2 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -45,6 +45,8 @@ #include #include +#include +#include #ifdef CONFIG_NUMA #define cpu_to_node_mask(cpu) node_to_cpumask(cpu_to_node(cpu)) @@ -205,8 +207,6 @@ unsigned int task_timeslice(task_t *p) */ typedef struct runqueue runqueue_t; -#include -#include /* * This is the main, per-CPU runqueue data structure. @@ -227,17 +227,19 @@ struct runqueue { unsigned long cpu_load; #endif unsigned long long nr_switches, nr_preempt; - unsigned long expired_timestamp, nr_uninterruptible; + unsigned long nr_uninterruptible; unsigned long long timestamp_last_tick; task_t *curr, *idle; struct mm_struct *prev_mm; #ifdef CONFIG_CKRM_CPU_SCHEDULE struct classqueue_struct classqueue; ckrm_load_t ckrm_load; + ckrm_lrq_t dflt_lrq; /* local runqueue of the default class */ #else prio_array_t *active, *expired, arrays[2]; -#endif + unsigned long expired_timestamp; int best_expired_prio; +#endif atomic_t nr_iowait; #ifdef CONFIG_SMP @@ -320,10 +322,72 @@ static inline void rq_unlock(runqueue_t *rq) spin_unlock_irq(&rq->lock); } +static inline void idle_balance(int this_cpu, runqueue_t *this_rq); +static inline void wake_sleeping_dependent(int cpu, runqueue_t *rq); + #ifdef CONFIG_CKRM_CPU_SCHEDULE + +#define ckrm_rq_cpu_disabled(rq) (!rq->classqueue.enabled) +#define ckrm_rq_cpu_enabled(rq) ( rq->classqueue.enabled) + +static inline void class_enqueue_task(struct task_struct *p, + prio_array_t * array) +{ + ckrm_lrq_t *lrq; + int effective_prio; + + if (ckrm_rq_cpu_disabled(task_rq(p))) + return; + + lrq = get_task_lrq(p); + // BUG_ON(lrq==NULL); + + cpu_demand_event(&p->demand_stat,CPU_DEMAND_ENQUEUE,0); + lrq->lrq_load += task_load(p); + + if ((p->prio < lrq->top_priority) && (array == lrq->active)) + set_top_priority(lrq, p->prio); + + if (! 
cls_in_classqueue(&lrq->classqueue_linkobj)) { + cpu_demand_event(get_task_lrq_stat(p),CPU_DEMAND_ENQUEUE,0); + effective_prio = get_effective_prio(lrq); + classqueue_enqueue(lrq->classqueue, &lrq->classqueue_linkobj, + effective_prio); + } + +} + +static inline void class_dequeue_task(struct task_struct *p, + prio_array_t * array) +{ + ckrm_lrq_t *lrq; + unsigned long load; + + if (ckrm_rq_cpu_disabled(task_rq(p))) + return; + + lrq = get_task_lrq(p); + load = task_load(p); + + // BUG_ON(lrq->lrq_load < load); + + lrq->lrq_load -= load; + + cpu_demand_event(&p->demand_stat,CPU_DEMAND_DEQUEUE,0); + + if ((array == lrq->active) && (p->prio == lrq->top_priority) + && list_empty(&(array->queue[p->prio]))) + set_top_priority(lrq,find_next_bit(array->bitmap, MAX_PRIO, + p->prio)); +} + static inline ckrm_lrq_t *rq_get_next_class(struct runqueue *rq) { - cq_node_t *node = classqueue_get_head(&rq->classqueue); + cq_node_t *node; + + if (ckrm_rq_cpu_disabled(rq)) + return &rq->dflt_lrq; + node = classqueue_get_head(&rq->classqueue); return ((node) ? class_list_entry(node) : NULL); } @@ -342,51 +406,189 @@ CVT_t get_local_cur_cvt(int cpu) return 0; } -static inline struct task_struct * rq_get_next_task(struct runqueue* rq) +static inline struct task_struct * rq_get_next_task(struct runqueue* rq, + int cpu) { prio_array_t *array; struct task_struct *next; ckrm_lrq_t *queue; int idx; - int cpu = smp_processor_id(); - // it is guaranteed be the ( rq->nr_running > 0 ) check in - // schedule that a task will be found. + if (ckrm_rq_cpu_disabled(rq)) { + /* original code from schedule(void) + * see also code in non CKRM configuration + */ + struct list_head *array_queue; + ckrm_lrq_t *lrq = get_ckrm_lrq(get_default_cpu_class(),cpu); + + if (unlikely(!rq->nr_running)) { + idle_balance(cpu, rq); + if (!rq->nr_running) { + rq->dflt_lrq.expired_timestamp = 0; + wake_sleeping_dependent(cpu, rq); + return NULL; + } + } + + array = lrq->active; + if (unlikely(!array->nr_active)) { + /* + * Switch the active and expired arrays. + */ + lrq->active = lrq->expired; + lrq->expired = array; + array = lrq->active; + lrq->expired_timestamp = 0; + lrq->best_expired_prio = MAX_PRIO; + } + + idx = sched_find_first_bit(array->bitmap); + array_queue = array->queue + idx; + next = list_entry(array_queue->next, task_t, run_list); + return next; + } + /*-- CKRM SCHEDULER --*/ + retry_next_class: + /* we can't use (rq->nr_running == 0) to declare idleness + * first we have to make sure that the class runqueue is properly + * processed. This is due to two facts/requirements: + * (a) when the last task is removed form an lrq we do not remove + * the lrq from the class runqueue. As a result the lrq is + * selected again and we can perform necessary + * expired switches. 
+ * (b) perform outstanding expired switches + * + */ + queue = rq_get_next_class(rq); - // BUG_ON( !queue ); + if (unlikely(queue == NULL)) { + idle_balance(cpu, rq); + if (!rq->nr_running) { + rq->dflt_lrq.expired_timestamp = 0; + wake_sleeping_dependent(cpu, rq); + return NULL; + } + goto retry_next_class; // try again + } array = queue->active; if (unlikely(!array->nr_active)) { queue->active = queue->expired; queue->expired = array; + array = queue->active; queue->expired_timestamp = 0; - if (queue->active->nr_active) + if (array->nr_active) set_top_priority(queue, - find_first_bit(queue->active->bitmap, MAX_PRIO)); + find_first_bit(array->bitmap,MAX_PRIO)); else { + /* since we do not dequeue a lrq when it becomes empty + * but rely on the switching mechanism, we must dequeue + * at this point + */ classqueue_dequeue(queue->classqueue, &queue->classqueue_linkobj); - cpu_demand_event(get_rq_local_stat(queue,cpu),CPU_DEMAND_DEQUEUE,0); + cpu_demand_event(get_rq_local_stat(queue,cpu), + CPU_DEMAND_DEQUEUE,0); } goto retry_next_class; } - // BUG_ON(!array->nr_active); idx = queue->top_priority; - // BUG_ON (idx == MAX_PRIO); + //BUG_ON(!array->nr_active); + //BUG_ON(idx == MAX_PRIO); + //BUG_ON(list_empty(array->queue+idx)); next = task_list_entry(array->queue[idx].next); return next; } + +static inline void ckrm_account_task(struct runqueue* rq, + struct task_struct *prev, + unsigned long long now) +{ + if ((prev != rq->idle) && ckrm_rq_cpu_enabled(rq) ) { + unsigned long long run = now - prev->timestamp; + ckrm_lrq_t * lrq = get_task_lrq(prev); + + lrq->lrq_load -= task_load(prev); + cpu_demand_event(&prev->demand_stat,CPU_DEMAND_DESCHEDULE,run); + lrq->lrq_load += task_load(prev); + + cpu_demand_event(get_task_lrq_stat(prev),CPU_DEMAND_DESCHEDULE,run); + update_local_cvt(prev, run); + } + +} + +#ifdef CONFIG_SMP +#define COND_SMP(dflt,cond) (cond) +#else +#define COND_SMP(dflt,cond) (dflt) +#endif + +static inline void ckrm_sched_tick(unsigned long j,int this_cpu, int idle, + runqueue_t *rq) +{ + /* first determine whether we have to do anything + * without grabing the global lock + */ + + int sample, update; + +#ifdef __SIMULATOR__ + if ((this_cpu == 0) && (j % 1000) == 0) { + ckrm_cpu_monitor(1); + } +#endif + + if (ckrm_rq_cpu_disabled(rq)) + return; + + update = (j % CVT_UPDATE_TICK); + sample = COND_SMP(1,(j % CPU_PID_CTRL_TICK)); + +// avoid taking the global class_list lock on every tick + if (likely(update && sample)) + return; // nothing to be done; + + read_lock(&class_list_lock); + +#ifdef CONFIG_SMP + if (sample==0) { + ckrm_load_sample(rq_ckrm_load(rq),this_cpu); + } +#endif + + if (update==0) { + classqueue_update_base(get_cpu_classqueue(this_cpu)); + update_class_cputime(this_cpu,idle); + // occasionally we need to call the weight adjustment + // for SMP systems + if (COND_SMP(0,(this_cpu==0))) + adjust_local_weight(); + } + + read_unlock(&class_list_lock); +} + #else /*! 
CONFIG_CKRM_CPU_SCHEDULE*/ -static inline struct task_struct * rq_get_next_task(struct runqueue* rq) +static inline struct task_struct * rq_get_next_task(struct runqueue* rq, + int cpu) { prio_array_t *array; struct list_head *queue; int idx; + if (unlikely(!rq->nr_running)) { + idle_balance(cpu, rq); + if (!rq->nr_running) { + rq->expired_timestamp = 0; + wake_sleeping_dependent(cpu, rq); + return NULL; + } + } array = rq->active; if (unlikely(!array->nr_active)) { /* @@ -404,11 +606,17 @@ static inline struct task_struct * rq_get_next_task(struct runqueue* rq) return list_entry(queue->next, task_t, run_list); } -static inline void class_enqueue_task(struct task_struct* p, prio_array_t *array) { } -static inline void class_dequeue_task(struct task_struct* p, prio_array_t *array) { } +static inline void class_enqueue_task(struct task_struct* p, + prio_array_t *array) { } +static inline void class_dequeue_task(struct task_struct* p, + prio_array_t *array) { } static inline void init_cpu_classes(void) { } +static inline void ckrm_sched_tick(int j,int this_cpu,int idle, void* arg) {} +static inline void ckrm_account_task(struct runqueue* rq, struct + task_struct *prev, + unsigned long long now) { } #define rq_ckrm_load(rq) NULL -static inline void ckrm_sched_tick(int j,int this_cpu,void* name) {} + #endif /* CONFIG_CKRM_CPU_SCHEDULE */ /* @@ -1558,261 +1766,129 @@ int can_migrate_task(task_t *p, runqueue_t *rq, int this_cpu, return 1; } -#ifdef CONFIG_CKRM_CPU_SCHEDULE -static inline int ckrm_preferred_task(task_t *tmp,long min, long max, - int phase, enum idle_type idle) -{ - long pressure = task_load(tmp); - - if (pressure > max) - return 0; - - if ((idle == NOT_IDLE) && ! phase && (pressure <= min)) - return 0; - return 1; -} - /* - * move tasks for a specic local class - * return number of tasks pulled + * move_tasks tries to move up to max_nr_move tasks from busiest to this_rq, + * as part of a balancing operation within "domain". Returns the number of + * tasks moved. + * + * Called with both runqueues locked. */ -static inline int ckrm_cls_move_tasks(ckrm_lrq_t* src_lrq,ckrm_lrq_t*dst_lrq, - runqueue_t *this_rq, - runqueue_t *busiest, - struct sched_domain *sd, - int this_cpu, - enum idle_type idle, - long* pressure_imbalance) +static int move_tasks(runqueue_t *this_rq, int this_cpu, runqueue_t *busiest, + unsigned long max_nr_move, struct sched_domain *sd, + enum idle_type idle) { prio_array_t *array, *dst_array; struct list_head *head, *curr; + int idx, pulled = 0; task_t *tmp; - int idx; - int pulled = 0; - int phase = -1; - long pressure_min, pressure_max; - /*hzheng: magic : 90% balance is enough*/ - long balance_min = *pressure_imbalance / 10; -/* - * we don't want to migrate tasks that will reverse the balance - * or the tasks that make too small difference - */ -#define CKRM_BALANCE_MAX_RATIO 100 -#define CKRM_BALANCE_MIN_RATIO 1 - start: - phase ++; +#if CONFIG_CKRM_CPU_SCHEDULE + /* need to distinguish between the runqueues and the class + * local runqueues. + * we know we can get here only if the dflt class is present + */ + ckrm_lrq_t *l_this_rq = &this_rq->dflt_lrq; + ckrm_lrq_t *l_busiest = &busiest->dflt_lrq; +#else +#define l_busiest busiest +#define l_this_rq this_rq +#endif + + if (max_nr_move <= 0 || busiest->nr_running <= 1) + goto out; + /* * We first consider expired tasks. Those will likely not be * executed in the near future, and they are most likely to * be cache-cold, thus switching CPUs has the least effect * on them. 
*/ - if (src_lrq->expired->nr_active) { - array = src_lrq->expired; - dst_array = dst_lrq->expired; + if (l_busiest->expired->nr_active) { + array = l_busiest->expired; + dst_array = l_this_rq->expired; } else { - array = src_lrq->active; - dst_array = dst_lrq->active; + array = l_busiest->active; + dst_array = l_this_rq->active; } - - new_array: + +new_array: /* Start searching at priority 0: */ idx = 0; - skip_bitmap: +skip_bitmap: if (!idx) idx = sched_find_first_bit(array->bitmap); else idx = find_next_bit(array->bitmap, MAX_PRIO, idx); if (idx >= MAX_PRIO) { - if (array == src_lrq->expired && src_lrq->active->nr_active) { - array = src_lrq->active; - dst_array = dst_lrq->active; + if (array == l_busiest->expired && l_busiest->active->nr_active) { + array = l_busiest->active; + dst_array = l_this_rq->active; goto new_array; } - if ((! phase) && (! pulled) && (idle != IDLE)) - goto start; //try again - else - goto out; //finished search for this lrq + goto out; } - + head = array->queue + idx; curr = head->prev; - skip_queue: +skip_queue: tmp = list_entry(curr, task_t, run_list); - + curr = curr->prev; - + if (!can_migrate_task(tmp, busiest, this_cpu, sd, idle)) { if (curr != head) goto skip_queue; idx++; goto skip_bitmap; } + pull_task(busiest, array, tmp, this_rq, dst_array, this_cpu); + pulled++; - pressure_min = *pressure_imbalance * CKRM_BALANCE_MIN_RATIO/100; - pressure_max = *pressure_imbalance * CKRM_BALANCE_MAX_RATIO/100; - /* - * skip the tasks that will reverse the balance too much - */ - if (ckrm_preferred_task(tmp,pressure_min,pressure_max,phase,idle)) { - *pressure_imbalance -= task_load(tmp); - pull_task(busiest, array, tmp, - this_rq, dst_array, this_cpu); - pulled++; - - if (*pressure_imbalance <= balance_min) - goto out; + /* We only want to steal up to the prescribed number of tasks. */ + if (pulled < max_nr_move) { + if (curr != head) + goto skip_queue; + idx++; + goto skip_bitmap; } - - if (curr != head) - goto skip_queue; - idx++; - goto skip_bitmap; - out: +out: return pulled; } -static inline long ckrm_rq_imbalance(runqueue_t *this_rq,runqueue_t *dst_rq) -{ - long imbalance; - /* - * make sure after balance, imbalance' > - imbalance/2 - * we don't want the imbalance be reversed too much - */ - imbalance = pid_get_pressure(rq_ckrm_load(dst_rq),0) - - pid_get_pressure(rq_ckrm_load(this_rq),1); - imbalance /= 2; - return imbalance; -} - /* - * try to balance the two runqueues - * - * Called with both runqueues locked. - * if move_tasks is called, it will try to move at least one task over + * find_busiest_group finds and returns the busiest CPU group within the + * domain. It calculates and returns the number of tasks which should be + * moved to restore balance via the imbalance parameter. 
*/ -static int move_tasks(runqueue_t *this_rq, int this_cpu, runqueue_t *busiest, - unsigned long max_nr_move, struct sched_domain *sd, - enum idle_type idle) +static struct sched_group * +find_busiest_group(struct sched_domain *sd, int this_cpu, + unsigned long *imbalance, enum idle_type idle) { - struct ckrm_cpu_class *clsptr,*vip_cls = NULL; - ckrm_lrq_t* src_lrq,*dst_lrq; - long pressure_imbalance, pressure_imbalance_old; - int src_cpu = task_cpu(busiest->curr); - struct list_head *list; - int pulled = 0; - long imbalance; - - imbalance = ckrm_rq_imbalance(this_rq,busiest); + struct sched_group *busiest = NULL, *this = NULL, *group = sd->groups; + unsigned long max_load, avg_load, total_load, this_load, total_pwr; - if ((idle == NOT_IDLE && imbalance <= 0) || busiest->nr_running <= 1) - goto out; + max_load = this_load = total_load = total_pwr = 0; - //try to find the vip class - list_for_each_entry(clsptr,&active_cpu_classes,links) { - src_lrq = get_ckrm_lrq(clsptr,src_cpu); + do { + cpumask_t tmp; + unsigned long load; + int local_group; + int i, nr_cpus = 0; - if (! lrq_nr_running(src_lrq)) - continue; - - if (! vip_cls || cpu_class_weight(vip_cls) < cpu_class_weight(clsptr) ) - { - vip_cls = clsptr; - } - } - - /* - * do search from the most significant class - * hopefully, less tasks will be migrated this way - */ - clsptr = vip_cls; - - move_class: - if (! clsptr) - goto out; - - - src_lrq = get_ckrm_lrq(clsptr,src_cpu); - if (! lrq_nr_running(src_lrq)) - goto other_class; - - dst_lrq = get_ckrm_lrq(clsptr,this_cpu); - - //how much pressure for this class should be transferred - pressure_imbalance = src_lrq->lrq_load * imbalance/src_lrq->local_weight; - if (pulled && ! pressure_imbalance) - goto other_class; - - pressure_imbalance_old = pressure_imbalance; - - //move tasks - pulled += - ckrm_cls_move_tasks(src_lrq,dst_lrq, - this_rq, - busiest, - sd,this_cpu,idle, - &pressure_imbalance); - - /* - * hzheng: 2 is another magic number - * stop balancing if the imbalance is less than 25% of the orig - */ - if (pressure_imbalance <= (pressure_imbalance_old >> 2)) - goto out; - - //update imbalance - imbalance *= pressure_imbalance / pressure_imbalance_old; - other_class: - //who is next? - list = clsptr->links.next; - if (list == &active_cpu_classes) - list = list->next; - clsptr = list_entry(list, typeof(*clsptr), links); - if (clsptr != vip_cls) - goto move_class; - out: - return pulled; -} - -/** - * ckrm_check_balance - is load balancing necessary? 
- * return 0 if load balancing is not necessary - * otherwise return the average load of the system - * also, update nr_group - * - * heuristics: - * no load balancing if it's load is over average - * no load balancing if it's load is far more than the min - * task: - * read the status of all the runqueues - */ -static unsigned long ckrm_check_balance(struct sched_domain *sd, int this_cpu, - enum idle_type idle, int* nr_group) -{ - struct sched_group *group = sd->groups; - unsigned long min_load, max_load, avg_load; - unsigned long total_load, this_load, total_pwr; - - max_load = this_load = total_load = total_pwr = 0; - min_load = 0xFFFFFFFF; - *nr_group = 0; - - do { - cpumask_t tmp; - unsigned long load; - int local_group; - int i, nr_cpus = 0; + local_group = cpu_isset(this_cpu, group->cpumask); /* Tally up the load of all CPUs in the group */ + avg_load = 0; cpus_and(tmp, group->cpumask, cpu_online_map); if (unlikely(cpus_empty(tmp))) goto nextgroup; - avg_load = 0; - local_group = cpu_isset(this_cpu, group->cpumask); - for_each_cpu_mask(i, tmp) { - load = pid_get_pressure(rq_ckrm_load(cpu_rq(i)),local_group); + /* Bias balancing toward cpus of our domain */ + if (local_group) + load = target_load(i); + else + load = source_load(i); + nr_cpus++; avg_load += load; } @@ -1828,386 +1904,86 @@ static unsigned long ckrm_check_balance(struct sched_domain *sd, int this_cpu, if (local_group) { this_load = avg_load; + this = group; goto nextgroup; } else if (avg_load > max_load) { max_load = avg_load; - } - if (avg_load < min_load) { - min_load = avg_load; + busiest = group; } nextgroup: group = group->next; - *nr_group = *nr_group + 1; } while (group != sd->groups); - if (!max_load || this_load >= max_load) + if (!busiest || this_load >= max_load) goto out_balanced; avg_load = (SCHED_LOAD_SCALE * total_load) / total_pwr; - /* hzheng: debugging: 105 is a magic number - * 100*max_load <= sd->imbalance_pct*this_load) - * should use imbalance_pct instead - */ - if (this_load > avg_load - || 100*max_load < 105*this_load - || 100*min_load < 70*this_load - ) + if (this_load >= avg_load || + 100*max_load <= sd->imbalance_pct*this_load) goto out_balanced; - return avg_load; - out_balanced: - return 0; -} - -/** - * any group that has above average load is considered busy - * find the busiest queue from any of busy group - */ -static runqueue_t * -ckrm_find_busy_queue(struct sched_domain *sd, int this_cpu, - unsigned long avg_load, enum idle_type idle, - int nr_group) -{ - struct sched_group *group; - runqueue_t * busiest=NULL; - unsigned long rand; - - group = sd->groups; - rand = get_ckrm_rand(nr_group); - nr_group = 0; + /* + * We're trying to get all the cpus to the average_load, so we don't + * want to push ourselves above the average load, nor do we wish to + * reduce the max loaded cpu below the average load, as either of these + * actions would just result in more rebalancing later, and ping-pong + * tasks around. Thus we look for the minimum possible imbalance. + * Negative imbalances (*we* are more loaded than anyone else) will + * be counted as no imbalance for these purposes -- we can't fix that + * by pulling tasks to us. Be careful of negative numbers as they'll + * appear as very large values with unsigned longs. 
+ */ + *imbalance = min(max_load - avg_load, avg_load - this_load); - do { - unsigned long load,total_load,max_load; - cpumask_t tmp; - int i; - runqueue_t * grp_busiest; + /* How much load to actually move to equalise the imbalance */ + *imbalance = (*imbalance * min(busiest->cpu_power, this->cpu_power)) + / SCHED_LOAD_SCALE; - cpus_and(tmp, group->cpumask, cpu_online_map); - if (unlikely(cpus_empty(tmp))) - goto find_nextgroup; + if (*imbalance < SCHED_LOAD_SCALE - 1) { + unsigned long pwr_now = 0, pwr_move = 0; + unsigned long tmp; - total_load = 0; - max_load = 0; - grp_busiest = NULL; - for_each_cpu_mask(i, tmp) { - load = pid_get_pressure(rq_ckrm_load(cpu_rq(i)),0); - total_load += load; - if (load > max_load) { - max_load = load; - grp_busiest = cpu_rq(i); - } + if (max_load - this_load >= SCHED_LOAD_SCALE*2) { + *imbalance = 1; + return busiest; } - total_load = (total_load * SCHED_LOAD_SCALE) / group->cpu_power; - if (total_load > avg_load) { - busiest = grp_busiest; - if (nr_group >= rand) - break; - } - find_nextgroup: - group = group->next; - nr_group ++; - } while (group != sd->groups); + /* + * OK, we don't have enough imbalance to justify moving tasks, + * however we may be able to increase total CPU power used by + * moving them. + */ - return busiest; -} + pwr_now += busiest->cpu_power*min(SCHED_LOAD_SCALE, max_load); + pwr_now += this->cpu_power*min(SCHED_LOAD_SCALE, this_load); + pwr_now /= SCHED_LOAD_SCALE; -/** - * load_balance - pressure based load balancing algorithm used by ckrm - */ -static int ckrm_load_balance(int this_cpu, runqueue_t *this_rq, - struct sched_domain *sd, enum idle_type idle) -{ - runqueue_t *busiest; - unsigned long avg_load; - int nr_moved,nr_group; + /* Amount of load we'd subtract */ + tmp = SCHED_LOAD_SCALE*SCHED_LOAD_SCALE/busiest->cpu_power; + if (max_load > tmp) + pwr_move += busiest->cpu_power*min(SCHED_LOAD_SCALE, + max_load - tmp); - avg_load = ckrm_check_balance(sd, this_cpu, idle, &nr_group); - if (! avg_load) - goto out_balanced; + /* Amount of load we'd add */ + tmp = SCHED_LOAD_SCALE*SCHED_LOAD_SCALE/this->cpu_power; + if (max_load < tmp) + tmp = max_load; + pwr_move += this->cpu_power*min(SCHED_LOAD_SCALE, this_load + tmp); + pwr_move /= SCHED_LOAD_SCALE; - busiest = ckrm_find_busy_queue(sd,this_cpu,avg_load,idle,nr_group); - if (! busiest) - goto out_balanced; - /* - * This should be "impossible", but since load - * balancing is inherently racy and statistical, - * it could happen in theory. - */ - if (unlikely(busiest == this_rq)) { - WARN_ON(1); - goto out_balanced; - } + /* Move if we gain another 8th of a CPU worth of throughput */ + if (pwr_move < pwr_now + SCHED_LOAD_SCALE / 8) + goto out_balanced; - nr_moved = 0; - if (busiest->nr_running > 1) { - /* - * Attempt to move tasks. If find_busiest_group has found - * an imbalance but busiest->nr_running <= 1, the group is - * still unbalanced. nr_moved simply stays zero, so it is - * correctly treated as an imbalance. 
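
The comment and arithmetic above describe how find_busiest_group() sizes the amount of load to pull: take the smaller of the two distances to the average, weight it by the weaker group's cpu_power, and only then strip the SCHED_LOAD_SCALE factor. A minimal user-space sketch of that arithmetic follows; the group loads, and SCHED_LOAD_SCALE being 128, are example assumptions for illustration, not values taken from this patch.

/*
 * Minimal sketch (user-space C, not kernel code) of the imbalance
 * calculation in find_busiest_group().  SCHED_LOAD_SCALE is assumed to
 * be 128 here and the group loads are invented example values.
 */
#include <stdio.h>

#define SCHED_LOAD_SCALE 128UL

static unsigned long min_ul(unsigned long a, unsigned long b)
{
        return a < b ? a : b;
}

int main(void)
{
        /* per-group loads already scaled by cpu_power, as in the kernel */
        unsigned long this_load = 2 * SCHED_LOAD_SCALE; /* our group      */
        unsigned long max_load  = 5 * SCHED_LOAD_SCALE; /* busiest group  */
        unsigned long avg_load  = 3 * SCHED_LOAD_SCALE; /* system average */
        unsigned long this_power = 128, busiest_power = 128;
        unsigned long imbalance;

        /*
         * Pull only enough to reach the average from both sides; the
         * caller has already bailed out when this_load >= avg_load,
         * which keeps these unsigned subtractions from wrapping.
         */
        imbalance = min_ul(max_load - avg_load, avg_load - this_load);

        /* weight by the weaker group's cpu_power */
        imbalance = (imbalance * min_ul(busiest_power, this_power))
                        / SCHED_LOAD_SCALE;

        if (imbalance >= SCHED_LOAD_SCALE - 1)
                /* big enough: drop the scale factor, rounding down */
                imbalance = (imbalance + 1) / SCHED_LOAD_SCALE;
        else
                /* small-imbalance path: kernel moves one task or balks */
                imbalance = 1;

        printf("ask move_tasks() for about %lu task(s)\n", imbalance);
        return 0;
}

With these example loads the routine ends up requesting roughly one task's worth of load, matching the rounding-down behaviour noted in the hunk.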
- */ - double_lock_balance(this_rq, busiest); - nr_moved = move_tasks(this_rq, this_cpu, busiest, - 0,sd, idle); - spin_unlock(&busiest->lock); - if (nr_moved) { - adjust_local_weight(); - } + *imbalance = 1; + return busiest; } - if (!nr_moved) - sd->nr_balance_failed ++; - else - sd->nr_balance_failed = 0; - - /* We were unbalanced, so reset the balancing interval */ - sd->balance_interval = sd->min_interval; + /* Get rid of the scaling factor, rounding down as we divide */ + *imbalance = (*imbalance + 1) / SCHED_LOAD_SCALE; - return nr_moved; - -out_balanced: - /* tune up the balancing interval */ - if (sd->balance_interval < sd->max_interval) - sd->balance_interval *= 2; - - return 0; -} - -/* - * this_rq->lock is already held - */ -static inline int load_balance_newidle(int this_cpu, runqueue_t *this_rq, - struct sched_domain *sd) -{ - int ret; - read_lock(&class_list_lock); - ret = ckrm_load_balance(this_cpu,this_rq,sd,NEWLY_IDLE); - read_unlock(&class_list_lock); - return ret; -} - -static inline int load_balance(int this_cpu, runqueue_t *this_rq, - struct sched_domain *sd, enum idle_type idle) -{ - int ret; - - spin_lock(&this_rq->lock); - read_lock(&class_list_lock); - ret= ckrm_load_balance(this_cpu,this_rq,sd,NEWLY_IDLE); - read_unlock(&class_list_lock); - spin_unlock(&this_rq->lock); - return ret; -} -#else /*! CONFIG_CKRM_CPU_SCHEDULE */ -/* - * move_tasks tries to move up to max_nr_move tasks from busiest to this_rq, - * as part of a balancing operation within "domain". Returns the number of - * tasks moved. - * - * Called with both runqueues locked. - */ -static int move_tasks(runqueue_t *this_rq, int this_cpu, runqueue_t *busiest, - unsigned long max_nr_move, struct sched_domain *sd, - enum idle_type idle) -{ - prio_array_t *array, *dst_array; - struct list_head *head, *curr; - int idx, pulled = 0; - task_t *tmp; - - if (max_nr_move <= 0 || busiest->nr_running <= 1) - goto out; - - /* - * We first consider expired tasks. Those will likely not be - * executed in the near future, and they are most likely to - * be cache-cold, thus switching CPUs has the least effect - * on them. - */ - if (busiest->expired->nr_active) { - array = busiest->expired; - dst_array = this_rq->expired; - } else { - array = busiest->active; - dst_array = this_rq->active; - } - -new_array: - /* Start searching at priority 0: */ - idx = 0; -skip_bitmap: - if (!idx) - idx = sched_find_first_bit(array->bitmap); - else - idx = find_next_bit(array->bitmap, MAX_PRIO, idx); - if (idx >= MAX_PRIO) { - if (array == busiest->expired && busiest->active->nr_active) { - array = busiest->active; - dst_array = this_rq->active; - goto new_array; - } - goto out; - } - - head = array->queue + idx; - curr = head->prev; -skip_queue: - tmp = list_entry(curr, task_t, run_list); - - curr = curr->prev; - - if (!can_migrate_task(tmp, busiest, this_cpu, sd, idle)) { - if (curr != head) - goto skip_queue; - idx++; - goto skip_bitmap; - } - pull_task(busiest, array, tmp, this_rq, dst_array, this_cpu); - pulled++; - - /* We only want to steal up to the prescribed number of tasks. */ - if (pulled < max_nr_move) { - if (curr != head) - goto skip_queue; - idx++; - goto skip_bitmap; - } -out: - return pulled; -} - -/* - * find_busiest_group finds and returns the busiest CPU group within the - * domain. It calculates and returns the number of tasks which should be - * moved to restore balance via the imbalance parameter. 
- */ -static struct sched_group * -find_busiest_group(struct sched_domain *sd, int this_cpu, - unsigned long *imbalance, enum idle_type idle) -{ - struct sched_group *busiest = NULL, *this = NULL, *group = sd->groups; - unsigned long max_load, avg_load, total_load, this_load, total_pwr; - - max_load = this_load = total_load = total_pwr = 0; - - do { - cpumask_t tmp; - unsigned long load; - int local_group; - int i, nr_cpus = 0; - - local_group = cpu_isset(this_cpu, group->cpumask); - - /* Tally up the load of all CPUs in the group */ - avg_load = 0; - cpus_and(tmp, group->cpumask, cpu_online_map); - if (unlikely(cpus_empty(tmp))) - goto nextgroup; - - for_each_cpu_mask(i, tmp) { - /* Bias balancing toward cpus of our domain */ - if (local_group) - load = target_load(i); - else - load = source_load(i); - - nr_cpus++; - avg_load += load; - } - - if (!nr_cpus) - goto nextgroup; - - total_load += avg_load; - total_pwr += group->cpu_power; - - /* Adjust by relative CPU power of the group */ - avg_load = (avg_load * SCHED_LOAD_SCALE) / group->cpu_power; - - if (local_group) { - this_load = avg_load; - this = group; - goto nextgroup; - } else if (avg_load > max_load) { - max_load = avg_load; - busiest = group; - } -nextgroup: - group = group->next; - } while (group != sd->groups); - - if (!busiest || this_load >= max_load) - goto out_balanced; - - avg_load = (SCHED_LOAD_SCALE * total_load) / total_pwr; - - if (this_load >= avg_load || - 100*max_load <= sd->imbalance_pct*this_load) - goto out_balanced; - - /* - * We're trying to get all the cpus to the average_load, so we don't - * want to push ourselves above the average load, nor do we wish to - * reduce the max loaded cpu below the average load, as either of these - * actions would just result in more rebalancing later, and ping-pong - * tasks around. Thus we look for the minimum possible imbalance. - * Negative imbalances (*we* are more loaded than anyone else) will - * be counted as no imbalance for these purposes -- we can't fix that - * by pulling tasks to us. Be careful of negative numbers as they'll - * appear as very large values with unsigned longs. - */ - *imbalance = min(max_load - avg_load, avg_load - this_load); - - /* How much load to actually move to equalise the imbalance */ - *imbalance = (*imbalance * min(busiest->cpu_power, this->cpu_power)) - / SCHED_LOAD_SCALE; - - if (*imbalance < SCHED_LOAD_SCALE - 1) { - unsigned long pwr_now = 0, pwr_move = 0; - unsigned long tmp; - - if (max_load - this_load >= SCHED_LOAD_SCALE*2) { - *imbalance = 1; - return busiest; - } - - /* - * OK, we don't have enough imbalance to justify moving tasks, - * however we may be able to increase total CPU power used by - * moving them. 
- */ - - pwr_now += busiest->cpu_power*min(SCHED_LOAD_SCALE, max_load); - pwr_now += this->cpu_power*min(SCHED_LOAD_SCALE, this_load); - pwr_now /= SCHED_LOAD_SCALE; - - /* Amount of load we'd subtract */ - tmp = SCHED_LOAD_SCALE*SCHED_LOAD_SCALE/busiest->cpu_power; - if (max_load > tmp) - pwr_move += busiest->cpu_power*min(SCHED_LOAD_SCALE, - max_load - tmp); - - /* Amount of load we'd add */ - tmp = SCHED_LOAD_SCALE*SCHED_LOAD_SCALE/this->cpu_power; - if (max_load < tmp) - tmp = max_load; - pwr_move += this->cpu_power*min(SCHED_LOAD_SCALE, this_load + tmp); - pwr_move /= SCHED_LOAD_SCALE; - - /* Move if we gain another 8th of a CPU worth of throughput */ - if (pwr_move < pwr_now + SCHED_LOAD_SCALE / 8) - goto out_balanced; - - *imbalance = 1; - return busiest; - } - - /* Get rid of the scaling factor, rounding down as we divide */ - *imbalance = (*imbalance + 1) / SCHED_LOAD_SCALE; - - return busiest; + return busiest; out_balanced: if (busiest && (idle == NEWLY_IDLE || @@ -2249,6 +2025,17 @@ static runqueue_t *find_busiest_queue(struct sched_group *group) * * Called with this_rq unlocked. */ + +static inline int ckrm_load_balance(int this_cpu, runqueue_t *this_rq, + struct sched_domain *sd, + enum idle_type idle) +#ifndef CONFIG_CKRM_CPU_SCHEDULE +{ + return -1; +} +#endif +; + static int load_balance(int this_cpu, runqueue_t *this_rq, struct sched_domain *sd, enum idle_type idle) { @@ -2259,6 +2046,9 @@ static int load_balance(int this_cpu, runqueue_t *this_rq, spin_lock(&this_rq->lock); + if ((nr_moved = ckrm_load_balance(this_cpu,this_rq,sd,idle)) != -1) + goto out_balanced; + group = find_busiest_group(sd, this_cpu, &imbalance, idle); if (!group) goto out_balanced; @@ -2344,8 +2134,12 @@ static int load_balance_newidle(int this_cpu, runqueue_t *this_rq, struct sched_group *group; runqueue_t *busiest = NULL; unsigned long imbalance; - int nr_moved = 0; + int nr_moved; + + if ((nr_moved = ckrm_load_balance(this_cpu,this_rq,sd,NEWLY_IDLE)) != -1) + goto out; + nr_moved = 0; group = find_busiest_group(sd, this_cpu, &imbalance, NEWLY_IDLE); if (!group) goto out; @@ -2365,8 +2159,6 @@ static int load_balance_newidle(int this_cpu, runqueue_t *this_rq, out: return nr_moved; } -#endif /* CONFIG_CKRM_CPU_SCHEDULE*/ - /* * idle_balance is called by schedule() if this_cpu is about to become @@ -2472,6 +2264,8 @@ static void rebalance_tick(int this_cpu, runqueue_t *this_rq, unsigned long j = jiffies + CPU_OFFSET(this_cpu); struct sched_domain *sd; + ckrm_sched_tick(j,this_cpu,(idle != NOT_IDLE),this_rq); + /* Update our load */ old_load = this_rq->cpu_load; this_load = this_rq->nr_running * SCHED_LOAD_SCALE; @@ -2510,7 +2304,9 @@ static void rebalance_tick(int this_cpu, runqueue_t *this_rq, */ static inline void rebalance_tick(int cpu, runqueue_t *rq, enum idle_type idle) { + ckrm_sched_tick(jiffies,cpu,(idle != NOT_IDLE),rq); } + static inline void idle_balance(int cpu, runqueue_t *rq) { } @@ -2547,15 +2343,19 @@ EXPORT_PER_CPU_SYMBOL(kstat); #ifndef CONFIG_CKRM_CPU_SCHEDULE #define EXPIRED_STARVING(rq) \ - ((STARVATION_LIMIT && ((rq)->expired_timestamp && \ + ((STARVATION_LIMIT && ((rq)->expired_timestamp && \ (jiffies - (rq)->expired_timestamp >= \ STARVATION_LIMIT * ((rq)->nr_running) + 1))) || \ ((rq)->curr->static_prio > (rq)->best_expired_prio)) #else +/* we need to scale the starvation based on weight + * classes with small weight have longer expiration starvation + */ #define EXPIRED_STARVING(rq) \ - (STARVATION_LIMIT && ((rq)->expired_timestamp && \ + ((STARVATION_LIMIT && 
((rq)->expired_timestamp && \ (jiffies - (rq)->expired_timestamp >= \ - STARVATION_LIMIT * (lrq_nr_running(rq)) + 1))) + (((STARVATION_LIMIT * (lrq_nr_running(rq)) + 1)*CKRM_MAX_WEIGHT)/rq->local_weight)))) || \ + (this_rq()->curr->static_prio > (rq)->best_expired_prio)) #endif /* @@ -2598,7 +2398,6 @@ void scheduler_tick(int user_ticks, int sys_ticks) cpustat->idle += sys_ticks; if (wake_priority_sleeper(rq)) goto out; - ckrm_sched_tick(jiffies,cpu,rq_ckrm_load(rq)); rebalance_tick(cpu, rq, IDLE); return; } @@ -2639,8 +2438,11 @@ void scheduler_tick(int user_ticks, int sys_ticks) } if (vx_need_resched(p)) { #ifdef CONFIG_CKRM_CPU_SCHEDULE - /* Hubertus ... we can abstract this out */ - ckrm_lrq_t* rq = get_task_lrq(p); + /* we redefine RQ to be a local runqueue */ + ckrm_lrq_t* rq; + runqueue_t *cpu_rq = this_rq(); + rq = ckrm_rq_cpu_enabled(cpu_rq) ? get_task_lrq(p) + : &(cpu_rq->dflt_lrq); #endif dequeue_task(p, rq->active); set_tsk_need_resched(p); @@ -2652,8 +2454,8 @@ void scheduler_tick(int user_ticks, int sys_ticks) rq->expired_timestamp = jiffies; if (!TASK_INTERACTIVE(p) || EXPIRED_STARVING(rq)) { enqueue_task(p, rq->expired); - if (p->static_prio < this_rq()->best_expired_prio) - this_rq()->best_expired_prio = p->static_prio; + if (p->static_prio < rq->best_expired_prio) + rq->best_expired_prio = p->static_prio; } else enqueue_task(p, rq->active); } else { @@ -2687,7 +2489,6 @@ void scheduler_tick(int user_ticks, int sys_ticks) out_unlock: spin_unlock(&rq->lock); out: - ckrm_sched_tick(jiffies,cpu,rq_ckrm_load(rq)); rebalance_tick(cpu, rq, NOT_IDLE); } @@ -2788,21 +2589,17 @@ asmlinkage void __sched schedule(void) unsigned long long now; unsigned long run_time; int cpu; -#ifdef CONFIG_VSERVER_HARDCPU - struct vx_info *vxi; - int maxidle = -HZ; -#endif - /* + + /* * If crash dump is in progress, this other cpu's * need to wait until it completes. * NB: this code is optimized away for kernels without * dumping enabled. */ - if (unlikely(dump_oncpu)) - goto dump_scheduling_disabled; + if (unlikely(dump_oncpu)) + goto dump_scheduling_disabled; - //WARN_ON(system_state == SYSTEM_BOOTING); /* * Test if we are atomic. Since do_exit() needs to call into * schedule() atomically, we ignore that path for now. @@ -2837,19 +2634,8 @@ need_resched: spin_lock_irq(&rq->lock); -#ifdef CONFIG_CKRM_CPU_SCHEDULE - if (prev != rq->idle) { - unsigned long long run = now - prev->timestamp; - ckrm_lrq_t * lrq = get_task_lrq(prev); - - lrq->lrq_load -= task_load(prev); - cpu_demand_event(&prev->demand_stat,CPU_DEMAND_DESCHEDULE,run); - lrq->lrq_load += task_load(prev); + ckrm_account_task(rq,prev,now); - cpu_demand_event(get_task_lrq_stat(prev),CPU_DEMAND_DESCHEDULE,run); - update_local_cvt(prev, run); - } -#endif /* * if entering off of a kernel preemption go straight * to picking the next task. 
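
The reworked EXPIRED_STARVING macro above stretches the starvation window of a local runqueue in inverse proportion to its class weight, so lightly weighted classes are not forced to rotate their expired array as quickly as the stock scheduler would. A small stand-alone sketch of that scaling follows; STARVATION_LIMIT and CKRM_MAX_WEIGHT are given example values here, not the kernel's.

/*
 * Sketch of the weighted EXPIRED_STARVING test: a class with a small
 * local_weight waits proportionally longer before its expired array is
 * declared starving.  The constants below are illustrative only.
 */
#include <stdio.h>

#define STARVATION_LIMIT 100UL   /* example, in jiffies         */
#define CKRM_MAX_WEIGHT  1000UL  /* example full-machine weight */

static unsigned long starvation_threshold(unsigned long nr_running,
                                          unsigned long local_weight)
{
        return ((STARVATION_LIMIT * nr_running + 1) * CKRM_MAX_WEIGHT)
                        / local_weight;
}

int main(void)
{
        unsigned long weights[] = { 1000, 500, 100 };
        int i;

        for (i = 0; i < 3; i++)
                printf("weight %4lu -> expired starving after %lu jiffies"
                       " (4 runnable tasks)\n",
                       weights[i], starvation_threshold(4, weights[i]));
        return 0;
}

With these numbers a class holding one tenth of the maximum weight waits ten times longer before its expired tasks are treated as starving.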
@@ -2865,8 +2651,9 @@ need_resched: } cpu = smp_processor_id(); + #ifdef CONFIG_VSERVER_HARDCPU - if (!list_empty(&rq->hold_queue)) { + if (!list_empty(&rq->hold_queue)) { struct list_head *l, *n; int ret; @@ -2875,7 +2662,7 @@ need_resched: next = list_entry(l, task_t, run_list); if (vxi == next->vx_info) continue; - + vxi = next->vx_info; ret = vx_tokens_recalc(vxi); // tokens = vx_tokens_avail(next); @@ -2885,51 +2672,43 @@ need_resched: next->state &= ~TASK_ONHOLD; recalc_task_prio(next, now); __activate_task(next, rq); - // printk("··· unhold %p\n", next); + // printk("×·· unhold %p\n", next); break; } if ((ret < 0) && (maxidle < ret)) maxidle = ret; - } + } } - rq->idle_tokens = -maxidle; - -pick_next: -#endif - if (unlikely(!rq->nr_running)) { - idle_balance(cpu, rq); - if (!rq->nr_running) { - next = rq->idle; -#ifdef CONFIG_CKRM_CPU_SCHEDULE - rq->expired_timestamp = 0; + rq->idle_tokens = -maxidle; + + pick_next: #endif - wake_sleeping_dependent(cpu, rq); - goto switch_tasks; - } + next = rq_get_next_task(rq,cpu); + if (unlikely(next == NULL)) { + next = rq->idle; + goto switch_tasks; } - next = rq_get_next_task(rq); - if (dependent_sleeper(cpu, rq, next)) { next = rq->idle; goto switch_tasks; } #ifdef CONFIG_VSERVER_HARDCPU - vxi = next->vx_info; - if (vxi && __vx_flags(vxi->vx_flags, - VXF_SCHED_PAUSE|VXF_SCHED_HARD, 0)) { - int ret = vx_tokens_recalc(vxi); - - if (unlikely(ret <= 0)) { - if (ret && (rq->idle_tokens > -ret)) - rq->idle_tokens = -ret; - deactivate_task(next, rq); - list_add_tail(&next->run_list, &rq->hold_queue); - next->state |= TASK_ONHOLD; - goto pick_next; - } - } + vxi = next->vx_info; + if (vxi && __vx_flags(vxi->vx_flags, + VXF_SCHED_PAUSE|VXF_SCHED_HARD, 0)) { + int ret = vx_tokens_recalc(vxi); + + if (unlikely(ret <= 0)) { + if (ret && (rq->idle_tokens > -ret)) + rq->idle_tokens = -ret; + deactivate_task(next, rq); + list_add_tail(&next->run_list, &rq->hold_queue); + next->state |= TASK_ONHOLD; + goto pick_next; + } + } #endif if (!rt_task(next) && next->activated > 0) { @@ -2980,15 +2759,16 @@ switch_tasks: if (test_thread_flag(TIF_NEED_RESCHED)) goto need_resched; - return; - + + return; + dump_scheduling_disabled: - /* allow scheduling only if this is the dumping cpu */ - if (dump_oncpu != smp_processor_id()+1) { - while (dump_oncpu) - cpu_relax(); - } - return; + /* allow scheduling only if this is the dumping cpu */ + if (dump_oncpu != smp_processor_id()+1) { + while (dump_oncpu) + cpu_relax(); + } + return; } EXPORT_SYMBOL(schedule); @@ -3175,11 +2955,11 @@ EXPORT_SYMBOL(wait_for_completion); spin_unlock_irqrestore(&q->lock, flags); #define SLEEP_ON_BKLCHECK \ - if (unlikely(!kernel_locked()) && \ - sleep_on_bkl_warnings < 10) { \ - sleep_on_bkl_warnings++; \ - WARN_ON(1); \ - } + if (unlikely(!kernel_locked()) && \ + sleep_on_bkl_warnings < 10) { \ + sleep_on_bkl_warnings++; \ + WARN_ON(1); \ + } static int sleep_on_bkl_warnings; @@ -3202,7 +2982,7 @@ long fastcall __sched interruptible_sleep_on_timeout(wait_queue_head_t *q, long { SLEEP_ON_VAR - SLEEP_ON_BKLCHECK + SLEEP_ON_BKLCHECK current->state = TASK_INTERRUPTIBLE; @@ -3215,11 +2995,26 @@ long fastcall __sched interruptible_sleep_on_timeout(wait_queue_head_t *q, long EXPORT_SYMBOL(interruptible_sleep_on_timeout); +void fastcall __sched sleep_on(wait_queue_head_t *q) +{ + SLEEP_ON_VAR + + SLEEP_ON_BKLCHECK + + current->state = TASK_UNINTERRUPTIBLE; + + SLEEP_ON_HEAD + schedule(); + SLEEP_ON_TAIL +} + +EXPORT_SYMBOL(sleep_on); + long fastcall __sched sleep_on_timeout(wait_queue_head_t *q, long 
timeout) { SLEEP_ON_VAR - SLEEP_ON_BKLCHECK + SLEEP_ON_BKLCHECK current->state = TASK_UNINTERRUPTIBLE; @@ -3346,7 +3141,6 @@ int task_nice(const task_t *p) { return TASK_NICE(p); } - EXPORT_SYMBOL(task_nice); /** @@ -3969,8 +3763,6 @@ void show_state(void) read_unlock(&tasklist_lock); } -EXPORT_SYMBOL_GPL(show_state); - void __devinit init_idle(task_t *idle, int cpu) { runqueue_t *idle_rq = cpu_rq(cpu), *rq = cpu_rq(task_cpu(idle)); @@ -4657,13 +4449,12 @@ void __init sched_init(void) rq->active = rq->arrays; rq->expired = rq->arrays + 1; + rq->best_expired_prio = MAX_PRIO; #else rq = cpu_rq(i); spin_lock_init(&rq->lock); #endif - rq->best_expired_prio = MAX_PRIO; - #ifdef CONFIG_SMP rq->sd = &sched_domain_init; rq->cpu_load = 0; @@ -4676,7 +4467,7 @@ void __init sched_init(void) INIT_LIST_HEAD(&rq->migration_queue); #endif #ifdef CONFIG_VSERVER_HARDCPU - INIT_LIST_HEAD(&rq->hold_queue); + INIT_LIST_HEAD(&rq->hold_queue); #endif atomic_set(&rq->nr_iowait, 0); } @@ -4712,15 +4503,15 @@ void __might_sleep(char *file, int line, int atomic_depth) #ifndef CONFIG_PREEMPT atomic_depth = 0; #endif - if (((in_atomic() != atomic_depth) || irqs_disabled()) && + if ((in_atomic() || irqs_disabled()) && system_state == SYSTEM_RUNNING) { if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy) return; prev_jiffy = jiffies; printk(KERN_ERR "Debug: sleeping function called from invalid" " context at %s:%d\n", file, line); - printk("in_atomic():%d[expected: %d], irqs_disabled():%d\n", - in_atomic(), atomic_depth, irqs_disabled()); + printk("in_atomic():%d, irqs_disabled():%d\n", + in_atomic(), irqs_disabled()); dump_stack(); } #endif @@ -4783,6 +4574,20 @@ EXPORT_SYMBOL(task_running_sys); #endif #ifdef CONFIG_CKRM_CPU_SCHEDULE + +/******************************************************************** + * + * CKRM Scheduler additions + * + * (a) helper functions + * (b) load balancing code + * + * These are required here to avoid having to externalize many + * of the definitions in sched.c + * + * + ********************************************************************/ + /** * return the classqueue object of a certain processor */ @@ -4811,4 +4616,559 @@ void _ckrm_cpu_change_class(task_t *tsk, struct ckrm_cpu_class *newcls) task_rq_unlock(rq,&flags); } + +/** + * get_min_cvt_locking - get the mininum cvt on a particular cpu under rqlock + */ + +CVT_t get_min_cvt(int cpu); + +CVT_t get_min_cvt_locking(int cpu) +{ + CVT_t cvt; + struct runqueue *rq = cpu_rq(cpu); + spin_lock(&rq->lock); + cvt = get_min_cvt(cpu); + spin_unlock(&rq->lock); + return cvt; +} + +ckrm_lrq_t *rq_get_dflt_lrq(int cpu) +{ + return &(cpu_rq(cpu)->dflt_lrq); +} + +#ifdef CONFIG_SMP + +/************** CKRM Load Balancing code ************************/ + +static inline int ckrm_preferred_task(task_t *tmp,long min, long max, + int phase, enum idle_type idle) +{ + long pressure = task_load(tmp); + + if (pressure > max) + return 0; + + if ((idle == NOT_IDLE) && ! 
phase && (pressure <= min)) + return 0; + return 1; +} + +/* + * move tasks for a specic local class + * return number of tasks pulled + */ +static inline int ckrm_cls_move_tasks(ckrm_lrq_t* src_lrq,ckrm_lrq_t*dst_lrq, + runqueue_t *this_rq, + runqueue_t *busiest, + struct sched_domain *sd, + int this_cpu, + enum idle_type idle, + long* pressure_imbalance) +{ + prio_array_t *array, *dst_array; + struct list_head *head, *curr; + task_t *tmp; + int idx; + int pulled = 0; + int phase = -1; + long pressure_min, pressure_max; + /*hzheng: magic : 90% balance is enough*/ + long balance_min = *pressure_imbalance / 10; +/* + * we don't want to migrate tasks that will reverse the balance + * or the tasks that make too small difference + */ +#define CKRM_BALANCE_MAX_RATIO 100 +#define CKRM_BALANCE_MIN_RATIO 1 + start: + phase ++; + /* + * We first consider expired tasks. Those will likely not be + * executed in the near future, and they are most likely to + * be cache-cold, thus switching CPUs has the least effect + * on them. + */ + if (src_lrq->expired->nr_active) { + array = src_lrq->expired; + dst_array = dst_lrq->expired; + } else { + array = src_lrq->active; + dst_array = dst_lrq->active; + } + + new_array: + /* Start searching at priority 0: */ + idx = 0; + skip_bitmap: + if (!idx) + idx = sched_find_first_bit(array->bitmap); + else + idx = find_next_bit(array->bitmap, MAX_PRIO, idx); + if (idx >= MAX_PRIO) { + if (array == src_lrq->expired && src_lrq->active->nr_active) { + array = src_lrq->active; + dst_array = dst_lrq->active; + goto new_array; + } + if ((! phase) && (! pulled) && (idle != IDLE)) + goto start; //try again + else + goto out; //finished search for this lrq + } + + head = array->queue + idx; + curr = head->prev; + skip_queue: + tmp = list_entry(curr, task_t, run_list); + + curr = curr->prev; + + if (!can_migrate_task(tmp, busiest, this_cpu, sd, idle)) { + if (curr != head) + goto skip_queue; + idx++; + goto skip_bitmap; + } + + pressure_min = *pressure_imbalance * CKRM_BALANCE_MIN_RATIO/100; + pressure_max = *pressure_imbalance * CKRM_BALANCE_MAX_RATIO/100; + /* + * skip the tasks that will reverse the balance too much + */ + if (ckrm_preferred_task(tmp,pressure_min,pressure_max,phase,idle)) { + *pressure_imbalance -= task_load(tmp); + pull_task(busiest, array, tmp, + this_rq, dst_array, this_cpu); + pulled++; + + if (*pressure_imbalance <= balance_min) + goto out; + } + + if (curr != head) + goto skip_queue; + idx++; + goto skip_bitmap; + out: + return pulled; +} + +static inline long ckrm_rq_imbalance(runqueue_t *this_rq,runqueue_t *dst_rq) +{ + long imbalance; + /* + * make sure after balance, imbalance' > - imbalance/2 + * we don't want the imbalance be reversed too much + */ + imbalance = ckrm_get_pressure(rq_ckrm_load(dst_rq),0) + - ckrm_get_pressure(rq_ckrm_load(this_rq),1); + imbalance /= 2; + return imbalance; +} + +/* + * try to balance the two runqueues + * + * Called with both runqueues locked. 
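
ckrm_cls_move_tasks() above pulls only tasks whose load falls inside a window derived from the pressure still to be transferred, and it stops once roughly 90% of the imbalance has been moved (balance_min is one tenth of the original). The following stand-alone sketch mimics that filter with invented task loads; the ratios mirror CKRM_BALANCE_MIN_RATIO/CKRM_BALANCE_MAX_RATIO, everything else is illustrative.

/*
 * Sketch of the ckrm_preferred_task() pressure window.  Loads and the
 * starting imbalance are made-up example values.
 */
#include <stdio.h>

#define CKRM_BALANCE_MAX_RATIO 100
#define CKRM_BALANCE_MIN_RATIO 1

static int preferred_task(long pressure, long min, long max, int phase,
                          int not_idle)
{
        if (pressure > max)
                return 0;       /* would overshoot / reverse the balance */
        if (not_idle && !phase && pressure <= min)
                return 0;       /* too small to matter in the first pass */
        return 1;
}

int main(void)
{
        long imbalance = 1000, balance_min = 1000 / 10; /* 90% is enough */
        long task_loads[] = { 5, 300, 1500, 400, 250 };
        int i, phase = 0;

        for (i = 0; i < 5 && imbalance > balance_min; i++) {
                /* window recomputed from the remaining imbalance */
                long min = imbalance * CKRM_BALANCE_MIN_RATIO / 100;
                long max = imbalance * CKRM_BALANCE_MAX_RATIO / 100;

                if (!preferred_task(task_loads[i], min, max, phase, 1))
                        continue;
                imbalance -= task_loads[i];
                printf("pulled task with load %ld, imbalance now %ld\n",
                       task_loads[i], imbalance);
        }
        return 0;
}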
+ * if move_tasks is called, it will try to move at least one task over + */ +static int ckrm_move_tasks(runqueue_t *this_rq, int this_cpu, + runqueue_t *busiest, + unsigned long max_nr_move, struct sched_domain *sd, + enum idle_type idle) +{ + struct ckrm_cpu_class *clsptr,*vip_cls = NULL; + ckrm_lrq_t* src_lrq,*dst_lrq; + long pressure_imbalance, pressure_imbalance_old; + int src_cpu = task_cpu(busiest->curr); + struct list_head *list; + int pulled = 0; + long imbalance; + + imbalance = ckrm_rq_imbalance(this_rq,busiest); + + if ((idle == NOT_IDLE && imbalance <= 0) || busiest->nr_running <= 1) + goto out; + + //try to find the vip class + list_for_each_entry(clsptr,&active_cpu_classes,links) { + src_lrq = get_ckrm_lrq(clsptr,src_cpu); + + if (! lrq_nr_running(src_lrq)) + continue; + + if (! vip_cls || cpu_class_weight(vip_cls) < cpu_class_weight(clsptr) ) + { + vip_cls = clsptr; + } + } + + /* + * do search from the most significant class + * hopefully, less tasks will be migrated this way + */ + clsptr = vip_cls; + + move_class: + if (! clsptr) + goto out; + + + src_lrq = get_ckrm_lrq(clsptr,src_cpu); + if (! lrq_nr_running(src_lrq)) + goto other_class; + + dst_lrq = get_ckrm_lrq(clsptr,this_cpu); + + //how much pressure for this class should be transferred + pressure_imbalance = (src_lrq->lrq_load * imbalance)/WEIGHT_TO_SHARE(src_lrq->local_weight); + if (pulled && ! pressure_imbalance) + goto other_class; + + pressure_imbalance_old = pressure_imbalance; + + //move tasks + pulled += + ckrm_cls_move_tasks(src_lrq,dst_lrq, + this_rq, + busiest, + sd,this_cpu,idle, + &pressure_imbalance); + + /* + * hzheng: 2 is another magic number + * stop balancing if the imbalance is less than 25% of the orig + */ + if (pressure_imbalance <= (pressure_imbalance_old >> 2)) + goto out; + + //update imbalance + imbalance *= pressure_imbalance / pressure_imbalance_old; + other_class: + //who is next? + list = clsptr->links.next; + if (list == &active_cpu_classes) + list = list->next; + clsptr = list_entry(list, typeof(*clsptr), links); + if (clsptr != vip_cls) + goto move_class; + out: + return pulled; +} + +/** + * ckrm_check_balance - is load balancing necessary? 
+ * return 0 if load balancing is not necessary + * otherwise return the average load of the system + * also, update nr_group + * + * heuristics: + * no load balancing if it's load is over average + * no load balancing if it's load is far more than the min + * task: + * read the status of all the runqueues + */ +static unsigned long ckrm_check_balance(struct sched_domain *sd, int this_cpu, + enum idle_type idle, int* nr_group) +{ + struct sched_group *group = sd->groups; + unsigned long min_load, max_load, avg_load; + unsigned long total_load, this_load, total_pwr; + + max_load = this_load = total_load = total_pwr = 0; + min_load = 0xFFFFFFFF; + *nr_group = 0; + + do { + cpumask_t tmp; + unsigned long load; + int local_group; + int i, nr_cpus = 0; + + /* Tally up the load of all CPUs in the group */ + cpus_and(tmp, group->cpumask, cpu_online_map); + if (unlikely(cpus_empty(tmp))) + goto nextgroup; + + avg_load = 0; + local_group = cpu_isset(this_cpu, group->cpumask); + + for_each_cpu_mask(i, tmp) { + load = ckrm_get_pressure(rq_ckrm_load(cpu_rq(i)),local_group); + nr_cpus++; + avg_load += load; + } + + if (!nr_cpus) + goto nextgroup; + + total_load += avg_load; + total_pwr += group->cpu_power; + + /* Adjust by relative CPU power of the group */ + avg_load = (avg_load * SCHED_LOAD_SCALE) / group->cpu_power; + + if (local_group) { + this_load = avg_load; + goto nextgroup; + } else if (avg_load > max_load) { + max_load = avg_load; + } + if (avg_load < min_load) { + min_load = avg_load; + } +nextgroup: + group = group->next; + *nr_group = *nr_group + 1; + } while (group != sd->groups); + + if (!max_load || this_load >= max_load) + goto out_balanced; + + avg_load = (SCHED_LOAD_SCALE * total_load) / total_pwr; + + /* hzheng: debugging: 105 is a magic number + * 100*max_load <= sd->imbalance_pct*this_load) + * should use imbalance_pct instead + */ + if (this_load > avg_load + || 100*max_load < 105*this_load + || 100*min_load < 70*this_load + ) + goto out_balanced; + + return avg_load; + out_balanced: + return 0; +} + +/** + * any group that has above average load is considered busy + * find the busiest queue from any of busy group + */ +static runqueue_t * +ckrm_find_busy_queue(struct sched_domain *sd, int this_cpu, + unsigned long avg_load, enum idle_type idle, + int nr_group) +{ + struct sched_group *group; + runqueue_t * busiest=NULL; + unsigned long rand; + + group = sd->groups; + rand = get_ckrm_rand(nr_group); + nr_group = 0; + + do { + unsigned long load,total_load,max_load; + cpumask_t tmp; + int i; + runqueue_t * grp_busiest; + + cpus_and(tmp, group->cpumask, cpu_online_map); + if (unlikely(cpus_empty(tmp))) + goto find_nextgroup; + + total_load = 0; + max_load = 0; + grp_busiest = NULL; + for_each_cpu_mask(i, tmp) { + load = ckrm_get_pressure(rq_ckrm_load(cpu_rq(i)),0); + total_load += load; + if (load > max_load) { + max_load = load; + grp_busiest = cpu_rq(i); + } + } + + total_load = (total_load * SCHED_LOAD_SCALE) / group->cpu_power; + if (total_load > avg_load) { + busiest = grp_busiest; + if (nr_group >= rand) + break; + } + find_nextgroup: + group = group->next; + nr_group ++; + } while (group != sd->groups); + + return busiest; +} + +/** + * load_balance - pressure based load balancing algorithm used by ckrm + */ +static int ckrm_load_balance_locked(int this_cpu, runqueue_t *this_rq, + struct sched_domain *sd, + enum idle_type idle) +{ + runqueue_t *busiest; + unsigned long avg_load; + int nr_moved,nr_group; + + avg_load = ckrm_check_balance(sd, this_cpu, idle, &nr_group); + if 
(! avg_load) + goto out_balanced; + + busiest = ckrm_find_busy_queue(sd,this_cpu,avg_load,idle,nr_group); + if (! busiest) + goto out_balanced; + /* + * This should be "impossible", but since load + * balancing is inherently racy and statistical, + * it could happen in theory. + */ + if (unlikely(busiest == this_rq)) { + WARN_ON(1); + goto out_balanced; + } + + nr_moved = 0; + if (busiest->nr_running > 1) { + /* + * Attempt to move tasks. If find_busiest_group has found + * an imbalance but busiest->nr_running <= 1, the group is + * still unbalanced. nr_moved simply stays zero, so it is + * correctly treated as an imbalance. + */ + double_lock_balance(this_rq, busiest); + nr_moved = ckrm_move_tasks(this_rq, this_cpu, busiest, + 0,sd, idle); + spin_unlock(&busiest->lock); + if (nr_moved) { + adjust_local_weight(); + } + } + + if (!nr_moved) + sd->nr_balance_failed ++; + else + sd->nr_balance_failed = 0; + + /* We were unbalanced, so reset the balancing interval */ + sd->balance_interval = sd->min_interval; + + return nr_moved; + +out_balanced: + /* tune up the balancing interval */ + if (sd->balance_interval < sd->max_interval) + sd->balance_interval *= 2; + + return 0; +} + +static inline int ckrm_load_balance(int this_cpu, runqueue_t *this_rq, + struct sched_domain *sd, + enum idle_type idle) +{ + int ret; + + if (ckrm_rq_cpu_disabled(this_rq)) + return -1; + //spin_lock(&this_rq->lock); + read_lock(&class_list_lock); + ret = ckrm_load_balance_locked(this_cpu,this_rq,sd,idle); + // ret = ckrm_load_balance_locked(this_cpu,this_rq,sd,NEWLY_IDLE); + read_unlock(&class_list_lock); + //spin_unlock(&this_rq->lock); + return ret; +} + +#endif // CONFIG_SMP + + +void ckrm_cpu_class_queue_update(int on) +{ + /* This is called when the mode changes from disabled + * to enabled (on=1) or vice versa (on=0). + * we make sure that all classqueues on all cpus + * either have the default class enqueued (on=1) or + * all classes dequeued (on=0). + * if not done a race condition will persist + * when flipping the ckrm_sched_mode. + * Otherwise will lead to more complicated code + * in rq_get_next_task, where we despite knowing of + * runnable tasks can not find an enqueued class. 
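
ckrm_load_balance() above acts as a gate: when the class scheduler is disabled for the runqueue it returns -1, and the callers in load_balance()/load_balance_newidle() fall back to the stock find_busiest_group() path. A compressed sketch of that dispatch pattern follows, using simplified stand-in functions rather than the kernel's types.

/*
 * Sketch of the -1 sentinel dispatch between the CKRM balancer and the
 * default O(1) balancer.  Names and types are simplified stand-ins.
 */
#include <stdio.h>

static int ckrm_scheduler_enabled = 0;  /* e.g. the "ckrmcpu" boot flag */

static int ckrm_load_balance_sketch(void)
{
        if (!ckrm_scheduler_enabled)
                return -1;              /* fall through to the default  */
        /* ... pressure based balancing would run here ... */
        return 0;                       /* tasks moved (possibly none)  */
}

static int default_load_balance_sketch(void)
{
        /* ... find_busiest_group()/move_tasks() equivalent ... */
        return 1;
}

int main(void)
{
        int nr_moved = ckrm_load_balance_sketch();

        if (nr_moved == -1)             /* CKRM declined: use stock path */
                nr_moved = default_load_balance_sketch();
        printf("nr_moved = %d\n", nr_moved);
        return 0;
}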
+ */ + + int i; + runqueue_t *rq; + ckrm_lrq_t *lrq; + struct ckrm_cpu_class *clsptr; + + if (on) { + BUG_ON(ckrm_cpu_enabled()); + for_each_cpu(i) { + rq = cpu_rq(i); + BUG_ON(ckrm_rq_cpu_enabled(rq)); + lrq = &rq->dflt_lrq; + spin_lock(&rq->lock); + + BUG_ON(cls_in_classqueue(&lrq->classqueue_linkobj)); + + classqueue_init(&rq->classqueue,1); + lrq->top_priority = find_first_bit(lrq->active->bitmap, + MAX_PRIO), + classqueue_enqueue(lrq->classqueue, + &lrq->classqueue_linkobj, 0); + spin_unlock(&rq->lock); +#if 0 + printk("UPDATE(%d) run=%lu:%d:%d %d:%d->%d\n", i, + rq->nr_running,lrq->active->nr_active, + lrq->expired->nr_active, + find_first_bit(lrq->active->bitmap,MAX_PRIO), + find_first_bit(lrq->expired->bitmap,MAX_PRIO), + lrq->top_priority); #endif + } + } else { + for_each_cpu(i) { + rq = cpu_rq(i); + spin_lock(&rq->lock); + + /* walk through all classes and make sure they + * are not enqueued + */ + write_lock(&class_list_lock); + list_for_each_entry(clsptr,&active_cpu_classes,links) { + lrq = get_ckrm_lrq(clsptr,i); + BUG_ON((lrq != &rq->dflt_lrq) && lrq_nr_running(lrq)); // must be empty + if (cls_in_classqueue(&lrq->classqueue_linkobj)) + classqueue_dequeue(lrq->classqueue, + &lrq->classqueue_linkobj); + } + rq->classqueue.enabled = 0; + write_unlock(&class_list_lock); + spin_unlock(&rq->lock); + } + } +} + +/* + * callback when a class is getting deleted + * need to remove it from the class runqueue. see (class_queue_update) + */ + +void ckrm_cpu_class_queue_delete_sync(struct ckrm_cpu_class *clsptr) +{ + int i; + + for_each_cpu(i) { + runqueue_t *rq = cpu_rq(i); + ckrm_lrq_t *lrq = get_ckrm_lrq(clsptr,i); + + spin_lock(&rq->lock); + write_lock(&class_list_lock); + BUG_ON(lrq_nr_running(lrq)); // must be empty + if (cls_in_classqueue(&lrq->classqueue_linkobj)) + classqueue_dequeue(lrq->classqueue, + &lrq->classqueue_linkobj); + write_unlock(&class_list_lock); + spin_unlock(&rq->lock); + } +} + +#endif // CONFIG_CKRM_CPU_SCHEDULE diff --git a/kernel/sys.c b/kernel/sys.c index c69f6ed82..6e8b073bc 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -18,6 +18,8 @@ #include #include #include +#include +#include #include #include #include @@ -511,6 +513,25 @@ asmlinkage long sys_reboot(int magic1, int magic2, unsigned int cmd, void __user machine_restart(buffer); break; +#ifdef CONFIG_KEXEC + case LINUX_REBOOT_CMD_KEXEC: + { + struct kimage *image; + image = xchg(&kexec_image, 0); + if (!image) { + unlock_kernel(); + return -EINVAL; + } + notifier_call_chain(&reboot_notifier_list, SYS_RESTART, NULL); + system_state = SYSTEM_RESTART; + device_shutdown(); + system_state = SYSTEM_BOOTING; + printk(KERN_EMERG "Starting new kernel\n"); + machine_shutdown(); + machine_kexec(image); + break; + } +#endif #ifdef CONFIG_SOFTWARE_SUSPEND case LINUX_REBOOT_CMD_SW_SUSPEND: { diff --git a/lib/.cvsignore b/lib/.cvsignore new file mode 100644 index 000000000..30d38180f --- /dev/null +++ b/lib/.cvsignore @@ -0,0 +1,2 @@ +crc32table.h +gen_crc32table diff --git a/net/ipv4/netfilter/Kconfig b/net/ipv4/netfilter/Kconfig index b58141ead..c4bae8c2f 100644 --- a/net/ipv4/netfilter/Kconfig +++ b/net/ipv4/netfilter/Kconfig @@ -628,5 +628,50 @@ config IP_NF_MATCH_REALM If you want to compile it as a module, say M here and read Documentation/modules.txt. If unsure, say `N'. 
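
The LINUX_REBOOT_CMD_KEXEC case added to sys_reboot() further up shuts devices down and jumps into a previously loaded image via machine_kexec(). The fragment below only sketches how user space would trigger that path through reboot(2) once an image has been loaded; it assumes the patched <linux/reboot.h> defines LINUX_REBOOT_CMD_KEXEC and is not the kexec-tools implementation.

/*
 * Sketch: trigger the kexec reboot path from user space.  Assumes the
 * headers from this patch; illustrative only.
 */
#include <stdio.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <linux/reboot.h>

int main(void)
{
#ifdef LINUX_REBOOT_CMD_KEXEC
        /* magic1/magic2 are the usual reboot(2) guard values */
        if (syscall(SYS_reboot, LINUX_REBOOT_MAGIC1, LINUX_REBOOT_MAGIC2,
                    LINUX_REBOOT_CMD_KEXEC, NULL) < 0)
                perror("reboot(LINUX_REBOOT_CMD_KEXEC)");
#else
        fprintf(stderr, "headers lack LINUX_REBOOT_CMD_KEXEC; "
                        "kernel not patched for kexec?\n");
#endif
        return 0;
}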
+config IP_NF_CT_ACCT + bool "Connection tracking flow accounting" + depends on IP_NF_CONNTRACK + +config IP_NF_CT_PROTO_GRE + tristate ' GRE protocol support' + depends on IP_NF_CONNTRACK + help + This module adds generic support for connection tracking and NAT of the + GRE protocol (RFC1701, RFC2784). Please note that this will only work + with GRE connections using the key field of the GRE header. + + You will need GRE support to enable PPTP support. + + If you want to compile it as a module, say `M' here and read + Documentation/modules.txt. If unsire, say `N'. + +config IP_NF_PPTP + tristate 'PPTP protocol support' + depends on IP_NF_CT_PROTO_GRE + help + This module adds support for PPTP (Point to Point Tunnelling Protocol, + RFC2637) conncection tracking and NAT. + + If you are running PPTP sessions over a stateful firewall or NAT box, + you may want to enable this feature. + + Please note that not all PPTP modes of operation are supported yet. + For more info, read top of the file net/ipv4/netfilter/ip_conntrack_pptp.c + + If you want to compile it as a module, say M here and read + Documentation/modules.txt. If unsure, say `N'. + +config IP_NF_NAT_PPTP + tristate + depends on IP_NF_NAT!=n && IP_NF_PPTP!=n + default IP_NF_NAT if IP_NF_PPTP=y + default m if IP_NF_PPTP=m + +config IP_NF_NAT_PROTO_GRE + tristate + depends on IP_NF_NAT!=n && IP_NF_CT_PROTO_GRE!=n + default IP_NF_NAT if IP_NF_CT_PROTO_GRE=y + default m if IP_NF_CT_PROTO_GRE=m + endmenu diff --git a/net/ipv4/netfilter/Makefile b/net/ipv4/netfilter/Makefile index bdb23fde1..f54887b48 100644 --- a/net/ipv4/netfilter/Makefile +++ b/net/ipv4/netfilter/Makefile @@ -19,17 +19,25 @@ ipchains-objs := $(ip_nf_compat-objs) ipchains_core.o # connection tracking obj-$(CONFIG_IP_NF_CONNTRACK) += ip_conntrack.o +# connection tracking protocol helpers +obj-$(CONFIG_IP_NF_CT_PROTO_GRE) += ip_conntrack_proto_gre.o + +# NAT protocol helpers +obj-$(CONFIG_IP_NF_NAT_PROTO_GRE) += ip_nat_proto_gre.o + # connection tracking helpers obj-$(CONFIG_IP_NF_AMANDA) += ip_conntrack_amanda.o obj-$(CONFIG_IP_NF_TFTP) += ip_conntrack_tftp.o obj-$(CONFIG_IP_NF_FTP) += ip_conntrack_ftp.o obj-$(CONFIG_IP_NF_IRC) += ip_conntrack_irc.o +obj-$(CONFIG_IP_NF_PPTP) += ip_conntrack_pptp.o # NAT helpers obj-$(CONFIG_IP_NF_NAT_AMANDA) += ip_nat_amanda.o obj-$(CONFIG_IP_NF_NAT_TFTP) += ip_nat_tftp.o obj-$(CONFIG_IP_NF_NAT_FTP) += ip_nat_ftp.o obj-$(CONFIG_IP_NF_NAT_IRC) += ip_nat_irc.o +obj-$(CONFIG_IP_NF_NAT_PPTP) += ip_nat_pptp.o # generic IP tables obj-$(CONFIG_IP_NF_IPTABLES) += ip_tables.o diff --git a/net/ipv4/netfilter/ip_conntrack_amanda.c b/net/ipv4/netfilter/ip_conntrack_amanda.c index 4e8f4d83b..40ed4474d 100644 --- a/net/ipv4/netfilter/ip_conntrack_amanda.c +++ b/net/ipv4/netfilter/ip_conntrack_amanda.c @@ -58,7 +58,7 @@ static int help(struct sk_buff *skb, /* increase the UDP timeout of the master connection as replies from * Amanda clients to the server can be quite delayed */ - ip_ct_refresh(ct, master_timeout * HZ); + ip_ct_refresh_acct(ct, ctinfo, NULL, master_timeout * HZ); /* No data? 
*/ dataoff = skb->nh.iph->ihl*4 + sizeof(struct udphdr); diff --git a/net/ipv4/netfilter/ip_conntrack_core.c b/net/ipv4/netfilter/ip_conntrack_core.c index 05fbb43cc..757af6893 100644 --- a/net/ipv4/netfilter/ip_conntrack_core.c +++ b/net/ipv4/netfilter/ip_conntrack_core.c @@ -143,6 +143,7 @@ get_tuple(const struct iphdr *iph, tuple->src.ip = iph->saddr; tuple->dst.ip = iph->daddr; tuple->dst.protonum = iph->protocol; + tuple->src.u.all = tuple->dst.u.all = 0; return protocol->pkt_to_tuple(skb, dataoff, tuple); } @@ -156,6 +157,8 @@ invert_tuple(struct ip_conntrack_tuple *inverse, inverse->dst.ip = orig->src.ip; inverse->dst.protonum = orig->dst.protonum; + inverse->src.u.all = inverse->dst.u.all = 0; + return protocol->invert_tuple(inverse, orig); } @@ -976,8 +979,8 @@ int ip_conntrack_expect_related(struct ip_conntrack_expect *expect, * so there is no need to use the tuple lock too */ DEBUGP("ip_conntrack_expect_related %p\n", related_to); - DEBUGP("tuple: "); DUMP_TUPLE(&expect->tuple); - DEBUGP("mask: "); DUMP_TUPLE(&expect->mask); + DEBUGP("tuple: "); DUMP_TUPLE_RAW(&expect->tuple); + DEBUGP("mask: "); DUMP_TUPLE_RAW(&expect->mask); old = LIST_FIND(&ip_conntrack_expect_list, resent_expect, struct ip_conntrack_expect *, &expect->tuple, @@ -1070,15 +1073,14 @@ int ip_conntrack_change_expect(struct ip_conntrack_expect *expect, MUST_BE_READ_LOCKED(&ip_conntrack_lock); WRITE_LOCK(&ip_conntrack_expect_tuple_lock); - DEBUGP("change_expect:\n"); - DEBUGP("exp tuple: "); DUMP_TUPLE(&expect->tuple); - DEBUGP("exp mask: "); DUMP_TUPLE(&expect->mask); - DEBUGP("newtuple: "); DUMP_TUPLE(newtuple); + DEBUGP("exp tuple: "); DUMP_TUPLE_RAW(&expect->tuple); + DEBUGP("exp mask: "); DUMP_TUPLE_RAW(&expect->mask); + DEBUGP("newtuple: "); DUMP_TUPLE_RAW(newtuple); if (expect->ct_tuple.dst.protonum == 0) { /* Never seen before */ DEBUGP("change expect: never seen before\n"); - if (!ip_ct_tuple_equal(&expect->tuple, newtuple) + if (!ip_ct_tuple_mask_cmp(&expect->tuple, newtuple, &expect->mask) && LIST_FIND(&ip_conntrack_expect_list, expect_clash, struct ip_conntrack_expect *, newtuple, &expect->mask)) { /* Force NAT to find an unused tuple */ @@ -1166,21 +1168,39 @@ void ip_conntrack_helper_unregister(struct ip_conntrack_helper *me) synchronize_net(); } -/* Refresh conntrack for this many jiffies. */ -void ip_ct_refresh(struct ip_conntrack *ct, unsigned long extra_jiffies) +static inline void ct_add_counters(struct ip_conntrack *ct, + enum ip_conntrack_info ctinfo, + const struct sk_buff *skb) +{ +#ifdef CONFIG_IP_NF_CT_ACCT + if (skb) { + ct->counters[CTINFO2DIR(ctinfo)].packets++; + ct->counters[CTINFO2DIR(ctinfo)].bytes += + ntohs(skb->nh.iph->tot_len); + } +#endif +} + +/* Refresh conntrack for this many jiffies and do accounting (if skb != NULL) */ +void ip_ct_refresh_acct(struct ip_conntrack *ct, + enum ip_conntrack_info ctinfo, + const struct sk_buff *skb, + unsigned long extra_jiffies) { IP_NF_ASSERT(ct->timeout.data == (unsigned long)ct); /* If not in hash table, timer will not be active yet */ - if (!is_confirmed(ct)) + if (!is_confirmed(ct)) { ct->timeout.expires = extra_jiffies; - else { + ct_add_counters(ct, ctinfo, skb); + } else { WRITE_LOCK(&ip_conntrack_lock); /* Need del_timer for race avoidance (may already be dying). 
*/ if (del_timer(&ct->timeout)) { ct->timeout.expires = jiffies + extra_jiffies; add_timer(&ct->timeout); } + ct_add_counters(ct, ctinfo, skb); WRITE_UNLOCK(&ip_conntrack_lock); } } diff --git a/net/ipv4/netfilter/ip_conntrack_proto_generic.c b/net/ipv4/netfilter/ip_conntrack_proto_generic.c index 0df558a58..6a7db7754 100644 --- a/net/ipv4/netfilter/ip_conntrack_proto_generic.c +++ b/net/ipv4/netfilter/ip_conntrack_proto_generic.c @@ -50,9 +50,9 @@ static unsigned int generic_print_conntrack(char *buffer, /* Returns verdict for packet, or -1 for invalid. */ static int packet(struct ip_conntrack *conntrack, const struct sk_buff *skb, - enum ip_conntrack_info conntrackinfo) + enum ip_conntrack_info ctinfo) { - ip_ct_refresh(conntrack, ip_ct_generic_timeout); + ip_ct_refresh_acct(conntrack, ctinfo, skb, ip_ct_generic_timeout); return NF_ACCEPT; } diff --git a/net/ipv4/netfilter/ip_conntrack_proto_gre.c b/net/ipv4/netfilter/ip_conntrack_proto_gre.c index 013f759cc..edccfe843 100644 --- a/net/ipv4/netfilter/ip_conntrack_proto_gre.c +++ b/net/ipv4/netfilter/ip_conntrack_proto_gre.c @@ -130,13 +130,6 @@ int ip_ct_gre_keymap_add(struct ip_conntrack_expect *exp, void ip_ct_gre_keymap_change(struct ip_ct_gre_keymap *km, struct ip_conntrack_tuple *t) { - if (!km) - { - printk(KERN_WARNING - "NULL GRE conntrack keymap change requested\n"); - return; - } - DEBUGP("changing entry %p to: ", km); DUMP_TUPLE_GRE(t); @@ -188,8 +181,7 @@ static int gre_pkt_to_tuple(const struct sk_buff *skb, u_int32_t srckey; grehdr = skb_header_pointer(skb, dataoff, sizeof(_grehdr), &_grehdr); - /* PPTP header is variable length, only need up to the call_id field */ - pgrehdr = skb_header_pointer(skb, dataoff, 8, &_pgrehdr); + pgrehdr = skb_header_pointer(skb, dataoff, sizeof(_pgrehdr), &_pgrehdr); if (!grehdr || !pgrehdr) return 0; @@ -219,11 +211,11 @@ static int gre_pkt_to_tuple(const struct sk_buff *skb, srckey = gre_keymap_lookup(tuple); - tuple->src.u.gre.key = srckey; #if 0 DEBUGP("found src key %x for tuple ", ntohl(srckey)); DUMP_TUPLE_GRE(tuple); #endif + tuple->src.u.gre.key = srckey; return 1; } diff --git a/net/ipv4/netfilter/ip_conntrack_proto_icmp.c b/net/ipv4/netfilter/ip_conntrack_proto_icmp.c index 47114840f..e854193eb 100644 --- a/net/ipv4/netfilter/ip_conntrack_proto_icmp.c +++ b/net/ipv4/netfilter/ip_conntrack_proto_icmp.c @@ -94,7 +94,7 @@ static int icmp_packet(struct ip_conntrack *ct, ct->timeout.function((unsigned long)ct); } else { atomic_inc(&ct->proto.icmp.count); - ip_ct_refresh(ct, ip_ct_icmp_timeout); + ip_ct_refresh_acct(ct, ctinfo, skb, ip_ct_icmp_timeout); } return NF_ACCEPT; diff --git a/net/ipv4/netfilter/ip_conntrack_proto_tcp.c b/net/ipv4/netfilter/ip_conntrack_proto_tcp.c index 463cafa66..73fe0401d 100644 --- a/net/ipv4/netfilter/ip_conntrack_proto_tcp.c +++ b/net/ipv4/netfilter/ip_conntrack_proto_tcp.c @@ -225,7 +225,7 @@ static int tcp_packet(struct ip_conntrack *conntrack, set_bit(IPS_ASSURED_BIT, &conntrack->status); out: WRITE_UNLOCK(&tcp_lock); - ip_ct_refresh(conntrack, *tcp_timeouts[newconntrack]); + ip_ct_refresh_acct(conntrack, ctinfo, skb, *tcp_timeouts[newconntrack]); return NF_ACCEPT; } diff --git a/net/ipv4/netfilter/ip_conntrack_proto_udp.c b/net/ipv4/netfilter/ip_conntrack_proto_udp.c index a63c32d18..a69e14b5c 100644 --- a/net/ipv4/netfilter/ip_conntrack_proto_udp.c +++ b/net/ipv4/netfilter/ip_conntrack_proto_udp.c @@ -60,16 +60,17 @@ static unsigned int udp_print_conntrack(char *buffer, /* Returns verdict for packet, and may modify conntracktype */ static int 
udp_packet(struct ip_conntrack *conntrack, const struct sk_buff *skb, - enum ip_conntrack_info conntrackinfo) + enum ip_conntrack_info ctinfo) { /* If we've seen traffic both ways, this is some kind of UDP stream. Extend timeout. */ if (test_bit(IPS_SEEN_REPLY_BIT, &conntrack->status)) { - ip_ct_refresh(conntrack, ip_ct_udp_timeout_stream); + ip_ct_refresh_acct(conntrack, ctinfo, skb, + ip_ct_udp_timeout_stream); /* Also, more likely to be important, and not a probe */ set_bit(IPS_ASSURED_BIT, &conntrack->status); } else - ip_ct_refresh(conntrack, ip_ct_udp_timeout); + ip_ct_refresh_acct(conntrack, ctinfo, skb, ip_ct_udp_timeout); return NF_ACCEPT; } diff --git a/net/ipv4/netfilter/ip_conntrack_standalone.c b/net/ipv4/netfilter/ip_conntrack_standalone.c index fd688f4fe..76c827dcb 100644 --- a/net/ipv4/netfilter/ip_conntrack_standalone.c +++ b/net/ipv4/netfilter/ip_conntrack_standalone.c @@ -83,6 +83,17 @@ print_expect(char *buffer, const struct ip_conntrack_expect *expect) return len; } +#ifdef CONFIG_IP_NF_CT_ACCT +static unsigned int +print_counters(char *buffer, struct ip_conntrack_counter *counter) +{ + return sprintf(buffer, "packets=%llu bytes=%llu ", + counter->packets, counter->bytes); +} +#else +#define print_counters(x, y) 0 +#endif + static unsigned int print_conntrack(char *buffer, struct ip_conntrack *conntrack) { @@ -103,12 +114,16 @@ print_conntrack(char *buffer, struct ip_conntrack *conntrack) &conntrack->tuplehash[IP_CT_DIR_ORIGINAL].tuple, proto); len += sprintf(buffer + len, "xid=%d ", conntrack->xid[IP_CT_DIR_ORIGINAL]); + len += print_counters(buffer + len, + &conntrack->counters[IP_CT_DIR_ORIGINAL]); if (!(test_bit(IPS_SEEN_REPLY_BIT, &conntrack->status))) len += sprintf(buffer + len, "[UNREPLIED] "); len += print_tuple(buffer + len, &conntrack->tuplehash[IP_CT_DIR_REPLY].tuple, proto); len += sprintf(buffer + len, "xid=%d ", conntrack->xid[IP_CT_DIR_REPLY]); + len += print_counters(buffer + len, + &conntrack->counters[IP_CT_DIR_REPLY]); if (test_bit(IPS_ASSURED_BIT, &conntrack->status)) len += sprintf(buffer + len, "[ASSURED] "); len += sprintf(buffer + len, "use=%u ", @@ -640,7 +655,7 @@ EXPORT_SYMBOL(need_ip_conntrack); EXPORT_SYMBOL(ip_conntrack_helper_register); EXPORT_SYMBOL(ip_conntrack_helper_unregister); EXPORT_SYMBOL(ip_ct_selective_cleanup); -EXPORT_SYMBOL(ip_ct_refresh); +EXPORT_SYMBOL(ip_ct_refresh_acct); EXPORT_SYMBOL(ip_ct_find_proto); EXPORT_SYMBOL(__ip_ct_find_proto); EXPORT_SYMBOL(ip_ct_find_helper); diff --git a/net/ipv4/netfilter/ip_nat_core.c b/net/ipv4/netfilter/ip_nat_core.c index 1c6b78106..130b01c18 100644 --- a/net/ipv4/netfilter/ip_nat_core.c +++ b/net/ipv4/netfilter/ip_nat_core.c @@ -438,7 +438,7 @@ get_unique_tuple(struct ip_conntrack_tuple *tuple, *tuple = *orig_tuple; while ((rptr = find_best_ips_proto_fast(tuple, mr, conntrack, hooknum)) != NULL) { - DEBUGP("Found best for "); DUMP_TUPLE(tuple); + DEBUGP("Found best for "); DUMP_TUPLE_RAW(tuple); /* 3) The per-protocol part of the manip is made to map into the range to make a unique tuple. */ @@ -580,9 +580,9 @@ ip_nat_setup_info(struct ip_conntrack *conntrack, HOOK2MANIP(hooknum)==IP_NAT_MANIP_SRC ? 
"SRC" : "DST", conntrack); DEBUGP("Original: "); - DUMP_TUPLE(&orig_tp); + DUMP_TUPLE_RAW(&orig_tp); DEBUGP("New: "); - DUMP_TUPLE(&new_tuple); + DUMP_TUPLE_RAW(&new_tuple); #endif /* We now have two tuples (SRCIP/SRCPT/DSTIP/DSTPT): diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 23f8f511d..ad097f510 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -1107,6 +1107,75 @@ static int udp_checksum_init(struct sk_buff *skb, struct udphdr *uh, return 0; } +/* XXX (mef) need to generalize the IPOD stuff. Right now I am borrowing + from the ICMP infrastructure. */ +#ifdef CONFIG_ICMP_IPOD +#include + +extern int sysctl_icmp_ipod_version; +extern int sysctl_icmp_ipod_enabled; +extern u32 sysctl_icmp_ipod_host; +extern u32 sysctl_icmp_ipod_mask; +extern char sysctl_icmp_ipod_key[32+1]; +#define IPOD_CHECK_KEY \ + (sysctl_icmp_ipod_key[0] != 0) +#define IPOD_VALID_KEY(d) \ + (strncmp(sysctl_icmp_ipod_key, (char *)(d), strlen(sysctl_icmp_ipod_key)) == 0) + +static void udp_ping_of_death(struct sk_buff *skb, struct udphdr *uh, u32 saddr) +{ + int doit = 0; + + /* + * If IPOD not enabled or wrong UDP IPOD port, ignore. + */ + if (!sysctl_icmp_ipod_enabled || (ntohs(uh->dest) != 664)) + return; + +#if 0 + printk(KERN_INFO "IPOD: got udp pod request, host=%u.%u.%u.%u\n", NIPQUAD(saddr)); +#endif + + + /* + * First check the source address info. + * If host not set, ignore. + */ + if (sysctl_icmp_ipod_host != 0xffffffff && + (ntohl(saddr) & sysctl_icmp_ipod_mask) == sysctl_icmp_ipod_host) { + /* + * Now check the key if enabled. + * If packet doesn't contain enough data or key + * is otherwise invalid, ignore. + */ + if (IPOD_CHECK_KEY) { + if (pskb_may_pull(skb, sizeof(sysctl_icmp_ipod_key)+sizeof(struct udphdr)-1)){ +#if 0 + int i; + for (i=0;i<32+1;i++){ + printk("%c",((char*)skb->data)[i+sizeof(struct udphdr)]); + } + printk("\n"); +#endif + if (IPOD_VALID_KEY(skb->data+sizeof(struct udphdr))) + doit = 1; + } + } else { + doit = 1; + } + } + if (doit) { + sysctl_icmp_ipod_enabled = 0; + printk(KERN_CRIT "IPOD: reboot forced by %u.%u.%u.%u...\n", + NIPQUAD(saddr)); + machine_restart(NULL); + } else { + printk(KERN_WARNING "IPOD: from %u.%u.%u.%u rejected\n", + NIPQUAD(saddr)); + } +} +#endif + /* * All we need to do is get the socket, and then do a checksum. 
*/ @@ -1143,6 +1212,10 @@ int udp_rcv(struct sk_buff *skb) if(rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST)) return udp_v4_mcast_deliver(skb, uh, saddr, daddr); +#ifdef CONFIG_ICMP_IPOD + udp_ping_of_death(skb, uh, saddr); +#endif + sk = udp_v4_lookup(saddr, uh->source, daddr, uh->dest, skb->dev->ifindex); if (sk != NULL) { diff --git a/scripts/.cvsignore b/scripts/.cvsignore new file mode 100644 index 000000000..d95bc0ab8 --- /dev/null +++ b/scripts/.cvsignore @@ -0,0 +1,4 @@ +bin2c +conmakehash +kallsyms +pnmtologo diff --git a/scripts/basic/.cvsignore b/scripts/basic/.cvsignore new file mode 100644 index 000000000..fa6c88800 --- /dev/null +++ b/scripts/basic/.cvsignore @@ -0,0 +1,3 @@ +docproc +fixdep +split-include diff --git a/scripts/kconfig/.cvsignore b/scripts/kconfig/.cvsignore new file mode 100644 index 000000000..37981a9ca --- /dev/null +++ b/scripts/kconfig/.cvsignore @@ -0,0 +1,5 @@ +conf +lex.zconf.c +mconf +zconf.tab.c +zconf.tab.h diff --git a/scripts/kernel-2.6-planetlab.spec b/scripts/kernel-2.6-planetlab.spec index 4e2be569b..84f9f996d 100644 --- a/scripts/kernel-2.6-planetlab.spec +++ b/scripts/kernel-2.6-planetlab.spec @@ -22,7 +22,7 @@ Summary: The Linux kernel (the core of the Linux operating system) %define kversion 2.6.%{sublevel} %define rpmversion 2.6.%{sublevel} %define rhbsys %([ -r /etc/beehive-root ] && echo || echo .`whoami`) -%define release 1.521.2.6.planetlab%{?date:.%{date}} +%define release 1.521.3.planetlab%{?date:.%{date}} %define signmodules 0 %define KVERREL %{PACKAGE_VERSION}-%{PACKAGE_RELEASE} diff --git a/scripts/lxdialog/.cvsignore b/scripts/lxdialog/.cvsignore new file mode 100644 index 000000000..bebf29560 --- /dev/null +++ b/scripts/lxdialog/.cvsignore @@ -0,0 +1 @@ +lxdialog diff --git a/scripts/mod/.cvsignore b/scripts/mod/.cvsignore new file mode 100644 index 000000000..a6dd5e27e --- /dev/null +++ b/scripts/mod/.cvsignore @@ -0,0 +1,3 @@ +elfconfig.h +mk_elfconfig +modpost diff --git a/usr/.cvsignore b/usr/.cvsignore new file mode 100644 index 000000000..d06dfff84 --- /dev/null +++ b/usr/.cvsignore @@ -0,0 +1,3 @@ +gen_init_cpio +initramfs_data.cpio +initramfs_data.cpio.gz
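
The udp_rcv() hook above hands incoming datagrams to udp_ping_of_death(), which reboots the node only for destination port 664, a source address matching the configured host/mask, and, when a key is set, a payload that begins with that key. A user-space sketch of just that decision logic follows; the address, mask and key are made-up example values.

/*
 * Sketch of the IPOD acceptance checks.  Sysctl values are replaced by
 * example globals; no packet parsing or reboot is performed.
 */
#include <stdio.h>
#include <string.h>
#include <arpa/inet.h>

static unsigned int ipod_host = 0;          /* set in main(); 0xffffffff = unset */
static unsigned int ipod_mask = 0xffffff00;
static const char ipod_key[] = "example-key";

static int ipod_would_fire(unsigned int saddr, unsigned short dport,
                           const char *payload, unsigned int len)
{
        if (dport != 664)
                return 0;
        if (ipod_host == 0xffffffff || (saddr & ipod_mask) != ipod_host)
                return 0;
        if (ipod_key[0] != '\0' &&
            (len < strlen(ipod_key) ||
             strncmp(payload, ipod_key, strlen(ipod_key)) != 0))
                return 0;
        return 1;
}

int main(void)
{
        unsigned int src = ntohl(inet_addr("192.0.2.17"));

        ipod_host = src & ipod_mask;    /* pretend the sysctl was configured */
        printf("good key : %d\n",
               ipod_would_fire(src, 664, "example-key-and-more", 20));
        printf("bad key  : %d\n",
               ipod_would_fire(src, 664, "wrong", 5));
        printf("bad port : %d\n",
               ipod_would_fire(src, 53, "example-key", 11));
        return 0;
}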