This commit was manufactured by cvs2svn to create tag

author Planet-Lab Support <support@planet-lab.org>

Fri, 21 Jan 2005 03:34:25 +0000 (03:34 +0000)

committer Planet-Lab Support <support@planet-lab.org>

Fri, 21 Jan 2005 03:34:25 +0000 (03:34 +0000)
author Planet-Lab Support <support@planet-lab.org>
Fri, 21 Jan 2005 03:34:25 +0000 (03:34 +0000)
committer Planet-Lab Support <support@planet-lab.org>
Fri, 21 Jan 2005 03:34:25 +0000 (03:34 +0000)
diff --git a/.cvsignore b/.cvsignore

new file mode 100644 (file)

index 0000000..5e7d074
--- /dev/null
+++ b/.cvsignore
@@ -0,0 +1,13 @@
+.config
+.tmp_System.map
+.tmp_kallsyms1.S
+.tmp_kallsyms2.S
+.tmp_kallsyms3.S
+.tmp_versions
+.tmp_vmlinux1
+.tmp_vmlinux2
+.tmp_vmlinux3
+.version
+Module.symvers
+System.map
+vmlinux
diff --git a/Documentation/ckrm/cpusched b/Documentation/ckrm/cpusched

new file mode 100644 (file)

index 0000000..01f7f23
--- /dev/null
+++ b/Documentation/ckrm/cpusched
@@ -0,0 +1,86 @@
+CKRM CPU Scheduling 
+===================
+
+Overview
+--------
+
+In CKRM, cpu scheduling is based on a two level scheduling decision.
+Every time a new task is to be selected, the scheduler first determines
+which class to run next and then schedules the next task in the selected
+class.
+
+The scheduling within a class is performed using the default Linux
+O(1) scheduler.
+
+The class scheduler also follows the O(1) principle and works as
+follows: 
+
+Each class maintains a local runqueue per cpu aka <struct
+ckrm_runqueue> or short lrq. The existing O(1) scheduler is used to
+schedule within an <lrq>.
+
+Weights are assigned to each lrq that mirror the effective shares of
+that class. Every time a task executes, its weighted cycles are
+charged against its class. Thus classes progress in a time called
+cumulative virtual time (CVT). In essence the class with the smallest
+CVT is selected next. Provisions are made to keep interactivity and
+avoid starvation by longer sleeping classes.
+
+Load balancing across an SMP is performed by balancing the load of
+each class across CPUs such that they produce equal load and thus 
+on the whole system maintain their share.
+
+Due to the fact that CKRM uses a class hierarchy, cycles that are unused
+by a class are redistributed among its busy siblings.
+
+Enabling the CKRM CPU scheduler
+-------------------------------
+
+The scheduler is integrated into the linux scheduler and therefore
+cannot be loaded dynamically like other CKRM schedulers.
+
+However, it can be selected at boot time or dynamically at run time.
+
+The boot options "ckrmcpu" OR "nockrmcpu" enable / disable the CKRM
+cpu scheduler at boot time. Currently by default the scheduler is
+disabled.
+
+# cat /rcfs/taskclass/config 
+
+"res=cpu,mode=enabled" indicates that the CKRM cpu scheduler is
+enabled
+
+"res=cpu,mode=disabled" indicates that the CKRM cpu scheduler is
+disabled
+
+The strings can also be used to dynamically change the scheduling mode
+at runtime. For example, to dynamically activate the scheduler:
+
+# echo "res=cpu,mode=enabled" > /rcfs/taskclass/config
+
+# cat /rcfs/taskclass/*/stats
+
+The cpu portion of the scheduler is shown
+
+    "cpu-usage(2,10,60)= 290 340 510"
+
+The 3 numbers represent the load over the last 2 seconds, 10 seconds
+and 60 seconds. The base = 1000.
+Hence the system has a load of 29.0%, 34.0% and 51.0% respectively.
+
+For debugging purposes additional information can be printed out but
+that format should not be relied upon. 
+
+Use `echo "res=cpu,usage_detail=3"` for the highest detail on usage.
+Please consult the source code for the specifics.
+
+Assigning shares
+----------------
+
+Follows the general approach described under ckrm_basics.
+
+# echo "res=cpu,guarantee=val" > shares   
+
+sets the minimum guarantee of a class.
+
+
+
diff --git a/MAINTAINERS b/MAINTAINERS

index c8c25df..523f115 100644 (file)
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -1226,6 +1226,17 @@ W:       http://nfs.sourceforge.net/
  W:     http://www.cse.unsw.edu.au/~neilb/patches/linux-devel/
  S:     Maintained
  
+KEXEC
+P:     Eric Biederman
+P:     Randy Dunlap
+M:     ebiederm@xmission.com
+M:     rddunlap@osdl.org
+W:     http://www.xmission.com/~ebiederm/files/kexec/
+W:     http://developer.osdl.org/rddunlap/kexec/
+L:     linux-kernel@vger.kernel.org
+L:     fastboot@osdl.org
+S:     Maintained
+
  LANMEDIA WAN CARD DRIVER
  P:     Andrew Stanley-Jones
  M:     asj@lanmedia.com
diff --git a/Makefile b/Makefile

index 4d94580..c576843 100644 (file)
--- a/Makefile
+++ b/Makefile
@@ -1,7 +1,7 @@
  VERSION = 2
  PATCHLEVEL = 6
  SUBLEVEL = 8
-EXTRAVERSION = -1.521.2.5.planetlab
+EXTRAVERSION = -1.521.3.planetlab
  NAME=Zonked Quokka
  
  # *DOCUMENTATION*
@@ -453,6 +453,10 @@ ifndef CONFIG_FRAME_POINTER
  CFLAGS         += -fomit-frame-pointer
  endif
  
+ifdef CONFIG_X86_STACK_CHECK
+CFLAGS         += -p
+endif
+
  ifdef CONFIG_DEBUG_INFO
  CFLAGS         += -g
  endif
diff --git a/arch/i386/Kconfig b/arch/i386/Kconfig

index 15b003b..3a3ba7f 100644 (file)
--- a/arch/i386/Kconfig
+++ b/arch/i386/Kconfig
@@ -926,6 +926,74 @@ config REGPARM
         generate incorrect output with certain kernel constructs when
         -mregparm=3 is used.
  
+config IRQSTACKS
+       bool "Use separate IRQ stacks"
+       help
+       If you say Y here the kernel will use a separate IRQ stack on each
+       cpu to handle interrupts.
+
+config STACK_SIZE_SHIFT
+       int "Kernel stack size (12 => 4KB, 13 => 8KB, 14 => 16KB)"
+       range 12 14
+       default 12 if IRQSTACKS
+       default 13
+       help
+       Select kernel stack size.  4KB stacks are best as they let
+       the system scale further.  Use 8KB stacks if you have an
+       experimental kernel where a stack overflow with a 4KB stack
+       might occur.  Use 16KB stacks if you want to safely support
+       Windows device drivers using either Linuxant or ndiswrapper.
+
+config STACK_WARN
+       int "Print stack trace when stack grows beyond specified bytes"
+       default 4096 if IRQSTACKS
+       default 4096
+       help
+       The kernel will print a stack trace when the current stack exceeds
+       the specified size.
+
+config X86_STACK_CHECK
+       bool "Check for stack overflows"
+       default n
+       help
+       Say Y here to have the kernel attempt to detect when the per-task
+       kernel stack overflows.
+
+       Some older versions of gcc don't handle the -p option correctly.
+       Kernprof is affected by the same problem, which is described here:
+       http://oss.sgi.com/projects/kernprof/faq.html#Q9
+
+       Basically, if you get oopses in __free_pages_ok during boot when
+       you have this turned on, you need to fix gcc. The Redhat 2.96
+       version and gcc-3.x seem to work.
+
+       If not debugging a stack overflow problem, say N
+
+config STACK_PANIC
+       int "Panic when stack approaches within specified bytes of the stack limit"
+       depends on X86_STACK_CHECK
+       default 512 if IRQSTACKS
+       default 512
+       help
+       Panic if the stack grows to within specified byte range.
+
+config KEXEC
+       bool "kexec system call (EXPERIMENTAL)"
+       depends on EXPERIMENTAL
+       help
+         kexec is a system call that implements the ability to shutdown your
+         current kernel, and to start another kernel.  It is like a reboot
+         but it is independent of the system firmware.   And like a reboot
+         you can start any kernel with it, not just Linux.
+
+         The name comes from the similarity to the exec system call.
+
+         It is an ongoing process to be certain the hardware in a machine
+         is properly shutdown, so do not be surprised if this code does not
+         initially work for you.  It may help to enable device hotplugging
+         support.  As of this writing the exact hardware interface is
+         strongly in flux, so no good recommendation can be made.
+
  endmenu
  
  
diff --git a/arch/i386/boot/.cvsignore b/arch/i386/boot/.cvsignore

new file mode 100644 (file)

index 0000000..2d8a3af
--- /dev/null
+++ b/arch/i386/boot/.cvsignore
@@ -0,0 +1,4 @@
+bootsect
+bzImage
+setup
+vmlinux.bin
diff --git a/arch/i386/boot/compressed/.cvsignore b/arch/i386/boot/compressed/.cvsignore

new file mode 100644 (file)

index 0000000..96b1b00
--- /dev/null
+++ b/arch/i386/boot/compressed/.cvsignore
@@ -0,0 +1,3 @@
+vmlinux
+vmlinux.bin
+vmlinux.bin.gz
diff --git a/arch/i386/boot/compressed/misc.c b/arch/i386/boot/compressed/misc.c

index fa67045..8745683 100644 (file)
--- a/arch/i386/boot/compressed/misc.c
+++ b/arch/i386/boot/compressed/misc.c
@@ -380,3 +380,6 @@ asmlinkage int decompress_kernel(struct moveparams *mv, void *rmode)
         if (high_loaded) close_output_buffer_if_we_run_high(mv);
         return high_loaded;
  }
+
+/* We don't actually check for stack overflows this early. */
+__asm__(".globl mcount ; mcount: ret\n");
diff --git a/arch/i386/boot/tools/.cvsignore b/arch/i386/boot/tools/.cvsignore

new file mode 100644 (file)

index 0000000..378eac2
--- /dev/null
+++ b/arch/i386/boot/tools/.cvsignore
@@ -0,0 +1 @@
+build
diff --git a/arch/i386/defconfig b/arch/i386/defconfig

index aed3bc2..ed2bbb5 100644 (file)
--- a/arch/i386/defconfig
+++ b/arch/i386/defconfig
@@ -1221,7 +1221,7 @@ CONFIG_OPROFILE=y
  CONFIG_EARLY_PRINTK=y
  CONFIG_DEBUG_SPINLOCK_SLEEP=y
  # CONFIG_FRAME_POINTER is not set
-CONFIG_4KSTACKS=y
+# CONFIG_4KSTACKS is not set
  CONFIG_X86_FIND_SMP_CONFIG=y
  CONFIG_X86_MPPARSE=y
  
diff --git a/arch/i386/kernel/.cvsignore b/arch/i386/kernel/.cvsignore

new file mode 100644 (file)

index 0000000..21c2876
--- /dev/null
+++ b/arch/i386/kernel/.cvsignore
@@ -0,0 +1,2 @@
+asm-offsets.s
+vmlinux.lds.s
diff --git a/arch/i386/kernel/Makefile b/arch/i386/kernel/Makefile

index a056d50..ab1ef80 100644 (file)
--- a/arch/i386/kernel/Makefile
+++ b/arch/i386/kernel/Makefile
@@ -23,6 +23,7 @@ obj-$(CONFIG_X86_TRAMPOLINE)  += trampoline.o
  obj-$(CONFIG_X86_MPPARSE)      += mpparse.o
  obj-$(CONFIG_X86_LOCAL_APIC)   += apic.o nmi.o
  obj-$(CONFIG_X86_IO_APIC)      += io_apic.o
+obj-$(CONFIG_KEXEC)            += machine_kexec.o relocate_kernel.o
  obj-$(CONFIG_X86_NUMAQ)                += numaq.o
  obj-$(CONFIG_X86_SUMMIT_NUMA)  += summit.o
  obj-$(CONFIG_MODULES)          += module.o
diff --git a/arch/i386/kernel/apic.c b/arch/i386/kernel/apic.c

index ecf2b63..eb4d416 100644 (file)
--- a/arch/i386/kernel/apic.c
+++ b/arch/i386/kernel/apic.c
@@ -193,6 +193,36 @@ void disconnect_bsp_APIC(void)
                 outb(0x70, 0x22);
                 outb(0x00, 0x23);
         }
+       else {
+               /* Go back to Virtual Wire compatibility mode */
+               unsigned long value;
+
+               /* For the spurious interrupt use vector F, and enable it */
+               value = apic_read(APIC_SPIV);
+               value &= ~APIC_VECTOR_MASK;
+               value |= APIC_SPIV_APIC_ENABLED;
+               value |= 0xf;
+               apic_write_around(APIC_SPIV, value);
+
+               /* For LVT0 make it edge triggered, active high, external and enabled */
+               value = apic_read(APIC_LVT0);
+               value &= ~(APIC_MODE_MASK | APIC_SEND_PENDING |
+                       APIC_INPUT_POLARITY | APIC_LVT_REMOTE_IRR |
+                       APIC_LVT_LEVEL_TRIGGER | APIC_LVT_MASKED );
+               value |= APIC_LVT_REMOTE_IRR | APIC_SEND_PENDING;
+               value = SET_APIC_DELIVERY_MODE(value, APIC_MODE_EXINT);
+               apic_write_around(APIC_LVT0, value);
+
+               /* For LVT1 make it edge triggered, active high, nmi and enabled */
+               value = apic_read(APIC_LVT1);
+               value &= ~(
+                       APIC_MODE_MASK | APIC_SEND_PENDING |
+                       APIC_INPUT_POLARITY | APIC_LVT_REMOTE_IRR |
+                       APIC_LVT_LEVEL_TRIGGER | APIC_LVT_MASKED);
+               value |= APIC_LVT_REMOTE_IRR | APIC_SEND_PENDING;
+               value = SET_APIC_DELIVERY_MODE(value, APIC_MODE_NMI);
+               apic_write_around(APIC_LVT1, value);
+       }
  }
  
  void disable_local_APIC(void)
diff --git a/arch/i386/kernel/asm-offsets.c b/arch/i386/kernel/asm-offsets.c

index 43943f8..b03f579 100644 (file)
--- a/arch/i386/kernel/asm-offsets.c
+++ b/arch/i386/kernel/asm-offsets.c
@@ -7,11 +7,11 @@
  #include <linux/sched.h>
  #include <linux/signal.h>
  #include <linux/personality.h>
+#include <linux/thread_info.h>
  #include <asm/ucontext.h>
  #include "sigframe.h"
  #include <asm/fixmap.h>
  #include <asm/processor.h>
-#include <asm/thread_info.h>
  
  #define DEFINE(sym, val) \
          asm volatile("\n->" #sym " %0 " #val : : "i" (val))
diff --git a/arch/i386/kernel/entry.S b/arch/i386/kernel/entry.S

index 3ac7418..dfbade1 100644 (file)
--- a/arch/i386/kernel/entry.S
+++ b/arch/i386/kernel/entry.S
@@ -1029,8 +1029,55 @@ ENTRY(sys_call_table)
         .long sys_mq_timedreceive       /* 280 */
         .long sys_mq_notify
         .long sys_mq_getsetattr
-       .long sys_ni_syscall            /* reserved for kexec */
+       .long sys_kexec_load
         .long sys_ioprio_set
         .long sys_ioprio_get            /* 285 */
  
  syscall_table_size=(.-sys_call_table)
+
+#ifdef CONFIG_X86_STACK_CHECK
+.data
+.globl stack_overflowed
+stack_overflowed:
+       .long 0
+.text
+
+/*
+ * mcount - stack-depth check, built only when CONFIG_X86_STACK_CHECK is set.
+ *
+ * Computes the offset of %esp inside the current stack area
+ * (%esp & (THREAD_SIZE - 1)).  When that offset is at or below STACK_WARN
+ * and the stack_overflowed flag was not already set, stack_overflow() is
+ * called.  %eax is saved on entry and restored on every return path.
+ */
+ENTRY(mcount)
+#warning stack check enabled
+       push %eax
+       movl $(THREAD_SIZE - 1),%eax
+       andl %esp,%eax
+       cmpl $STACK_WARN,%eax
+       jle 1f
+2:
+       /* common exit: restore the caller's %eax and return */
+       popl %eax
+       ret
+1:
+       /* Prevent infinite recursion from calls to mcount made by the
+        * stack_overflow function itself: atomically test-and-set bit 0
+        * of stack_overflowed and bail out if it was already set.
+        * Need to revisit this code for SMP based systems.
+        */
+       lock; btsl $0,stack_overflowed
+       jc 2b
+
+       /* Save %ebp, %ebx, %esi and %edi around the call to
+        * stack_overflow().
+        * NOTE(review): %ecx and %edx are not saved here - confirm that
+        * no caller relies on them surviving the mcount call.
+        */
+       pushl %ebp
+       pushl %ebx
+       pushl %esi
+       pushl %edi
+       
+       call stack_overflow
+       /* NOTE(review): stack_overflow() was meant to clear the
+        * stack_overflowed variable, but the implementation in
+        * arch/i386/kernel/process.c has that statement commented out,
+        * so the flag appears to stay set after the first report.
+        */
+
+       popl %edi
+       popl %esi
+       popl %ebx
+       popl %ebp
+       
+       popl %eax       
+       ret
+#endif
diff --git a/arch/i386/kernel/i386_ksyms.c b/arch/i386/kernel/i386_ksyms.c

index 5a50c53..584982c 100644 (file)
--- a/arch/i386/kernel/i386_ksyms.c
+++ b/arch/i386/kernel/i386_ksyms.c
@@ -188,6 +188,12 @@ EXPORT_SYMBOL(atomic_dec_and_lock);
  
  EXPORT_SYMBOL(__PAGE_KERNEL);
  
+#ifdef CONFIG_X86_STACK_CHECK
+extern void mcount(void);
+EXPORT_SYMBOL(mcount);
+#endif
+
+
  #ifdef CONFIG_HIGHMEM
  EXPORT_SYMBOL(kmap);
  EXPORT_SYMBOL(kunmap);
diff --git a/arch/i386/kernel/i8259.c b/arch/i386/kernel/i8259.c

index 97653d2..7141d27 100644 (file)
--- a/arch/i386/kernel/i8259.c
+++ b/arch/i386/kernel/i8259.c
@@ -244,9 +244,21 @@ static int i8259A_resume(struct sys_device *dev)
         return 0;
  }
  
+/*
+ * Shutdown hook registered in i8259_sysdev_class: writes 0xff to ports
+ * 0x21 and 0xA1 to mask every interrupt line.  @dev is unused and the
+ * function always returns 0.
+ */
+static int i8259A_shutdown(struct sys_device *dev)
+{
+      /* Put the i8259A into a quiescent state that
+       * the kernel initialization code can get it
+       * out of.
+       */
+      outb(0xff, 0x21);       /* mask all of 8259A-1 */
+      outb(0xff, 0xA1);       /* mask all of 8259A-2 */
+      return 0;
+}
+
  static struct sysdev_class i8259_sysdev_class = {
         set_kset_name("i8259"),
         .resume = i8259A_resume,
+        .shutdown = i8259A_shutdown,
  };
  
  static struct sys_device device_i8259A = {
diff --git a/arch/i386/kernel/init_task.c b/arch/i386/kernel/init_task.c

index 7422d73..30cfd40 100644 (file)
--- a/arch/i386/kernel/init_task.c
+++ b/arch/i386/kernel/init_task.c
@@ -29,6 +29,13 @@ union thread_union init_thread_union
         __attribute__((__section__(".data.init_task"))) =
                 { INIT_THREAD_INFO(init_task, init_thread_union) };
  
+#ifdef CONFIG_X86_STACK_CHECK
+union thread_union stack_overflow_stack
+ __attribute__((__section__(".data.init_task"))) =
+               { INIT_THREAD_INFO(init_task, stack_overflow_stack) };
+#endif
+
+
  /*
   * Initial task structure.
   *
diff --git a/arch/i386/kernel/io_apic.c b/arch/i386/kernel/io_apic.c

index 39af35d..f600e67 100644 (file)
--- a/arch/i386/kernel/io_apic.c
+++ b/arch/i386/kernel/io_apic.c
@@ -1604,11 +1604,42 @@ static void __init enable_IO_APIC(void)
   */
  void disable_IO_APIC(void)
  {
+       int pin;
         /*
          * Clear the IO-APIC before rebooting:
          */
         clear_IO_APIC();
  
+       /*
+        * If the i8259 is routed through an IOAPIC
+        * Put that IOAPIC in virtual wire mode
+        * so legacy interrupts can be delivered.
+        */
+       pin = find_isa_irq_pin(0, mp_ExtINT);
+       if (pin != -1) {
+               struct IO_APIC_route_entry entry;
+               unsigned long flags;
+
+               memset(&entry, 0, sizeof(entry));
+               entry.mask            = 0; /* Enabled */
+               entry.trigger         = 0; /* Edge */
+               entry.irr             = 0;
+               entry.polarity        = 0; /* High */
+               entry.delivery_status = 0;
+               entry.dest_mode       = 0; /* Physical */
+               entry.delivery_mode   = 7; /* ExtInt */
+               entry.vector          = 0;
+               entry.dest.physical.physical_dest = 0;
+
+
+               /*
+                * Add it to the IO-APIC irq-routing table:
+                */
+               spin_lock_irqsave(&ioapic_lock, flags);
+               io_apic_write(0, 0x11+2*pin, *(((int *)&entry)+1));
+               io_apic_write(0, 0x10+2*pin, *(((int *)&entry)+0));
+               spin_unlock_irqrestore(&ioapic_lock, flags);
+       }
         disconnect_bsp_APIC();
  }
  
diff --git a/arch/i386/kernel/irq.c b/arch/i386/kernel/irq.c

index 22f7fc7..1c8beda 100644 (file)
--- a/arch/i386/kernel/irq.c
+++ b/arch/i386/kernel/irq.c
@@ -76,8 +76,10 @@ static void register_irq_proc (unsigned int irq);
  /*
   * per-CPU IRQ handling stacks
   */
+#ifdef CONFIG_IRQSTACKS
  union irq_ctx *hardirq_ctx[NR_CPUS];
  union irq_ctx *softirq_ctx[NR_CPUS];
+#endif
  
  /*
   * Special irq handlers.
@@ -220,6 +222,9 @@ asmlinkage int handle_IRQ_event(unsigned int irq,
         int status = 1; /* Force the "do bottom halves" bit */
         int retval = 0;
  
+       if (!(action->flags & SA_INTERRUPT))
+               local_irq_enable();
+
         do {
                 status |= action->flags;
                 retval |= action->handler(irq, action->dev_id, regs);
@@ -489,10 +494,12 @@ asmlinkage unsigned int do_IRQ(struct pt_regs regs)
                 u32 *isp;
                 union irq_ctx * curctx;
                 union irq_ctx * irqctx;
-
+#ifdef CONFIG_IRQSTACKS
                 curctx = (union irq_ctx *) current_thread_info();
                 irqctx = hardirq_ctx[smp_processor_id()];
-
+#else
+               curctx = irqctx = (union irq_ctx *)0;
+#endif
                 spin_unlock(&desc->lock);
  
                 /*
@@ -536,7 +543,6 @@ asmlinkage unsigned int do_IRQ(struct pt_regs regs)
                         break;
                 desc->status &= ~IRQ_PENDING;
         }
-
         desc->status &= ~IRQ_INPROGRESS;
  
  out:
@@ -1095,6 +1101,7 @@ void init_irq_proc (void)
  }
  
  
+#ifdef CONFIG_IRQSTACKS
  /*
   * These should really be __section__(".bss.page_aligned") as well, but
   * gcc's 3.0 and earlier don't handle that correctly.
@@ -1174,3 +1181,4 @@ asmlinkage void do_softirq(void)
  }
  
  EXPORT_SYMBOL(do_softirq);
+#endif
diff --git a/arch/i386/kernel/machine_kexec.c b/arch/i386/kernel/machine_kexec.c

new file mode 100644 (file)

index 0000000..3a9e878
--- /dev/null
+++ b/arch/i386/kernel/machine_kexec.c
@@ -0,0 +1,208 @@
+/*
+ * machine_kexec.c - handle transition of Linux booting another kernel
+ * Copyright (C) 2002-2004 Eric Biederman  <ebiederm@xmission.com>
+ *
+ * This source code is licensed under the GNU General Public License,
+ * Version 2.  See the file COPYING for more details.
+ */
+
+#include <linux/mm.h>
+#include <linux/kexec.h>
+#include <linux/delay.h>
+#include <asm/pgtable.h>
+#include <asm/pgalloc.h>
+#include <asm/tlbflush.h>
+#include <asm/mmu_context.h>
+#include <asm/io.h>
+#include <asm/apic.h>
+#include <asm/cpufeature.h>
+
+/*
+ * Return the current value of the %cr3 control register.  The callers in
+ * this file pass the result to __va() to reach the active top-level page
+ * table.
+ */
+static inline unsigned long read_cr3(void)
+{
+       unsigned long cr3;
+       asm volatile("movl %%cr3,%0": "=r"(cr3));
+       return cr3;
+}
+
+#define PAGE_ALIGNED __attribute__ ((__aligned__(PAGE_SIZE)))
+
+#define L0_ATTR (_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | _PAGE_DIRTY)
+#define L1_ATTR (_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | _PAGE_DIRTY)
+#define L2_ATTR (_PAGE_PRESENT)
+
+#define LEVEL0_SIZE (1UL << 12UL)
+
+#ifndef CONFIG_X86_PAE
+#define LEVEL1_SIZE (1UL << 22UL)
+static u32 pgtable_level1[1024] PAGE_ALIGNED;
+
+/*
+ * identity_map_page - map one page so that virtual == physical (non-PAE).
+ * @address: physical address of the page to map
+ *
+ * Installs the mapping into the page directory currently pointed to by
+ * %cr3, using the single static pgtable_level1 table.  The directory slot
+ * covering @address is overwritten and its previous contents are not
+ * saved, which is acceptable only because the caller is about to leave
+ * this kernel.
+ */
+static void identity_map_page(unsigned long address)
+{
+       unsigned long level1_index, level2_index;
+       u32 *pgtable_level2;
+
+       /* Find the current page table */
+       pgtable_level2 = __va(read_cr3());
+
+       /* Find the indexes of the physical address to identity map */
+       level1_index = (address % LEVEL1_SIZE)/LEVEL0_SIZE;
+       level2_index = address / LEVEL1_SIZE;
+
+       /* Identity map the page table entry: first the page itself, then
+        * the directory entry that points at our static level-1 table.
+        */
+       pgtable_level1[level1_index] = address | L0_ATTR;
+       pgtable_level2[level2_index] = __pa(pgtable_level1) | L1_ATTR;
+
+       /* Flush the tlb so the new mapping takes effect.
+        * Global tlb entries are not flushed but that is not an issue.
+        */
+       load_cr3(pgtable_level2);
+}
+
+#else
+#define LEVEL1_SIZE (1UL << 21UL)
+#define LEVEL2_SIZE (1UL << 30UL)
+static u64 pgtable_level1[512] PAGE_ALIGNED;
+static u64 pgtable_level2[512] PAGE_ALIGNED;
+
+/*
+ * identity_map_page - map one page so that virtual == physical (PAE).
+ * @address: physical address of the page to map
+ *
+ * Same job as the non-PAE variant, but with three levels of 64-bit
+ * entries: the static pgtable_level1 and pgtable_level2 tables are hooked
+ * into the top-level table currently pointed to by %cr3.  The top-level
+ * slot covering @address is overwritten and its previous contents are not
+ * saved.
+ */
+static void identity_map_page(unsigned long address)
+{
+       unsigned long level1_index, level2_index, level3_index;
+       u64 *pgtable_level3;
+
+       /* Find the current page table */
+       pgtable_level3 = __va(read_cr3());
+
+       /* Find the indexes of the physical address to identity map */
+       level1_index = (address % LEVEL1_SIZE)/LEVEL0_SIZE;
+       level2_index = (address % LEVEL2_SIZE)/LEVEL1_SIZE;
+       level3_index = address / LEVEL2_SIZE;
+
+       /* Identity map the page table entry, lowest level first; the
+        * top-level entry is written with set_64bit() rather than a plain
+        * assignment.
+        */
+       pgtable_level1[level1_index] = address | L0_ATTR;
+       pgtable_level2[level2_index] = __pa(pgtable_level1) | L1_ATTR;
+       set_64bit(&pgtable_level3[level3_index], __pa(pgtable_level2) | L2_ATTR);
+
+       /* Flush the tlb so the new mapping takes effect.
+        * Global tlb entries are not flushed but that is not an issue.
+        */
+       load_cr3(pgtable_level3);
+}
+#endif
+
+
+/*
+ * set_idt - load the interrupt descriptor table register.
+ * @newidt: base address to place in the descriptor
+ * @limit:  value for the 16-bit limit field
+ *
+ * lidt READS a 6-byte pseudo-descriptor (16-bit limit followed by a
+ * 32-bit base) from memory.  The descriptor must therefore be an input
+ * operand of the asm; declaring it as an output would allow the compiler
+ * to discard the stores that fill it in.
+ */
+static void set_idt(void *newidt, __u16 limit)
+{
+       /* packed so that base immediately follows limit, as lidt expects */
+       struct {
+               __u16 limit;
+               __u32 base;
+       } __attribute__((packed)) curidt;
+
+       curidt.limit = limit;
+       curidt.base = (unsigned long)newidt;
+
+       __asm__ __volatile__ (
+               "lidt %0\n"
+               : /* no outputs */
+               : "m" (curidt)
+               );
+}
+
+
+/*
+ * set_gdt - load the global descriptor table register.
+ * @newgdt: base address to place in the descriptor
+ * @limit:  value for the 16-bit limit field
+ *
+ * lgdt READS a 6-byte pseudo-descriptor (16-bit limit followed by a
+ * 32-bit base) from memory.  The descriptor must therefore be an input
+ * operand of the asm; declaring it as an output would allow the compiler
+ * to discard the stores that fill it in.
+ */
+static void set_gdt(void *newgdt, __u16 limit)
+{
+       /* packed so that base immediately follows limit, as lgdt expects */
+       struct {
+               __u16 limit;
+               __u32 base;
+       } __attribute__((packed)) curgdt;
+
+       curgdt.limit = limit;
+       curgdt.base = (unsigned long)newgdt;
+
+       __asm__ __volatile__ (
+               "lgdt %0\n"
+               : /* no outputs */
+               : "m" (curgdt)
+               );
+}
+
+/*
+ * load_segments - reload %cs and every data segment register.
+ *
+ * A far jump to the next instruction reloads %cs with __KERNEL_CS; then
+ * %ds, %es, %fs, %gs and %ss are loaded with __KERNEL_DS.  The asm
+ * overwrites %eax, so it is listed as a clobber; the "memory" clobber
+ * stops the compiler from moving memory accesses across the reload.
+ */
+static void load_segments(void)
+{
+#define __STR(X) #X
+#define STR(X) __STR(X)
+
+       /* Extended asm (needed for the clobber list), hence the %% prefix
+        * on register names.
+        */
+       __asm__ __volatile__ (
+               "\tljmp $"STR(__KERNEL_CS)",$1f\n"
+               "\t1:\n"
+               "\tmovl $"STR(__KERNEL_DS)",%%eax\n"
+               "\tmovl %%eax,%%ds\n"
+               "\tmovl %%eax,%%es\n"
+               "\tmovl %%eax,%%fs\n"
+               "\tmovl %%eax,%%gs\n"
+               "\tmovl %%eax,%%ss\n"
+               : /* no outputs */
+               : /* no inputs */
+               : "eax", "memory"
+               );
+#undef STR
+#undef __STR
+}
+
+typedef asmlinkage void (*relocate_new_kernel_t)(
+       unsigned long indirection_page, unsigned long reboot_code_buffer,
+       unsigned long start_address, unsigned int has_pae);
+
+const extern unsigned char relocate_new_kernel[];
+extern void relocate_new_kernel_end(void);
+const extern unsigned int relocate_new_kernel_size;
+
+/*
+ * Do whatever setup is needed on the image and the
+ * reboot code buffer to allow us to avoid allocations
+ * later.  Currently nothing: @image is untouched and
+ * 0 is always returned.
+ */
+int machine_kexec_prepare(struct kimage *image)
+{
+       return 0;
+}
+
+/*
+ * Counterpart of machine_kexec_prepare().  Nothing is set up there,
+ * so there is currently nothing to undo here.
+ */
+void machine_kexec_cleanup(struct kimage *image)
+{
+}
+
+/*
+ * machine_kexec - hand control to the relocation stub for @image.
+ *
+ * Do not allocate memory (or fail in any way) in machine_kexec().
+ * We are past the point of no return, committed to rebooting now.
+ *
+ * Sequence: disable interrupts, identity map the control code page,
+ * copy relocate_new_kernel into it, reload the segment registers,
+ * load a GDT and IDT with limit 0, then call the copied code.
+ */
+void machine_kexec(struct kimage *image)
+{
+       unsigned long indirection_page;
+       unsigned long reboot_code_buffer;
+       relocate_new_kernel_t rnk;
+
+       /* Interrupts aren't acceptable while we reboot */
+       local_irq_disable();
+
+       /* Compute some offsets: the physical address of the control code
+        * page and the page-aligned part of image->head.
+        */
+       reboot_code_buffer = page_to_pfn(image->control_code_page) << PAGE_SHIFT;
+       indirection_page = image->head & PAGE_MASK;
+
+       /* Set up an identity mapping for the reboot_code_buffer */
+       identity_map_page(reboot_code_buffer);
+
+       /* copy it out (the destination is reached through the identity
+        * mapping established just above)
+        */
+       memcpy((void *)reboot_code_buffer, relocate_new_kernel, relocate_new_kernel_size);
+
+       /* The segment registers are funny things: they are
+        * automatically loaded from a table in memory whenever you
+        * set them to a specific selector, but this table is never
+        * accessed again until you set the segment to a different
+        * selector.
+        *
+        * The more common model is a cache, where the behind-the-scenes
+        * work is done but is also dropped at arbitrary times.
+        *
+        * I take advantage of this here by force loading the
+        * segments, before I zap the gdt with an invalid value.
+        */
+       load_segments();
+       /* The gdt & idt are now invalid.
+        * If you want to load them you must set up your own idt & gdt.
+        */
+       set_gdt(phys_to_virt(0),0);
+       set_idt(phys_to_virt(0),0);
+
+       /* now call it: jump into the copy made above, not the original */
+       rnk = (relocate_new_kernel_t) reboot_code_buffer;
+       (*rnk)(indirection_page, reboot_code_buffer, image->start, cpu_has_pae);
+}
diff --git a/arch/i386/kernel/process.c b/arch/i386/kernel/process.c

index 3093d1f..e8a01f2 100644 (file)
--- a/arch/i386/kernel/process.c
+++ b/arch/i386/kernel/process.c
@@ -219,6 +219,32 @@ static int __init idle_setup (char *str)
  
  __setup("idle=", idle_setup);
  
+/*
+ * stack_overflow - report that the kernel stack is running low.
+ *
+ * Called from the mcount hook in entry.S.  Prints the stack pointer, its
+ * offset within the stack area and a stack trace; panics when the offset
+ * is at or below STACK_PANIC.
+ */
+void stack_overflow(void)
+{
+       /* only needed by the (currently disabled) flag reset at the end */
+       extern unsigned long stack_overflowed;
+       unsigned long esp = current_stack_pointer();
+       /* offset of esp within the stack area, computed once and reused */
+       unsigned long offset = esp & (THREAD_SIZE-1);
+       int panicing = (offset <= STACK_PANIC);
+
+       oops_in_progress = 1;
+       /* STACK_PANIC is cast so that it matches the %lx conversion */
+       printk( "esp: 0x%lx masked: 0x%lx STACK_PANIC:0x%lx %d %d\n",
+               esp, offset, (unsigned long)STACK_PANIC,
+               (offset <= STACK_PANIC), panicing);
+       show_trace(current,(void*)esp);
+
+       if (panicing)
+               panic("stack overflow\n");
+
+       oops_in_progress = 0;
+
+       /* Just let it happen once per task, as otherwise it goes nuts
+        * in printing stack traces.  This means that I need to dump
+        * the stack_overflowed boolean into the task or thread_info
+        * structure.  For now just turn it off all together.
+        */
+
+       /* stack_overflowed = 0; */
+}
+
  void show_regs(struct pt_regs * regs)
  {
         unsigned long cr0 = 0L, cr2 = 0L, cr3 = 0L, cr4 = 0L;
diff --git a/arch/i386/kernel/reboot.c b/arch/i386/kernel/reboot.c

index e8d5cd3..85e89f9 100644 (file)
--- a/arch/i386/kernel/reboot.c
+++ b/arch/i386/kernel/reboot.c
@@ -23,7 +23,6 @@ static int reboot_mode;
  int reboot_thru_bios;
  
  #ifdef CONFIG_SMP
-int reboot_smp = 0;
  static int reboot_cpu = -1;
  /* shamelessly grabbed from lib/vsprintf.c for readability */
  #define is_digit(c)    ((c) >= '0' && (c) <= '9')
@@ -85,33 +84,9 @@ static int __init set_bios_reboot(struct dmi_system_id *d)
         return 0;
  }
  
-/*
- * Some machines require the "reboot=s"  commandline option, this quirk makes that automatic.
- */
-static int __init set_smp_reboot(struct dmi_system_id *d)
-{
-#ifdef CONFIG_SMP
-       if (!reboot_smp) {
-               reboot_smp = 1;
-               printk(KERN_INFO "%s series board detected. Selecting SMP-method for reboots.\n", d->ident);
-       }
-#endif
-       return 0;
-}
-
-/*
- * Some machines require the "reboot=b,s"  commandline option, this quirk makes that automatic.
- */
-static int __init set_smp_bios_reboot(struct dmi_system_id *d)
-{
-       set_smp_reboot(d);
-       set_bios_reboot(d);
-       return 0;
-}
-
  static struct dmi_system_id __initdata reboot_dmi_table[] = {
         {       /* Handle problems with rebooting on Dell 1300's */
-               .callback = set_smp_bios_reboot,
+               .callback = set_bios_reboot,
                 .ident = "Dell PowerEdge 1300",
                 .matches = {
                         DMI_MATCH(DMI_SYS_VENDOR, "Dell Computer Corporation"),
@@ -294,41 +269,32 @@ void machine_real_restart(unsigned char *code, int length)
                                 : "i" ((void *) (0x1000 - sizeof (real_mode_switch) - 100)));
  }
  
-void machine_restart(char * __unused)
+void machine_shutdown(void)
  {
  #ifdef CONFIG_SMP
-       int cpuid;
-       
-       cpuid = GET_APIC_ID(apic_read(APIC_ID));
-
-       if (reboot_smp) {
-
-               /* check to see if reboot_cpu is valid 
-                  if its not, default to the BSP */
-               if ((reboot_cpu == -1) ||  
-                     (reboot_cpu > (NR_CPUS -1))  || 
-                     !physid_isset(cpuid, phys_cpu_present_map))
-                       reboot_cpu = boot_cpu_physical_apicid;
-
-               reboot_smp = 0;  /* use this as a flag to only go through this once*/
-               /* re-run this function on the other CPUs
-                  it will fall though this section since we have 
-                  cleared reboot_smp, and do the reboot if it is the
-                  correct CPU, otherwise it halts. */
-               if (reboot_cpu != cpuid)
-                       smp_call_function((void *)machine_restart , NULL, 1, 0);
+        int reboot_cpu_id;
+
+        /* The boot cpu is always logical cpu 0 */
+        reboot_cpu_id = 0;
+
+        /* See if there has been given a command line override */
+       if ((reboot_cpu_id != -1) && (reboot_cpu < NR_CPUS) &&
+               cpu_isset(reboot_cpu, cpu_online_map)) {
+                reboot_cpu_id = reboot_cpu;
         }
  
-       /* if reboot_cpu is still -1, then we want a tradional reboot, 
-          and if we are not running on the reboot_cpu,, halt */
-       if ((reboot_cpu != -1) && (cpuid != reboot_cpu)) {
-               for (;;)
-               __asm__ __volatile__ ("hlt");
+       /* Make certain the cpu I'm rebooting on is online */
+        if (!cpu_isset(reboot_cpu_id, cpu_online_map)) {
+                reboot_cpu_id = smp_processor_id();
         }
-       /*
-        * Stop all CPUs and turn off local APICs and the IO-APIC, so
-        * other OSs see a clean IRQ state.
+
+        /* Make certain I only run on the appropriate processor */
+        set_cpus_allowed(current, cpumask_of_cpu(reboot_cpu_id));
+
+        /* O.K. Now that I'm on the appropriate processor, stop
+         * all of the others, and disable their local APICs.
          */
+
         if (!netdump_mode)
                 smp_send_stop();
  #elif defined(CONFIG_X86_LOCAL_APIC)
@@ -341,6 +307,11 @@ void machine_restart(char * __unused)
  #ifdef CONFIG_X86_IO_APIC
         disable_IO_APIC();
  #endif
+}
+
+void machine_restart(char * __unused)
+{
+        machine_shutdown();
  
         if (!reboot_thru_bios) {
                 if (efi_enabled) {
diff --git a/arch/i386/kernel/relocate_kernel.S b/arch/i386/kernel/relocate_kernel.S

new file mode 100644 (file)

index 0000000..54be4c2
--- /dev/null
+++ b/arch/i386/kernel/relocate_kernel.S
@@ -0,0 +1,118 @@
+/*
+ * relocate_kernel.S - put the kernel image in place to boot
+ * Copyright (C) 2002-2004 Eric Biederman  <ebiederm@xmission.com>
+ *
+ * This source code is licensed under the GNU General Public License,
+ * Version 2.  See the file COPYING for more details.
+ */
+
+#include <linux/linkage.h>
+
+       /*
+        * Must be relocatable PIC code callable as a C function that, once
+        * it starts, cannot use the previous process's stack.
+        */
+       .globl relocate_new_kernel
+relocate_new_kernel:
+       /* read the arguments and say goodbye to the stack */
+       movl  4(%esp), %ebx /* indirection_page */
+       movl  8(%esp), %ebp /* reboot_code_buffer */
+       movl  12(%esp), %edx /* start address */
+       movl  16(%esp), %ecx /* cpu_has_pae */
+
+       /* zero out flags, and disable interrupts */
+       pushl $0
+       popfl
+
+       /* set a new stack at the bottom of our page... */
+       lea   4096(%ebp), %esp
+
+       /* store the parameters back on the stack */
+       pushl   %edx /* store the start address */
+
+       /* Set cr0 to a known state:
+        * 31 0 == Paging disabled
+        * 18 0 == Alignment check disabled
+        * 16 0 == Write protect disabled
+        * 3  0 == No task switch
+        * 2  0 == Don't do FP software emulation.
+        * 0  1 == Protected mode enabled
+        */
+       movl    %cr0, %eax
+       andl    $~((1<<31)|(1<<18)|(1<<16)|(1<<3)|(1<<2)), %eax
+       orl     $(1<<0), %eax
+       movl    %eax, %cr0
+
+       /* clear cr4 if applicable */
+       testl   %ecx, %ecx
+       jz      1f
+       /* Set cr4 to a known state:
+        * Setting everything to zero seems safe.
+        */
+       movl    %cr4, %eax
+       andl    $0, %eax
+       movl    %eax, %cr4
+
+       jmp 1f
+1:
+
+       /* Flush the TLB (needed?) */
+       xorl    %eax, %eax
+       movl    %eax, %cr3
+
+       /* Do the copies */
+       cld
+0:     /* top, read another word from the indirection page */
+       movl    %ebx, %ecx
+       movl    (%ebx), %ecx
+       addl    $4, %ebx
+       testl   $0x1,   %ecx  /* is it a destination page */
+       jz      1f
+       movl    %ecx,   %edi
+       andl    $0xfffff000, %edi
+       jmp     0b
+1:
+       testl   $0x2,   %ecx  /* is it an indirection page */
+       jz      1f
+       movl    %ecx,   %ebx
+       andl    $0xfffff000, %ebx
+       jmp     0b
+1:
+       testl   $0x4,   %ecx /* is it the done indicator */
+       jz      1f
+       jmp     2f
+1:
+       testl   $0x8,   %ecx /* is it the source indicator */
+       jz      0b           /* Ignore it otherwise */
+       movl    %ecx,   %esi /* For every source page do a copy */
+       andl    $0xfffff000, %esi
+
+       movl    $1024, %ecx
+       rep ; movsl
+       jmp     0b
+
+2:
+
+       /* To be certain of avoiding problems with self-modifying code
+        * I need to execute a serializing instruction here.
+        * So I flush the TLB, it's handy, and not processor dependent.
+        */
+       xorl    %eax, %eax
+       movl    %eax, %cr3
+
+       /* set all of the registers to known values */
+       /* leave %esp alone */
+
+       xorl    %eax, %eax
+       xorl    %ebx, %ebx
+       xorl    %ecx, %ecx
+       xorl    %edx, %edx
+       xorl    %esi, %esi
+       xorl    %edi, %edi
+       xorl    %ebp, %ebp
+       ret
+relocate_new_kernel_end:
+
+       .globl relocate_new_kernel_size
+relocate_new_kernel_size:
+       .long relocate_new_kernel_end - relocate_new_kernel
diff --git a/configs/kernel-2.6.8-i686-planetlab.config b/configs/kernel-2.6.8-i686-planetlab.config

index ea66387..ffa265f 100644 (file)
--- a/configs/kernel-2.6.8-i686-planetlab.config
+++ b/configs/kernel-2.6.8-i686-planetlab.config
@@ -30,8 +30,9 @@ CONFIG_RCFS_FS=y
  CONFIG_CKRM_TYPE_TASKCLASS=y
  CONFIG_CKRM_RES_NUMTASKS=y
  CONFIG_CKRM_CPU_SCHEDULE=y
-CONFIG_CKRM_RES_BLKIO=y
+# CONFIG_CKRM_RES_BLKIO is not set
  # CONFIG_CKRM_RES_MEM is not set
+# CONFIG_CKRM_CPU_SCHEDULE_AT_BOOT is not set
  # CONFIG_CKRM_TYPE_SOCKETCLASS is not set
  CONFIG_CKRM_RBCE=y
  CONFIG_SYSCTL=y
@@ -140,6 +141,12 @@ CONFIG_HIGHPTE=y
  # CONFIG_MATH_EMULATION is not set
  CONFIG_MTRR=y
  CONFIG_REGPARM=y
+CONFIG_IRQSTACKS=y
+CONFIG_STACK_SIZE_SHIFT=13
+CONFIG_STACK_WARN=4000
+CONFIG_X86_STACK_CHECK=y
+CONFIG_STACK_PANIC=512
+CONFIG_KEXEC=y
  
  #
  # Power management options (ACPI, APM)
@@ -211,7 +218,7 @@ CONFIG_PREVENT_FIRMWARE_BUILD=y
  #
  # Block devices
  #
-# CONFIG_BLK_DEV_FD is not set
+CONFIG_BLK_DEV_FD=m
  # CONFIG_BLK_DEV_XD is not set
  CONFIG_BLK_CPQ_DA=m
  CONFIG_BLK_CPQ_CISS_DA=m
@@ -540,6 +547,11 @@ CONFIG_IP_NF_TARGET_NOTRACK=m
  CONFIG_IP_NF_RAW=m
  CONFIG_IP_NF_MATCH_ADDRTYPE=m
  CONFIG_IP_NF_MATCH_REALM=m
+# CONFIG_IP_NF_CT_ACCT is not set
+CONFIG_IP_NF_CT_PROTO_GRE=m
+CONFIG_IP_NF_PPTP=m
+CONFIG_IP_NF_NAT_PPTP=m
+CONFIG_IP_NF_NAT_PROTO_GRE=m
  
  #
  # SCTP Configuration (EXPERIMENTAL)
diff --git a/drivers/block/cfq-iosched-orig.c b/drivers/block/cfq-iosched-orig.c

deleted file mode 100644 (file)

index 977d32d..0000000
--- a/drivers/block/cfq-iosched-orig.c
+++ /dev/null
@@ -1,706 +0,0 @@
-/*
- *  linux/drivers/block/cfq-iosched.c
- *
- *  CFQ, or complete fairness queueing, disk scheduler.
- *
- *  Based on ideas from a previously unfinished io
- *  scheduler (round robin per-process disk scheduling) and Andrea Arcangeli.
- *
- *  Copyright (C) 2003 Jens Axboe <axboe@suse.de>
- */
-#include <linux/kernel.h>
-#include <linux/fs.h>
-#include <linux/blkdev.h>
-#include <linux/elevator.h>
-#include <linux/bio.h>
-#include <linux/config.h>
-#include <linux/module.h>
-#include <linux/slab.h>
-#include <linux/init.h>
-#include <linux/compiler.h>
-#include <linux/hash.h>
-#include <linux/rbtree.h>
-#include <linux/mempool.h>
-
-/*
- * tunables
- */
-static int cfq_quantum = 4;
-static int cfq_queued = 8;
-
-#define CFQ_QHASH_SHIFT                6
-#define CFQ_QHASH_ENTRIES      (1 << CFQ_QHASH_SHIFT)
-#define list_entry_qhash(entry)        list_entry((entry), struct cfq_queue, cfq_hash)
-
-#define CFQ_MHASH_SHIFT                8
-#define CFQ_MHASH_BLOCK(sec)   ((sec) >> 3)
-#define CFQ_MHASH_ENTRIES      (1 << CFQ_MHASH_SHIFT)
-#define CFQ_MHASH_FN(sec)      (hash_long(CFQ_MHASH_BLOCK((sec)),CFQ_MHASH_SHIFT))
-#define ON_MHASH(crq)          !list_empty(&(crq)->hash)
-#define rq_hash_key(rq)                ((rq)->sector + (rq)->nr_sectors)
-#define list_entry_hash(ptr)   list_entry((ptr), struct cfq_rq, hash)
-
-#define list_entry_cfqq(ptr)   list_entry((ptr), struct cfq_queue, cfq_list)
-
-#define RQ_DATA(rq)            ((struct cfq_rq *) (rq)->elevator_private)
-
-static kmem_cache_t *crq_pool;
-static kmem_cache_t *cfq_pool;
-static mempool_t *cfq_mpool;
-
-struct cfq_data {
-       struct list_head rr_list;
-       struct list_head *dispatch;
-       struct list_head *cfq_hash;
-
-       struct list_head *crq_hash;
-
-       unsigned int busy_queues;
-       unsigned int max_queued;
-
-       mempool_t *crq_pool;
-};
-
-struct cfq_queue {
-       struct list_head cfq_hash;
-       struct list_head cfq_list;
-       struct rb_root sort_list;
-       int pid;
-       int queued[2];
-#if 0
-       /*
-        * with a simple addition like this, we can do io priorities. almost.
-        * does need a split request free list, too.
-        */
-       int io_prio
-#endif
-};
-
-struct cfq_rq {
-       struct rb_node rb_node;
-       sector_t rb_key;
-
-       struct request *request;
-
-       struct cfq_queue *cfq_queue;
-
-       struct list_head hash;
-};
-
-static void cfq_put_queue(struct cfq_data *cfqd, struct cfq_queue *cfqq);
-static struct cfq_queue *cfq_find_cfq_hash(struct cfq_data *cfqd, int pid);
-static void cfq_dispatch_sort(struct list_head *head, struct cfq_rq *crq);
-
-/*
- * lots of deadline iosched dupes, can be abstracted later...
- */
-static inline void __cfq_del_crq_hash(struct cfq_rq *crq)
-{
-       list_del_init(&crq->hash);
-}
-
-static inline void cfq_del_crq_hash(struct cfq_rq *crq)
-{
-       if (ON_MHASH(crq))
-               __cfq_del_crq_hash(crq);
-}
-
-static void cfq_remove_merge_hints(request_queue_t *q, struct cfq_rq *crq)
-{
-       cfq_del_crq_hash(crq);
-
-       if (q->last_merge == crq->request)
-               q->last_merge = NULL;
-}
-
-static inline void cfq_add_crq_hash(struct cfq_data *cfqd, struct cfq_rq *crq)
-{
-       struct request *rq = crq->request;
-
-       BUG_ON(ON_MHASH(crq));
-
-       list_add(&crq->hash, &cfqd->crq_hash[CFQ_MHASH_FN(rq_hash_key(rq))]);
-}
-
-static struct request *cfq_find_rq_hash(struct cfq_data *cfqd, sector_t offset)
-{
-       struct list_head *hash_list = &cfqd->crq_hash[CFQ_MHASH_FN(offset)];
-       struct list_head *entry, *next = hash_list->next;
-
-       while ((entry = next) != hash_list) {
-               struct cfq_rq *crq = list_entry_hash(entry);
-               struct request *__rq = crq->request;
-
-               next = entry->next;
-
-               BUG_ON(!ON_MHASH(crq));
-
-               if (!rq_mergeable(__rq)) {
-                       __cfq_del_crq_hash(crq);
-                       continue;
-               }
-
-               if (rq_hash_key(__rq) == offset)
-                       return __rq;
-       }
-
-       return NULL;
-}
-
-/*
- * rb tree support functions
- */
-#define RB_NONE                (2)
-#define RB_EMPTY(node) ((node)->rb_node == NULL)
-#define RB_CLEAR(node) ((node)->rb_color = RB_NONE)
-#define RB_CLEAR_ROOT(root)    ((root)->rb_node = NULL)
-#define ON_RB(node)    ((node)->rb_color != RB_NONE)
-#define rb_entry_crq(node)     rb_entry((node), struct cfq_rq, rb_node)
-#define rq_rb_key(rq)          (rq)->sector
-
-static inline void cfq_del_crq_rb(struct cfq_queue *cfqq, struct cfq_rq *crq)
-{
-       if (ON_RB(&crq->rb_node)) {
-               cfqq->queued[rq_data_dir(crq->request)]--;
-               rb_erase(&crq->rb_node, &cfqq->sort_list);
-               crq->cfq_queue = NULL;
-       }
-}
-
-static struct cfq_rq *
-__cfq_add_crq_rb(struct cfq_queue *cfqq, struct cfq_rq *crq)
-{
-       struct rb_node **p = &cfqq->sort_list.rb_node;
-       struct rb_node *parent = NULL;
-       struct cfq_rq *__crq;
-
-       while (*p) {
-               parent = *p;
-               __crq = rb_entry_crq(parent);
-
-               if (crq->rb_key < __crq->rb_key)
-                       p = &(*p)->rb_left;
-               else if (crq->rb_key > __crq->rb_key)
-                       p = &(*p)->rb_right;
-               else
-                       return __crq;
-       }
-
-       rb_link_node(&crq->rb_node, parent, p);
-       return 0;
-}
-
-static void
-cfq_add_crq_rb(struct cfq_data *cfqd, struct cfq_queue *cfqq,struct cfq_rq *crq)
-{
-       struct request *rq = crq->request;
-       struct cfq_rq *__alias;
-
-       crq->rb_key = rq_rb_key(rq);
-       cfqq->queued[rq_data_dir(rq)]++;
-retry:
-       __alias = __cfq_add_crq_rb(cfqq, crq);
-       if (!__alias) {
-               rb_insert_color(&crq->rb_node, &cfqq->sort_list);
-               crq->cfq_queue = cfqq;
-               return;
-       }
-
-       cfq_del_crq_rb(cfqq, __alias);
-       cfq_dispatch_sort(cfqd->dispatch, __alias);
-       goto retry;
-}
-
-static struct request *
-cfq_find_rq_rb(struct cfq_data *cfqd, sector_t sector)
-{
-       struct cfq_queue *cfqq = cfq_find_cfq_hash(cfqd, current->tgid);
-       struct rb_node *n;
-
-       if (!cfqq)
-               goto out;
-
-       n = cfqq->sort_list.rb_node;
-       while (n) {
-               struct cfq_rq *crq = rb_entry_crq(n);
-
-               if (sector < crq->rb_key)
-                       n = n->rb_left;
-               else if (sector > crq->rb_key)
-                       n = n->rb_right;
-               else
-                       return crq->request;
-       }
-
-out:
-       return NULL;
-}
-
-static void cfq_remove_request(request_queue_t *q, struct request *rq)
-{
-       struct cfq_data *cfqd = q->elevator.elevator_data;
-       struct cfq_rq *crq = RQ_DATA(rq);
-
-       if (crq) {
-               struct cfq_queue *cfqq = crq->cfq_queue;
-
-               cfq_remove_merge_hints(q, crq);
-               list_del_init(&rq->queuelist);
-
-               if (cfqq) {
-                       cfq_del_crq_rb(cfqq, crq);
-
-                       if (RB_EMPTY(&cfqq->sort_list))
-                               cfq_put_queue(cfqd, cfqq);
-               }
-       }
-}
-
-static int
-cfq_merge(request_queue_t *q, struct request **req, struct bio *bio)
-{
-       struct cfq_data *cfqd = q->elevator.elevator_data;
-       struct request *__rq;
-       int ret;
-
-       ret = elv_try_last_merge(q, bio);
-       if (ret != ELEVATOR_NO_MERGE) {
-               __rq = q->last_merge;
-               goto out_insert;
-       }
-
-       __rq = cfq_find_rq_hash(cfqd, bio->bi_sector);
-       if (__rq) {
-               BUG_ON(__rq->sector + __rq->nr_sectors != bio->bi_sector);
-
-               if (elv_rq_merge_ok(__rq, bio)) {
-                       ret = ELEVATOR_BACK_MERGE;
-                       goto out;
-               }
-       }
-
-       __rq = cfq_find_rq_rb(cfqd, bio->bi_sector + bio_sectors(bio));
-       if (__rq) {
-               if (elv_rq_merge_ok(__rq, bio)) {
-                       ret = ELEVATOR_FRONT_MERGE;
-                       goto out;
-               }
-       }
-
-       return ELEVATOR_NO_MERGE;
-out:
-       q->last_merge = __rq;
-out_insert:
-       *req = __rq;
-       return ret;
-}
-
-static void cfq_merged_request(request_queue_t *q, struct request *req)
-{
-       struct cfq_data *cfqd = q->elevator.elevator_data;
-       struct cfq_rq *crq = RQ_DATA(req);
-
-       cfq_del_crq_hash(crq);
-       cfq_add_crq_hash(cfqd, crq);
-
-       if (ON_RB(&crq->rb_node) && (rq_rb_key(req) != crq->rb_key)) {
-               struct cfq_queue *cfqq = crq->cfq_queue;
-
-               cfq_del_crq_rb(cfqq, crq);
-               cfq_add_crq_rb(cfqd, cfqq, crq);
-       }
-
-       q->last_merge = req;
-}
-
-static void
-cfq_merged_requests(request_queue_t *q, struct request *req,
-                   struct request *next)
-{
-       cfq_merged_request(q, req);
-       cfq_remove_request(q, next);
-}
-
-static void cfq_dispatch_sort(struct list_head *head, struct cfq_rq *crq)
-{
-       struct list_head *entry = head;
-       struct request *__rq;
-
-       if (!list_empty(head)) {
-               __rq = list_entry_rq(head->next);
-
-               if (crq->request->sector < __rq->sector) {
-                       entry = head->prev;
-                       goto link;
-               }
-       }
-
-       while ((entry = entry->prev) != head) {
-               __rq = list_entry_rq(entry);
-
-               if (crq->request->sector <= __rq->sector)
-                       break;
-       }
-
-link:
-       list_add_tail(&crq->request->queuelist, entry);
-}
-
-static inline void
-__cfq_dispatch_requests(request_queue_t *q, struct cfq_data *cfqd,
-                       struct cfq_queue *cfqq)
-{
-       struct cfq_rq *crq = rb_entry_crq(rb_first(&cfqq->sort_list));
-
-       cfq_del_crq_rb(cfqq, crq);
-       cfq_remove_merge_hints(q, crq);
-       cfq_dispatch_sort(cfqd->dispatch, crq);
-}
-
-static int cfq_dispatch_requests(request_queue_t *q, struct cfq_data *cfqd)
-{
-       struct cfq_queue *cfqq;
-       struct list_head *entry, *tmp;
-       int ret, queued, good_queues;
-
-       if (list_empty(&cfqd->rr_list))
-               return 0;
-
-       queued = ret = 0;
-restart:
-       good_queues = 0;
-       list_for_each_safe(entry, tmp, &cfqd->rr_list) {
-               cfqq = list_entry_cfqq(cfqd->rr_list.next);
-
-               BUG_ON(RB_EMPTY(&cfqq->sort_list));
-
-               __cfq_dispatch_requests(q, cfqd, cfqq);
-
-               if (RB_EMPTY(&cfqq->sort_list))
-                       cfq_put_queue(cfqd, cfqq);
-               else
-                       good_queues++;
-
-               queued++;
-               ret = 1;
-       }
-
-       if ((queued < cfq_quantum) && good_queues)
-               goto restart;
-
-       return ret;
-}
-
-static struct request *cfq_next_request(request_queue_t *q)
-{
-       struct cfq_data *cfqd = q->elevator.elevator_data;
-       struct request *rq;
-
-       if (!list_empty(cfqd->dispatch)) {
-               struct cfq_rq *crq;
-dispatch:
-               rq = list_entry_rq(cfqd->dispatch->next);
-
-               crq = RQ_DATA(rq);
-               if (crq)
-                       cfq_remove_merge_hints(q, crq);
-
-               return rq;
-       }
-
-       if (cfq_dispatch_requests(q, cfqd))
-               goto dispatch;
-
-       return NULL;
-}
-
-static inline struct cfq_queue *
-__cfq_find_cfq_hash(struct cfq_data *cfqd, int pid, const int hashval)
-{
-       struct list_head *hash_list = &cfqd->cfq_hash[hashval];
-       struct list_head *entry;
-
-       list_for_each(entry, hash_list) {
-               struct cfq_queue *__cfqq = list_entry_qhash(entry);
-
-               if (__cfqq->pid == pid)
-                       return __cfqq;
-       }
-
-       return NULL;
-}
-
-static struct cfq_queue *cfq_find_cfq_hash(struct cfq_data *cfqd, int pid)
-{
-       const int hashval = hash_long(current->tgid, CFQ_QHASH_SHIFT);
-
-       return __cfq_find_cfq_hash(cfqd, pid, hashval);
-}
-
-static void cfq_put_queue(struct cfq_data *cfqd, struct cfq_queue *cfqq)
-{
-       cfqd->busy_queues--;
-       list_del(&cfqq->cfq_list);
-       list_del(&cfqq->cfq_hash);
-       mempool_free(cfqq, cfq_mpool);
-}
-
-static struct cfq_queue *cfq_get_queue(struct cfq_data *cfqd, int pid)
-{
-       const int hashval = hash_long(current->tgid, CFQ_QHASH_SHIFT);
-       struct cfq_queue *cfqq = __cfq_find_cfq_hash(cfqd, pid, hashval);
-
-       if (!cfqq) {
-               cfqq = mempool_alloc(cfq_mpool, GFP_NOIO);
-
-               INIT_LIST_HEAD(&cfqq->cfq_hash);
-               INIT_LIST_HEAD(&cfqq->cfq_list);
-               RB_CLEAR_ROOT(&cfqq->sort_list);
-
-               cfqq->pid = pid;
-               cfqq->queued[0] = cfqq->queued[1] = 0;
-               list_add(&cfqq->cfq_hash, &cfqd->cfq_hash[hashval]);
-       }
-
-       return cfqq;
-}
-
-static void cfq_enqueue(struct cfq_data *cfqd, struct cfq_rq *crq)
-{
-       struct cfq_queue *cfqq;
-
-       cfqq = cfq_get_queue(cfqd, current->tgid);
-
-       cfq_add_crq_rb(cfqd, cfqq, crq);
-
-       if (list_empty(&cfqq->cfq_list)) {
-               list_add(&cfqq->cfq_list, &cfqd->rr_list);
-               cfqd->busy_queues++;
-       }
-}
-
-static void
-cfq_insert_request(request_queue_t *q, struct request *rq, int where)
-{
-       struct cfq_data *cfqd = q->elevator.elevator_data;
-       struct cfq_rq *crq = RQ_DATA(rq);
-
-       switch (where) {
-               case ELEVATOR_INSERT_BACK:
-                       while (cfq_dispatch_requests(q, cfqd))
-                               ;
-                       list_add_tail(&rq->queuelist, cfqd->dispatch);
-                       break;
-               case ELEVATOR_INSERT_FRONT:
-                       list_add(&rq->queuelist, cfqd->dispatch);
-                       break;
-               case ELEVATOR_INSERT_SORT:
-                       BUG_ON(!blk_fs_request(rq));
-                       cfq_enqueue(cfqd, crq);
-                       break;
-               default:
-                       printk("%s: bad insert point %d\n", __FUNCTION__,where);
-                       return;
-       }
-
-       if (rq_mergeable(rq)) {
-               cfq_add_crq_hash(cfqd, crq);
-
-               if (!q->last_merge)
-                       q->last_merge = rq;
-       }
-}
-
-static int cfq_queue_empty(request_queue_t *q)
-{
-       struct cfq_data *cfqd = q->elevator.elevator_data;
-
-       if (list_empty(cfqd->dispatch) && list_empty(&cfqd->rr_list))
-               return 1;
-
-       return 0;
-}
-
-static struct request *
-cfq_former_request(request_queue_t *q, struct request *rq)
-{
-       struct cfq_rq *crq = RQ_DATA(rq);
-       struct rb_node *rbprev = rb_prev(&crq->rb_node);
-
-       if (rbprev)
-               return rb_entry_crq(rbprev)->request;
-
-       return NULL;
-}
-
-static struct request *
-cfq_latter_request(request_queue_t *q, struct request *rq)
-{
-       struct cfq_rq *crq = RQ_DATA(rq);
-       struct rb_node *rbnext = rb_next(&crq->rb_node);
-
-       if (rbnext)
-               return rb_entry_crq(rbnext)->request;
-
-       return NULL;
-}
-
-static int cfq_may_queue(request_queue_t *q, int rw)
-{
-       struct cfq_data *cfqd = q->elevator.elevator_data;
-       struct cfq_queue *cfqq;
-       int ret = 1;
-
-       if (!cfqd->busy_queues)
-               goto out;
-
-       cfqq = cfq_find_cfq_hash(cfqd, current->tgid);
-       if (cfqq) {
-               int limit = (q->nr_requests - cfq_queued) / cfqd->busy_queues;
-
-               if (limit < 3)
-                       limit = 3;
-               else if (limit > cfqd->max_queued)
-                       limit = cfqd->max_queued;
-
-               if (cfqq->queued[rw] > limit)
-                       ret = 0;
-       }
-out:
-       return ret;
-}
-
-static void cfq_put_request(request_queue_t *q, struct request *rq)
-{
-       struct cfq_data *cfqd = q->elevator.elevator_data;
-       struct cfq_rq *crq = RQ_DATA(rq);
-
-       if (crq) {
-               BUG_ON(q->last_merge == rq);
-               BUG_ON(ON_MHASH(crq));
-
-               mempool_free(crq, cfqd->crq_pool);
-               rq->elevator_private = NULL;
-       }
-}
-
-static int cfq_set_request(request_queue_t *q, struct request *rq, int gfp_mask)
-{
-       struct cfq_data *cfqd = q->elevator.elevator_data;
-       struct cfq_rq *crq = mempool_alloc(cfqd->crq_pool, gfp_mask);
-
-       if (crq) {
-               RB_CLEAR(&crq->rb_node);
-               crq->request = rq;
-               crq->cfq_queue = NULL;
-               INIT_LIST_HEAD(&crq->hash);
-               rq->elevator_private = crq;
-               return 0;
-       }
-
-       return 1;
-}
-
-static void cfq_exit(request_queue_t *q, elevator_t *e)
-{
-       struct cfq_data *cfqd = e->elevator_data;
-
-       e->elevator_data = NULL;
-       mempool_destroy(cfqd->crq_pool);
-       kfree(cfqd->crq_hash);
-       kfree(cfqd->cfq_hash);
-       kfree(cfqd);
-}
-
-static int cfq_init(request_queue_t *q, elevator_t *e)
-{
-       struct cfq_data *cfqd;
-       int i;
-
-       cfqd = kmalloc(sizeof(*cfqd), GFP_KERNEL);
-       if (!cfqd)
-               return -ENOMEM;
-
-       memset(cfqd, 0, sizeof(*cfqd));
-       INIT_LIST_HEAD(&cfqd->rr_list);
-
-       cfqd->crq_hash = kmalloc(sizeof(struct list_head) * CFQ_MHASH_ENTRIES, GFP_KERNEL);
-       if (!cfqd->crq_hash)
-               goto out_crqhash;
-
-       cfqd->cfq_hash = kmalloc(sizeof(struct list_head) * CFQ_QHASH_ENTRIES, GFP_KERNEL);
-       if (!cfqd->cfq_hash)
-               goto out_cfqhash;
-
-       cfqd->crq_pool = mempool_create(BLKDEV_MIN_RQ, mempool_alloc_slab, mempool_free_slab, crq_pool);
-       if (!cfqd->crq_pool)
-               goto out_crqpool;
-
-       for (i = 0; i < CFQ_MHASH_ENTRIES; i++)
-               INIT_LIST_HEAD(&cfqd->crq_hash[i]);
-       for (i = 0; i < CFQ_QHASH_ENTRIES; i++)
-               INIT_LIST_HEAD(&cfqd->cfq_hash[i]);
-
-       cfqd->dispatch = &q->queue_head;
-       e->elevator_data = cfqd;
-
-       /*
-        * just set it to some high value, we want anyone to be able to queue
-        * some requests. fairness is handled differently
-        */
-       cfqd->max_queued = q->nr_requests;
-       q->nr_requests = 8192;
-
-       return 0;
-out_crqpool:
-       kfree(cfqd->cfq_hash);
-out_cfqhash:
-       kfree(cfqd->crq_hash);
-out_crqhash:
-       kfree(cfqd);
-       return -ENOMEM;
-}
-
-static int __init cfq_slab_setup(void)
-{
-       crq_pool = kmem_cache_create("crq_pool", sizeof(struct cfq_rq), 0, 0,
-                                       NULL, NULL);
-
-       if (!crq_pool)
-               panic("cfq_iosched: can't init crq pool\n");
-
-       cfq_pool = kmem_cache_create("cfq_pool", sizeof(struct cfq_queue), 0, 0,
-                                       NULL, NULL);
-
-       if (!cfq_pool)
-               panic("cfq_iosched: can't init cfq pool\n");
-
-       cfq_mpool = mempool_create(64, mempool_alloc_slab, mempool_free_slab, cfq_pool);
-
-       if (!cfq_mpool)
-               panic("cfq_iosched: can't init cfq mpool\n");
-
-       return 0;
-}
-
-subsys_initcall(cfq_slab_setup);
-
-elevator_t iosched_cfq = {
-       .elevator_name =                "cfq",
-       .elevator_merge_fn =            cfq_merge,
-       .elevator_merged_fn =           cfq_merged_request,
-       .elevator_merge_req_fn =        cfq_merged_requests,
-       .elevator_next_req_fn =         cfq_next_request,
-       .elevator_add_req_fn =          cfq_insert_request,
-       .elevator_remove_req_fn =       cfq_remove_request,
-       .elevator_queue_empty_fn =      cfq_queue_empty,
-       .elevator_former_req_fn =       cfq_former_request,
-       .elevator_latter_req_fn =       cfq_latter_request,
-       .elevator_set_req_fn =          cfq_set_request,
-       .elevator_put_req_fn =          cfq_put_request,
-       .elevator_may_queue_fn =        cfq_may_queue,
-       .elevator_init_fn =             cfq_init,
-       .elevator_exit_fn =             cfq_exit,
-};
-
-EXPORT_SYMBOL(iosched_cfq);
diff --git a/drivers/block/cfq-iosched.c b/drivers/block/cfq-iosched.c

index 7b45a80..70d66c5 100644 (file)
--- a/drivers/block/cfq-iosched.c
+++ b/drivers/block/cfq-iosched.c
@@ -39,8 +39,6 @@
  #error Cannot support this many io priority levels
  #endif
  
-#define LIMIT_DEBUG   1
-
  /*
   * tunables
   */
@@ -52,6 +50,10 @@ static int cfq_queued = 4;
  static int cfq_grace_rt = HZ / 100 ?: 1;
  static int cfq_grace_idle = HZ / 10;
  
+#define CFQ_EPOCH              1000000000
+#define CFQ_SECTORATE          1000   
+#define CFQ_HMAX_PCT           80
+
  #define CFQ_QHASH_SHIFT                6
  #define CFQ_QHASH_ENTRIES      (1 << CFQ_QHASH_SHIFT)
  #define list_entry_qhash(entry)        hlist_entry((entry), struct cfq_queue, cfq_hash)
@@ -69,13 +71,6 @@ static int cfq_grace_idle = HZ / 10;
  #define cfq_account_io(crq)    \
         ((crq)->ioprio != IOPRIO_IDLE && (crq)->ioprio != IOPRIO_RT)
  
-/* define to be 50 ms for now; make tunable later */
-#define CFQ_EPOCH              50000
-/* Needs to be made tunable right away, in MiB/s */
-#define CFQ_DISKBW             10       
-/* Temporary global limit, as percent of available b/w, for each "class" */
-#define CFQ_TEMPLIM            10
-
  /*
   * defines how we distribute bandwidth (can be tgid, uid, etc)
   */
@@ -87,18 +82,22 @@ static int cfq_grace_idle = HZ / 10;
   */
  
  #if defined(CONFIG_CKRM_RES_BLKIO) || defined(CONFIG_CKRM_RES_BLKIO_MODULE)
-extern inline void *cki_hash_key(struct task_struct *tsk);
-extern inline int cki_ioprio(struct task_struct *tsk);
-#define cfq_hash_key(current)   ((int)cki_hash_key((current)))
-#define cfq_ioprio(current)    (cki_ioprio((current)))
+extern void *cki_hash_key(struct task_struct *tsk);
+extern int cki_ioprio(struct task_struct *tsk);
+extern void *cki_cfqpriv(struct task_struct *tsk); 
+
+#define cfq_hash_key(tsk)   ((int)cki_hash_key((tsk)))
+#define cfq_ioprio(tsk)        (cki_ioprio((tsk)))
+#define cfq_cfqpriv(cfqd,tsk)  (cki_cfqpriv((tsk)))
  
  #else
-#define cfq_hash_key(current)  ((current)->tgid)
+#define cfq_hash_key(tsk)      ((tsk)->tgid)
+#define cfq_cfqpriv(cfqd,tsk)  (&(((cfqd)->cid[(tsk)->ioprio]).cfqpriv))
  
  /*
   * move to io_context
   */
-#define cfq_ioprio(current)    ((current)->ioprio)
+#define cfq_ioprio(tsk)        ((tsk)->ioprio)
  #endif
  
  #define CFQ_WAIT_RT    0
@@ -125,16 +124,12 @@ struct io_prio_data {
         atomic_t cum_sectors_in,cum_sectors_out;    
         atomic_t cum_queues_in,cum_queues_out;
  
-#ifdef LIMIT_DEBUG
-       int nskip;
-       unsigned long navsec;
-       unsigned long csectorate;
-       unsigned long lsectorate;
-#endif
+       cfqlim_t cfqpriv;       /* data for enforcing limits */
  
         struct list_head prio_list;
         int last_rq;
         int last_sectors;
+
  };
  
  /*
@@ -179,8 +174,9 @@ struct cfq_data {
         unsigned int cfq_grace_rt;
         unsigned int cfq_grace_idle;
  
-       unsigned long cfq_epoch;        /* duration for limit enforcement */
-       unsigned long cfq_epochsectors; /* max sectors dispatchable/epoch */
+       unsigned int cfq_epoch;
+       unsigned int cfq_hmax_pct;
+       unsigned int cfq_qsectorate;
  };
  
  /*
@@ -194,14 +190,34 @@ struct cfq_queue {
         int queued[2];
         int ioprio;
  
+       /* limit related settings/stats obtained 
+          either from io_prio_data or ckrm I/O class
+       */
+       struct cfqlim *cfqpriv; 
+
+       u64 epstart;            /* current epoch's starting timestamp (ns) */
+       u64 epsector[2];        /* Total sectors dispatched in [0] previous
+                                * and [1] current epoch
+                                */
+       
         unsigned long avsec;            /* avg sectors dispatched/epoch */
-       unsigned long long lastime;     /* timestamp of last request served */
-       unsigned long sectorate;        /* limit for sectors served/epoch */
+//     unsigned long long lastime;     /* timestamp of last request served */
+//     unsigned long sectorate;        /* limit for sectors served/epoch */
         int skipped;                    /* queue skipped at last dispatch ? */
+
+       /* Per queue timer to suspend/resume queue from processing */
+       struct timer_list timer;
+       unsigned long wait_end;
+       unsigned long flags;
+       struct work_struct work;
+
+       struct cfq_data *cfqd;
  };
  
+
+
  /*
- * per-request structure
+ * Per-request structure
   */
  struct cfq_rq {
         struct cfq_queue *cfq_queue;
@@ -516,69 +532,101 @@ link:
         list_add_tail(&crq->request->queuelist, entry);
  }
  
-/*
- * remove from io scheduler core and put on dispatch list for service
- */
+struct cfq_queue *dcfqq;
+u64 dtmp;
+
+/* Over how many ns is sectorate defined */
+#define NS4SCALE  (100000000)
+
+/*
+ * Account the first request on cfqq's sort list against the queue's
+ * sector rate limit (cfqq->cfqpriv->sectorate).
+ *
+ * On return cfqq->skipped is 0 when the request may be dispatched (its
+ * sectors have been charged to the current epoch) or 1 when the queue
+ * is over its limit; in that case cfqq->wait_end holds the jiffies
+ * value until which the queue should stay paused.  A request that
+ * starts a new epoch, or any request when dontskip is non-zero, is
+ * always charged.  Always returns 0; callers test cfqq->skipped.
+ */
  static inline int
-__cfq_dispatch_requests(request_queue_t *q, struct cfq_data *cfqd,
-                       struct cfq_queue *cfqq)
+__cfq_check_limit(struct cfq_data *cfqd,struct cfq_queue *cfqq, int dontskip)
  {
         struct cfq_rq *crq;
-       unsigned long long ts, gap;
-       unsigned long newavsec;
+       unsigned long long ts, gap, epoch, tmp;
+       unsigned long newavsec, sectorate;
  
         crq = rb_entry_crq(rb_first(&cfqq->sort_list));
  
-#if 1
-       /* Determine if queue should be skipped for being overshare */
         ts = sched_clock();
-       gap = ts - cfqq->lastime;
-#ifdef LIMIT_DEBUG
-       cfqq->sectorate = (cfqd->cfq_epochsectors 
-                          * CFQ_TEMPLIM)/100;
-       
-#endif
-       if ((gap >= cfqd->cfq_epoch) || (gap < 0)) {
-               cfqq->avsec = crq->nr_sectors ; 
-               cfqq->lastime = ts;
+       gap = ts - cfqq->epstart;
+       epoch = cfqd->cfq_epoch;
+
+       sectorate = atomic_read(&cfqq->cfqpriv->sectorate);
+
+       dcfqq = cfqq;
+
+       /* gap is unsigned, so a clock that went backwards shows up as a huge gap */
+       if (gap >= epoch) {
+               if (gap >= (epoch << 1)) {
+                       cfqq->epsector[0] = 0;
+                       cfqq->epstart = ts ; 
+               } else {
+                       cfqq->epsector[0] = cfqq->epsector[1];
+                       cfqq->epstart += epoch;
+               } 
+               cfqq->epsector[1] = 0;
+               gap = ts - cfqq->epstart;
+
+               tmp  = (cfqq->epsector[0] + crq->nr_sectors) * NS4SCALE;
+               do_div(tmp,epoch+gap);
+
+               cfqq->avsec = (unsigned long)tmp;
+               cfqq->skipped = 0;
+               cfqq->epsector[1] += crq->nr_sectors;
+               
+               cfqq->cfqpriv->navsec = cfqq->avsec;
+               cfqq->cfqpriv->sec[0] = cfqq->epsector[0];
+               cfqq->cfqpriv->sec[1] = cfqq->epsector[1];
+               cfqq->cfqpriv->timedout++;
+               return 0;
         } else {
-               u64 tmp;
-               /* Age old average and accumalate request to be served */
-
-//             tmp = (u64) (cfqq->avsec * gap) ;
-//             do_div(tmp, cfqd->cfq_epoch);
-               newavsec = (unsigned long)(cfqq->avsec >> 1) + crq->nr_sectors;
-//             if (crq->ioprio >= 0 && crq->ioprio <= 20)
-//                     cfqd->cid[crq->ioprio].lsectorate = newavsec; 
-//             atomic_set(&(cfqd->cid[crq->ioprio].lsectorate),
-//                        newavsec);
-
-               if ((newavsec < cfqq->sectorate) || cfqq->skipped) {
+               /* still within the epoch: rate over the previous and current epochs */
+               tmp = (cfqq->epsector[0] + cfqq->epsector[1] + crq->nr_sectors)
+                       * NS4SCALE;
+               do_div(tmp,epoch+gap);
+
+               newavsec = (unsigned long)tmp;
+               if ((newavsec < sectorate) || dontskip) {
                         cfqq->avsec = newavsec ;
-                       cfqq->lastime = ts;
                         cfqq->skipped = 0;
+                       cfqq->epsector[1] += crq->nr_sectors;
+                       cfqq->cfqpriv->navsec = cfqq->avsec;
+                       cfqq->cfqpriv->sec[1] = cfqq->epsector[1];
                 } else {
-                       /* queue over share ; skip once */
                         cfqq->skipped = 1;
-#ifdef LIMIT_DEBUG     
-//                     atomic_inc(&(cfqd->cid[crq->ioprio].nskip));
-//                     if (crq->ioprio >= 0 && crq->ioprio <= 20)
-//                             cfqd->cid[crq->ioprio].nskip++;
-#endif
-                       return 0;
+                       /* pause q's processing till avsec drops to 
+                          cfq_hmax_pct % of its value */
+                       tmp = (epoch+gap) * (100-cfqd->cfq_hmax_pct);
+                       do_div(tmp,1000000*cfqd->cfq_hmax_pct);
+                       cfqq->wait_end = jiffies+msecs_to_jiffies(tmp);
                 }
-       }
-#endif
+       }
+
+       return 0;
+}
  
-#ifdef LIMIT_DEBUG
-//     if (crq->ioprio >= 0 && crq->ioprio <= 20) {
-//             cfqd->cid[crq->ioprio].navsec = cfqq->avsec;
-//             cfqd->cid[crq->ioprio].csectorate = cfqq->sectorate;
-//     }
+/*
+ * remove from io scheduler core and put on dispatch list for service
+ */
+static inline int
+__cfq_dispatch_requests(request_queue_t *q, struct cfq_data *cfqd,
+                       struct cfq_queue *cfqq)
+{
+       struct cfq_rq *crq;
+
+       crq = rb_entry_crq(rb_first(&cfqq->sort_list));
  
-//     atomic_set(&(cfqd->cid[crq->ioprio].navsec),cfqq->avsec);
-//     atomic_set(&(cfqd->cid[crq->ioprio].csectorate),cfqq->sectorate);
-#endif
         cfq_dispatch_sort(cfqd, cfqq, crq);
  
         /*
@@ -593,44 +641,83 @@ cfq_dispatch_requests(request_queue_t *q, int prio, int max_rq, int max_sectors)
  {
         struct cfq_data *cfqd = q->elevator.elevator_data;
         struct list_head *plist = &cfqd->cid[prio].rr_list;
+       struct cfq_queue *cfqq;
         struct list_head *entry, *nxt;
         int q_rq, q_io;
-       int ret ;
+       int first_round,busy_queues,busy_unlimited;
+
  
         /*
          * for each queue at this prio level, dispatch a request
          */
         q_rq = q_io = 0;
+       first_round=1;
+ restart:
+       busy_unlimited = 0;
+       busy_queues = 0;
         list_for_each_safe(entry, nxt, plist) {
-               struct cfq_queue *cfqq = list_entry_cfqq(entry);
+               cfqq = list_entry_cfqq(entry);
  
                 BUG_ON(RB_EMPTY(&cfqq->sort_list));
+               busy_queues++;
  
-               ret = __cfq_dispatch_requests(q, cfqd, cfqq);
-               if (ret <= 0) {
-                       continue; /* skip queue */
-                       /* can optimize more by moving q to end of plist ? */
+               
+               if (first_round || busy_unlimited)
+                       __cfq_check_limit(cfqd,cfqq,0);
+               else
+                       __cfq_check_limit(cfqd,cfqq,1);
+
+               if (cfqq->skipped) {
+                       cfqq->cfqpriv->nskip++;
+                       /* cfqd->cid[prio].nskip++; */
+                       busy_queues--;
+                       if (time_before(jiffies, cfqq->wait_end)) {
+                               list_del(&cfqq->cfq_list);
+                               mod_timer(&cfqq->timer,cfqq->wait_end);
+                       }
+                       continue;
                 }
-               q_io += ret ;
-               q_rq++ ;
+               busy_unlimited++;
+
+               q_io += __cfq_dispatch_requests(q, cfqd, cfqq);
+               q_rq++;
  
-               if (RB_EMPTY(&cfqq->sort_list))
+               if (RB_EMPTY(&cfqq->sort_list)) {
+                       busy_unlimited--;
+                       busy_queues--;
                         cfq_put_queue(cfqd, cfqq);
-               /*
-                * if we hit the queue limit, put the string of serviced
-                * queues at the back of the pending list
-                */
+               } 
+
                 if (q_io >= max_sectors || q_rq >= max_rq) {
+#if 0
                         struct list_head *prv = nxt->prev;
  
                         if (prv != plist) {
                                 list_del(plist);
                                 list_add(plist, prv);
                         }
+#endif
                         break;
                 }
         }
  
+       if ((q_io < max_sectors) && (q_rq < max_rq) && 
+           (busy_queues || first_round))
+       {
+               first_round = 0;
+               goto restart;
+       } else {
+               /*
+                * if we hit the queue limit, put the string of serviced
+                * queues at the back of the pending list
+                */
+               struct list_head *prv = nxt->prev;
+               if (prv != plist) {
+                       list_del(plist);
+                       list_add(plist, prv);
+               }
+       }
+
         cfqd->cid[prio].last_rq = q_rq;
         cfqd->cid[prio].last_sectors = q_io;
         return q_rq;
@@ -806,6 +893,29 @@ static void cfq_put_queue(struct cfq_data *cfqd, struct cfq_queue *cfqq)
         mempool_free(cfqq, cfq_mpool);
  }
  
+/* Per-queue pause timer callback: hands the queue's work item to kblockd. */
+static void cfq_pauseq_timer(unsigned long data)
+{
+       kblockd_schedule_work(&((struct cfq_queue *) data)->work);
+}
+
+/* Work queued by cfq_pauseq_timer: under q->queue_lock, re-add cfqq to its
+ * prio level's rr_list, clear skipped, and call q->request_fn if
+ * cfq_next_request(q) is non-zero. */
+static void cfq_pauseq_work(void *data)
+{
+       struct cfq_queue *cfqq = (struct cfq_queue *) data;
+       struct cfq_data *cfqd = cfqq->cfqd;
+       request_queue_t *q = cfqd->queue;
+       unsigned long flags;
+       
+       spin_lock_irqsave(q->queue_lock, flags);
+       list_add_tail(&cfqq->cfq_list,&cfqd->cid[cfqq->ioprio].rr_list);
+       cfqq->skipped = 0;
+       if (cfq_next_request(q))
+               q->request_fn(q);
+       spin_unlock_irqrestore(q->queue_lock, flags);
+
  static struct cfq_queue *__cfq_get_queue(struct cfq_data *cfqd, int hashkey,
                                          int gfp_mask)
  {
@@ -833,9 +943,22 @@ retry:
                 INIT_LIST_HEAD(&cfqq->cfq_list);
                 cfqq->hash_key = cfq_hash_key(current);
                 cfqq->ioprio = cfq_ioprio(current);
-               cfqq->avsec = 0 ;
-               cfqq->lastime = sched_clock();
-               cfqq->sectorate = (cfqd->cfq_epochsectors * CFQ_TEMPLIM)/100;
+               
+               cfqq->cfqpriv = cfq_cfqpriv(cfqd,current);
+               if (!cfqq->cfqpriv)
+                       cfqq->cfqpriv = &((cfqd->cid[cfqq->ioprio]).cfqpriv);
+
+               cfqq->epstart = sched_clock();
+               /* epsector, avsec, skipped initialized to zero by memset */
+               
+               init_timer(&cfqq->timer);
+               cfqq->timer.function = cfq_pauseq_timer;
+               cfqq->timer.data = (unsigned long) cfqq;
+
+               INIT_WORK(&cfqq->work, cfq_pauseq_work, cfqq); 
+
+               cfqq->cfqd = cfqd ;
+
                 hlist_add_head(&cfqq->cfq_hash, &cfqd->cfq_hash[hashval]);
         }
  
@@ -1132,6 +1255,8 @@ static void cfq_exit(request_queue_t *q, elevator_t *e)
         kfree(cfqd);
  }
  
+       
+
  static void cfq_timer(unsigned long data)
  {
         struct cfq_data *cfqd = (struct cfq_data *) data;
@@ -1182,12 +1307,12 @@ static int cfq_init(request_queue_t *q, elevator_t *e)
                 atomic_set(&cid->cum_sectors_out,0);            
                 atomic_set(&cid->cum_queues_in,0);
                 atomic_set(&cid->cum_queues_out,0);
-#if 0
-               atomic_set(&cid->nskip,0);
-               atomic_set(&cid->navsec,0);
-               atomic_set(&cid->csectorate,0);
-               atomic_set(&cid->lsectorate,0);
-#endif
+
+               
+               atomic_set(&((cid->cfqpriv).sectorate),CFQ_SECTORATE);
+               (cid->cfqpriv).nskip = 0;
+               (cid->cfqpriv).navsec = 0;
+               (cid->cfqpriv).timedout = 0;
         }
  
         cfqd->crq_hash = kmalloc(sizeof(struct hlist_head) * CFQ_MHASH_ENTRIES,
@@ -1217,6 +1342,9 @@ static int cfq_init(request_queue_t *q, elevator_t *e)
         cfqd->cfq_idle_quantum_io = cfq_idle_quantum_io;
         cfqd->cfq_grace_rt = cfq_grace_rt;
         cfqd->cfq_grace_idle = cfq_grace_idle;
+       
+       cfqd->cfq_epoch = CFQ_EPOCH;
+       cfqd->cfq_hmax_pct = CFQ_HMAX_PCT;
  
         q->nr_requests <<= 2;
  
@@ -1224,14 +1352,6 @@ static int cfq_init(request_queue_t *q, elevator_t *e)
         e->elevator_data = cfqd;
         cfqd->queue = q;
  
-       cfqd->cfq_epoch = CFQ_EPOCH;
-       if (q->hardsect_size)
-               cfqd->cfq_epochsectors = ((CFQ_DISKBW * 1000000)/
-                                     q->hardsect_size)* (1000000 / CFQ_EPOCH);
-       else
-               cfqd->cfq_epochsectors = ((CFQ_DISKBW * 1000000)/512)
-                       * (1000000 / CFQ_EPOCH) ;
-
         return 0;
  out_crqpool:
         kfree(cfqd->cfq_hash);
@@ -1302,6 +1422,8 @@ SHOW_FUNCTION(cfq_idle_quantum_io_show, cfqd->cfq_idle_quantum_io);
  SHOW_FUNCTION(cfq_queued_show, cfqd->cfq_queued);
  SHOW_FUNCTION(cfq_grace_rt_show, cfqd->cfq_grace_rt);
  SHOW_FUNCTION(cfq_grace_idle_show, cfqd->cfq_grace_idle);
+SHOW_FUNCTION(cfq_epoch_show, cfqd->cfq_epoch);
+SHOW_FUNCTION(cfq_hmax_pct_show, cfqd->cfq_hmax_pct);
  #undef SHOW_FUNCTION
  
  #define STORE_FUNCTION(__FUNC, __PTR, MIN, MAX)                                \
@@ -1321,63 +1443,38 @@ STORE_FUNCTION(cfq_idle_quantum_io_store, &cfqd->cfq_idle_quantum_io, 4, INT_MAX
  STORE_FUNCTION(cfq_queued_store, &cfqd->cfq_queued, 1, INT_MAX);
  STORE_FUNCTION(cfq_grace_rt_store, &cfqd->cfq_grace_rt, 0, INT_MAX);
  STORE_FUNCTION(cfq_grace_idle_store, &cfqd->cfq_grace_idle, 0, INT_MAX);
+STORE_FUNCTION(cfq_epoch_store, &cfqd->cfq_epoch, 1, INT_MAX); /* epoch is a divisor in __cfq_check_limit: never 0 */
+STORE_FUNCTION(cfq_hmax_pct_store, &cfqd->cfq_hmax_pct, 1, 100);
  #undef STORE_FUNCTION
  
  
-static ssize_t cfq_epoch_show(struct cfq_data *cfqd, char *page)
-{
-       return sprintf(page, "%lu\n", cfqd->cfq_epoch);
-}
-
-static ssize_t cfq_epoch_store(struct cfq_data *cfqd, const char *page, size_t count)
-{
-       char *p = (char *) page;
-       cfqd->cfq_epoch = simple_strtoul(p, &p, 10);
-       return count;
-}
-
-static ssize_t cfq_epochsectors_show(struct cfq_data *cfqd, char *page)
-{
-       return sprintf(page, "%lu\n", cfqd->cfq_epochsectors);
-}
-
-static ssize_t 
-cfq_epochsectors_store(struct cfq_data *cfqd, const char *page, size_t count)
-{
-       char *p = (char *) page;
-       cfqd->cfq_epochsectors = simple_strtoul(p, &p, 10);
-       return count;
-}
-
  /* Additional entries to get priority level data */
  static ssize_t
  cfq_prio_show(struct cfq_data *cfqd, char *page, unsigned int priolvl)
  {
-       int r1,r2,s1,s2,q1,q2;
+    //int r1,r2,s1,s2,q1,q2;
  
         if (!(priolvl >= IOPRIO_IDLE && priolvl <= IOPRIO_RT)) 
                 return 0;
         
+       /*
         r1 = (int)atomic_read(&(cfqd->cid[priolvl].cum_rq_in));
         r2 = (int)atomic_read(&(cfqd->cid[priolvl].cum_rq_out));
         s1 = (int)atomic_read(&(cfqd->cid[priolvl].cum_sectors_in));
         s2 = (int)atomic_read(&(cfqd->cid[priolvl].cum_sectors_out));
         q1 = (int)atomic_read(&(cfqd->cid[priolvl].cum_queues_in)); 
         q2 = (int)atomic_read(&(cfqd->cid[priolvl].cum_queues_out));
-       
-       return sprintf(page,"skip %d avsec %lu rate %lu new %lu"
-                      "rq (%d,%d) sec (%d,%d) q (%d,%d)\n",
-                      cfqd->cid[priolvl].nskip,
-                      cfqd->cid[priolvl].navsec,
-                      cfqd->cid[priolvl].csectorate,
-                      cfqd->cid[priolvl].lsectorate,
-//                    atomic_read(&cfqd->cid[priolvl].nskip),
-//                    atomic_read(&cfqd->cid[priolvl].navsec),
-//                    atomic_read(&cfqd->cid[priolvl].csectorate),
-//                    atomic_read(&cfqd->cid[priolvl].lsectorate),
-                      r1,r2,
-                      s1,s2,
-                      q1,q2);
+       */
+
+       return sprintf(page,"skip %d timdout %d avsec %lu rate %ld "
+                      " sec0 %lu sec1 %lu\n",
+                      cfqd->cid[priolvl].cfqpriv.nskip,
+                      cfqd->cid[priolvl].cfqpriv.timedout,
+                      cfqd->cid[priolvl].cfqpriv.navsec,
+                      atomic_read(&(cfqd->cid[priolvl].cfqpriv.sectorate)),
+                      (unsigned long)cfqd->cid[priolvl].cfqpriv.sec[0],
+                      (unsigned long)cfqd->cid[priolvl].cfqpriv.sec[1]);
+
  }
  
  #define SHOW_PRIO_DATA(__PRIOLVL)                                               \
@@ -1411,12 +1508,25 @@ SHOW_PRIO_DATA(20);
  
  static ssize_t cfq_prio_store(struct cfq_data *cfqd, const char *page, size_t count, int priolvl)
  {      
+       /* Set priolvl's sectorate from the decimal number in page and reset its limit stats */
+       char *p = (char *) page;
+       int val;
+
+       val = (int) simple_strtoul(p, &p, 10);
+       /* NOTE(review): neither val nor priolvl is range-checked in this function */
+       atomic_set(&(cfqd->cid[priolvl].cfqpriv.sectorate),val);
+       cfqd->cid[priolvl].cfqpriv.nskip = 0;
+       cfqd->cid[priolvl].cfqpriv.navsec = 0;
+       cfqd->cid[priolvl].cfqpriv.timedout = 0;
+       /* the cum_* counters below are no longer reset by a store */
+#if 0
         atomic_set(&(cfqd->cid[priolvl].cum_rq_in),0);
         atomic_set(&(cfqd->cid[priolvl].cum_rq_out),0);
         atomic_set(&(cfqd->cid[priolvl].cum_sectors_in),0);
         atomic_set(&(cfqd->cid[priolvl].cum_sectors_out),0);
         atomic_set(&(cfqd->cid[priolvl].cum_queues_in),0);
         atomic_set(&(cfqd->cid[priolvl].cum_queues_out),0);
+#endif
  
         return count;
  }
@@ -1491,10 +1601,10 @@ static struct cfq_fs_entry cfq_epoch_entry = {
         .show = cfq_epoch_show,
         .store = cfq_epoch_store,
  };
-static struct cfq_fs_entry cfq_epochsectors_entry = {
-       .attr = {.name = "epochsectors", .mode = S_IRUGO | S_IWUSR },
-       .show = cfq_epochsectors_show,
-       .store = cfq_epochsectors_store,
+static struct cfq_fs_entry cfq_hmax_pct_entry = {
+       .attr = {.name = "hmaxpct", .mode = S_IRUGO | S_IWUSR },
+       .show = cfq_hmax_pct_show,
+       .store = cfq_hmax_pct_store,
  };
  
  #define P_0_STR   "p0"
@@ -1558,7 +1668,7 @@ static struct attribute *default_attrs[] = {
         &cfq_grace_rt_entry.attr,
         &cfq_grace_idle_entry.attr,
         &cfq_epoch_entry.attr,
-       &cfq_epochsectors_entry.attr,
+       &cfq_hmax_pct_entry.attr,
         &cfq_prio_0_entry.attr,
         &cfq_prio_1_entry.attr,
         &cfq_prio_2_entry.attr,
diff --git a/drivers/block/ckrm-io.c b/drivers/block/ckrm-io.c

index 7edfce7..8991026 100644 (file)
--- a/drivers/block/ckrm-io.c
+++ b/drivers/block/ckrm-io.c
@@ -35,14 +35,11 @@
  #include <linux/ckrm_tc.h>
  #include <linux/ckrm-io.h>
  
-/* Tie to cfq priorities */
-#define CKI_IOPRIO_NORM                IOPRIO_NORM
+/* sectorate == 512-byte sectors served in CFQ_EPOCH ns */
  
-/* Divisor to get fraction of bandwidth represented by an IOPRIO value */
-/* FIXME: Will not work if IOPRIO_NR > 100 */
-#define CKI_IOPRIO_DIV         (IOPRIO_NR-1)
-/* Minimum ioprio value to be assigned to a class */
-#define CKI_IOPRIO_MIN         1
+/* CKI_ROOTSECTORATE needs to be made configurable from outside */
+#define CKI_ROOTSECTORATE      100000
+#define CKI_MINSECTORATE       100
  
  #define CKI_IOUSAGE_UNIT       512
  
@@ -52,7 +49,12 @@ typedef struct ckrm_io_stats{
         unsigned long        blksz;  /* size of bandwidth unit */
         atomic_t             blkrd;  /* read units submitted to DD */
         atomic_t             blkwr; /* write units submitted to DD */
-       
+
+       int nskip;                      /* # times q skipped    */
+       unsigned long navsec;           /* avg sectors serviced */
+       int timedout;                   /* # times gap > epoch  */
+       u64 sec[2];                     /* sectors serviced in 
+                                          prev & curr epochs   */
  } cki_stats_t;          /* per class I/O statistics */
  
  /* Note
@@ -75,8 +77,12 @@ typedef struct ckrm_io_class {
          * in local units. 
          */
  
+       cfqlim_t cfqpriv;       /* Data common with cfq priolvl's */    
+
+
         int cnt_guarantee; /* Allocation as parent */
         int cnt_unused;    /* Allocation to default subclass */
+       int cnt_limit;
  
         /* Statistics, for class and default subclass */
         cki_stats_t stats; 
@@ -85,19 +91,16 @@ typedef struct ckrm_io_class {
  } cki_icls_t;
  
  
-
  /* Internal functions */
  static inline void cki_reset_stats(cki_stats_t *usg);
  static inline void init_icls_one(cki_icls_t *icls);
-static inline int cki_div(int *a, int b, int c);
-//static inline int cki_recalc(cki_icls_t *icls, int rel2abs);
  static void cki_recalc_propagate(cki_icls_t *res, cki_icls_t *parres);
  
  /* External functions e.g. interface to ioscheduler */
  void *cki_tsk_icls (struct task_struct *tsk);
  int cki_tsk_ioprio (struct task_struct *tsk);
  
-extern void cki_cfq_set(icls_tsk_t tskicls, icls_ioprio_t tskioprio);
+extern void cki_cfq_set(icls_tsk_t tskicls, icls_ioprio_t tskioprio, icls_tsk_t tskcfqpriv);
  
  /* CKRM Resource Controller API functions */
  static void * cki_alloc(struct ckrm_core_class *this,
@@ -139,45 +142,27 @@ static inline void init_icls_stats(cki_icls_t *icls)
  
  static inline void init_icls_one(cki_icls_t *icls)
  {
-       // Assign zero as initial guarantee otherwise creations
-       // could fail due to inadequate share
-
-       //icls->shares.my_guarantee = 
-       //      (CKI_IOPRIO_MIN * CKRM_SHARE_DFLT_TOTAL_GUARANTEE) / 
-       //      CKI_IOPRIO_DIV ;
-       icls->shares.my_guarantee = 0;
-       icls->shares.my_limit = CKRM_SHARE_DFLT_TOTAL_GUARANTEE;
-       icls->shares.total_guarantee = CKRM_SHARE_DFLT_TOTAL_GUARANTEE;
-       icls->shares.max_limit = CKRM_SHARE_DFLT_TOTAL_GUARANTEE;
+       /* Defaults for a new class: own guarantee and limit are don't-care,
+          the whole total_guarantee is unused and cur_max_limit is 0 */
  
-       icls->shares.unused_guarantee = icls->shares.total_guarantee - 
-               icls->shares.my_guarantee;
-       icls->shares.cur_max_limit = CKRM_SHARE_DFLT_TOTAL_GUARANTEE;
-
-
-       icls->cnt_guarantee = icls->cnt_unused = IOPRIO_IDLE;
+       /* Relative shares */
+       icls->shares.my_guarantee = CKRM_SHARE_DONTCARE;
+       icls->shares.my_limit = CKRM_SHARE_DONTCARE;
+       icls->shares.total_guarantee = CKRM_SHARE_DFLT_TOTAL_GUARANTEE;
+       icls->shares.max_limit = CKRM_SHARE_DFLT_MAX_LIMIT;
+       icls->shares.unused_guarantee = icls->shares.total_guarantee;
+       icls->shares.cur_max_limit = 0;
  
-       //Same rationale icls->ioprio = CKI_IOPRIO_MIN;
-       //IOPRIO_IDLE equivalence to zero my_guarantee (set above) relies
-       //on former being zero.
+       /* Absolute (cnt_*) values start as don't-care too */
+       icls->cnt_guarantee = CKRM_SHARE_DONTCARE;
+       icls->cnt_unused = CKRM_SHARE_DONTCARE;
+       icls->cnt_limit = CKRM_SHARE_DONTCARE;
         
         init_icls_stats(icls);
  }
  
-
-static inline int cki_div(int *a, int b, int c)
-{
-       u64 temp = (u64) b * c ;
-       do_div(temp,CKI_IOPRIO_DIV);
-       *a = (int) temp;
-
-       return 0;
-}
-       
-
-/* Recalculate absolute shares from relative (rel2abs=1)
- * or vice versa (rel2abs=0) 
- * Caller should have a lock on icls
+/* Recalculate absolute shares from relative
+ * Caller should hold a lock on icls
   */
  
  static void cki_recalc_propagate(cki_icls_t *res, cki_icls_t *parres)
@@ -186,17 +171,17 @@ static void cki_recalc_propagate(cki_icls_t *res, cki_icls_t *parres)
         ckrm_core_class_t *child = NULL;
         cki_icls_t *childres;
         int resid = cki_rcbs.resid;
+       u64 temp;
  
         if (parres) {
                 struct ckrm_shares *par = &parres->shares;
                 struct ckrm_shares *self = &res->shares;
  
  
-
                 if (parres->cnt_guarantee == CKRM_SHARE_DONTCARE) {
                         res->cnt_guarantee = CKRM_SHARE_DONTCARE;
                 } else if (par->total_guarantee) {
-                       u64 temp = (u64) self->my_guarantee * 
+                       temp = (u64) self->my_guarantee * 
                                 parres->cnt_guarantee;
                         do_div(temp, par->total_guarantee);
                         res->cnt_guarantee = (int) temp;
@@ -204,16 +189,36 @@ static void cki_recalc_propagate(cki_icls_t *res, cki_icls_t *parres)
                         res->cnt_guarantee = 0;
                 }
  
+
+               if (parres->cnt_limit == CKRM_SHARE_DONTCARE) {
+                       res->cnt_limit = CKRM_SHARE_DONTCARE;
+                       atomic_set(&res->cfqpriv.sectorate,CKI_MINSECTORATE);
+               } else {
+                       if (par->max_limit) {
+                               temp = (u64) self->my_limit * 
+                                       parres->cnt_limit;
+                               do_div(temp, par->max_limit);
+                               res->cnt_limit = (int) temp;
+                       } else {
+                               res->cnt_limit = 0;
+                       }
+                       atomic_set(&res->cfqpriv.sectorate,res->cnt_limit);
+               }
+               
                 if (res->cnt_guarantee == CKRM_SHARE_DONTCARE) {
                         res->cnt_unused = CKRM_SHARE_DONTCARE;
-               } else if (self->total_guarantee) {
-                       u64 temp = (u64) self->unused_guarantee * 
-                               res->cnt_guarantee;
-                       do_div(temp, self->total_guarantee);
-                       res->cnt_unused = (int) temp;
                 } else {
-                       res->cnt_unused = 0;
+                       if (self->total_guarantee) {
+                               temp = (u64) self->unused_guarantee * 
+                                       res->cnt_guarantee;
+                               do_div(temp, self->total_guarantee);
+                               res->cnt_unused = (int) temp;
+                       } else {
+                               res->cnt_unused = 0;
+                       }
+
                 }
+               
         }
         // propagate to children
         ckrm_lock_hier(res->core);
@@ -228,50 +233,6 @@ static void cki_recalc_propagate(cki_icls_t *res, cki_icls_t *parres)
         ckrm_unlock_hier(res->core);
  }
  
-#if 0
-static inline int cki_recalc(cki_icls_t *icls, int rel2abs)
-{
-       u64 temp;
-
-       if (icls->parent == NULL) {
-               /* Root, as parent, always gets all */
-
-               temp = icls->shares.my_guarantee * (IOPRIO_NR-1);
-               do_div(temp, icls->shares.total_guarantee);
-
-               icls->total = IOPRIO_NR-1;
-               icls->ioprio = temp ;
-               icls->unused = icls->total - icls->ioprio;
-//             icls->unused = (IOPRIO_NR-1)-icls->ioprio;
-
-       } else {
-               cki_icls_t *parres;
-               int partot ;
-               
-               parres = ckrm_get_res_class(icls->parent,
-                                           cki_rcbs.resid,
-                                           cki_icls_t);
-               if (!parres) {
-                       printk(KERN_ERR "cki_recalc: error getting "
-                              "resclass from core \n");
-                       return -EINVAL;
-               }
-
-
-               temp = (icls->shares.my_guarantee * 
-                       parres->total);
-               do_div(temp, parres->shares.total_guarantee);
-
-               icls->ioprio = temp;
-               icls->unused = 0;
-
-       }
-       
-       return 0;
-
-}
-#endif
-
  void *cki_tsk_icls(struct task_struct *tsk)
  {
         return (void *) ckrm_get_res_class(class_core(tsk->taskclass),
@@ -279,12 +240,19 @@ void *cki_tsk_icls(struct task_struct *tsk)
  }
  
  int cki_tsk_ioprio(struct task_struct *tsk)
+{
+       /* I/O priorities are not used for now: every task reports IOPRIO_NORM */
+       return IOPRIO_NORM;
+}
+
+/* Return the cfq limit data of tsk's I/O class, or NULL if it has none. */
+void *cki_tsk_cfqpriv(struct task_struct *tsk)
  {
         cki_icls_t *icls = ckrm_get_res_class(class_core(tsk->taskclass),
                                            cki_rcbs.resid, cki_icls_t);
-       return icls->cnt_unused;
+       return icls ? (void *)&(icls->cfqpriv) : NULL;
  }
  
  static void *cki_alloc(struct ckrm_core_class *core,
                          struct ckrm_core_class *parent)
  {
@@ -301,43 +269,13 @@ static void *cki_alloc(struct ckrm_core_class *core,
         icls->parent = parent;
         icls->shares_lock = SPIN_LOCK_UNLOCKED;
  
-       if (parent == NULL) {
-
-               /* Root class gets same as "normal" CFQ priorities to
-                * retain compatibility of behaviour in the absence of 
-                * other classes
-                */
-
-               icls->cnt_guarantee = icls->cnt_unused = IOPRIO_NR-1; 
-
-               /* Default gets normal, not minimum */
-               //icls->unused = IOPRIO_NORM;
-               //icls->unused = icls->guarantee-icls->myguarantee;
-               //icls->limit = icls->mylimit = IOPRIO_NR;
-
-               /* Compute shares in abstract units */
-               icls->shares.total_guarantee = CKRM_SHARE_DFLT_TOTAL_GUARANTEE;
-
-               // my_guarantee for root is meaningless. Set to default
-               icls->shares.my_guarantee = CKRM_SHARE_DFLT_TOTAL_GUARANTEE;
+       init_icls_one(icls);
  
-               icls->shares.unused_guarantee = 
-                       CKRM_SHARE_DFLT_TOTAL_GUARANTEE;
-
-               //temp = (u64) icls->cnt_unused * icls->shares.total_guarantee;
-               //do_div(temp, CKI_IOPRIO_DIV); 
-               // temp now has root's default's share
-               //icls->shares.unused_guarantee = 
-               // icls->shares.total_guarantee - temp; 
-
-               icls->shares.my_limit = CKRM_SHARE_DFLT_TOTAL_GUARANTEE;
-               icls->shares.max_limit = CKRM_SHARE_DFLT_TOTAL_GUARANTEE;
-               icls->shares.cur_max_limit = CKRM_SHARE_DFLT_TOTAL_GUARANTEE;
-
-       } else {
-               init_icls_one(icls);
-               /* No propagation to parent needed if icls'
-                  initial share is zero */
+       if (parent == NULL) {
+               icls->cnt_guarantee =  CKI_ROOTSECTORATE;
+               icls->cnt_unused = CKI_ROOTSECTORATE;
+               icls->cnt_limit = CKI_ROOTSECTORATE;
+               atomic_set(&(icls->cfqpriv.sectorate),icls->cnt_limit);
         }
         try_module_get(THIS_MODULE);
         return icls;
@@ -345,7 +283,10 @@ static void *cki_alloc(struct ckrm_core_class *core,
  
  static void cki_free(void *res)
  {
-       cki_icls_t *icls = res, *parres;
+       cki_icls_t *icls = res, *parres, *childres;
+       ckrm_core_class_t *child = NULL;
+       int maxlimit, resid = cki_rcbs.resid;
+
         
         if (!res)
                 return;
@@ -361,9 +302,7 @@ static void cki_free(void *res)
          *
          */
  
-       parres = ckrm_get_res_class(icls->parent,
-                                   cki_rcbs.resid,
-                                   cki_icls_t);
+       parres = ckrm_get_res_class(icls->parent, resid, cki_icls_t);
         if (!parres) {
                 printk(KERN_ERR "cki_free: error getting "
                        "resclass from core \n");
@@ -372,8 +311,23 @@ static void cki_free(void *res)
  
         /* Update parent's shares */
         spin_lock(&parres->shares_lock);
+
         child_guarantee_changed(&parres->shares, icls->shares.my_guarantee, 0);
         parres->cnt_unused += icls->cnt_guarantee;
+
+       // run thru parent's children and get the new max_limit of the parent
+       ckrm_lock_hier(parres->core);
+       maxlimit = 0;
+       while ((child = ckrm_get_next_child(parres->core, child)) != NULL) {
+               childres = ckrm_get_res_class(child, resid, cki_icls_t);
+               if (maxlimit < childres->shares.my_limit) {
+                       maxlimit = childres->shares.my_limit;
+               }
+       }
+       ckrm_unlock_hier(parres->core);
+       if (parres->shares.cur_max_limit < maxlimit) {
+               parres->shares.cur_max_limit = maxlimit;
+       }
         spin_unlock(&parres->shares_lock);
  
         kfree(res);
@@ -388,26 +342,15 @@ static int cki_setshare(void *res, struct ckrm_shares *new)
         struct ckrm_shares *cur, *par;
         int rc = -EINVAL, resid = cki_rcbs.resid;
  
-       if (!icls) {
-               printk(KERN_ERR "No class\n");
+       if (!icls) 
                 return rc;
-       }
  
         cur = &icls->shares; 
-
-       /* limits not supported */
-       if ((new->max_limit != CKRM_SHARE_UNCHANGED)
-           || (new->my_limit != CKRM_SHARE_UNCHANGED)) {
-               printk(KERN_ERR "limits not supported\n");
-               return -EINVAL;
-       }
-
         if (icls->parent) {
                 parres =
                     ckrm_get_res_class(icls->parent, resid, cki_icls_t);
                 if (!parres) {
-                       printk(KERN_ERR "cki_setshare: error getting "
-                              "resclass from core \n");
+                       pr_debug("cki_setshare: invalid resclass\n");
                         return -EINVAL;
                 }
                 spin_lock(&parres->shares_lock);
@@ -420,10 +363,8 @@ static int cki_setshare(void *res, struct ckrm_shares *new)
         }
  
         rc = set_shares(new, cur, par);
-       printk(KERN_ERR "rc from set_shares %d\n", rc);
  
         if ((!rc) && parres) {
-               
                 if (parres->cnt_guarantee == CKRM_SHARE_DONTCARE) {
                         parres->cnt_unused = CKRM_SHARE_DONTCARE;
                 } else if (par->total_guarantee) {
@@ -435,17 +376,6 @@ static int cki_setshare(void *res, struct ckrm_shares *new)
                         parres->cnt_unused = 0;
                 }
                 cki_recalc_propagate(res, parres);
-       
-#if 0
-               int old = icls->ioprio;
-               
-               rc = cki_recalc(icls,0);
-
-               if (!rc && parres) {
-                       int raise_tot = icls->ioprio - old ;
-                       parres->unused -= raise_tot ;
-               }
-#endif
         }
         spin_unlock(&icls->shares_lock);
         if (icls->parent) {
@@ -471,15 +401,15 @@ static int cki_getstats(void *res, struct seq_file *sfile)
         if (!icls)
                 return -EINVAL;
  
-/*     
-       seq_printf(sfile, "%d my_read\n",atomic_read(&icls->mystats.blkrd));
-       seq_printf(sfile, "%d my_write\n",atomic_read(&icls->mystats.blkwr));
-       seq_printf(sfile, "%d total_read\n",atomic_read(&icls->stats.blkrd));
-       seq_printf(sfile, "%d total_write\n",atomic_read(&icls->stats.blkwr));
-*/
-       
-       seq_printf(sfile, "%d total ioprio\n",icls->cnt_guarantee);
-       seq_printf(sfile, "%d unused/default ioprio\n",icls->cnt_unused);
+       seq_printf(sfile, "abs limit %d\n",icls->cnt_limit);
+       /* specifiers follow the field types: int, unsigned long, atomic_t read as int, u64 */
+       seq_printf(sfile, "skip %d timdout %d avsec %lu rate %d "
+                  " sec0 %llu sec1 %llu\n",
+                  icls->cfqpriv.nskip, icls->cfqpriv.timedout,
+                  icls->cfqpriv.navsec,
+                  atomic_read(&(icls->cfqpriv.sectorate)),
+                  (unsigned long long)icls->cfqpriv.sec[0],
+                  (unsigned long long)icls->cfqpriv.sec[1]);
  
         return 0;
  }
@@ -554,7 +484,7 @@ int __init cki_init(void)
                 resid = ckrm_register_res_ctlr(clstype, &cki_rcbs);
                 if (resid != -1) {
                         cki_rcbs.classtype = clstype;
-                       cki_cfq_set(cki_tsk_icls,cki_tsk_ioprio);
+                       cki_cfq_set(cki_tsk_icls,cki_tsk_ioprio,cki_tsk_cfqpriv);
                 }
         }
         
@@ -566,7 +496,7 @@ void __exit cki_exit(void)
         ckrm_unregister_res_ctlr(&cki_rcbs);
         cki_rcbs.resid = -1;
         cki_rcbs.classtype = NULL; 
-       cki_cfq_set(NULL,NULL);
+       cki_cfq_set(NULL,NULL,NULL);
  }
  
  module_init(cki_init)
diff --git a/drivers/block/ckrm-iostub.c b/drivers/block/ckrm-iostub.c

index c325d8e..f401254 100644 (file)
--- a/drivers/block/ckrm-iostub.c
+++ b/drivers/block/ckrm-iostub.c
@@ -25,13 +25,14 @@ static spinlock_t stub_lock = SPIN_LOCK_UNLOCKED;
  
  static icls_tsk_t tskiclstub;
  static icls_ioprio_t tskiopriostub;
+static icls_tsk_t tskcfqprivstub;
  
-
-void cki_cfq_set(icls_tsk_t tskicls, icls_ioprio_t tskioprio)
+void cki_cfq_set(icls_tsk_t tskicls, icls_ioprio_t tskioprio, icls_tsk_t tskcfqpriv)
  {
         spin_lock(&stub_lock);
         tskiclstub = tskicls;
         tskiopriostub = tskioprio;
+       tskcfqprivstub = tskcfqpriv;
         spin_unlock(&stub_lock);
  }
  
@@ -59,6 +60,19 @@ int cki_ioprio(struct task_struct *tsk)
         return ret;
  }
  
+/* Call the registered tskcfqpriv hook for tsk under stub_lock; NULL if none is set. */
+void *cki_cfqpriv(struct task_struct *tsk)
+{
+       void *ret = NULL;
+       spin_lock(&stub_lock);
+       /* Test the hook that is invoked below, not the separate tskiclstub. */
+       if (tskcfqprivstub)
+               ret = (*tskcfqprivstub)(tsk);
+       spin_unlock(&stub_lock);
+       return ret;
+}
+
  EXPORT_SYMBOL(cki_cfq_set);
  EXPORT_SYMBOL(cki_hash_key);
  EXPORT_SYMBOL(cki_ioprio);
+EXPORT_SYMBOL(cki_cfqpriv);
diff --git a/drivers/char/.cvsignore b/drivers/char/.cvsignore

new file mode 100644 (file)

index 0000000..83683a2
--- /dev/null
+++ b/drivers/char/.cvsignore
@@ -0,0 +1,2 @@
+consolemap_deftbl.c
+defkeymap.c
diff --git a/drivers/pci/.cvsignore b/drivers/pci/.cvsignore

new file mode 100644 (file)

index 0000000..d5b21d9
--- /dev/null
+++ b/drivers/pci/.cvsignore
@@ -0,0 +1,3 @@
+classlist.h
+devlist.h
+gen-devlist
diff --git a/drivers/scsi/aic7xxx/.cvsignore b/drivers/scsi/aic7xxx/.cvsignore

new file mode 100644 (file)

index 0000000..a1a7fcd
--- /dev/null
+++ b/drivers/scsi/aic7xxx/.cvsignore
@@ -0,0 +1,4 @@
+aic79xx_reg.h
+aic79xx_seq.h
+aic7xxx_reg.h
+aic7xxx_seq.h
diff --git a/drivers/usb/serial/io_edgeport.c b/drivers/usb/serial/io_edgeport.c

index e727628..87f99a3 100644 (file)
--- a/drivers/usb/serial/io_edgeport.c
+++ b/drivers/usb/serial/io_edgeport.c
@@ -479,7 +479,7 @@ static void get_boot_desc           (struct edgeport_serial *edge_serial);
  static void load_application_firmware  (struct edgeport_serial *edge_serial);
  
  
-static void unicode_to_ascii           (char *string, short *unicode, int unicode_size);
+static void unicode_to_ascii           (char *string, __le16 *unicode, int unicode_size);
  
  
  
@@ -504,7 +504,7 @@ static void update_edgeport_E2PROM (struct edgeport_serial *edge_serial)
         __u32 BootNewVer;
         __u8 BootMajorVersion;                  
         __u8 BootMinorVersion;                  
-       __u16 BootBuildNumber;
+       __le16 BootBuildNumber;
         __u8 *BootImage;      
         __u32 BootSize;
         struct edge_firmware_image_record *record;
@@ -653,7 +653,7 @@ static void get_product_info(struct edgeport_serial *edge_serial)
  
         memset (product_info, 0, sizeof(struct edgeport_product_info));
  
-       product_info->ProductId         = (__u16)(edge_serial->serial->dev->descriptor.idProduct & ~ION_DEVICE_ID_GENERATION_2);
+       product_info->ProductId         = (__u16)(edge_serial->serial->dev->descriptor.idProduct & ~ION_DEVICE_ID_80251_NETCHIP);
         product_info->NumPorts          = edge_serial->manuf_descriptor.NumPorts;
         product_info->ProdInfoVer       = 0;
  
@@ -669,7 +669,7 @@ static void get_product_info(struct edgeport_serial *edge_serial)
         memcpy(product_info->ManufactureDescDate, edge_serial->manuf_descriptor.DescDate, sizeof(edge_serial->manuf_descriptor.DescDate));
  
         // check if this is 2nd generation hardware
-       if (edge_serial->serial->dev->descriptor.idProduct & ION_DEVICE_ID_GENERATION_2) {
+       if (edge_serial->serial->dev->descriptor.idProduct & ION_DEVICE_ID_80251_NETCHIP) {
                 product_info->FirmwareMajorVersion      = OperationalCodeImageVersion_GEN2.MajorVersion;
                 product_info->FirmwareMinorVersion      = OperationalCodeImageVersion_GEN2.MinorVersion;
                 product_info->FirmwareBuildNumber       = cpu_to_le16(OperationalCodeImageVersion_GEN2.BuildNumber);
@@ -900,12 +900,7 @@ static void edge_bulk_out_data_callback (struct urb *urb, struct pt_regs *regs)
  
         if (tty && edge_port->open) {
                 /* let the tty driver wakeup if it has a special write_wakeup function */
-               if ((tty->flags & (1 << TTY_DO_WRITE_WAKEUP)) && tty->ldisc.write_wakeup) {
-                       (tty->ldisc.write_wakeup)(tty);
-               }
-
-               /* tell the tty driver that something has changed */
-               wake_up_interruptible(&tty->write_wait);
+               tty_wakeup(tty);
         }
  
         // Release the Write URB
@@ -1389,7 +1384,7 @@ static void send_more_port_data(struct edgeport_serial *edge_serial, struct edge
         //      to bother queueing a write. If it's too small, say a few bytes,
         //      it's better to wait for more credits so we can do a larger
         //      write.
-       if (edge_port->txCredits < EDGE_FW_GET_TX_CREDITS_SEND_THRESHOLD(edge_port->maxTxCredits)) {
+       if (edge_port->txCredits < EDGE_FW_GET_TX_CREDITS_SEND_THRESHOLD(edge_port->maxTxCredits,EDGE_FW_BULK_MAX_PACKET_SIZE)) {
                 dbg("%s(%d) Not enough credit - fifo %d TxCredit %d", __FUNCTION__, edge_port->port->number, fifo->count, edge_port->txCredits );
                 return;
         }
@@ -2747,12 +2742,15 @@ static void change_port_settings (struct edgeport_port *edge_port, struct termio
   *     ASCII range, but it's only for debugging...
   *     NOTE: expects the unicode in LE format
   ****************************************************************************/
-static void unicode_to_ascii (char *string, short *unicode, int unicode_size)
+static void unicode_to_ascii (char *string, __le16 *unicode, int unicode_size)
  {
         int i;
-       for (i = 0; i < unicode_size; ++i) {
+
+       if (unicode_size <= 0)
+               return;
+
+       for (i = 0; i < unicode_size; ++i)
                 string[i] = (char)(le16_to_cpu(unicode[i]));
-       }
         string[unicode_size] = 0x00;
  }
  
@@ -3007,9 +3005,6 @@ static void edge_shutdown (struct usb_serial *serial)
  static int __init edgeport_init(void)
  {
         int retval;
-       retval = usb_serial_register(&edgeport_1port_device);
-       if (retval) 
-               goto failed_1port_device_register;
         retval = usb_serial_register(&edgeport_2port_device);
         if (retval)
                 goto failed_2port_device_register;
@@ -3031,8 +3026,6 @@ failed_8port_device_register:
  failed_4port_device_register:
         usb_serial_deregister(&edgeport_2port_device);
  failed_2port_device_register:
-       usb_serial_deregister(&edgeport_1port_device);
-failed_1port_device_register:
         return retval;
  }
  
@@ -3045,7 +3038,6 @@ failed_1port_device_register:
  static void __exit edgeport_exit (void)
  {
         usb_deregister (&io_driver);
-       usb_serial_deregister (&edgeport_1port_device);
         usb_serial_deregister (&edgeport_2port_device);
         usb_serial_deregister (&edgeport_4port_device);
         usb_serial_deregister (&edgeport_8port_device);
diff --git a/drivers/usb/serial/io_edgeport.h b/drivers/usb/serial/io_edgeport.h

index 386139d..5112d7a 100644 (file)
--- a/drivers/usb/serial/io_edgeport.h
+++ b/drivers/usb/serial/io_edgeport.h
@@ -107,11 +107,11 @@ struct edgeport_product_info {
  
         __u8    BootMajorVersion;               /* Boot Firmware version: xx. */
         __u8    BootMinorVersion;               /*                        yy. */
-       __u16   BootBuildNumber;                /*                        zzzz (LE format) */
+       __le16  BootBuildNumber;                /*                        zzzz (LE format) */
  
         __u8    FirmwareMajorVersion;           /* Operational Firmware version:xx. */
         __u8    FirmwareMinorVersion;           /*                              yy. */
-       __u16   FirmwareBuildNumber;            /*                              zzzz (LE format) */
+       __le16  FirmwareBuildNumber;            /*                              zzzz (LE format) */
  
         __u8    ManufactureDescDate[3];         /* MM/DD/YY when descriptor template was compiled */
         __u8    Unused1[1];                     /* Available */
diff --git a/fs/aio.c b/fs/aio.c

index 9e7b592..2335a07 100644 (file)
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -543,7 +543,7 @@ struct kioctx *lookup_ioctx(unsigned long ctx_id)
         return ioctx;
  }
  
-static void use_mm(struct mm_struct *mm)
+void use_mm(struct mm_struct *mm)
  {
         struct mm_struct *active_mm;
  
diff --git a/include/.cvsignore b/include/.cvsignore

new file mode 100644 (file)

index 0000000..04204c7
--- /dev/null
+++ b/include/.cvsignore
@@ -0,0 +1 @@
+config
diff --git a/include/asm-i386/.cvsignore b/include/asm-i386/.cvsignore

new file mode 100644 (file)

index 0000000..4ec57ad
--- /dev/null
+++ b/include/asm-i386/.cvsignore
@@ -0,0 +1 @@
+asm_offsets.h
diff --git a/include/asm-i386/apicdef.h b/include/asm-i386/apicdef.h

index c689554..9513dd8 100644 (file)
--- a/include/asm-i386/apicdef.h
+++ b/include/asm-i386/apicdef.h
@@ -86,6 +86,7 @@
  #define                        APIC_LVT_REMOTE_IRR             (1<<14)
  #define                        APIC_INPUT_POLARITY             (1<<13)
  #define                        APIC_SEND_PENDING               (1<<12)
+#define                        APIC_MODE_MASK                  0x700
  #define                        GET_APIC_DELIVERY_MODE(x)       (((x)>>8)&0x7)
  #define                        SET_APIC_DELIVERY_MODE(x,y)     (((x)&~0x700)|((y)<<8))
  #define                                APIC_MODE_FIXED         0x0
diff --git a/include/asm-i386/irq.h b/include/asm-i386/irq.h

index d1a4dd6..43917d9 100644 (file)
--- a/include/asm-i386/irq.h
+++ b/include/asm-i386/irq.h
@@ -39,6 +39,7 @@ union irq_ctx {
         u32                     stack[THREAD_SIZE/sizeof(u32)];
  };
  
+#ifdef CONFIG_IRQSTACKS
  extern union irq_ctx *hardirq_ctx[NR_CPUS];
  extern union irq_ctx *softirq_ctx[NR_CPUS];
  
@@ -46,6 +47,10 @@ extern void irq_ctx_init(int cpu);
  
  #define __ARCH_HAS_DO_SOFTIRQ
  
+#else
+#define irq_ctx_init(cpu) do { ; } while (0)
+#endif
+
  struct irqaction;
  struct pt_regs;
  asmlinkage int handle_IRQ_event(unsigned int, struct pt_regs *,
diff --git a/include/asm-i386/kexec.h b/include/asm-i386/kexec.h

new file mode 100644 (file)

index 0000000..eb8fd98
--- /dev/null
+++ b/include/asm-i386/kexec.h
@@ -0,0 +1,25 @@
+#ifndef _I386_KEXEC_H
+#define _I386_KEXEC_H
+
+#include <asm/fixmap.h>
+
+/*
+ * KEXEC_SOURCE_MEMORY_LIMIT is the maximum page get_free_page can return,
+ * i.e. the maximum page that is mapped directly into kernel memory,
+ * so that kmap is not required.
+ *
+ * Someone correct me if FIXADDR_START - PAGEOFFSET is not the correct
+ * calculation for the amount of memory directly mappable into the
+ * kernel memory space.
+ */
+
+/* Maximum physical address we can use pages from */
+#define KEXEC_SOURCE_MEMORY_LIMIT (-1UL)
+/* Maximum address we can reach in physical address mode */
+#define KEXEC_DESTINATION_MEMORY_LIMIT (-1UL)
+/* Maximum address we can use for the control code buffer */
+#define KEXEC_CONTROL_MEMORY_LIMIT TASK_SIZE
+
+#define KEXEC_CONTROL_CODE_SIZE        4096
+
+#endif /* _I386_KEXEC_H */
diff --git a/include/asm-i386/module.h b/include/asm-i386/module.h

index 614d05f..263c6f7 100644 (file)
--- a/include/asm-i386/module.h
+++ b/include/asm-i386/module.h
@@ -60,7 +60,19 @@ struct mod_arch_specific
  #define MODULE_REGPARM ""
  #endif
  
+#if (CONFIG_STACK_SIZE_SHIFT < 12)
+#define MODULE_STACKSIZE "TINYSTACKS "
+#elif (CONFIG_STACK_SIZE_SHIFT == 12)
  #define MODULE_STACKSIZE "4KSTACKS "
+#elif (CONFIG_STACK_SIZE_SHIFT == 13)
+#define MODULE_STACKSIZE "8KSTACKS "
+#elif (CONFIG_STACK_SIZE_SHIFT == 14)
+#define MODULE_STACKSIZE "16KSTACKS "
+#elif (CONFIG_STACK_SIZE_SHIFT > 14)
+#define MODULE_STACKSIZE "HUGESTACKS "
+#else
+#define MODULE_STACKSIZE ""
+#endif
  
  #define MODULE_ARCH_VERMAGIC MODULE_PROC_FAMILY MODULE_REGPARM MODULE_STACKSIZE
  
diff --git a/include/asm-i386/processor.h b/include/asm-i386/processor.h

index cd8708b..3651a3b 100644 (file)
--- a/include/asm-i386/processor.h
+++ b/include/asm-i386/processor.h
@@ -400,10 +400,10 @@ struct tss_struct {
  
  #define ARCH_MIN_TASKALIGN     16
  
-
-#define STACK_PAGE_COUNT       (4096/PAGE_SIZE)
-
-
+#if ((1<<CONFIG_STACK_SIZE_SHIFT) < PAGE_SIZE)
+#error (1<<CONFIG_STACK_SIZE_SHIFT) must be at least PAGE_SIZE
+#endif
+#define STACK_PAGE_COUNT       ((1<<CONFIG_STACK_SIZE_SHIFT)/PAGE_SIZE)
  
  
  struct thread_struct {
diff --git a/include/asm-i386/segment.h b/include/asm-i386/segment.h

index abe3440..ed44e47 100644 (file)
--- a/include/asm-i386/segment.h
+++ b/include/asm-i386/segment.h
@@ -95,4 +95,6 @@
   */
  #define IDT_ENTRIES 256
  
+#define KERN_PHYS_OFFSET (CONFIG_KERN_PHYS_OFFSET * 0x100000)
+
  #endif
diff --git a/include/asm-i386/thread_info.h b/include/asm-i386/thread_info.h

index d941e6d..da74573 100644 (file)
--- a/include/asm-i386/thread_info.h
+++ b/include/asm-i386/thread_info.h
@@ -54,9 +54,10 @@ struct thread_info {
  #endif
  
  #define PREEMPT_ACTIVE         0x4000000
-#define THREAD_SIZE            (4096)
+#define THREAD_SIZE            (1<<CONFIG_STACK_SIZE_SHIFT)
+#define STACK_WARN             (CONFIG_STACK_WARN)
+#define STACK_PANIC            (0x200ul)
  
-#define STACK_WARN             (THREAD_SIZE/8)
  /*
   * macros/functions for gaining access to the thread information structure
   *
diff --git a/include/linux/.cvsignore b/include/linux/.cvsignore

new file mode 100644 (file)

index 0000000..c1cddb6
--- /dev/null
+++ b/include/linux/.cvsignore
@@ -0,0 +1,3 @@
+autoconf.h
+compile.h
+version.h
diff --git a/include/linux/ckrm-io.h b/include/linux/ckrm-io.h

index 36040b9..70277c7 100644 (file)
--- a/include/linux/ckrm-io.h
+++ b/include/linux/ckrm-io.h
@@ -34,6 +34,7 @@ typedef int (*icls_ioprio_t) (struct task_struct *tsk);
  
  extern void *cki_tsk_icls (struct task_struct *tsk);
  extern int cki_tsk_ioprio (struct task_struct *tsk);
+extern void *cki_tsk_cfqpriv (struct task_struct *tsk);
  
  #endif /* CONFIG_CKRM_RES_BLKIO */
  
diff --git a/include/linux/ckrm_classqueue.h b/include/linux/ckrm_classqueue.h

index 3041c81..1453f5e 100644 (file)
--- a/include/linux/ckrm_classqueue.h
+++ b/include/linux/ckrm_classqueue.h
@@ -19,8 +19,8 @@
   * Aug 28, 2003
   *        Created.
   * July 07, 2004
- *   clean up, add comments     
- *   
+ *   clean up, add comments
+ *
   */
  
  #ifndef _CKRM_CLASSQUEUE_H
diff --git a/include/linux/ckrm_sched.h b/include/linux/ckrm_sched.h

index 3611c2d..088e06c 100644 (file)
--- a/include/linux/ckrm_sched.h
+++ b/include/linux/ckrm_sched.h
@@ -62,7 +62,7 @@ struct ckrm_runqueue {
         CVT_t local_cvt;
  
         unsigned long lrq_load;
-       int local_weight; 
+       int local_weight;   
  
  
         /*
@@ -108,14 +108,14 @@ struct ckrm_cpu_class_stat {
  #define CKRM_CPU_CLASS_MAGIC 0x7af2abe3
  
  #define USAGE_SAMPLE_FREQ HZ  //sample every 1 seconds
-#define NS_PER_SAMPLE (USAGE_SAMPLE_FREQ*(NSEC_PER_SEC/HZ))
+#define NS_PER_SAMPLE      (USAGE_SAMPLE_FREQ*(NSEC_PER_SEC/HZ))
  #define USAGE_WINDOW_SIZE 60  //keep the last 60 sample
  
  struct ckrm_usage {
         unsigned long samples[USAGE_WINDOW_SIZE]; //record usages 
-       unsigned long sample_pointer; //pointer for the sliding window
-       unsigned long long last_ns; //ns for last sample
-       long long last_sample_jiffies; //in number of jiffies
+       unsigned long sample_pointer;  // pointer for the sliding window
+       unsigned long long last_ns;    // ns for last sample
+       long long last_sample_jiffies; // in number of jiffies
  };
  
  /*
@@ -247,7 +247,6 @@ void init_cpu_classes(void);
  void init_cpu_class(struct ckrm_cpu_class *cls,ckrm_shares_t* shares);
  void ckrm_cpu_change_class(void *task, void *old, void *new);
  
-
  #define CPU_DEMAND_ENQUEUE 0
  #define CPU_DEMAND_DEQUEUE 1
  #define CPU_DEMAND_DESCHEDULE 2
@@ -299,7 +298,7 @@ void adjust_local_weight(void);
  
  
  #define MAX_SAVINGS_ABSOLUTE (10LLU*NSEC_PER_SEC)  // 10 seconds
-
+ 
  #define CVT_UPDATE_TICK     ((HZ/2)?:1)
  
  // ABSOLUTE_CKRM_TUNING determines whether classes can make up
@@ -309,10 +308,10 @@ void adjust_local_weight(void);
  
  #ifdef ABSOLUTE_CKRM_TUNING
  
-#define MAX_SAVINGS        MAX_SAVINGS_ABSOLUTE
+#define MAX_SAVINGS          MAX_SAVINGS_ABSOLUTE
  //an absolute bonus of 200ms for classes when reactivated
  #define INTERACTIVE_BONUS(lrq) ((200*NSEC_PER_MS)/local_class_weight(lrq))
-#define SAVINGS_LEAK_SPEED (CVT_UPDATE_TICK/10*NSEC_PER_JIFFIES)
+#define SAVINGS_LEAK_SPEED   (CVT_UPDATE_TICK/10*NSEC_PER_JIFFIES)
  
  #define scale_cvt(val,lrq)   ((val)*local_class_weight(lrq))
  #define unscale_cvt(val,lrq) (do_div(val,local_class_weight(lrq)))
@@ -491,7 +490,7 @@ static inline void update_local_cvt(struct task_struct *p, unsigned long nsec)
  
         update_class_priority(lrq);
  }
-
+                                                                                
  static inline int class_preempts_curr(struct task_struct * p, struct task_struct* curr)
  {
         struct cq_node_struct* node1 = &(get_task_lrq(p)->classqueue_linkobj);
@@ -544,7 +543,7 @@ long pid_get_pressure(ckrm_load_t* ckrm_load, int local_group);
  static inline void ckrm_sched_tick(unsigned long j,int this_cpu,struct ckrm_load_struct* ckrm_load)
  {
         read_lock(&class_list_lock);
-       
+
  #ifdef CONFIG_SMP
         ckrm_load_sample(ckrm_load,this_cpu);
  #endif
diff --git a/include/linux/fs.h b/include/linux/fs.h

index ece31a7..11067b7 100644 (file)
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1603,6 +1603,15 @@ static inline void free_secdata(void *secdata)
  asmlinkage int sys_ioprio_set(int ioprio);
  asmlinkage int sys_ioprio_get(void);
  
+/* common structure for cfq & ckrm I/O controller */
+typedef struct cfqlim {
+       int nskip;
+       unsigned long navsec;
+       int timedout;
+       atomic_t sectorate;
+       u64 sec[2];
+} cfqlim_t ;
+
  
  #endif /* __KERNEL__ */
  #endif /* _LINUX_FS_H */
diff --git a/include/linux/kexec.h b/include/linux/kexec.h

new file mode 100644 (file)

index 0000000..8bd6c6b
--- /dev/null
+++ b/include/linux/kexec.h
@@ -0,0 +1,56 @@
+#ifndef LINUX_KEXEC_H
+#define LINUX_KEXEC_H
+
+#ifdef CONFIG_KEXEC
+#include <linux/types.h>
+#include <linux/list.h>
+#include <asm/kexec.h>
+
+/*
+ * This structure is used to hold the arguments that are used when loading
+ * kernel binaries.
+ */
+
+typedef unsigned long kimage_entry_t;
+#define IND_DESTINATION  0x1
+#define IND_INDIRECTION  0x2
+#define IND_DONE         0x4
+#define IND_SOURCE       0x8
+
+#define KEXEC_SEGMENT_MAX 8
+struct kexec_segment {
+       void *buf;
+       size_t bufsz;
+       void *mem;
+       size_t memsz;
+};
+
+struct kimage {
+       kimage_entry_t head;
+       kimage_entry_t *entry;
+       kimage_entry_t *last_entry;
+
+       unsigned long destination;
+
+       unsigned long start;
+       struct page *control_code_page;
+
+       unsigned long nr_segments;
+       struct kexec_segment segment[KEXEC_SEGMENT_MAX];
+
+       struct list_head control_pages;
+       struct list_head dest_pages;
+       struct list_head unuseable_pages;
+};
+
+
+/* kexec interface functions */
+extern void machine_kexec(struct kimage *image);
+extern int machine_kexec_prepare(struct kimage *image);
+extern void machine_kexec_cleanup(struct kimage *image);
+extern asmlinkage long sys_kexec(unsigned long entry, long nr_segments,
+       struct kexec_segment *segments);
+extern struct page *kimage_alloc_control_pages(struct kimage *image, unsigned int order);
+extern struct kimage *kexec_image;
+#endif
+#endif /* LINUX_KEXEC_H */
diff --git a/include/linux/mm.h b/include/linux/mm.h

index 3fb1893..83c64bb 100644 (file)
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -581,7 +581,7 @@ int clear_page_dirty_for_io(struct page *page);
   */
  typedef int (*shrinker_t)(int nr_to_scan, unsigned int gfp_mask);
  
-extern long do_mprotect(struct mm_struct *mm, unsigned long start, 
+asmlinkage long do_mprotect(struct mm_struct *mm, unsigned long start, 
                         size_t len, unsigned long prot);
  
  /*
diff --git a/include/linux/netfilter_ipv4/ip_conntrack.h b/include/linux/netfilter_ipv4/ip_conntrack.h

index a325de5..f2ded11 100644 (file)
--- a/include/linux/netfilter_ipv4/ip_conntrack.h
+++ b/include/linux/netfilter_ipv4/ip_conntrack.h
@@ -52,19 +52,23 @@ enum ip_conntrack_status {
  
  #include <linux/netfilter_ipv4/ip_conntrack_tcp.h>
  #include <linux/netfilter_ipv4/ip_conntrack_icmp.h>
+#include <linux/netfilter_ipv4/ip_conntrack_proto_gre.h>
  
  /* per conntrack: protocol private data */
  union ip_conntrack_proto {
         /* insert conntrack proto private data here */
+       struct ip_ct_gre gre;
         struct ip_ct_tcp tcp;
         struct ip_ct_icmp icmp;
  };
  
  union ip_conntrack_expect_proto {
         /* insert expect proto private data here */
+       struct ip_ct_gre_expect gre;
  };
  
  /* Add protocol helper include file here */
+#include <linux/netfilter_ipv4/ip_conntrack_pptp.h>
  #include <linux/netfilter_ipv4/ip_conntrack_amanda.h>
  #include <linux/netfilter_ipv4/ip_conntrack_ftp.h>
  #include <linux/netfilter_ipv4/ip_conntrack_irc.h>
@@ -72,6 +76,7 @@ union ip_conntrack_expect_proto {
  /* per expectation: application helper private data */
  union ip_conntrack_expect_help {
         /* insert conntrack helper private data (expect) here */
+       struct ip_ct_pptp_expect exp_pptp_info;
         struct ip_ct_amanda_expect exp_amanda_info;
         struct ip_ct_ftp_expect exp_ftp_info;
         struct ip_ct_irc_expect exp_irc_info;
@@ -86,16 +91,19 @@ union ip_conntrack_expect_help {
  /* per conntrack: application helper private data */
  union ip_conntrack_help {
         /* insert conntrack helper private data (master) here */
+       struct ip_ct_pptp_master ct_pptp_info;
         struct ip_ct_ftp_master ct_ftp_info;
         struct ip_ct_irc_master ct_irc_info;
  };
  
  #ifdef CONFIG_IP_NF_NAT_NEEDED
  #include <linux/netfilter_ipv4/ip_nat.h>
+#include <linux/netfilter_ipv4/ip_nat_pptp.h>
  
  /* per conntrack: nat application helper private data */
  union ip_conntrack_nat_help {
         /* insert nat helper private data here */
+       struct ip_nat_pptp nat_pptp_info;
  };
  #endif
  
@@ -157,6 +165,12 @@ struct ip_conntrack_expect
         union ip_conntrack_expect_help help;
  };
  
+struct ip_conntrack_counter
+{
+       u_int64_t packets;
+       u_int64_t bytes;
+};
+
  struct ip_conntrack_helper;
  
  struct ip_conntrack
@@ -174,6 +188,11 @@ struct ip_conntrack
         /* Timer function; drops refcnt when it goes off. */
         struct timer_list timeout;
  
+#ifdef CONFIG_IP_NF_CT_ACCT
+       /* Accounting Information (same cache line as other written members) */
+       struct ip_conntrack_counter counters[IP_CT_DIR_MAX];
+#endif
+
         /* If we're expecting another related connection, this will be
             in expected linked list */
         struct list_head sibling_list;
@@ -249,8 +268,10 @@ extern int invert_tuplepr(struct ip_conntrack_tuple *inverse,
                           const struct ip_conntrack_tuple *orig);
  
  /* Refresh conntrack for this many jiffies */
-extern void ip_ct_refresh(struct ip_conntrack *ct,
-                         unsigned long extra_jiffies);
+extern void ip_ct_refresh_acct(struct ip_conntrack *ct,
+                              enum ip_conntrack_info ctinfo,
+                              const struct sk_buff *skb,
+                              unsigned long extra_jiffies);
  
  /* These are for NAT.  Icky. */
  /* Call me when a conntrack is destroyed. */
diff --git a/include/linux/netfilter_ipv4/ip_conntrack_tuple.h b/include/linux/netfilter_ipv4/ip_conntrack_tuple.h

index 1e76911..d2bd0be 100644 (file)
--- a/include/linux/netfilter_ipv4/ip_conntrack_tuple.h
+++ b/include/linux/netfilter_ipv4/ip_conntrack_tuple.h
@@ -14,7 +14,7 @@
  union ip_conntrack_manip_proto
  {
         /* Add other protocols here. */
-       u_int16_t all;
+       u_int32_t all;
  
         struct {
                 u_int16_t port;
@@ -25,6 +25,9 @@ union ip_conntrack_manip_proto
         struct {
                 u_int16_t id;
         } icmp;
+       struct {
+               u_int32_t key;
+       } gre;
  };
  
  /* The manipulable part of the tuple. */
@@ -44,7 +47,7 @@ struct ip_conntrack_tuple
                 u_int32_t ip;
                 union {
                         /* Add other protocols here. */
-                       u_int16_t all;
+                       u_int32_t all;
  
                         struct {
                                 u_int16_t port;
@@ -55,6 +58,9 @@ struct ip_conntrack_tuple
                         struct {
                                 u_int8_t type, code;
                         } icmp;
+                       struct {
+                               u_int32_t key;
+                       } gre;
                 } u;
  
                 /* The protocol. */
@@ -80,10 +86,16 @@ enum ip_conntrack_dir
  #ifdef __KERNEL__
  
  #define DUMP_TUPLE(tp)                                         \
-DEBUGP("tuple %p: %u %u.%u.%u.%u:%hu -> %u.%u.%u.%u:%hu\n",    \
+DEBUGP("tuple %p: %u %u.%u.%u.%u:%u -> %u.%u.%u.%u:%u\n",      \
         (tp), (tp)->dst.protonum,                               \
-       NIPQUAD((tp)->src.ip), ntohs((tp)->src.u.all),          \
-       NIPQUAD((tp)->dst.ip), ntohs((tp)->dst.u.all))
+       NIPQUAD((tp)->src.ip), ntohl((tp)->src.u.all),          \
+       NIPQUAD((tp)->dst.ip), ntohl((tp)->dst.u.all))
+
+#define DUMP_TUPLE_RAW(x)                                              \
+       DEBUGP("tuple %p: %u %u.%u.%u.%u:0x%08x -> %u.%u.%u.%u:0x%08x\n",\
+       (x), (x)->dst.protonum,                                         \
+       NIPQUAD((x)->src.ip), ntohl((x)->src.u.all),                    \
+       NIPQUAD((x)->dst.ip), ntohl((x)->dst.u.all))
  
  #define CTINFO2DIR(ctinfo) ((ctinfo) >= IP_CT_IS_REPLY ? IP_CT_DIR_REPLY : IP_CT_DIR_ORIGINAL)
  
diff --git a/include/linux/reboot.h b/include/linux/reboot.h

index d60fafc..5460e94 100644 (file)
--- a/include/linux/reboot.h
+++ b/include/linux/reboot.h
@@ -51,6 +51,8 @@ extern void machine_restart(char *cmd);
  extern void machine_halt(void);
  extern void machine_power_off(void);
  
+extern void machine_shutdown(void);
+
  #endif
  
  #endif /* _LINUX_REBOOT_H */
diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h

index 111bb73..5156e43 100644 (file)
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -1106,6 +1106,20 @@ extern void             skb_copy_and_csum_dev(const struct sk_buff *skb, u8 *to);
  extern void           skb_split(struct sk_buff *skb,
                                  struct sk_buff *skb1, const u32 len);
  
+/* Return len bytes at offset: in place if inside the linear head, else copied to buffer. */
+static inline void *skb_header_pointer(const struct sk_buff *skb, int offset,
+                                      int len, void *buffer)
+{
+       int hlen = skb_headlen(skb);
+
+       /* Compare as hlen - offset so a large offset + len cannot overflow int. */
+       if (hlen - offset >= len)
+               return skb->data + offset;
+       if (skb_copy_bits(skb, offset, buffer, len) < 0)
+               return NULL;    /* requested range could not be copied */
+       return buffer;
+}
+
  extern void skb_init(void);
  extern void skb_add_mtu(int mtu);
  
diff --git a/include/linux/socket.h b/include/linux/socket.h

index 602d03b..f30253c 100644 (file)
--- a/include/linux/socket.h
+++ b/include/linux/socket.h
@@ -90,6 +90,10 @@ struct cmsghdr {
                                   (struct cmsghdr *)(ctl) : \
                                   (struct cmsghdr *)NULL)
  #define CMSG_FIRSTHDR(msg)     __CMSG_FIRSTHDR((msg)->msg_control, (msg)->msg_controllen)
+/*
+ * CMSG_OK - length sanity check for one control message header.
+ * True when (cmsg)->cmsg_len is at least sizeof(struct cmsghdr) and not
+ * larger than what is left of (mhdr)'s control buffer, i.e.
+ * msg_controllen minus the byte offset of (cmsg) from msg_control.
+ * Both arguments are expanded more than once.
+ * NOTE(review): assumes (cmsg) points inside the msg_control buffer --
+ * confirm at the call sites.
+ */
+#define CMSG_OK(mhdr, cmsg) ((cmsg)->cmsg_len >= sizeof(struct cmsghdr) && \
+                            (cmsg)->cmsg_len <= (unsigned long) \
+                            ((mhdr)->msg_controllen - \
+                             ((char *)(cmsg) - (char *)(mhdr)->msg_control)))
  
  /*
   *     This mess will go away with glibc
diff --git a/include/net/af_unix.h b/include/net/af_unix.h

index 79314ff..621c64f 100644 (file)
--- a/include/net/af_unix.h
+++ b/include/net/af_unix.h
@@ -43,16 +43,14 @@ static inline struct sock *first_unix_socket(int *i)
  #define forall_unix_sockets(i, s) \
         for (s = first_unix_socket(&(i)); s; s = next_unix_socket(&(i),(s)))
  
-struct unix_address
-{
+struct unix_address {
         atomic_t        refcnt;
         int             len;
         unsigned        hash;
         struct sockaddr_un name[0];
  };
  
-struct unix_skb_parms
-{
+struct unix_skb_parms {
         struct ucred            creds;          /* Skb credentials      */
         struct scm_fp_list      *fp;            /* Passed files         */
  };
@@ -74,6 +72,7 @@ struct unix_sock {
          struct dentry          *dentry;
          struct vfsmount                *mnt;
          struct semaphore        readsem;
+        struct sock            *peer;
          struct sock            *other;
          struct sock            *gc_tree;
          atomic_t                inflight;
diff --git a/init/Kconfig b/init/Kconfig

index 64ca2fc..5d28bb7 100644 (file)
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -214,6 +214,18 @@ config CKRM_MEM_LRUORDER_CHANGE
           Changing this to yes reduces the checking overhead but violates the approximate
           LRU order that is maintained by the paging subsystem.
  
+config CKRM_CPU_SCHEDULE_AT_BOOT
+       bool "Turn on at boot time"
+       depends on CKRM_CPU_SCHEDULE
+       default n
+       help
+         Enable the CKRM CPU scheduler at boot time. Otherwise
+         it can be turned on dynamically at runtime. If it is
+         not turned on, the default Linux scheduler behavior
+         applies.
+
+         Say N if unsure, Y to use this feature.
+
  config CKRM_TYPE_SOCKETCLASS
         bool "Class Manager for socket groups"
         depends on CKRM
diff --git a/kernel/.cvsignore b/kernel/.cvsignore

new file mode 100644 (file)

index 0000000..21426e9
--- /dev/null
+++ b/kernel/.cvsignore
@@ -0,0 +1,2 @@
+config_data.gz
+config_data.h
diff --git a/kernel/Makefile b/kernel/Makefile

index ec50010..455ec1e 100644 (file)
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -23,6 +23,7 @@ obj-$(CONFIG_MODULE_SIG) += module-verify.o
  obj-$(CONFIG_KALLSYMS) += kallsyms.o
  obj-$(CONFIG_PM) += power/
  obj-$(CONFIG_BSD_PROCESS_ACCT) += acct.o
+obj-$(CONFIG_KEXEC) += kexec.o
  obj-$(CONFIG_COMPAT) += compat.o
  obj-$(CONFIG_IKCONFIG) += configs.o
  obj-$(CONFIG_IKCONFIG_PROC) += configs.o
diff --git a/kernel/ckrm/Makefile b/kernel/ckrm/Makefile

index b325309..4956dcb 100644 (file)
--- a/kernel/ckrm/Makefile
+++ b/kernel/ckrm/Makefile
@@ -8,6 +8,6 @@ endif
      obj-$(CONFIG_CKRM_TYPE_TASKCLASS)  += ckrm_tc.o
      obj-$(CONFIG_CKRM_RES_NUMTASKS)    += ckrm_numtasks.o
      obj-$(CONFIG_CKRM_TYPE_SOCKETCLASS) += ckrm_sockc.o
-    obj-$(CONFIG_CKRM_RES_LISTENAQ)    += ckrm_laq.o
+    obj-$(CONFIG_CKRM_RES_LISTENAQ)    += ckrm_listenaq.o
      obj-$(CONFIG_CKRM_CPU_SCHEDULE)     += ckrm_cpu_class.o ckrm_cpu_monitor.o
      obj-$(CONFIG_CKRM_RES_MEM)                 += ckrm_mem.o
diff --git a/kernel/ckrm/ckrm.c b/kernel/ckrm/ckrm.c

index f1cfb26..34c531f 100644 (file)
--- a/kernel/ckrm/ckrm.c
+++ b/kernel/ckrm/ckrm.c
@@ -934,14 +934,37 @@ int ckrm_class_set_shares(struct ckrm_core_class *core, const char *resname,
         int rc;
  
         // Check for legal values
-       if (!legalshare(shares->my_guarantee) || !legalshare(shares->my_limit)
-           || !legalshare(shares->total_guarantee)
-           || !legalshare(shares->max_limit))
+       if (!legalshare(shares->my_guarantee)) {
+               printk("ckrm_class_set_shares: shares->my_guarantee invalid value (%d)\n",
+                      shares->my_guarantee);
                 return -EINVAL;
+       }
+
+       if(!legalshare(shares->my_limit)) {
+               printk("ckrm_class_set_shares: shares->my_limit invalid value (%d)\n",
+                      shares->my_limit);
+               return -EINVAL;
+       }
+
+       if(!legalshare(shares->total_guarantee)){
+               printk("ckrm_class_set_shares: shares->total_guarantee invalid value (%d)\n",
+                      shares->total_guarantee);
+               return -EINVAL;
+       }
+
+       if(!legalshare(shares->max_limit)) {
+               printk("ckrm_class_set_shares: shares->max_limit invalid value (%d)\n",
+                      shares->max_limit);
+               return -EINVAL;
+       }
  
         rcbs = ckrm_resctlr_lookup(clstype, resname);
-       if (rcbs == NULL || rcbs->set_share_values == NULL)
+       if (rcbs == NULL || rcbs->set_share_values == NULL) {
+               printk("ckrm_class_set_shares: resname=%s, rcbs=%p rcbs->set_shares_values == %p returning error\n",
+                      resname, rcbs, rcbs == NULL ? NULL : rcbs->set_share_values);
                 return -EINVAL;
+       }
+
         rc = (*rcbs->set_share_values) (core->res_class[rcbs->resid], shares);
         return rc;
  }
diff --git a/kernel/ckrm/ckrm_cpu_class.c b/kernel/ckrm/ckrm_cpu_class.c

index 917875b..b4604a7 100644 (file)
--- a/kernel/ckrm/ckrm_cpu_class.c
+++ b/kernel/ckrm/ckrm_cpu_class.c
@@ -52,31 +52,31 @@ void init_cpu_class(struct ckrm_cpu_class *cls,ckrm_shares_t* shares)
  
         for (i = 0 ; i < NR_CPUS ; i++) {
                 queue = &cls->local_queues[i];
-               queue->active  = queue->arrays;
-               queue->expired = queue->arrays+1;
-               
-               for (j = 0; j < 2; j++) {
+       queue->active   = queue->arrays;
+       queue->expired  = queue->arrays+1;
+       
+       for (j = 0; j < 2; j++) {
                         array = queue->arrays + j;
-                       for (k = 0; k < MAX_PRIO; k++) {
-                               INIT_LIST_HEAD(array->queue + k);
-                               __clear_bit(k, array->bitmap);
-                       }
-                       // delimiter for bitsearch
-                       __set_bit(MAX_PRIO, array->bitmap);
-                       array->nr_active = 0;
+               for (k = 0; k < MAX_PRIO; k++) {
+                       INIT_LIST_HEAD(array->queue + k);
+                       __clear_bit(k, array->bitmap);
                 }
-
-               queue->expired_timestamp = 0;
-               
-               queue->cpu_class = cls;
+               // delimiter for bitsearch
+               __set_bit(MAX_PRIO, array->bitmap);
+               array->nr_active = 0;
+       }
+       
+       queue->expired_timestamp = 0;
+       
+       queue->cpu_class = cls;
                 queue->classqueue = get_cpu_classqueue(i);
-               queue->top_priority = MAX_PRIO;
-               cq_node_init(&queue->classqueue_linkobj);
+       queue->top_priority = MAX_PRIO;
+       cq_node_init(&queue->classqueue_linkobj);
                 queue->local_cvt = 0;
-               queue->lrq_load = 0;
-               queue->local_weight = cpu_class_weight(cls);
-               queue->uncounted_ns = 0;
-               queue->savings = 0;
+       queue->lrq_load = 0;
+       queue->local_weight = cpu_class_weight(cls);
+       queue->uncounted_ns = 0;
+       queue->savings = 0;
                 queue->magic = 0x43FF43D7;
         }
  
@@ -100,7 +100,7 @@ struct ckrm_cpu_class * ckrm_get_cpu_class(struct ckrm_core_class *core)
  {
         struct ckrm_cpu_class * cls;
         cls = ckrm_get_res_class(core, cpu_rcbs.resid, struct ckrm_cpu_class);
-       if (valid_cpu_class(cls))
+       if (valid_cpu_class(cls))
                 return cls;
         else
                 return NULL;
@@ -128,7 +128,7 @@ void* ckrm_alloc_cpu_class(struct ckrm_core_class *core, struct ckrm_core_class
                         set_default_share(&shares);
                         init_cpu_class(cls,&shares);
                         cls->core = core;
-                       cls->parent = parent;
+                       cls->parent = parent;                   
                 }
         } else
                 printk(KERN_ERR"alloc_cpu_class failed\n");
@@ -194,8 +194,10 @@ int ckrm_cpu_set_share(void *my_res, struct ckrm_shares *new_share)
          struct ckrm_shares *cur = &cls->shares, *par;
          int rc = -EINVAL;
  
-        if (!cls) 
-                return rc;
+        if (!cls) {
+               printk("ckrm_cpu_set_share: cls == NULL\n");
+               return rc;
+       }
  
          if (cls->parent) {
                  parres = ckrm_get_cpu_class(cls->parent);
@@ -235,7 +237,7 @@ static int ckrm_cpu_get_share(void *my_res,
  {                      
         struct ckrm_cpu_class *cls = my_res;
  
-       if (!cls) 
+        if (!cls)
                 return -EINVAL;
         *shares = cls->shares;
         return 0;
@@ -371,7 +373,7 @@ void init_cpu_classes(void)
          *  required for E14/E15 since ckrm_init is called after sched_init
          */
         ckrm_alloc_cpu_class(NULL,NULL);
-}
+       }
  
  
  EXPORT_SYMBOL(ckrm_get_cpu_class);
diff --git a/kernel/ckrm/ckrm_cpu_monitor.c b/kernel/ckrm/ckrm_cpu_monitor.c

index d8c199a..a608f4e 100644 (file)
--- a/kernel/ckrm/ckrm_cpu_monitor.c
+++ b/kernel/ckrm/ckrm_cpu_monitor.c
@@ -93,7 +93,7 @@ void ckrm_cpu_stat_init(struct ckrm_cpu_class_stat *stat)
         stat->total_ns = 0;
         stat->max_demand = 0;
  
-       for (i=0; i< NR_CPUS; i++) {
+       for (i=0; i<NR_CPUS; i++) {
                 cpu_demand_stat_init(&stat->local_stats[i],CPU_DEMAND_TP_CLASS);
         }
  
@@ -831,7 +831,7 @@ static void adjust_lrq_weight(struct ckrm_cpu_class *clsptr, int cpu_online)
         int i;
         unsigned long class_weight;
         unsigned long long lw;  
-
+       
         //get total pressure
         for_each_online_cpu(i) {
                 lrq = get_ckrm_lrq(clsptr,i);
@@ -858,8 +858,8 @@ static void adjust_lrq_weight(struct ckrm_cpu_class *clsptr, int cpu_online)
                                 lw = 1;
                         else if (lw > CKRM_SHARE_MAX)
                                 lw = CKRM_SHARE_MAX;
-               }
-               
+               }       
+
                 lrq->local_weight = lw;
         }
  }
@@ -867,6 +867,7 @@ static void adjust_lrq_weight(struct ckrm_cpu_class *clsptr, int cpu_online)
  /*
   * assume called with class_list_lock read lock held
   */
+
  void adjust_local_weight(void)
  {
         static spinlock_t lock = SPIN_LOCK_UNLOCKED; 
diff --git a/kernel/ckrm/ckrm_laq.c b/kernel/ckrm/ckrm_laq.c

deleted file mode 100644 (file)

index b64205a..0000000
--- a/kernel/ckrm/ckrm_laq.c
+++ /dev/null
@@ -1,495 +0,0 @@
-/* ckrm_socketaq.c - accept queue resource controller
- *
- * Copyright (C) Vivek Kashyap,      IBM Corp. 2004
- * 
- * Latest version, more details at http://ckrm.sf.net
- * 
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- */
-
-/* Changes
- * Initial version
- */
-
-/* Code Description: TBD
- *
- */
-
-#include <linux/module.h>
-#include <linux/init.h>
-#include <linux/slab.h>
-#include <asm/errno.h>
-#include <linux/list.h>
-#include <linux/spinlock.h>
-#include <linux/ckrm.h>
-#include <linux/ckrm_rc.h>
-#include <net/tcp.h>
-
-#include <linux/ckrm_net.h>
-
-#define hnode_2_core(ptr) \
-        ((ptr) ? container_of(ptr, struct ckrm_core_class, hnode) : NULL)
-
-#define CKRM_SAQ_MAX_DEPTH     3       // 0 => /rcfs
-                                 // 1 => socket_aq
-                                 // 2 => socket_aq/listen_class
-                                 // 3 => socket_aq/listen_class/accept_queues
-                                 // 4 => Not allowed
-
-typedef struct ckrm_laq_res {
-       spinlock_t reslock;
-       atomic_t refcnt;
-       struct ckrm_shares shares;
-       struct ckrm_core_class *core;
-       struct ckrm_core_class *pcore;
-       int my_depth;
-       int my_id;
-       unsigned int min_ratio;
-} ckrm_laq_res_t;
-
-static int my_resid = -1;
-
-extern struct ckrm_core_class *rcfs_create_under_netroot(char *, int, int);
-extern struct ckrm_core_class *rcfs_make_core(struct dentry *,
-                                             struct ckrm_core_class *);
-
-void laq_res_hold(struct ckrm_laq_res *res)
-{
-       atomic_inc(&res->refcnt);
-       return;
-}
-
-void laq_res_put(struct ckrm_laq_res *res)
-{
-       if (atomic_dec_and_test(&res->refcnt))
-               kfree(res);
-       return;
-}
-
-/* Initialize rescls values
- */
-static void laq_res_initcls(void *my_res)
-{
-       ckrm_laq_res_t *res = my_res;
-
-       res->shares.my_guarantee = CKRM_SHARE_DONTCARE;
-       res->shares.my_limit = CKRM_SHARE_DONTCARE;
-       res->shares.total_guarantee = CKRM_SHARE_DFLT_TOTAL_GUARANTEE;
-       res->shares.max_limit = CKRM_SHARE_DFLT_MAX_LIMIT;
-       res->shares.unused_guarantee = CKRM_SHARE_DFLT_TOTAL_GUARANTEE;
-       res->shares.cur_max_limit = 0;
-}
-
-static int atoi(char *s)
-{
-       int k = 0;
-       while (*s)
-               k = *s++ - '0' + (k * 10);
-       return k;
-}
-
-static char *laq_get_name(struct ckrm_core_class *c)
-{
-       char *p = (char *)c->name;
-
-       while (*p)
-               p++;
-       while (*p != '/' && p != c->name)
-               p--;
-
-       return ++p;
-}
-
-static void *laq_res_alloc(struct ckrm_core_class *core,
-                          struct ckrm_core_class *parent)
-{
-       ckrm_laq_res_t *res, *pres;
-       int pdepth;
-
-       if (parent)
-               pres = ckrm_get_res_class(parent, my_resid, ckrm_laq_res_t);
-       else
-               pres = NULL;
-
-       if (core == core->classtype->default_class)
-               pdepth = 1;
-       else {
-               if (!parent)
-                       return NULL;
-               pdepth = 1 + pres->my_depth;
-       }
-
-       res = kmalloc(sizeof(ckrm_laq_res_t), GFP_ATOMIC);
-       if (res) {
-               memset(res, 0, sizeof(res));
-               spin_lock_init(&res->reslock);
-               laq_res_hold(res);
-               res->my_depth = pdepth;
-               if (pdepth == 2)        // listen class
-                       res->my_id = 0;
-               else if (pdepth == 3)
-                       res->my_id = atoi(laq_get_name(core));
-               res->core = core;
-               res->pcore = parent;
-
-               // rescls in place, now initialize contents other than 
-               // hierarchy pointers
-               laq_res_initcls(res);   // acts as initialising value
-       }
-
-       return res;
-}
-
-static void laq_res_free(void *my_res)
-{
-       ckrm_laq_res_t *res = (ckrm_laq_res_t *) my_res;
-       ckrm_laq_res_t *parent;
-
-       if (!res)
-               return;
-
-       if (res->my_depth != 3) {
-               kfree(res);
-               return;
-       }
-
-       parent = ckrm_get_res_class(res->pcore, my_resid, ckrm_laq_res_t);
-       if (!parent)            // Should never happen
-               return;
-
-       spin_lock(&parent->reslock);
-       spin_lock(&res->reslock);
-
-       // return child's guarantee to parent node
-       // Limits have no meaning for accept queue control
-       child_guarantee_changed(&parent->shares, res->shares.my_guarantee, 0);
-
-       spin_unlock(&res->reslock);
-       laq_res_put(res);
-       spin_unlock(&parent->reslock);
-       return;
-}
-
-/**************************************************************************
- *                     SHARES                                          ***
- **************************************************************************/
-
-void laq_set_aq_value(struct ckrm_net_struct *ns, unsigned int *aq_ratio)
-{
-       int i;
-       struct tcp_opt *tp;
-
-       tp = tcp_sk(ns->ns_sk);
-       for (i = 0; i < NUM_ACCEPT_QUEUES; i++)
-               tp->acceptq[i].aq_ratio = aq_ratio[i];
-       return;
-}
-void laq_set_aq_values(ckrm_laq_res_t * parent, unsigned int *aq_ratio)
-{
-
-       struct ckrm_net_struct *ns;
-       struct ckrm_core_class *core = parent->core;
-
-       class_lock(core);
-       list_for_each_entry(ns, &core->objlist, ckrm_link) {
-               laq_set_aq_value(ns, aq_ratio);
-       }
-       class_unlock(core);
-       return;
-}
-
-static void calculate_aq_ratios(ckrm_laq_res_t * res, unsigned int *aq_ratio)
-{
-       struct ckrm_hnode *chnode;
-       ckrm_laq_res_t *child;
-       unsigned int min;
-       int i;
-
-       min = aq_ratio[0] = (unsigned int)res->shares.unused_guarantee;
-
-       list_for_each_entry(chnode, &res->core->hnode.children, siblings) {
-               child = hnode_2_core(chnode)->res_class[my_resid];
-
-               aq_ratio[child->my_id] =
-                   (unsigned int)child->shares.my_guarantee;
-               if (aq_ratio[child->my_id] == CKRM_SHARE_DONTCARE)
-                       aq_ratio[child->my_id] = 0;
-               if (aq_ratio[child->my_id] &&
-                   ((unsigned int)aq_ratio[child->my_id] < min))
-                       min = (unsigned int)child->shares.my_guarantee;
-       }
-
-       if (min == 0) {
-               min = 1;
-               // default takes all if nothing specified
-               aq_ratio[0] = 1;        
-       }
-       res->min_ratio = min;
-
-       for (i = 0; i < NUM_ACCEPT_QUEUES; i++)
-               aq_ratio[i] = aq_ratio[i] / min;
-}
-
-static int laq_set_share_values(void *my_res, struct ckrm_shares *shares)
-{
-       ckrm_laq_res_t *res = my_res;
-       ckrm_laq_res_t *parent;
-       unsigned int aq_ratio[NUM_ACCEPT_QUEUES];
-       int rc = 0;
-
-       if (!res)
-               return -EINVAL;
-
-       if (!res->pcore) {
-               // something is badly wrong
-               printk(KERN_ERR "socketaq internal inconsistency\n");
-               return -EBADF;
-       }
-
-       parent = ckrm_get_res_class(res->pcore, my_resid, ckrm_laq_res_t);
-       if (!parent)            // socketclass does not have a share interface
-               return -EINVAL;
-
-       // Ensure that we ignore limit values
-       shares->my_limit = CKRM_SHARE_DONTCARE;
-       shares->max_limit = CKRM_SHARE_UNCHANGED;
-
-       if (res->my_depth == 0) {
-               printk(KERN_ERR "socketaq bad entry\n");
-               return -EBADF;
-       } else if (res->my_depth == 1) {
-               // can't be written to. This is an internal default.
-               return -EINVAL;
-       } else if (res->my_depth == 2) {
-               //nothin to inherit
-               if (!shares->total_guarantee) {
-                       return -EINVAL;
-               }
-               parent = res;
-               shares->my_guarantee = CKRM_SHARE_DONTCARE;
-       } else if (res->my_depth == 3) {
-               // accept queue itself. 
-               shares->total_guarantee = CKRM_SHARE_UNCHANGED;
-       }
-
-       ckrm_lock_hier(parent->pcore);
-       spin_lock(&parent->reslock);
-       rc = set_shares(shares, &res->shares,
-                       (parent == res) ? NULL : &parent->shares);
-       if (rc) {
-               spin_unlock(&res->reslock);
-               ckrm_unlock_hier(res->pcore);
-               return rc;
-       }
-       calculate_aq_ratios(parent, aq_ratio);
-       laq_set_aq_values(parent, aq_ratio);
-       spin_unlock(&parent->reslock);
-       ckrm_unlock_hier(parent->pcore);
-
-       return rc;
-}
-
-static int laq_get_share_values(void *my_res, struct ckrm_shares *shares)
-{
-       ckrm_laq_res_t *res = my_res;
-
-       if (!res)
-               return -EINVAL;
-       *shares = res->shares;
-       return 0;
-}
-
-/**************************************************************************
- *                     STATS                                           ***
- **************************************************************************/
-
-void
-laq_print_aq_stats(struct seq_file *sfile, struct tcp_acceptq_info *taq, int i)
-{
-       seq_printf(sfile, "Class %d connections:\n\taccepted: %u\n\t"
-                  "queued: %u\n\twait_time: %u\n",
-                  i, taq->acceptq_count, taq->acceptq_qcount,
-                  jiffies_to_msecs(taq->acceptq_wait_time));
-
-       if (i)
-               return;
-
-       for (i = 1; i < NUM_ACCEPT_QUEUES; i++) {
-               taq[0].acceptq_wait_time += taq[i].acceptq_wait_time;
-               taq[0].acceptq_qcount += taq[i].acceptq_qcount;
-               taq[0].acceptq_count += taq[i].acceptq_count;
-       }
-
-       seq_printf(sfile, "Totals :\n\taccepted: %u\n\t"
-                  "queued: %u\n\twait_time: %u\n",
-                  taq->acceptq_count, taq->acceptq_qcount,
-                  jiffies_to_msecs(taq->acceptq_wait_time));
-
-       return;
-}
-
-void
-laq_get_aq_stats(ckrm_laq_res_t * pres, ckrm_laq_res_t * mres,
-                struct tcp_acceptq_info *taq)
-{
-       struct ckrm_net_struct *ns;
-       struct ckrm_core_class *core = pres->core;
-       struct tcp_opt *tp;
-       int a = mres->my_id;
-       int z;
-
-       if (a == 0)
-               z = NUM_ACCEPT_QUEUES;
-       else
-               z = a + 1;
-
-       // XXX Instead of holding a  class_lock introduce a rw
-       // lock to be write locked by listen callbacks and read locked here.
-       // - VK
-       class_lock(pres->core);
-       list_for_each_entry(ns, &core->objlist, ckrm_link) {
-               tp = tcp_sk(ns->ns_sk);
-               for (; a < z; a++) {
-                       taq->acceptq_wait_time += tp->acceptq[a].aq_wait_time;
-                       taq->acceptq_qcount += tp->acceptq[a].aq_qcount;
-                       taq->acceptq_count += tp->acceptq[a].aq_count;
-                       taq++;
-               }
-       }
-       class_unlock(pres->core);
-}
-
-static int laq_get_stats(void *my_res, struct seq_file *sfile)
-{
-       ckrm_laq_res_t *res = my_res;
-       ckrm_laq_res_t *parent;
-       struct tcp_acceptq_info taq[NUM_ACCEPT_QUEUES];
-       int rc = 0;
-
-       if (!res)
-               return -EINVAL;
-
-       if (!res->pcore) {
-               // something is badly wrong
-               printk(KERN_ERR "socketaq internal inconsistency\n");
-               return -EBADF;
-       }
-
-       parent = ckrm_get_res_class(res->pcore, my_resid, ckrm_laq_res_t);
-       if (!parent) {          // socketclass does not have a stat interface
-               printk(KERN_ERR "socketaq internal fs inconsistency\n");
-               return -EINVAL;
-       }
-
-       memset(taq, 0, sizeof(struct tcp_acceptq_info) * NUM_ACCEPT_QUEUES);
-
-       switch (res->my_depth) {
-
-       default:
-       case 0:
-               printk(KERN_ERR "socket class bad entry\n");
-               rc = -EBADF;
-               break;
-
-       case 1:         // can't be read from. this is internal default.
-               // return -EINVAL
-               rc = -EINVAL;
-               break;
-
-       case 2:         // return the default and total
-               ckrm_lock_hier(res->core);      // block any deletes
-               laq_get_aq_stats(res, res, &taq[0]);
-               laq_print_aq_stats(sfile, &taq[0], 0);
-               ckrm_unlock_hier(res->core);    // block any deletes
-               break;
-
-       case 3:
-               ckrm_lock_hier(parent->core);   // block any deletes
-               laq_get_aq_stats(parent, res, &taq[res->my_id]);
-               laq_print_aq_stats(sfile, &taq[res->my_id], res->my_id);
-               ckrm_unlock_hier(parent->core); // block any deletes
-               break;
-       }
-
-       return rc;
-}
-
-/*
- * The network connection is reclassified to this class. Update its shares.
- * The socket lock is held. 
- */
-static void laq_change_resclass(void *n, void *old, void *r)
-{
-       struct ckrm_net_struct *ns = (struct ckrm_net_struct *)n;
-       struct ckrm_laq_res *res = (struct ckrm_laq_res *)r;
-       unsigned int aq_ratio[NUM_ACCEPT_QUEUES];
-
-       if (res->my_depth != 2)
-               return;
-
-       // a change to my_depth == 3 ie. the accept classes cannot happen.
-       // there is no target file
-       if (res->my_depth == 2) {       // it is one of the socket classes
-               ckrm_lock_hier(res->pcore);
-               // share rule: hold parent resource lock. then self.
-               // However, since my_depth == 1 is a generic class it is not
-               // needed here. Self lock is enough.
-               spin_lock(&res->reslock);
-               calculate_aq_ratios(res, aq_ratio);
-               class_lock(res->pcore);
-               laq_set_aq_value(ns, aq_ratio);
-               class_unlock(res->pcore);
-               spin_unlock(&res->reslock);
-               ckrm_unlock_hier(res->pcore);
-       }
-
-       return;
-}
-
-struct ckrm_res_ctlr laq_rcbs = {
-       .res_name = "laq",
-       .resid = -1,            // dynamically assigned
-       .res_alloc = laq_res_alloc,
-       .res_free = laq_res_free,
-       .set_share_values = laq_set_share_values,
-       .get_share_values = laq_get_share_values,
-       .get_stats = laq_get_stats,
-       .change_resclass = laq_change_resclass,
-       //.res_initcls       = laq_res_initcls,  //HUBERTUS: unnecessary !!
-};
-
-int __init init_ckrm_laq_res(void)
-{
-       struct ckrm_classtype *clstype;
-       int resid;
-
-       clstype = ckrm_find_classtype_by_name("socketclass");
-       if (clstype == NULL) {
-               printk(KERN_INFO " Unknown ckrm classtype<socketclass>");
-               return -ENOENT;
-       }
-
-       if (my_resid == -1) {
-               resid = ckrm_register_res_ctlr(clstype, &laq_rcbs);
-               if (resid >= 0)
-                       my_resid = resid;
-               printk(KERN_DEBUG "........init_ckrm_listen_aq_res -> %d\n", my_resid);
-       }
-       return 0;
-
-}
-
-void __exit exit_ckrm_laq_res(void)
-{
-       ckrm_unregister_res_ctlr(&laq_rcbs);
-       my_resid = -1;
-}
-
-module_init(init_ckrm_laq_res)
-    module_exit(exit_ckrm_laq_res)
-
-    MODULE_LICENSE("GPL");
diff --git a/kernel/ckrm/ckrm_listenaq.c b/kernel/ckrm/ckrm_listenaq.c

index 0fe8586..103e3f9 100644 (file)
--- a/kernel/ckrm/ckrm_listenaq.c
+++ b/kernel/ckrm/ckrm_listenaq.c
@@ -1,4 +1,4 @@
-/* ckrm_socketaq.c - accept queue resource controller
+/* ckrm_listenaq.c - accept queue resource controller
   *
   * Copyright (C) Vivek Kashyap,      IBM Corp. 2004
   * 
@@ -251,7 +251,7 @@ static int laq_set_share_values(void *my_res, struct ckrm_shares *shares)
         }
  
         parent = ckrm_get_res_class(res->pcore, my_resid, ckrm_laq_res_t);
-       if (!parent)            // socket_class does not have a share interface
+       if (!parent)            // socketclass does not have a share interface
                 return -EINVAL;
  
         // Ensure that we ignore limit values
@@ -380,7 +380,7 @@ static int laq_get_stats(void *my_res, struct seq_file *sfile)
         }
  
         parent = ckrm_get_res_class(res->pcore, my_resid, ckrm_laq_res_t);
-       if (!parent) {          // socket_class does not have a stat interface
+       if (!parent) {          // socketclass does not have a stat interface
                 printk(KERN_ERR "socketaq internal fs inconsistency\n");
                 return -EINVAL;
         }
@@ -451,7 +451,7 @@ static void laq_change_resclass(void *n, void *old, void *r)
  }
  
  struct ckrm_res_ctlr laq_rcbs = {
-       .res_name = "laq",
+       .res_name = "listenaq",
         .resid = -1,            // dynamically assigned
         .res_alloc = laq_res_alloc,
         .res_free = laq_res_free,
@@ -467,9 +467,9 @@ int __init init_ckrm_laq_res(void)
         struct ckrm_classtype *clstype;
         int resid;
  
-       clstype = ckrm_find_classtype_by_name("socket_class");
+       clstype = ckrm_find_classtype_by_name("socketclass");
         if (clstype == NULL) {
-               printk(KERN_INFO " Unknown ckrm classtype<socket_class>");
+               printk(KERN_INFO " Unknown ckrm classtype<socketclass>");
                 return -ENOENT;
         }
  
diff --git a/kernel/ckrm/ckrmutils.c b/kernel/ckrm/ckrmutils.c

index d54e7b5..44522d6 100644 (file)
--- a/kernel/ckrm/ckrmutils.c
+++ b/kernel/ckrm/ckrmutils.c
@@ -106,22 +106,35 @@ set_shares(struct ckrm_shares *new, struct ckrm_shares *cur,
  {
         int rc = -EINVAL;
         int cur_usage_guar = cur->total_guarantee - cur->unused_guarantee;
-       int increase_by = new->my_guarantee - cur->my_guarantee;
+       int increase_by;
+
+       if (cur->my_guarantee < 0) // DONTCARE or UNCHANGED
+               increase_by = new->my_guarantee;
+       else
+               increase_by = new->my_guarantee - cur->my_guarantee;
  
         // Check total_guarantee for correctness
         if (new->total_guarantee <= CKRM_SHARE_DONTCARE) {
+               printk("set_shares: new->total_guarantee (%d) <= CKRM_SHARE_DONTCARE\n",
+                      new->total_guarantee);
                 goto set_share_err;
         } else if (new->total_guarantee == CKRM_SHARE_UNCHANGED) {
                 ;               // do nothing
         } else if (cur_usage_guar > new->total_guarantee) {
+               printk("set_shares: cur_usage_guar(%d) > new->total_guarantee (%d)\n",
+                      cur_usage_guar, new->total_guarantee);
                 goto set_share_err;
         }
         // Check max_limit for correctness
         if (new->max_limit <= CKRM_SHARE_DONTCARE) {
+               printk("set_shares: new->max_limit (%d) <= CKRM_SHARE_DONTCARE\n",
+                      new->max_limit);
                 goto set_share_err;
         } else if (new->max_limit == CKRM_SHARE_UNCHANGED) {
                 ;               // do nothing
         } else if (cur->cur_max_limit > new->max_limit) {
+               printk("set_shares: cur->cur_max_limit (%d) > new->max_limit (%d)\n",
+                      cur->cur_max_limit,new->max_limit);
                 goto set_share_err;
         }
         // Check my_guarantee for correctness
@@ -130,6 +143,8 @@ set_shares(struct ckrm_shares *new, struct ckrm_shares *cur,
         } else if (new->my_guarantee == CKRM_SHARE_DONTCARE) {
                 ;               // do nothing
         } else if (par && increase_by > par->unused_guarantee) {
+               printk("set_shares: increase_by (%d) > par->unused_guarantee (%d)\n",
+                      increase_by, par->unused_guarantee);
                 goto set_share_err;
         }
         // Check my_limit for correctness
@@ -139,8 +154,9 @@ set_shares(struct ckrm_shares *new, struct ckrm_shares *cur,
                 ;               // do nothing
         } else if (par && new->my_limit > par->max_limit) {
                 // I can't get more limit than my parent's limit
+               printk("set_shares: new->my_limit (%d) > par->max_limit (%d)\n",
+                      new->my_limit,par->max_limit);
                 goto set_share_err;
-
         }
         // make sure guarantee is lesser than limit
         if (new->my_limit == CKRM_SHARE_DONTCARE) {
@@ -152,6 +168,8 @@ set_shares(struct ckrm_shares *new, struct ckrm_shares *cur,
                         ;       // do nothing earlier setting would've 
                                 // taken care of it
                 } else if (new->my_guarantee > cur->my_limit) {
+                       printk("set_shares: new->my_guarantee (%d) > cur->my_limit (%d)\n",
+                              new->my_guarantee,cur->my_limit);
                         goto set_share_err;
                 }
         } else {                // new->my_limit has a valid value
@@ -159,9 +177,13 @@ set_shares(struct ckrm_shares *new, struct ckrm_shares *cur,
                         ;       // do nothing
                 } else if (new->my_guarantee == CKRM_SHARE_UNCHANGED) {
                         if (cur->my_guarantee > new->my_limit) {
+                               printk("set_shares: cur->my_guarantee (%d) > new->my_limit (%d)\n",
+                                      cur->my_guarantee,new->my_limit);
                                 goto set_share_err;
                         }
                 } else if (new->my_guarantee > new->my_limit) {
+                       printk("set_shares: new->my_guarantee (%d) > new->my_limit (%d)\n",
+                              new->my_guarantee,new->my_limit);
                         goto set_share_err;
                 }
         }
diff --git a/kernel/ckrm_classqueue.c b/kernel/ckrm_classqueue.c

index 0400844..80d5d49 100644 (file)
--- a/kernel/ckrm_classqueue.c
+++ b/kernel/ckrm_classqueue.c
@@ -87,7 +87,7 @@ void classqueue_enqueue(struct classqueue_struct *cq,
         int index;
  
         //get real index
-       if (cq_nr_member(cq)) {
+       if (cq_nr_member(cq)) {         
                 index = get_index(cq, &prio);
         } else {                //the first one
                 cq->base = prio;
diff --git a/kernel/ckrm_sched.c b/kernel/ckrm_sched.c

index 5142b2e..7ed70d0 100644 (file)
--- a/kernel/ckrm_sched.c
+++ b/kernel/ckrm_sched.c
@@ -54,15 +54,15 @@ static inline void check_inactive_class(ckrm_lrq_t * lrq,CVT_t cur_cvt)
                 min_cvt = cur_cvt - bonus;
         else
                 min_cvt = 0;
-       
-       if (lrq->local_cvt < min_cvt) {
+
+       if (lrq->local_cvt < min_cvt) { 
                 CVT_t lost_cvt;
  
                 lost_cvt = scale_cvt(min_cvt - lrq->local_cvt,lrq);
                 lrq->local_cvt = min_cvt;
  
                 /* add what the class lost to its savings*/
-               lrq->savings += lost_cvt;
+               lrq->savings += lost_cvt;              
                 if (lrq->savings > MAX_SAVINGS)
                         lrq->savings = MAX_SAVINGS; 
         } else if (lrq->savings) {
@@ -88,7 +88,7 @@ static inline void check_inactive_class(ckrm_lrq_t * lrq,CVT_t cur_cvt)
  #else
                 lrq->local_cvt -= savings_used;
  #endif
-       }               
+       }
  }
  
  /*
@@ -186,6 +186,7 @@ void update_class_cputime(int this_cpu)
  /**
   * sample pid load periodically
   */
+
  void ckrm_load_sample(ckrm_load_t* pid,int cpu)
  {
         long load;
diff --git a/kernel/kexec.c b/kernel/kexec.c

new file mode 100644 (file)

index 0000000..b59023f
--- /dev/null
+++ b/kernel/kexec.c
@@ -0,0 +1,640 @@
+/*
+ * kexec.c - kexec system call
+ * Copyright (C) 2002-2004 Eric Biederman  <ebiederm@xmission.com>
+ *
+ * This source code is licensed under the GNU General Public License,
+ * Version 2.  See the file COPYING for more details.
+ */
+
+#include <linux/mm.h>
+#include <linux/file.h>
+#include <linux/slab.h>
+#include <linux/fs.h>
+#include <linux/kexec.h>
+#include <linux/spinlock.h>
+#include <linux/list.h>
+#include <linux/highmem.h>
+#include <net/checksum.h>
+#include <asm/page.h>
+#include <asm/uaccess.h>
+#include <asm/io.h>
+#include <asm/system.h>
+
+/*
+ * When kexec transitions to the new kernel there is a one-to-one
+ * mapping between physical and virtual addresses.  On processors
+ * where you can disable the MMU this is trivial, and easy.  For
+ * others it is still a simple predictable page table to set up.
+ *
+ * In that environment kexec copies the new kernel to its final
+ * resting place.  This means I can only support memory whose
+ * physical address can fit in an unsigned long.  In particular
+ * addresses where (pfn << PAGE_SHIFT) > ULONG_MAX cannot be handled.
+ * If the assembly stub has more restrictive requirements
+ * KEXEC_SOURCE_MEMORY_LIMIT and KEXEC_DEST_MEMORY_LIMIT can be
+ * defined more restrictively in <asm/kexec.h>.
+ *
+ * The code for the transition from the current kernel to the
+ * new kernel is placed in the control_code_buffer, whose size
+ * is given by KEXEC_CONTROL_CODE_SIZE.  In the best case only a single
+ * page of memory is necessary, but some architectures require more.
+ * Because this memory must be identity mapped in the transition from
+ * virtual to physical addresses it must live in the range
+ * 0 - TASK_SIZE, as only the user space mappings are arbitrarily
+ * modifiable.
+ *
+ * The assembly stub in the control code buffer is passed a linked list
+ * of descriptor pages detailing the source pages of the new kernel,
+ * and the destination addresses of those source pages.  As this data
+ * structure is not used in the context of the current OS, it must
+ * be self-contained.
+ *
+ * The code has been made to work with highmem pages and will use a
+ * destination page in its final resting place (if it happens
+ * to allocate it).  The end product of this is that most of the
+ * physical address space, and most of RAM can be used.
+ *
+ * Future directions include:
+ *  - allocating a page table with the control code buffer identity
+ *    mapped, to simplify machine_kexec and make kexec_on_panic more
+ *    reliable.
+ */
+
+/*
+ * KIMAGE_NO_DEST is an impossible destination address, used when
+ * allocating pages whose destination address we do not care about.
+ */
+#define KIMAGE_NO_DEST (-1UL)
+
+static int kimage_is_destination_range(
+       struct kimage *image, unsigned long start, unsigned long end);
+static struct page *kimage_alloc_page(struct kimage *image, unsigned int gfp_mask, unsigned long dest);
+
+
+static int kimage_alloc(struct kimage **rimage,
+       unsigned long nr_segments, struct kexec_segment *segments)
+{
+       int result;
+       struct kimage *image;
+       size_t segment_bytes;
+       unsigned long i;
+
+       /* Allocate a controlling structure */
+       result = -ENOMEM;
+       image = kmalloc(sizeof(*image), GFP_KERNEL);
+       if (!image) {
+               goto out;
+       }
+       memset(image, 0, sizeof(*image));
+       image->head = 0;
+       image->entry = &image->head;
+       image->last_entry = &image->head;
+
+       /* Initialize the list of control pages */
+       INIT_LIST_HEAD(&image->control_pages);
+
+       /* Initialize the list of destination pages */
+       INIT_LIST_HEAD(&image->dest_pages);
+
+       /* Initialize the list of unuseable pages */
+       INIT_LIST_HEAD(&image->unuseable_pages);
+
+       /* Read in the segments */
+       image->nr_segments = nr_segments;
+       segment_bytes = nr_segments * sizeof*segments;
+       result = copy_from_user(image->segment, segments, segment_bytes);
+       if (result)
+               goto out;
+
+       /*
+        * Verify we have good destination addresses.  The caller is
+        * responsible for making certain we don't attempt to load
+        * the new image into invalid or reserved areas of RAM.  This
+        * just verifies it is an address we can use.
+        */
+       result = -EADDRNOTAVAIL;
+       for (i = 0; i < nr_segments; i++) {
+               unsigned long mend;
+               mend = ((unsigned long)(image->segment[i].mem)) +
+                       image->segment[i].memsz;
+               if (mend >= KEXEC_DESTINATION_MEMORY_LIMIT)
+                       goto out;
+       }
+
+       /*
+        * Find a location for the control code buffer, and add it to
+        * the vector of segments so that its pages will also be
+        * counted as destination pages.
+        */
+       result = -ENOMEM;
+       image->control_code_page = kimage_alloc_control_pages(image,
+               get_order(KEXEC_CONTROL_CODE_SIZE));
+       if (!image->control_code_page) {
+               printk(KERN_ERR "Could not allocate control_code_buffer\n");
+               goto out;
+       }
+
+       result = 0;
+ out:
+       if (result == 0) {
+               *rimage = image;
+       } else {
+               kfree(image);
+       }
+       return result;
+}
+
+static int kimage_is_destination_range(
+       struct kimage *image, unsigned long start, unsigned long end)
+{
+       unsigned long i;
+
+       for (i = 0; i < image->nr_segments; i++) {
+               unsigned long mstart, mend;
+               mstart = (unsigned long)image->segment[i].mem;
+               mend   = mstart + image->segment[i].memsz;
+               if ((end > mstart) && (start < mend)) {
+                       return 1;
+               }
+       }
+       return 0;
+}
+
+static struct page *kimage_alloc_pages(unsigned int gfp_mask, unsigned int order)
+{
+       struct page *pages;
+       pages = alloc_pages(gfp_mask, order);
+       if (pages) {
+               unsigned int count, i;
+               pages->mapping = NULL;
+               pages->private = order;
+               count = 1 << order;
+               for(i = 0; i < count; i++) {
+                       SetPageReserved(pages + i);
+               }
+       }
+       return pages;
+}
+
+static void kimage_free_pages(struct page *page)
+{
+       unsigned int order, count, i;
+       order = page->private;
+       count = 1 << order;
+       for(i = 0; i < count; i++) {
+               ClearPageReserved(page + i);
+       }
+       __free_pages(page, order);
+}
+
+static void kimage_free_page_list(struct list_head *list)
+{
+       struct list_head *pos, *next;
+       list_for_each_safe(pos, next, list) {
+               struct page *page;
+
+               page = list_entry(pos, struct page, lru);
+               list_del(&page->lru);
+
+               kimage_free_pages(page);
+       }
+}
+
+struct page *kimage_alloc_control_pages(struct kimage *image, unsigned int order)
+{
+       /* Control pages are special, they are the intermediaries
+        * that are needed while we copy the rest of the pages
+        * to their final resting place.  As such they must
+        * not conflict with either the destination addresses
+        * or memory the kernel is already using.
+        *
+        * The only case where we really need more than one of
+        * these is for architectures where we cannot disable
+        * the MMU and must instead generate an identity mapped
+        * page table for all of the memory.
+        *
+        * At worst this runs in O(N) of the image size.
+        */
+       struct list_head extra_pages;
+       struct page *pages;
+       unsigned int count;
+
+       count = 1 << order;
+       INIT_LIST_HEAD(&extra_pages);
+
+       /* Loop while I can allocate a page and the page allocated
+        * is a destination page.
+        */
+       do {
+               unsigned long pfn, epfn, addr, eaddr;
+               pages = kimage_alloc_pages(GFP_KERNEL, order);
+               if (!pages)
+                       break;
+               pfn   = page_to_pfn(pages);
+               epfn  = pfn + count;
+               addr  = pfn << PAGE_SHIFT;
+               eaddr = epfn << PAGE_SHIFT;
+               if ((epfn >= (KEXEC_CONTROL_MEMORY_LIMIT >> PAGE_SHIFT)) ||
+                       kimage_is_destination_range(image, addr, eaddr))
+               {
+                       list_add(&pages->lru, &extra_pages);
+                       pages = NULL;
+               }
+       } while(!pages);
+       if (pages) {
+               /* Remember the allocated page... */
+               list_add(&pages->lru, &image->control_pages);
+
+               /* Because the page is already in its destination
+                * location we will never allocate another page at
+                * that address.  Therefore kimage_alloc_pages
+                * will not return it (again) and we don't need
+                * to give it an entry in image->segment[].
+                */
+       }
+       /* Deal with the destination pages I have inadvertently allocated.
+        *
+        * Ideally I would convert multi-page allocations into single
+        * page allocations, and add everything to image->dest_pages.
+        *
+        * For now it is simpler to just free the pages.
+        */
+       kimage_free_page_list(&extra_pages);
+       return pages;
+
+}
+
+static int kimage_add_entry(struct kimage *image, kimage_entry_t entry)
+{
+       if (*image->entry != 0) {
+               image->entry++;
+       }
+       if (image->entry == image->last_entry) {
+               kimage_entry_t *ind_page;
+               struct page *page;
+               page = kimage_alloc_page(image, GFP_KERNEL, KIMAGE_NO_DEST);
+               if (!page) {
+                       return -ENOMEM;
+               }
+               ind_page = page_address(page);
+               *image->entry = virt_to_phys(ind_page) | IND_INDIRECTION;
+               image->entry = ind_page;
+               image->last_entry =
+                       ind_page + ((PAGE_SIZE/sizeof(kimage_entry_t)) - 1);
+       }
+       *image->entry = entry;
+       image->entry++;
+       *image->entry = 0;
+       return 0;
+}
+
+static int kimage_set_destination(
+       struct kimage *image, unsigned long destination)
+{
+       int result;
+
+       destination &= PAGE_MASK;
+       result = kimage_add_entry(image, destination | IND_DESTINATION);
+       if (result == 0) {
+               image->destination = destination;
+       }
+       return result;
+}
+
+
+static int kimage_add_page(struct kimage *image, unsigned long page)
+{
+       int result;
+
+       page &= PAGE_MASK;
+       result = kimage_add_entry(image, page | IND_SOURCE);
+       if (result == 0) {
+               image->destination += PAGE_SIZE;
+       }
+       return result;
+}
+
+
+static void kimage_free_extra_pages(struct kimage *image)
+{
+       /* Walk through and free any extra destination pages I may have */
+       kimage_free_page_list(&image->dest_pages);
+
+       /* Walk through and free any unuseable pages I have cached */
+       kimage_free_page_list(&image->unuseable_pages);
+
+}
+static int kimage_terminate(struct kimage *image)
+{
+       int result;
+
+       result = kimage_add_entry(image, IND_DONE);
+       if (result == 0) {
+               /* Point at the terminating element */
+               image->entry--;
+               kimage_free_extra_pages(image);
+       }
+       return result;
+}
+
+#define for_each_kimage_entry(image, ptr, entry) \
+       for (ptr = &image->head; (entry = *ptr) && !(entry & IND_DONE); \
+               ptr = (entry & IND_INDIRECTION)? \
+                       phys_to_virt((entry & PAGE_MASK)): ptr +1)
+
+static void kimage_free_entry(kimage_entry_t entry)
+{
+       struct page *page;
+
+       page = pfn_to_page(entry >> PAGE_SHIFT);
+       kimage_free_pages(page);
+}
+
+static void kimage_free(struct kimage *image)
+{
+       kimage_entry_t *ptr, entry;
+       kimage_entry_t ind = 0;
+
+       if (!image)
+               return;
+       kimage_free_extra_pages(image);
+       for_each_kimage_entry(image, ptr, entry) {
+               if (entry & IND_INDIRECTION) {
+                       /* Free the previous indirection page */
+                       if (ind & IND_INDIRECTION) {
+                               kimage_free_entry(ind);
+                       }
+                       /* Save this indirection page until we are
+                        * done with it.
+                        */
+                       ind = entry;
+               }
+               else if (entry & IND_SOURCE) {
+                       kimage_free_entry(entry);
+               }
+       }
+       /* Free the final indirection page */
+       if (ind & IND_INDIRECTION) {
+               kimage_free_entry(ind);
+       }
+
+       /* Handle any machine specific cleanup */
+       machine_kexec_cleanup(image);
+
+       /* Free the kexec control pages... */
+       kimage_free_page_list(&image->control_pages);
+       kfree(image);
+}
+
+static kimage_entry_t *kimage_dst_used(struct kimage *image, unsigned long page)
+{
+       kimage_entry_t *ptr, entry;
+       unsigned long destination = 0;
+
+       for_each_kimage_entry(image, ptr, entry) {
+               if (entry & IND_DESTINATION) {
+                       destination = entry & PAGE_MASK;
+               }
+               else if (entry & IND_SOURCE) {
+                       if (page == destination) {
+                               return ptr;
+                       }
+                       destination += PAGE_SIZE;
+               }
+       }
+       return 0;
+}
+
+static struct page *kimage_alloc_page(struct kimage *image, unsigned int gfp_mask, unsigned long destination)
+{
+       /*
+        * Here we implement safeguards to ensure that a source page
+        * is not copied to its destination page before the data on
+        * the destination page is no longer useful.
+        *
+        * To do this we maintain the invariant that a source page is
+        * either its own destination page, or it is not a
+        * destination page at all.
+        *
+        * That is slightly stronger than required, but the proof
+        * that no problems will occur is trivial, and the
+        * implementation is simple to verify.
+        *
+        * When allocating all pages normally this algorithm will run
+        * in O(N) time, but in the worst case it will run in O(N^2)
+        * time.   If the runtime is a problem the data structures can
+        * be fixed.
+        */
+       struct page *page;
+       unsigned long addr;
+
+       /*
+        * Walk through the list of destination pages, and see if I
+        * have a match.
+        */
+       list_for_each_entry(page, &image->dest_pages, lru) {
+               addr = page_to_pfn(page) << PAGE_SHIFT;
+               if (addr == destination) {
+                       list_del(&page->lru);
+                       return page;
+               }
+       }
+       page = NULL;
+       while (1) {
+               kimage_entry_t *old;
+
+               /* Allocate a page, if we run out of memory give up */
+               page = kimage_alloc_pages(gfp_mask, 0);
+               if (!page) {
+                       return 0;
+               }
+               /* If the page cannot be used file it away */
+               if (page_to_pfn(page) > (KEXEC_SOURCE_MEMORY_LIMIT >> PAGE_SHIFT)) {
+                       list_add(&page->lru, &image->unuseable_pages);
+                       continue;
+               }
+               addr = page_to_pfn(page) << PAGE_SHIFT;
+
+               /* If it is the destination page we want use it */
+               if (addr == destination)
+                       break;
+
+               /* If the page is not a destination page use it */
+               if (!kimage_is_destination_range(image, addr, addr + PAGE_SIZE))
+                       break;
+
+               /*
+                * I know that the page is someone's destination page.
+                * See if there is already a source page for this
+                * destination page.  And if so swap the source pages.
+                */
+               old = kimage_dst_used(image, addr);
+               if (old) {
+                       /* If so move it */
+                       unsigned long old_addr;
+                       struct page *old_page;
+
+                       old_addr = *old & PAGE_MASK;
+                       old_page = pfn_to_page(old_addr >> PAGE_SHIFT);
+                       copy_highpage(page, old_page);
+                       *old = addr | (*old & ~PAGE_MASK);
+
+                       /* The old page I have found cannot be a
+                        * destination page, so return it.
+                        */
+                       addr = old_addr;
+                       page = old_page;
+                       break;
+               }
+               else {
+                       /* Place the page on the destination list; I
+                        * will use it later.
+                        */
+                       list_add(&page->lru, &image->dest_pages);
+               }
+       }
+       return page;
+}
+
+static int kimage_load_segment(struct kimage *image,
+       struct kexec_segment *segment)
+{
+       unsigned long mstart;
+       int result;
+       unsigned long offset;
+       unsigned long offset_end;
+       unsigned char *buf;
+
+       result = 0;
+       buf = segment->buf;
+       mstart = (unsigned long)segment->mem;
+
+       offset_end = segment->memsz;
+
+       result = kimage_set_destination(image, mstart);
+       if (result < 0) {
+               goto out;
+       }
+       for (offset = 0;  offset < segment->memsz; offset += PAGE_SIZE) {
+               struct page *page;
+               char *ptr;
+               size_t size, leader;
+               page = kimage_alloc_page(image, GFP_HIGHUSER, mstart + offset);
+               if (page == 0) {
+                       result  = -ENOMEM;
+                       goto out;
+               }
+               result = kimage_add_page(image, page_to_pfn(page) << PAGE_SHIFT);
+               if (result < 0) {
+                       goto out;
+               }
+               ptr = kmap(page);
+               if (segment->bufsz < offset) {
+                       /* We are past the end; zero the whole page */
+                       memset(ptr, 0, PAGE_SIZE);
+                       kunmap(page);
+                       continue;
+               }
+               size = PAGE_SIZE;
+               leader = 0;
+               if ((offset == 0)) {
+                       leader = mstart & ~PAGE_MASK;
+               }
+               if (leader) {
+                       /* We are on the first page; zero the unused portion */
+                       memset(ptr, 0, leader);
+                       size -= leader;
+                       ptr += leader;
+               }
+               if (size > (segment->bufsz - offset)) {
+                       size = segment->bufsz - offset;
+               }
+               if (size < (PAGE_SIZE - leader)) {
+                       /* zero the trailing part of the page */
+                       memset(ptr + size, 0, (PAGE_SIZE - leader) - size);
+               }
+               result = copy_from_user(ptr, buf + offset, size);
+               kunmap(page);
+               if (result) {
+                       result = (result < 0) ? result : -EIO;
+                       goto out;
+               }
+       }
+ out:
+       return result;
+}
+
+/*
+ * Exec Kernel system call: for obvious reasons only root may call it.
+ *
+ * This call breaks up into three pieces.
+ * - A generic part which loads the new kernel from the current
+ *   address space, and very carefully places the data in the
+ *   allocated pages.
+ *
+ * - A generic part that interacts with the kernel and tells all of
+ *   the devices to shut down, preventing ongoing DMAs and placing
+ *   the devices in a consistent state so a later kernel can
+ *   reinitialize them.
+ *
+ * - A machine specific part that includes the syscall number
+ *   and then copies the image to its final destination, and
+ *   jumps into the image at entry.
+ *
+ * kexec does not sync, or unmount filesystems so if you need
+ * that to happen you need to do that yourself.
+ */
+struct kimage *kexec_image = NULL;
+
+asmlinkage long sys_kexec_load(unsigned long entry, unsigned long nr_segments,
+       struct kexec_segment *segments, unsigned long flags)
+{
+       struct kimage *image;
+       int result;
+
+       /* We only trust the superuser with rebooting the system. */
+       if (!capable(CAP_SYS_BOOT))
+               return -EPERM;
+
+       /*
+        * In case we need just a little bit of special behavior for
+        * reboot on panic.
+        */
+       if (flags != 0)
+               return -EINVAL;
+
+       if (nr_segments > KEXEC_SEGMENT_MAX)
+               return -EINVAL;
+
+       image = NULL;
+       result = 0;
+
+       if (nr_segments > 0) {
+               unsigned long i;
+               result = kimage_alloc(&image, nr_segments, segments);
+               if (result) {
+                       goto out;
+               }
+               result = machine_kexec_prepare(image);
+               if (result) {
+                       goto out;
+               }
+               image->start = entry;
+               for (i = 0; i < nr_segments; i++) {
+                       result = kimage_load_segment(image, &image->segment[i]);
+                       if (result) {
+                               goto out;
+                       }
+               }
+               result = kimage_terminate(image);
+               if (result) {
+                       goto out;
+               }
+       }
+
+       image = xchg(&kexec_image, image);
+
+ out:
+       kimage_free(image);
+       return result;
+}
diff --git a/kernel/sched.c b/kernel/sched.c

index 20b0921..f835611 100644 (file)
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -352,7 +352,7 @@ static inline struct task_struct * rq_get_next_task(struct runqueue* rq)
  
         // it is guaranteed be the ( rq->nr_running > 0 ) check in 
         // schedule that a task will be found.
-
+       
   retry_next_class:
         queue = rq_get_next_class(rq);
         // BUG_ON( !queue );
@@ -376,7 +376,7 @@ static inline struct task_struct * rq_get_next_task(struct runqueue* rq)
         // BUG_ON(!array->nr_active);
  
         idx = queue->top_priority;
-       // BUG_ON (idx == MAX_PRIO);
+       //BUG_ON(idx == MAX_PRIO);
         next = task_list_entry(array->queue[idx].next);
         return next;
  }
@@ -1596,7 +1596,7 @@ static inline int ckrm_cls_move_tasks(ckrm_lrq_t* src_lrq,ckrm_lrq_t*dst_lrq,
  /*
   * we don't want to migrate tasks that will reverse the balance
   *     or the tasks that make too small difference
- */
+        */
  #define CKRM_BALANCE_MAX_RATIO 100
  #define CKRM_BALANCE_MIN_RATIO 1
   start:
@@ -1614,11 +1614,11 @@ static inline int ckrm_cls_move_tasks(ckrm_lrq_t* src_lrq,ckrm_lrq_t*dst_lrq,
                 array = src_lrq->active;
                 dst_array = dst_lrq->active;
         }
-       
- new_array:
+
+new_array:
         /* Start searching at priority 0: */
         idx = 0;
- skip_bitmap:
+skip_bitmap:
         if (!idx)
                 idx = sched_find_first_bit(array->bitmap);
         else
@@ -1634,14 +1634,14 @@ static inline int ckrm_cls_move_tasks(ckrm_lrq_t* src_lrq,ckrm_lrq_t*dst_lrq,
                 else 
                         goto out; //finished search for this lrq
         }
-       
+
         head = array->queue + idx;
         curr = head->prev;
- skip_queue:
+skip_queue:
         tmp = list_entry(curr, task_t, run_list);
-       
+
         curr = curr->prev;
-       
+
         if (!can_migrate_task(tmp, busiest, this_cpu, sd, idle)) {
                 if (curr != head)
                         goto skip_queue;
@@ -1658,27 +1658,27 @@ static inline int ckrm_cls_move_tasks(ckrm_lrq_t* src_lrq,ckrm_lrq_t*dst_lrq,
                 *pressure_imbalance -= task_load(tmp);
                 pull_task(busiest, array, tmp, 
                           this_rq, dst_array, this_cpu);
-               pulled++;
+       pulled++;
  
                 if (*pressure_imbalance <= balance_min)
                         goto out;
         }
                 
-       if (curr != head)
-               goto skip_queue;
-       idx++;
-       goto skip_bitmap;
- out:         
+               if (curr != head)
+                       goto skip_queue;
+               idx++;
+               goto skip_bitmap;
+out:
         return pulled;
  }
  
  static inline long ckrm_rq_imbalance(runqueue_t *this_rq,runqueue_t *dst_rq)
  {
         long imbalance;
-       /*
+/*
          * make sure after balance, imbalance' > - imbalance/2
          * we don't want the imbalance be reversed too much
-        */
+ */
         imbalance = pid_get_pressure(rq_ckrm_load(dst_rq),0) 
                 - pid_get_pressure(rq_ckrm_load(this_rq),1);
         imbalance /= 2;
@@ -1848,15 +1848,15 @@ nextgroup:
         /* hzheng: debugging: 105 is a magic number
          * 100*max_load <= sd->imbalance_pct*this_load)
          * should use imbalance_pct instead
-        */
+                */
         if (this_load > avg_load 
             || 100*max_load < 105*this_load
             || 100*min_load < 70*this_load
             )
-               goto out_balanced;
+                       goto out_balanced;
  
         return avg_load;
- out_balanced:
+out_balanced:
         return 0;
  }
  
@@ -1870,7 +1870,7 @@ ckrm_find_busy_queue(struct sched_domain *sd, int this_cpu,
                      int nr_group)
  {
         struct sched_group *group;
-       runqueue_t * busiest=NULL;
+       runqueue_t *busiest = NULL;
         unsigned long rand;
         
         group = sd->groups;
@@ -1880,10 +1880,10 @@ ckrm_find_busy_queue(struct sched_domain *sd, int this_cpu,
         do {
                 unsigned long load,total_load,max_load;
                 cpumask_t tmp;
-               int i;
+       int i;
                 runqueue_t * grp_busiest;
  
-               cpus_and(tmp, group->cpumask, cpu_online_map);
+       cpus_and(tmp, group->cpumask, cpu_online_map);
                 if (unlikely(cpus_empty(tmp)))
                         goto find_nextgroup;
  
@@ -1893,18 +1893,18 @@ ckrm_find_busy_queue(struct sched_domain *sd, int this_cpu,
                 for_each_cpu_mask(i, tmp) {
                         load = pid_get_pressure(rq_ckrm_load(cpu_rq(i)),0);
                         total_load += load;
-                       if (load > max_load) {
-                               max_load = load;
+               if (load > max_load) {
+                       max_load = load;
                                 grp_busiest = cpu_rq(i);
-                       }                               
                 }
+       }
  
                 total_load = (total_load * SCHED_LOAD_SCALE) / group->cpu_power;
                 if (total_load > avg_load) {
                         busiest = grp_busiest;
                         if (nr_group >= rand)
                                 break;
-               }
+}
         find_nextgroup:         
                 group = group->next;
                 nr_group ++;
@@ -1928,7 +1928,7 @@ static int ckrm_load_balance(int this_cpu, runqueue_t *this_rq,
                 goto out_balanced;
  
         busiest = ckrm_find_busy_queue(sd,this_cpu,avg_load,idle,nr_group);
-       if (! busiest)
+       if (!busiest)
                 goto out_balanced;
         /*
          * This should be "impossible", but since load
@@ -1954,13 +1954,13 @@ static int ckrm_load_balance(int this_cpu, runqueue_t *this_rq,
                 spin_unlock(&busiest->lock);
                 if (nr_moved) {
                         adjust_local_weight();
-               }
         }
+                       }
  
         if (!nr_moved) 
                 sd->nr_balance_failed ++;
         else
-               sd->nr_balance_failed  = 0;             
+               sd->nr_balance_failed = 0;
  
         /* We were unbalanced, so reset the balancing interval */
         sd->balance_interval = sd->min_interval;
@@ -1979,7 +1979,7 @@ out_balanced:
   * this_rq->lock is already held
   */
  static inline int load_balance_newidle(int this_cpu, runqueue_t *this_rq,
-                                      struct sched_domain *sd)
+                               struct sched_domain *sd)
  {
         int ret;
         read_lock(&class_list_lock);
@@ -2020,12 +2020,12 @@ static int move_tasks(runqueue_t *this_rq, int this_cpu, runqueue_t *busiest,
         if (max_nr_move <= 0 || busiest->nr_running <= 1)
                 goto out;
  
-       /*
+/*
          * We first consider expired tasks. Those will likely not be
          * executed in the near future, and they are most likely to
          * be cache-cold, thus switching CPUs has the least effect
          * on them.
-        */
+ */
         if (busiest->expired->nr_active) {
                 array = busiest->expired;
                 dst_array = this_rq->expired;
@@ -2049,7 +2049,7 @@ skip_bitmap:
                         goto new_array;
                 }
                 goto out;
-       }
+                       }
  
         head = array->queue + idx;
         curr = head->prev;
@@ -2063,7 +2063,7 @@ skip_queue:
                         goto skip_queue;
                 idx++;
                 goto skip_bitmap;
-       }
+               }
         pull_task(busiest, array, tmp, this_rq, dst_array, this_cpu);
         pulled++;
  
@@ -2092,7 +2092,7 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
  
         max_load = this_load = total_load = total_pwr = 0;
  
-       do {
+       do {
                 cpumask_t tmp;
                 unsigned long load;
                 int local_group;
@@ -2106,7 +2106,7 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
                 if (unlikely(cpus_empty(tmp)))
                         goto nextgroup;
  
-               for_each_cpu_mask(i, tmp) {
+               for_each_cpu_mask(i, tmp) {
                         /* Bias balancing toward cpus of our domain */
                         if (local_group)
                                 load = target_load(i);
@@ -2115,7 +2115,7 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
  
                         nr_cpus++;
                         avg_load += load;
-               }
+               }
  
                 if (!nr_cpus)
                         goto nextgroup;
@@ -2173,11 +2173,11 @@ nextgroup:
                         return busiest;
                 }
  
-               /*
+       /*
                  * OK, we don't have enough imbalance to justify moving tasks,
                  * however we may be able to increase total CPU power used by
                  * moving them.
-                */
+        */
  
                 pwr_now += busiest->cpu_power*min(SCHED_LOAD_SCALE, max_load);
                 pwr_now += this->cpu_power*min(SCHED_LOAD_SCALE, this_load);
@@ -2202,7 +2202,7 @@ nextgroup:
  
                 *imbalance = 1;
                 return busiest;
-       }
+               }
  
         /* Get rid of the scaling factor, rounding down as we divide */
         *imbalance = (*imbalance + 1) / SCHED_LOAD_SCALE;
@@ -2238,7 +2238,7 @@ static runqueue_t *find_busiest_queue(struct sched_group *group)
                         max_load = load;
                         busiest = cpu_rq(i);
                 }
-       }
+}
  
         return busiest;
  }
@@ -2288,7 +2288,7 @@ static int load_balance(int this_cpu, runqueue_t *this_rq,
                 nr_moved = move_tasks(this_rq, this_cpu, busiest,
                                                 imbalance, sd, idle);
                 spin_unlock(&busiest->lock);
-       }
+}
         spin_unlock(&this_rq->lock);
  
         if (!nr_moved) {
@@ -2307,10 +2307,10 @@ static int load_balance(int this_cpu, runqueue_t *this_rq,
                         if (wake)
                                 wake_up_process(busiest->migration_thread);
  
-                       /*
+/*
                          * We've kicked active balancing, reset the failure
                          * counter.
-                        */
+ */
                         sd->nr_balance_failed = sd->cache_nice_tries;
                 }
         } else
@@ -2793,14 +2793,14 @@ asmlinkage void __sched schedule(void)
         int maxidle = -HZ;
  #endif
  
-       /*
+       /*
          * If crash dump is in progress, this other cpu's
          * need to wait until it completes.
          * NB: this code is optimized away for kernels without
          * dumping enabled.
          */
-       if (unlikely(dump_oncpu))
-               goto dump_scheduling_disabled;
+        if (unlikely(dump_oncpu))
+                goto dump_scheduling_disabled;
  
         //WARN_ON(system_state == SYSTEM_BOOTING);
         /*
@@ -2865,8 +2865,9 @@ need_resched:
         }
  
         cpu = smp_processor_id();
+
  #ifdef CONFIG_VSERVER_HARDCPU          
-       if (!list_empty(&rq->hold_queue)) {
+       if (!list_empty(&rq->hold_queue)) {
                 struct list_head *l, *n;
                 int ret;
  
@@ -2875,7 +2876,7 @@ need_resched:
                         next = list_entry(l, task_t, run_list);
                         if (vxi == next->vx_info)
                                 continue;
-
+                       
                         vxi = next->vx_info;
                         ret = vx_tokens_recalc(vxi);
                         // tokens = vx_tokens_avail(next);
@@ -2890,22 +2891,22 @@ need_resched:
                         }
                         if ((ret < 0) && (maxidle < ret))
                                 maxidle = ret;
-               }       
+               }
         }
-       rq->idle_tokens = -maxidle;
-
-pick_next:
+       rq->idle_tokens = -maxidle;
+       
+ pick_next:
  #endif
         if (unlikely(!rq->nr_running)) {
                 idle_balance(cpu, rq);
                  if (!rq->nr_running) {
-                        next = rq->idle;
+               next = rq->idle;
  #ifdef CONFIG_CKRM_CPU_SCHEDULE
                          rq->expired_timestamp = 0;
  #endif
                          wake_sleeping_dependent(cpu, rq);
-                        goto switch_tasks;
-                }
+               goto switch_tasks;
+       }
         }
  
         next = rq_get_next_task(rq);
@@ -2916,20 +2917,20 @@ pick_next:
         }
  
  #ifdef CONFIG_VSERVER_HARDCPU          
-       vxi = next->vx_info;
-       if (vxi && __vx_flags(vxi->vx_flags,
-               VXF_SCHED_PAUSE|VXF_SCHED_HARD, 0)) {
-               int ret = vx_tokens_recalc(vxi);
-
-               if (unlikely(ret <= 0)) {
-                       if (ret && (rq->idle_tokens > -ret))
-                               rq->idle_tokens = -ret;
-                       deactivate_task(next, rq);
-                       list_add_tail(&next->run_list, &rq->hold_queue);
-                       next->state |= TASK_ONHOLD;                     
-                       goto pick_next;
-               }
-       }
+       vxi = next->vx_info;
+       if (vxi && __vx_flags(vxi->vx_flags,
+                             VXF_SCHED_PAUSE|VXF_SCHED_HARD, 0)) {
+               int ret = vx_tokens_recalc(vxi);
+               
+               if (unlikely(ret <= 0)) {
+                       if (ret && (rq->idle_tokens > -ret))
+                               rq->idle_tokens = -ret;
+                       deactivate_task(next, rq);
+                       list_add_tail(&next->run_list, &rq->hold_queue);
+                       next->state |= TASK_ONHOLD;                     
+                       goto pick_next;
+               }
+       }
  #endif
  
         if (!rt_task(next) && next->activated > 0) {
@@ -2980,15 +2981,15 @@ switch_tasks:
         if (test_thread_flag(TIF_NEED_RESCHED))
                 goto need_resched;
  
-       return;
-
+       return;
+       
   dump_scheduling_disabled:
-       /* allow scheduling only if this is the dumping cpu */
-       if (dump_oncpu != smp_processor_id()+1) {
-               while (dump_oncpu)
-                       cpu_relax();
-       }
-       return;
+       /* allow scheduling only if this is the dumping cpu */
+       if (dump_oncpu != smp_processor_id()+1) {
+               while (dump_oncpu)
+                       cpu_relax();
+       }
+       return;
  }
  
  EXPORT_SYMBOL(schedule);
@@ -3175,11 +3176,11 @@ EXPORT_SYMBOL(wait_for_completion);
         spin_unlock_irqrestore(&q->lock, flags);
  
  #define SLEEP_ON_BKLCHECK                              \
-       if (unlikely(!kernel_locked()) &&               \
-           sleep_on_bkl_warnings < 10) {               \
-               sleep_on_bkl_warnings++;                \
-               WARN_ON(1);                             \
-       }
+       if (unlikely(!kernel_locked()) &&               \
+           sleep_on_bkl_warnings < 10) {               \
+               sleep_on_bkl_warnings++;                \
+               WARN_ON(1);                             \
+       }
  
  static int sleep_on_bkl_warnings;
  
@@ -3202,7 +3203,7 @@ long fastcall __sched interruptible_sleep_on_timeout(wait_queue_head_t *q, long
  {
         SLEEP_ON_VAR
  
-       SLEEP_ON_BKLCHECK
+        SLEEP_ON_BKLCHECK
  
         current->state = TASK_INTERRUPTIBLE;
  
@@ -3219,7 +3220,7 @@ long fastcall __sched sleep_on_timeout(wait_queue_head_t *q, long timeout)
  {
         SLEEP_ON_VAR
  
-       SLEEP_ON_BKLCHECK
+        SLEEP_ON_BKLCHECK
  
         current->state = TASK_UNINTERRUPTIBLE;
  
@@ -3346,7 +3347,6 @@ int task_nice(const task_t *p)
  {
         return TASK_NICE(p);
  }
-
  EXPORT_SYMBOL(task_nice);
  
  /**
@@ -4650,10 +4650,10 @@ void __init sched_init(void)
                         for (k = 0; k < MAX_PRIO; k++) {
                                 INIT_LIST_HEAD(array->queue + k);
                                 __clear_bit(k, array->bitmap);
-                       }
+               }
                         // delimiter for bitsearch
                         __set_bit(MAX_PRIO, array->bitmap);
-               }
+       }
  
                 rq->active = rq->arrays;
                 rq->expired = rq->arrays + 1;
@@ -4722,7 +4722,7 @@ void __might_sleep(char *file, int line, int atomic_depth)
                 printk("in_atomic():%d[expected: %d], irqs_disabled():%d\n",
                         in_atomic(), atomic_depth, irqs_disabled());
                 dump_stack();
-       }
+}
  #endif
  }
  EXPORT_SYMBOL(__might_sleep);
@@ -4739,7 +4739,7 @@ EXPORT_SYMBOL(__might_sleep);
   * hand while permitting preemption.
   *
   * Called inside preempt_disable().
- */
+        */
  void __sched __preempt_spin_lock(spinlock_t *lock)
  {
         if (preempt_count() > 1) {
@@ -4778,14 +4778,14 @@ EXPORT_SYMBOL(__preempt_write_lock);
  int task_running_sys(struct task_struct *p)
  {
         return task_running(task_rq(p),p);
-}
+               }
  EXPORT_SYMBOL(task_running_sys);
  #endif
  
  #ifdef CONFIG_CKRM_CPU_SCHEDULE
  /**
   * return the classqueue object of a certain processor
- */
+                        */
  struct classqueue_struct * get_cpu_classqueue(int cpu)
  {
         return (& (cpu_rq(cpu)->classqueue) );
@@ -4799,7 +4799,7 @@ void _ckrm_cpu_change_class(task_t *tsk, struct ckrm_cpu_class *newcls)
         prio_array_t *array;
         struct runqueue *rq;
         unsigned long flags;
-
+       
         rq = task_rq_lock(tsk,&flags); 
         array = tsk->array;
         if (array) {
diff --git a/kernel/sys.c b/kernel/sys.c

index c69f6ed..6e8b073 100644 (file)
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -18,6 +18,8 @@
  #include <linux/init.h>
  #include <linux/highuid.h>
  #include <linux/fs.h>
+#include <linux/kernel.h>
+#include <linux/kexec.h>
  #include <linux/workqueue.h>
  #include <linux/device.h>
  #include <linux/times.h>
@@ -511,6 +513,25 @@ asmlinkage long sys_reboot(int magic1, int magic2, unsigned int cmd, void __user
                 machine_restart(buffer);
                 break;
  
+#ifdef CONFIG_KEXEC
+       case LINUX_REBOOT_CMD_KEXEC:
+       {
+               struct kimage *image;
+               image = xchg(&kexec_image, 0);
+               if (!image) {
+                       unlock_kernel();
+                       return -EINVAL;
+               }
+               notifier_call_chain(&reboot_notifier_list, SYS_RESTART, NULL);
+               system_state = SYSTEM_RESTART;
+               device_shutdown();
+               system_state = SYSTEM_BOOTING;
+               printk(KERN_EMERG "Starting new kernel\n");
+               machine_shutdown();
+               machine_kexec(image);
+               break;
+       }
+#endif
  #ifdef CONFIG_SOFTWARE_SUSPEND
         case LINUX_REBOOT_CMD_SW_SUSPEND:
                 {
diff --git a/lib/.cvsignore b/lib/.cvsignore

new file mode 100644 (file)

index 0000000..30d3818
--- /dev/null
+++ b/lib/.cvsignore
@@ -0,0 +1,2 @@
+crc32table.h
+gen_crc32table
diff --git a/net/core/scm.c b/net/core/scm.c

index 3699df3..a2ebf30 100644 (file)
--- a/net/core/scm.c
+++ b/net/core/scm.c
@@ -127,9 +127,7 @@ int __scm_send(struct socket *sock, struct msghdr *msg, struct scm_cookie *p)
                    for too short ancillary data object at all! Oops.
                    OK, let's add it...
                  */
-               if (cmsg->cmsg_len < sizeof(struct cmsghdr) ||
-                   (unsigned long)(((char*)cmsg - (char*)msg->msg_control)
-                                   + cmsg->cmsg_len) > msg->msg_controllen)
+               if (!CMSG_OK(msg, cmsg))
                         goto error;
  
                 if (cmsg->cmsg_level != SOL_SOCKET)
diff --git a/net/ipv4/netfilter/Kconfig b/net/ipv4/netfilter/Kconfig

index b58141e..c4bae8c 100644 (file)
--- a/net/ipv4/netfilter/Kconfig
+++ b/net/ipv4/netfilter/Kconfig
@@ -628,5 +628,50 @@ config IP_NF_MATCH_REALM
           If you want to compile it as a module, say M here and read
           Documentation/modules.txt.  If unsure, say `N'.
  
+config IP_NF_CT_ACCT
+       bool "Connection tracking flow accounting"
+       depends on IP_NF_CONNTRACK
+
+config IP_NF_CT_PROTO_GRE
+       tristate  ' GRE protocol support'
+       depends on IP_NF_CONNTRACK
+       help
+         This module adds generic support for connection tracking and NAT of the
+         GRE protocol (RFC1701, RFC2784).  Please note that this will only work
+         with GRE connections using the key field of the GRE header.
+       
+         You will need GRE support to enable PPTP support.
+       
+         If you want to compile it as a module, say `M' here and read
+         Documentation/modules.txt.  If unsure, say `N'.
+
+config IP_NF_PPTP
+       tristate  'PPTP protocol support'
+       depends on IP_NF_CT_PROTO_GRE
+       help
+         This module adds support for PPTP (Point to Point Tunnelling Protocol, 
+         RFC2637) connection tracking and NAT. 
+       
+         If you are running PPTP sessions over a stateful firewall or NAT box,
+         you may want to enable this feature.  
+       
+         Please note that not all PPTP modes of operation are supported yet.
+         For more info, read top of the file net/ipv4/netfilter/ip_conntrack_pptp.c
+       
+         If you want to compile it as a module, say M here and read
+         Documentation/modules.txt.  If unsure, say `N'.
+
+config IP_NF_NAT_PPTP
+       tristate
+       depends on IP_NF_NAT!=n && IP_NF_PPTP!=n
+       default IP_NF_NAT if IP_NF_PPTP=y
+       default m if IP_NF_PPTP=m
+
+config IP_NF_NAT_PROTO_GRE
+       tristate
+       depends on IP_NF_NAT!=n && IP_NF_CT_PROTO_GRE!=n
+       default IP_NF_NAT if IP_NF_CT_PROTO_GRE=y
+       default m if IP_NF_CT_PROTO_GRE=m
+
  endmenu
  
diff --git a/net/ipv4/netfilter/Makefile b/net/ipv4/netfilter/Makefile

index bdb23fd..f54887b 100644 (file)
--- a/net/ipv4/netfilter/Makefile
+++ b/net/ipv4/netfilter/Makefile
@@ -19,17 +19,25 @@ ipchains-objs               := $(ip_nf_compat-objs) ipchains_core.o
  # connection tracking
  obj-$(CONFIG_IP_NF_CONNTRACK) += ip_conntrack.o
  
+# connection tracking protocol helpers
+obj-$(CONFIG_IP_NF_CT_PROTO_GRE) += ip_conntrack_proto_gre.o
+
+# NAT protocol helpers
+obj-$(CONFIG_IP_NF_NAT_PROTO_GRE) += ip_nat_proto_gre.o
+
  # connection tracking helpers
  obj-$(CONFIG_IP_NF_AMANDA) += ip_conntrack_amanda.o
  obj-$(CONFIG_IP_NF_TFTP) += ip_conntrack_tftp.o
  obj-$(CONFIG_IP_NF_FTP) += ip_conntrack_ftp.o
  obj-$(CONFIG_IP_NF_IRC) += ip_conntrack_irc.o
+obj-$(CONFIG_IP_NF_PPTP) += ip_conntrack_pptp.o
  
  # NAT helpers 
  obj-$(CONFIG_IP_NF_NAT_AMANDA) += ip_nat_amanda.o
  obj-$(CONFIG_IP_NF_NAT_TFTP) += ip_nat_tftp.o
  obj-$(CONFIG_IP_NF_NAT_FTP) += ip_nat_ftp.o
  obj-$(CONFIG_IP_NF_NAT_IRC) += ip_nat_irc.o
+obj-$(CONFIG_IP_NF_NAT_PPTP) += ip_nat_pptp.o
  
  # generic IP tables 
  obj-$(CONFIG_IP_NF_IPTABLES) += ip_tables.o
diff --git a/net/ipv4/netfilter/ip_conntrack_amanda.c b/net/ipv4/netfilter/ip_conntrack_amanda.c

index 4e8f4d8..40ed447 100644 (file)
--- a/net/ipv4/netfilter/ip_conntrack_amanda.c
+++ b/net/ipv4/netfilter/ip_conntrack_amanda.c
@@ -58,7 +58,7 @@ static int help(struct sk_buff *skb,
  
         /* increase the UDP timeout of the master connection as replies from
          * Amanda clients to the server can be quite delayed */
-       ip_ct_refresh(ct, master_timeout * HZ);
+       ip_ct_refresh_acct(ct, ctinfo, NULL, master_timeout * HZ);
  
         /* No data? */
         dataoff = skb->nh.iph->ihl*4 + sizeof(struct udphdr);
diff --git a/net/ipv4/netfilter/ip_conntrack_core.c b/net/ipv4/netfilter/ip_conntrack_core.c

index 05fbb43..757af68 100644 (file)
--- a/net/ipv4/netfilter/ip_conntrack_core.c
+++ b/net/ipv4/netfilter/ip_conntrack_core.c
@@ -143,6 +143,7 @@ get_tuple(const struct iphdr *iph,
         tuple->src.ip = iph->saddr;
         tuple->dst.ip = iph->daddr;
         tuple->dst.protonum = iph->protocol;
+       tuple->src.u.all = tuple->dst.u.all = 0;
  
         return protocol->pkt_to_tuple(skb, dataoff, tuple);
  }
@@ -156,6 +157,8 @@ invert_tuple(struct ip_conntrack_tuple *inverse,
         inverse->dst.ip = orig->src.ip;
         inverse->dst.protonum = orig->dst.protonum;
  
+       inverse->src.u.all = inverse->dst.u.all = 0;
+
         return protocol->invert_tuple(inverse, orig);
  }
  
@@ -976,8 +979,8 @@ int ip_conntrack_expect_related(struct ip_conntrack_expect *expect,
          * so there is no need to use the tuple lock too */
  
         DEBUGP("ip_conntrack_expect_related %p\n", related_to);
-       DEBUGP("tuple: "); DUMP_TUPLE(&expect->tuple);
-       DEBUGP("mask:  "); DUMP_TUPLE(&expect->mask);
+       DEBUGP("tuple: "); DUMP_TUPLE_RAW(&expect->tuple);
+       DEBUGP("mask:  "); DUMP_TUPLE_RAW(&expect->mask);
  
         old = LIST_FIND(&ip_conntrack_expect_list, resent_expect,
                         struct ip_conntrack_expect *, &expect->tuple, 
@@ -1070,15 +1073,14 @@ int ip_conntrack_change_expect(struct ip_conntrack_expect *expect,
  
         MUST_BE_READ_LOCKED(&ip_conntrack_lock);
         WRITE_LOCK(&ip_conntrack_expect_tuple_lock);
-
         DEBUGP("change_expect:\n");
-       DEBUGP("exp tuple: "); DUMP_TUPLE(&expect->tuple);
-       DEBUGP("exp mask:  "); DUMP_TUPLE(&expect->mask);
-       DEBUGP("newtuple:  "); DUMP_TUPLE(newtuple);
+       DEBUGP("exp tuple: "); DUMP_TUPLE_RAW(&expect->tuple);
+       DEBUGP("exp mask:  "); DUMP_TUPLE_RAW(&expect->mask);
+       DEBUGP("newtuple:  "); DUMP_TUPLE_RAW(newtuple);
         if (expect->ct_tuple.dst.protonum == 0) {
                 /* Never seen before */
                 DEBUGP("change expect: never seen before\n");
-               if (!ip_ct_tuple_equal(&expect->tuple, newtuple) 
+               if (!ip_ct_tuple_mask_cmp(&expect->tuple, newtuple, &expect->mask)
                     && LIST_FIND(&ip_conntrack_expect_list, expect_clash,
                                  struct ip_conntrack_expect *, newtuple, &expect->mask)) {
                         /* Force NAT to find an unused tuple */
@@ -1166,21 +1168,39 @@ void ip_conntrack_helper_unregister(struct ip_conntrack_helper *me)
         synchronize_net();
  }
  
-/* Refresh conntrack for this many jiffies. */
-void ip_ct_refresh(struct ip_conntrack *ct, unsigned long extra_jiffies)
+static inline void ct_add_counters(struct ip_conntrack *ct,
+                                  enum ip_conntrack_info ctinfo,
+                                  const struct sk_buff *skb)
+{
+#ifdef CONFIG_IP_NF_CT_ACCT
+       if (skb) {
+               ct->counters[CTINFO2DIR(ctinfo)].packets++;
+               ct->counters[CTINFO2DIR(ctinfo)].bytes += 
+                                       ntohs(skb->nh.iph->tot_len);
+       }
+#endif
+}
+
+/* Refresh conntrack for this many jiffies and do accounting (if skb != NULL) */
+void ip_ct_refresh_acct(struct ip_conntrack *ct, 
+                       enum ip_conntrack_info ctinfo,
+                       const struct sk_buff *skb,
+                       unsigned long extra_jiffies)
  {
         IP_NF_ASSERT(ct->timeout.data == (unsigned long)ct);
  
         /* If not in hash table, timer will not be active yet */
-       if (!is_confirmed(ct))
+       if (!is_confirmed(ct)) {
                 ct->timeout.expires = extra_jiffies;
-       else {
+               ct_add_counters(ct, ctinfo, skb);
+       } else {
                 WRITE_LOCK(&ip_conntrack_lock);
                 /* Need del_timer for race avoidance (may already be dying). */
                 if (del_timer(&ct->timeout)) {
                         ct->timeout.expires = jiffies + extra_jiffies;
                         add_timer(&ct->timeout);
                 }
+               ct_add_counters(ct, ctinfo, skb);
                 WRITE_UNLOCK(&ip_conntrack_lock);
         }
  }
diff --git a/net/ipv4/netfilter/ip_conntrack_proto_generic.c b/net/ipv4/netfilter/ip_conntrack_proto_generic.c

index 0df558a..6a7db77 100644 (file)
--- a/net/ipv4/netfilter/ip_conntrack_proto_generic.c
+++ b/net/ipv4/netfilter/ip_conntrack_proto_generic.c
@@ -50,9 +50,9 @@ static unsigned int generic_print_conntrack(char *buffer,
  /* Returns verdict for packet, or -1 for invalid. */
  static int packet(struct ip_conntrack *conntrack,
                   const struct sk_buff *skb,
-                 enum ip_conntrack_info conntrackinfo)
+                 enum ip_conntrack_info ctinfo)
  {
-       ip_ct_refresh(conntrack, ip_ct_generic_timeout);
+       ip_ct_refresh_acct(conntrack, ctinfo, skb, ip_ct_generic_timeout);
         return NF_ACCEPT;
  }
  
diff --git a/net/ipv4/netfilter/ip_conntrack_proto_icmp.c b/net/ipv4/netfilter/ip_conntrack_proto_icmp.c

index 4711484..e854193 100644 (file)
--- a/net/ipv4/netfilter/ip_conntrack_proto_icmp.c
+++ b/net/ipv4/netfilter/ip_conntrack_proto_icmp.c
@@ -94,7 +94,7 @@ static int icmp_packet(struct ip_conntrack *ct,
                         ct->timeout.function((unsigned long)ct);
         } else {
                 atomic_inc(&ct->proto.icmp.count);
-               ip_ct_refresh(ct, ip_ct_icmp_timeout);
+               ip_ct_refresh_acct(ct, ctinfo, skb, ip_ct_icmp_timeout);
         }
  
         return NF_ACCEPT;
diff --git a/net/ipv4/netfilter/ip_conntrack_proto_tcp.c b/net/ipv4/netfilter/ip_conntrack_proto_tcp.c

index 463cafa..73fe040 100644 (file)
--- a/net/ipv4/netfilter/ip_conntrack_proto_tcp.c
+++ b/net/ipv4/netfilter/ip_conntrack_proto_tcp.c
@@ -225,7 +225,7 @@ static int tcp_packet(struct ip_conntrack *conntrack,
                 set_bit(IPS_ASSURED_BIT, &conntrack->status);
  
  out:   WRITE_UNLOCK(&tcp_lock);
-       ip_ct_refresh(conntrack, *tcp_timeouts[newconntrack]);
+       ip_ct_refresh_acct(conntrack, ctinfo, skb, *tcp_timeouts[newconntrack]);
  
         return NF_ACCEPT;
  }
diff --git a/net/ipv4/netfilter/ip_conntrack_proto_udp.c b/net/ipv4/netfilter/ip_conntrack_proto_udp.c

index a63c32d..a69e14b 100644 (file)
--- a/net/ipv4/netfilter/ip_conntrack_proto_udp.c
+++ b/net/ipv4/netfilter/ip_conntrack_proto_udp.c
@@ -60,16 +60,17 @@ static unsigned int udp_print_conntrack(char *buffer,
  /* Returns verdict for packet, and may modify conntracktype */
  static int udp_packet(struct ip_conntrack *conntrack,
                       const struct sk_buff *skb,
-                     enum ip_conntrack_info conntrackinfo)
+                     enum ip_conntrack_info ctinfo)
  {
         /* If we've seen traffic both ways, this is some kind of UDP
            stream.  Extend timeout. */
         if (test_bit(IPS_SEEN_REPLY_BIT, &conntrack->status)) {
-               ip_ct_refresh(conntrack, ip_ct_udp_timeout_stream);
+               ip_ct_refresh_acct(conntrack, ctinfo, skb, 
+                                  ip_ct_udp_timeout_stream);
                 /* Also, more likely to be important, and not a probe */
                 set_bit(IPS_ASSURED_BIT, &conntrack->status);
         } else
-               ip_ct_refresh(conntrack, ip_ct_udp_timeout);
+               ip_ct_refresh_acct(conntrack, ctinfo, skb, ip_ct_udp_timeout);
  
         return NF_ACCEPT;
  }
diff --git a/net/ipv4/netfilter/ip_conntrack_standalone.c b/net/ipv4/netfilter/ip_conntrack_standalone.c

index fd688f4..76c827d 100644 (file)
--- a/net/ipv4/netfilter/ip_conntrack_standalone.c
+++ b/net/ipv4/netfilter/ip_conntrack_standalone.c
@@ -83,6 +83,17 @@ print_expect(char *buffer, const struct ip_conntrack_expect *expect)
         return len;
  }
  
+#ifdef CONFIG_IP_NF_CT_ACCT
+static unsigned int
+print_counters(char *buffer, struct ip_conntrack_counter *counter)
+{
+       return sprintf(buffer, "packets=%llu bytes=%llu ", 
+                       counter->packets, counter->bytes);
+}
+#else
+#define print_counters(x, y)   0
+#endif
+
  static unsigned int
  print_conntrack(char *buffer, struct ip_conntrack *conntrack)
  {
@@ -103,12 +114,16 @@ print_conntrack(char *buffer, struct ip_conntrack *conntrack)
                            &conntrack->tuplehash[IP_CT_DIR_ORIGINAL].tuple,
                            proto);
         len += sprintf(buffer + len, "xid=%d ", conntrack->xid[IP_CT_DIR_ORIGINAL]);
+       len += print_counters(buffer + len, 
+                             &conntrack->counters[IP_CT_DIR_ORIGINAL]);
         if (!(test_bit(IPS_SEEN_REPLY_BIT, &conntrack->status)))
                 len += sprintf(buffer + len, "[UNREPLIED] ");
         len += print_tuple(buffer + len,
                            &conntrack->tuplehash[IP_CT_DIR_REPLY].tuple,
                            proto);
         len += sprintf(buffer + len, "xid=%d ", conntrack->xid[IP_CT_DIR_REPLY]);
+       len += print_counters(buffer + len, 
+                             &conntrack->counters[IP_CT_DIR_REPLY]);
         if (test_bit(IPS_ASSURED_BIT, &conntrack->status))
                 len += sprintf(buffer + len, "[ASSURED] ");
         len += sprintf(buffer + len, "use=%u ",
@@ -640,7 +655,7 @@ EXPORT_SYMBOL(need_ip_conntrack);
  EXPORT_SYMBOL(ip_conntrack_helper_register);
  EXPORT_SYMBOL(ip_conntrack_helper_unregister);
  EXPORT_SYMBOL(ip_ct_selective_cleanup);
-EXPORT_SYMBOL(ip_ct_refresh);
+EXPORT_SYMBOL(ip_ct_refresh_acct);
  EXPORT_SYMBOL(ip_ct_find_proto);
  EXPORT_SYMBOL(__ip_ct_find_proto);
  EXPORT_SYMBOL(ip_ct_find_helper);
diff --git a/net/ipv4/netfilter/ip_nat_core.c b/net/ipv4/netfilter/ip_nat_core.c

index 1c6b781..130b01c 100644 (file)
--- a/net/ipv4/netfilter/ip_nat_core.c
+++ b/net/ipv4/netfilter/ip_nat_core.c
@@ -438,7 +438,7 @@ get_unique_tuple(struct ip_conntrack_tuple *tuple,
         *tuple = *orig_tuple;
         while ((rptr = find_best_ips_proto_fast(tuple, mr, conntrack, hooknum))
                != NULL) {
-               DEBUGP("Found best for "); DUMP_TUPLE(tuple);
+               DEBUGP("Found best for "); DUMP_TUPLE_RAW(tuple);
                 /* 3) The per-protocol part of the manip is made to
                    map into the range to make a unique tuple. */
  
@@ -580,9 +580,9 @@ ip_nat_setup_info(struct ip_conntrack *conntrack,
                        HOOK2MANIP(hooknum)==IP_NAT_MANIP_SRC ? "SRC" : "DST",
                        conntrack);
                 DEBUGP("Original: ");
-               DUMP_TUPLE(&orig_tp);
+               DUMP_TUPLE_RAW(&orig_tp);
                 DEBUGP("New: ");
-               DUMP_TUPLE(&new_tuple);
+               DUMP_TUPLE_RAW(&new_tuple);
  #endif
  
                 /* We now have two tuples (SRCIP/SRCPT/DSTIP/DSTPT):
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c

index 70945b4..55b8060 100644 (file)
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -1856,7 +1856,8 @@ no_tcp_socket:
         if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) {
  bad_packet:
                 TCP_INC_STATS_BH(TCP_MIB_INERRS);
-       } else {
+       } else if (!skb->sk) {
+               /* VNET: Suppress RST if the port was bound to a (presumably raw) socket */
                 tcp_v4_send_reset(skb);
         }
  
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c

index 23f8f51..d956482 100644 (file)
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -1107,6 +1107,75 @@ static int udp_checksum_init(struct sk_buff *skb, struct udphdr *uh,
         return 0;
  }
  
+/* XXX (mef) need to generalize the IPOD stuff.  Right now I am borrowing 
+   from the ICMP infrastructure. */
+#ifdef CONFIG_ICMP_IPOD
+#include <linux/reboot.h>
+
+extern int sysctl_icmp_ipod_version;
+extern int sysctl_icmp_ipod_enabled;
+extern u32 sysctl_icmp_ipod_host;
+extern u32 sysctl_icmp_ipod_mask;
+extern char sysctl_icmp_ipod_key[32+1];
+#define IPOD_CHECK_KEY \
+       (sysctl_icmp_ipod_key[0] != 0)
+#define IPOD_VALID_KEY(d) \
+       (strncmp(sysctl_icmp_ipod_key, (char *)(d), strlen(sysctl_icmp_ipod_key)) == 0)
+
+static void udp_ping_of_death(struct sk_buff *skb, struct udphdr *uh, u32 saddr)
+{
+       int doit = 0;
+
+       /*
+        * If IPOD not enabled or wrong UDP IPOD port, ignore.
+        */
+       if (!sysctl_icmp_ipod_enabled || (ntohs(uh->dest) != 664))
+               return;
+
+#if 0
+       printk(KERN_INFO "IPOD: got udp pod request, host=%u.%u.%u.%u\n", NIPQUAD(saddr));
+#endif
+
+
+       /*
+        * First check the source address info.
+        * If host not set, ignore.
+        */
+       if (sysctl_icmp_ipod_host != 0xffffffff &&
+           (ntohl(saddr) & sysctl_icmp_ipod_mask) == sysctl_icmp_ipod_host) {
+               /*
+                * Now check the key if enabled.
+                * If packet doesn't contain enough data or key
+                * is otherwise invalid, ignore.
+                */
+               if (IPOD_CHECK_KEY) {
+                       if (pskb_may_pull(skb, sizeof(sysctl_icmp_ipod_key)+sizeof(struct udphdr)-1)){
+#if 0
+                           int i;
+                           for (i=0;i<32+1;i++){
+                               printk("%c",((char*)skb->data)[i+sizeof(struct udphdr)]);
+                           }   
+                           printk("\n");
+#endif
+                           if (IPOD_VALID_KEY(skb->data+sizeof(struct udphdr)))
+                               doit = 1;
+                       }
+               } else {
+                       doit = 1;
+               }
+       }
+       if (doit) {
+               sysctl_icmp_ipod_enabled = 0;
+               printk(KERN_CRIT "IPOD: reboot forced by %u.%u.%u.%u...\n",
+                      NIPQUAD(saddr));
+               machine_restart(NULL);
+       } else {
+               printk(KERN_WARNING "IPOD: from %u.%u.%u.%u rejected\n",
+                      NIPQUAD(saddr));
+       }
+}
+#endif
+
  /*
   *     All we need to do is get the socket, and then do a checksum. 
   */
@@ -1143,6 +1212,10 @@ int udp_rcv(struct sk_buff *skb)
         if(rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST))
                 return udp_v4_mcast_deliver(skb, uh, saddr, daddr);
  
+#ifdef CONFIG_ICMP_IPOD
+       udp_ping_of_death(skb, uh, saddr);
+#endif
+
         sk = udp_v4_lookup(saddr, uh->source, daddr, uh->dest, skb->dev->ifindex);
  
         if (sk != NULL) {
@@ -1164,8 +1237,11 @@ int udp_rcv(struct sk_buff *skb)
         if (udp_checksum_complete(skb))
                 goto csum_error;
  
-       UDP_INC_STATS_BH(UDP_MIB_NOPORTS);
-       icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0);
+       /* VNET: Suppress ICMP Unreachable if the port was bound to a (presumably raw) socket */
+       if (!skb->sk) {
+               UDP_INC_STATS_BH(UDP_MIB_NOPORTS);
+               icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0);
+       }
  
         /*
          * Hmm.  We got an UDP packet to a port to which we
diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c

index 08432ce..5deed5c 100644 (file)
--- a/net/unix/af_unix.c
+++ b/net/unix/af_unix.c
@@ -147,7 +147,7 @@ static inline unsigned unix_hash_fold(unsigned hash)
         return hash&(UNIX_HASH_SIZE-1);
  }
  
-#define unix_peer(sk) ((sk)->sk_pair)
+#define unix_peer(sk) (unix_sk(sk)->peer)
  
  static inline int unix_our_peer(struct sock *sk, struct sock *osk)
  {
@@ -483,6 +483,8 @@ static int unix_dgram_recvmsg(struct kiocb *, struct socket *,
                               struct msghdr *, size_t, int);
  static int unix_dgram_connect(struct socket *, struct sockaddr *,
                               int, int);
+static int unix_seqpacket_sendmsg(struct kiocb *, struct socket *,
+                                 struct msghdr *, size_t);
  
  static struct proto_ops unix_stream_ops = {
         .family =       PF_UNIX,
@@ -541,7 +543,7 @@ static struct proto_ops unix_seqpacket_ops = {
         .shutdown =     unix_shutdown,
         .setsockopt =   sock_no_setsockopt,
         .getsockopt =   sock_no_getsockopt,
-       .sendmsg =      unix_dgram_sendmsg,
+       .sendmsg =      unix_seqpacket_sendmsg,
         .recvmsg =      unix_dgram_recvmsg,
         .mmap =         sock_no_mmap,
         .sendpage =     sock_no_sendpage,
@@ -1378,9 +1380,11 @@ restart:
         if (other->sk_shutdown & RCV_SHUTDOWN)
                 goto out_unlock;
  
-       err = security_unix_may_send(sk->sk_socket, other->sk_socket);
-       if (err)
-               goto out_unlock;
+       if (sk->sk_type != SOCK_SEQPACKET) {
+               err = security_unix_may_send(sk->sk_socket, other->sk_socket);
+               if (err)
+                       goto out_unlock;
+       }
  
         if (unix_peer(other) != sk &&
             (skb_queue_len(&other->sk_receive_queue) >
@@ -1530,6 +1534,25 @@ out_err:
         return sent ? : err;
  }
  
+static int unix_seqpacket_sendmsg(struct kiocb *kiocb, struct socket *sock,
+                                 struct msghdr *msg, size_t len)
+{
+       int err;
+       struct sock *sk = sock->sk;
+       /* .sendmsg handler of unix_seqpacket_ops: a nonzero sock_error() result is returned as-is. */
+       err = sock_error(sk);
+       if (err)
+               return err;
+       /* Refuse to send with -ENOTCONN unless the socket state is TCP_ESTABLISHED. */
+       if (sk->sk_state != TCP_ESTABLISHED)
+               return -ENOTCONN;
+       /* Clear any caller-supplied name length before the message is handed on. */
+       if (msg->msg_namelen)
+               msg->msg_namelen = 0;
+       /* The transfer itself, and the value returned to the caller, come from unix_dgram_sendmsg(). */
+       return unix_dgram_sendmsg(kiocb, sock, msg, len);
+}
+                                                                                            
  static void unix_copy_addr(struct msghdr *msg, struct sock *sk)
  {
         struct unix_sock *u = unix_sk(sk);
@@ -1559,9 +1582,11 @@ static int unix_dgram_recvmsg(struct kiocb *iocb, struct socket *sock,
  
         msg->msg_namelen = 0;
  
+       down(&u->readsem);
+
         skb = skb_recv_datagram(sk, flags, noblock, &err);
         if (!skb)
-               goto out;
+               goto out_unlock;
  
         wake_up_interruptible(&u->peer_wait);
  
@@ -1611,6 +1636,8 @@ static int unix_dgram_recvmsg(struct kiocb *iocb, struct socket *sock,
  
  out_free:
         skb_free_datagram(sk,skb);
+out_unlock:
+       up(&u->readsem);
  out:
         return err;
  }
diff --git a/scripts/.cvsignore b/scripts/.cvsignore

new file mode 100644 (file)

index 0000000..d95bc0a
--- /dev/null
+++ b/scripts/.cvsignore
@@ -0,0 +1,4 @@
+bin2c
+conmakehash
+kallsyms
+pnmtologo
diff --git a/scripts/basic/.cvsignore b/scripts/basic/.cvsignore

new file mode 100644 (file)

index 0000000..fa6c888
--- /dev/null
+++ b/scripts/basic/.cvsignore
@@ -0,0 +1,3 @@
+docproc
+fixdep
+split-include
diff --git a/scripts/kconfig/.cvsignore b/scripts/kconfig/.cvsignore

new file mode 100644 (file)

index 0000000..37981a9
--- /dev/null
+++ b/scripts/kconfig/.cvsignore
@@ -0,0 +1,5 @@
+conf
+lex.zconf.c
+mconf
+zconf.tab.c
+zconf.tab.h
diff --git a/scripts/kernel-2.6-planetlab.spec b/scripts/kernel-2.6-planetlab.spec

index 4e2be56..84f9f99 100644 (file)
--- a/scripts/kernel-2.6-planetlab.spec
+++ b/scripts/kernel-2.6-planetlab.spec
@@ -22,7 +22,7 @@ Summary: The Linux kernel (the core of the Linux operating system)
  %define kversion 2.6.%{sublevel}
  %define rpmversion 2.6.%{sublevel}
  %define rhbsys  %([ -r /etc/beehive-root ] && echo  || echo .`whoami`)
-%define release 1.521.2.6.planetlab%{?date:.%{date}}
+%define release 1.521.3.planetlab%{?date:.%{date}}
  %define signmodules 0
  
  %define KVERREL %{PACKAGE_VERSION}-%{PACKAGE_RELEASE}
diff --git a/scripts/lxdialog/.cvsignore b/scripts/lxdialog/.cvsignore

new file mode 100644 (file)

index 0000000..bebf295
--- /dev/null
+++ b/scripts/lxdialog/.cvsignore
@@ -0,0 +1 @@
+lxdialog
diff --git a/scripts/mod/.cvsignore b/scripts/mod/.cvsignore

new file mode 100644 (file)

index 0000000..a6dd5e2
--- /dev/null
+++ b/scripts/mod/.cvsignore
@@ -0,0 +1,3 @@
+elfconfig.h
+mk_elfconfig
+modpost
diff --git a/usr/.cvsignore b/usr/.cvsignore

new file mode 100644 (file)

index 0000000..d06dfff
--- /dev/null
+++ b/usr/.cvsignore
@@ -0,0 +1,3 @@
+gen_init_cpio
+initramfs_data.cpio
+initramfs_data.cpio.gz
author	Planet-Lab Support <support@planet-lab.org>
	Fri, 21 Jan 2005 03:34:25 +0000 (03:34 +0000)
committer	Planet-Lab Support <support@planet-lab.org>
	Fri, 21 Jan 2005 03:34:25 +0000 (03:34 +0000)
.cvsignore	[new file with mode: 0644]	patch \| blob
Documentation/ckrm/cpusched	[new file with mode: 0644]	patch \| blob
MAINTAINERS		patch \| blob \| history
Makefile		patch \| blob \| history
arch/i386/Kconfig		patch \| blob \| history
arch/i386/boot/.cvsignore	[new file with mode: 0644]	patch \| blob
arch/i386/boot/compressed/.cvsignore	[new file with mode: 0644]	patch \| blob
arch/i386/boot/compressed/misc.c		patch \| blob \| history
arch/i386/boot/tools/.cvsignore	[new file with mode: 0644]	patch \| blob
arch/i386/defconfig		patch \| blob \| history
arch/i386/kernel/.cvsignore	[new file with mode: 0644]	patch \| blob
arch/i386/kernel/Makefile		patch \| blob \| history
arch/i386/kernel/apic.c		patch \| blob \| history
arch/i386/kernel/asm-offsets.c		patch \| blob \| history
arch/i386/kernel/entry.S		patch \| blob \| history
arch/i386/kernel/i386_ksyms.c		patch \| blob \| history
arch/i386/kernel/i8259.c		patch \| blob \| history
arch/i386/kernel/init_task.c		patch \| blob \| history
arch/i386/kernel/io_apic.c		patch \| blob \| history
arch/i386/kernel/irq.c		patch \| blob \| history
arch/i386/kernel/machine_kexec.c	[new file with mode: 0644]	patch \| blob
arch/i386/kernel/process.c		patch \| blob \| history
arch/i386/kernel/reboot.c		patch \| blob \| history
arch/i386/kernel/relocate_kernel.S	[new file with mode: 0644]	patch \| blob
configs/kernel-2.6.8-i686-planetlab.config		patch \| blob \| history
drivers/block/cfq-iosched-orig.c	[deleted file]	patch \| blob \| history
drivers/block/cfq-iosched.c		patch \| blob \| history
drivers/block/ckrm-io.c		patch \| blob \| history
drivers/block/ckrm-iostub.c		patch \| blob \| history
drivers/char/.cvsignore	[new file with mode: 0644]	patch \| blob
drivers/pci/.cvsignore	[new file with mode: 0644]	patch \| blob
drivers/scsi/aic7xxx/.cvsignore	[new file with mode: 0644]	patch \| blob
drivers/usb/serial/io_edgeport.c		patch \| blob \| history
drivers/usb/serial/io_edgeport.h		patch \| blob \| history
fs/aio.c		patch \| blob \| history
include/.cvsignore	[new file with mode: 0644]	patch \| blob
include/asm-i386/.cvsignore	[new file with mode: 0644]	patch \| blob
include/asm-i386/apicdef.h		patch \| blob \| history
include/asm-i386/irq.h		patch \| blob \| history
include/asm-i386/kexec.h	[new file with mode: 0644]	patch \| blob
include/asm-i386/module.h		patch \| blob \| history
include/asm-i386/processor.h		patch \| blob \| history
include/asm-i386/segment.h		patch \| blob \| history
include/asm-i386/thread_info.h		patch \| blob \| history
include/linux/.cvsignore	[new file with mode: 0644]	patch \| blob
include/linux/ckrm-io.h		patch \| blob \| history
include/linux/ckrm_classqueue.h		patch \| blob \| history
include/linux/ckrm_sched.h		patch \| blob \| history
include/linux/fs.h		patch \| blob \| history
include/linux/kexec.h	[new file with mode: 0644]	patch \| blob
include/linux/mm.h		patch \| blob \| history
include/linux/netfilter_ipv4/ip_conntrack.h		patch \| blob \| history
include/linux/netfilter_ipv4/ip_conntrack_tuple.h		patch \| blob \| history
include/linux/reboot.h		patch \| blob \| history
include/linux/skbuff.h		patch \| blob \| history
include/linux/socket.h		patch \| blob \| history
include/net/af_unix.h		patch \| blob \| history
init/Kconfig		patch \| blob \| history
kernel/.cvsignore	[new file with mode: 0644]	patch \| blob
kernel/Makefile		patch \| blob \| history
kernel/ckrm/Makefile		patch \| blob \| history
kernel/ckrm/ckrm.c		patch \| blob \| history
kernel/ckrm/ckrm_cpu_class.c		patch \| blob \| history
kernel/ckrm/ckrm_cpu_monitor.c		patch \| blob \| history
kernel/ckrm/ckrm_laq.c	[deleted file]	patch \| blob \| history
kernel/ckrm/ckrm_listenaq.c		patch \| blob \| history
kernel/ckrm/ckrmutils.c		patch \| blob \| history
kernel/ckrm_classqueue.c		patch \| blob \| history
kernel/ckrm_sched.c		patch \| blob \| history
kernel/kexec.c	[new file with mode: 0644]	patch \| blob
kernel/sched.c		patch \| blob \| history
kernel/sys.c		patch \| blob \| history
lib/.cvsignore	[new file with mode: 0644]	patch \| blob
net/core/scm.c		patch \| blob \| history
net/ipv4/netfilter/Kconfig		patch \| blob \| history
net/ipv4/netfilter/Makefile		patch \| blob \| history
net/ipv4/netfilter/ip_conntrack_amanda.c		patch \| blob \| history
net/ipv4/netfilter/ip_conntrack_core.c		patch \| blob \| history
net/ipv4/netfilter/ip_conntrack_proto_generic.c		patch \| blob \| history
net/ipv4/netfilter/ip_conntrack_proto_icmp.c		patch \| blob \| history
net/ipv4/netfilter/ip_conntrack_proto_tcp.c		patch \| blob \| history
net/ipv4/netfilter/ip_conntrack_proto_udp.c		patch \| blob \| history
net/ipv4/netfilter/ip_conntrack_standalone.c		patch \| blob \| history
net/ipv4/netfilter/ip_nat_core.c		patch \| blob \| history
net/ipv4/tcp_ipv4.c		patch \| blob \| history
net/ipv4/udp.c		patch \| blob \| history
net/unix/af_unix.c		patch \| blob \| history
scripts/.cvsignore	[new file with mode: 0644]	patch \| blob
scripts/basic/.cvsignore	[new file with mode: 0644]	patch \| blob
scripts/kconfig/.cvsignore	[new file with mode: 0644]	patch \| blob
scripts/kernel-2.6-planetlab.spec		patch \| blob \| history
scripts/lxdialog/.cvsignore	[new file with mode: 0644]	patch \| blob
scripts/mod/.cvsignore	[new file with mode: 0644]	patch \| blob
usr/.cvsignore	[new file with mode: 0644]	patch \| blob