From 1dd89ace53a178aa654c67d6d74fcc259fb303f1 Mon Sep 17 00:00:00 2001 From: Planet-Lab Support Date: Fri, 21 Jan 2005 03:35:09 +0000 Subject: [PATCH] This commit was manufactured by cvs2svn to create tag 'before-ckrm_E16-mem-controller-O1-merge'. --- .cvsignore | 13 + MAINTAINERS | 11 + Makefile | 6 +- arch/i386/Kconfig | 68 ++ arch/i386/boot/.cvsignore | 4 + arch/i386/boot/compressed/.cvsignore | 3 + arch/i386/boot/compressed/misc.c | 3 + arch/i386/boot/tools/.cvsignore | 1 + arch/i386/defconfig | 2 +- arch/i386/kernel/.cvsignore | 2 + arch/i386/kernel/Makefile | 1 + arch/i386/kernel/apic.c | 30 + arch/i386/kernel/asm-offsets.c | 2 +- arch/i386/kernel/entry.S | 49 +- arch/i386/kernel/i386_ksyms.c | 6 + arch/i386/kernel/i8259.c | 12 + arch/i386/kernel/init_task.c | 7 + arch/i386/kernel/io_apic.c | 31 + arch/i386/kernel/irq.c | 14 +- arch/i386/kernel/machine_kexec.c | 208 +++++ arch/i386/kernel/process.c | 26 + arch/i386/kernel/reboot.c | 81 +- arch/i386/kernel/relocate_kernel.S | 118 +++ configs/kernel-2.6.8-i686-planetlab.config | 10 +- drivers/block/cfq-iosched-orig.c | 706 ----------------- drivers/block/cfq-iosched.c | 408 ++++++---- drivers/block/ckrm-io.c | 280 +++---- drivers/block/ckrm-iostub.c | 18 +- drivers/char/.cvsignore | 2 + drivers/pci/.cvsignore | 3 + drivers/scsi/aic7xxx/.cvsignore | 4 + fs/aio.c | 2 +- include/.cvsignore | 1 + include/asm-i386/.cvsignore | 1 + include/asm-i386/apicdef.h | 1 + include/asm-i386/irq.h | 5 + include/asm-i386/kexec.h | 25 + include/asm-i386/module.h | 12 + include/asm-i386/processor.h | 8 +- include/asm-i386/segment.h | 2 + include/asm-i386/thread_info.h | 5 +- include/linux/.cvsignore | 3 + include/linux/ckrm-io.h | 1 + include/linux/fs.h | 9 + include/linux/kexec.h | 56 ++ .../linux/netfilter_ipv4/ip_conntrack_pptp.h | 310 -------- .../netfilter_ipv4/ip_conntrack_proto_gre.h | 123 --- include/linux/netfilter_ipv4/ip_nat_pptp.h | 11 - include/linux/reboot.h | 2 + kernel/.cvsignore | 2 + kernel/Makefile | 1 + kernel/ckrm/Makefile | 2 +- kernel/ckrm/ckrm_laq.c | 495 ------------ kernel/ckrm/ckrm_listenaq.c | 12 +- kernel/itimer.c | 4 +- kernel/kexec.c | 640 ++++++++++++++++ kernel/signal.c | 22 +- kernel/sys.c | 21 + lib/.cvsignore | 2 + net/ipv4/netfilter/ip_conntrack_pptp.c | 712 ------------------ net/ipv4/netfilter/ip_conntrack_pptp_priv.h | 24 - net/ipv4/netfilter/ip_conntrack_proto_gre.c | 349 --------- net/ipv4/netfilter/ip_nat_pptp.c | 477 ------------ net/ipv4/netfilter/ip_nat_proto_gre.c | 210 ------ net/ipv4/udp.c | 73 ++ scripts/.cvsignore | 4 + scripts/basic/.cvsignore | 3 + scripts/kconfig/.cvsignore | 5 + scripts/kernel-2.6-planetlab.spec | 2 +- scripts/lxdialog/.cvsignore | 1 + scripts/mod/.cvsignore | 3 + usr/.cvsignore | 3 + 72 files changed, 1933 insertions(+), 3840 deletions(-) create mode 100644 .cvsignore create mode 100644 arch/i386/boot/.cvsignore create mode 100644 arch/i386/boot/compressed/.cvsignore create mode 100644 arch/i386/boot/tools/.cvsignore create mode 100644 arch/i386/kernel/.cvsignore create mode 100644 arch/i386/kernel/machine_kexec.c create mode 100644 arch/i386/kernel/relocate_kernel.S delete mode 100644 drivers/block/cfq-iosched-orig.c create mode 100644 drivers/char/.cvsignore create mode 100644 drivers/pci/.cvsignore create mode 100644 drivers/scsi/aic7xxx/.cvsignore create mode 100644 include/.cvsignore create mode 100644 include/asm-i386/.cvsignore create mode 100644 include/asm-i386/kexec.h create mode 100644 include/linux/.cvsignore create mode 100644 include/linux/kexec.h delete mode 100644 
include/linux/netfilter_ipv4/ip_conntrack_pptp.h delete mode 100644 include/linux/netfilter_ipv4/ip_conntrack_proto_gre.h delete mode 100644 include/linux/netfilter_ipv4/ip_nat_pptp.h create mode 100644 kernel/.cvsignore delete mode 100644 kernel/ckrm/ckrm_laq.c create mode 100644 kernel/kexec.c create mode 100644 lib/.cvsignore delete mode 100644 net/ipv4/netfilter/ip_conntrack_pptp.c delete mode 100644 net/ipv4/netfilter/ip_conntrack_pptp_priv.h delete mode 100644 net/ipv4/netfilter/ip_conntrack_proto_gre.c delete mode 100644 net/ipv4/netfilter/ip_nat_pptp.c delete mode 100644 net/ipv4/netfilter/ip_nat_proto_gre.c create mode 100644 scripts/.cvsignore create mode 100644 scripts/basic/.cvsignore create mode 100644 scripts/kconfig/.cvsignore create mode 100644 scripts/lxdialog/.cvsignore create mode 100644 scripts/mod/.cvsignore create mode 100644 usr/.cvsignore diff --git a/.cvsignore b/.cvsignore new file mode 100644 index 000000000..5e7d07457 --- /dev/null +++ b/.cvsignore @@ -0,0 +1,13 @@ +.config +.tmp_System.map +.tmp_kallsyms1.S +.tmp_kallsyms2.S +.tmp_kallsyms3.S +.tmp_versions +.tmp_vmlinux1 +.tmp_vmlinux2 +.tmp_vmlinux3 +.version +Module.symvers +System.map +vmlinux diff --git a/MAINTAINERS b/MAINTAINERS index c8c25df43..523f115fb 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -1226,6 +1226,17 @@ W: http://nfs.sourceforge.net/ W: http://www.cse.unsw.edu.au/~neilb/patches/linux-devel/ S: Maintained +KEXEC +P: Eric Biederman +P: Randy Dunlap +M: ebiederm@xmission.com +M: rddunlap@osdl.org +W: http://www.xmission.com/~ebiederm/files/kexec/ +W: http://developer.osdl.org/rddunlap/kexec/ +L: linux-kernel@vger.kernel.org +L: fastboot@osdl.org +S: Maintained + LANMEDIA WAN CARD DRIVER P: Andrew Stanley-Jones M: asj@lanmedia.com diff --git a/Makefile b/Makefile index 4d94580e0..00c5f30c3 100644 --- a/Makefile +++ b/Makefile @@ -1,7 +1,7 @@ VERSION = 2 PATCHLEVEL = 6 SUBLEVEL = 8 -EXTRAVERSION = -1.521.2.5.planetlab +EXTRAVERSION = -1.521.3.planetlab.2004.12.14 NAME=Zonked Quokka # *DOCUMENTATION* @@ -453,6 +453,10 @@ ifndef CONFIG_FRAME_POINTER CFLAGS += -fomit-frame-pointer endif +ifdef CONFIG_X86_STACK_CHECK +CFLAGS += -p +endif + ifdef CONFIG_DEBUG_INFO CFLAGS += -g endif diff --git a/arch/i386/Kconfig b/arch/i386/Kconfig index 15b003b50..3a3ba7fec 100644 --- a/arch/i386/Kconfig +++ b/arch/i386/Kconfig @@ -926,6 +926,74 @@ config REGPARM generate incorrect output with certain kernel constructs when -mregparm=3 is used. +config IRQSTACKS + bool "Use separate IRQ stacks" + help + If you say Y here the kernel will use a separate IRQ stack on each + CPU to handle interrupts. + +config STACK_SIZE_SHIFT + int "Kernel stack size (12 => 4KB, 13 => 8KB, 14 => 16KB)" + range 12 14 + default 12 if IRQSTACKS + default 13 + help + Select kernel stack size. 4KB stacks are best as they let + the system scale further. Use 8KB stacks if you have an + experimental kernel where a stack overflow with a 4KB stack + might occur. Use 16KB stacks if you want to safely support + Windows device drivers using either Linuxant or ndiswrapper. + +config STACK_WARN + int "Print stack trace when stack grows beyond specified bytes" + default 4096 if IRQSTACKS + default 4096 + help + The kernel will print a stack trace when the current stack exceeds + the specified size. + +config X86_STACK_CHECK + bool "Check for stack overflows" + default n + help + Say Y here to have the kernel attempt to detect when the per-task + kernel stack overflows. + + Some older versions of gcc don't handle the -p option correctly. 
+ Kernprof is affected by the same problem, which is described here: + http://oss.sgi.com/projects/kernprof/faq.html#Q9 + + Basically, if you get oopses in __free_pages_ok during boot when + you have this turned on, you need to fix gcc. The Red Hat 2.96 + version and gcc-3.x seem to work. + + If not debugging a stack overflow problem, say N. + +config STACK_PANIC + int "Panic when stack approaches within specified bytes of the stack limit" + depends on X86_STACK_CHECK + default 512 if IRQSTACKS + default 512 + help + Panic if the stack grows to within the specified byte range. + +config KEXEC + bool "kexec system call (EXPERIMENTAL)" + depends on EXPERIMENTAL + help + kexec is a system call that implements the ability to shut down your + current kernel, and to start another kernel. It is like a reboot + but it is independent of the system firmware. And like a reboot + you can start any kernel with it, not just Linux. + + The name comes from the similarity to the exec system call. + + It is an ongoing process to be certain the hardware in a machine + is properly shut down, so do not be surprised if this code does not + initially work for you. It may help to enable device hotplugging + support. As of this writing the exact hardware interface is + strongly in flux, so no good recommendation can be made. + endmenu diff --git a/arch/i386/boot/.cvsignore b/arch/i386/boot/.cvsignore new file mode 100644 index 000000000..2d8a3afa4 --- /dev/null +++ b/arch/i386/boot/.cvsignore @@ -0,0 +1,4 @@ +bootsect +bzImage +setup +vmlinux.bin diff --git a/arch/i386/boot/compressed/.cvsignore b/arch/i386/boot/compressed/.cvsignore new file mode 100644 index 000000000..96b1b0022 --- /dev/null +++ b/arch/i386/boot/compressed/.cvsignore @@ -0,0 +1,3 @@ +vmlinux +vmlinux.bin +vmlinux.bin.gz diff --git a/arch/i386/boot/compressed/misc.c b/arch/i386/boot/compressed/misc.c index fa6704523..874568330 100644 --- a/arch/i386/boot/compressed/misc.c +++ b/arch/i386/boot/compressed/misc.c @@ -380,3 +380,6 @@ asmlinkage int decompress_kernel(struct moveparams *mv, void *rmode) if (high_loaded) close_output_buffer_if_we_run_high(mv); return high_loaded; } + +/* We don't actually check for stack overflows this early. 
*/ +__asm__(".globl mcount ; mcount: ret\n"); diff --git a/arch/i386/boot/tools/.cvsignore b/arch/i386/boot/tools/.cvsignore new file mode 100644 index 000000000..378eac25d --- /dev/null +++ b/arch/i386/boot/tools/.cvsignore @@ -0,0 +1 @@ +build diff --git a/arch/i386/defconfig b/arch/i386/defconfig index aed3bc298..ed2bbb54d 100644 --- a/arch/i386/defconfig +++ b/arch/i386/defconfig @@ -1221,7 +1221,7 @@ CONFIG_OPROFILE=y CONFIG_EARLY_PRINTK=y CONFIG_DEBUG_SPINLOCK_SLEEP=y # CONFIG_FRAME_POINTER is not set -CONFIG_4KSTACKS=y +# CONFIG_4KSTACKS is not set CONFIG_X86_FIND_SMP_CONFIG=y CONFIG_X86_MPPARSE=y diff --git a/arch/i386/kernel/.cvsignore b/arch/i386/kernel/.cvsignore new file mode 100644 index 000000000..21c28761b --- /dev/null +++ b/arch/i386/kernel/.cvsignore @@ -0,0 +1,2 @@ +asm-offsets.s +vmlinux.lds.s diff --git a/arch/i386/kernel/Makefile b/arch/i386/kernel/Makefile index a056d5068..ab1ef80d1 100644 --- a/arch/i386/kernel/Makefile +++ b/arch/i386/kernel/Makefile @@ -23,6 +23,7 @@ obj-$(CONFIG_X86_TRAMPOLINE) += trampoline.o obj-$(CONFIG_X86_MPPARSE) += mpparse.o obj-$(CONFIG_X86_LOCAL_APIC) += apic.o nmi.o obj-$(CONFIG_X86_IO_APIC) += io_apic.o +obj-$(CONFIG_KEXEC) += machine_kexec.o relocate_kernel.o obj-$(CONFIG_X86_NUMAQ) += numaq.o obj-$(CONFIG_X86_SUMMIT_NUMA) += summit.o obj-$(CONFIG_MODULES) += module.o diff --git a/arch/i386/kernel/apic.c b/arch/i386/kernel/apic.c index ecf2b632f..eb4d41628 100644 --- a/arch/i386/kernel/apic.c +++ b/arch/i386/kernel/apic.c @@ -193,6 +193,36 @@ void disconnect_bsp_APIC(void) outb(0x70, 0x22); outb(0x00, 0x23); } + else { + /* Go back to Virtual Wire compatibility mode */ + unsigned long value; + + /* For the spurious interrupt use vector F, and enable it */ + value = apic_read(APIC_SPIV); + value &= ~APIC_VECTOR_MASK; + value |= APIC_SPIV_APIC_ENABLED; + value |= 0xf; + apic_write_around(APIC_SPIV, value); + + /* For LVT0 make it edge triggered, active high, external and enabled */ + value = apic_read(APIC_LVT0); + value &= ~(APIC_MODE_MASK | APIC_SEND_PENDING | + APIC_INPUT_POLARITY | APIC_LVT_REMOTE_IRR | + APIC_LVT_LEVEL_TRIGGER | APIC_LVT_MASKED ); + value |= APIC_LVT_REMOTE_IRR | APIC_SEND_PENDING; + value = SET_APIC_DELIVERY_MODE(value, APIC_MODE_EXINT); + apic_write_around(APIC_LVT0, value); + + /* For LVT1 make it edge triggered, active high, nmi and enabled */ + value = apic_read(APIC_LVT1); + value &= ~( + APIC_MODE_MASK | APIC_SEND_PENDING | + APIC_INPUT_POLARITY | APIC_LVT_REMOTE_IRR | + APIC_LVT_LEVEL_TRIGGER | APIC_LVT_MASKED); + value |= APIC_LVT_REMOTE_IRR | APIC_SEND_PENDING; + value = SET_APIC_DELIVERY_MODE(value, APIC_MODE_NMI); + apic_write_around(APIC_LVT1, value); + } } void disable_local_APIC(void) diff --git a/arch/i386/kernel/asm-offsets.c b/arch/i386/kernel/asm-offsets.c index 43943f871..b03f579a6 100644 --- a/arch/i386/kernel/asm-offsets.c +++ b/arch/i386/kernel/asm-offsets.c @@ -7,11 +7,11 @@ #include #include #include +#include #include #include "sigframe.h" #include #include -#include #define DEFINE(sym, val) \ asm volatile("\n->" #sym " %0 " #val : : "i" (val)) diff --git a/arch/i386/kernel/entry.S b/arch/i386/kernel/entry.S index 3ac74183c..dfbade1b9 100644 --- a/arch/i386/kernel/entry.S +++ b/arch/i386/kernel/entry.S @@ -1029,8 +1029,55 @@ ENTRY(sys_call_table) .long sys_mq_timedreceive /* 280 */ .long sys_mq_notify .long sys_mq_getsetattr - .long sys_ni_syscall /* reserved for kexec */ + .long sys_kexec_load .long sys_ioprio_set .long sys_ioprio_get /* 285 */ syscall_table_size=(.-sys_call_table) + 
+#ifdef CONFIG_X86_STACK_CHECK +.data +.globl stack_overflowed +stack_overflowed: + .long 0 +.text + +ENTRY(mcount) +#warning stack check enabled + push %eax + movl $(THREAD_SIZE - 1),%eax + andl %esp,%eax + cmpl $STACK_WARN,%eax + jle 1f +2: + popl %eax + ret +1: + /* prevent infinite recursion from call to mcount from the + * stack_overflow function. Need to revisit this code for + * SMP based systems. + */ + lock; btsl $0,stack_overflowed + jc 2b + + /* prepare to jmp to stack_overflow directly, as if it were + * called directly by the caller of mcount. + */ + pushl %ebp + pushl %ebx + pushl %esi + pushl %edi + + call stack_overflow + /* Note that stack_overflow() will clear the stack_overflowed + * variable. + */ + + popl %edi + popl %esi + popl %ebx + popl %ebp + + popl %eax + ret +#endif diff --git a/arch/i386/kernel/i386_ksyms.c b/arch/i386/kernel/i386_ksyms.c index 5a50c536d..584982c3e 100644 --- a/arch/i386/kernel/i386_ksyms.c +++ b/arch/i386/kernel/i386_ksyms.c @@ -188,6 +188,12 @@ EXPORT_SYMBOL(atomic_dec_and_lock); EXPORT_SYMBOL(__PAGE_KERNEL); +#ifdef CONFIG_X86_STACK_CHECK +extern void mcount(void); +EXPORT_SYMBOL(mcount); +#endif + + #ifdef CONFIG_HIGHMEM EXPORT_SYMBOL(kmap); EXPORT_SYMBOL(kunmap); diff --git a/arch/i386/kernel/i8259.c b/arch/i386/kernel/i8259.c index 97653d20f..7141d27ec 100644 --- a/arch/i386/kernel/i8259.c +++ b/arch/i386/kernel/i8259.c @@ -244,9 +244,21 @@ static int i8259A_resume(struct sys_device *dev) return 0; } +static int i8259A_shutdown(struct sys_device *dev) +{ + /* Put the i8259A into a quiescent state that + * the kernel initialization code can get it + * out of. + */ + outb(0xff, 0x21); /* mask all of 8259A-1 */ + outb(0xff, 0xA1); /* mask all of 8259A-2 */ + return 0; +} + static struct sysdev_class i8259_sysdev_class = { set_kset_name("i8259"), .resume = i8259A_resume, + .shutdown = i8259A_shutdown, }; static struct sys_device device_i8259A = { diff --git a/arch/i386/kernel/init_task.c b/arch/i386/kernel/init_task.c index 7422d73ee..30cfd4085 100644 --- a/arch/i386/kernel/init_task.c +++ b/arch/i386/kernel/init_task.c @@ -29,6 +29,13 @@ union thread_union init_thread_union __attribute__((__section__(".data.init_task"))) = { INIT_THREAD_INFO(init_task, init_thread_union) }; +#ifdef CONFIG_X86_STACK_CHECK +union thread_union stack_overflow_stack + __attribute__((__section__(".data.init_task"))) = + { INIT_THREAD_INFO(init_task, stack_overflow_stack) }; +#endif + + /* * Initial task structure. * diff --git a/arch/i386/kernel/io_apic.c b/arch/i386/kernel/io_apic.c index 39af35d19..f600e6799 100644 --- a/arch/i386/kernel/io_apic.c +++ b/arch/i386/kernel/io_apic.c @@ -1604,11 +1604,42 @@ static void __init enable_IO_APIC(void) */ void disable_IO_APIC(void) { + int pin; /* * Clear the IO-APIC before rebooting: */ clear_IO_APIC(); + /* + * If the i8259 is routed through an IOAPIC + * Put that IOAPIC in virtual wire mode + * so legacy interrupts can be delivered. 
+ */ + pin = find_isa_irq_pin(0, mp_ExtINT); + if (pin != -1) { + struct IO_APIC_route_entry entry; + unsigned long flags; + + memset(&entry, 0, sizeof(entry)); + entry.mask = 0; /* Enabled */ + entry.trigger = 0; /* Edge */ + entry.irr = 0; + entry.polarity = 0; /* High */ + entry.delivery_status = 0; + entry.dest_mode = 0; /* Physical */ + entry.delivery_mode = 7; /* ExtInt */ + entry.vector = 0; + entry.dest.physical.physical_dest = 0; + + + /* + * Add it to the IO-APIC irq-routing table: + */ + spin_lock_irqsave(&ioapic_lock, flags); + io_apic_write(0, 0x11+2*pin, *(((int *)&entry)+1)); + io_apic_write(0, 0x10+2*pin, *(((int *)&entry)+0)); + spin_unlock_irqrestore(&ioapic_lock, flags); + } disconnect_bsp_APIC(); } diff --git a/arch/i386/kernel/irq.c b/arch/i386/kernel/irq.c index 22f7fc771..1c8bedaeb 100644 --- a/arch/i386/kernel/irq.c +++ b/arch/i386/kernel/irq.c @@ -76,8 +76,10 @@ static void register_irq_proc (unsigned int irq); /* * per-CPU IRQ handling stacks */ +#ifdef CONFIG_IRQSTACKS union irq_ctx *hardirq_ctx[NR_CPUS]; union irq_ctx *softirq_ctx[NR_CPUS]; +#endif /* * Special irq handlers. @@ -220,6 +222,9 @@ asmlinkage int handle_IRQ_event(unsigned int irq, int status = 1; /* Force the "do bottom halves" bit */ int retval = 0; + if (!(action->flags & SA_INTERRUPT)) + local_irq_enable(); + do { status |= action->flags; retval |= action->handler(irq, action->dev_id, regs); @@ -489,10 +494,12 @@ asmlinkage unsigned int do_IRQ(struct pt_regs regs) u32 *isp; union irq_ctx * curctx; union irq_ctx * irqctx; - +#ifdef CONFIG_IRQSTACKS curctx = (union irq_ctx *) current_thread_info(); irqctx = hardirq_ctx[smp_processor_id()]; - +#else + curctx = irqctx = (union irq_ctx *)0; +#endif spin_unlock(&desc->lock); /* @@ -536,7 +543,6 @@ asmlinkage unsigned int do_IRQ(struct pt_regs regs) break; desc->status &= ~IRQ_PENDING; } - desc->status &= ~IRQ_INPROGRESS; out: @@ -1095,6 +1101,7 @@ void init_irq_proc (void) } +#ifdef CONFIG_IRQSTACKS /* * These should really be __section__(".bss.page_aligned") as well, but * gcc's 3.0 and earlier don't handle that correctly. @@ -1174,3 +1181,4 @@ asmlinkage void do_softirq(void) } EXPORT_SYMBOL(do_softirq); +#endif diff --git a/arch/i386/kernel/machine_kexec.c b/arch/i386/kernel/machine_kexec.c new file mode 100644 index 000000000..3a9e878f8 --- /dev/null +++ b/arch/i386/kernel/machine_kexec.c @@ -0,0 +1,208 @@ +/* + * machine_kexec.c - handle transition of Linux booting another kernel + * Copyright (C) 2002-2004 Eric Biederman + * + * This source code is licensed under the GNU General Public License, + * Version 2. See the file COPYING for more details. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +static inline unsigned long read_cr3(void) +{ + unsigned long cr3; + asm volatile("movl %%cr3,%0": "=r"(cr3)); + return cr3; +} + +#define PAGE_ALIGNED __attribute__ ((__aligned__(PAGE_SIZE))) + +#define L0_ATTR (_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | _PAGE_DIRTY) +#define L1_ATTR (_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | _PAGE_DIRTY) +#define L2_ATTR (_PAGE_PRESENT) + +#define LEVEL0_SIZE (1UL << 12UL) + +#ifndef CONFIG_X86_PAE +#define LEVEL1_SIZE (1UL << 22UL) +static u32 pgtable_level1[1024] PAGE_ALIGNED; + +static void identity_map_page(unsigned long address) +{ + unsigned long level1_index, level2_index; + u32 *pgtable_level2; + + /* Find the current page table */ + pgtable_level2 = __va(read_cr3()); + + /* Find the indexes of the physical address to identity map */ + level1_index = (address % LEVEL1_SIZE)/LEVEL0_SIZE; + level2_index = address / LEVEL1_SIZE; + + /* Identity map the page table entry */ + pgtable_level1[level1_index] = address | L0_ATTR; + pgtable_level2[level2_index] = __pa(pgtable_level1) | L1_ATTR; + + /* Flush the tlb so the new mapping takes effect. + * Global tlb entries are not flushed but that is not an issue. + */ + load_cr3(pgtable_level2); +} + +#else +#define LEVEL1_SIZE (1UL << 21UL) +#define LEVEL2_SIZE (1UL << 30UL) +static u64 pgtable_level1[512] PAGE_ALIGNED; +static u64 pgtable_level2[512] PAGE_ALIGNED; + +static void identity_map_page(unsigned long address) +{ + unsigned long level1_index, level2_index, level3_index; + u64 *pgtable_level3; + + /* Find the current page table */ + pgtable_level3 = __va(read_cr3()); + + /* Find the indexes of the physical address to identity map */ + level1_index = (address % LEVEL1_SIZE)/LEVEL0_SIZE; + level2_index = (address % LEVEL2_SIZE)/LEVEL1_SIZE; + level3_index = address / LEVEL2_SIZE; + + /* Identity map the page table entry */ + pgtable_level1[level1_index] = address | L0_ATTR; + pgtable_level2[level2_index] = __pa(pgtable_level1) | L1_ATTR; + set_64bit(&pgtable_level3[level3_index], __pa(pgtable_level2) | L2_ATTR); + + /* Flush the tlb so the new mapping takes effect. + * Global tlb entries are not flushed but that is not an issue. 
+ */ + load_cr3(pgtable_level3); +} +#endif + + +static void set_idt(void *newidt, __u16 limit) +{ + unsigned char curidt[6]; + + /* ia32 supports unaligned loads & stores */ + (*(__u16 *)(curidt)) = limit; + (*(__u32 *)(curidt +2)) = (unsigned long)(newidt); + + __asm__ __volatile__ ( + "lidt %0\n" + : "=m" (curidt) + ); +}; + + +static void set_gdt(void *newgdt, __u16 limit) +{ + unsigned char curgdt[6]; + + /* ia32 supports unaligned loads & stores */ + (*(__u16 *)(curgdt)) = limit; + (*(__u32 *)(curgdt +2)) = (unsigned long)(newgdt); + + __asm__ __volatile__ ( + "lgdt %0\n" + : "=m" (curgdt) + ); +}; + +static void load_segments(void) +{ +#define __STR(X) #X +#define STR(X) __STR(X) + + __asm__ __volatile__ ( + "\tljmp $"STR(__KERNEL_CS)",$1f\n" + "\t1:\n" + "\tmovl $"STR(__KERNEL_DS)",%eax\n" + "\tmovl %eax,%ds\n" + "\tmovl %eax,%es\n" + "\tmovl %eax,%fs\n" + "\tmovl %eax,%gs\n" + "\tmovl %eax,%ss\n" + ); +#undef STR +#undef __STR +} + +typedef asmlinkage void (*relocate_new_kernel_t)( + unsigned long indirection_page, unsigned long reboot_code_buffer, + unsigned long start_address, unsigned int has_pae); + +const extern unsigned char relocate_new_kernel[]; +extern void relocate_new_kernel_end(void); +const extern unsigned int relocate_new_kernel_size; + +/* + * Do whatever setup is needed on image and the + * reboot code buffer to allow us to avoid allocations + * later. Currently nothing. + */ +int machine_kexec_prepare(struct kimage *image) +{ + return 0; +} + +void machine_kexec_cleanup(struct kimage *image) +{ +} + +/* + * Do not allocate memory (or fail in any way) in machine_kexec(). + * We are past the point of no return, committed to rebooting now. + */ +void machine_kexec(struct kimage *image) +{ + unsigned long indirection_page; + unsigned long reboot_code_buffer; + relocate_new_kernel_t rnk; + + /* Interrupts aren't acceptable while we reboot */ + local_irq_disable(); + + /* Compute some offsets */ + reboot_code_buffer = page_to_pfn(image->control_code_page) << PAGE_SHIFT; + indirection_page = image->head & PAGE_MASK; + + /* Set up an identity mapping for the reboot_code_buffer */ + identity_map_page(reboot_code_buffer); + + /* copy it out */ + memcpy((void *)reboot_code_buffer, relocate_new_kernel, relocate_new_kernel_size); + + /* The segment registers are funny things, they are + * automatically loaded from a table, in memory wherever you + * set them to a specific selector, but this table is never + * accessed again unless you set the segment to a different selector. + * + * The more common model is a cache, where the behind-the-scenes + * work is done, but is also dropped at arbitrary + * times. + * + * I take advantage of this here by force loading the + * segments, before I zap the gdt with an invalid value. + */ + load_segments(); + /* The gdt & idt are now invalid. + * If you want to load them you must set up your own idt & gdt. 
+ */ + set_gdt(phys_to_virt(0),0); + set_idt(phys_to_virt(0),0); + + /* now call it */ + rnk = (relocate_new_kernel_t) reboot_code_buffer; + (*rnk)(indirection_page, reboot_code_buffer, image->start, cpu_has_pae); +} diff --git a/arch/i386/kernel/process.c b/arch/i386/kernel/process.c index 3093d1fc6..e8a01f2b5 100644 --- a/arch/i386/kernel/process.c +++ b/arch/i386/kernel/process.c @@ -219,6 +219,32 @@ static int __init idle_setup (char *str) __setup("idle=", idle_setup); +void stack_overflow(void) +{ + extern unsigned long stack_overflowed; + unsigned long esp = current_stack_pointer(); + int panicking = ((esp&(THREAD_SIZE-1)) <= STACK_PANIC); + + oops_in_progress = 1; + printk( "esp: 0x%lx masked: 0x%lx STACK_PANIC:0x%lx %d %d\n", + esp, (esp&(THREAD_SIZE-1)), STACK_PANIC, + (((esp&(THREAD_SIZE-1)) <= STACK_PANIC)), panicking); + show_trace(current,(void*)esp); + + if (panicking) + panic("stack overflow\n"); + + oops_in_progress = 0; + + /* Just let it happen once per task, as otherwise it goes nuts + * in printing stack traces. This means that I need to dump + * the stack_overflowed boolean into the task or thread_info + * structure. For now just turn it off altogether. + */ + + /* stack_overflowed = 0; */ +} + void show_regs(struct pt_regs * regs) { unsigned long cr0 = 0L, cr2 = 0L, cr3 = 0L, cr4 = 0L; diff --git a/arch/i386/kernel/reboot.c b/arch/i386/kernel/reboot.c index e8d5cd3ab..85e89f94b 100644 --- a/arch/i386/kernel/reboot.c +++ b/arch/i386/kernel/reboot.c @@ -23,7 +23,6 @@ static int reboot_mode; int reboot_thru_bios; #ifdef CONFIG_SMP -int reboot_smp = 0; static int reboot_cpu = -1; /* shamelessly grabbed from lib/vsprintf.c for readability */ #define is_digit(c) ((c) >= '0' && (c) <= '9') @@ -85,33 +84,9 @@ static int __init set_bios_reboot(struct dmi_system_id *d) return 0; } -/* - * Some machines require the "reboot=s" commandline option, this quirk makes that automatic. - */ -static int __init set_smp_reboot(struct dmi_system_id *d) -{ -#ifdef CONFIG_SMP - if (!reboot_smp) { - reboot_smp = 1; - printk(KERN_INFO "%s series board detected. Selecting SMP-method for reboots.\n", d->ident); - } -#endif - return 0; -} - -/* - * Some machines require the "reboot=b,s" commandline option, this quirk makes that automatic. - */ -static int __init set_smp_bios_reboot(struct dmi_system_id *d) -{ - set_smp_reboot(d); - set_bios_reboot(d); - return 0; -} - static struct dmi_system_id __initdata reboot_dmi_table[] = { { /* Handle problems with rebooting on Dell 1300's */ - .callback = set_smp_bios_reboot, + .callback = set_bios_reboot, .ident = "Dell PowerEdge 1300", .matches = { DMI_MATCH(DMI_SYS_VENDOR, "Dell Computer Corporation"), @@ -294,41 +269,32 @@ void machine_real_restart(unsigned char *code, int length) : "i" ((void *) (0x1000 - sizeof (real_mode_switch) - 100))); } -void machine_restart(char * __unused) +void machine_shutdown(void) { #ifdef CONFIG_SMP - int cpuid; - - cpuid = GET_APIC_ID(apic_read(APIC_ID)); - - if (reboot_smp) { - - /* check to see if reboot_cpu is valid - if its not, default to the BSP */ - if ((reboot_cpu == -1) || - (reboot_cpu > (NR_CPUS -1)) || - !physid_isset(cpuid, phys_cpu_present_map)) - reboot_cpu = boot_cpu_physical_apicid; - - reboot_smp = 0; /* use this as a flag to only go through this once*/ - /* re-run this function on the other CPUs - it will fall though this section since we have - cleared reboot_smp, and do the reboot if it is the - correct CPU, otherwise it halts. 
*/ - if (reboot_cpu != cpuid) - smp_call_function((void *)machine_restart , NULL, 1, 0); + int reboot_cpu_id; + + /* The boot cpu is always logical cpu 0 */ + reboot_cpu_id = 0; + + /* See if a command line override has been given */ + if ((reboot_cpu != -1) && (reboot_cpu < NR_CPUS) && + cpu_isset(reboot_cpu, cpu_online_map)) { + reboot_cpu_id = reboot_cpu; } - /* if reboot_cpu is still -1, then we want a tradional reboot, - and if we are not running on the reboot_cpu,, halt */ - if ((reboot_cpu != -1) && (cpuid != reboot_cpu)) { - for (;;) - __asm__ __volatile__ ("hlt"); + /* Make certain the cpu I'm rebooting on is online */ + if (!cpu_isset(reboot_cpu_id, cpu_online_map)) { + reboot_cpu_id = smp_processor_id(); } - /* - * Stop all CPUs and turn off local APICs and the IO-APIC, so - * other OSs see a clean IRQ state. + + /* Make certain I only run on the appropriate processor */ + set_cpus_allowed(current, cpumask_of_cpu(reboot_cpu_id)); + + /* O.K. Now that I'm on the appropriate processor, stop + * all of the others, and disable their local APICs. */ + if (!netdump_mode) smp_send_stop(); #elif defined(CONFIG_X86_LOCAL_APIC) @@ -341,6 +307,11 @@ void machine_restart(char * __unused) #ifdef CONFIG_X86_IO_APIC disable_IO_APIC(); #endif +} + +void machine_restart(char * __unused) +{ + machine_shutdown(); if (!reboot_thru_bios) { if (efi_enabled) { diff --git a/arch/i386/kernel/relocate_kernel.S b/arch/i386/kernel/relocate_kernel.S new file mode 100644 index 000000000..54be4c2ae --- /dev/null +++ b/arch/i386/kernel/relocate_kernel.S @@ -0,0 +1,118 @@ +/* + * relocate_kernel.S - put the kernel image in place to boot + * Copyright (C) 2002-2004 Eric Biederman + * + * This source code is licensed under the GNU General Public License, + * Version 2. See the file COPYING for more details. + */ + +#include + + /* + * Must be relocatable PIC code callable as a C function, that once + * it starts cannot use the previous process's stack. + */ + .globl relocate_new_kernel +relocate_new_kernel: + /* read the arguments and say goodbye to the stack */ + movl 4(%esp), %ebx /* indirection_page */ + movl 8(%esp), %ebp /* reboot_code_buffer */ + movl 12(%esp), %edx /* start address */ + movl 16(%esp), %ecx /* cpu_has_pae */ + + /* zero out flags, and disable interrupts */ + pushl $0 + popfl + + /* set a new stack at the bottom of our page... */ + lea 4096(%ebp), %esp + + /* store the parameters back on the stack */ + pushl %edx /* store the start address */ + + /* Set cr0 to a known state: + * 31 0 == Paging disabled + * 18 0 == Alignment check disabled + * 16 0 == Write protect disabled + * 3 0 == No task switch + * 2 0 == Don't do FP software emulation. + * 0 1 == Protected mode enabled + */ + movl %cr0, %eax + andl $~((1<<31)|(1<<18)|(1<<16)|(1<<3)|(1<<2)), %eax + orl $(1<<0), %eax + movl %eax, %cr0 + + /* clear cr4 if applicable */ + testl %ecx, %ecx + jz 1f + /* Set cr4 to a known state: + * Setting everything to zero seems safe. + */ + movl %cr4, %eax + andl $0, %eax + movl %eax, %cr4 + + jmp 1f +1: + + /* Flush the TLB (needed?) 
*/ + xorl %eax, %eax + movl %eax, %cr3 + + /* Do the copies */ + cld +0: /* top, read another word for the indirection page */ + movl %ebx, %ecx + movl (%ebx), %ecx + addl $4, %ebx + testl $0x1, %ecx /* is it a destination page */ + jz 1f + movl %ecx, %edi + andl $0xfffff000, %edi + jmp 0b +1: + testl $0x2, %ecx /* is it an indirection page */ + jz 1f + movl %ecx, %ebx + andl $0xfffff000, %ebx + jmp 0b +1: + testl $0x4, %ecx /* is it the done indicator */ + jz 1f + jmp 2f +1: + testl $0x8, %ecx /* is it the source indicator */ + jz 0b /* Ignore it otherwise */ + movl %ecx, %esi /* For every source page do a copy */ + andl $0xfffff000, %esi + + movl $1024, %ecx + rep ; movsl + jmp 0b + +2: + + /* To be certain of avoiding problems with self-modifying code + * I need to execute a serializing instruction here. + * So I flush the TLB, it's handy, and not processor dependent. + */ + xorl %eax, %eax + movl %eax, %cr3 + + /* set all of the registers to known values */ + /* leave %esp alone */ + + xorl %eax, %eax + xorl %ebx, %ebx + xorl %ecx, %ecx + xorl %edx, %edx + xorl %esi, %esi + xorl %edi, %edi + xorl %ebp, %ebp + ret +relocate_new_kernel_end: + + .globl relocate_new_kernel_size +relocate_new_kernel_size: + .long relocate_new_kernel_end - relocate_new_kernel diff --git a/configs/kernel-2.6.8-i686-planetlab.config b/configs/kernel-2.6.8-i686-planetlab.config index ea66387e5..fd74aed78 100644 --- a/configs/kernel-2.6.8-i686-planetlab.config +++ b/configs/kernel-2.6.8-i686-planetlab.config @@ -30,7 +30,7 @@ CONFIG_RCFS_FS=y CONFIG_CKRM_TYPE_TASKCLASS=y CONFIG_CKRM_RES_NUMTASKS=y CONFIG_CKRM_CPU_SCHEDULE=y -CONFIG_CKRM_RES_BLKIO=y +# CONFIG_CKRM_RES_BLKIO is not set # CONFIG_CKRM_RES_MEM is not set # CONFIG_CKRM_TYPE_SOCKETCLASS is not set CONFIG_CKRM_RBCE=y @@ -140,6 +140,12 @@ CONFIG_HIGHPTE=y # CONFIG_MATH_EMULATION is not set CONFIG_MTRR=y CONFIG_REGPARM=y +CONFIG_IRQSTACKS=y +CONFIG_STACK_SIZE_SHIFT=13 +CONFIG_STACK_WARN=4000 +CONFIG_X86_STACK_CHECK=y +CONFIG_STACK_PANIC=512 +CONFIG_KEXEC=y # # Power management options (ACPI, APM) @@ -211,7 +217,7 @@ CONFIG_PREVENT_FIRMWARE_BUILD=y # # Block devices # -# CONFIG_BLK_DEV_FD is not set +CONFIG_BLK_DEV_FD=m # CONFIG_BLK_DEV_XD is not set CONFIG_BLK_CPQ_DA=m CONFIG_BLK_CPQ_CISS_DA=m diff --git a/drivers/block/cfq-iosched-orig.c b/drivers/block/cfq-iosched-orig.c deleted file mode 100644 index 977d32ddd..000000000 --- a/drivers/block/cfq-iosched-orig.c +++ /dev/null @@ -1,706 +0,0 @@ -/* - * linux/drivers/block/cfq-iosched.c - * - * CFQ, or complete fairness queueing, disk scheduler. - * - * Based on ideas from a previously unfinished io - * scheduler (round robin per-process disk scheduling) and Andrea Arcangeli. 
- * - * Copyright (C) 2003 Jens Axboe - */ -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -/* - * tunables - */ -static int cfq_quantum = 4; -static int cfq_queued = 8; - -#define CFQ_QHASH_SHIFT 6 -#define CFQ_QHASH_ENTRIES (1 << CFQ_QHASH_SHIFT) -#define list_entry_qhash(entry) list_entry((entry), struct cfq_queue, cfq_hash) - -#define CFQ_MHASH_SHIFT 8 -#define CFQ_MHASH_BLOCK(sec) ((sec) >> 3) -#define CFQ_MHASH_ENTRIES (1 << CFQ_MHASH_SHIFT) -#define CFQ_MHASH_FN(sec) (hash_long(CFQ_MHASH_BLOCK((sec)),CFQ_MHASH_SHIFT)) -#define ON_MHASH(crq) !list_empty(&(crq)->hash) -#define rq_hash_key(rq) ((rq)->sector + (rq)->nr_sectors) -#define list_entry_hash(ptr) list_entry((ptr), struct cfq_rq, hash) - -#define list_entry_cfqq(ptr) list_entry((ptr), struct cfq_queue, cfq_list) - -#define RQ_DATA(rq) ((struct cfq_rq *) (rq)->elevator_private) - -static kmem_cache_t *crq_pool; -static kmem_cache_t *cfq_pool; -static mempool_t *cfq_mpool; - -struct cfq_data { - struct list_head rr_list; - struct list_head *dispatch; - struct list_head *cfq_hash; - - struct list_head *crq_hash; - - unsigned int busy_queues; - unsigned int max_queued; - - mempool_t *crq_pool; -}; - -struct cfq_queue { - struct list_head cfq_hash; - struct list_head cfq_list; - struct rb_root sort_list; - int pid; - int queued[2]; -#if 0 - /* - * with a simple addition like this, we can do io priorities. almost. - * does need a split request free list, too. - */ - int io_prio -#endif -}; - -struct cfq_rq { - struct rb_node rb_node; - sector_t rb_key; - - struct request *request; - - struct cfq_queue *cfq_queue; - - struct list_head hash; -}; - -static void cfq_put_queue(struct cfq_data *cfqd, struct cfq_queue *cfqq); -static struct cfq_queue *cfq_find_cfq_hash(struct cfq_data *cfqd, int pid); -static void cfq_dispatch_sort(struct list_head *head, struct cfq_rq *crq); - -/* - * lots of deadline iosched dupes, can be abstracted later... 
- */ -static inline void __cfq_del_crq_hash(struct cfq_rq *crq) -{ - list_del_init(&crq->hash); -} - -static inline void cfq_del_crq_hash(struct cfq_rq *crq) -{ - if (ON_MHASH(crq)) - __cfq_del_crq_hash(crq); -} - -static void cfq_remove_merge_hints(request_queue_t *q, struct cfq_rq *crq) -{ - cfq_del_crq_hash(crq); - - if (q->last_merge == crq->request) - q->last_merge = NULL; -} - -static inline void cfq_add_crq_hash(struct cfq_data *cfqd, struct cfq_rq *crq) -{ - struct request *rq = crq->request; - - BUG_ON(ON_MHASH(crq)); - - list_add(&crq->hash, &cfqd->crq_hash[CFQ_MHASH_FN(rq_hash_key(rq))]); -} - -static struct request *cfq_find_rq_hash(struct cfq_data *cfqd, sector_t offset) -{ - struct list_head *hash_list = &cfqd->crq_hash[CFQ_MHASH_FN(offset)]; - struct list_head *entry, *next = hash_list->next; - - while ((entry = next) != hash_list) { - struct cfq_rq *crq = list_entry_hash(entry); - struct request *__rq = crq->request; - - next = entry->next; - - BUG_ON(!ON_MHASH(crq)); - - if (!rq_mergeable(__rq)) { - __cfq_del_crq_hash(crq); - continue; - } - - if (rq_hash_key(__rq) == offset) - return __rq; - } - - return NULL; -} - -/* - * rb tree support functions - */ -#define RB_NONE (2) -#define RB_EMPTY(node) ((node)->rb_node == NULL) -#define RB_CLEAR(node) ((node)->rb_color = RB_NONE) -#define RB_CLEAR_ROOT(root) ((root)->rb_node = NULL) -#define ON_RB(node) ((node)->rb_color != RB_NONE) -#define rb_entry_crq(node) rb_entry((node), struct cfq_rq, rb_node) -#define rq_rb_key(rq) (rq)->sector - -static inline void cfq_del_crq_rb(struct cfq_queue *cfqq, struct cfq_rq *crq) -{ - if (ON_RB(&crq->rb_node)) { - cfqq->queued[rq_data_dir(crq->request)]--; - rb_erase(&crq->rb_node, &cfqq->sort_list); - crq->cfq_queue = NULL; - } -} - -static struct cfq_rq * -__cfq_add_crq_rb(struct cfq_queue *cfqq, struct cfq_rq *crq) -{ - struct rb_node **p = &cfqq->sort_list.rb_node; - struct rb_node *parent = NULL; - struct cfq_rq *__crq; - - while (*p) { - parent = *p; - __crq = rb_entry_crq(parent); - - if (crq->rb_key < __crq->rb_key) - p = &(*p)->rb_left; - else if (crq->rb_key > __crq->rb_key) - p = &(*p)->rb_right; - else - return __crq; - } - - rb_link_node(&crq->rb_node, parent, p); - return 0; -} - -static void -cfq_add_crq_rb(struct cfq_data *cfqd, struct cfq_queue *cfqq,struct cfq_rq *crq) -{ - struct request *rq = crq->request; - struct cfq_rq *__alias; - - crq->rb_key = rq_rb_key(rq); - cfqq->queued[rq_data_dir(rq)]++; -retry: - __alias = __cfq_add_crq_rb(cfqq, crq); - if (!__alias) { - rb_insert_color(&crq->rb_node, &cfqq->sort_list); - crq->cfq_queue = cfqq; - return; - } - - cfq_del_crq_rb(cfqq, __alias); - cfq_dispatch_sort(cfqd->dispatch, __alias); - goto retry; -} - -static struct request * -cfq_find_rq_rb(struct cfq_data *cfqd, sector_t sector) -{ - struct cfq_queue *cfqq = cfq_find_cfq_hash(cfqd, current->tgid); - struct rb_node *n; - - if (!cfqq) - goto out; - - n = cfqq->sort_list.rb_node; - while (n) { - struct cfq_rq *crq = rb_entry_crq(n); - - if (sector < crq->rb_key) - n = n->rb_left; - else if (sector > crq->rb_key) - n = n->rb_right; - else - return crq->request; - } - -out: - return NULL; -} - -static void cfq_remove_request(request_queue_t *q, struct request *rq) -{ - struct cfq_data *cfqd = q->elevator.elevator_data; - struct cfq_rq *crq = RQ_DATA(rq); - - if (crq) { - struct cfq_queue *cfqq = crq->cfq_queue; - - cfq_remove_merge_hints(q, crq); - list_del_init(&rq->queuelist); - - if (cfqq) { - cfq_del_crq_rb(cfqq, crq); - - if (RB_EMPTY(&cfqq->sort_list)) - 
cfq_put_queue(cfqd, cfqq); - } - } -} - -static int -cfq_merge(request_queue_t *q, struct request **req, struct bio *bio) -{ - struct cfq_data *cfqd = q->elevator.elevator_data; - struct request *__rq; - int ret; - - ret = elv_try_last_merge(q, bio); - if (ret != ELEVATOR_NO_MERGE) { - __rq = q->last_merge; - goto out_insert; - } - - __rq = cfq_find_rq_hash(cfqd, bio->bi_sector); - if (__rq) { - BUG_ON(__rq->sector + __rq->nr_sectors != bio->bi_sector); - - if (elv_rq_merge_ok(__rq, bio)) { - ret = ELEVATOR_BACK_MERGE; - goto out; - } - } - - __rq = cfq_find_rq_rb(cfqd, bio->bi_sector + bio_sectors(bio)); - if (__rq) { - if (elv_rq_merge_ok(__rq, bio)) { - ret = ELEVATOR_FRONT_MERGE; - goto out; - } - } - - return ELEVATOR_NO_MERGE; -out: - q->last_merge = __rq; -out_insert: - *req = __rq; - return ret; -} - -static void cfq_merged_request(request_queue_t *q, struct request *req) -{ - struct cfq_data *cfqd = q->elevator.elevator_data; - struct cfq_rq *crq = RQ_DATA(req); - - cfq_del_crq_hash(crq); - cfq_add_crq_hash(cfqd, crq); - - if (ON_RB(&crq->rb_node) && (rq_rb_key(req) != crq->rb_key)) { - struct cfq_queue *cfqq = crq->cfq_queue; - - cfq_del_crq_rb(cfqq, crq); - cfq_add_crq_rb(cfqd, cfqq, crq); - } - - q->last_merge = req; -} - -static void -cfq_merged_requests(request_queue_t *q, struct request *req, - struct request *next) -{ - cfq_merged_request(q, req); - cfq_remove_request(q, next); -} - -static void cfq_dispatch_sort(struct list_head *head, struct cfq_rq *crq) -{ - struct list_head *entry = head; - struct request *__rq; - - if (!list_empty(head)) { - __rq = list_entry_rq(head->next); - - if (crq->request->sector < __rq->sector) { - entry = head->prev; - goto link; - } - } - - while ((entry = entry->prev) != head) { - __rq = list_entry_rq(entry); - - if (crq->request->sector <= __rq->sector) - break; - } - -link: - list_add_tail(&crq->request->queuelist, entry); -} - -static inline void -__cfq_dispatch_requests(request_queue_t *q, struct cfq_data *cfqd, - struct cfq_queue *cfqq) -{ - struct cfq_rq *crq = rb_entry_crq(rb_first(&cfqq->sort_list)); - - cfq_del_crq_rb(cfqq, crq); - cfq_remove_merge_hints(q, crq); - cfq_dispatch_sort(cfqd->dispatch, crq); -} - -static int cfq_dispatch_requests(request_queue_t *q, struct cfq_data *cfqd) -{ - struct cfq_queue *cfqq; - struct list_head *entry, *tmp; - int ret, queued, good_queues; - - if (list_empty(&cfqd->rr_list)) - return 0; - - queued = ret = 0; -restart: - good_queues = 0; - list_for_each_safe(entry, tmp, &cfqd->rr_list) { - cfqq = list_entry_cfqq(cfqd->rr_list.next); - - BUG_ON(RB_EMPTY(&cfqq->sort_list)); - - __cfq_dispatch_requests(q, cfqd, cfqq); - - if (RB_EMPTY(&cfqq->sort_list)) - cfq_put_queue(cfqd, cfqq); - else - good_queues++; - - queued++; - ret = 1; - } - - if ((queued < cfq_quantum) && good_queues) - goto restart; - - return ret; -} - -static struct request *cfq_next_request(request_queue_t *q) -{ - struct cfq_data *cfqd = q->elevator.elevator_data; - struct request *rq; - - if (!list_empty(cfqd->dispatch)) { - struct cfq_rq *crq; -dispatch: - rq = list_entry_rq(cfqd->dispatch->next); - - crq = RQ_DATA(rq); - if (crq) - cfq_remove_merge_hints(q, crq); - - return rq; - } - - if (cfq_dispatch_requests(q, cfqd)) - goto dispatch; - - return NULL; -} - -static inline struct cfq_queue * -__cfq_find_cfq_hash(struct cfq_data *cfqd, int pid, const int hashval) -{ - struct list_head *hash_list = &cfqd->cfq_hash[hashval]; - struct list_head *entry; - - list_for_each(entry, hash_list) { - struct cfq_queue *__cfqq = 
list_entry_qhash(entry); - - if (__cfqq->pid == pid) - return __cfqq; - } - - return NULL; -} - -static struct cfq_queue *cfq_find_cfq_hash(struct cfq_data *cfqd, int pid) -{ - const int hashval = hash_long(current->tgid, CFQ_QHASH_SHIFT); - - return __cfq_find_cfq_hash(cfqd, pid, hashval); -} - -static void cfq_put_queue(struct cfq_data *cfqd, struct cfq_queue *cfqq) -{ - cfqd->busy_queues--; - list_del(&cfqq->cfq_list); - list_del(&cfqq->cfq_hash); - mempool_free(cfqq, cfq_mpool); -} - -static struct cfq_queue *cfq_get_queue(struct cfq_data *cfqd, int pid) -{ - const int hashval = hash_long(current->tgid, CFQ_QHASH_SHIFT); - struct cfq_queue *cfqq = __cfq_find_cfq_hash(cfqd, pid, hashval); - - if (!cfqq) { - cfqq = mempool_alloc(cfq_mpool, GFP_NOIO); - - INIT_LIST_HEAD(&cfqq->cfq_hash); - INIT_LIST_HEAD(&cfqq->cfq_list); - RB_CLEAR_ROOT(&cfqq->sort_list); - - cfqq->pid = pid; - cfqq->queued[0] = cfqq->queued[1] = 0; - list_add(&cfqq->cfq_hash, &cfqd->cfq_hash[hashval]); - } - - return cfqq; -} - -static void cfq_enqueue(struct cfq_data *cfqd, struct cfq_rq *crq) -{ - struct cfq_queue *cfqq; - - cfqq = cfq_get_queue(cfqd, current->tgid); - - cfq_add_crq_rb(cfqd, cfqq, crq); - - if (list_empty(&cfqq->cfq_list)) { - list_add(&cfqq->cfq_list, &cfqd->rr_list); - cfqd->busy_queues++; - } -} - -static void -cfq_insert_request(request_queue_t *q, struct request *rq, int where) -{ - struct cfq_data *cfqd = q->elevator.elevator_data; - struct cfq_rq *crq = RQ_DATA(rq); - - switch (where) { - case ELEVATOR_INSERT_BACK: - while (cfq_dispatch_requests(q, cfqd)) - ; - list_add_tail(&rq->queuelist, cfqd->dispatch); - break; - case ELEVATOR_INSERT_FRONT: - list_add(&rq->queuelist, cfqd->dispatch); - break; - case ELEVATOR_INSERT_SORT: - BUG_ON(!blk_fs_request(rq)); - cfq_enqueue(cfqd, crq); - break; - default: - printk("%s: bad insert point %d\n", __FUNCTION__,where); - return; - } - - if (rq_mergeable(rq)) { - cfq_add_crq_hash(cfqd, crq); - - if (!q->last_merge) - q->last_merge = rq; - } -} - -static int cfq_queue_empty(request_queue_t *q) -{ - struct cfq_data *cfqd = q->elevator.elevator_data; - - if (list_empty(cfqd->dispatch) && list_empty(&cfqd->rr_list)) - return 1; - - return 0; -} - -static struct request * -cfq_former_request(request_queue_t *q, struct request *rq) -{ - struct cfq_rq *crq = RQ_DATA(rq); - struct rb_node *rbprev = rb_prev(&crq->rb_node); - - if (rbprev) - return rb_entry_crq(rbprev)->request; - - return NULL; -} - -static struct request * -cfq_latter_request(request_queue_t *q, struct request *rq) -{ - struct cfq_rq *crq = RQ_DATA(rq); - struct rb_node *rbnext = rb_next(&crq->rb_node); - - if (rbnext) - return rb_entry_crq(rbnext)->request; - - return NULL; -} - -static int cfq_may_queue(request_queue_t *q, int rw) -{ - struct cfq_data *cfqd = q->elevator.elevator_data; - struct cfq_queue *cfqq; - int ret = 1; - - if (!cfqd->busy_queues) - goto out; - - cfqq = cfq_find_cfq_hash(cfqd, current->tgid); - if (cfqq) { - int limit = (q->nr_requests - cfq_queued) / cfqd->busy_queues; - - if (limit < 3) - limit = 3; - else if (limit > cfqd->max_queued) - limit = cfqd->max_queued; - - if (cfqq->queued[rw] > limit) - ret = 0; - } -out: - return ret; -} - -static void cfq_put_request(request_queue_t *q, struct request *rq) -{ - struct cfq_data *cfqd = q->elevator.elevator_data; - struct cfq_rq *crq = RQ_DATA(rq); - - if (crq) { - BUG_ON(q->last_merge == rq); - BUG_ON(ON_MHASH(crq)); - - mempool_free(crq, cfqd->crq_pool); - rq->elevator_private = NULL; - } -} - -static int 
cfq_set_request(request_queue_t *q, struct request *rq, int gfp_mask) -{ - struct cfq_data *cfqd = q->elevator.elevator_data; - struct cfq_rq *crq = mempool_alloc(cfqd->crq_pool, gfp_mask); - - if (crq) { - RB_CLEAR(&crq->rb_node); - crq->request = rq; - crq->cfq_queue = NULL; - INIT_LIST_HEAD(&crq->hash); - rq->elevator_private = crq; - return 0; - } - - return 1; -} - -static void cfq_exit(request_queue_t *q, elevator_t *e) -{ - struct cfq_data *cfqd = e->elevator_data; - - e->elevator_data = NULL; - mempool_destroy(cfqd->crq_pool); - kfree(cfqd->crq_hash); - kfree(cfqd->cfq_hash); - kfree(cfqd); -} - -static int cfq_init(request_queue_t *q, elevator_t *e) -{ - struct cfq_data *cfqd; - int i; - - cfqd = kmalloc(sizeof(*cfqd), GFP_KERNEL); - if (!cfqd) - return -ENOMEM; - - memset(cfqd, 0, sizeof(*cfqd)); - INIT_LIST_HEAD(&cfqd->rr_list); - - cfqd->crq_hash = kmalloc(sizeof(struct list_head) * CFQ_MHASH_ENTRIES, GFP_KERNEL); - if (!cfqd->crq_hash) - goto out_crqhash; - - cfqd->cfq_hash = kmalloc(sizeof(struct list_head) * CFQ_QHASH_ENTRIES, GFP_KERNEL); - if (!cfqd->cfq_hash) - goto out_cfqhash; - - cfqd->crq_pool = mempool_create(BLKDEV_MIN_RQ, mempool_alloc_slab, mempool_free_slab, crq_pool); - if (!cfqd->crq_pool) - goto out_crqpool; - - for (i = 0; i < CFQ_MHASH_ENTRIES; i++) - INIT_LIST_HEAD(&cfqd->crq_hash[i]); - for (i = 0; i < CFQ_QHASH_ENTRIES; i++) - INIT_LIST_HEAD(&cfqd->cfq_hash[i]); - - cfqd->dispatch = &q->queue_head; - e->elevator_data = cfqd; - - /* - * just set it to some high value, we want anyone to be able to queue - * some requests. fairness is handled differently - */ - cfqd->max_queued = q->nr_requests; - q->nr_requests = 8192; - - return 0; -out_crqpool: - kfree(cfqd->cfq_hash); -out_cfqhash: - kfree(cfqd->crq_hash); -out_crqhash: - kfree(cfqd); - return -ENOMEM; -} - -static int __init cfq_slab_setup(void) -{ - crq_pool = kmem_cache_create("crq_pool", sizeof(struct cfq_rq), 0, 0, - NULL, NULL); - - if (!crq_pool) - panic("cfq_iosched: can't init crq pool\n"); - - cfq_pool = kmem_cache_create("cfq_pool", sizeof(struct cfq_queue), 0, 0, - NULL, NULL); - - if (!cfq_pool) - panic("cfq_iosched: can't init cfq pool\n"); - - cfq_mpool = mempool_create(64, mempool_alloc_slab, mempool_free_slab, cfq_pool); - - if (!cfq_mpool) - panic("cfq_iosched: can't init cfq mpool\n"); - - return 0; -} - -subsys_initcall(cfq_slab_setup); - -elevator_t iosched_cfq = { - .elevator_name = "cfq", - .elevator_merge_fn = cfq_merge, - .elevator_merged_fn = cfq_merged_request, - .elevator_merge_req_fn = cfq_merged_requests, - .elevator_next_req_fn = cfq_next_request, - .elevator_add_req_fn = cfq_insert_request, - .elevator_remove_req_fn = cfq_remove_request, - .elevator_queue_empty_fn = cfq_queue_empty, - .elevator_former_req_fn = cfq_former_request, - .elevator_latter_req_fn = cfq_latter_request, - .elevator_set_req_fn = cfq_set_request, - .elevator_put_req_fn = cfq_put_request, - .elevator_may_queue_fn = cfq_may_queue, - .elevator_init_fn = cfq_init, - .elevator_exit_fn = cfq_exit, -}; - -EXPORT_SYMBOL(iosched_cfq); diff --git a/drivers/block/cfq-iosched.c b/drivers/block/cfq-iosched.c index 7b45a805d..70d66c5c9 100644 --- a/drivers/block/cfq-iosched.c +++ b/drivers/block/cfq-iosched.c @@ -39,8 +39,6 @@ #error Cannot support this many io priority levels #endif -#define LIMIT_DEBUG 1 - /* * tunables */ @@ -52,6 +50,10 @@ static int cfq_queued = 4; static int cfq_grace_rt = HZ / 100 ?: 1; static int cfq_grace_idle = HZ / 10; +#define CFQ_EPOCH 1000000000 +#define CFQ_SECTORATE 1000 +#define 
CFQ_HMAX_PCT 80 + #define CFQ_QHASH_SHIFT 6 #define CFQ_QHASH_ENTRIES (1 << CFQ_QHASH_SHIFT) #define list_entry_qhash(entry) hlist_entry((entry), struct cfq_queue, cfq_hash) @@ -69,13 +71,6 @@ static int cfq_grace_idle = HZ / 10; #define cfq_account_io(crq) \ ((crq)->ioprio != IOPRIO_IDLE && (crq)->ioprio != IOPRIO_RT) -/* define to be 50 ms for now; make tunable later */ -#define CFQ_EPOCH 50000 -/* Needs to be made tunable right away, in MiB/s */ -#define CFQ_DISKBW 10 -/* Temporary global limit, as percent of available b/w, for each "class" */ -#define CFQ_TEMPLIM 10 - /* * defines how we distribute bandwidth (can be tgid, uid, etc) */ @@ -87,18 +82,22 @@ static int cfq_grace_idle = HZ / 10; */ #if defined(CONFIG_CKRM_RES_BLKIO) || defined(CONFIG_CKRM_RES_BLKIO_MODULE) -extern inline void *cki_hash_key(struct task_struct *tsk); -extern inline int cki_ioprio(struct task_struct *tsk); -#define cfq_hash_key(current) ((int)cki_hash_key((current))) -#define cfq_ioprio(current) (cki_ioprio((current))) +extern void *cki_hash_key(struct task_struct *tsk); +extern int cki_ioprio(struct task_struct *tsk); +extern void *cki_cfqpriv(struct task_struct *tsk); + +#define cfq_hash_key(tsk) ((int)cki_hash_key((tsk))) +#define cfq_ioprio(tsk) (cki_ioprio((tsk))) +#define cfq_cfqpriv(cfqd,tsk) (cki_cfqpriv((tsk))) #else -#define cfq_hash_key(current) ((current)->tgid) +#define cfq_hash_key(tsk) ((tsk)->tgid) +#define cfq_cfqpriv(cfqd,tsk) (&(((cfqd)->cid[(tsk)->ioprio]).cfqpriv)) /* * move to io_context */ -#define cfq_ioprio(current) ((current)->ioprio) +#define cfq_ioprio(tsk) ((tsk)->ioprio) #endif #define CFQ_WAIT_RT 0 @@ -125,16 +124,12 @@ struct io_prio_data { atomic_t cum_sectors_in,cum_sectors_out; atomic_t cum_queues_in,cum_queues_out; -#ifdef LIMIT_DEBUG - int nskip; - unsigned long navsec; - unsigned long csectorate; - unsigned long lsectorate; -#endif + cfqlim_t cfqpriv; /* data for enforcing limits */ struct list_head prio_list; int last_rq; int last_sectors; + }; /* @@ -179,8 +174,9 @@ struct cfq_data { unsigned int cfq_grace_rt; unsigned int cfq_grace_idle; - unsigned long cfq_epoch; /* duration for limit enforcement */ - unsigned long cfq_epochsectors; /* max sectors dispatchable/epoch */ + unsigned int cfq_epoch; + unsigned int cfq_hmax_pct; + unsigned int cfq_qsectorate; }; /* @@ -194,14 +190,34 @@ struct cfq_queue { int queued[2]; int ioprio; + /* limit related settings/stats obtained + either from io_prio_data or ckrm I/O class + */ + struct cfqlim *cfqpriv; + + u64 epstart; /* current epoch's starting timestamp (ns) */ + u64 epsector[2]; /* Total sectors dispatched in [0] previous + * and [1] current epoch + */ + unsigned long avsec; /* avg sectors dispatched/epoch */ - unsigned long long lastime; /* timestamp of last request served */ - unsigned long sectorate; /* limit for sectors served/epoch */ +// unsigned long long lastime; /* timestamp of last request served */ +// unsigned long sectorate; /* limit for sectors served/epoch */ int skipped; /* queue skipped at last dispatch ? 
*/ + + /* Per queue timer to suspend/resume queue from processing */ + struct timer_list timer; + unsigned long wait_end; + unsigned long flags; + struct work_struct work; + + struct cfq_data *cfqd; }; + + /* - * per-request structure + * Per-request structure */ struct cfq_rq { struct cfq_queue *cfq_queue; @@ -516,69 +532,101 @@ link: list_add_tail(&crq->request->queuelist, entry); } -/* - * remove from io scheduler core and put on dispatch list for service - */ +struct cfq_queue *dcfqq; +u64 dtmp; + + + +/* Over how many ns is sectorate defined */ +#define NS4SCALE (100000000) + static inline int -__cfq_dispatch_requests(request_queue_t *q, struct cfq_data *cfqd, - struct cfq_queue *cfqq) +__cfq_check_limit(struct cfq_data *cfqd,struct cfq_queue *cfqq, int dontskip) { struct cfq_rq *crq; - unsigned long long ts, gap; - unsigned long newavsec; + unsigned long long ts, gap, epoch, tmp; + unsigned long newavsec, sectorate; crq = rb_entry_crq(rb_first(&cfqq->sort_list)); -#if 1 - /* Determine if queue should be skipped for being overshare */ ts = sched_clock(); - gap = ts - cfqq->lastime; -#ifdef LIMIT_DEBUG - cfqq->sectorate = (cfqd->cfq_epochsectors - * CFQ_TEMPLIM)/100; - -#endif - if ((gap >= cfqd->cfq_epoch) || (gap < 0)) { - cfqq->avsec = crq->nr_sectors ; - cfqq->lastime = ts; + gap = ts - cfqq->epstart; + epoch = cfqd->cfq_epoch; + + sectorate = atomic_read(&cfqq->cfqpriv->sectorate); +// sectorate = atomic_read(&(cfqd->cid[crq->ioprio].sectorate)); + + dcfqq = cfqq; + + if ((gap >= epoch) || (gap < 0)) { + + if (gap >= (epoch << 1)) { + cfqq->epsector[0] = 0; + cfqq->epstart = ts ; + } else { + cfqq->epsector[0] = cfqq->epsector[1]; + cfqq->epstart += epoch; + } + cfqq->epsector[1] = 0; + gap = ts - cfqq->epstart; + + tmp = (cfqq->epsector[0] + crq->nr_sectors) * NS4SCALE; + do_div(tmp,epoch+gap); + + cfqq->avsec = (unsigned long)tmp; + cfqq->skipped = 0; + cfqq->epsector[1] += crq->nr_sectors; + + cfqq->cfqpriv->navsec = cfqq->avsec; + cfqq->cfqpriv->sec[0] = cfqq->epsector[0]; + cfqq->cfqpriv->sec[1] = cfqq->epsector[1]; + cfqq->cfqpriv->timedout++; + /* + cfqd->cid[crq->ioprio].navsec = cfqq->avsec; + cfqd->cid[crq->ioprio].sec[0] = cfqq->epsector[0]; + cfqd->cid[crq->ioprio].sec[1] = cfqq->epsector[1]; + cfqd->cid[crq->ioprio].timedout++; + */ + return 0; } else { - u64 tmp; - /* Age old average and accumalate request to be served */ - -// tmp = (u64) (cfqq->avsec * gap) ; -// do_div(tmp, cfqd->cfq_epoch); - newavsec = (unsigned long)(cfqq->avsec >> 1) + crq->nr_sectors; -// if (crq->ioprio >= 0 && crq->ioprio <= 20) -// cfqd->cid[crq->ioprio].lsectorate = newavsec; -// atomic_set(&(cfqd->cid[crq->ioprio].lsectorate), -// newavsec); - - if ((newavsec < cfqq->sectorate) || cfqq->skipped) { + + tmp = (cfqq->epsector[0] + cfqq->epsector[1] + crq->nr_sectors) + * NS4SCALE; + do_div(tmp,epoch+gap); + + newavsec = (unsigned long)tmp; + if ((newavsec < sectorate) || dontskip) { cfqq->avsec = newavsec ; - cfqq->lastime = ts; cfqq->skipped = 0; + cfqq->epsector[1] += crq->nr_sectors; + cfqq->cfqpriv->navsec = cfqq->avsec; + cfqq->cfqpriv->sec[1] = cfqq->epsector[1]; + /* + cfqd->cid[crq->ioprio].navsec = cfqq->avsec; + cfqd->cid[crq->ioprio].sec[1] = cfqq->epsector[1]; + */ } else { - /* queue over share ; skip once */ cfqq->skipped = 1; -#ifdef LIMIT_DEBUG -// atomic_inc(&(cfqd->cid[crq->ioprio].nskip)); -// if (crq->ioprio >= 0 && crq->ioprio <= 20) -// cfqd->cid[crq->ioprio].nskip++; -#endif - return 0; + /* pause q's processing till avsec drops to + cfq_hmax_pct % of its value */ + 
tmp = (epoch+gap) * (100-cfqd->cfq_hmax_pct); + do_div(tmp,1000000*cfqd->cfq_hmax_pct); + cfqq->wait_end = jiffies+msecs_to_jiffies(tmp); } - } -#endif + } +} -#ifdef LIMIT_DEBUG -// if (crq->ioprio >= 0 && crq->ioprio <= 20) { -// cfqd->cid[crq->ioprio].navsec = cfqq->avsec; -// cfqd->cid[crq->ioprio].csectorate = cfqq->sectorate; -// } +/* + * remove from io scheduler core and put on dispatch list for service + */ +static inline int +__cfq_dispatch_requests(request_queue_t *q, struct cfq_data *cfqd, + struct cfq_queue *cfqq) +{ + struct cfq_rq *crq; + + crq = rb_entry_crq(rb_first(&cfqq->sort_list)); -// atomic_set(&(cfqd->cid[crq->ioprio].navsec),cfqq->avsec); -// atomic_set(&(cfqd->cid[crq->ioprio].csectorate),cfqq->sectorate); -#endif cfq_dispatch_sort(cfqd, cfqq, crq); /* @@ -593,44 +641,83 @@ cfq_dispatch_requests(request_queue_t *q, int prio, int max_rq, int max_sectors) { struct cfq_data *cfqd = q->elevator.elevator_data; struct list_head *plist = &cfqd->cid[prio].rr_list; + struct cfq_queue *cfqq; struct list_head *entry, *nxt; int q_rq, q_io; - int ret ; + int first_round,busy_queues,busy_unlimited; + /* * for each queue at this prio level, dispatch a request */ q_rq = q_io = 0; + first_round=1; + restart: + busy_unlimited = 0; + busy_queues = 0; list_for_each_safe(entry, nxt, plist) { - struct cfq_queue *cfqq = list_entry_cfqq(entry); + cfqq = list_entry_cfqq(entry); BUG_ON(RB_EMPTY(&cfqq->sort_list)); + busy_queues++; - ret = __cfq_dispatch_requests(q, cfqd, cfqq); - if (ret <= 0) { - continue; /* skip queue */ - /* can optimize more by moving q to end of plist ? */ + + if (first_round || busy_unlimited) + __cfq_check_limit(cfqd,cfqq,0); + else + __cfq_check_limit(cfqd,cfqq,1); + + if (cfqq->skipped) { + cfqq->cfqpriv->nskip++; + /* cfqd->cid[prio].nskip++; */ + busy_queues--; + if (time_before(jiffies, cfqq->wait_end)) { + list_del(&cfqq->cfq_list); + mod_timer(&cfqq->timer,cfqq->wait_end); + } + continue; } - q_io += ret ; - q_rq++ ; + busy_unlimited++; + + q_io += __cfq_dispatch_requests(q, cfqd, cfqq); + q_rq++; - if (RB_EMPTY(&cfqq->sort_list)) + if (RB_EMPTY(&cfqq->sort_list)) { + busy_unlimited--; + busy_queues--; cfq_put_queue(cfqd, cfqq); - /* - * if we hit the queue limit, put the string of serviced - * queues at the back of the pending list - */ + } + if (q_io >= max_sectors || q_rq >= max_rq) { +#if 0 struct list_head *prv = nxt->prev; if (prv != plist) { list_del(plist); list_add(plist, prv); } +#endif break; } } + if ((q_io < max_sectors) && (q_rq < max_rq) && + (busy_queues || first_round)) + { + first_round = 0; + goto restart; + } else { + /* + * if we hit the queue limit, put the string of serviced + * queues at the back of the pending list + */ + struct list_head *prv = nxt->prev; + if (prv != plist) { + list_del(plist); + list_add(plist, prv); + } + } + cfqd->cid[prio].last_rq = q_rq; cfqd->cid[prio].last_sectors = q_io; return q_rq; @@ -806,6 +893,29 @@ static void cfq_put_queue(struct cfq_data *cfqd, struct cfq_queue *cfqq) mempool_free(cfqq, cfq_mpool); } +static void cfq_pauseq_timer(unsigned long data) +{ + struct cfq_queue *cfqq = (struct cfq_queue *) data; + kblockd_schedule_work(&cfqq->work); +} + +static void cfq_pauseq_work(void *data) +{ + struct cfq_queue *cfqq = (struct cfq_queue *) data; + struct cfq_data *cfqd = cfqq->cfqd; + request_queue_t *q = cfqd->queue; + unsigned long flags; + + spin_lock_irqsave(q->queue_lock, flags); + list_add_tail(&cfqq->cfq_list,&cfqd->cid[cfqq->ioprio].rr_list); + cfqq->skipped = 0; + if (cfq_next_request(q)) + 
q->request_fn(q); + spin_unlock_irqrestore(q->queue_lock, flags); + + //del_timer(&cfqq->timer); +} + static struct cfq_queue *__cfq_get_queue(struct cfq_data *cfqd, int hashkey, int gfp_mask) { @@ -833,9 +943,22 @@ retry: INIT_LIST_HEAD(&cfqq->cfq_list); cfqq->hash_key = cfq_hash_key(current); cfqq->ioprio = cfq_ioprio(current); - cfqq->avsec = 0 ; - cfqq->lastime = sched_clock(); - cfqq->sectorate = (cfqd->cfq_epochsectors * CFQ_TEMPLIM)/100; + + cfqq->cfqpriv = cfq_cfqpriv(cfqd,current); + if (!cfqq->cfqpriv) + cfqq->cfqpriv = &((cfqd->cid[cfqq->ioprio]).cfqpriv); + + cfqq->epstart = sched_clock(); + /* epsector, avsec, skipped initialized to zero by memset */ + + init_timer(&cfqq->timer); + cfqq->timer.function = cfq_pauseq_timer; + cfqq->timer.data = (unsigned long) cfqq; + + INIT_WORK(&cfqq->work, cfq_pauseq_work, cfqq); + + cfqq->cfqd = cfqd ; + hlist_add_head(&cfqq->cfq_hash, &cfqd->cfq_hash[hashval]); } @@ -1132,6 +1255,8 @@ static void cfq_exit(request_queue_t *q, elevator_t *e) kfree(cfqd); } + + static void cfq_timer(unsigned long data) { struct cfq_data *cfqd = (struct cfq_data *) data; @@ -1182,12 +1307,12 @@ static int cfq_init(request_queue_t *q, elevator_t *e) atomic_set(&cid->cum_sectors_out,0); atomic_set(&cid->cum_queues_in,0); atomic_set(&cid->cum_queues_out,0); -#if 0 - atomic_set(&cid->nskip,0); - atomic_set(&cid->navsec,0); - atomic_set(&cid->csectorate,0); - atomic_set(&cid->lsectorate,0); -#endif + + + atomic_set(&((cid->cfqpriv).sectorate),CFQ_SECTORATE); + (cid->cfqpriv).nskip = 0; + (cid->cfqpriv).navsec = 0; + (cid->cfqpriv).timedout = 0; } cfqd->crq_hash = kmalloc(sizeof(struct hlist_head) * CFQ_MHASH_ENTRIES, @@ -1217,6 +1342,9 @@ static int cfq_init(request_queue_t *q, elevator_t *e) cfqd->cfq_idle_quantum_io = cfq_idle_quantum_io; cfqd->cfq_grace_rt = cfq_grace_rt; cfqd->cfq_grace_idle = cfq_grace_idle; + + cfqd->cfq_epoch = CFQ_EPOCH; + cfqd->cfq_hmax_pct = CFQ_HMAX_PCT; q->nr_requests <<= 2; @@ -1224,14 +1352,6 @@ static int cfq_init(request_queue_t *q, elevator_t *e) e->elevator_data = cfqd; cfqd->queue = q; - cfqd->cfq_epoch = CFQ_EPOCH; - if (q->hardsect_size) - cfqd->cfq_epochsectors = ((CFQ_DISKBW * 1000000)/ - q->hardsect_size)* (1000000 / CFQ_EPOCH); - else - cfqd->cfq_epochsectors = ((CFQ_DISKBW * 1000000)/512) - * (1000000 / CFQ_EPOCH) ; - return 0; out_crqpool: kfree(cfqd->cfq_hash); @@ -1302,6 +1422,8 @@ SHOW_FUNCTION(cfq_idle_quantum_io_show, cfqd->cfq_idle_quantum_io); SHOW_FUNCTION(cfq_queued_show, cfqd->cfq_queued); SHOW_FUNCTION(cfq_grace_rt_show, cfqd->cfq_grace_rt); SHOW_FUNCTION(cfq_grace_idle_show, cfqd->cfq_grace_idle); +SHOW_FUNCTION(cfq_epoch_show, cfqd->cfq_epoch); +SHOW_FUNCTION(cfq_hmax_pct_show, cfqd->cfq_hmax_pct); #undef SHOW_FUNCTION #define STORE_FUNCTION(__FUNC, __PTR, MIN, MAX) \ @@ -1321,63 +1443,38 @@ STORE_FUNCTION(cfq_idle_quantum_io_store, &cfqd->cfq_idle_quantum_io, 4, INT_MAX STORE_FUNCTION(cfq_queued_store, &cfqd->cfq_queued, 1, INT_MAX); STORE_FUNCTION(cfq_grace_rt_store, &cfqd->cfq_grace_rt, 0, INT_MAX); STORE_FUNCTION(cfq_grace_idle_store, &cfqd->cfq_grace_idle, 0, INT_MAX); +STORE_FUNCTION(cfq_epoch_store, &cfqd->cfq_epoch, 0, INT_MAX); +STORE_FUNCTION(cfq_hmax_pct_store, &cfqd->cfq_hmax_pct, 1, 100); #undef STORE_FUNCTION -static ssize_t cfq_epoch_show(struct cfq_data *cfqd, char *page) -{ - return sprintf(page, "%lu\n", cfqd->cfq_epoch); -} - -static ssize_t cfq_epoch_store(struct cfq_data *cfqd, const char *page, size_t count) -{ - char *p = (char *) page; - cfqd->cfq_epoch = simple_strtoul(p, &p, 10); - 
return count; -} - -static ssize_t cfq_epochsectors_show(struct cfq_data *cfqd, char *page) -{ - return sprintf(page, "%lu\n", cfqd->cfq_epochsectors); -} - -static ssize_t -cfq_epochsectors_store(struct cfq_data *cfqd, const char *page, size_t count) -{ - char *p = (char *) page; - cfqd->cfq_epochsectors = simple_strtoul(p, &p, 10); - return count; -} - /* Additional entries to get priority level data */ static ssize_t cfq_prio_show(struct cfq_data *cfqd, char *page, unsigned int priolvl) { - int r1,r2,s1,s2,q1,q2; + //int r1,r2,s1,s2,q1,q2; if (!(priolvl >= IOPRIO_IDLE && priolvl <= IOPRIO_RT)) return 0; + /* r1 = (int)atomic_read(&(cfqd->cid[priolvl].cum_rq_in)); r2 = (int)atomic_read(&(cfqd->cid[priolvl].cum_rq_out)); s1 = (int)atomic_read(&(cfqd->cid[priolvl].cum_sectors_in)); s2 = (int)atomic_read(&(cfqd->cid[priolvl].cum_sectors_out)); q1 = (int)atomic_read(&(cfqd->cid[priolvl].cum_queues_in)); q2 = (int)atomic_read(&(cfqd->cid[priolvl].cum_queues_out)); - - return sprintf(page,"skip %d avsec %lu rate %lu new %lu" - "rq (%d,%d) sec (%d,%d) q (%d,%d)\n", - cfqd->cid[priolvl].nskip, - cfqd->cid[priolvl].navsec, - cfqd->cid[priolvl].csectorate, - cfqd->cid[priolvl].lsectorate, -// atomic_read(&cfqd->cid[priolvl].nskip), -// atomic_read(&cfqd->cid[priolvl].navsec), -// atomic_read(&cfqd->cid[priolvl].csectorate), -// atomic_read(&cfqd->cid[priolvl].lsectorate), - r1,r2, - s1,s2, - q1,q2); + */ + + return sprintf(page,"skip %d timdout %d avsec %lu rate %ld " + " sec0 %lu sec1 %lu\n", + cfqd->cid[priolvl].cfqpriv.nskip, + cfqd->cid[priolvl].cfqpriv.timedout, + cfqd->cid[priolvl].cfqpriv.navsec, + atomic_read(&(cfqd->cid[priolvl].cfqpriv.sectorate)), + (unsigned long)cfqd->cid[priolvl].cfqpriv.sec[0], + (unsigned long)cfqd->cid[priolvl].cfqpriv.sec[1]); + } #define SHOW_PRIO_DATA(__PRIOLVL) \ @@ -1411,12 +1508,25 @@ SHOW_PRIO_DATA(20); static ssize_t cfq_prio_store(struct cfq_data *cfqd, const char *page, size_t count, int priolvl) { + + char *p = (char *) page; + int val; + + val = (int) simple_strtoul(p, &p, 10); + + atomic_set(&(cfqd->cid[priolvl].cfqpriv.sectorate),val); + cfqd->cid[priolvl].cfqpriv.nskip = 0; + cfqd->cid[priolvl].cfqpriv.navsec = 0; + cfqd->cid[priolvl].cfqpriv.timedout = 0; + +#if 0 atomic_set(&(cfqd->cid[priolvl].cum_rq_in),0); atomic_set(&(cfqd->cid[priolvl].cum_rq_out),0); atomic_set(&(cfqd->cid[priolvl].cum_sectors_in),0); atomic_set(&(cfqd->cid[priolvl].cum_sectors_out),0); atomic_set(&(cfqd->cid[priolvl].cum_queues_in),0); atomic_set(&(cfqd->cid[priolvl].cum_queues_out),0); +#endif return count; } @@ -1491,10 +1601,10 @@ static struct cfq_fs_entry cfq_epoch_entry = { .show = cfq_epoch_show, .store = cfq_epoch_store, }; -static struct cfq_fs_entry cfq_epochsectors_entry = { - .attr = {.name = "epochsectors", .mode = S_IRUGO | S_IWUSR }, - .show = cfq_epochsectors_show, - .store = cfq_epochsectors_store, +static struct cfq_fs_entry cfq_hmax_pct_entry = { + .attr = {.name = "hmaxpct", .mode = S_IRUGO | S_IWUSR }, + .show = cfq_hmax_pct_show, + .store = cfq_hmax_pct_store, }; #define P_0_STR "p0" @@ -1558,7 +1668,7 @@ static struct attribute *default_attrs[] = { &cfq_grace_rt_entry.attr, &cfq_grace_idle_entry.attr, &cfq_epoch_entry.attr, - &cfq_epochsectors_entry.attr, + &cfq_hmax_pct_entry.attr, &cfq_prio_0_entry.attr, &cfq_prio_1_entry.attr, &cfq_prio_2_entry.attr, diff --git a/drivers/block/ckrm-io.c b/drivers/block/ckrm-io.c index 7edfce727..89910268f 100644 --- a/drivers/block/ckrm-io.c +++ b/drivers/block/ckrm-io.c @@ -35,14 +35,11 @@ #include #include -/* 
Tie to cfq priorities */ -#define CKI_IOPRIO_NORM IOPRIO_NORM +/* sectorate == 512 byte sectors served in CFQ_EPOCH ns*/ -/* Divisor to get fraction of bandwidth represented by an IOPRIO value */ -/* FIXME: Will not work if IOPRIO_NR > 100 */ -#define CKI_IOPRIO_DIV (IOPRIO_NR-1) -/* Minimum ioprio value to be assigned to a class */ -#define CKI_IOPRIO_MIN 1 +/* CKI_ROOTSECTORATE needs to be made configurable from outside */ +#define CKI_ROOTSECTORATE 100000 +#define CKI_MINSECTORATE 100 #define CKI_IOUSAGE_UNIT 512 @@ -52,7 +49,12 @@ typedef struct ckrm_io_stats{ unsigned long blksz; /* size of bandwidth unit */ atomic_t blkrd; /* read units submitted to DD */ atomic_t blkwr; /* write units submitted to DD */ - + + int nskip; /* # times q skipped */ + unsigned long navsec; /* avg sectors serviced */ + int timedout; /* # times gap > epoch */ + u64 sec[2]; /* sectors serviced in + prev & curr epochs */ } cki_stats_t; /* per class I/O statistics */ /* Note @@ -75,8 +77,12 @@ typedef struct ckrm_io_class { * in local units. */ + cfqlim_t cfqpriv; /* Data common with cfq priolvl's */ + + int cnt_guarantee; /* Allocation as parent */ int cnt_unused; /* Allocation to default subclass */ + int cnt_limit; /* Statistics, for class and default subclass */ cki_stats_t stats; @@ -85,19 +91,16 @@ typedef struct ckrm_io_class { } cki_icls_t; - /* Internal functions */ static inline void cki_reset_stats(cki_stats_t *usg); static inline void init_icls_one(cki_icls_t *icls); -static inline int cki_div(int *a, int b, int c); -//static inline int cki_recalc(cki_icls_t *icls, int rel2abs); static void cki_recalc_propagate(cki_icls_t *res, cki_icls_t *parres); /* External functions e.g. interface to ioscheduler */ void *cki_tsk_icls (struct task_struct *tsk); int cki_tsk_ioprio (struct task_struct *tsk); -extern void cki_cfq_set(icls_tsk_t tskicls, icls_ioprio_t tskioprio); +extern void cki_cfq_set(icls_tsk_t tskicls, icls_ioprio_t tskioprio, icls_tsk_t tskcfqpriv); /* CKRM Resource Controller API functions */ static void * cki_alloc(struct ckrm_core_class *this, @@ -139,45 +142,27 @@ static inline void init_icls_stats(cki_icls_t *icls) static inline void init_icls_one(cki_icls_t *icls) { - // Assign zero as initial guarantee otherwise creations - // could fail due to inadequate share - - //icls->shares.my_guarantee = - // (CKI_IOPRIO_MIN * CKRM_SHARE_DFLT_TOTAL_GUARANTEE) / - // CKI_IOPRIO_DIV ; - icls->shares.my_guarantee = 0; - icls->shares.my_limit = CKRM_SHARE_DFLT_TOTAL_GUARANTEE; - icls->shares.total_guarantee = CKRM_SHARE_DFLT_TOTAL_GUARANTEE; - icls->shares.max_limit = CKRM_SHARE_DFLT_TOTAL_GUARANTEE; + /* Zero initial guarantee for scalable creation of + multiple classes */ - icls->shares.unused_guarantee = icls->shares.total_guarantee - - icls->shares.my_guarantee; - icls->shares.cur_max_limit = CKRM_SHARE_DFLT_TOTAL_GUARANTEE; - - - icls->cnt_guarantee = icls->cnt_unused = IOPRIO_IDLE; + /* Try out a new set */ + + icls->shares.my_guarantee = CKRM_SHARE_DONTCARE; + icls->shares.my_limit = CKRM_SHARE_DONTCARE; + icls->shares.total_guarantee = CKRM_SHARE_DFLT_TOTAL_GUARANTEE; + icls->shares.max_limit = CKRM_SHARE_DFLT_MAX_LIMIT; + icls->shares.unused_guarantee = icls->shares.total_guarantee; + icls->shares.cur_max_limit = 0; - //Same rationale icls->ioprio = CKI_IOPRIO_MIN; - //IOPRIO_IDLE equivalence to zero my_guarantee (set above) relies - //on former being zero. 
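+	/* Absolute counts are unknown until cki_recalc_propagate()
+	   derives them from the parent's shares; DONTCARE marks them
+	   as not yet allocated (a DONTCARE limit later falls back to
+	   CKI_MINSECTORATE). */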
+ icls->cnt_guarantee = CKRM_SHARE_DONTCARE; + icls->cnt_unused = CKRM_SHARE_DONTCARE; + icls->cnt_limit = CKRM_SHARE_DONTCARE; init_icls_stats(icls); } - -static inline int cki_div(int *a, int b, int c) -{ - u64 temp = (u64) b * c ; - do_div(temp,CKI_IOPRIO_DIV); - *a = (int) temp; - - return 0; -} - - -/* Recalculate absolute shares from relative (rel2abs=1) - * or vice versa (rel2abs=0) - * Caller should have a lock on icls +/* Recalculate absolute shares from relative + * Caller should hold a lock on icls */ static void cki_recalc_propagate(cki_icls_t *res, cki_icls_t *parres) @@ -186,17 +171,17 @@ static void cki_recalc_propagate(cki_icls_t *res, cki_icls_t *parres) ckrm_core_class_t *child = NULL; cki_icls_t *childres; int resid = cki_rcbs.resid; + u64 temp; if (parres) { struct ckrm_shares *par = &parres->shares; struct ckrm_shares *self = &res->shares; - if (parres->cnt_guarantee == CKRM_SHARE_DONTCARE) { res->cnt_guarantee = CKRM_SHARE_DONTCARE; } else if (par->total_guarantee) { - u64 temp = (u64) self->my_guarantee * + temp = (u64) self->my_guarantee * parres->cnt_guarantee; do_div(temp, par->total_guarantee); res->cnt_guarantee = (int) temp; @@ -204,16 +189,36 @@ static void cki_recalc_propagate(cki_icls_t *res, cki_icls_t *parres) res->cnt_guarantee = 0; } + + if (parres->cnt_limit == CKRM_SHARE_DONTCARE) { + res->cnt_limit = CKRM_SHARE_DONTCARE; + atomic_set(&res->cfqpriv.sectorate,CKI_MINSECTORATE); + } else { + if (par->max_limit) { + temp = (u64) self->my_limit * + parres->cnt_limit; + do_div(temp, par->max_limit); + res->cnt_limit = (int) temp; + } else { + res->cnt_limit = 0; + } + atomic_set(&res->cfqpriv.sectorate,res->cnt_limit); + } + if (res->cnt_guarantee == CKRM_SHARE_DONTCARE) { res->cnt_unused = CKRM_SHARE_DONTCARE; - } else if (self->total_guarantee) { - u64 temp = (u64) self->unused_guarantee * - res->cnt_guarantee; - do_div(temp, self->total_guarantee); - res->cnt_unused = (int) temp; } else { - res->cnt_unused = 0; + if (self->total_guarantee) { + temp = (u64) self->unused_guarantee * + res->cnt_guarantee; + do_div(temp, self->total_guarantee); + res->cnt_unused = (int) temp; + } else { + res->cnt_unused = 0; + } + } + } // propagate to children ckrm_lock_hier(res->core); @@ -228,50 +233,6 @@ static void cki_recalc_propagate(cki_icls_t *res, cki_icls_t *parres) ckrm_unlock_hier(res->core); } -#if 0 -static inline int cki_recalc(cki_icls_t *icls, int rel2abs) -{ - u64 temp; - - if (icls->parent == NULL) { - /* Root, as parent, always gets all */ - - temp = icls->shares.my_guarantee * (IOPRIO_NR-1); - do_div(temp, icls->shares.total_guarantee); - - icls->total = IOPRIO_NR-1; - icls->ioprio = temp ; - icls->unused = icls->total - icls->ioprio; -// icls->unused = (IOPRIO_NR-1)-icls->ioprio; - - } else { - cki_icls_t *parres; - int partot ; - - parres = ckrm_get_res_class(icls->parent, - cki_rcbs.resid, - cki_icls_t); - if (!parres) { - printk(KERN_ERR "cki_recalc: error getting " - "resclass from core \n"); - return -EINVAL; - } - - - temp = (icls->shares.my_guarantee * - parres->total); - do_div(temp, parres->shares.total_guarantee); - - icls->ioprio = temp; - icls->unused = 0; - - } - - return 0; - -} -#endif - void *cki_tsk_icls(struct task_struct *tsk) { return (void *) ckrm_get_res_class(class_core(tsk->taskclass), @@ -279,12 +240,19 @@ void *cki_tsk_icls(struct task_struct *tsk) } int cki_tsk_ioprio(struct task_struct *tsk) +{ + /* Don't use I/O priorities for now */ + return IOPRIO_NORM; +} + +void *cki_tsk_cfqpriv(struct task_struct *tsk) { cki_icls_t 
*icls = ckrm_get_res_class(class_core(tsk->taskclass), cki_rcbs.resid, cki_icls_t); - return icls->cnt_unused; + return (void *)&(icls->cfqpriv); } + static void *cki_alloc(struct ckrm_core_class *core, struct ckrm_core_class *parent) { @@ -301,43 +269,13 @@ static void *cki_alloc(struct ckrm_core_class *core, icls->parent = parent; icls->shares_lock = SPIN_LOCK_UNLOCKED; - if (parent == NULL) { - - /* Root class gets same as "normal" CFQ priorities to - * retain compatibility of behaviour in the absence of - * other classes - */ - - icls->cnt_guarantee = icls->cnt_unused = IOPRIO_NR-1; - - /* Default gets normal, not minimum */ - //icls->unused = IOPRIO_NORM; - //icls->unused = icls->guarantee-icls->myguarantee; - //icls->limit = icls->mylimit = IOPRIO_NR; - - /* Compute shares in abstract units */ - icls->shares.total_guarantee = CKRM_SHARE_DFLT_TOTAL_GUARANTEE; - - // my_guarantee for root is meaningless. Set to default - icls->shares.my_guarantee = CKRM_SHARE_DFLT_TOTAL_GUARANTEE; + init_icls_one(icls); - icls->shares.unused_guarantee = - CKRM_SHARE_DFLT_TOTAL_GUARANTEE; - - //temp = (u64) icls->cnt_unused * icls->shares.total_guarantee; - //do_div(temp, CKI_IOPRIO_DIV); - // temp now has root's default's share - //icls->shares.unused_guarantee = - // icls->shares.total_guarantee - temp; - - icls->shares.my_limit = CKRM_SHARE_DFLT_TOTAL_GUARANTEE; - icls->shares.max_limit = CKRM_SHARE_DFLT_TOTAL_GUARANTEE; - icls->shares.cur_max_limit = CKRM_SHARE_DFLT_TOTAL_GUARANTEE; - - } else { - init_icls_one(icls); - /* No propagation to parent needed if icls' - initial share is zero */ + if (parent == NULL) { + icls->cnt_guarantee = CKI_ROOTSECTORATE; + icls->cnt_unused = CKI_ROOTSECTORATE; + icls->cnt_limit = CKI_ROOTSECTORATE; + atomic_set(&(icls->cfqpriv.sectorate),icls->cnt_limit); } try_module_get(THIS_MODULE); return icls; @@ -345,7 +283,10 @@ static void *cki_alloc(struct ckrm_core_class *core, static void cki_free(void *res) { - cki_icls_t *icls = res, *parres; + cki_icls_t *icls = res, *parres, *childres; + ckrm_core_class_t *child = NULL; + int maxlimit, resid = cki_rcbs.resid; + if (!res) return; @@ -361,9 +302,7 @@ static void cki_free(void *res) * */ - parres = ckrm_get_res_class(icls->parent, - cki_rcbs.resid, - cki_icls_t); + parres = ckrm_get_res_class(icls->parent, resid, cki_icls_t); if (!parres) { printk(KERN_ERR "cki_free: error getting " "resclass from core \n"); @@ -372,8 +311,23 @@ static void cki_free(void *res) /* Update parent's shares */ spin_lock(&parres->shares_lock); + child_guarantee_changed(&parres->shares, icls->shares.my_guarantee, 0); parres->cnt_unused += icls->cnt_guarantee; + + // run thru parent's children and get the new max_limit of the parent + ckrm_lock_hier(parres->core); + maxlimit = 0; + while ((child = ckrm_get_next_child(parres->core, child)) != NULL) { + childres = ckrm_get_res_class(child, resid, cki_icls_t); + if (maxlimit < childres->shares.my_limit) { + maxlimit = childres->shares.my_limit; + } + } + ckrm_unlock_hier(parres->core); + if (parres->shares.cur_max_limit < maxlimit) { + parres->shares.cur_max_limit = maxlimit; + } spin_unlock(&parres->shares_lock); kfree(res); @@ -388,26 +342,15 @@ static int cki_setshare(void *res, struct ckrm_shares *new) struct ckrm_shares *cur, *par; int rc = -EINVAL, resid = cki_rcbs.resid; - if (!icls) { - printk(KERN_ERR "No class\n"); + if (!icls) return rc; - } cur = &icls->shares; - - /* limits not supported */ - if ((new->max_limit != CKRM_SHARE_UNCHANGED) - || (new->my_limit != CKRM_SHARE_UNCHANGED)) { 
- printk(KERN_ERR "limits not supported\n"); - return -EINVAL; - } - if (icls->parent) { parres = ckrm_get_res_class(icls->parent, resid, cki_icls_t); if (!parres) { - printk(KERN_ERR "cki_setshare: error getting " - "resclass from core \n"); + pr_debug("cki_setshare: invalid resclass\n"); return -EINVAL; } spin_lock(&parres->shares_lock); @@ -420,10 +363,8 @@ static int cki_setshare(void *res, struct ckrm_shares *new) } rc = set_shares(new, cur, par); - printk(KERN_ERR "rc from set_shares %d\n", rc); if ((!rc) && parres) { - if (parres->cnt_guarantee == CKRM_SHARE_DONTCARE) { parres->cnt_unused = CKRM_SHARE_DONTCARE; } else if (par->total_guarantee) { @@ -435,17 +376,6 @@ static int cki_setshare(void *res, struct ckrm_shares *new) parres->cnt_unused = 0; } cki_recalc_propagate(res, parres); - -#if 0 - int old = icls->ioprio; - - rc = cki_recalc(icls,0); - - if (!rc && parres) { - int raise_tot = icls->ioprio - old ; - parres->unused -= raise_tot ; - } -#endif } spin_unlock(&icls->shares_lock); if (icls->parent) { @@ -471,15 +401,15 @@ static int cki_getstats(void *res, struct seq_file *sfile) if (!icls) return -EINVAL; -/* - seq_printf(sfile, "%d my_read\n",atomic_read(&icls->mystats.blkrd)); - seq_printf(sfile, "%d my_write\n",atomic_read(&icls->mystats.blkwr)); - seq_printf(sfile, "%d total_read\n",atomic_read(&icls->stats.blkrd)); - seq_printf(sfile, "%d total_write\n",atomic_read(&icls->stats.blkwr)); -*/ - - seq_printf(sfile, "%d total ioprio\n",icls->cnt_guarantee); - seq_printf(sfile, "%d unused/default ioprio\n",icls->cnt_unused); + seq_printf(sfile, "abs limit %d\n",icls->cnt_limit); + seq_printf(sfile, "skip %d timdout %d avsec %lu rate %ld " + " sec0 %ld sec1 %ld\n", + icls->cfqpriv.nskip, + icls->cfqpriv.timedout, + icls->cfqpriv.navsec, + atomic_read(&(icls->cfqpriv.sectorate)), + (unsigned long)icls->cfqpriv.sec[0], + (unsigned long)icls->cfqpriv.sec[1]); return 0; } @@ -554,7 +484,7 @@ int __init cki_init(void) resid = ckrm_register_res_ctlr(clstype, &cki_rcbs); if (resid != -1) { cki_rcbs.classtype = clstype; - cki_cfq_set(cki_tsk_icls,cki_tsk_ioprio); + cki_cfq_set(cki_tsk_icls,cki_tsk_ioprio,cki_tsk_cfqpriv); } } @@ -566,7 +496,7 @@ void __exit cki_exit(void) ckrm_unregister_res_ctlr(&cki_rcbs); cki_rcbs.resid = -1; cki_rcbs.classtype = NULL; - cki_cfq_set(NULL,NULL); + cki_cfq_set(NULL,NULL,NULL); } module_init(cki_init) diff --git a/drivers/block/ckrm-iostub.c b/drivers/block/ckrm-iostub.c index c325d8e8d..f4012545b 100644 --- a/drivers/block/ckrm-iostub.c +++ b/drivers/block/ckrm-iostub.c @@ -25,13 +25,14 @@ static spinlock_t stub_lock = SPIN_LOCK_UNLOCKED; static icls_tsk_t tskiclstub; static icls_ioprio_t tskiopriostub; +static icls_tsk_t tskcfqprivstub; - -void cki_cfq_set(icls_tsk_t tskicls, icls_ioprio_t tskioprio) +void cki_cfq_set(icls_tsk_t tskicls, icls_ioprio_t tskioprio, icls_tsk_t tskcfqpriv) { spin_lock(&stub_lock); tskiclstub = tskicls; tskiopriostub = tskioprio; + tskcfqprivstub = tskcfqpriv; spin_unlock(&stub_lock); } @@ -59,6 +60,19 @@ int cki_ioprio(struct task_struct *tsk) return ret; } +void *cki_cfqpriv(struct task_struct *tsk) +{ + void *ret; + spin_lock(&stub_lock); + if (tskiclstub) + ret = (*tskcfqprivstub)(tsk); + else + ret = NULL; + spin_unlock(&stub_lock); + return ret; +} + EXPORT_SYMBOL(cki_cfq_set); EXPORT_SYMBOL(cki_hash_key); EXPORT_SYMBOL(cki_ioprio); +EXPORT_SYMBOL(cki_cfqpriv); diff --git a/drivers/char/.cvsignore b/drivers/char/.cvsignore new file mode 100644 index 000000000..83683a2d8 --- /dev/null +++ b/drivers/char/.cvsignore 
@@ -0,0 +1,2 @@ +consolemap_deftbl.c +defkeymap.c diff --git a/drivers/pci/.cvsignore b/drivers/pci/.cvsignore new file mode 100644 index 000000000..d5b21d9ee --- /dev/null +++ b/drivers/pci/.cvsignore @@ -0,0 +1,3 @@ +classlist.h +devlist.h +gen-devlist diff --git a/drivers/scsi/aic7xxx/.cvsignore b/drivers/scsi/aic7xxx/.cvsignore new file mode 100644 index 000000000..a1a7fcd04 --- /dev/null +++ b/drivers/scsi/aic7xxx/.cvsignore @@ -0,0 +1,4 @@ +aic79xx_reg.h +aic79xx_seq.h +aic7xxx_reg.h +aic7xxx_seq.h diff --git a/fs/aio.c b/fs/aio.c index 9e7b5928e..2335a0756 100644 --- a/fs/aio.c +++ b/fs/aio.c @@ -543,7 +543,7 @@ struct kioctx *lookup_ioctx(unsigned long ctx_id) return ioctx; } -static void use_mm(struct mm_struct *mm) +void use_mm(struct mm_struct *mm) { struct mm_struct *active_mm; diff --git a/include/.cvsignore b/include/.cvsignore new file mode 100644 index 000000000..04204c7c9 --- /dev/null +++ b/include/.cvsignore @@ -0,0 +1 @@ +config diff --git a/include/asm-i386/.cvsignore b/include/asm-i386/.cvsignore new file mode 100644 index 000000000..4ec57ad5b --- /dev/null +++ b/include/asm-i386/.cvsignore @@ -0,0 +1 @@ +asm_offsets.h diff --git a/include/asm-i386/apicdef.h b/include/asm-i386/apicdef.h index c689554ad..9513dd889 100644 --- a/include/asm-i386/apicdef.h +++ b/include/asm-i386/apicdef.h @@ -86,6 +86,7 @@ #define APIC_LVT_REMOTE_IRR (1<<14) #define APIC_INPUT_POLARITY (1<<13) #define APIC_SEND_PENDING (1<<12) +#define APIC_MODE_MASK 0x700 #define GET_APIC_DELIVERY_MODE(x) (((x)>>8)&0x7) #define SET_APIC_DELIVERY_MODE(x,y) (((x)&~0x700)|((y)<<8)) #define APIC_MODE_FIXED 0x0 diff --git a/include/asm-i386/irq.h b/include/asm-i386/irq.h index d1a4dd68f..43917d930 100644 --- a/include/asm-i386/irq.h +++ b/include/asm-i386/irq.h @@ -39,6 +39,7 @@ union irq_ctx { u32 stack[THREAD_SIZE/sizeof(u32)]; }; +#ifdef CONFIG_IRQSTACKS extern union irq_ctx *hardirq_ctx[NR_CPUS]; extern union irq_ctx *softirq_ctx[NR_CPUS]; @@ -46,6 +47,10 @@ extern void irq_ctx_init(int cpu); #define __ARCH_HAS_DO_SOFTIRQ +#else +#define irq_ctx_init(cpu) do { ; } while (0) +#endif + struct irqaction; struct pt_regs; asmlinkage int handle_IRQ_event(unsigned int, struct pt_regs *, diff --git a/include/asm-i386/kexec.h b/include/asm-i386/kexec.h new file mode 100644 index 000000000..eb8fd9868 --- /dev/null +++ b/include/asm-i386/kexec.h @@ -0,0 +1,25 @@ +#ifndef _I386_KEXEC_H +#define _I386_KEXEC_H + +#include + +/* + * KEXEC_SOURCE_MEMORY_LIMIT maximum page get_free_page can return. + * I.e. Maximum page that is mapped directly into kernel memory, + * and kmap is not required. + * + * Someone correct me if FIXADDR_START - PAGEOFFSET is not the correct + * calculation for the amount of memory directly mappable into the + * kernel memory space. 
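+ *
+ * On i386 both the source and destination limits below are -1UL,
+ * i.e. unrestricted; only the control code buffer is constrained
+ * (to TASK_SIZE) because it must be identity mapped during the
+ * transition.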
+ */ + +/* Maximum physical address we can use pages from */ +#define KEXEC_SOURCE_MEMORY_LIMIT (-1UL) +/* Maximum address we can reach in physical address mode */ +#define KEXEC_DESTINATION_MEMORY_LIMIT (-1UL) +/* Maximum address we can use for the control code buffer */ +#define KEXEC_CONTROL_MEMORY_LIMIT TASK_SIZE + +#define KEXEC_CONTROL_CODE_SIZE 4096 + +#endif /* _I386_KEXEC_H */ diff --git a/include/asm-i386/module.h b/include/asm-i386/module.h index 614d05f27..263c6f752 100644 --- a/include/asm-i386/module.h +++ b/include/asm-i386/module.h @@ -60,7 +60,19 @@ struct mod_arch_specific #define MODULE_REGPARM "" #endif +#if (CONFIG_STACK_SIZE_SHIFT < 12) +#define MODULE_STACKSIZE "TINYSTACKS " +#elif (CONFIG_STACK_SIZE_SHIFT == 12) #define MODULE_STACKSIZE "4KSTACKS " +#elif (CONFIG_STACK_SIZE_SHIFT == 13) +#define MODULE_STACKSIZE "8KSTACKS " +#elif (CONFIG_STACK_SIZE_SHIFT == 14) +#define MODULE_STACKSIZE "16KSTACKS " +#elif (CONFIG_STACK_SIZE_SHIFT > 14) +#define MODULE_STACKSIZE "HUGESTACKS " +#else +#define MODULE_STACKSIZE "" +#endif #define MODULE_ARCH_VERMAGIC MODULE_PROC_FAMILY MODULE_REGPARM MODULE_STACKSIZE diff --git a/include/asm-i386/processor.h b/include/asm-i386/processor.h index cd8708b42..3651a3bb0 100644 --- a/include/asm-i386/processor.h +++ b/include/asm-i386/processor.h @@ -400,10 +400,10 @@ struct tss_struct { #define ARCH_MIN_TASKALIGN 16 - -#define STACK_PAGE_COUNT (4096/PAGE_SIZE) - - +#if ((1< +#include +#include + +/* + * This structure is used to hold the arguments that are used when loading + * kernel binaries. + */ + +typedef unsigned long kimage_entry_t; +#define IND_DESTINATION 0x1 +#define IND_INDIRECTION 0x2 +#define IND_DONE 0x4 +#define IND_SOURCE 0x8 + +#define KEXEC_SEGMENT_MAX 8 +struct kexec_segment { + void *buf; + size_t bufsz; + void *mem; + size_t memsz; +}; + +struct kimage { + kimage_entry_t head; + kimage_entry_t *entry; + kimage_entry_t *last_entry; + + unsigned long destination; + + unsigned long start; + struct page *control_code_page; + + unsigned long nr_segments; + struct kexec_segment segment[KEXEC_SEGMENT_MAX]; + + struct list_head control_pages; + struct list_head dest_pages; + struct list_head unuseable_pages; +}; + + +/* kexec interface functions */ +extern void machine_kexec(struct kimage *image); +extern int machine_kexec_prepare(struct kimage *image); +extern void machine_kexec_cleanup(struct kimage *image); +extern asmlinkage long sys_kexec(unsigned long entry, long nr_segments, + struct kexec_segment *segments); +extern struct page *kimage_alloc_control_pages(struct kimage *image, unsigned int order); +extern struct kimage *kexec_image; +#endif +#endif /* LINUX_KEXEC_H */ diff --git a/include/linux/netfilter_ipv4/ip_conntrack_pptp.h b/include/linux/netfilter_ipv4/ip_conntrack_pptp.h deleted file mode 100644 index 0fbec884a..000000000 --- a/include/linux/netfilter_ipv4/ip_conntrack_pptp.h +++ /dev/null @@ -1,310 +0,0 @@ -/* PPTP constants and structs */ -#ifndef _CONNTRACK_PPTP_H -#define _CONNTRACK_PPTP_H - -/* state of the control session */ -enum pptp_ctrlsess_state { - PPTP_SESSION_NONE, /* no session present */ - PPTP_SESSION_ERROR, /* some session error */ - PPTP_SESSION_STOPREQ, /* stop_sess request seen */ - PPTP_SESSION_REQUESTED, /* start_sess request seen */ - PPTP_SESSION_CONFIRMED, /* session established */ -}; - -/* state of the call inside the control session */ -enum pptp_ctrlcall_state { - PPTP_CALL_NONE, - PPTP_CALL_ERROR, - PPTP_CALL_OUT_REQ, - PPTP_CALL_OUT_CONF, - PPTP_CALL_IN_REQ, - PPTP_CALL_IN_REP, - 
PPTP_CALL_IN_CONF, - PPTP_CALL_CLEAR_REQ, -}; - - -/* conntrack private data */ -struct ip_ct_pptp_master { - enum pptp_ctrlsess_state sstate; /* session state */ - - /* everything below is going to be per-expectation in newnat, - * since there could be more than one call within one session */ - enum pptp_ctrlcall_state cstate; /* call state */ - u_int16_t pac_call_id; /* call id of PAC, host byte order */ - u_int16_t pns_call_id; /* call id of PNS, host byte order */ -}; - -/* conntrack_expect private member */ -struct ip_ct_pptp_expect { - enum pptp_ctrlcall_state cstate; /* call state */ - u_int16_t pac_call_id; /* call id of PAC */ - u_int16_t pns_call_id; /* call id of PNS */ -}; - - -#ifdef __KERNEL__ - -#include -DECLARE_LOCK_EXTERN(ip_pptp_lock); - -#define IP_CONNTR_PPTP PPTP_CONTROL_PORT - -#define PPTP_CONTROL_PORT 1723 - -#define PPTP_PACKET_CONTROL 1 -#define PPTP_PACKET_MGMT 2 - -#define PPTP_MAGIC_COOKIE 0x1a2b3c4d - -struct pptp_pkt_hdr { - __u16 packetLength; - __u16 packetType; - __u32 magicCookie; -}; - -/* PptpControlMessageType values */ -#define PPTP_START_SESSION_REQUEST 1 -#define PPTP_START_SESSION_REPLY 2 -#define PPTP_STOP_SESSION_REQUEST 3 -#define PPTP_STOP_SESSION_REPLY 4 -#define PPTP_ECHO_REQUEST 5 -#define PPTP_ECHO_REPLY 6 -#define PPTP_OUT_CALL_REQUEST 7 -#define PPTP_OUT_CALL_REPLY 8 -#define PPTP_IN_CALL_REQUEST 9 -#define PPTP_IN_CALL_REPLY 10 -#define PPTP_IN_CALL_CONNECT 11 -#define PPTP_CALL_CLEAR_REQUEST 12 -#define PPTP_CALL_DISCONNECT_NOTIFY 13 -#define PPTP_WAN_ERROR_NOTIFY 14 -#define PPTP_SET_LINK_INFO 15 - -#define PPTP_MSG_MAX 15 - -/* PptpGeneralError values */ -#define PPTP_ERROR_CODE_NONE 0 -#define PPTP_NOT_CONNECTED 1 -#define PPTP_BAD_FORMAT 2 -#define PPTP_BAD_VALUE 3 -#define PPTP_NO_RESOURCE 4 -#define PPTP_BAD_CALLID 5 -#define PPTP_REMOVE_DEVICE_ERROR 6 - -struct PptpControlHeader { - __u16 messageType; - __u16 reserved; -}; - -/* FramingCapability Bitmap Values */ -#define PPTP_FRAME_CAP_ASYNC 0x1 -#define PPTP_FRAME_CAP_SYNC 0x2 - -/* BearerCapability Bitmap Values */ -#define PPTP_BEARER_CAP_ANALOG 0x1 -#define PPTP_BEARER_CAP_DIGITAL 0x2 - -struct PptpStartSessionRequest { - __u16 protocolVersion; - __u8 reserved1; - __u8 reserved2; - __u32 framingCapability; - __u32 bearerCapability; - __u16 maxChannels; - __u16 firmwareRevision; - __u8 hostName[64]; - __u8 vendorString[64]; -}; - -/* PptpStartSessionResultCode Values */ -#define PPTP_START_OK 1 -#define PPTP_START_GENERAL_ERROR 2 -#define PPTP_START_ALREADY_CONNECTED 3 -#define PPTP_START_NOT_AUTHORIZED 4 -#define PPTP_START_UNKNOWN_PROTOCOL 5 - -struct PptpStartSessionReply { - __u16 protocolVersion; - __u8 resultCode; - __u8 generalErrorCode; - __u32 framingCapability; - __u32 bearerCapability; - __u16 maxChannels; - __u16 firmwareRevision; - __u8 hostName[64]; - __u8 vendorString[64]; -}; - -/* PptpStopReasons */ -#define PPTP_STOP_NONE 1 -#define PPTP_STOP_PROTOCOL 2 -#define PPTP_STOP_LOCAL_SHUTDOWN 3 - -struct PptpStopSessionRequest { - __u8 reason; -}; - -/* PptpStopSessionResultCode */ -#define PPTP_STOP_OK 1 -#define PPTP_STOP_GENERAL_ERROR 2 - -struct PptpStopSessionReply { - __u8 resultCode; - __u8 generalErrorCode; -}; - -struct PptpEchoRequest { - __u32 identNumber; -}; - -/* PptpEchoReplyResultCode */ -#define PPTP_ECHO_OK 1 -#define PPTP_ECHO_GENERAL_ERROR 2 - -struct PptpEchoReply { - __u32 identNumber; - __u8 resultCode; - __u8 generalErrorCode; - __u16 reserved; -}; - -/* PptpFramingType */ -#define PPTP_ASYNC_FRAMING 1 -#define PPTP_SYNC_FRAMING 2 -#define 
PPTP_DONT_CARE_FRAMING 3 - -/* PptpCallBearerType */ -#define PPTP_ANALOG_TYPE 1 -#define PPTP_DIGITAL_TYPE 2 -#define PPTP_DONT_CARE_BEARER_TYPE 3 - -struct PptpOutCallRequest { - __u16 callID; - __u16 callSerialNumber; - __u32 minBPS; - __u32 maxBPS; - __u32 bearerType; - __u32 framingType; - __u16 packetWindow; - __u16 packetProcDelay; - __u16 reserved1; - __u16 phoneNumberLength; - __u16 reserved2; - __u8 phoneNumber[64]; - __u8 subAddress[64]; -}; - -/* PptpCallResultCode */ -#define PPTP_OUTCALL_CONNECT 1 -#define PPTP_OUTCALL_GENERAL_ERROR 2 -#define PPTP_OUTCALL_NO_CARRIER 3 -#define PPTP_OUTCALL_BUSY 4 -#define PPTP_OUTCALL_NO_DIAL_TONE 5 -#define PPTP_OUTCALL_TIMEOUT 6 -#define PPTP_OUTCALL_DONT_ACCEPT 7 - -struct PptpOutCallReply { - __u16 callID; - __u16 peersCallID; - __u8 resultCode; - __u8 generalErrorCode; - __u16 causeCode; - __u32 connectSpeed; - __u16 packetWindow; - __u16 packetProcDelay; - __u32 physChannelID; -}; - -struct PptpInCallRequest { - __u16 callID; - __u16 callSerialNumber; - __u32 callBearerType; - __u32 physChannelID; - __u16 dialedNumberLength; - __u16 dialingNumberLength; - __u8 dialedNumber[64]; - __u8 dialingNumber[64]; - __u8 subAddress[64]; -}; - -/* PptpInCallResultCode */ -#define PPTP_INCALL_ACCEPT 1 -#define PPTP_INCALL_GENERAL_ERROR 2 -#define PPTP_INCALL_DONT_ACCEPT 3 - -struct PptpInCallReply { - __u16 callID; - __u16 peersCallID; - __u8 resultCode; - __u8 generalErrorCode; - __u16 packetWindow; - __u16 packetProcDelay; - __u16 reserved; -}; - -struct PptpInCallConnected { - __u16 peersCallID; - __u16 reserved; - __u32 connectSpeed; - __u16 packetWindow; - __u16 packetProcDelay; - __u32 callFramingType; -}; - -struct PptpClearCallRequest { - __u16 callID; - __u16 reserved; -}; - -struct PptpCallDisconnectNotify { - __u16 callID; - __u8 resultCode; - __u8 generalErrorCode; - __u16 causeCode; - __u16 reserved; - __u8 callStatistics[128]; -}; - -struct PptpWanErrorNotify { - __u16 peersCallID; - __u16 reserved; - __u32 crcErrors; - __u32 framingErrors; - __u32 hardwareOverRuns; - __u32 bufferOverRuns; - __u32 timeoutErrors; - __u32 alignmentErrors; -}; - -struct PptpSetLinkInfo { - __u16 peersCallID; - __u16 reserved; - __u32 sendAccm; - __u32 recvAccm; -}; - - -struct pptp_priv_data { - __u16 call_id; - __u16 mcall_id; - __u16 pcall_id; -}; - -union pptp_ctrl_union { - struct PptpStartSessionRequest sreq; - struct PptpStartSessionReply srep; - struct PptpStopSessionRequest streq; - struct PptpStopSessionReply strep; - struct PptpOutCallRequest ocreq; - struct PptpOutCallReply ocack; - struct PptpInCallRequest icreq; - struct PptpInCallReply icack; - struct PptpInCallConnected iccon; - struct PptpClearCallRequest clrreq; - struct PptpCallDisconnectNotify disc; - struct PptpWanErrorNotify wanerr; - struct PptpSetLinkInfo setlink; -}; - -#endif /* __KERNEL__ */ -#endif /* _CONNTRACK_PPTP_H */ diff --git a/include/linux/netfilter_ipv4/ip_conntrack_proto_gre.h b/include/linux/netfilter_ipv4/ip_conntrack_proto_gre.h deleted file mode 100644 index 07646857c..000000000 --- a/include/linux/netfilter_ipv4/ip_conntrack_proto_gre.h +++ /dev/null @@ -1,123 +0,0 @@ -#ifndef _CONNTRACK_PROTO_GRE_H -#define _CONNTRACK_PROTO_GRE_H -#include - -/* GRE PROTOCOL HEADER */ - -/* GRE Version field */ -#define GRE_VERSION_1701 0x0 -#define GRE_VERSION_PPTP 0x1 - -/* GRE Protocol field */ -#define GRE_PROTOCOL_PPTP 0x880B - -/* GRE Flags */ -#define GRE_FLAG_C 0x80 -#define GRE_FLAG_R 0x40 -#define GRE_FLAG_K 0x20 -#define GRE_FLAG_S 0x10 -#define GRE_FLAG_A 0x80 - 
-#define GRE_IS_C(f) ((f)&GRE_FLAG_C) -#define GRE_IS_R(f) ((f)&GRE_FLAG_R) -#define GRE_IS_K(f) ((f)&GRE_FLAG_K) -#define GRE_IS_S(f) ((f)&GRE_FLAG_S) -#define GRE_IS_A(f) ((f)&GRE_FLAG_A) - -/* GRE is a mess: Four different standards */ -struct gre_hdr { -#if defined(__LITTLE_ENDIAN_BITFIELD) - __u16 rec:3, - srr:1, - seq:1, - key:1, - routing:1, - csum:1, - version:3, - reserved:4, - ack:1; -#elif defined(__BIG_ENDIAN_BITFIELD) - __u16 csum:1, - routing:1, - key:1, - seq:1, - srr:1, - rec:3, - ack:1, - reserved:4, - version:3; -#else -#error "Adjust your defines" -#endif - __u16 protocol; -}; - -/* modified GRE header for PPTP */ -struct gre_hdr_pptp { - __u8 flags; /* bitfield */ - __u8 version; /* should be GRE_VERSION_PPTP */ - __u16 protocol; /* should be GRE_PROTOCOL_PPTP */ - __u16 payload_len; /* size of ppp payload, not inc. gre header */ - __u16 call_id; /* peer's call_id for this session */ - __u32 seq; /* sequence number. Present if S==1 */ - __u32 ack; /* seq number of highest packet recieved by */ - /* sender in this session */ -}; - - -/* this is part of ip_conntrack */ -struct ip_ct_gre { - unsigned int stream_timeout; - unsigned int timeout; -}; - -/* this is part of ip_conntrack_expect */ -struct ip_ct_gre_expect { - struct ip_ct_gre_keymap *keymap_orig, *keymap_reply; -}; - -#ifdef __KERNEL__ -struct ip_conntrack_expect; - -/* structure for original <-> reply keymap */ -struct ip_ct_gre_keymap { - struct list_head list; - - struct ip_conntrack_tuple tuple; -}; - - -/* add new tuple->key_reply pair to keymap */ -int ip_ct_gre_keymap_add(struct ip_conntrack_expect *exp, - struct ip_conntrack_tuple *t, - int reply); - -/* change an existing keymap entry */ -void ip_ct_gre_keymap_change(struct ip_ct_gre_keymap *km, - struct ip_conntrack_tuple *t); - -/* delete keymap entries */ -void ip_ct_gre_keymap_destroy(struct ip_conntrack_expect *exp); - - -/* get pointer to gre key, if present */ -static inline u_int32_t *gre_key(struct gre_hdr *greh) -{ - if (!greh->key) - return NULL; - if (greh->csum || greh->routing) - return (u_int32_t *) (greh+sizeof(*greh)+4); - return (u_int32_t *) (greh+sizeof(*greh)); -} - -/* get pointer ot gre csum, if present */ -static inline u_int16_t *gre_csum(struct gre_hdr *greh) -{ - if (!greh->csum) - return NULL; - return (u_int16_t *) (greh+sizeof(*greh)); -} - -#endif /* __KERNEL__ */ - -#endif /* _CONNTRACK_PROTO_GRE_H */ diff --git a/include/linux/netfilter_ipv4/ip_nat_pptp.h b/include/linux/netfilter_ipv4/ip_nat_pptp.h deleted file mode 100644 index eaf66c2e8..000000000 --- a/include/linux/netfilter_ipv4/ip_nat_pptp.h +++ /dev/null @@ -1,11 +0,0 @@ -/* PPTP constants and structs */ -#ifndef _NAT_PPTP_H -#define _NAT_PPTP_H - -/* conntrack private data */ -struct ip_nat_pptp { - u_int16_t pns_call_id; /* NAT'ed PNS call id */ - u_int16_t pac_call_id; /* NAT'ed PAC call id */ -}; - -#endif /* _NAT_PPTP_H */ diff --git a/include/linux/reboot.h b/include/linux/reboot.h index d60fafc8b..5460e94a1 100644 --- a/include/linux/reboot.h +++ b/include/linux/reboot.h @@ -51,6 +51,8 @@ extern void machine_restart(char *cmd); extern void machine_halt(void); extern void machine_power_off(void); +extern void machine_shutdown(void); + #endif #endif /* _LINUX_REBOOT_H */ diff --git a/kernel/.cvsignore b/kernel/.cvsignore new file mode 100644 index 000000000..21426e906 --- /dev/null +++ b/kernel/.cvsignore @@ -0,0 +1,2 @@ +config_data.gz +config_data.h diff --git a/kernel/Makefile b/kernel/Makefile index ec5001052..455ec1eae 100644 --- a/kernel/Makefile +++ 
b/kernel/Makefile @@ -23,6 +23,7 @@ obj-$(CONFIG_MODULE_SIG) += module-verify.o obj-$(CONFIG_KALLSYMS) += kallsyms.o obj-$(CONFIG_PM) += power/ obj-$(CONFIG_BSD_PROCESS_ACCT) += acct.o +obj-$(CONFIG_KEXEC) += kexec.o obj-$(CONFIG_COMPAT) += compat.o obj-$(CONFIG_IKCONFIG) += configs.o obj-$(CONFIG_IKCONFIG_PROC) += configs.o diff --git a/kernel/ckrm/Makefile b/kernel/ckrm/Makefile index b32530977..4956dcb3a 100644 --- a/kernel/ckrm/Makefile +++ b/kernel/ckrm/Makefile @@ -8,6 +8,6 @@ endif obj-$(CONFIG_CKRM_TYPE_TASKCLASS) += ckrm_tc.o obj-$(CONFIG_CKRM_RES_NUMTASKS) += ckrm_numtasks.o obj-$(CONFIG_CKRM_TYPE_SOCKETCLASS) += ckrm_sockc.o - obj-$(CONFIG_CKRM_RES_LISTENAQ) += ckrm_laq.o + obj-$(CONFIG_CKRM_RES_LISTENAQ) += ckrm_listenaq.o obj-$(CONFIG_CKRM_CPU_SCHEDULE) += ckrm_cpu_class.o ckrm_cpu_monitor.o obj-$(CONFIG_CKRM_RES_MEM) += ckrm_mem.o diff --git a/kernel/ckrm/ckrm_laq.c b/kernel/ckrm/ckrm_laq.c deleted file mode 100644 index b64205a06..000000000 --- a/kernel/ckrm/ckrm_laq.c +++ /dev/null @@ -1,495 +0,0 @@ -/* ckrm_socketaq.c - accept queue resource controller - * - * Copyright (C) Vivek Kashyap, IBM Corp. 2004 - * - * Latest version, more details at http://ckrm.sf.net - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - */ - -/* Changes - * Initial version - */ - -/* Code Description: TBD - * - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include - -#define hnode_2_core(ptr) \ - ((ptr) ? container_of(ptr, struct ckrm_core_class, hnode) : NULL) - -#define CKRM_SAQ_MAX_DEPTH 3 // 0 => /rcfs - // 1 => socket_aq - // 2 => socket_aq/listen_class - // 3 => socket_aq/listen_class/accept_queues - // 4 => Not allowed - -typedef struct ckrm_laq_res { - spinlock_t reslock; - atomic_t refcnt; - struct ckrm_shares shares; - struct ckrm_core_class *core; - struct ckrm_core_class *pcore; - int my_depth; - int my_id; - unsigned int min_ratio; -} ckrm_laq_res_t; - -static int my_resid = -1; - -extern struct ckrm_core_class *rcfs_create_under_netroot(char *, int, int); -extern struct ckrm_core_class *rcfs_make_core(struct dentry *, - struct ckrm_core_class *); - -void laq_res_hold(struct ckrm_laq_res *res) -{ - atomic_inc(&res->refcnt); - return; -} - -void laq_res_put(struct ckrm_laq_res *res) -{ - if (atomic_dec_and_test(&res->refcnt)) - kfree(res); - return; -} - -/* Initialize rescls values - */ -static void laq_res_initcls(void *my_res) -{ - ckrm_laq_res_t *res = my_res; - - res->shares.my_guarantee = CKRM_SHARE_DONTCARE; - res->shares.my_limit = CKRM_SHARE_DONTCARE; - res->shares.total_guarantee = CKRM_SHARE_DFLT_TOTAL_GUARANTEE; - res->shares.max_limit = CKRM_SHARE_DFLT_MAX_LIMIT; - res->shares.unused_guarantee = CKRM_SHARE_DFLT_TOTAL_GUARANTEE; - res->shares.cur_max_limit = 0; -} - -static int atoi(char *s) -{ - int k = 0; - while (*s) - k = *s++ - '0' + (k * 10); - return k; -} - -static char *laq_get_name(struct ckrm_core_class *c) -{ - char *p = (char *)c->name; - - while (*p) - p++; - while (*p != '/' && p != c->name) - p--; - - return ++p; -} - -static void *laq_res_alloc(struct ckrm_core_class *core, - struct ckrm_core_class *parent) -{ - ckrm_laq_res_t *res, *pres; - int pdepth; - - if (parent) - pres = ckrm_get_res_class(parent, my_resid, ckrm_laq_res_t); - else - pres = NULL; - - if (core == 
core->classtype->default_class) - pdepth = 1; - else { - if (!parent) - return NULL; - pdepth = 1 + pres->my_depth; - } - - res = kmalloc(sizeof(ckrm_laq_res_t), GFP_ATOMIC); - if (res) { - memset(res, 0, sizeof(res)); - spin_lock_init(&res->reslock); - laq_res_hold(res); - res->my_depth = pdepth; - if (pdepth == 2) // listen class - res->my_id = 0; - else if (pdepth == 3) - res->my_id = atoi(laq_get_name(core)); - res->core = core; - res->pcore = parent; - - // rescls in place, now initialize contents other than - // hierarchy pointers - laq_res_initcls(res); // acts as initialising value - } - - return res; -} - -static void laq_res_free(void *my_res) -{ - ckrm_laq_res_t *res = (ckrm_laq_res_t *) my_res; - ckrm_laq_res_t *parent; - - if (!res) - return; - - if (res->my_depth != 3) { - kfree(res); - return; - } - - parent = ckrm_get_res_class(res->pcore, my_resid, ckrm_laq_res_t); - if (!parent) // Should never happen - return; - - spin_lock(&parent->reslock); - spin_lock(&res->reslock); - - // return child's guarantee to parent node - // Limits have no meaning for accept queue control - child_guarantee_changed(&parent->shares, res->shares.my_guarantee, 0); - - spin_unlock(&res->reslock); - laq_res_put(res); - spin_unlock(&parent->reslock); - return; -} - -/************************************************************************** - * SHARES *** - **************************************************************************/ - -void laq_set_aq_value(struct ckrm_net_struct *ns, unsigned int *aq_ratio) -{ - int i; - struct tcp_opt *tp; - - tp = tcp_sk(ns->ns_sk); - for (i = 0; i < NUM_ACCEPT_QUEUES; i++) - tp->acceptq[i].aq_ratio = aq_ratio[i]; - return; -} -void laq_set_aq_values(ckrm_laq_res_t * parent, unsigned int *aq_ratio) -{ - - struct ckrm_net_struct *ns; - struct ckrm_core_class *core = parent->core; - - class_lock(core); - list_for_each_entry(ns, &core->objlist, ckrm_link) { - laq_set_aq_value(ns, aq_ratio); - } - class_unlock(core); - return; -} - -static void calculate_aq_ratios(ckrm_laq_res_t * res, unsigned int *aq_ratio) -{ - struct ckrm_hnode *chnode; - ckrm_laq_res_t *child; - unsigned int min; - int i; - - min = aq_ratio[0] = (unsigned int)res->shares.unused_guarantee; - - list_for_each_entry(chnode, &res->core->hnode.children, siblings) { - child = hnode_2_core(chnode)->res_class[my_resid]; - - aq_ratio[child->my_id] = - (unsigned int)child->shares.my_guarantee; - if (aq_ratio[child->my_id] == CKRM_SHARE_DONTCARE) - aq_ratio[child->my_id] = 0; - if (aq_ratio[child->my_id] && - ((unsigned int)aq_ratio[child->my_id] < min)) - min = (unsigned int)child->shares.my_guarantee; - } - - if (min == 0) { - min = 1; - // default takes all if nothing specified - aq_ratio[0] = 1; - } - res->min_ratio = min; - - for (i = 0; i < NUM_ACCEPT_QUEUES; i++) - aq_ratio[i] = aq_ratio[i] / min; -} - -static int laq_set_share_values(void *my_res, struct ckrm_shares *shares) -{ - ckrm_laq_res_t *res = my_res; - ckrm_laq_res_t *parent; - unsigned int aq_ratio[NUM_ACCEPT_QUEUES]; - int rc = 0; - - if (!res) - return -EINVAL; - - if (!res->pcore) { - // something is badly wrong - printk(KERN_ERR "socketaq internal inconsistency\n"); - return -EBADF; - } - - parent = ckrm_get_res_class(res->pcore, my_resid, ckrm_laq_res_t); - if (!parent) // socketclass does not have a share interface - return -EINVAL; - - // Ensure that we ignore limit values - shares->my_limit = CKRM_SHARE_DONTCARE; - shares->max_limit = CKRM_SHARE_UNCHANGED; - - if (res->my_depth == 0) { - printk(KERN_ERR "socketaq bad entry\n"); - 
return -EBADF; - } else if (res->my_depth == 1) { - // can't be written to. This is an internal default. - return -EINVAL; - } else if (res->my_depth == 2) { - //nothin to inherit - if (!shares->total_guarantee) { - return -EINVAL; - } - parent = res; - shares->my_guarantee = CKRM_SHARE_DONTCARE; - } else if (res->my_depth == 3) { - // accept queue itself. - shares->total_guarantee = CKRM_SHARE_UNCHANGED; - } - - ckrm_lock_hier(parent->pcore); - spin_lock(&parent->reslock); - rc = set_shares(shares, &res->shares, - (parent == res) ? NULL : &parent->shares); - if (rc) { - spin_unlock(&res->reslock); - ckrm_unlock_hier(res->pcore); - return rc; - } - calculate_aq_ratios(parent, aq_ratio); - laq_set_aq_values(parent, aq_ratio); - spin_unlock(&parent->reslock); - ckrm_unlock_hier(parent->pcore); - - return rc; -} - -static int laq_get_share_values(void *my_res, struct ckrm_shares *shares) -{ - ckrm_laq_res_t *res = my_res; - - if (!res) - return -EINVAL; - *shares = res->shares; - return 0; -} - -/************************************************************************** - * STATS *** - **************************************************************************/ - -void -laq_print_aq_stats(struct seq_file *sfile, struct tcp_acceptq_info *taq, int i) -{ - seq_printf(sfile, "Class %d connections:\n\taccepted: %u\n\t" - "queued: %u\n\twait_time: %u\n", - i, taq->acceptq_count, taq->acceptq_qcount, - jiffies_to_msecs(taq->acceptq_wait_time)); - - if (i) - return; - - for (i = 1; i < NUM_ACCEPT_QUEUES; i++) { - taq[0].acceptq_wait_time += taq[i].acceptq_wait_time; - taq[0].acceptq_qcount += taq[i].acceptq_qcount; - taq[0].acceptq_count += taq[i].acceptq_count; - } - - seq_printf(sfile, "Totals :\n\taccepted: %u\n\t" - "queued: %u\n\twait_time: %u\n", - taq->acceptq_count, taq->acceptq_qcount, - jiffies_to_msecs(taq->acceptq_wait_time)); - - return; -} - -void -laq_get_aq_stats(ckrm_laq_res_t * pres, ckrm_laq_res_t * mres, - struct tcp_acceptq_info *taq) -{ - struct ckrm_net_struct *ns; - struct ckrm_core_class *core = pres->core; - struct tcp_opt *tp; - int a = mres->my_id; - int z; - - if (a == 0) - z = NUM_ACCEPT_QUEUES; - else - z = a + 1; - - // XXX Instead of holding a class_lock introduce a rw - // lock to be write locked by listen callbacks and read locked here. - // - VK - class_lock(pres->core); - list_for_each_entry(ns, &core->objlist, ckrm_link) { - tp = tcp_sk(ns->ns_sk); - for (; a < z; a++) { - taq->acceptq_wait_time += tp->acceptq[a].aq_wait_time; - taq->acceptq_qcount += tp->acceptq[a].aq_qcount; - taq->acceptq_count += tp->acceptq[a].aq_count; - taq++; - } - } - class_unlock(pres->core); -} - -static int laq_get_stats(void *my_res, struct seq_file *sfile) -{ - ckrm_laq_res_t *res = my_res; - ckrm_laq_res_t *parent; - struct tcp_acceptq_info taq[NUM_ACCEPT_QUEUES]; - int rc = 0; - - if (!res) - return -EINVAL; - - if (!res->pcore) { - // something is badly wrong - printk(KERN_ERR "socketaq internal inconsistency\n"); - return -EBADF; - } - - parent = ckrm_get_res_class(res->pcore, my_resid, ckrm_laq_res_t); - if (!parent) { // socketclass does not have a stat interface - printk(KERN_ERR "socketaq internal fs inconsistency\n"); - return -EINVAL; - } - - memset(taq, 0, sizeof(struct tcp_acceptq_info) * NUM_ACCEPT_QUEUES); - - switch (res->my_depth) { - - default: - case 0: - printk(KERN_ERR "socket class bad entry\n"); - rc = -EBADF; - break; - - case 1: // can't be read from. this is internal default. 
- // return -EINVAL - rc = -EINVAL; - break; - - case 2: // return the default and total - ckrm_lock_hier(res->core); // block any deletes - laq_get_aq_stats(res, res, &taq[0]); - laq_print_aq_stats(sfile, &taq[0], 0); - ckrm_unlock_hier(res->core); // block any deletes - break; - - case 3: - ckrm_lock_hier(parent->core); // block any deletes - laq_get_aq_stats(parent, res, &taq[res->my_id]); - laq_print_aq_stats(sfile, &taq[res->my_id], res->my_id); - ckrm_unlock_hier(parent->core); // block any deletes - break; - } - - return rc; -} - -/* - * The network connection is reclassified to this class. Update its shares. - * The socket lock is held. - */ -static void laq_change_resclass(void *n, void *old, void *r) -{ - struct ckrm_net_struct *ns = (struct ckrm_net_struct *)n; - struct ckrm_laq_res *res = (struct ckrm_laq_res *)r; - unsigned int aq_ratio[NUM_ACCEPT_QUEUES]; - - if (res->my_depth != 2) - return; - - // a change to my_depth == 3 ie. the accept classes cannot happen. - // there is no target file - if (res->my_depth == 2) { // it is one of the socket classes - ckrm_lock_hier(res->pcore); - // share rule: hold parent resource lock. then self. - // However, since my_depth == 1 is a generic class it is not - // needed here. Self lock is enough. - spin_lock(&res->reslock); - calculate_aq_ratios(res, aq_ratio); - class_lock(res->pcore); - laq_set_aq_value(ns, aq_ratio); - class_unlock(res->pcore); - spin_unlock(&res->reslock); - ckrm_unlock_hier(res->pcore); - } - - return; -} - -struct ckrm_res_ctlr laq_rcbs = { - .res_name = "laq", - .resid = -1, // dynamically assigned - .res_alloc = laq_res_alloc, - .res_free = laq_res_free, - .set_share_values = laq_set_share_values, - .get_share_values = laq_get_share_values, - .get_stats = laq_get_stats, - .change_resclass = laq_change_resclass, - //.res_initcls = laq_res_initcls, //HUBERTUS: unnecessary !! -}; - -int __init init_ckrm_laq_res(void) -{ - struct ckrm_classtype *clstype; - int resid; - - clstype = ckrm_find_classtype_by_name("socketclass"); - if (clstype == NULL) { - printk(KERN_INFO " Unknown ckrm classtype"); - return -ENOENT; - } - - if (my_resid == -1) { - resid = ckrm_register_res_ctlr(clstype, &laq_rcbs); - if (resid >= 0) - my_resid = resid; - printk(KERN_DEBUG "........init_ckrm_listen_aq_res -> %d\n", my_resid); - } - return 0; - -} - -void __exit exit_ckrm_laq_res(void) -{ - ckrm_unregister_res_ctlr(&laq_rcbs); - my_resid = -1; -} - -module_init(init_ckrm_laq_res) - module_exit(exit_ckrm_laq_res) - - MODULE_LICENSE("GPL"); diff --git a/kernel/ckrm/ckrm_listenaq.c b/kernel/ckrm/ckrm_listenaq.c index 0fe858633..103e3f957 100644 --- a/kernel/ckrm/ckrm_listenaq.c +++ b/kernel/ckrm/ckrm_listenaq.c @@ -1,4 +1,4 @@ -/* ckrm_socketaq.c - accept queue resource controller +/* ckrm_listenaq.c - accept queue resource controller * * Copyright (C) Vivek Kashyap, IBM Corp. 
2004 * @@ -251,7 +251,7 @@ static int laq_set_share_values(void *my_res, struct ckrm_shares *shares) } parent = ckrm_get_res_class(res->pcore, my_resid, ckrm_laq_res_t); - if (!parent) // socket_class does not have a share interface + if (!parent) // socketclass does not have a share interface return -EINVAL; // Ensure that we ignore limit values @@ -380,7 +380,7 @@ static int laq_get_stats(void *my_res, struct seq_file *sfile) } parent = ckrm_get_res_class(res->pcore, my_resid, ckrm_laq_res_t); - if (!parent) { // socket_class does not have a stat interface + if (!parent) { // socketclass does not have a stat interface printk(KERN_ERR "socketaq internal fs inconsistency\n"); return -EINVAL; } @@ -451,7 +451,7 @@ static void laq_change_resclass(void *n, void *old, void *r) } struct ckrm_res_ctlr laq_rcbs = { - .res_name = "laq", + .res_name = "listenaq", .resid = -1, // dynamically assigned .res_alloc = laq_res_alloc, .res_free = laq_res_free, @@ -467,9 +467,9 @@ int __init init_ckrm_laq_res(void) struct ckrm_classtype *clstype; int resid; - clstype = ckrm_find_classtype_by_name("socket_class"); + clstype = ckrm_find_classtype_by_name("socketclass"); if (clstype == NULL) { - printk(KERN_INFO " Unknown ckrm classtype"); + printk(KERN_INFO " Unknown ckrm classtype"); return -ENOENT; } diff --git a/kernel/itimer.c b/kernel/itimer.c index 5bf6c881c..6918cb746 100644 --- a/kernel/itimer.c +++ b/kernel/itimer.c @@ -68,9 +68,7 @@ void it_real_fn(unsigned long __data) struct task_struct * p = (struct task_struct *) __data; unsigned long interval; - if (send_group_sig_info(SIGALRM, SEND_SIG_PRIV, p)) - printk("*warning*: failed to send SIGALRM to %u\n", p->pid); - + send_group_sig_info(SIGALRM, SEND_SIG_PRIV, p); interval = p->it_real_incr; if (interval) { if (interval > (unsigned long) LONG_MAX) diff --git a/kernel/kexec.c b/kernel/kexec.c new file mode 100644 index 000000000..b59023fbf --- /dev/null +++ b/kernel/kexec.c @@ -0,0 +1,640 @@ +/* + * kexec.c - kexec system call + * Copyright (C) 2002-2004 Eric Biederman + * + * This source code is licensed under the GNU General Public License, + * Version 2. See the file COPYING for more details. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* + * When kexec transitions to the new kernel there is a one-to-one + * mapping between physical and virtual addresses. On processors + * where you can disable the MMU this is trivial, and easy. For + * others it is still a simple predictable page table to setup. + * + * In that environment kexec copies the new kernel to its final + * resting place. This means I can only support memory whose + * physical address can fit in an unsigned long. In particular + * addresses where (pfn << PAGE_SHIFT) > ULONG_MAX cannot be handled. + * If the assembly stub has more restrictive requirements + * KEXEC_SOURCE_MEMORY_LIMIT and KEXEC_DEST_MEMORY_LIMIT can be + * defined more restrictively in . + * + * The code for the transition from the current kernel to the + * the new kernel is placed in the control_code_buffer, whose size + * is given by KEXEC_CONTROL_CODE_SIZE. In the best case only a single + * page of memory is necessary, but some architectures require more. + * Because this memory must be identity mapped in the transition from + * virtual to physical addresses it must live in the range + * 0 - TASK_SIZE, as only the user space mappings are arbitrarily + * modifiable. 
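+ *
+ * In the descriptor list described below, each entry is a physical
+ * address with a type tag in its low bits: IND_DESTINATION starts a
+ * run of destination pages, IND_SOURCE supplies the next source
+ * page, IND_INDIRECTION chains to a further page of entries, and
+ * IND_DONE terminates the list (see <linux/kexec.h>).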
+ * + * The assembly stub in the control code buffer is passed a linked list + * of descriptor pages detailing the source pages of the new kernel, + * and the destination addresses of those source pages. As this data + * structure is not used in the context of the current OS, it must + * be self-contained. + * + * The code has been made to work with highmem pages and will use a + * destination page in its final resting place (if it happens + * to allocate it). The end product of this is that most of the + * physical address space, and most of RAM can be used. + * + * Future directions include: + * - allocating a page table with the control code buffer identity + * mapped, to simplify machine_kexec and make kexec_on_panic more + * reliable. + */ + +/* + * KIMAGE_NO_DEST is an impossible destination address..., for + * allocating pages whose destination address we do not care about. + */ +#define KIMAGE_NO_DEST (-1UL) + +static int kimage_is_destination_range( + struct kimage *image, unsigned long start, unsigned long end); +static struct page *kimage_alloc_page(struct kimage *image, unsigned int gfp_mask, unsigned long dest); + + +static int kimage_alloc(struct kimage **rimage, + unsigned long nr_segments, struct kexec_segment *segments) +{ + int result; + struct kimage *image; + size_t segment_bytes; + unsigned long i; + + /* Allocate a controlling structure */ + result = -ENOMEM; + image = kmalloc(sizeof(*image), GFP_KERNEL); + if (!image) { + goto out; + } + memset(image, 0, sizeof(*image)); + image->head = 0; + image->entry = &image->head; + image->last_entry = &image->head; + + /* Initialize the list of control pages */ + INIT_LIST_HEAD(&image->control_pages); + + /* Initialize the list of destination pages */ + INIT_LIST_HEAD(&image->dest_pages); + + /* Initialize the list of unuseable pages */ + INIT_LIST_HEAD(&image->unuseable_pages); + + /* Read in the segments */ + image->nr_segments = nr_segments; + segment_bytes = nr_segments * sizeof*segments; + result = copy_from_user(image->segment, segments, segment_bytes); + if (result) + goto out; + + /* + * Verify we have good destination addresses. The caller is + * responsible for making certain we don't attempt to load + * the new image into invalid or reserved areas of RAM. This + * just verifies it is an address we can use. + */ + result = -EADDRNOTAVAIL; + for (i = 0; i < nr_segments; i++) { + unsigned long mend; + mend = ((unsigned long)(image->segment[i].mem)) + + image->segment[i].memsz; + if (mend >= KEXEC_DESTINATION_MEMORY_LIMIT) + goto out; + } + + /* + * Find a location for the control code buffer, and add it + * the vector of segments so that it's pages will also be + * counted as destination pages. 
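+	 * kimage_alloc_control_pages() below simply retries the
+	 * allocation, parking any page that overlaps a destination
+	 * range on a local list so the allocator cannot hand it back.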
+
+static int kimage_is_destination_range(
+	struct kimage *image, unsigned long start, unsigned long end)
+{
+	unsigned long i;
+
+	for (i = 0; i < image->nr_segments; i++) {
+		unsigned long mstart, mend;
+		mstart = (unsigned long)image->segment[i].mem;
+		mend = mstart + image->segment[i].memsz;
+		if ((end > mstart) && (start < mend)) {
+			return 1;
+		}
+	}
+	return 0;
+}
+
+static struct page *kimage_alloc_pages(unsigned int gfp_mask, unsigned int order)
+{
+	struct page *pages;
+	pages = alloc_pages(gfp_mask, order);
+	if (pages) {
+		unsigned int count, i;
+		pages->mapping = NULL;
+		pages->private = order;
+		count = 1 << order;
+		for (i = 0; i < count; i++) {
+			SetPageReserved(pages + i);
+		}
+	}
+	return pages;
+}
+
+static void kimage_free_pages(struct page *page)
+{
+	unsigned int order, count, i;
+	order = page->private;
+	count = 1 << order;
+	for (i = 0; i < count; i++) {
+		ClearPageReserved(page + i);
+	}
+	__free_pages(page, order);
+}
+
+static void kimage_free_page_list(struct list_head *list)
+{
+	struct list_head *pos, *next;
+	list_for_each_safe(pos, next, list) {
+		struct page *page;
+
+		page = list_entry(pos, struct page, lru);
+		list_del(&page->lru);
+
+		kimage_free_pages(page);
+	}
+}
+
+struct page *kimage_alloc_control_pages(struct kimage *image, unsigned int order)
+{
+	/* Control pages are special, they are the intermediaries
+	 * that are needed while we copy the rest of the pages
+	 * to their final resting place. As such they must
+	 * not conflict with either the destination addresses
+	 * or memory the kernel is already using.
+	 *
+	 * The only case where we really need more than one of
+	 * these are for architectures where we cannot disable
+	 * the MMU and must instead generate an identity mapped
+	 * page table for all of the memory.
+	 *
+	 * At worst this runs in O(N) of the image size.
+	 */
+	struct list_head extra_pages;
+	struct page *pages;
+	unsigned int count;
+
+	count = 1 << order;
+	INIT_LIST_HEAD(&extra_pages);
+
+	/* Loop while I can allocate a page and the page allocated
+	 * is a destination page.
+	 */
+	do {
+		unsigned long pfn, epfn, addr, eaddr;
+		pages = kimage_alloc_pages(GFP_KERNEL, order);
+		if (!pages)
+			break;
+		pfn = page_to_pfn(pages);
+		epfn = pfn + count;
+		addr = pfn << PAGE_SHIFT;
+		eaddr = epfn << PAGE_SHIFT;
+		if ((epfn >= (KEXEC_CONTROL_MEMORY_LIMIT >> PAGE_SHIFT)) ||
+			kimage_is_destination_range(image, addr, eaddr))
+		{
+			list_add(&pages->lru, &extra_pages);
+			pages = NULL;
+		}
+	} while (!pages);
+	if (pages) {
+		/* Remember the allocated page... */
+		list_add(&pages->lru, &image->control_pages);
+
+		/* Because the page is already in its destination
+		 * location we will never allocate another page at
+		 * that address. Therefore kimage_alloc_pages
+		 * will not return it (again) and we don't need
+		 * to give it an entry in image->segment[].
+		 */
+	}
+	/* Deal with the destination pages I have inadvertently allocated.
+	 *
+	 * Ideally I would convert multi-page allocations into single
+	 * page allocations, and add everything to image->dest_pages.
+	 *
+	 * For now it is simpler to just free the pages.
+	 */
+	kimage_free_page_list(&extra_pages);
+	return pages;
+
+}
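The retry loop above gives kimage_alloc_control_pages() a useful post-condition: whatever it returns lies below KEXEC_CONTROL_MEMORY_LIMIT and overlaps no segment's destination range. A hedged sketch of a debug-only assertion one could bolt on to check this (hypothetical helper, not part of the patch):

	/* Sketch: debug-only check of the control-page post-condition. */
	static void kimage_check_control_pages(struct kimage *image,
		struct page *pages, unsigned int order)
	{
		unsigned long addr = page_to_pfn(pages) << PAGE_SHIFT;
		unsigned long eaddr = addr + ((1UL << order) << PAGE_SHIFT);

		BUG_ON(kimage_is_destination_range(image, addr, eaddr));
	}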
+
+static int kimage_add_entry(struct kimage *image, kimage_entry_t entry)
+{
+	if (*image->entry != 0) {
+		image->entry++;
+	}
+	if (image->entry == image->last_entry) {
+		kimage_entry_t *ind_page;
+		struct page *page;
+		page = kimage_alloc_page(image, GFP_KERNEL, KIMAGE_NO_DEST);
+		if (!page) {
+			return -ENOMEM;
+		}
+		ind_page = page_address(page);
+		*image->entry = virt_to_phys(ind_page) | IND_INDIRECTION;
+		image->entry = ind_page;
+		image->last_entry =
+			ind_page + ((PAGE_SIZE/sizeof(kimage_entry_t)) - 1);
+	}
+	*image->entry = entry;
+	image->entry++;
+	*image->entry = 0;
+	return 0;
+}
+
+static int kimage_set_destination(
+	struct kimage *image, unsigned long destination)
+{
+	int result;
+
+	destination &= PAGE_MASK;
+	result = kimage_add_entry(image, destination | IND_DESTINATION);
+	if (result == 0) {
+		image->destination = destination;
+	}
+	return result;
+}
+
+
+static int kimage_add_page(struct kimage *image, unsigned long page)
+{
+	int result;
+
+	page &= PAGE_MASK;
+	result = kimage_add_entry(image, page | IND_SOURCE);
+	if (result == 0) {
+		image->destination += PAGE_SIZE;
+	}
+	return result;
+}
+
+
+static void kimage_free_extra_pages(struct kimage *image)
+{
+	/* Walk through and free any extra destination pages I may have */
+	kimage_free_page_list(&image->dest_pages);
+
+	/* Walk through and free any unuseable pages I have cached */
+	kimage_free_page_list(&image->unuseable_pages);
+
+}
+static int kimage_terminate(struct kimage *image)
+{
+	int result;
+
+	result = kimage_add_entry(image, IND_DONE);
+	if (result == 0) {
+		/* Point at the terminating element */
+		image->entry--;
+		kimage_free_extra_pages(image);
+	}
+	return result;
+}
+
+#define for_each_kimage_entry(image, ptr, entry) \
+	for (ptr = &image->head; (entry = *ptr) && !(entry & IND_DONE); \
+		ptr = (entry & IND_INDIRECTION)? \
+			phys_to_virt((entry & PAGE_MASK)): ptr +1)
+
+static void kimage_free_entry(kimage_entry_t entry)
+{
+	struct page *page;
+
+	page = pfn_to_page(entry >> PAGE_SHIFT);
+	kimage_free_pages(page);
+}
+
+static void kimage_free(struct kimage *image)
+{
+	kimage_entry_t *ptr, entry;
+	kimage_entry_t ind = 0;
+
+	if (!image)
+		return;
+	kimage_free_extra_pages(image);
+	for_each_kimage_entry(image, ptr, entry) {
+		if (entry & IND_INDIRECTION) {
+			/* Free the previous indirection page */
+			if (ind & IND_INDIRECTION) {
+				kimage_free_entry(ind);
+			}
+			/* Save this indirection page until we are
+			 * done with it.
+			 */
+			ind = entry;
+		}
+		else if (entry & IND_SOURCE) {
+			kimage_free_entry(entry);
+		}
+	}
+	/* Free the final indirection page */
+	if (ind & IND_INDIRECTION) {
+		kimage_free_entry(ind);
+	}
+
+	/* Handle any machine specific cleanup */
+	machine_kexec_cleanup(image);
+
+	/* Free the kexec control pages... */
+	kimage_free_page_list(&image->control_pages);
+	kfree(image);
+}
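The list kimage_free() just tore down is the same structure the relocation stub consumes at boot: a run of kimage_entry_t words, each a physical address tagged in its low bits as IND_DESTINATION (set the copy cursor), IND_SOURCE (copy this page to the cursor, then advance), IND_INDIRECTION (continue reading entries from this page) or IND_DONE (stop). An illustrative reader, mirroring what for_each_kimage_entry() walks (a sketch only; the real stub runs on physical addresses with the MMU off):

	/* Sketch: walk an entry list the way the boot stub would. */
	static void kimage_walk_entries(struct kimage *image)
	{
		kimage_entry_t *ptr, entry;
		unsigned long dest = 0;

		for_each_kimage_entry(image, ptr, entry) {
			if (entry & IND_DESTINATION)
				dest = entry & PAGE_MASK;	/* new copy cursor */
			else if (entry & IND_SOURCE)
				/* the stub copies PAGE_SIZE bytes from
				 * (entry & PAGE_MASK) to dest at this point */
				dest += PAGE_SIZE;
		}
	}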
+
+static kimage_entry_t *kimage_dst_used(struct kimage *image, unsigned long page)
+{
+	kimage_entry_t *ptr, entry;
+	unsigned long destination = 0;
+
+	for_each_kimage_entry(image, ptr, entry) {
+		if (entry & IND_DESTINATION) {
+			destination = entry & PAGE_MASK;
+		}
+		else if (entry & IND_SOURCE) {
+			if (page == destination) {
+				return ptr;
+			}
+			destination += PAGE_SIZE;
+		}
+	}
+	return NULL;
+}
+
+static struct page *kimage_alloc_page(struct kimage *image, unsigned int gfp_mask, unsigned long destination)
+{
+	/*
+	 * Here we implement safeguards to ensure that a source page
+	 * is not copied to its destination page before the data on
+	 * the destination page is no longer useful.
+	 *
+	 * To do this we maintain the invariant that a source page is
+	 * either its own destination page, or it is not a
+	 * destination page at all.
+	 *
+	 * That is slightly stronger than required, but the proof
+	 * that no problems occur is then trivial, and the
+	 * implementation is simple to verify.
+	 *
+	 * When allocating all pages normally this algorithm will run
+	 * in O(N) time, but in the worst case it will run in O(N^2)
+	 * time. If the runtime is a problem the data structures can
+	 * be fixed.
+	 */
+	struct page *page;
+	unsigned long addr;
+
+	/*
+	 * Walk through the list of destination pages, and see if I
+	 * have a match.
+	 */
+	list_for_each_entry(page, &image->dest_pages, lru) {
+		addr = page_to_pfn(page) << PAGE_SHIFT;
+		if (addr == destination) {
+			list_del(&page->lru);
+			return page;
+		}
+	}
+	page = NULL;
+	while (1) {
+		kimage_entry_t *old;
+
+		/* Allocate a page; if we run out of memory give up */
+		page = kimage_alloc_pages(gfp_mask, 0);
+		if (!page) {
+			return NULL;
+		}
+		/* If the page cannot be used, file it away */
+		if (page_to_pfn(page) > (KEXEC_SOURCE_MEMORY_LIMIT >> PAGE_SHIFT)) {
+			list_add(&page->lru, &image->unuseable_pages);
+			continue;
+		}
+		addr = page_to_pfn(page) << PAGE_SHIFT;
+
+		/* If it is the destination page we want, use it */
+		if (addr == destination)
+			break;
+
+		/* If the page is not a destination page, use it */
+		if (!kimage_is_destination_range(image, addr, addr + PAGE_SIZE))
+			break;
+
+		/*
+		 * I know that the page is someone's destination page.
+		 * See if there is already a source page for this
+		 * destination page. And if so swap the source pages.
+		 */
+		old = kimage_dst_used(image, addr);
+		if (old) {
+			/* If so move it */
+			unsigned long old_addr;
+			struct page *old_page;
+
+			old_addr = *old & PAGE_MASK;
+			old_page = pfn_to_page(old_addr >> PAGE_SHIFT);
+			copy_highpage(page, old_page);
+			*old = addr | (*old & ~PAGE_MASK);
+
+			/* The old page I have found cannot be a
+			 * destination page, so return it.
+			 */
+			addr = old_addr;
+			page = old_page;
+			break;
+		}
+		else {
+			/* Place the page on the destination list; I
+			 * will use it later.
+			 */
+			list_add(&page->lru, &image->dest_pages);
+		}
+	}
+	return page;
+}
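A worked example may help before reading kimage_load_segment() below (numbers hypothetical, 4 KB pages assumed): with mem = 0x100000, memsz = 0x3000 and bufsz = 0x1800, the first page is copied whole from the user buffer; the second page gets 0x800 bytes of data and a zeroed 0x800-byte tail (size is clamped to bufsz - offset); the third page starts past bufsz, so it is zero-filled entirely. The segment therefore always occupies exactly memsz bytes of memory, regardless of how much data the caller supplied.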
+
+static int kimage_load_segment(struct kimage *image,
+	struct kexec_segment *segment)
+{
+	unsigned long mstart;
+	int result;
+	unsigned long offset;
+	unsigned long offset_end;
+	unsigned char *buf;
+
+	result = 0;
+	buf = segment->buf;
+	mstart = (unsigned long)segment->mem;
+
+	offset_end = segment->memsz;
+
+	result = kimage_set_destination(image, mstart);
+	if (result < 0) {
+		goto out;
+	}
+	for (offset = 0; offset < segment->memsz; offset += PAGE_SIZE) {
+		struct page *page;
+		char *ptr;
+		size_t size, leader;
+		page = kimage_alloc_page(image, GFP_HIGHUSER, mstart + offset);
+		if (!page) {
+			result = -ENOMEM;
+			goto out;
+		}
+		result = kimage_add_page(image, page_to_pfn(page) << PAGE_SHIFT);
+		if (result < 0) {
+			goto out;
+		}
+		ptr = kmap(page);
+		if (segment->bufsz < offset) {
+			/* We are past the end; zero the whole page */
+			memset(ptr, 0, PAGE_SIZE);
+			kunmap(page);
+			continue;
+		}
+		size = PAGE_SIZE;
+		leader = 0;
+		if (offset == 0) {
+			leader = mstart & ~PAGE_MASK;
+		}
+		if (leader) {
+			/* We are on the first page; zero the unused portion */
+			memset(ptr, 0, leader);
+			size -= leader;
+			ptr += leader;
+		}
+		if (size > (segment->bufsz - offset)) {
+			size = segment->bufsz - offset;
+		}
+		if (size < (PAGE_SIZE - leader)) {
+			/* zero the trailing part of the page */
+			memset(ptr + size, 0, (PAGE_SIZE - leader) - size);
+		}
+		result = copy_from_user(ptr, buf + offset, size);
+		kunmap(page);
+		if (result) {
+			result = (result < 0) ? result : -EIO;
+			goto out;
+		}
+	}
+ out:
+	return result;
+}
+
+/*
+ * Exec Kernel system call: for obvious reasons only root may call it.
+ *
+ * This call breaks up into three pieces.
+ * - A generic part which loads the new kernel from the current
+ *   address space, and very carefully places the data in the
+ *   allocated pages.
+ *
+ * - A generic part that interacts with the kernel and tells all of
+ *   the devices to shut down, preventing on-going DMAs and placing
+ *   the devices in a consistent state so a later kernel can
+ *   reinitialize them.
+ *
+ * - A machine specific part that includes the syscall number
+ *   and then copies the image to its final destination and
+ *   jumps into the image at entry.
+ *
+ * kexec does not sync or unmount filesystems, so if you need
+ * that to happen you need to do it yourself.
+ */
+struct kimage *kexec_image = NULL;
+
+asmlinkage long sys_kexec_load(unsigned long entry, unsigned long nr_segments,
+	struct kexec_segment *segments, unsigned long flags)
+{
+	struct kimage *image;
+	int result;
+
+	/* We only trust the superuser with rebooting the system. */
+	if (!capable(CAP_SYS_BOOT))
+		return -EPERM;
+
+	/*
+	 * In case we need just a little bit of special behavior for
+	 * reboot on panic.
+	 */
+	if (flags != 0)
+		return -EINVAL;
+
+	if (nr_segments > KEXEC_SEGMENT_MAX)
+		return -EINVAL;
+
+	image = NULL;
+	result = 0;
+
+	if (nr_segments > 0) {
+		unsigned long i;
+		result = kimage_alloc(&image, nr_segments, segments);
+		if (result) {
+			goto out;
+		}
+		result = machine_kexec_prepare(image);
+		if (result) {
+			goto out;
+		}
+		image->start = entry;
+		for (i = 0; i < nr_segments; i++) {
+			result = kimage_load_segment(image, &image->segment[i]);
+			if (result) {
+				goto out;
+			}
+		}
+		result = kimage_terminate(image);
+		if (result) {
+			goto out;
+		}
+	}
+
+	image = xchg(&kexec_image, image);
+
+ out:
+	kimage_free(image);
+	return result;
+}
diff --git a/kernel/signal.c b/kernel/signal.c
index e4282d2de..b3574b096 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -603,28 +603,17 @@ static int check_kill_permission(int sig, struct siginfo *info,
 			      struct task_struct *t)
 {
 	int error = -EINVAL;
-	int user;
-
 	if (sig < 0 || sig > _NSIG)
 		return error;
-
-	user = (!info ||
-		(info != SEND_SIG_PRIV &&
-		 info != SEND_SIG_FORCED &&
-		 SI_FROMUSER(info)));
-
 	error = -EPERM;
-	if (user && (sig != SIGCONT ||
-		current->signal->session != t->signal->session)
+	if ((!info || ((unsigned long)info != 1 &&
+		       (unsigned long)info != 2 && SI_FROMUSER(info)))
+	    && ((sig != SIGCONT) ||
+		(current->signal->session != t->signal->session))
 	    && (current->euid ^ t->suid) && (current->euid ^ t->uid)
 	    && (current->uid ^ t->suid) && (current->uid ^ t->uid)
 	    && !capable(CAP_KILL))
 		return error;
-
-	error = -ESRCH;
-	if (user && !vx_check(vx_task_xid(t), VX_ADMIN|VX_IDENT))
-		return error;
-
 	return security_task_kill(t, info, sig);
 }
@@ -1066,6 +1055,9 @@ int group_send_sig_info(int sig, struct siginfo *info, struct task_struct *p)
 	unsigned long flags;
 	int ret;
 
+	if (!vx_check(vx_task_xid(p), VX_ADMIN|VX_WATCH|VX_IDENT))
+		return -ESRCH;
+
 	ret = check_kill_permission(sig, info, p);
 	if (!ret && sig && p->sighand) {
 		spin_lock_irqsave(&p->sighand->siglock, flags);
diff --git a/kernel/sys.c b/kernel/sys.c
index c69f6ed82..6e8b073bc 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -18,6 +18,8 @@
 #include
 #include
 #include
+#include
+#include
 #include
 #include
 #include
@@ -511,6 +513,25 @@ asmlinkage long sys_reboot(int magic1, int magic2, unsigned int cmd, void __user
 		machine_restart(buffer);
 		break;
 
+#ifdef CONFIG_KEXEC
+	case LINUX_REBOOT_CMD_KEXEC:
+	{
+		struct kimage *image;
+		image = xchg(&kexec_image, 0);
+		if (!image) {
+			unlock_kernel();
+			return -EINVAL;
+		}
+		notifier_call_chain(&reboot_notifier_list, SYS_RESTART, NULL);
+		system_state = SYSTEM_RESTART;
+		device_shutdown();
+		system_state = SYSTEM_BOOTING;
+		printk(KERN_EMERG "Starting new kernel\n");
+		machine_shutdown();
+		machine_kexec(image);
+		break;
+	}
+#endif
 #ifdef CONFIG_SOFTWARE_SUSPEND
 	case LINUX_REBOOT_CMD_SW_SUSPEND:
 	{
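Putting the two halves above together, the user-space sequence is: load an image once, then trigger it through reboot(2). A hedged sketch (no libc wrapper exists for kexec_load in this era, so raw syscall(2) is assumed; seg is a struct kexec_segment as sketched earlier, entry the physical entry point, and the __NR_kexec_load number is arch- and patch-specific):

	/* Sketch only: stage a kernel, then kexec into it. */
	if (syscall(__NR_kexec_load, entry, 1UL, &seg, 0UL) == 0)
		syscall(__NR_reboot, LINUX_REBOOT_MAGIC1, LINUX_REBOOT_MAGIC2,
			LINUX_REBOOT_CMD_KEXEC, NULL);

Note the flags word must be 0 for now: sys_kexec_load() rejects anything else, reserving it for the reboot-on-panic behavior its comment anticipates.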
diff --git a/lib/.cvsignore b/lib/.cvsignore
new file mode 100644
index 000000000..30d38180f
--- /dev/null
+++ b/lib/.cvsignore
@@ -0,0 +1,2 @@
+crc32table.h
+gen_crc32table
diff --git a/net/ipv4/netfilter/ip_conntrack_pptp.c b/net/ipv4/netfilter/ip_conntrack_pptp.c
deleted file mode 100644
index 29ab1a495..000000000
--- a/net/ipv4/netfilter/ip_conntrack_pptp.c
+++ /dev/null
@@ -1,712 +0,0 @@
-/*
- * ip_conntrack_pptp.c - Version 2.0
- *
- * Connection tracking support for PPTP (Point to Point Tunneling Protocol).
- * PPTP is a a protocol for creating virtual private networks.
- * It is a specification defined by Microsoft and some vendors
- * working with Microsoft.
PPTP is built on top of a modified - * version of the Internet Generic Routing Encapsulation Protocol. - * GRE is defined in RFC 1701 and RFC 1702. Documentation of - * PPTP can be found in RFC 2637 - * - * (C) 2000-2003 by Harald Welte - * - * Development of this code funded by Astaro AG (http://www.astaro.com/) - * - * Limitations: - * - We blindly assume that control connections are always - * established in PNS->PAC direction. This is a violation - * of RFFC2673 - * - * TODO: - finish support for multiple calls within one session - * (needs expect reservations in newnat) - * - testing of incoming PPTP calls - * - * Changes: - * 2002-02-05 - Version 1.3 - * - Call ip_conntrack_unexpect_related() from - * pptp_timeout_related() to destroy expectations in case - * CALL_DISCONNECT_NOTIFY or tcp fin packet was seen - * (Philip Craig ) - * - Add Version information at module loadtime - * 2002-02-10 - Version 1.6 - * - move to C99 style initializers - * - remove second expectation if first arrives - * 2004-10-22 - Version 2.0 - * - merge Mandrake's 2.6.x port with recent 2.6.x API changes - * - fix lots of linear skb assumptions from Mandrake's port - * - */ - -#include -#include -#include -#include -#include -#include - -#include -#include -#include -#include - -#define IP_CT_PPTP_VERSION "2.0" - -MODULE_LICENSE("GPL"); -MODULE_AUTHOR("Harald Welte "); -MODULE_DESCRIPTION("Netfilter connection tracking helper module for PPTP"); - -DECLARE_LOCK(ip_pptp_lock); - -#if 0 -#include "ip_conntrack_pptp_priv.h" -#define DEBUGP(format, args...) printk(KERN_DEBUG "%s:%s: " format, __FILE__, __FUNCTION__, ## args) -#else -#define DEBUGP(format, args...) -#endif - -#define SECS *HZ -#define MINS * 60 SECS -#define HOURS * 60 MINS -#define DAYS * 24 HOURS - -#define PPTP_GRE_TIMEOUT (10 MINS) -#define PPTP_GRE_STREAM_TIMEOUT (5 DAYS) - -static int pptp_expectfn(struct ip_conntrack *ct) -{ - struct ip_conntrack *master; - struct ip_conntrack_expect *exp; - - DEBUGP("increasing timeouts\n"); - /* increase timeout of GRE data channel conntrack entry */ - ct->proto.gre.timeout = PPTP_GRE_TIMEOUT; - ct->proto.gre.stream_timeout = PPTP_GRE_STREAM_TIMEOUT; - - master = master_ct(ct); - if (!master) { - DEBUGP(" no master!!!\n"); - return 0; - } - - exp = ct->master; - if (!exp) { - DEBUGP("no expectation!!\n"); - return 0; - } - - DEBUGP("completing tuples with ct info\n"); - /* we can do this, since we're unconfirmed */ - if (ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.u.gre.key == - htonl(master->help.ct_pptp_info.pac_call_id)) { - /* assume PNS->PAC */ - ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.u.gre.key = - htonl(master->help.ct_pptp_info.pns_call_id); - ct->tuplehash[IP_CT_DIR_REPLY].tuple.dst.u.gre.key = - htonl(master->help.ct_pptp_info.pns_call_id); - } else { - /* assume PAC->PNS */ - ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.u.gre.key = - htonl(master->help.ct_pptp_info.pac_call_id); - ct->tuplehash[IP_CT_DIR_REPLY].tuple.dst.u.gre.key = - htonl(master->help.ct_pptp_info.pac_call_id); - } - - /* delete other expectation */ - if (exp->expected_list.next != &exp->expected_list) { - struct ip_conntrack_expect *other_exp; - struct list_head *cur_item, *next; - - for (cur_item = master->sibling_list.next; - cur_item != &master->sibling_list; cur_item = next) { - next = cur_item->next; - other_exp = list_entry(cur_item, - struct ip_conntrack_expect, - expected_list); - /* remove only if occurred at same sequence number */ - if (other_exp != exp && other_exp->seq == exp->seq) { - DEBUGP("unexpecting 
other direction\n"); - ip_ct_gre_keymap_destroy(other_exp); - ip_conntrack_unexpect_related(other_exp); - } - } - } - - return 0; -} - -/* timeout GRE data connections */ -static int pptp_timeout_related(struct ip_conntrack *ct) -{ - struct list_head *cur_item, *next; - struct ip_conntrack_expect *exp; - - /* FIXME: do we have to lock something ? */ - for (cur_item = ct->sibling_list.next; - cur_item != &ct->sibling_list; cur_item = next) { - next = cur_item->next; - exp = list_entry(cur_item, struct ip_conntrack_expect, - expected_list); - - ip_ct_gre_keymap_destroy(exp); - if (!exp->sibling) { - ip_conntrack_unexpect_related(exp); - continue; - } - - DEBUGP("setting timeout of conntrack %p to 0\n", - exp->sibling); - exp->sibling->proto.gre.timeout = 0; - exp->sibling->proto.gre.stream_timeout = 0; - /* refresh_acct will not modify counters if skb == NULL */ - ip_ct_refresh_acct(exp->sibling, 0, NULL, 0); - } - - return 0; -} - -/* expect GRE connections (PNS->PAC and PAC->PNS direction) */ -static inline int -exp_gre(struct ip_conntrack *master, - u_int32_t seq, - u_int16_t callid, - u_int16_t peer_callid) -{ - struct ip_conntrack_tuple inv_tuple; - struct ip_conntrack_tuple exp_tuples[] = { - /* tuple in original direction, PNS->PAC */ - { .src = { .ip = master->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.ip, - .u = { .gre = { .key = htonl(ntohs(peer_callid)) } } - }, - .dst = { .ip = master->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.ip, - .u = { .gre = { .key = htonl(ntohs(callid)) } }, - .protonum = IPPROTO_GRE - }, - }, - /* tuple in reply direction, PAC->PNS */ - { .src = { .ip = master->tuplehash[IP_CT_DIR_REPLY].tuple.src.ip, - .u = { .gre = { .key = htonl(ntohs(callid)) } } - }, - .dst = { .ip = master->tuplehash[IP_CT_DIR_REPLY].tuple.dst.ip, - .u = { .gre = { .key = htonl(ntohs(peer_callid)) } }, - .protonum = IPPROTO_GRE - }, - } - }, *exp_tuple; - - for (exp_tuple = exp_tuples; exp_tuple < &exp_tuples[2]; exp_tuple++) { - struct ip_conntrack_expect *exp; - - exp = ip_conntrack_expect_alloc(); - if (exp == NULL) - return 1; - - memcpy(&exp->tuple, exp_tuple, sizeof(exp->tuple)); - - exp->mask.src.ip = 0xffffffff; - exp->mask.src.u.all = 0; - exp->mask.dst.u.all = 0; - exp->mask.dst.u.gre.key = 0xffffffff; - exp->mask.dst.ip = 0xffffffff; - exp->mask.dst.protonum = 0xffff; - - exp->seq = seq; - exp->expectfn = pptp_expectfn; - - exp->help.exp_pptp_info.pac_call_id = ntohs(callid); - exp->help.exp_pptp_info.pns_call_id = ntohs(peer_callid); - - DEBUGP("calling expect_related "); - DUMP_TUPLE_RAW(&exp->tuple); - - /* Add GRE keymap entries */ - if (ip_ct_gre_keymap_add(exp, &exp->tuple, 0) != 0) { - kfree(exp); - return 1; - } - - invert_tuplepr(&inv_tuple, &exp->tuple); - if (ip_ct_gre_keymap_add(exp, &inv_tuple, 1) != 0) { - ip_ct_gre_keymap_destroy(exp); - kfree(exp); - return 1; - } - - if (ip_conntrack_expect_related(exp, master) != 0) { - ip_ct_gre_keymap_destroy(exp); - kfree(exp); - DEBUGP("cannot expect_related()\n"); - return 1; - } - } - - return 0; -} - -static inline int -pptp_inbound_pkt(struct sk_buff *skb, - struct tcphdr *tcph, - unsigned int ctlhoff, - size_t datalen, - struct ip_conntrack *ct) -{ - struct PptpControlHeader _ctlh, *ctlh; - unsigned int reqlen; - union pptp_ctrl_union _pptpReq, *pptpReq; - struct ip_ct_pptp_master *info = &ct->help.ct_pptp_info; - u_int16_t msg, *cid, *pcid; - u_int32_t seq; - - ctlh = skb_header_pointer(skb, ctlhoff, sizeof(_ctlh), &_ctlh); - if (unlikely(!ctlh)) { - DEBUGP("error during skb_header_pointer\n"); - return NF_ACCEPT; - 
} - - reqlen = datalen - sizeof(struct pptp_pkt_hdr) - sizeof(_ctlh); - pptpReq = skb_header_pointer(skb, ctlhoff+sizeof(struct pptp_pkt_hdr), - reqlen, &_pptpReq); - if (unlikely(!pptpReq)) { - DEBUGP("error during skb_header_pointer\n"); - return NF_ACCEPT; - } - - msg = ntohs(ctlh->messageType); - DEBUGP("inbound control message %s\n", strMName[msg]); - - switch (msg) { - case PPTP_START_SESSION_REPLY: - if (reqlen < sizeof(_pptpReq.srep)) { - DEBUGP("%s: short packet\n", strMName[msg]); - break; - } - - /* server confirms new control session */ - if (info->sstate < PPTP_SESSION_REQUESTED) { - DEBUGP("%s without START_SESS_REQUEST\n", - strMName[msg]); - break; - } - if (pptpReq->srep.resultCode == PPTP_START_OK) - info->sstate = PPTP_SESSION_CONFIRMED; - else - info->sstate = PPTP_SESSION_ERROR; - break; - - case PPTP_STOP_SESSION_REPLY: - if (reqlen < sizeof(_pptpReq.strep)) { - DEBUGP("%s: short packet\n", strMName[msg]); - break; - } - - /* server confirms end of control session */ - if (info->sstate > PPTP_SESSION_STOPREQ) { - DEBUGP("%s without STOP_SESS_REQUEST\n", - strMName[msg]); - break; - } - if (pptpReq->strep.resultCode == PPTP_STOP_OK) - info->sstate = PPTP_SESSION_NONE; - else - info->sstate = PPTP_SESSION_ERROR; - break; - - case PPTP_OUT_CALL_REPLY: - if (reqlen < sizeof(_pptpReq.ocack)) { - DEBUGP("%s: short packet\n", strMName[msg]); - break; - } - - /* server accepted call, we now expect GRE frames */ - if (info->sstate != PPTP_SESSION_CONFIRMED) { - DEBUGP("%s but no session\n", strMName[msg]); - break; - } - if (info->cstate != PPTP_CALL_OUT_REQ && - info->cstate != PPTP_CALL_OUT_CONF) { - DEBUGP("%s without OUTCALL_REQ\n", strMName[msg]); - break; - } - if (pptpReq->ocack.resultCode != PPTP_OUTCALL_CONNECT) { - info->cstate = PPTP_CALL_NONE; - break; - } - - cid = &pptpReq->ocack.callID; - pcid = &pptpReq->ocack.peersCallID; - - info->pac_call_id = ntohs(*cid); - - if (htons(info->pns_call_id) != *pcid) { - DEBUGP("%s for unknown callid %u\n", - strMName[msg], ntohs(*pcid)); - break; - } - - DEBUGP("%s, CID=%X, PCID=%X\n", strMName[msg], - ntohs(*cid), ntohs(*pcid)); - - info->cstate = PPTP_CALL_OUT_CONF; - - seq = ntohl(tcph->seq) + sizeof(struct pptp_pkt_hdr) - + sizeof(struct PptpControlHeader) - + ((void *)pcid - (void *)pptpReq); - - if (exp_gre(ct, seq, *cid, *pcid) != 0) - printk("ip_conntrack_pptp: error during exp_gre\n"); - break; - - case PPTP_IN_CALL_REQUEST: - if (reqlen < sizeof(_pptpReq.icack)) { - DEBUGP("%s: short packet\n", strMName[msg]); - break; - } - - /* server tells us about incoming call request */ - if (info->sstate != PPTP_SESSION_CONFIRMED) { - DEBUGP("%s but no session\n", strMName[msg]); - break; - } - pcid = &pptpReq->icack.peersCallID; - DEBUGP("%s, PCID=%X\n", strMName[msg], ntohs(*pcid)); - info->cstate = PPTP_CALL_IN_REQ; - info->pac_call_id = ntohs(*pcid); - break; - - case PPTP_IN_CALL_CONNECT: - if (reqlen < sizeof(_pptpReq.iccon)) { - DEBUGP("%s: short packet\n", strMName[msg]); - break; - } - - /* server tells us about incoming call established */ - if (info->sstate != PPTP_SESSION_CONFIRMED) { - DEBUGP("%s but no session\n", strMName[msg]); - break; - } - if (info->sstate != PPTP_CALL_IN_REP - && info->sstate != PPTP_CALL_IN_CONF) { - DEBUGP("%s but never sent IN_CALL_REPLY\n", - strMName[msg]); - break; - } - - pcid = &pptpReq->iccon.peersCallID; - cid = &info->pac_call_id; - - if (info->pns_call_id != ntohs(*pcid)) { - DEBUGP("%s for unknown CallID %u\n", - strMName[msg], ntohs(*cid)); - break; - } - - DEBUGP("%s, 
PCID=%X\n", strMName[msg], ntohs(*pcid)); - info->cstate = PPTP_CALL_IN_CONF; - - /* we expect a GRE connection from PAC to PNS */ - seq = ntohl(tcph->seq) + sizeof(struct pptp_pkt_hdr) - + sizeof(struct PptpControlHeader) - + ((void *)pcid - (void *)pptpReq); - - if (exp_gre(ct, seq, *cid, *pcid) != 0) - printk("ip_conntrack_pptp: error during exp_gre\n"); - - break; - - case PPTP_CALL_DISCONNECT_NOTIFY: - if (reqlen < sizeof(_pptpReq.disc)) { - DEBUGP("%s: short packet\n", strMName[msg]); - break; - } - - /* server confirms disconnect */ - cid = &pptpReq->disc.callID; - DEBUGP("%s, CID=%X\n", strMName[msg], ntohs(*cid)); - info->cstate = PPTP_CALL_NONE; - - /* untrack this call id, unexpect GRE packets */ - pptp_timeout_related(ct); - break; - - case PPTP_WAN_ERROR_NOTIFY: - break; - - case PPTP_ECHO_REQUEST: - case PPTP_ECHO_REPLY: - /* I don't have to explain these ;) */ - break; - default: - DEBUGP("invalid %s (TY=%d)\n", (msg <= PPTP_MSG_MAX) - ? strMName[msg]:strMName[0], msg); - break; - } - - return NF_ACCEPT; - -} - -static inline int -pptp_outbound_pkt(struct sk_buff *skb, - struct tcphdr *tcph, - unsigned int ctlhoff, - size_t datalen, - struct ip_conntrack *ct) -{ - struct PptpControlHeader _ctlh, *ctlh; - unsigned int reqlen; - union pptp_ctrl_union _pptpReq, *pptpReq; - struct ip_ct_pptp_master *info = &ct->help.ct_pptp_info; - u_int16_t msg, *cid, *pcid; - - ctlh = skb_header_pointer(skb, ctlhoff, sizeof(_ctlh), &_ctlh); - if (!ctlh) - return NF_ACCEPT; - - reqlen = datalen - sizeof(struct pptp_pkt_hdr) - sizeof(_ctlh); - pptpReq = skb_header_pointer(skb, ctlhoff+sizeof(_ctlh), reqlen, - &_pptpReq); - if (!pptpReq) - return NF_ACCEPT; - - msg = ntohs(ctlh->messageType); - DEBUGP("outbound control message %s\n", strMName[msg]); - - switch (msg) { - case PPTP_START_SESSION_REQUEST: - /* client requests for new control session */ - if (info->sstate != PPTP_SESSION_NONE) { - DEBUGP("%s but we already have one", - strMName[msg]); - } - info->sstate = PPTP_SESSION_REQUESTED; - break; - case PPTP_STOP_SESSION_REQUEST: - /* client requests end of control session */ - info->sstate = PPTP_SESSION_STOPREQ; - break; - - case PPTP_OUT_CALL_REQUEST: - if (reqlen < sizeof(_pptpReq.ocreq)) { - DEBUGP("%s: short packet\n", strMName[msg]); - break; - } - - /* client initiating connection to server */ - if (info->sstate != PPTP_SESSION_CONFIRMED) { - DEBUGP("%s but no session\n", - strMName[msg]); - break; - } - info->cstate = PPTP_CALL_OUT_REQ; - /* track PNS call id */ - cid = &pptpReq->ocreq.callID; - DEBUGP("%s, CID=%X\n", strMName[msg], ntohs(*cid)); - info->pns_call_id = ntohs(*cid); - break; - case PPTP_IN_CALL_REPLY: - if (reqlen < sizeof(_pptpReq.icack)) { - DEBUGP("%s: short packet\n", strMName[msg]); - break; - } - - /* client answers incoming call */ - if (info->cstate != PPTP_CALL_IN_REQ - && info->cstate != PPTP_CALL_IN_REP) { - DEBUGP("%s without incall_req\n", - strMName[msg]); - break; - } - if (pptpReq->icack.resultCode != PPTP_INCALL_ACCEPT) { - info->cstate = PPTP_CALL_NONE; - break; - } - pcid = &pptpReq->icack.peersCallID; - if (info->pac_call_id != ntohs(*pcid)) { - DEBUGP("%s for unknown call %u\n", - strMName[msg], ntohs(*pcid)); - break; - } - DEBUGP("%s, CID=%X\n", strMName[msg], ntohs(*pcid)); - /* part two of the three-way handshake */ - info->cstate = PPTP_CALL_IN_REP; - info->pns_call_id = ntohs(pptpReq->icack.callID); - break; - - case PPTP_CALL_CLEAR_REQUEST: - /* client requests hangup of call */ - if (info->sstate != PPTP_SESSION_CONFIRMED) { - 
DEBUGP("CLEAR_CALL but no session\n"); - break; - } - /* FUTURE: iterate over all calls and check if - * call ID is valid. We don't do this without newnat, - * because we only know about last call */ - info->cstate = PPTP_CALL_CLEAR_REQ; - break; - case PPTP_SET_LINK_INFO: - break; - case PPTP_ECHO_REQUEST: - case PPTP_ECHO_REPLY: - /* I don't have to explain these ;) */ - break; - default: - DEBUGP("invalid %s (TY=%d)\n", (msg <= PPTP_MSG_MAX)? - strMName[msg]:strMName[0], msg); - /* unknown: no need to create GRE masq table entry */ - break; - } - - return NF_ACCEPT; -} - - -/* track caller id inside control connection, call expect_related */ -static int -conntrack_pptp_help(struct sk_buff *skb, - struct ip_conntrack *ct, enum ip_conntrack_info ctinfo) - -{ - struct pptp_pkt_hdr _pptph, *pptph; - - struct tcphdr _tcph, *tcph; - u_int32_t tcplen = skb->len - skb->nh.iph->ihl * 4; - u_int32_t datalen; - void *datalimit; - int dir = CTINFO2DIR(ctinfo); - struct ip_ct_pptp_master *info = &ct->help.ct_pptp_info; - unsigned int nexthdr_off; - - int oldsstate, oldcstate; - int ret; - - /* don't do any tracking before tcp handshake complete */ - if (ctinfo != IP_CT_ESTABLISHED - && ctinfo != IP_CT_ESTABLISHED+IP_CT_IS_REPLY) { - DEBUGP("ctinfo = %u, skipping\n", ctinfo); - return NF_ACCEPT; - } - - nexthdr_off = skb->nh.iph->ihl*4; - tcph = skb_header_pointer(skb, skb->nh.iph->ihl*4, sizeof(_tcph), - &_tcph); - if (!tcph) - return NF_ACCEPT; - - /* not a complete TCP header? */ - if (tcplen < sizeof(struct tcphdr) || tcplen < tcph->doff * 4) { - DEBUGP("tcplen = %u\n", tcplen); - return NF_ACCEPT; - } - - - datalen = tcplen - tcph->doff * 4; - - /* checksum invalid? */ - if (tcp_v4_check(tcph, tcplen, skb->nh.iph->saddr, skb->nh.iph->daddr, - csum_partial((char *) tcph, tcplen, 0))) { - printk(KERN_NOTICE __FILE__ ": bad csum\n"); - /* W2K PPTP server sends TCP packets with wrong checksum :(( */ - //return NF_ACCEPT; - } - - if (tcph->fin || tcph->rst) { - DEBUGP("RST/FIN received, timeouting GRE\n"); - /* can't do this after real newnat */ - info->cstate = PPTP_CALL_NONE; - - /* untrack this call id, unexpect GRE packets */ - pptp_timeout_related(ct); - } - - nexthdr_off += tcph->doff*4; - pptph = skb_header_pointer(skb, skb->nh.iph->ihl*4 + tcph->doff*4, - sizeof(_pptph), &_pptph); - if (!pptph) { - DEBUGP("no full PPTP header, can't track\n"); - return NF_ACCEPT; - } - - datalimit = (void *) pptph + datalen; - - /* if it's not a control message we can't do anything with it */ - if (ntohs(pptph->packetType) != PPTP_PACKET_CONTROL || - ntohl(pptph->magicCookie) != PPTP_MAGIC_COOKIE) { - DEBUGP("not a control packet\n"); - return NF_ACCEPT; - } - - oldsstate = info->sstate; - oldcstate = info->cstate; - - LOCK_BH(&ip_pptp_lock); - - nexthdr_off += sizeof(_pptph); - /* FIXME: We just blindly assume that the control connection is always - * established from PNS->PAC. 
However, RFC makes no guarantee */ - if (dir == IP_CT_DIR_ORIGINAL) - /* client -> server (PNS -> PAC) */ - ret = pptp_outbound_pkt(skb, tcph, nexthdr_off, datalen, ct); - else - /* server -> client (PAC -> PNS) */ - ret = pptp_inbound_pkt(skb, tcph, nexthdr_off, datalen, ct); - DEBUGP("sstate: %d->%d, cstate: %d->%d\n", - oldsstate, info->sstate, oldcstate, info->cstate); - UNLOCK_BH(&ip_pptp_lock); - - return ret; -} - -/* control protocol helper */ -static struct ip_conntrack_helper pptp = { - .list = { NULL, NULL }, - .name = "pptp", - .flags = IP_CT_HELPER_F_REUSE_EXPECT, - .me = THIS_MODULE, - .max_expected = 2, - .timeout = 0, - .tuple = { .src = { .ip = 0, - .u = { .tcp = { .port = - __constant_htons(PPTP_CONTROL_PORT) } } - }, - .dst = { .ip = 0, - .u = { .all = 0 }, - .protonum = IPPROTO_TCP - } - }, - .mask = { .src = { .ip = 0, - .u = { .tcp = { .port = 0xffff } } - }, - .dst = { .ip = 0, - .u = { .all = 0 }, - .protonum = 0xffff - } - }, - .help = conntrack_pptp_help -}; - -/* ip_conntrack_pptp initialization */ -static int __init init(void) -{ - int retcode; - - DEBUGP(__FILE__ ": registering helper\n"); - if ((retcode = ip_conntrack_helper_register(&pptp))) { - printk(KERN_ERR "Unable to register conntrack application " - "helper for pptp: %d\n", retcode); - return -EIO; - } - - printk("ip_conntrack_pptp version %s loaded\n", IP_CT_PPTP_VERSION); - return 0; -} - -static void __exit fini(void) -{ - ip_conntrack_helper_unregister(&pptp); - printk("ip_conntrack_pptp version %s unloaded\n", IP_CT_PPTP_VERSION); -} - -module_init(init); -module_exit(fini); - -EXPORT_SYMBOL(ip_pptp_lock); diff --git a/net/ipv4/netfilter/ip_conntrack_pptp_priv.h b/net/ipv4/netfilter/ip_conntrack_pptp_priv.h deleted file mode 100644 index 6b52564e8..000000000 --- a/net/ipv4/netfilter/ip_conntrack_pptp_priv.h +++ /dev/null @@ -1,24 +0,0 @@ -#ifndef _IP_CT_PPTP_PRIV_H -#define _IP_CT_PPTP_PRIV_H - -/* PptpControlMessageType names */ -static const char *strMName[] = { - "UNKNOWN_MESSAGE", - "START_SESSION_REQUEST", - "START_SESSION_REPLY", - "STOP_SESSION_REQUEST", - "STOP_SESSION_REPLY", - "ECHO_REQUEST", - "ECHO_REPLY", - "OUT_CALL_REQUEST", - "OUT_CALL_REPLY", - "IN_CALL_REQUEST", - "IN_CALL_REPLY", - "IN_CALL_CONNECT", - "CALL_CLEAR_REQUEST", - "CALL_DISCONNECT_NOTIFY", - "WAN_ERROR_NOTIFY", - "SET_LINK_INFO" -}; - -#endif diff --git a/net/ipv4/netfilter/ip_conntrack_proto_gre.c b/net/ipv4/netfilter/ip_conntrack_proto_gre.c deleted file mode 100644 index 013f759cc..000000000 --- a/net/ipv4/netfilter/ip_conntrack_proto_gre.c +++ /dev/null @@ -1,349 +0,0 @@ -/* - * ip_conntrack_proto_gre.c - Version 2.0 - * - * Connection tracking protocol helper module for GRE. - * - * GRE is a generic encapsulation protocol, which is generally not very - * suited for NAT, as it has no protocol-specific part as port numbers. - * - * It has an optional key field, which may help us distinguishing two - * connections between the same two hosts. - * - * GRE is defined in RFC 1701 and RFC 1702, as well as RFC 2784 - * - * PPTP is built on top of a modified version of GRE, and has a mandatory - * field called "CallID", which serves us for the same purpose as the key - * field in plain GRE. 
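Worth keeping in mind while reading the GRE helper being removed below: the call ID does the job a port number would do for UDP or TCP. The tuple code further down reduces to something like this (a sketch distilled from gre_pkt_to_tuple(); the gre_hdr_pptp layout is assumed from the deleted GRE helper code, and the function itself is hypothetical):

	/* Sketch: a PPTP-GRE flow is keyed by its 16-bit call ID. */
	static u_int32_t pptp_gre_flow_key(const struct gre_hdr_pptp *pgrehdr)
	{
		return htonl(ntohs(pgrehdr->call_id));	/* call ID as "port" */
	}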
- * - * Documentation about PPTP can be found in RFC 2637 - * - * (C) 2000-2004 by Harald Welte - * - * Development of this code funded by Astaro AG (http://www.astaro.com/) - * - */ - -#include -#include -#include -#include -#include -#include -#include -#include - -#include - -DECLARE_RWLOCK(ip_ct_gre_lock); -#define ASSERT_READ_LOCK(x) MUST_BE_READ_LOCKED(&ip_ct_gre_lock) -#define ASSERT_WRITE_LOCK(x) MUST_BE_WRITE_LOCKED(&ip_ct_gre_lock) - -#include -#include -#include -#include - -#include -#include - -MODULE_LICENSE("GPL"); -MODULE_AUTHOR("Harald Welte "); -MODULE_DESCRIPTION("netfilter connection tracking protocol helper for GRE"); - -/* shamelessly stolen from ip_conntrack_proto_udp.c */ -#define GRE_TIMEOUT (30*HZ) -#define GRE_STREAM_TIMEOUT (180*HZ) - -#if 0 -#define DEBUGP(format, args...) printk(KERN_DEBUG "%s:%s: " format, __FILE__, __FUNCTION__, ## args) -#define DUMP_TUPLE_GRE(x) printk("%u.%u.%u.%u:0x%x -> %u.%u.%u.%u:0x%x\n", \ - NIPQUAD((x)->src.ip), ntohl((x)->src.u.gre.key), \ - NIPQUAD((x)->dst.ip), ntohl((x)->dst.u.gre.key)) -#else -#define DEBUGP(x, args...) -#define DUMP_TUPLE_GRE(x) -#endif - -/* GRE KEYMAP HANDLING FUNCTIONS */ -static LIST_HEAD(gre_keymap_list); - -static inline int gre_key_cmpfn(const struct ip_ct_gre_keymap *km, - const struct ip_conntrack_tuple *t) -{ - return ((km->tuple.src.ip == t->src.ip) && - (km->tuple.dst.ip == t->dst.ip) && - (km->tuple.dst.protonum == t->dst.protonum) && - (km->tuple.dst.u.all == t->dst.u.all)); -} - -/* look up the source key for a given tuple */ -static u_int32_t gre_keymap_lookup(struct ip_conntrack_tuple *t) -{ - struct ip_ct_gre_keymap *km; - u_int32_t key; - - READ_LOCK(&ip_ct_gre_lock); - km = LIST_FIND(&gre_keymap_list, gre_key_cmpfn, - struct ip_ct_gre_keymap *, t); - if (!km) { - READ_UNLOCK(&ip_ct_gre_lock); - return 0; - } - - key = km->tuple.src.u.gre.key; - READ_UNLOCK(&ip_ct_gre_lock); - - return key; -} - -/* add a single keymap entry, associate with specified expect */ -int ip_ct_gre_keymap_add(struct ip_conntrack_expect *exp, - struct ip_conntrack_tuple *t, int reply) -{ - struct ip_ct_gre_keymap *km; - - km = kmalloc(sizeof(*km), GFP_ATOMIC); - if (!km) - return -1; - - /* initializing list head should be sufficient */ - memset(km, 0, sizeof(*km)); - - memcpy(&km->tuple, t, sizeof(*t)); - - if (!reply) - exp->proto.gre.keymap_orig = km; - else - exp->proto.gre.keymap_reply = km; - - DEBUGP("adding new entry %p: ", km); - DUMP_TUPLE_GRE(&km->tuple); - - WRITE_LOCK(&ip_ct_gre_lock); - list_append(&gre_keymap_list, km); - WRITE_UNLOCK(&ip_ct_gre_lock); - - return 0; -} - -/* change the tuple of a keymap entry (used by nat helper) */ -void ip_ct_gre_keymap_change(struct ip_ct_gre_keymap *km, - struct ip_conntrack_tuple *t) -{ - if (!km) - { - printk(KERN_WARNING - "NULL GRE conntrack keymap change requested\n"); - return; - } - - DEBUGP("changing entry %p to: ", km); - DUMP_TUPLE_GRE(t); - - WRITE_LOCK(&ip_ct_gre_lock); - memcpy(&km->tuple, t, sizeof(km->tuple)); - WRITE_UNLOCK(&ip_ct_gre_lock); -} - -/* destroy the keymap entries associated with specified expect */ -void ip_ct_gre_keymap_destroy(struct ip_conntrack_expect *exp) -{ - DEBUGP("entering for exp %p\n", exp); - WRITE_LOCK(&ip_ct_gre_lock); - if (exp->proto.gre.keymap_orig) { - DEBUGP("removing %p from list\n", exp->proto.gre.keymap_orig); - list_del(&exp->proto.gre.keymap_orig->list); - kfree(exp->proto.gre.keymap_orig); - exp->proto.gre.keymap_orig = NULL; - } - if (exp->proto.gre.keymap_reply) { - DEBUGP("removing %p from list\n", 
exp->proto.gre.keymap_reply); - list_del(&exp->proto.gre.keymap_reply->list); - kfree(exp->proto.gre.keymap_reply); - exp->proto.gre.keymap_reply = NULL; - } - WRITE_UNLOCK(&ip_ct_gre_lock); -} - - -/* PUBLIC CONNTRACK PROTO HELPER FUNCTIONS */ - -/* invert gre part of tuple */ -static int gre_invert_tuple(struct ip_conntrack_tuple *tuple, - const struct ip_conntrack_tuple *orig) -{ - tuple->dst.u.gre.key = orig->src.u.gre.key; - tuple->src.u.gre.key = orig->dst.u.gre.key; - - return 1; -} - -/* gre hdr info to tuple */ -static int gre_pkt_to_tuple(const struct sk_buff *skb, - unsigned int dataoff, - struct ip_conntrack_tuple *tuple) -{ - struct gre_hdr _grehdr, *grehdr; - struct gre_hdr_pptp _pgrehdr, *pgrehdr; - u_int32_t srckey; - - grehdr = skb_header_pointer(skb, dataoff, sizeof(_grehdr), &_grehdr); - /* PPTP header is variable length, only need up to the call_id field */ - pgrehdr = skb_header_pointer(skb, dataoff, 8, &_pgrehdr); - - if (!grehdr || !pgrehdr) - return 0; - - switch (grehdr->version) { - case GRE_VERSION_1701: - if (!grehdr->key) { - DEBUGP("Can't track GRE without key\n"); - return 0; - } - tuple->dst.u.gre.key = *(gre_key(grehdr)); - break; - - case GRE_VERSION_PPTP: - if (ntohs(grehdr->protocol) != GRE_PROTOCOL_PPTP) { - DEBUGP("GRE_VERSION_PPTP but unknown proto\n"); - return 0; - } - tuple->dst.u.gre.key = htonl(ntohs(pgrehdr->call_id)); - break; - - default: - printk(KERN_WARNING "unknown GRE version %hu\n", - grehdr->version); - return 0; - } - - srckey = gre_keymap_lookup(tuple); - - tuple->src.u.gre.key = srckey; -#if 0 - DEBUGP("found src key %x for tuple ", ntohl(srckey)); - DUMP_TUPLE_GRE(tuple); -#endif - - return 1; -} - -/* print gre part of tuple */ -static unsigned int gre_print_tuple(char *buffer, - const struct ip_conntrack_tuple *tuple) -{ - return sprintf(buffer, "srckey=0x%x dstkey=0x%x ", - ntohl(tuple->src.u.gre.key), - ntohl(tuple->dst.u.gre.key)); -} - -/* print private data for conntrack */ -static unsigned int gre_print_conntrack(char *buffer, - const struct ip_conntrack *ct) -{ - return sprintf(buffer, "timeout=%u, stream_timeout=%u ", - (ct->proto.gre.timeout / HZ), - (ct->proto.gre.stream_timeout / HZ)); -} - -/* Returns verdict for packet, and may modify conntrack */ -static int gre_packet(struct ip_conntrack *ct, - const struct sk_buff *skb, - enum ip_conntrack_info conntrackinfo) -{ - /* If we've seen traffic both ways, this is a GRE connection. - * Extend timeout. */ - if (ct->status & IPS_SEEN_REPLY) { - ip_ct_refresh_acct(ct, conntrackinfo, skb, - ct->proto.gre.stream_timeout); - /* Also, more likely to be important, and not a probe. */ - set_bit(IPS_ASSURED_BIT, &ct->status); - } else - ip_ct_refresh_acct(ct, conntrackinfo, skb, - ct->proto.gre.timeout); - - return NF_ACCEPT; -} - -/* Called when a new connection for this protocol found. */ -static int gre_new(struct ip_conntrack *ct, - const struct sk_buff *skb) -{ - DEBUGP(": "); - DUMP_TUPLE_GRE(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple); - - /* initialize to sane value. Ideally a conntrack helper - * (e.g. 
in case of pptp) is increasing them */ - ct->proto.gre.stream_timeout = GRE_STREAM_TIMEOUT; - ct->proto.gre.timeout = GRE_TIMEOUT; - - return 1; -} - -/* Called when a conntrack entry has already been removed from the hashes - * and is about to be deleted from memory */ -static void gre_destroy(struct ip_conntrack *ct) -{ - struct ip_conntrack_expect *master = ct->master; - - DEBUGP(" entering\n"); - - if (!master) { - DEBUGP("no master exp for ct %p\n", ct); - return; - } - - ip_ct_gre_keymap_destroy(master); -} - -/* protocol helper struct */ -static struct ip_conntrack_protocol gre = { - .proto = IPPROTO_GRE, - .name = "gre", - .pkt_to_tuple = gre_pkt_to_tuple, - .invert_tuple = gre_invert_tuple, - .print_tuple = gre_print_tuple, - .print_conntrack = gre_print_conntrack, - .packet = gre_packet, - .new = gre_new, - .destroy = gre_destroy, - .exp_matches_pkt = NULL, - .me = THIS_MODULE -}; - -/* ip_conntrack_proto_gre initialization */ -static int __init init(void) -{ - int retcode; - - if ((retcode = ip_conntrack_protocol_register(&gre))) { - printk(KERN_ERR "Unable to register conntrack protocol " - "helper for gre: %d\n", retcode); - return -EIO; - } - - return 0; -} - -static void __exit fini(void) -{ - struct list_head *pos, *n; - - /* delete all keymap entries */ - WRITE_LOCK(&ip_ct_gre_lock); - list_for_each_safe(pos, n, &gre_keymap_list) { - DEBUGP("deleting keymap %p at module unload time\n", pos); - list_del(pos); - kfree(pos); - } - WRITE_UNLOCK(&ip_ct_gre_lock); - - ip_conntrack_protocol_unregister(&gre); -} - -EXPORT_SYMBOL(ip_ct_gre_keymap_add); -EXPORT_SYMBOL(ip_ct_gre_keymap_change); -EXPORT_SYMBOL(ip_ct_gre_keymap_destroy); - -module_init(init); -module_exit(fini); diff --git a/net/ipv4/netfilter/ip_nat_pptp.c b/net/ipv4/netfilter/ip_nat_pptp.c deleted file mode 100644 index 2bbb815e9..000000000 --- a/net/ipv4/netfilter/ip_nat_pptp.c +++ /dev/null @@ -1,477 +0,0 @@ -/* - * ip_nat_pptp.c - Version 2.0 - * - * NAT support for PPTP (Point to Point Tunneling Protocol). - * PPTP is a a protocol for creating virtual private networks. - * It is a specification defined by Microsoft and some vendors - * working with Microsoft. PPTP is built on top of a modified - * version of the Internet Generic Routing Encapsulation Protocol. - * GRE is defined in RFC 1701 and RFC 1702. 
Documentation of - * PPTP can be found in RFC 2637 - * - * (C) 2000-2004 by Harald Welte - * - * Development of this code funded by Astaro AG (http://www.astaro.com/) - * - * TODO: - Support for multiple calls within one session - * (needs netfilter newnat code) - * - NAT to a unique tuple, not to TCP source port - * (needs netfilter tuple reservation) - * - * Changes: - * 2002-02-10 - Version 1.3 - * - Use ip_nat_mangle_tcp_packet() because of cloned skb's - * in local connections (Philip Craig ) - * - add checks for magicCookie and pptp version - * - make argument list of pptp_{out,in}bound_packet() shorter - * - move to C99 style initializers - * - print version number at module loadtime - * 2003-09-22 - Version 1.5 - * - use SNATed tcp sourceport as callid, since we get called before - * TCP header is mangled (Philip Craig ) - * 2004-10-22 - Version 2.0 - * - kernel 2.6.x version - * - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#define IP_NAT_PPTP_VERSION "2.0" - -MODULE_LICENSE("GPL"); -MODULE_AUTHOR("Harald Welte "); -MODULE_DESCRIPTION("Netfilter NAT helper module for PPTP"); - - -#if 0 -#include "ip_conntrack_pptp_priv.h" -#define DEBUGP(format, args...) printk(KERN_DEBUG __FILE__ ":" __FUNCTION__ \ - ": " format, ## args) -#else -#define DEBUGP(format, args...) -#endif - -static unsigned int -pptp_nat_expected(struct sk_buff **pskb, - unsigned int hooknum, - struct ip_conntrack *ct, - struct ip_nat_info *info) -{ - struct ip_conntrack *master = master_ct(ct); - struct ip_nat_multi_range mr; - struct ip_ct_pptp_master *ct_pptp_info; - struct ip_nat_pptp *nat_pptp_info; - u_int32_t newip, newcid; - int ret; - - IP_NF_ASSERT(info); - IP_NF_ASSERT(master); - IP_NF_ASSERT(!(info->initialized & (1 << HOOK2MANIP(hooknum)))); - - DEBUGP("we have a connection!\n"); - - LOCK_BH(&ip_pptp_lock); - ct_pptp_info = &master->help.ct_pptp_info; - nat_pptp_info = &master->nat.help.nat_pptp_info; - - /* need to alter GRE tuple because conntrack expectfn() used 'wrong' - * (unmanipulated) values */ - if (HOOK2MANIP(hooknum) == IP_NAT_MANIP_DST) { - DEBUGP("completing tuples with NAT info \n"); - /* we can do this, since we're unconfirmed */ - if (ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.u.gre.key == - htonl(ct_pptp_info->pac_call_id)) { - /* assume PNS->PAC */ - ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.u.gre.key = - htonl(nat_pptp_info->pns_call_id); - ct->tuplehash[IP_CT_DIR_REPLY].tuple.dst.u.gre.key = - htonl(nat_pptp_info->pns_call_id); - newip = master->tuplehash[IP_CT_DIR_REPLY].tuple.src.ip; - newcid = htonl(nat_pptp_info->pac_call_id); - } else { - /* assume PAC->PNS */ - ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.u.gre.key = - htonl(nat_pptp_info->pac_call_id); - ct->tuplehash[IP_CT_DIR_REPLY].tuple.dst.u.gre.key = - htonl(nat_pptp_info->pac_call_id); - newip = master->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.ip; - newcid = htonl(nat_pptp_info->pns_call_id); - } - } else { - if (ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.u.gre.key == - htonl(ct_pptp_info->pac_call_id)) { - /* assume PNS->PAC */ - newip = master->tuplehash[IP_CT_DIR_REPLY].tuple.dst.ip; - newcid = htonl(ct_pptp_info->pns_call_id); - } - else { - /* assume PAC->PNS */ - newip = master->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.ip; - newcid = htonl(ct_pptp_info->pac_call_id); - } - } - - mr.rangesize = 1; - mr.range[0].flags = IP_NAT_RANGE_MAP_IPS | IP_NAT_RANGE_PROTO_SPECIFIED; - mr.range[0].min_ip = mr.range[0].max_ip = newip; - 
mr.range[0].min = mr.range[0].max = - ((union ip_conntrack_manip_proto ) { newcid }); - DEBUGP("change ip to %u.%u.%u.%u\n", - NIPQUAD(newip)); - DEBUGP("change key to 0x%x\n", ntohl(newcid)); - ret = ip_nat_setup_info(ct, &mr, hooknum); - - UNLOCK_BH(&ip_pptp_lock); - - return ret; - -} - -/* outbound packets == from PNS to PAC */ -static inline unsigned int -pptp_outbound_pkt(struct sk_buff **pskb, - struct ip_conntrack *ct, - enum ip_conntrack_info ctinfo, - struct ip_conntrack_expect *exp) - -{ - struct iphdr *iph = (*pskb)->nh.iph; - struct tcphdr *tcph = (void *) iph + iph->ihl*4; - struct pptp_pkt_hdr *pptph = (struct pptp_pkt_hdr *) - ((void *)tcph + tcph->doff*4); - - struct PptpControlHeader *ctlh; - union pptp_ctrl_union *pptpReq; - struct ip_ct_pptp_master *ct_pptp_info = &ct->help.ct_pptp_info; - struct ip_nat_pptp *nat_pptp_info = &ct->nat.help.nat_pptp_info; - - u_int16_t msg, *cid = NULL, new_callid; - - /* FIXME: size checks !!! */ - ctlh = (struct PptpControlHeader *) ((void *) pptph + sizeof(*pptph)); - pptpReq = (void *) ((void *) ctlh + sizeof(*ctlh)); - - new_callid = htons(ct_pptp_info->pns_call_id); - - switch (msg = ntohs(ctlh->messageType)) { - case PPTP_OUT_CALL_REQUEST: - cid = &pptpReq->ocreq.callID; - /* FIXME: ideally we would want to reserve a call ID - * here. current netfilter NAT core is not able to do - * this :( For now we use TCP source port. This breaks - * multiple calls within one control session */ - - /* save original call ID in nat_info */ - nat_pptp_info->pns_call_id = ct_pptp_info->pns_call_id; - - /* don't use tcph->source since we are at a DSTmanip - * hook (e.g. PREROUTING) and pkt is not mangled yet */ - new_callid = ct->tuplehash[IP_CT_DIR_REPLY].tuple.dst.u.tcp.port; - - /* save new call ID in ct info */ - ct_pptp_info->pns_call_id = ntohs(new_callid); - break; - case PPTP_IN_CALL_REPLY: - cid = &pptpReq->icreq.callID; - break; - case PPTP_CALL_CLEAR_REQUEST: - cid = &pptpReq->clrreq.callID; - break; - default: - DEBUGP("unknown outbound packet 0x%04x:%s\n", msg, - (msg <= PPTP_MSG_MAX)? 
strMName[msg]:strMName[0]); - /* fall through */ - - case PPTP_SET_LINK_INFO: - /* only need to NAT in case PAC is behind NAT box */ - case PPTP_START_SESSION_REQUEST: - case PPTP_START_SESSION_REPLY: - case PPTP_STOP_SESSION_REQUEST: - case PPTP_STOP_SESSION_REPLY: - case PPTP_ECHO_REQUEST: - case PPTP_ECHO_REPLY: - /* no need to alter packet */ - return NF_ACCEPT; - } - - IP_NF_ASSERT(cid); - - DEBUGP("altering call id from 0x%04x to 0x%04x\n", - ntohs(*cid), ntohs(new_callid)); - - /* mangle packet */ - ip_nat_mangle_tcp_packet(pskb, ct, ctinfo, (void *)cid - (void *)pptph, - sizeof(new_callid), (char *)&new_callid, - sizeof(new_callid)); - - return NF_ACCEPT; -} - -/* inbound packets == from PAC to PNS */ -static inline unsigned int -pptp_inbound_pkt(struct sk_buff **pskb, - struct ip_conntrack *ct, - enum ip_conntrack_info ctinfo, - struct ip_conntrack_expect *oldexp) -{ - struct iphdr *iph = (*pskb)->nh.iph; - struct tcphdr *tcph = (void *) iph + iph->ihl*4; - struct pptp_pkt_hdr *pptph = (struct pptp_pkt_hdr *) - ((void *)tcph + tcph->doff*4); - - struct PptpControlHeader *ctlh; - union pptp_ctrl_union *pptpReq; - struct ip_ct_pptp_master *ct_pptp_info = &ct->help.ct_pptp_info; - struct ip_nat_pptp *nat_pptp_info = &ct->nat.help.nat_pptp_info; - - u_int16_t msg, new_cid = 0, new_pcid, *pcid = NULL, *cid = NULL; - u_int32_t old_dst_ip; - - struct ip_conntrack_tuple t, inv_t; - struct ip_conntrack_tuple *orig_t, *reply_t; - - /* FIXME: size checks !!! */ - ctlh = (struct PptpControlHeader *) ((void *) pptph + sizeof(*pptph)); - pptpReq = (void *) ((void *) ctlh + sizeof(*ctlh)); - - new_pcid = htons(nat_pptp_info->pns_call_id); - - switch (msg = ntohs(ctlh->messageType)) { - case PPTP_OUT_CALL_REPLY: - pcid = &pptpReq->ocack.peersCallID; - cid = &pptpReq->ocack.callID; - if (!oldexp) { - DEBUGP("outcall but no expectation\n"); - break; - } - old_dst_ip = oldexp->tuple.dst.ip; - t = oldexp->tuple; - invert_tuplepr(&inv_t, &t); - - /* save original PAC call ID in nat_info */ - nat_pptp_info->pac_call_id = ct_pptp_info->pac_call_id; - - /* alter expectation */ - orig_t = &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple; - reply_t = &ct->tuplehash[IP_CT_DIR_REPLY].tuple; - if (t.src.ip == orig_t->src.ip && t.dst.ip == orig_t->dst.ip) { - /* expectation for PNS->PAC direction */ - t.src.u.gre.key = htonl(nat_pptp_info->pns_call_id); - t.dst.u.gre.key = htonl(ct_pptp_info->pac_call_id); - inv_t.src.ip = reply_t->src.ip; - inv_t.dst.ip = reply_t->dst.ip; - inv_t.src.u.gre.key = htonl(nat_pptp_info->pac_call_id); - inv_t.dst.u.gre.key = htonl(ct_pptp_info->pns_call_id); - } else { - /* expectation for PAC->PNS direction */ - t.src.u.gre.key = htonl(nat_pptp_info->pac_call_id); - t.dst.u.gre.key = htonl(ct_pptp_info->pns_call_id); - inv_t.src.ip = orig_t->src.ip; - inv_t.dst.ip = orig_t->dst.ip; - inv_t.src.u.gre.key = htonl(nat_pptp_info->pns_call_id); - inv_t.dst.u.gre.key = htonl(ct_pptp_info->pac_call_id); - } - - if (!ip_conntrack_change_expect(oldexp, &t)) { - DEBUGP("successfully changed expect\n"); - } else { - DEBUGP("can't change expect\n"); - } - ip_ct_gre_keymap_change(oldexp->proto.gre.keymap_orig, &t); - ip_ct_gre_keymap_change(oldexp->proto.gre.keymap_reply, &inv_t); - break; - case PPTP_IN_CALL_CONNECT: - pcid = &pptpReq->iccon.peersCallID; - if (!oldexp) - break; - old_dst_ip = oldexp->tuple.dst.ip; - t = oldexp->tuple; - - /* alter expectation, no need for callID */ - if (t.dst.ip == ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.ip) { - /* expectation for PNS->PAC direction */ - 
t.src.ip = ct->tuplehash[IP_CT_DIR_REPLY].tuple.dst.ip; - } else { - /* expectation for PAC->PNS direction */ - t.dst.ip = ct->tuplehash[IP_CT_DIR_REPLY].tuple.dst.ip; - } - - if (!ip_conntrack_change_expect(oldexp, &t)) { - DEBUGP("successfully changed expect\n"); - } else { - DEBUGP("can't change expect\n"); - } - break; - case PPTP_IN_CALL_REQUEST: - /* only need to nat in case PAC is behind NAT box */ - break; - case PPTP_WAN_ERROR_NOTIFY: - pcid = &pptpReq->wanerr.peersCallID; - break; - case PPTP_CALL_DISCONNECT_NOTIFY: - pcid = &pptpReq->disc.callID; - break; - - default: - DEBUGP("unknown inbound packet %s\n", - (msg <= PPTP_MSG_MAX)? strMName[msg]:strMName[0]); - /* fall through */ - - case PPTP_START_SESSION_REQUEST: - case PPTP_START_SESSION_REPLY: - case PPTP_STOP_SESSION_REQUEST: - case PPTP_STOP_SESSION_REPLY: - case PPTP_ECHO_REQUEST: - case PPTP_ECHO_REPLY: - /* no need to alter packet */ - return NF_ACCEPT; - } - - /* mangle packet */ - IP_NF_ASSERT(pcid); - DEBUGP("altering peer call id from 0x%04x to 0x%04x\n", - ntohs(*pcid), ntohs(new_pcid)); - ip_nat_mangle_tcp_packet(pskb, ct, ctinfo, (void *)pcid - (void *)pptph, - sizeof(new_pcid), (char *)&new_pcid, - sizeof(new_pcid)); - - if (new_cid) { - IP_NF_ASSERT(cid); - DEBUGP("altering call id from 0x%04x to 0x%04x\n", - ntohs(*cid), ntohs(new_cid)); - ip_nat_mangle_tcp_packet(pskb, ct, ctinfo, - (void *)cid - (void *)pptph, - sizeof(new_cid), (char *)&new_cid, - sizeof(new_cid)); - } - - /* great, at least we don't need to resize packets */ - return NF_ACCEPT; -} - - -static unsigned int tcp_help(struct ip_conntrack *ct, - struct ip_conntrack_expect *exp, - struct ip_nat_info *info, - enum ip_conntrack_info ctinfo, - unsigned int hooknum, struct sk_buff **pskb) -{ - struct iphdr *iph = (*pskb)->nh.iph; - struct tcphdr *tcph = (void *) iph + iph->ihl*4; - unsigned int datalen = (*pskb)->len - iph->ihl*4 - tcph->doff*4; - struct pptp_pkt_hdr *pptph; - - int dir; - - DEBUGP("entering\n"); - - /* Only mangle things once: DST for original direction - and SRC for reply direction. */ - dir = CTINFO2DIR(ctinfo); - if (!((HOOK2MANIP(hooknum) == IP_NAT_MANIP_SRC - && dir == IP_CT_DIR_ORIGINAL) - || (HOOK2MANIP(hooknum) == IP_NAT_MANIP_DST - && dir == IP_CT_DIR_REPLY))) { - DEBUGP("Not touching dir %s at hook %s\n", - dir == IP_CT_DIR_ORIGINAL ? "ORIG" : "REPLY", - hooknum == NF_IP_POST_ROUTING ? "POSTROUTING" - : hooknum == NF_IP_PRE_ROUTING ? "PREROUTING" - : hooknum == NF_IP_LOCAL_OUT ? "OUTPUT" - : hooknum == NF_IP_LOCAL_IN ? 
"INPUT" : "???"); - return NF_ACCEPT; - } - - /* if packet is too small, just skip it */ - if (datalen < sizeof(struct pptp_pkt_hdr)+ - sizeof(struct PptpControlHeader)) { - DEBUGP("pptp packet too short\n"); - return NF_ACCEPT; - } - - pptph = (struct pptp_pkt_hdr *) ((void *)tcph + tcph->doff*4); - - /* if it's not a control message, we can't handle it */ - if (ntohs(pptph->packetType) != PPTP_PACKET_CONTROL || - ntohl(pptph->magicCookie) != PPTP_MAGIC_COOKIE) { - DEBUGP("not a pptp control packet\n"); - return NF_ACCEPT; - } - - LOCK_BH(&ip_pptp_lock); - - if (dir == IP_CT_DIR_ORIGINAL) { - /* reuqests sent by client to server (PNS->PAC) */ - pptp_outbound_pkt(pskb, ct, ctinfo, exp); - } else { - /* response from the server to the client (PAC->PNS) */ - pptp_inbound_pkt(pskb, ct, ctinfo, exp); - } - - UNLOCK_BH(&ip_pptp_lock); - - return NF_ACCEPT; -} - -/* nat helper struct for control connection */ -static struct ip_nat_helper pptp_tcp_helper = { - .list = { NULL, NULL }, - .name = "pptp", - .flags = IP_NAT_HELPER_F_ALWAYS, - .me = THIS_MODULE, - .tuple = { .src = { .ip = 0, - .u = { .tcp = { .port = - __constant_htons(PPTP_CONTROL_PORT) } - } - }, - .dst = { .ip = 0, - .u = { .all = 0 }, - .protonum = IPPROTO_TCP - } - }, - - .mask = { .src = { .ip = 0, - .u = { .tcp = { .port = 0xFFFF } } - }, - .dst = { .ip = 0, - .u = { .all = 0 }, - .protonum = 0xFFFF - } - }, - .help = tcp_help, - .expect = pptp_nat_expected -}; - - -static int __init init(void) -{ - DEBUGP("%s: registering NAT helper\n", __FILE__); - if (ip_nat_helper_register(&pptp_tcp_helper)) { - printk(KERN_ERR "Unable to register NAT application helper " - "for pptp\n"); - return -EIO; - } - - printk("ip_nat_pptp version %s loaded\n", IP_NAT_PPTP_VERSION); - return 0; -} - -static void __exit fini(void) -{ - DEBUGP("cleanup_module\n" ); - ip_nat_helper_unregister(&pptp_tcp_helper); - printk("ip_nat_pptp version %s unloaded\n", IP_NAT_PPTP_VERSION); -} - -module_init(init); -module_exit(fini); diff --git a/net/ipv4/netfilter/ip_nat_proto_gre.c b/net/ipv4/netfilter/ip_nat_proto_gre.c deleted file mode 100644 index 5691a102a..000000000 --- a/net/ipv4/netfilter/ip_nat_proto_gre.c +++ /dev/null @@ -1,210 +0,0 @@ -/* - * ip_nat_proto_gre.c - Version 2.0 - * - * NAT protocol helper module for GRE. - * - * GRE is a generic encapsulation protocol, which is generally not very - * suited for NAT, as it has no protocol-specific part as port numbers. - * - * It has an optional key field, which may help us distinguishing two - * connections between the same two hosts. - * - * GRE is defined in RFC 1701 and RFC 1702, as well as RFC 2784 - * - * PPTP is built on top of a modified version of GRE, and has a mandatory - * field called "CallID", which serves us for the same purpose as the key - * field in plain GRE. - * - * Documentation about PPTP can be found in RFC 2637 - * - * (C) 2000-2004 by Harald Welte - * - * Development of this code funded by Astaro AG (http://www.astaro.com/) - * - */ - -#include -#include -#include -#include -#include -#include -#include - -MODULE_LICENSE("GPL"); -MODULE_AUTHOR("Harald Welte "); -MODULE_DESCRIPTION("Netfilter NAT protocol helper module for GRE"); - -#if 0 -#define DEBUGP(format, args...) printk(KERN_DEBUG __FILE__ ":" __FUNCTION__ \ - ": " format, ## args) -#else -#define DEBUGP(x, args...) 
-#endif
-
-/* is key in given range between min and max */
-static int
-gre_in_range(const struct ip_conntrack_tuple *tuple,
-	     enum ip_nat_manip_type maniptype,
-	     const union ip_conntrack_manip_proto *min,
-	     const union ip_conntrack_manip_proto *max)
-{
-	u_int32_t key;
-
-	if (maniptype == IP_NAT_MANIP_SRC)
-		key = tuple->src.u.gre.key;
-	else
-		key = tuple->dst.u.gre.key;
-
-	return ntohl(key) >= ntohl(min->gre.key)
-	       && ntohl(key) <= ntohl(max->gre.key);
-}
-
-/* generate unique tuple ... */
-static int
-gre_unique_tuple(struct ip_conntrack_tuple *tuple,
-		 const struct ip_nat_range *range,
-		 enum ip_nat_manip_type maniptype,
-		 const struct ip_conntrack *conntrack)
-{
-	u_int32_t min, i, range_size;
-	u_int32_t key = 0, *keyptr;
-
-	if (maniptype == IP_NAT_MANIP_SRC)
-		keyptr = &tuple->src.u.gre.key;
-	else
-		keyptr = &tuple->dst.u.gre.key;
-
-	if (!(range->flags & IP_NAT_RANGE_PROTO_SPECIFIED)) {
-		DEBUGP("%p: NATing GRE PPTP\n", conntrack);
-		min = 1;
-		range_size = 0xffff;
-	} else {
-		min = ntohl(range->min.gre.key);
-		range_size = ntohl(range->max.gre.key) - min + 1;
-	}
-
-	DEBUGP("min = %u, range_size = %u\n", min, range_size);
-
-	for (i = 0; i < range_size; i++, key++) {
-		*keyptr = htonl(min + key % range_size);
-		if (!ip_nat_used_tuple(tuple, conntrack))
-			return 1;
-	}
-
-	DEBUGP("%p: no NAT mapping\n", conntrack);
-
-	return 0;
-}
-
-/* manipulate a GRE packet according to maniptype */
-static int
-gre_manip_pkt(struct sk_buff **pskb,
-	      unsigned int hdroff,
-	      const struct ip_conntrack_manip *manip,
-	      enum ip_nat_manip_type maniptype)
-{
-	struct gre_hdr *greh;
-	struct gre_hdr_pptp *pgreh;
-
-	if (!skb_ip_make_writable(pskb, hdroff + sizeof(*pgreh)))
-		return 0;
-
-	greh = (void *)(*pskb)->data + hdroff;
-	pgreh = (struct gre_hdr_pptp *) greh;
-
-	/* we only have destination manip of a packet, since 'source key'
-	 * is not present in the packet itself */
-	if (maniptype == IP_NAT_MANIP_DST) {
-		/* key manipulation is always dest */
-		switch (greh->version) {
-		case 0:
-			if (!greh->key) {
-				DEBUGP("can't nat GRE w/o key\n");
-				break;
-			}
-			if (greh->csum) {
-				/* FIXME: Never tested this code... */
-				*(gre_csum(greh)) =
-					ip_nat_cheat_check(~*(gre_key(greh)),
-							   manip->u.gre.key,
-							   *(gre_csum(greh)));
-			}
-			*(gre_key(greh)) = manip->u.gre.key;
-			break;
-		case GRE_VERSION_PPTP:
-			DEBUGP("call_id -> 0x%04x\n",
-			       ntohl(manip->u.gre.key));
-			pgreh->call_id = htons(ntohl(manip->u.gre.key));
-			break;
-		default:
-			DEBUGP("can't nat unknown GRE version\n");
-			return 0;
-			break;
-		}
-	}
-	return 1;
-}
-
-/* print out a nat tuple */
-static unsigned int
-gre_print(char *buffer,
-	  const struct ip_conntrack_tuple *match,
-	  const struct ip_conntrack_tuple *mask)
-{
-	unsigned int len = 0;
-
-	if (mask->src.u.gre.key)
-		len += sprintf(buffer + len, "srckey=0x%x ",
-			       ntohl(match->src.u.gre.key));
-
-	if (mask->dst.u.gre.key)
-		len += sprintf(buffer + len, "dstkey=0x%x ",
-			       ntohl(match->dst.u.gre.key));
-
-	return len;
-}
-
-/* print a range of keys */
-static unsigned int
-gre_print_range(char *buffer, const struct ip_nat_range *range)
-{
-	if (range->min.gre.key != 0
-	    || range->max.gre.key != 0xFFFF) {
-		if (range->min.gre.key == range->max.gre.key)
-			return sprintf(buffer, "key 0x%x ",
-				       ntohl(range->min.gre.key));
-		else
-			return sprintf(buffer, "keys 0x%x-0x%x ",
-				       ntohl(range->min.gre.key),
-				       ntohl(range->max.gre.key));
-	} else
-		return 0;
-}
-
-/* nat helper struct */
-static struct ip_nat_protocol gre = {
-	.name		= "GRE",
-	.protonum	= IPPROTO_GRE,
-	.manip_pkt	= gre_manip_pkt,
-	.in_range	= gre_in_range,
-	.unique_tuple	= gre_unique_tuple,
-	.print		= gre_print,
-	.print_range	= gre_print_range
-};
-
-static int __init init(void)
-{
-	if (ip_nat_protocol_register(&gre))
-		return -EIO;
-
-	return 0;
-}
-
-static void __exit fini(void)
-{
-	ip_nat_protocol_unregister(&gre);
-}
-
-module_init(init);
-module_exit(fini);
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index 23f8f511d..ad097f510 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -1107,6 +1107,75 @@ static int udp_checksum_init(struct sk_buff *skb, struct udphdr *uh,
 	return 0;
 }
 
+/* XXX (mef) need to generalize the IPOD stuff. Right now I am borrowing
+   from the ICMP infrastructure. */
+#ifdef CONFIG_ICMP_IPOD
+#include <linux/reboot.h>
+
+extern int sysctl_icmp_ipod_version;
+extern int sysctl_icmp_ipod_enabled;
+extern u32 sysctl_icmp_ipod_host;
+extern u32 sysctl_icmp_ipod_mask;
+extern char sysctl_icmp_ipod_key[32+1];
+#define IPOD_CHECK_KEY \
+	(sysctl_icmp_ipod_key[0] != 0)
+#define IPOD_VALID_KEY(d) \
+	(strncmp(sysctl_icmp_ipod_key, (char *)(d), strlen(sysctl_icmp_ipod_key)) == 0)
+
+static void udp_ping_of_death(struct sk_buff *skb, struct udphdr *uh, u32 saddr)
+{
+	int doit = 0;
+
+	/*
+	 * If IPOD not enabled or wrong UDP IPOD port, ignore.
+	 */
+	if (!sysctl_icmp_ipod_enabled || (ntohs(uh->dest) != 664))
+		return;
+
+#if 0
+	printk(KERN_INFO "IPOD: got udp pod request, host=%u.%u.%u.%u\n", NIPQUAD(saddr));
+#endif
+
+
+	/*
+	 * First check the source address info.
+	 * If host not set, ignore.
+	 */
+	if (sysctl_icmp_ipod_host != 0xffffffff &&
+	    (ntohl(saddr) & sysctl_icmp_ipod_mask) == sysctl_icmp_ipod_host) {
+		/*
+		 * Now check the key if enabled.
+		 * If packet doesn't contain enough data or key
+		 * is otherwise invalid, ignore.
+		 */
+		if (IPOD_CHECK_KEY) {
+			if (pskb_may_pull(skb, sizeof(sysctl_icmp_ipod_key)+sizeof(struct udphdr)-1)){
+#if 0
+				int i;
+				for (i=0;i<32+1;i++){
+					printk("%c",((char*)skb->data)[i+sizeof(struct udphdr)]);
+				}
+				printk("\n");
+#endif
+				if (IPOD_VALID_KEY(skb->data+sizeof(struct udphdr)))
+					doit = 1;
+			}
+		} else {
+			doit = 1;
+		}
+	}
+	if (doit) {
+		sysctl_icmp_ipod_enabled = 0;
+		printk(KERN_CRIT "IPOD: reboot forced by %u.%u.%u.%u...\n",
+		       NIPQUAD(saddr));
+		machine_restart(NULL);
+	} else {
+		printk(KERN_WARNING "IPOD: from %u.%u.%u.%u rejected\n",
+		       NIPQUAD(saddr));
+	}
+}
+#endif
+
 /*
  * All we need to do is get the socket, and then do a checksum.
  */
@@ -1143,6 +1212,10 @@ int udp_rcv(struct sk_buff *skb)
 	if(rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST))
 		return udp_v4_mcast_deliver(skb, uh, saddr, daddr);
 
+#ifdef CONFIG_ICMP_IPOD
+	udp_ping_of_death(skb, uh, saddr);
+#endif
+
 	sk = udp_v4_lookup(saddr, uh->source, daddr, uh->dest, skb->dev->ifindex);
 
 	if (sk != NULL) {
diff --git a/scripts/.cvsignore b/scripts/.cvsignore
new file mode 100644
index 000000000..d95bc0ab8
--- /dev/null
+++ b/scripts/.cvsignore
@@ -0,0 +1,4 @@
+bin2c
+conmakehash
+kallsyms
+pnmtologo
diff --git a/scripts/basic/.cvsignore b/scripts/basic/.cvsignore
new file mode 100644
index 000000000..fa6c88800
--- /dev/null
+++ b/scripts/basic/.cvsignore
@@ -0,0 +1,3 @@
+docproc
+fixdep
+split-include
diff --git a/scripts/kconfig/.cvsignore b/scripts/kconfig/.cvsignore
new file mode 100644
index 000000000..37981a9ca
--- /dev/null
+++ b/scripts/kconfig/.cvsignore
@@ -0,0 +1,5 @@
+conf
+lex.zconf.c
+mconf
+zconf.tab.c
+zconf.tab.h
diff --git a/scripts/kernel-2.6-planetlab.spec b/scripts/kernel-2.6-planetlab.spec
index 4e2be569b..84f9f996d 100644
--- a/scripts/kernel-2.6-planetlab.spec
+++ b/scripts/kernel-2.6-planetlab.spec
@@ -22,7 +22,7 @@ Summary: The Linux kernel (the core of the Linux operating system)
 %define kversion 2.6.%{sublevel}
 %define rpmversion 2.6.%{sublevel}
 %define rhbsys %([ -r /etc/beehive-root ] && echo || echo .`whoami`)
-%define release 1.521.2.6.planetlab%{?date:.%{date}}
+%define release 1.521.3.planetlab%{?date:.%{date}}
 %define signmodules 0
 
 %define KVERREL %{PACKAGE_VERSION}-%{PACKAGE_RELEASE}
diff --git a/scripts/lxdialog/.cvsignore b/scripts/lxdialog/.cvsignore
new file mode 100644
index 000000000..bebf29560
--- /dev/null
+++ b/scripts/lxdialog/.cvsignore
@@ -0,0 +1 @@
+lxdialog
diff --git a/scripts/mod/.cvsignore b/scripts/mod/.cvsignore
new file mode 100644
index 000000000..a6dd5e27e
--- /dev/null
+++ b/scripts/mod/.cvsignore
@@ -0,0 +1,3 @@
+elfconfig.h
+mk_elfconfig
+modpost
diff --git a/usr/.cvsignore b/usr/.cvsignore
new file mode 100644
index 000000000..d06dfff84
--- /dev/null
+++ b/usr/.cvsignore
@@ -0,0 +1,3 @@
+gen_init_cpio
+initramfs_data.cpio
+initramfs_data.cpio.gz
-- 
2.47.0
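
Editor's sketches follow; they are not part of the patch, and every identifier not visible in the diff above is a placeholder.

The deleted pptp_tcp_helper selects connections with a tuple/mask pair: the mask marks which tuple fields are significant, here pinning the protocol (TCP) and the source port (PPTP_CONTROL_PORT, 1723) while wildcarding everything else. A minimal user-space sketch of that matching idea, with a simplified two-field tuple (nothing here is kernel API):

#include <stdint.h>
#include <stdio.h>

/* Simplified stand-in for the kernel's conntrack tuple: two fields only. */
struct tuple {
	uint8_t  proto;
	uint16_t sport;
};

/* A packet's tuple matches a helper when every bit selected by the mask
 * agrees with the helper's tuple - the idea behind the .tuple/.mask pair
 * in the deleted pptp_tcp_helper. */
static int helper_matches(const struct tuple *pkt, const struct tuple *tpl,
			  const struct tuple *mask)
{
	return ((pkt->proto ^ tpl->proto) & mask->proto) == 0 &&
	       ((pkt->sport ^ tpl->sport) & mask->sport) == 0;
}

int main(void)
{
	struct tuple helper = { 6, 1723 };	/* IPPROTO_TCP, PPTP control port */
	struct tuple mask   = { 0xff, 0xffff };	/* both fields significant */
	struct tuple a = { 6, 1723 }, b = { 6, 80 };

	printf("pptp: %d, http: %d\n",
	       helper_matches(&a, &helper, &mask),	/* prints 1 */
	       helper_matches(&b, &helper, &mask));	/* prints 0 */
	return 0;
}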
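
gre_unique_tuple() in the deleted ip_nat_proto_gre.c picks a free GRE key by walking the configured range, wrapping modulo its size, until ip_nat_used_tuple() stops objecting. The same search strategy as a standalone sketch, with a hypothetical is_used() predicate standing in for ip_nat_used_tuple():

#include <stdint.h>
#include <stdio.h>

/* Hypothetical stand-in for ip_nat_used_tuple(): pretend keys 1..4 are taken. */
static int is_used(uint32_t key)
{
	return key >= 1 && key <= 4;
}

/* Walk [min, min + range_size) the way gre_unique_tuple() does: start at
 * an arbitrary offset, wrap modulo range_size, stop at the first key the
 * predicate reports free. */
static int pick_free_key(uint32_t min, uint32_t range_size, uint32_t *out)
{
	uint32_t i, key = 0;	/* key may start at any offset */

	for (i = 0; i < range_size; i++, key++) {
		uint32_t candidate = min + key % range_size;
		if (!is_used(candidate)) {
			*out = candidate;
			return 1;
		}
	}
	return 0;	/* range exhausted: no NAT mapping possible */
}

int main(void)
{
	uint32_t key;

	if (pick_free_key(1, 0xffff, &key))
		printf("picked key %u\n", key);	/* prints 5 here */
	return 0;
}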
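
The version-0 branch of gre_manip_pkt() repairs the GRE checksum with ip_nat_cheat_check(), an RFC 1624-style incremental update: rather than re-summing the packet, it removes the old key from the checksum and adds the new one in one's-complement arithmetic. A self-checking sketch of that arithmetic for a single 16-bit word (the kernel helper operates on 32-bit values, but the algebra is the same):

#include <stdint.h>
#include <stdio.h>

/* Fold a 32-bit sum into 16 bits, as Internet checksumming requires. */
static uint16_t csum_fold(uint32_t sum)
{
	while (sum >> 16)
		sum = (sum & 0xffff) + (sum >> 16);
	return (uint16_t)sum;
}

/* RFC 1624: HC' = ~(~HC + ~m + m'). Given the old checksum and the old
 * and new values of one word, compute the new checksum without touching
 * the rest of the packet. */
static uint16_t csum_update(uint16_t check, uint16_t oldw, uint16_t neww)
{
	uint32_t sum = (uint16_t)~check;

	sum += (uint16_t)~oldw;
	sum += neww;
	return (uint16_t)~csum_fold(sum);
}

int main(void)
{
	uint16_t a = 0x1234, b = 0xabcd, a2 = 0x5678;
	uint16_t full_old = (uint16_t)~csum_fold((uint32_t)a + b);
	uint16_t full_new = (uint16_t)~csum_fold((uint32_t)a2 + b);

	/* The incremental result must match a checksum computed from scratch. */
	printf("full=%04x incremental=%04x\n",
	       full_new, csum_update(full_old, a, a2));
	return 0;
}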
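
udp_ping_of_death() above accepts a reboot order only when IPOD is enabled, the datagram is addressed to UDP port 664, the masked source address equals sysctl_icmp_ipod_host, and, if a key is configured, the payload begins with that key (the kernel compares the bytes immediately after the UDP header). For illustration only, a datagram the check would match could be produced as below; the target address and key are invented placeholders, and this assumes the receiving node's sysctls are set to match:

#include <arpa/inet.h>
#include <netinet/in.h>
#include <stdio.h>
#include <string.h>
#include <sys/socket.h>
#include <unistd.h>

int main(void)
{
	const char *key = "example-ipod-key";	/* placeholder; must equal sysctl_icmp_ipod_key */
	struct sockaddr_in dst = { 0 };
	int fd = socket(AF_INET, SOCK_DGRAM, 0);

	if (fd < 0) {
		perror("socket");
		return 1;
	}
	dst.sin_family = AF_INET;
	dst.sin_port = htons(664);		/* the hard-wired IPOD port */
	inet_pton(AF_INET, "192.0.2.1", &dst.sin_addr);	/* placeholder target */

	/* The key simply leads the payload: the kernel strncmp()s the first
	 * strlen(key) bytes right after the UDP header. */
	if (sendto(fd, key, strlen(key), 0,
		   (struct sockaddr *)&dst, sizeof(dst)) < 0)
		perror("sendto");
	close(fd);
	return 0;
}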