From: Planet-Lab Support
Date: Fri, 21 Jan 2005 03:34:32 +0000 (+0000)
Subject: This commit was manufactured by cvs2svn to create tag
X-Git-Tag: after-ckrm_E16-cpu-controller-v9rc1^0
X-Git-Url: http://git.onelab.eu/?a=commitdiff_plain;h=14f387ae37713b1527d145daf89a1ce1581b5ad0;hp=a91482bdcc2e0f6035702e46f1b99043a0893346;p=linux-2.6.git

This commit was manufactured by cvs2svn to create tag
'after-ckrm_E16-cpu-controller-v9rc1'.
---

diff --git a/.cvsignore b/.cvsignore
new file mode 100644
index 000000000..5e7d07457
--- /dev/null
+++ b/.cvsignore
@@ -0,0 +1,13 @@
+.config
+.tmp_System.map
+.tmp_kallsyms1.S
+.tmp_kallsyms2.S
+.tmp_kallsyms3.S
+.tmp_versions
+.tmp_vmlinux1
+.tmp_vmlinux2
+.tmp_vmlinux3
+.version
+Module.symvers
+System.map
+vmlinux
diff --git a/Documentation/ckrm/cpusched b/Documentation/ckrm/cpusched
new file mode 100644
index 000000000..01f7f232a
--- /dev/null
+++ b/Documentation/ckrm/cpusched
@@ -0,0 +1,86 @@
+CKRM CPU Scheduling
+===================
+
+Overview
+--------
+
+In CKRM, cpu scheduling is based on a two-level scheduling decision.
+Every time a new task is to be selected, the scheduler first determines
+which class to run next and then schedules the next task in the
+selected class.
+
+The scheduling within a class is performed using the default Linux
+O(1) scheduler.
+
+The class scheduler also follows the O(1) principle and works as
+follows:
+
+Each class maintains a local runqueue per cpu, or lrq for short. The
+existing O(1) scheduler is used to schedule within an lrq.
+
+Weights are assigned to each lrq that mirror the effective shares of
+that class. Every time a task executes, its weighted cycles are
+charged against its class. Classes thus progress in a time domain
+called cumulative virtual time (CVT). In essence, the class with the
+smallest CVT is selected next. Provisions are made to preserve
+interactivity and to avoid starvation of classes that have been
+sleeping for longer periods.
+
+Load balancing across an SMP system is performed by balancing the load
+of each class across CPUs such that the CPUs carry equal load and,
+across the whole system, each class maintains its share.
+
+Because CKRM uses a class hierarchy, cycles that are unused by a class
+are redistributed among its busy siblings.
+
+Enabling the CKRM CPU scheduler
+-------------------------------
+
+The scheduler is integrated into the Linux scheduler and therefore
+cannot be loaded dynamically like other CKRM schedulers.
+
+However, it can be selected at boot time or dynamically at run time.
+
+The boot options "ckrmcpu" or "nockrmcpu" enable or disable the CKRM
+cpu scheduler at boot time. Currently the scheduler is disabled by
+default.
+
+# cat /rcfs/taskclass/config
+
+"res=cpu,mode=enabled" indicates that the CKRM cpu scheduler is
+enabled.
+
+"res=cpu,mode=disabled" indicates that the CKRM cpu scheduler is
+disabled.
+
+The same strings can also be used to change the scheduling mode
+dynamically at runtime. For example, to activate the scheduler:
+
+# echo "res=cpu,mode=enabled" > /rcfs/taskclass/config
+
+# cat /rcfs/taskclass/*/stats
+
+The cpu portion of the stats output is shown as:
+
+ "cpu-usage(2,10,60)= 290 340 510"
+
+The three numbers represent the load over the last 2, 10 and 60
+seconds, with a base of 1000 (1000 = 100%).
+Hence the usage here is 29.0%, 34.0% and 51.0% respectively.
+
+For debugging purposes additional information can be printed out, but
+that format should not be relied upon.
+
+Use `echo "res=cpu,usage_detail=3"` for the highest level of detail on
+usage. Please consult the source code for the specifics.
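+
+For example, to request the highest level of usage detail and then read
+back the per-class statistics (a sketch only; the redirect target
+/rcfs/taskclass/config used here is an assumption, mirroring the mode
+examples above):
+
+# echo "res=cpu,usage_detail=3" > /rcfs/taskclass/config
+# cat /rcfs/taskclass/*/stats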
+
+Assigning shares
+----------------
+
+Share assignment follows the general approach described under
+ckrm_basics.
+
+# echo "res=cpu,guarantee=val" > shares
+
+sets the minimum guarantee of a class.
+
+
+
diff --git a/MAINTAINERS b/MAINTAINERS
index c8c25df43..523f115fb 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -1226,6 +1226,17 @@ W: http://nfs.sourceforge.net/
 W: http://www.cse.unsw.edu.au/~neilb/patches/linux-devel/
 S: Maintained
 
+KEXEC
+P: Eric Biederman
+P: Randy Dunlap
+M: ebiederm@xmission.com
+M: rddunlap@osdl.org
+W: http://www.xmission.com/~ebiederm/files/kexec/
+W: http://developer.osdl.org/rddunlap/kexec/
+L: linux-kernel@vger.kernel.org
+L: fastboot@osdl.org
+S: Maintained
+
 LANMEDIA WAN CARD DRIVER
 P: Andrew Stanley-Jones
 M: asj@lanmedia.com
diff --git a/Makefile b/Makefile
index 4d94580e0..c57684382 100644
--- a/Makefile
+++ b/Makefile
@@ -1,7 +1,7 @@
 VERSION = 2
 PATCHLEVEL = 6
 SUBLEVEL = 8
-EXTRAVERSION = -1.521.2.5.planetlab
+EXTRAVERSION = -1.521.3.planetlab
 NAME=Zonked Quokka
 
 # *DOCUMENTATION*
@@ -453,6 +453,10 @@ ifndef CONFIG_FRAME_POINTER
 CFLAGS += -fomit-frame-pointer
 endif
 
+ifdef CONFIG_X86_STACK_CHECK
+CFLAGS += -p
+endif
+
 ifdef CONFIG_DEBUG_INFO
 CFLAGS += -g
 endif
diff --git a/arch/i386/Kconfig b/arch/i386/Kconfig
index 15b003b50..3a3ba7fec 100644
--- a/arch/i386/Kconfig
+++ b/arch/i386/Kconfig
@@ -926,6 +926,74 @@ config REGPARM
 	generate incorrect output with certain kernel constructs when
 	-mregparm=3 is used.
 
+config IRQSTACKS
+	bool "Use separate IRQ stacks"
+	help
+	  If you say Y here, the kernel will use a separate IRQ stack on each
+	  cpu to handle interrupts.
+
+config STACK_SIZE_SHIFT
+	int "Kernel stack size (12 => 4KB, 13 => 8KB, 14 => 16KB)"
+	range 12 14
+	default 12 if IRQSTACKS
+	default 13
+	help
+	  Select the kernel stack size. 4KB stacks are best as they let
+	  the system scale further. Use 8KB stacks if you have an
+	  experimental kernel where a stack overflow with a 4KB stack
+	  might occur. Use 16KB stacks if you want to safely support
+	  Windows device drivers using either Linuxant or ndiswrapper.
+
+config STACK_WARN
+	int "Print stack trace when stack grows beyond specified bytes"
+	default 4096 if IRQSTACKS
+	default 4096
+	help
+	  The kernel will print a stack trace when the current stack exceeds
+	  the specified size.
+
+config X86_STACK_CHECK
+	bool "Check for stack overflows"
+	default n
+	help
+	  Say Y here to have the kernel attempt to detect when the per-task
+	  kernel stack overflows.
+
+	  Some older versions of gcc don't handle the -p option correctly.
+	  Kernprof is affected by the same problem, which is described here:
+	  http://oss.sgi.com/projects/kernprof/faq.html#Q9
+
+	  Basically, if you get oopses in __free_pages_ok during boot when
+	  you have this turned on, you need to fix gcc. The Redhat 2.96
+	  version and gcc-3.x seem to work.
+
+	  If you are not debugging a stack overflow problem, say N.
+
+config STACK_PANIC
+	int "Panic when the stack comes within specified bytes of the stack limit"
+	depends on X86_STACK_CHECK
+	default 512 if IRQSTACKS
+	default 512
+	help
+	  Panic if the stack grows to within the specified number of bytes
+	  of the stack limit.
+
+config KEXEC
+	bool "kexec system call (EXPERIMENTAL)"
+	depends on EXPERIMENTAL
+	help
+	  kexec is a system call that implements the ability to shut down your
+	  current kernel and to start another kernel. It is like a reboot,
+	  but it is independent of the system firmware. And like a reboot
+	  you can start any kernel with it, not just Linux.
+
+	  The name comes from the similarity to the exec system call.
+ + It is an ongoing process to be certain the hardware in a machine + is properly shutdown, so do not be surprised if this code does not + initially work for you. It may help to enable device hotplugging + support. As of this writing the exact hardware interface is + strongly in flux, so no good recommendation can be made. + endmenu diff --git a/arch/i386/boot/.cvsignore b/arch/i386/boot/.cvsignore new file mode 100644 index 000000000..2d8a3afa4 --- /dev/null +++ b/arch/i386/boot/.cvsignore @@ -0,0 +1,4 @@ +bootsect +bzImage +setup +vmlinux.bin diff --git a/arch/i386/boot/compressed/.cvsignore b/arch/i386/boot/compressed/.cvsignore new file mode 100644 index 000000000..96b1b0022 --- /dev/null +++ b/arch/i386/boot/compressed/.cvsignore @@ -0,0 +1,3 @@ +vmlinux +vmlinux.bin +vmlinux.bin.gz diff --git a/arch/i386/boot/compressed/misc.c b/arch/i386/boot/compressed/misc.c index fa6704523..874568330 100644 --- a/arch/i386/boot/compressed/misc.c +++ b/arch/i386/boot/compressed/misc.c @@ -380,3 +380,6 @@ asmlinkage int decompress_kernel(struct moveparams *mv, void *rmode) if (high_loaded) close_output_buffer_if_we_run_high(mv); return high_loaded; } + +/* We don't actually check for stack overflows this early. */ +__asm__(".globl mcount ; mcount: ret\n"); diff --git a/arch/i386/boot/tools/.cvsignore b/arch/i386/boot/tools/.cvsignore new file mode 100644 index 000000000..378eac25d --- /dev/null +++ b/arch/i386/boot/tools/.cvsignore @@ -0,0 +1 @@ +build diff --git a/arch/i386/defconfig b/arch/i386/defconfig index aed3bc298..ed2bbb54d 100644 --- a/arch/i386/defconfig +++ b/arch/i386/defconfig @@ -1221,7 +1221,7 @@ CONFIG_OPROFILE=y CONFIG_EARLY_PRINTK=y CONFIG_DEBUG_SPINLOCK_SLEEP=y # CONFIG_FRAME_POINTER is not set -CONFIG_4KSTACKS=y +# CONFIG_4KSTACKS is not set CONFIG_X86_FIND_SMP_CONFIG=y CONFIG_X86_MPPARSE=y diff --git a/arch/i386/kernel/.cvsignore b/arch/i386/kernel/.cvsignore new file mode 100644 index 000000000..21c28761b --- /dev/null +++ b/arch/i386/kernel/.cvsignore @@ -0,0 +1,2 @@ +asm-offsets.s +vmlinux.lds.s diff --git a/arch/i386/kernel/Makefile b/arch/i386/kernel/Makefile index a056d5068..ab1ef80d1 100644 --- a/arch/i386/kernel/Makefile +++ b/arch/i386/kernel/Makefile @@ -23,6 +23,7 @@ obj-$(CONFIG_X86_TRAMPOLINE) += trampoline.o obj-$(CONFIG_X86_MPPARSE) += mpparse.o obj-$(CONFIG_X86_LOCAL_APIC) += apic.o nmi.o obj-$(CONFIG_X86_IO_APIC) += io_apic.o +obj-$(CONFIG_KEXEC) += machine_kexec.o relocate_kernel.o obj-$(CONFIG_X86_NUMAQ) += numaq.o obj-$(CONFIG_X86_SUMMIT_NUMA) += summit.o obj-$(CONFIG_MODULES) += module.o diff --git a/arch/i386/kernel/apic.c b/arch/i386/kernel/apic.c index ecf2b632f..eb4d41628 100644 --- a/arch/i386/kernel/apic.c +++ b/arch/i386/kernel/apic.c @@ -193,6 +193,36 @@ void disconnect_bsp_APIC(void) outb(0x70, 0x22); outb(0x00, 0x23); } + else { + /* Go back to Virtual Wire compatibility mode */ + unsigned long value; + + /* For the spurious interrupt use vector F, and enable it */ + value = apic_read(APIC_SPIV); + value &= ~APIC_VECTOR_MASK; + value |= APIC_SPIV_APIC_ENABLED; + value |= 0xf; + apic_write_around(APIC_SPIV, value); + + /* For LVT0 make it edge triggered, active high, external and enabled */ + value = apic_read(APIC_LVT0); + value &= ~(APIC_MODE_MASK | APIC_SEND_PENDING | + APIC_INPUT_POLARITY | APIC_LVT_REMOTE_IRR | + APIC_LVT_LEVEL_TRIGGER | APIC_LVT_MASKED ); + value |= APIC_LVT_REMOTE_IRR | APIC_SEND_PENDING; + value = SET_APIC_DELIVERY_MODE(value, APIC_MODE_EXINT); + apic_write_around(APIC_LVT0, value); + + /* For LVT1 make it edge triggered, 
active high, nmi and enabled */ + value = apic_read(APIC_LVT1); + value &= ~( + APIC_MODE_MASK | APIC_SEND_PENDING | + APIC_INPUT_POLARITY | APIC_LVT_REMOTE_IRR | + APIC_LVT_LEVEL_TRIGGER | APIC_LVT_MASKED); + value |= APIC_LVT_REMOTE_IRR | APIC_SEND_PENDING; + value = SET_APIC_DELIVERY_MODE(value, APIC_MODE_NMI); + apic_write_around(APIC_LVT1, value); + } } void disable_local_APIC(void) diff --git a/arch/i386/kernel/asm-offsets.c b/arch/i386/kernel/asm-offsets.c index 43943f871..b03f579a6 100644 --- a/arch/i386/kernel/asm-offsets.c +++ b/arch/i386/kernel/asm-offsets.c @@ -7,11 +7,11 @@ #include #include #include +#include #include #include "sigframe.h" #include #include -#include #define DEFINE(sym, val) \ asm volatile("\n->" #sym " %0 " #val : : "i" (val)) diff --git a/arch/i386/kernel/entry.S b/arch/i386/kernel/entry.S index 3ac74183c..dfbade1b9 100644 --- a/arch/i386/kernel/entry.S +++ b/arch/i386/kernel/entry.S @@ -1029,8 +1029,55 @@ ENTRY(sys_call_table) .long sys_mq_timedreceive /* 280 */ .long sys_mq_notify .long sys_mq_getsetattr - .long sys_ni_syscall /* reserved for kexec */ + .long sys_kexec_load .long sys_ioprio_set .long sys_ioprio_get /* 285 */ syscall_table_size=(.-sys_call_table) + +#ifdef CONFIG_X86_STACK_CHECK +.data +.globl stack_overflowed +stack_overflowed: + .long 0 +.text + +ENTRY(mcount) +#warning stack check enabled + push %eax + movl $(THREAD_SIZE - 1),%eax + andl %esp,%eax + cmpl $STACK_WARN,%eax + jle 1f +2: + popl %eax + ret +1: + /* prevent infinite recursion from call to mcount from the + * stack_overflow function. Need to revisit this code for + * SMP based systems. + */ + lock; btsl $0,stack_overflowed + jc 2b + + /* prepare to jmp to stack_overflow directly, as if it were + * called directly by the caller of mcount. + */ + pushl %ebp + pushl %ebx + pushl %esi + pushl %edi + + call stack_overflow + /* Note that stack_overflow() will clear the stack_overflowed + * variable. + */ + + popl %edi + popl %esi + popl %ebx + popl %ebp + + popl %eax + ret +#endif diff --git a/arch/i386/kernel/i386_ksyms.c b/arch/i386/kernel/i386_ksyms.c index 5a50c536d..584982c3e 100644 --- a/arch/i386/kernel/i386_ksyms.c +++ b/arch/i386/kernel/i386_ksyms.c @@ -188,6 +188,12 @@ EXPORT_SYMBOL(atomic_dec_and_lock); EXPORT_SYMBOL(__PAGE_KERNEL); +#ifdef CONFIG_X86_STACK_CHECK +extern void mcount(void); +EXPORT_SYMBOL(mcount); +#endif + + #ifdef CONFIG_HIGHMEM EXPORT_SYMBOL(kmap); EXPORT_SYMBOL(kunmap); diff --git a/arch/i386/kernel/i8259.c b/arch/i386/kernel/i8259.c index 97653d20f..7141d27ec 100644 --- a/arch/i386/kernel/i8259.c +++ b/arch/i386/kernel/i8259.c @@ -244,9 +244,21 @@ static int i8259A_resume(struct sys_device *dev) return 0; } +static int i8259A_shutdown(struct sys_device *dev) +{ + /* Put the i8259A into a quiescent state that + * the kernel initialization code can get it + * out of. 
+ */ + outb(0xff, 0x21); /* mask all of 8259A-1 */ + outb(0xff, 0xA1); /* mask all of 8259A-1 */ + return 0; +} + static struct sysdev_class i8259_sysdev_class = { set_kset_name("i8259"), .resume = i8259A_resume, + .shutdown = i8259A_shutdown, }; static struct sys_device device_i8259A = { diff --git a/arch/i386/kernel/init_task.c b/arch/i386/kernel/init_task.c index 7422d73ee..30cfd4085 100644 --- a/arch/i386/kernel/init_task.c +++ b/arch/i386/kernel/init_task.c @@ -29,6 +29,13 @@ union thread_union init_thread_union __attribute__((__section__(".data.init_task"))) = { INIT_THREAD_INFO(init_task, init_thread_union) }; +#ifdef CONFIG_X86_STACK_CHECK +union thread_union stack_overflow_stack + __attribute__((__section__(".data.init_task"))) = + { INIT_THREAD_INFO(init_task, stack_overflow_stack) }; +#endif + + /* * Initial task structure. * diff --git a/arch/i386/kernel/io_apic.c b/arch/i386/kernel/io_apic.c index 39af35d19..f600e6799 100644 --- a/arch/i386/kernel/io_apic.c +++ b/arch/i386/kernel/io_apic.c @@ -1604,11 +1604,42 @@ static void __init enable_IO_APIC(void) */ void disable_IO_APIC(void) { + int pin; /* * Clear the IO-APIC before rebooting: */ clear_IO_APIC(); + /* + * If the i82559 is routed through an IOAPIC + * Put that IOAPIC in virtual wire mode + * so legacy interrups can be delivered. + */ + pin = find_isa_irq_pin(0, mp_ExtINT); + if (pin != -1) { + struct IO_APIC_route_entry entry; + unsigned long flags; + + memset(&entry, 0, sizeof(entry)); + entry.mask = 0; /* Enabled */ + entry.trigger = 0; /* Edge */ + entry.irr = 0; + entry.polarity = 0; /* High */ + entry.delivery_status = 0; + entry.dest_mode = 0; /* Physical */ + entry.delivery_mode = 7; /* ExtInt */ + entry.vector = 0; + entry.dest.physical.physical_dest = 0; + + + /* + * Add it to the IO-APIC irq-routing table: + */ + spin_lock_irqsave(&ioapic_lock, flags); + io_apic_write(0, 0x11+2*pin, *(((int *)&entry)+1)); + io_apic_write(0, 0x10+2*pin, *(((int *)&entry)+0)); + spin_unlock_irqrestore(&ioapic_lock, flags); + } disconnect_bsp_APIC(); } diff --git a/arch/i386/kernel/irq.c b/arch/i386/kernel/irq.c index 22f7fc771..1c8bedaeb 100644 --- a/arch/i386/kernel/irq.c +++ b/arch/i386/kernel/irq.c @@ -76,8 +76,10 @@ static void register_irq_proc (unsigned int irq); /* * per-CPU IRQ handling stacks */ +#ifdef CONFIG_IRQSTACKS union irq_ctx *hardirq_ctx[NR_CPUS]; union irq_ctx *softirq_ctx[NR_CPUS]; +#endif /* * Special irq handlers. @@ -220,6 +222,9 @@ asmlinkage int handle_IRQ_event(unsigned int irq, int status = 1; /* Force the "do bottom halves" bit */ int retval = 0; + if (!(action->flags & SA_INTERRUPT)) + local_irq_enable(); + do { status |= action->flags; retval |= action->handler(irq, action->dev_id, regs); @@ -489,10 +494,12 @@ asmlinkage unsigned int do_IRQ(struct pt_regs regs) u32 *isp; union irq_ctx * curctx; union irq_ctx * irqctx; - +#ifdef CONFIG_IRQSTACKS curctx = (union irq_ctx *) current_thread_info(); irqctx = hardirq_ctx[smp_processor_id()]; - +#else + curctx = irqctx = (union irq_ctx *)0; +#endif spin_unlock(&desc->lock); /* @@ -536,7 +543,6 @@ asmlinkage unsigned int do_IRQ(struct pt_regs regs) break; desc->status &= ~IRQ_PENDING; } - desc->status &= ~IRQ_INPROGRESS; out: @@ -1095,6 +1101,7 @@ void init_irq_proc (void) } +#ifdef CONFIG_IRQSTACKS /* * These should really be __section__(".bss.page_aligned") as well, but * gcc's 3.0 and earlier don't handle that correctly. 
@@ -1174,3 +1181,4 @@ asmlinkage void do_softirq(void) } EXPORT_SYMBOL(do_softirq); +#endif diff --git a/arch/i386/kernel/machine_kexec.c b/arch/i386/kernel/machine_kexec.c new file mode 100644 index 000000000..3a9e878f8 --- /dev/null +++ b/arch/i386/kernel/machine_kexec.c @@ -0,0 +1,208 @@ +/* + * machine_kexec.c - handle transition of Linux booting another kernel + * Copyright (C) 2002-2004 Eric Biederman + * + * This source code is licensed under the GNU General Public License, + * Version 2. See the file COPYING for more details. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +static inline unsigned long read_cr3(void) +{ + unsigned long cr3; + asm volatile("movl %%cr3,%0": "=r"(cr3)); + return cr3; +} + +#define PAGE_ALIGNED __attribute__ ((__aligned__(PAGE_SIZE))) + +#define L0_ATTR (_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | _PAGE_DIRTY) +#define L1_ATTR (_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | _PAGE_DIRTY) +#define L2_ATTR (_PAGE_PRESENT) + +#define LEVEL0_SIZE (1UL << 12UL) + +#ifndef CONFIG_X86_PAE +#define LEVEL1_SIZE (1UL << 22UL) +static u32 pgtable_level1[1024] PAGE_ALIGNED; + +static void identity_map_page(unsigned long address) +{ + unsigned long level1_index, level2_index; + u32 *pgtable_level2; + + /* Find the current page table */ + pgtable_level2 = __va(read_cr3()); + + /* Find the indexes of the physical address to identity map */ + level1_index = (address % LEVEL1_SIZE)/LEVEL0_SIZE; + level2_index = address / LEVEL1_SIZE; + + /* Identity map the page table entry */ + pgtable_level1[level1_index] = address | L0_ATTR; + pgtable_level2[level2_index] = __pa(pgtable_level1) | L1_ATTR; + + /* Flush the tlb so the new mapping takes effect. + * Global tlb entries are not flushed but that is not an issue. + */ + load_cr3(pgtable_level2); +} + +#else +#define LEVEL1_SIZE (1UL << 21UL) +#define LEVEL2_SIZE (1UL << 30UL) +static u64 pgtable_level1[512] PAGE_ALIGNED; +static u64 pgtable_level2[512] PAGE_ALIGNED; + +static void identity_map_page(unsigned long address) +{ + unsigned long level1_index, level2_index, level3_index; + u64 *pgtable_level3; + + /* Find the current page table */ + pgtable_level3 = __va(read_cr3()); + + /* Find the indexes of the physical address to identity map */ + level1_index = (address % LEVEL1_SIZE)/LEVEL0_SIZE; + level2_index = (address % LEVEL2_SIZE)/LEVEL1_SIZE; + level3_index = address / LEVEL2_SIZE; + + /* Identity map the page table entry */ + pgtable_level1[level1_index] = address | L0_ATTR; + pgtable_level2[level2_index] = __pa(pgtable_level1) | L1_ATTR; + set_64bit(&pgtable_level3[level3_index], __pa(pgtable_level2) | L2_ATTR); + + /* Flush the tlb so the new mapping takes effect. + * Global tlb entries are not flushed but that is not an issue. 
+ */ + load_cr3(pgtable_level3); +} +#endif + + +static void set_idt(void *newidt, __u16 limit) +{ + unsigned char curidt[6]; + + /* ia32 supports unaliged loads & stores */ + (*(__u16 *)(curidt)) = limit; + (*(__u32 *)(curidt +2)) = (unsigned long)(newidt); + + __asm__ __volatile__ ( + "lidt %0\n" + : "=m" (curidt) + ); +}; + + +static void set_gdt(void *newgdt, __u16 limit) +{ + unsigned char curgdt[6]; + + /* ia32 supports unaligned loads & stores */ + (*(__u16 *)(curgdt)) = limit; + (*(__u32 *)(curgdt +2)) = (unsigned long)(newgdt); + + __asm__ __volatile__ ( + "lgdt %0\n" + : "=m" (curgdt) + ); +}; + +static void load_segments(void) +{ +#define __STR(X) #X +#define STR(X) __STR(X) + + __asm__ __volatile__ ( + "\tljmp $"STR(__KERNEL_CS)",$1f\n" + "\t1:\n" + "\tmovl $"STR(__KERNEL_DS)",%eax\n" + "\tmovl %eax,%ds\n" + "\tmovl %eax,%es\n" + "\tmovl %eax,%fs\n" + "\tmovl %eax,%gs\n" + "\tmovl %eax,%ss\n" + ); +#undef STR +#undef __STR +} + +typedef asmlinkage void (*relocate_new_kernel_t)( + unsigned long indirection_page, unsigned long reboot_code_buffer, + unsigned long start_address, unsigned int has_pae); + +const extern unsigned char relocate_new_kernel[]; +extern void relocate_new_kernel_end(void); +const extern unsigned int relocate_new_kernel_size; + +/* + * Do what every setup is needed on image and the + * reboot code buffer to allow us to avoid allocations + * later. Currently nothing. + */ +int machine_kexec_prepare(struct kimage *image) +{ + return 0; +} + +void machine_kexec_cleanup(struct kimage *image) +{ +} + +/* + * Do not allocate memory (or fail in any way) in machine_kexec(). + * We are past the point of no return, committed to rebooting now. + */ +void machine_kexec(struct kimage *image) +{ + unsigned long indirection_page; + unsigned long reboot_code_buffer; + relocate_new_kernel_t rnk; + + /* Interrupts aren't acceptable while we reboot */ + local_irq_disable(); + + /* Compute some offsets */ + reboot_code_buffer = page_to_pfn(image->control_code_page) << PAGE_SHIFT; + indirection_page = image->head & PAGE_MASK; + + /* Set up an identity mapping for the reboot_code_buffer */ + identity_map_page(reboot_code_buffer); + + /* copy it out */ + memcpy((void *)reboot_code_buffer, relocate_new_kernel, relocate_new_kernel_size); + + /* The segment registers are funny things, they are + * automatically loaded from a table, in memory wherever you + * set them to a specific selector, but this table is never + * accessed again you set the segment to a different selector. + * + * The more common model is are caches where the behide + * the scenes work is done, but is also dropped at arbitrary + * times. + * + * I take advantage of this here by force loading the + * segments, before I zap the gdt with an invalid value. + */ + load_segments(); + /* The gdt & idt are now invalid. + * If you want to load them you must set up your own idt & gdt. 
+ */ + set_gdt(phys_to_virt(0),0); + set_idt(phys_to_virt(0),0); + + /* now call it */ + rnk = (relocate_new_kernel_t) reboot_code_buffer; + (*rnk)(indirection_page, reboot_code_buffer, image->start, cpu_has_pae); +} diff --git a/arch/i386/kernel/process.c b/arch/i386/kernel/process.c index 3093d1fc6..e8a01f2b5 100644 --- a/arch/i386/kernel/process.c +++ b/arch/i386/kernel/process.c @@ -219,6 +219,32 @@ static int __init idle_setup (char *str) __setup("idle=", idle_setup); +void stack_overflow(void) +{ + extern unsigned long stack_overflowed; + unsigned long esp = current_stack_pointer(); + int panicing = ((esp&(THREAD_SIZE-1)) <= STACK_PANIC); + + oops_in_progress = 1; + printk( "esp: 0x%lx masked: 0x%lx STACK_PANIC:0x%lx %d %d\n", + esp, (esp&(THREAD_SIZE-1)), STACK_PANIC, + (((esp&(THREAD_SIZE-1)) <= STACK_PANIC)), panicing); + show_trace(current,(void*)esp); + + if (panicing) + panic("stack overflow\n"); + + oops_in_progress = 0; + + /* Just let it happen once per task, as otherwise it goes nuts + * in printing stack traces. This means that I need to dump + * the stack_overflowed boolean into the task or thread_info + * structure. For now just turn it off all together. + */ + + /* stack_overflowed = 0; */ +} + void show_regs(struct pt_regs * regs) { unsigned long cr0 = 0L, cr2 = 0L, cr3 = 0L, cr4 = 0L; diff --git a/arch/i386/kernel/reboot.c b/arch/i386/kernel/reboot.c index e8d5cd3ab..85e89f94b 100644 --- a/arch/i386/kernel/reboot.c +++ b/arch/i386/kernel/reboot.c @@ -23,7 +23,6 @@ static int reboot_mode; int reboot_thru_bios; #ifdef CONFIG_SMP -int reboot_smp = 0; static int reboot_cpu = -1; /* shamelessly grabbed from lib/vsprintf.c for readability */ #define is_digit(c) ((c) >= '0' && (c) <= '9') @@ -85,33 +84,9 @@ static int __init set_bios_reboot(struct dmi_system_id *d) return 0; } -/* - * Some machines require the "reboot=s" commandline option, this quirk makes that automatic. - */ -static int __init set_smp_reboot(struct dmi_system_id *d) -{ -#ifdef CONFIG_SMP - if (!reboot_smp) { - reboot_smp = 1; - printk(KERN_INFO "%s series board detected. Selecting SMP-method for reboots.\n", d->ident); - } -#endif - return 0; -} - -/* - * Some machines require the "reboot=b,s" commandline option, this quirk makes that automatic. - */ -static int __init set_smp_bios_reboot(struct dmi_system_id *d) -{ - set_smp_reboot(d); - set_bios_reboot(d); - return 0; -} - static struct dmi_system_id __initdata reboot_dmi_table[] = { { /* Handle problems with rebooting on Dell 1300's */ - .callback = set_smp_bios_reboot, + .callback = set_bios_reboot, .ident = "Dell PowerEdge 1300", .matches = { DMI_MATCH(DMI_SYS_VENDOR, "Dell Computer Corporation"), @@ -294,41 +269,32 @@ void machine_real_restart(unsigned char *code, int length) : "i" ((void *) (0x1000 - sizeof (real_mode_switch) - 100))); } -void machine_restart(char * __unused) +void machine_shutdown(void) { #ifdef CONFIG_SMP - int cpuid; - - cpuid = GET_APIC_ID(apic_read(APIC_ID)); - - if (reboot_smp) { - - /* check to see if reboot_cpu is valid - if its not, default to the BSP */ - if ((reboot_cpu == -1) || - (reboot_cpu > (NR_CPUS -1)) || - !physid_isset(cpuid, phys_cpu_present_map)) - reboot_cpu = boot_cpu_physical_apicid; - - reboot_smp = 0; /* use this as a flag to only go through this once*/ - /* re-run this function on the other CPUs - it will fall though this section since we have - cleared reboot_smp, and do the reboot if it is the - correct CPU, otherwise it halts. 
*/ - if (reboot_cpu != cpuid) - smp_call_function((void *)machine_restart , NULL, 1, 0); + int reboot_cpu_id; + + /* The boot cpu is always logical cpu 0 */ + reboot_cpu_id = 0; + + /* See if there has been given a command line override */ + if ((reboot_cpu_id != -1) && (reboot_cpu < NR_CPUS) && + cpu_isset(reboot_cpu, cpu_online_map)) { + reboot_cpu_id = reboot_cpu; } - /* if reboot_cpu is still -1, then we want a tradional reboot, - and if we are not running on the reboot_cpu,, halt */ - if ((reboot_cpu != -1) && (cpuid != reboot_cpu)) { - for (;;) - __asm__ __volatile__ ("hlt"); + /* Make certain the cpu I'm rebooting on is online */ + if (!cpu_isset(reboot_cpu_id, cpu_online_map)) { + reboot_cpu_id = smp_processor_id(); } - /* - * Stop all CPUs and turn off local APICs and the IO-APIC, so - * other OSs see a clean IRQ state. + + /* Make certain I only run on the appropriate processor */ + set_cpus_allowed(current, cpumask_of_cpu(reboot_cpu_id)); + + /* O.K. Now that I'm on the appropriate processor, stop + * all of the others, and disable their local APICs. */ + if (!netdump_mode) smp_send_stop(); #elif defined(CONFIG_X86_LOCAL_APIC) @@ -341,6 +307,11 @@ void machine_restart(char * __unused) #ifdef CONFIG_X86_IO_APIC disable_IO_APIC(); #endif +} + +void machine_restart(char * __unused) +{ + machine_shutdown(); if (!reboot_thru_bios) { if (efi_enabled) { diff --git a/arch/i386/kernel/relocate_kernel.S b/arch/i386/kernel/relocate_kernel.S new file mode 100644 index 000000000..54be4c2ae --- /dev/null +++ b/arch/i386/kernel/relocate_kernel.S @@ -0,0 +1,118 @@ +/* + * relocate_kernel.S - put the kernel image in place to boot + * Copyright (C) 2002-2004 Eric Biederman + * + * This source code is licensed under the GNU General Public License, + * Version 2. See the file COPYING for more details. + */ + +#include + + /* + * Must be relocatable PIC code callable as a C function, that once + * it starts can not use the previous processes stack. + */ + .globl relocate_new_kernel +relocate_new_kernel: + /* read the arguments and say goodbye to the stack */ + movl 4(%esp), %ebx /* indirection_page */ + movl 8(%esp), %ebp /* reboot_code_buffer */ + movl 12(%esp), %edx /* start address */ + movl 16(%esp), %ecx /* cpu_has_pae */ + + /* zero out flags, and disable interrupts */ + pushl $0 + popfl + + /* set a new stack at the bottom of our page... */ + lea 4096(%ebp), %esp + + /* store the parameters back on the stack */ + pushl %edx /* store the start address */ + + /* Set cr0 to a known state: + * 31 0 == Paging disabled + * 18 0 == Alignment check disabled + * 16 0 == Write protect disabled + * 3 0 == No task switch + * 2 0 == Don't do FP software emulation. + * 0 1 == Proctected mode enabled + */ + movl %cr0, %eax + andl $~((1<<31)|(1<<18)|(1<<16)|(1<<3)|(1<<2)), %eax + orl $(1<<0), %eax + movl %eax, %cr0 + + /* clear cr4 if applicable */ + testl %ecx, %ecx + jz 1f + /* Set cr4 to a known state: + * Setting everything to zero seems safe. + */ + movl %cr4, %eax + andl $0, %eax + movl %eax, %cr4 + + jmp 1f +1: + + /* Flush the TLB (needed?) 
*/ + xorl %eax, %eax + movl %eax, %cr3 + + /* Do the copies */ + cld +0: /* top, read another word for the indirection page */ + movl %ebx, %ecx + movl (%ebx), %ecx + addl $4, %ebx + testl $0x1, %ecx /* is it a destination page */ + jz 1f + movl %ecx, %edi + andl $0xfffff000, %edi + jmp 0b +1: + testl $0x2, %ecx /* is it an indirection page */ + jz 1f + movl %ecx, %ebx + andl $0xfffff000, %ebx + jmp 0b +1: + testl $0x4, %ecx /* is it the done indicator */ + jz 1f + jmp 2f +1: + testl $0x8, %ecx /* is it the source indicator */ + jz 0b /* Ignore it otherwise */ + movl %ecx, %esi /* For every source page do a copy */ + andl $0xfffff000, %esi + + movl $1024, %ecx + rep ; movsl + jmp 0b + +2: + + /* To be certain of avoiding problems with self-modifying code + * I need to execute a serializing instruction here. + * So I flush the TLB, it's handy, and not processor dependent. + */ + xorl %eax, %eax + movl %eax, %cr3 + + /* set all of the registers to known values */ + /* leave %esp alone */ + + xorl %eax, %eax + xorl %ebx, %ebx + xorl %ecx, %ecx + xorl %edx, %edx + xorl %esi, %esi + xorl %edi, %edi + xorl %ebp, %ebp + ret +relocate_new_kernel_end: + + .globl relocate_new_kernel_size +relocate_new_kernel_size: + .long relocate_new_kernel_end - relocate_new_kernel diff --git a/configs/kernel-2.6.8-i686-planetlab.config b/configs/kernel-2.6.8-i686-planetlab.config index ea66387e5..8cc762f56 100644 --- a/configs/kernel-2.6.8-i686-planetlab.config +++ b/configs/kernel-2.6.8-i686-planetlab.config @@ -30,8 +30,9 @@ CONFIG_RCFS_FS=y CONFIG_CKRM_TYPE_TASKCLASS=y CONFIG_CKRM_RES_NUMTASKS=y CONFIG_CKRM_CPU_SCHEDULE=y -CONFIG_CKRM_RES_BLKIO=y +# CONFIG_CKRM_RES_BLKIO is not set # CONFIG_CKRM_RES_MEM is not set +CONFIG_CKRM_CPU_SCHEDULE_AT_BOOT=y # CONFIG_CKRM_TYPE_SOCKETCLASS is not set CONFIG_CKRM_RBCE=y CONFIG_SYSCTL=y @@ -140,6 +141,12 @@ CONFIG_HIGHPTE=y # CONFIG_MATH_EMULATION is not set CONFIG_MTRR=y CONFIG_REGPARM=y +CONFIG_IRQSTACKS=y +CONFIG_STACK_SIZE_SHIFT=13 +CONFIG_STACK_WARN=4000 +CONFIG_X86_STACK_CHECK=y +CONFIG_STACK_PANIC=512 +CONFIG_KEXEC=y # # Power management options (ACPI, APM) @@ -211,7 +218,7 @@ CONFIG_PREVENT_FIRMWARE_BUILD=y # # Block devices # -# CONFIG_BLK_DEV_FD is not set +CONFIG_BLK_DEV_FD=m # CONFIG_BLK_DEV_XD is not set CONFIG_BLK_CPQ_DA=m CONFIG_BLK_CPQ_CISS_DA=m diff --git a/drivers/block/cfq-iosched-orig.c b/drivers/block/cfq-iosched-orig.c deleted file mode 100644 index 977d32ddd..000000000 --- a/drivers/block/cfq-iosched-orig.c +++ /dev/null @@ -1,706 +0,0 @@ -/* - * linux/drivers/block/cfq-iosched.c - * - * CFQ, or complete fairness queueing, disk scheduler. - * - * Based on ideas from a previously unfinished io - * scheduler (round robin per-process disk scheduling) and Andrea Arcangeli. 
- * - * Copyright (C) 2003 Jens Axboe - */ -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -/* - * tunables - */ -static int cfq_quantum = 4; -static int cfq_queued = 8; - -#define CFQ_QHASH_SHIFT 6 -#define CFQ_QHASH_ENTRIES (1 << CFQ_QHASH_SHIFT) -#define list_entry_qhash(entry) list_entry((entry), struct cfq_queue, cfq_hash) - -#define CFQ_MHASH_SHIFT 8 -#define CFQ_MHASH_BLOCK(sec) ((sec) >> 3) -#define CFQ_MHASH_ENTRIES (1 << CFQ_MHASH_SHIFT) -#define CFQ_MHASH_FN(sec) (hash_long(CFQ_MHASH_BLOCK((sec)),CFQ_MHASH_SHIFT)) -#define ON_MHASH(crq) !list_empty(&(crq)->hash) -#define rq_hash_key(rq) ((rq)->sector + (rq)->nr_sectors) -#define list_entry_hash(ptr) list_entry((ptr), struct cfq_rq, hash) - -#define list_entry_cfqq(ptr) list_entry((ptr), struct cfq_queue, cfq_list) - -#define RQ_DATA(rq) ((struct cfq_rq *) (rq)->elevator_private) - -static kmem_cache_t *crq_pool; -static kmem_cache_t *cfq_pool; -static mempool_t *cfq_mpool; - -struct cfq_data { - struct list_head rr_list; - struct list_head *dispatch; - struct list_head *cfq_hash; - - struct list_head *crq_hash; - - unsigned int busy_queues; - unsigned int max_queued; - - mempool_t *crq_pool; -}; - -struct cfq_queue { - struct list_head cfq_hash; - struct list_head cfq_list; - struct rb_root sort_list; - int pid; - int queued[2]; -#if 0 - /* - * with a simple addition like this, we can do io priorities. almost. - * does need a split request free list, too. - */ - int io_prio -#endif -}; - -struct cfq_rq { - struct rb_node rb_node; - sector_t rb_key; - - struct request *request; - - struct cfq_queue *cfq_queue; - - struct list_head hash; -}; - -static void cfq_put_queue(struct cfq_data *cfqd, struct cfq_queue *cfqq); -static struct cfq_queue *cfq_find_cfq_hash(struct cfq_data *cfqd, int pid); -static void cfq_dispatch_sort(struct list_head *head, struct cfq_rq *crq); - -/* - * lots of deadline iosched dupes, can be abstracted later... 
- */ -static inline void __cfq_del_crq_hash(struct cfq_rq *crq) -{ - list_del_init(&crq->hash); -} - -static inline void cfq_del_crq_hash(struct cfq_rq *crq) -{ - if (ON_MHASH(crq)) - __cfq_del_crq_hash(crq); -} - -static void cfq_remove_merge_hints(request_queue_t *q, struct cfq_rq *crq) -{ - cfq_del_crq_hash(crq); - - if (q->last_merge == crq->request) - q->last_merge = NULL; -} - -static inline void cfq_add_crq_hash(struct cfq_data *cfqd, struct cfq_rq *crq) -{ - struct request *rq = crq->request; - - BUG_ON(ON_MHASH(crq)); - - list_add(&crq->hash, &cfqd->crq_hash[CFQ_MHASH_FN(rq_hash_key(rq))]); -} - -static struct request *cfq_find_rq_hash(struct cfq_data *cfqd, sector_t offset) -{ - struct list_head *hash_list = &cfqd->crq_hash[CFQ_MHASH_FN(offset)]; - struct list_head *entry, *next = hash_list->next; - - while ((entry = next) != hash_list) { - struct cfq_rq *crq = list_entry_hash(entry); - struct request *__rq = crq->request; - - next = entry->next; - - BUG_ON(!ON_MHASH(crq)); - - if (!rq_mergeable(__rq)) { - __cfq_del_crq_hash(crq); - continue; - } - - if (rq_hash_key(__rq) == offset) - return __rq; - } - - return NULL; -} - -/* - * rb tree support functions - */ -#define RB_NONE (2) -#define RB_EMPTY(node) ((node)->rb_node == NULL) -#define RB_CLEAR(node) ((node)->rb_color = RB_NONE) -#define RB_CLEAR_ROOT(root) ((root)->rb_node = NULL) -#define ON_RB(node) ((node)->rb_color != RB_NONE) -#define rb_entry_crq(node) rb_entry((node), struct cfq_rq, rb_node) -#define rq_rb_key(rq) (rq)->sector - -static inline void cfq_del_crq_rb(struct cfq_queue *cfqq, struct cfq_rq *crq) -{ - if (ON_RB(&crq->rb_node)) { - cfqq->queued[rq_data_dir(crq->request)]--; - rb_erase(&crq->rb_node, &cfqq->sort_list); - crq->cfq_queue = NULL; - } -} - -static struct cfq_rq * -__cfq_add_crq_rb(struct cfq_queue *cfqq, struct cfq_rq *crq) -{ - struct rb_node **p = &cfqq->sort_list.rb_node; - struct rb_node *parent = NULL; - struct cfq_rq *__crq; - - while (*p) { - parent = *p; - __crq = rb_entry_crq(parent); - - if (crq->rb_key < __crq->rb_key) - p = &(*p)->rb_left; - else if (crq->rb_key > __crq->rb_key) - p = &(*p)->rb_right; - else - return __crq; - } - - rb_link_node(&crq->rb_node, parent, p); - return 0; -} - -static void -cfq_add_crq_rb(struct cfq_data *cfqd, struct cfq_queue *cfqq,struct cfq_rq *crq) -{ - struct request *rq = crq->request; - struct cfq_rq *__alias; - - crq->rb_key = rq_rb_key(rq); - cfqq->queued[rq_data_dir(rq)]++; -retry: - __alias = __cfq_add_crq_rb(cfqq, crq); - if (!__alias) { - rb_insert_color(&crq->rb_node, &cfqq->sort_list); - crq->cfq_queue = cfqq; - return; - } - - cfq_del_crq_rb(cfqq, __alias); - cfq_dispatch_sort(cfqd->dispatch, __alias); - goto retry; -} - -static struct request * -cfq_find_rq_rb(struct cfq_data *cfqd, sector_t sector) -{ - struct cfq_queue *cfqq = cfq_find_cfq_hash(cfqd, current->tgid); - struct rb_node *n; - - if (!cfqq) - goto out; - - n = cfqq->sort_list.rb_node; - while (n) { - struct cfq_rq *crq = rb_entry_crq(n); - - if (sector < crq->rb_key) - n = n->rb_left; - else if (sector > crq->rb_key) - n = n->rb_right; - else - return crq->request; - } - -out: - return NULL; -} - -static void cfq_remove_request(request_queue_t *q, struct request *rq) -{ - struct cfq_data *cfqd = q->elevator.elevator_data; - struct cfq_rq *crq = RQ_DATA(rq); - - if (crq) { - struct cfq_queue *cfqq = crq->cfq_queue; - - cfq_remove_merge_hints(q, crq); - list_del_init(&rq->queuelist); - - if (cfqq) { - cfq_del_crq_rb(cfqq, crq); - - if (RB_EMPTY(&cfqq->sort_list)) - 
cfq_put_queue(cfqd, cfqq); - } - } -} - -static int -cfq_merge(request_queue_t *q, struct request **req, struct bio *bio) -{ - struct cfq_data *cfqd = q->elevator.elevator_data; - struct request *__rq; - int ret; - - ret = elv_try_last_merge(q, bio); - if (ret != ELEVATOR_NO_MERGE) { - __rq = q->last_merge; - goto out_insert; - } - - __rq = cfq_find_rq_hash(cfqd, bio->bi_sector); - if (__rq) { - BUG_ON(__rq->sector + __rq->nr_sectors != bio->bi_sector); - - if (elv_rq_merge_ok(__rq, bio)) { - ret = ELEVATOR_BACK_MERGE; - goto out; - } - } - - __rq = cfq_find_rq_rb(cfqd, bio->bi_sector + bio_sectors(bio)); - if (__rq) { - if (elv_rq_merge_ok(__rq, bio)) { - ret = ELEVATOR_FRONT_MERGE; - goto out; - } - } - - return ELEVATOR_NO_MERGE; -out: - q->last_merge = __rq; -out_insert: - *req = __rq; - return ret; -} - -static void cfq_merged_request(request_queue_t *q, struct request *req) -{ - struct cfq_data *cfqd = q->elevator.elevator_data; - struct cfq_rq *crq = RQ_DATA(req); - - cfq_del_crq_hash(crq); - cfq_add_crq_hash(cfqd, crq); - - if (ON_RB(&crq->rb_node) && (rq_rb_key(req) != crq->rb_key)) { - struct cfq_queue *cfqq = crq->cfq_queue; - - cfq_del_crq_rb(cfqq, crq); - cfq_add_crq_rb(cfqd, cfqq, crq); - } - - q->last_merge = req; -} - -static void -cfq_merged_requests(request_queue_t *q, struct request *req, - struct request *next) -{ - cfq_merged_request(q, req); - cfq_remove_request(q, next); -} - -static void cfq_dispatch_sort(struct list_head *head, struct cfq_rq *crq) -{ - struct list_head *entry = head; - struct request *__rq; - - if (!list_empty(head)) { - __rq = list_entry_rq(head->next); - - if (crq->request->sector < __rq->sector) { - entry = head->prev; - goto link; - } - } - - while ((entry = entry->prev) != head) { - __rq = list_entry_rq(entry); - - if (crq->request->sector <= __rq->sector) - break; - } - -link: - list_add_tail(&crq->request->queuelist, entry); -} - -static inline void -__cfq_dispatch_requests(request_queue_t *q, struct cfq_data *cfqd, - struct cfq_queue *cfqq) -{ - struct cfq_rq *crq = rb_entry_crq(rb_first(&cfqq->sort_list)); - - cfq_del_crq_rb(cfqq, crq); - cfq_remove_merge_hints(q, crq); - cfq_dispatch_sort(cfqd->dispatch, crq); -} - -static int cfq_dispatch_requests(request_queue_t *q, struct cfq_data *cfqd) -{ - struct cfq_queue *cfqq; - struct list_head *entry, *tmp; - int ret, queued, good_queues; - - if (list_empty(&cfqd->rr_list)) - return 0; - - queued = ret = 0; -restart: - good_queues = 0; - list_for_each_safe(entry, tmp, &cfqd->rr_list) { - cfqq = list_entry_cfqq(cfqd->rr_list.next); - - BUG_ON(RB_EMPTY(&cfqq->sort_list)); - - __cfq_dispatch_requests(q, cfqd, cfqq); - - if (RB_EMPTY(&cfqq->sort_list)) - cfq_put_queue(cfqd, cfqq); - else - good_queues++; - - queued++; - ret = 1; - } - - if ((queued < cfq_quantum) && good_queues) - goto restart; - - return ret; -} - -static struct request *cfq_next_request(request_queue_t *q) -{ - struct cfq_data *cfqd = q->elevator.elevator_data; - struct request *rq; - - if (!list_empty(cfqd->dispatch)) { - struct cfq_rq *crq; -dispatch: - rq = list_entry_rq(cfqd->dispatch->next); - - crq = RQ_DATA(rq); - if (crq) - cfq_remove_merge_hints(q, crq); - - return rq; - } - - if (cfq_dispatch_requests(q, cfqd)) - goto dispatch; - - return NULL; -} - -static inline struct cfq_queue * -__cfq_find_cfq_hash(struct cfq_data *cfqd, int pid, const int hashval) -{ - struct list_head *hash_list = &cfqd->cfq_hash[hashval]; - struct list_head *entry; - - list_for_each(entry, hash_list) { - struct cfq_queue *__cfqq = 
list_entry_qhash(entry); - - if (__cfqq->pid == pid) - return __cfqq; - } - - return NULL; -} - -static struct cfq_queue *cfq_find_cfq_hash(struct cfq_data *cfqd, int pid) -{ - const int hashval = hash_long(current->tgid, CFQ_QHASH_SHIFT); - - return __cfq_find_cfq_hash(cfqd, pid, hashval); -} - -static void cfq_put_queue(struct cfq_data *cfqd, struct cfq_queue *cfqq) -{ - cfqd->busy_queues--; - list_del(&cfqq->cfq_list); - list_del(&cfqq->cfq_hash); - mempool_free(cfqq, cfq_mpool); -} - -static struct cfq_queue *cfq_get_queue(struct cfq_data *cfqd, int pid) -{ - const int hashval = hash_long(current->tgid, CFQ_QHASH_SHIFT); - struct cfq_queue *cfqq = __cfq_find_cfq_hash(cfqd, pid, hashval); - - if (!cfqq) { - cfqq = mempool_alloc(cfq_mpool, GFP_NOIO); - - INIT_LIST_HEAD(&cfqq->cfq_hash); - INIT_LIST_HEAD(&cfqq->cfq_list); - RB_CLEAR_ROOT(&cfqq->sort_list); - - cfqq->pid = pid; - cfqq->queued[0] = cfqq->queued[1] = 0; - list_add(&cfqq->cfq_hash, &cfqd->cfq_hash[hashval]); - } - - return cfqq; -} - -static void cfq_enqueue(struct cfq_data *cfqd, struct cfq_rq *crq) -{ - struct cfq_queue *cfqq; - - cfqq = cfq_get_queue(cfqd, current->tgid); - - cfq_add_crq_rb(cfqd, cfqq, crq); - - if (list_empty(&cfqq->cfq_list)) { - list_add(&cfqq->cfq_list, &cfqd->rr_list); - cfqd->busy_queues++; - } -} - -static void -cfq_insert_request(request_queue_t *q, struct request *rq, int where) -{ - struct cfq_data *cfqd = q->elevator.elevator_data; - struct cfq_rq *crq = RQ_DATA(rq); - - switch (where) { - case ELEVATOR_INSERT_BACK: - while (cfq_dispatch_requests(q, cfqd)) - ; - list_add_tail(&rq->queuelist, cfqd->dispatch); - break; - case ELEVATOR_INSERT_FRONT: - list_add(&rq->queuelist, cfqd->dispatch); - break; - case ELEVATOR_INSERT_SORT: - BUG_ON(!blk_fs_request(rq)); - cfq_enqueue(cfqd, crq); - break; - default: - printk("%s: bad insert point %d\n", __FUNCTION__,where); - return; - } - - if (rq_mergeable(rq)) { - cfq_add_crq_hash(cfqd, crq); - - if (!q->last_merge) - q->last_merge = rq; - } -} - -static int cfq_queue_empty(request_queue_t *q) -{ - struct cfq_data *cfqd = q->elevator.elevator_data; - - if (list_empty(cfqd->dispatch) && list_empty(&cfqd->rr_list)) - return 1; - - return 0; -} - -static struct request * -cfq_former_request(request_queue_t *q, struct request *rq) -{ - struct cfq_rq *crq = RQ_DATA(rq); - struct rb_node *rbprev = rb_prev(&crq->rb_node); - - if (rbprev) - return rb_entry_crq(rbprev)->request; - - return NULL; -} - -static struct request * -cfq_latter_request(request_queue_t *q, struct request *rq) -{ - struct cfq_rq *crq = RQ_DATA(rq); - struct rb_node *rbnext = rb_next(&crq->rb_node); - - if (rbnext) - return rb_entry_crq(rbnext)->request; - - return NULL; -} - -static int cfq_may_queue(request_queue_t *q, int rw) -{ - struct cfq_data *cfqd = q->elevator.elevator_data; - struct cfq_queue *cfqq; - int ret = 1; - - if (!cfqd->busy_queues) - goto out; - - cfqq = cfq_find_cfq_hash(cfqd, current->tgid); - if (cfqq) { - int limit = (q->nr_requests - cfq_queued) / cfqd->busy_queues; - - if (limit < 3) - limit = 3; - else if (limit > cfqd->max_queued) - limit = cfqd->max_queued; - - if (cfqq->queued[rw] > limit) - ret = 0; - } -out: - return ret; -} - -static void cfq_put_request(request_queue_t *q, struct request *rq) -{ - struct cfq_data *cfqd = q->elevator.elevator_data; - struct cfq_rq *crq = RQ_DATA(rq); - - if (crq) { - BUG_ON(q->last_merge == rq); - BUG_ON(ON_MHASH(crq)); - - mempool_free(crq, cfqd->crq_pool); - rq->elevator_private = NULL; - } -} - -static int 
cfq_set_request(request_queue_t *q, struct request *rq, int gfp_mask) -{ - struct cfq_data *cfqd = q->elevator.elevator_data; - struct cfq_rq *crq = mempool_alloc(cfqd->crq_pool, gfp_mask); - - if (crq) { - RB_CLEAR(&crq->rb_node); - crq->request = rq; - crq->cfq_queue = NULL; - INIT_LIST_HEAD(&crq->hash); - rq->elevator_private = crq; - return 0; - } - - return 1; -} - -static void cfq_exit(request_queue_t *q, elevator_t *e) -{ - struct cfq_data *cfqd = e->elevator_data; - - e->elevator_data = NULL; - mempool_destroy(cfqd->crq_pool); - kfree(cfqd->crq_hash); - kfree(cfqd->cfq_hash); - kfree(cfqd); -} - -static int cfq_init(request_queue_t *q, elevator_t *e) -{ - struct cfq_data *cfqd; - int i; - - cfqd = kmalloc(sizeof(*cfqd), GFP_KERNEL); - if (!cfqd) - return -ENOMEM; - - memset(cfqd, 0, sizeof(*cfqd)); - INIT_LIST_HEAD(&cfqd->rr_list); - - cfqd->crq_hash = kmalloc(sizeof(struct list_head) * CFQ_MHASH_ENTRIES, GFP_KERNEL); - if (!cfqd->crq_hash) - goto out_crqhash; - - cfqd->cfq_hash = kmalloc(sizeof(struct list_head) * CFQ_QHASH_ENTRIES, GFP_KERNEL); - if (!cfqd->cfq_hash) - goto out_cfqhash; - - cfqd->crq_pool = mempool_create(BLKDEV_MIN_RQ, mempool_alloc_slab, mempool_free_slab, crq_pool); - if (!cfqd->crq_pool) - goto out_crqpool; - - for (i = 0; i < CFQ_MHASH_ENTRIES; i++) - INIT_LIST_HEAD(&cfqd->crq_hash[i]); - for (i = 0; i < CFQ_QHASH_ENTRIES; i++) - INIT_LIST_HEAD(&cfqd->cfq_hash[i]); - - cfqd->dispatch = &q->queue_head; - e->elevator_data = cfqd; - - /* - * just set it to some high value, we want anyone to be able to queue - * some requests. fairness is handled differently - */ - cfqd->max_queued = q->nr_requests; - q->nr_requests = 8192; - - return 0; -out_crqpool: - kfree(cfqd->cfq_hash); -out_cfqhash: - kfree(cfqd->crq_hash); -out_crqhash: - kfree(cfqd); - return -ENOMEM; -} - -static int __init cfq_slab_setup(void) -{ - crq_pool = kmem_cache_create("crq_pool", sizeof(struct cfq_rq), 0, 0, - NULL, NULL); - - if (!crq_pool) - panic("cfq_iosched: can't init crq pool\n"); - - cfq_pool = kmem_cache_create("cfq_pool", sizeof(struct cfq_queue), 0, 0, - NULL, NULL); - - if (!cfq_pool) - panic("cfq_iosched: can't init cfq pool\n"); - - cfq_mpool = mempool_create(64, mempool_alloc_slab, mempool_free_slab, cfq_pool); - - if (!cfq_mpool) - panic("cfq_iosched: can't init cfq mpool\n"); - - return 0; -} - -subsys_initcall(cfq_slab_setup); - -elevator_t iosched_cfq = { - .elevator_name = "cfq", - .elevator_merge_fn = cfq_merge, - .elevator_merged_fn = cfq_merged_request, - .elevator_merge_req_fn = cfq_merged_requests, - .elevator_next_req_fn = cfq_next_request, - .elevator_add_req_fn = cfq_insert_request, - .elevator_remove_req_fn = cfq_remove_request, - .elevator_queue_empty_fn = cfq_queue_empty, - .elevator_former_req_fn = cfq_former_request, - .elevator_latter_req_fn = cfq_latter_request, - .elevator_set_req_fn = cfq_set_request, - .elevator_put_req_fn = cfq_put_request, - .elevator_may_queue_fn = cfq_may_queue, - .elevator_init_fn = cfq_init, - .elevator_exit_fn = cfq_exit, -}; - -EXPORT_SYMBOL(iosched_cfq); diff --git a/drivers/block/cfq-iosched.c b/drivers/block/cfq-iosched.c index 7b45a805d..70d66c5c9 100644 --- a/drivers/block/cfq-iosched.c +++ b/drivers/block/cfq-iosched.c @@ -39,8 +39,6 @@ #error Cannot support this many io priority levels #endif -#define LIMIT_DEBUG 1 - /* * tunables */ @@ -52,6 +50,10 @@ static int cfq_queued = 4; static int cfq_grace_rt = HZ / 100 ?: 1; static int cfq_grace_idle = HZ / 10; +#define CFQ_EPOCH 1000000000 +#define CFQ_SECTORATE 1000 +#define 
CFQ_HMAX_PCT 80 + #define CFQ_QHASH_SHIFT 6 #define CFQ_QHASH_ENTRIES (1 << CFQ_QHASH_SHIFT) #define list_entry_qhash(entry) hlist_entry((entry), struct cfq_queue, cfq_hash) @@ -69,13 +71,6 @@ static int cfq_grace_idle = HZ / 10; #define cfq_account_io(crq) \ ((crq)->ioprio != IOPRIO_IDLE && (crq)->ioprio != IOPRIO_RT) -/* define to be 50 ms for now; make tunable later */ -#define CFQ_EPOCH 50000 -/* Needs to be made tunable right away, in MiB/s */ -#define CFQ_DISKBW 10 -/* Temporary global limit, as percent of available b/w, for each "class" */ -#define CFQ_TEMPLIM 10 - /* * defines how we distribute bandwidth (can be tgid, uid, etc) */ @@ -87,18 +82,22 @@ static int cfq_grace_idle = HZ / 10; */ #if defined(CONFIG_CKRM_RES_BLKIO) || defined(CONFIG_CKRM_RES_BLKIO_MODULE) -extern inline void *cki_hash_key(struct task_struct *tsk); -extern inline int cki_ioprio(struct task_struct *tsk); -#define cfq_hash_key(current) ((int)cki_hash_key((current))) -#define cfq_ioprio(current) (cki_ioprio((current))) +extern void *cki_hash_key(struct task_struct *tsk); +extern int cki_ioprio(struct task_struct *tsk); +extern void *cki_cfqpriv(struct task_struct *tsk); + +#define cfq_hash_key(tsk) ((int)cki_hash_key((tsk))) +#define cfq_ioprio(tsk) (cki_ioprio((tsk))) +#define cfq_cfqpriv(cfqd,tsk) (cki_cfqpriv((tsk))) #else -#define cfq_hash_key(current) ((current)->tgid) +#define cfq_hash_key(tsk) ((tsk)->tgid) +#define cfq_cfqpriv(cfqd,tsk) (&(((cfqd)->cid[(tsk)->ioprio]).cfqpriv)) /* * move to io_context */ -#define cfq_ioprio(current) ((current)->ioprio) +#define cfq_ioprio(tsk) ((tsk)->ioprio) #endif #define CFQ_WAIT_RT 0 @@ -125,16 +124,12 @@ struct io_prio_data { atomic_t cum_sectors_in,cum_sectors_out; atomic_t cum_queues_in,cum_queues_out; -#ifdef LIMIT_DEBUG - int nskip; - unsigned long navsec; - unsigned long csectorate; - unsigned long lsectorate; -#endif + cfqlim_t cfqpriv; /* data for enforcing limits */ struct list_head prio_list; int last_rq; int last_sectors; + }; /* @@ -179,8 +174,9 @@ struct cfq_data { unsigned int cfq_grace_rt; unsigned int cfq_grace_idle; - unsigned long cfq_epoch; /* duration for limit enforcement */ - unsigned long cfq_epochsectors; /* max sectors dispatchable/epoch */ + unsigned int cfq_epoch; + unsigned int cfq_hmax_pct; + unsigned int cfq_qsectorate; }; /* @@ -194,14 +190,34 @@ struct cfq_queue { int queued[2]; int ioprio; + /* limit related settings/stats obtained + either from io_prio_data or ckrm I/O class + */ + struct cfqlim *cfqpriv; + + u64 epstart; /* current epoch's starting timestamp (ns) */ + u64 epsector[2]; /* Total sectors dispatched in [0] previous + * and [1] current epoch + */ + unsigned long avsec; /* avg sectors dispatched/epoch */ - unsigned long long lastime; /* timestamp of last request served */ - unsigned long sectorate; /* limit for sectors served/epoch */ +// unsigned long long lastime; /* timestamp of last request served */ +// unsigned long sectorate; /* limit for sectors served/epoch */ int skipped; /* queue skipped at last dispatch ? 
*/ + + /* Per queue timer to suspend/resume queue from processing */ + struct timer_list timer; + unsigned long wait_end; + unsigned long flags; + struct work_struct work; + + struct cfq_data *cfqd; }; + + /* - * per-request structure + * Per-request structure */ struct cfq_rq { struct cfq_queue *cfq_queue; @@ -516,69 +532,101 @@ link: list_add_tail(&crq->request->queuelist, entry); } -/* - * remove from io scheduler core and put on dispatch list for service - */ +struct cfq_queue *dcfqq; +u64 dtmp; + + + +/* Over how many ns is sectorate defined */ +#define NS4SCALE (100000000) + static inline int -__cfq_dispatch_requests(request_queue_t *q, struct cfq_data *cfqd, - struct cfq_queue *cfqq) +__cfq_check_limit(struct cfq_data *cfqd,struct cfq_queue *cfqq, int dontskip) { struct cfq_rq *crq; - unsigned long long ts, gap; - unsigned long newavsec; + unsigned long long ts, gap, epoch, tmp; + unsigned long newavsec, sectorate; crq = rb_entry_crq(rb_first(&cfqq->sort_list)); -#if 1 - /* Determine if queue should be skipped for being overshare */ ts = sched_clock(); - gap = ts - cfqq->lastime; -#ifdef LIMIT_DEBUG - cfqq->sectorate = (cfqd->cfq_epochsectors - * CFQ_TEMPLIM)/100; - -#endif - if ((gap >= cfqd->cfq_epoch) || (gap < 0)) { - cfqq->avsec = crq->nr_sectors ; - cfqq->lastime = ts; + gap = ts - cfqq->epstart; + epoch = cfqd->cfq_epoch; + + sectorate = atomic_read(&cfqq->cfqpriv->sectorate); +// sectorate = atomic_read(&(cfqd->cid[crq->ioprio].sectorate)); + + dcfqq = cfqq; + + if ((gap >= epoch) || (gap < 0)) { + + if (gap >= (epoch << 1)) { + cfqq->epsector[0] = 0; + cfqq->epstart = ts ; + } else { + cfqq->epsector[0] = cfqq->epsector[1]; + cfqq->epstart += epoch; + } + cfqq->epsector[1] = 0; + gap = ts - cfqq->epstart; + + tmp = (cfqq->epsector[0] + crq->nr_sectors) * NS4SCALE; + do_div(tmp,epoch+gap); + + cfqq->avsec = (unsigned long)tmp; + cfqq->skipped = 0; + cfqq->epsector[1] += crq->nr_sectors; + + cfqq->cfqpriv->navsec = cfqq->avsec; + cfqq->cfqpriv->sec[0] = cfqq->epsector[0]; + cfqq->cfqpriv->sec[1] = cfqq->epsector[1]; + cfqq->cfqpriv->timedout++; + /* + cfqd->cid[crq->ioprio].navsec = cfqq->avsec; + cfqd->cid[crq->ioprio].sec[0] = cfqq->epsector[0]; + cfqd->cid[crq->ioprio].sec[1] = cfqq->epsector[1]; + cfqd->cid[crq->ioprio].timedout++; + */ + return 0; } else { - u64 tmp; - /* Age old average and accumalate request to be served */ - -// tmp = (u64) (cfqq->avsec * gap) ; -// do_div(tmp, cfqd->cfq_epoch); - newavsec = (unsigned long)(cfqq->avsec >> 1) + crq->nr_sectors; -// if (crq->ioprio >= 0 && crq->ioprio <= 20) -// cfqd->cid[crq->ioprio].lsectorate = newavsec; -// atomic_set(&(cfqd->cid[crq->ioprio].lsectorate), -// newavsec); - - if ((newavsec < cfqq->sectorate) || cfqq->skipped) { + + tmp = (cfqq->epsector[0] + cfqq->epsector[1] + crq->nr_sectors) + * NS4SCALE; + do_div(tmp,epoch+gap); + + newavsec = (unsigned long)tmp; + if ((newavsec < sectorate) || dontskip) { cfqq->avsec = newavsec ; - cfqq->lastime = ts; cfqq->skipped = 0; + cfqq->epsector[1] += crq->nr_sectors; + cfqq->cfqpriv->navsec = cfqq->avsec; + cfqq->cfqpriv->sec[1] = cfqq->epsector[1]; + /* + cfqd->cid[crq->ioprio].navsec = cfqq->avsec; + cfqd->cid[crq->ioprio].sec[1] = cfqq->epsector[1]; + */ } else { - /* queue over share ; skip once */ cfqq->skipped = 1; -#ifdef LIMIT_DEBUG -// atomic_inc(&(cfqd->cid[crq->ioprio].nskip)); -// if (crq->ioprio >= 0 && crq->ioprio <= 20) -// cfqd->cid[crq->ioprio].nskip++; -#endif - return 0; + /* pause q's processing till avsec drops to + cfq_hmax_pct % of its value */ + 
tmp = (epoch+gap) * (100-cfqd->cfq_hmax_pct); + do_div(tmp,1000000*cfqd->cfq_hmax_pct); + cfqq->wait_end = jiffies+msecs_to_jiffies(tmp); } - } -#endif + } +} -#ifdef LIMIT_DEBUG -// if (crq->ioprio >= 0 && crq->ioprio <= 20) { -// cfqd->cid[crq->ioprio].navsec = cfqq->avsec; -// cfqd->cid[crq->ioprio].csectorate = cfqq->sectorate; -// } +/* + * remove from io scheduler core and put on dispatch list for service + */ +static inline int +__cfq_dispatch_requests(request_queue_t *q, struct cfq_data *cfqd, + struct cfq_queue *cfqq) +{ + struct cfq_rq *crq; + + crq = rb_entry_crq(rb_first(&cfqq->sort_list)); -// atomic_set(&(cfqd->cid[crq->ioprio].navsec),cfqq->avsec); -// atomic_set(&(cfqd->cid[crq->ioprio].csectorate),cfqq->sectorate); -#endif cfq_dispatch_sort(cfqd, cfqq, crq); /* @@ -593,44 +641,83 @@ cfq_dispatch_requests(request_queue_t *q, int prio, int max_rq, int max_sectors) { struct cfq_data *cfqd = q->elevator.elevator_data; struct list_head *plist = &cfqd->cid[prio].rr_list; + struct cfq_queue *cfqq; struct list_head *entry, *nxt; int q_rq, q_io; - int ret ; + int first_round,busy_queues,busy_unlimited; + /* * for each queue at this prio level, dispatch a request */ q_rq = q_io = 0; + first_round=1; + restart: + busy_unlimited = 0; + busy_queues = 0; list_for_each_safe(entry, nxt, plist) { - struct cfq_queue *cfqq = list_entry_cfqq(entry); + cfqq = list_entry_cfqq(entry); BUG_ON(RB_EMPTY(&cfqq->sort_list)); + busy_queues++; - ret = __cfq_dispatch_requests(q, cfqd, cfqq); - if (ret <= 0) { - continue; /* skip queue */ - /* can optimize more by moving q to end of plist ? */ + + if (first_round || busy_unlimited) + __cfq_check_limit(cfqd,cfqq,0); + else + __cfq_check_limit(cfqd,cfqq,1); + + if (cfqq->skipped) { + cfqq->cfqpriv->nskip++; + /* cfqd->cid[prio].nskip++; */ + busy_queues--; + if (time_before(jiffies, cfqq->wait_end)) { + list_del(&cfqq->cfq_list); + mod_timer(&cfqq->timer,cfqq->wait_end); + } + continue; } - q_io += ret ; - q_rq++ ; + busy_unlimited++; + + q_io += __cfq_dispatch_requests(q, cfqd, cfqq); + q_rq++; - if (RB_EMPTY(&cfqq->sort_list)) + if (RB_EMPTY(&cfqq->sort_list)) { + busy_unlimited--; + busy_queues--; cfq_put_queue(cfqd, cfqq); - /* - * if we hit the queue limit, put the string of serviced - * queues at the back of the pending list - */ + } + if (q_io >= max_sectors || q_rq >= max_rq) { +#if 0 struct list_head *prv = nxt->prev; if (prv != plist) { list_del(plist); list_add(plist, prv); } +#endif break; } } + if ((q_io < max_sectors) && (q_rq < max_rq) && + (busy_queues || first_round)) + { + first_round = 0; + goto restart; + } else { + /* + * if we hit the queue limit, put the string of serviced + * queues at the back of the pending list + */ + struct list_head *prv = nxt->prev; + if (prv != plist) { + list_del(plist); + list_add(plist, prv); + } + } + cfqd->cid[prio].last_rq = q_rq; cfqd->cid[prio].last_sectors = q_io; return q_rq; @@ -806,6 +893,29 @@ static void cfq_put_queue(struct cfq_data *cfqd, struct cfq_queue *cfqq) mempool_free(cfqq, cfq_mpool); } +static void cfq_pauseq_timer(unsigned long data) +{ + struct cfq_queue *cfqq = (struct cfq_queue *) data; + kblockd_schedule_work(&cfqq->work); +} + +static void cfq_pauseq_work(void *data) +{ + struct cfq_queue *cfqq = (struct cfq_queue *) data; + struct cfq_data *cfqd = cfqq->cfqd; + request_queue_t *q = cfqd->queue; + unsigned long flags; + + spin_lock_irqsave(q->queue_lock, flags); + list_add_tail(&cfqq->cfq_list,&cfqd->cid[cfqq->ioprio].rr_list); + cfqq->skipped = 0; + if (cfq_next_request(q)) + 
q->request_fn(q); + spin_unlock_irqrestore(q->queue_lock, flags); + + //del_timer(&cfqq->timer); +} + static struct cfq_queue *__cfq_get_queue(struct cfq_data *cfqd, int hashkey, int gfp_mask) { @@ -833,9 +943,22 @@ retry: INIT_LIST_HEAD(&cfqq->cfq_list); cfqq->hash_key = cfq_hash_key(current); cfqq->ioprio = cfq_ioprio(current); - cfqq->avsec = 0 ; - cfqq->lastime = sched_clock(); - cfqq->sectorate = (cfqd->cfq_epochsectors * CFQ_TEMPLIM)/100; + + cfqq->cfqpriv = cfq_cfqpriv(cfqd,current); + if (!cfqq->cfqpriv) + cfqq->cfqpriv = &((cfqd->cid[cfqq->ioprio]).cfqpriv); + + cfqq->epstart = sched_clock(); + /* epsector, avsec, skipped initialized to zero by memset */ + + init_timer(&cfqq->timer); + cfqq->timer.function = cfq_pauseq_timer; + cfqq->timer.data = (unsigned long) cfqq; + + INIT_WORK(&cfqq->work, cfq_pauseq_work, cfqq); + + cfqq->cfqd = cfqd ; + hlist_add_head(&cfqq->cfq_hash, &cfqd->cfq_hash[hashval]); } @@ -1132,6 +1255,8 @@ static void cfq_exit(request_queue_t *q, elevator_t *e) kfree(cfqd); } + + static void cfq_timer(unsigned long data) { struct cfq_data *cfqd = (struct cfq_data *) data; @@ -1182,12 +1307,12 @@ static int cfq_init(request_queue_t *q, elevator_t *e) atomic_set(&cid->cum_sectors_out,0); atomic_set(&cid->cum_queues_in,0); atomic_set(&cid->cum_queues_out,0); -#if 0 - atomic_set(&cid->nskip,0); - atomic_set(&cid->navsec,0); - atomic_set(&cid->csectorate,0); - atomic_set(&cid->lsectorate,0); -#endif + + + atomic_set(&((cid->cfqpriv).sectorate),CFQ_SECTORATE); + (cid->cfqpriv).nskip = 0; + (cid->cfqpriv).navsec = 0; + (cid->cfqpriv).timedout = 0; } cfqd->crq_hash = kmalloc(sizeof(struct hlist_head) * CFQ_MHASH_ENTRIES, @@ -1217,6 +1342,9 @@ static int cfq_init(request_queue_t *q, elevator_t *e) cfqd->cfq_idle_quantum_io = cfq_idle_quantum_io; cfqd->cfq_grace_rt = cfq_grace_rt; cfqd->cfq_grace_idle = cfq_grace_idle; + + cfqd->cfq_epoch = CFQ_EPOCH; + cfqd->cfq_hmax_pct = CFQ_HMAX_PCT; q->nr_requests <<= 2; @@ -1224,14 +1352,6 @@ static int cfq_init(request_queue_t *q, elevator_t *e) e->elevator_data = cfqd; cfqd->queue = q; - cfqd->cfq_epoch = CFQ_EPOCH; - if (q->hardsect_size) - cfqd->cfq_epochsectors = ((CFQ_DISKBW * 1000000)/ - q->hardsect_size)* (1000000 / CFQ_EPOCH); - else - cfqd->cfq_epochsectors = ((CFQ_DISKBW * 1000000)/512) - * (1000000 / CFQ_EPOCH) ; - return 0; out_crqpool: kfree(cfqd->cfq_hash); @@ -1302,6 +1422,8 @@ SHOW_FUNCTION(cfq_idle_quantum_io_show, cfqd->cfq_idle_quantum_io); SHOW_FUNCTION(cfq_queued_show, cfqd->cfq_queued); SHOW_FUNCTION(cfq_grace_rt_show, cfqd->cfq_grace_rt); SHOW_FUNCTION(cfq_grace_idle_show, cfqd->cfq_grace_idle); +SHOW_FUNCTION(cfq_epoch_show, cfqd->cfq_epoch); +SHOW_FUNCTION(cfq_hmax_pct_show, cfqd->cfq_hmax_pct); #undef SHOW_FUNCTION #define STORE_FUNCTION(__FUNC, __PTR, MIN, MAX) \ @@ -1321,63 +1443,38 @@ STORE_FUNCTION(cfq_idle_quantum_io_store, &cfqd->cfq_idle_quantum_io, 4, INT_MAX STORE_FUNCTION(cfq_queued_store, &cfqd->cfq_queued, 1, INT_MAX); STORE_FUNCTION(cfq_grace_rt_store, &cfqd->cfq_grace_rt, 0, INT_MAX); STORE_FUNCTION(cfq_grace_idle_store, &cfqd->cfq_grace_idle, 0, INT_MAX); +STORE_FUNCTION(cfq_epoch_store, &cfqd->cfq_epoch, 0, INT_MAX); +STORE_FUNCTION(cfq_hmax_pct_store, &cfqd->cfq_hmax_pct, 1, 100); #undef STORE_FUNCTION -static ssize_t cfq_epoch_show(struct cfq_data *cfqd, char *page) -{ - return sprintf(page, "%lu\n", cfqd->cfq_epoch); -} - -static ssize_t cfq_epoch_store(struct cfq_data *cfqd, const char *page, size_t count) -{ - char *p = (char *) page; - cfqd->cfq_epoch = simple_strtoul(p, &p, 10); - 
return count; -} - -static ssize_t cfq_epochsectors_show(struct cfq_data *cfqd, char *page) -{ - return sprintf(page, "%lu\n", cfqd->cfq_epochsectors); -} - -static ssize_t -cfq_epochsectors_store(struct cfq_data *cfqd, const char *page, size_t count) -{ - char *p = (char *) page; - cfqd->cfq_epochsectors = simple_strtoul(p, &p, 10); - return count; -} - /* Additional entries to get priority level data */ static ssize_t cfq_prio_show(struct cfq_data *cfqd, char *page, unsigned int priolvl) { - int r1,r2,s1,s2,q1,q2; + //int r1,r2,s1,s2,q1,q2; if (!(priolvl >= IOPRIO_IDLE && priolvl <= IOPRIO_RT)) return 0; + /* r1 = (int)atomic_read(&(cfqd->cid[priolvl].cum_rq_in)); r2 = (int)atomic_read(&(cfqd->cid[priolvl].cum_rq_out)); s1 = (int)atomic_read(&(cfqd->cid[priolvl].cum_sectors_in)); s2 = (int)atomic_read(&(cfqd->cid[priolvl].cum_sectors_out)); q1 = (int)atomic_read(&(cfqd->cid[priolvl].cum_queues_in)); q2 = (int)atomic_read(&(cfqd->cid[priolvl].cum_queues_out)); - - return sprintf(page,"skip %d avsec %lu rate %lu new %lu" - "rq (%d,%d) sec (%d,%d) q (%d,%d)\n", - cfqd->cid[priolvl].nskip, - cfqd->cid[priolvl].navsec, - cfqd->cid[priolvl].csectorate, - cfqd->cid[priolvl].lsectorate, -// atomic_read(&cfqd->cid[priolvl].nskip), -// atomic_read(&cfqd->cid[priolvl].navsec), -// atomic_read(&cfqd->cid[priolvl].csectorate), -// atomic_read(&cfqd->cid[priolvl].lsectorate), - r1,r2, - s1,s2, - q1,q2); + */ + + return sprintf(page,"skip %d timdout %d avsec %lu rate %ld " + " sec0 %lu sec1 %lu\n", + cfqd->cid[priolvl].cfqpriv.nskip, + cfqd->cid[priolvl].cfqpriv.timedout, + cfqd->cid[priolvl].cfqpriv.navsec, + atomic_read(&(cfqd->cid[priolvl].cfqpriv.sectorate)), + (unsigned long)cfqd->cid[priolvl].cfqpriv.sec[0], + (unsigned long)cfqd->cid[priolvl].cfqpriv.sec[1]); + } #define SHOW_PRIO_DATA(__PRIOLVL) \ @@ -1411,12 +1508,25 @@ SHOW_PRIO_DATA(20); static ssize_t cfq_prio_store(struct cfq_data *cfqd, const char *page, size_t count, int priolvl) { + + char *p = (char *) page; + int val; + + val = (int) simple_strtoul(p, &p, 10); + + atomic_set(&(cfqd->cid[priolvl].cfqpriv.sectorate),val); + cfqd->cid[priolvl].cfqpriv.nskip = 0; + cfqd->cid[priolvl].cfqpriv.navsec = 0; + cfqd->cid[priolvl].cfqpriv.timedout = 0; + +#if 0 atomic_set(&(cfqd->cid[priolvl].cum_rq_in),0); atomic_set(&(cfqd->cid[priolvl].cum_rq_out),0); atomic_set(&(cfqd->cid[priolvl].cum_sectors_in),0); atomic_set(&(cfqd->cid[priolvl].cum_sectors_out),0); atomic_set(&(cfqd->cid[priolvl].cum_queues_in),0); atomic_set(&(cfqd->cid[priolvl].cum_queues_out),0); +#endif return count; } @@ -1491,10 +1601,10 @@ static struct cfq_fs_entry cfq_epoch_entry = { .show = cfq_epoch_show, .store = cfq_epoch_store, }; -static struct cfq_fs_entry cfq_epochsectors_entry = { - .attr = {.name = "epochsectors", .mode = S_IRUGO | S_IWUSR }, - .show = cfq_epochsectors_show, - .store = cfq_epochsectors_store, +static struct cfq_fs_entry cfq_hmax_pct_entry = { + .attr = {.name = "hmaxpct", .mode = S_IRUGO | S_IWUSR }, + .show = cfq_hmax_pct_show, + .store = cfq_hmax_pct_store, }; #define P_0_STR "p0" @@ -1558,7 +1668,7 @@ static struct attribute *default_attrs[] = { &cfq_grace_rt_entry.attr, &cfq_grace_idle_entry.attr, &cfq_epoch_entry.attr, - &cfq_epochsectors_entry.attr, + &cfq_hmax_pct_entry.attr, &cfq_prio_0_entry.attr, &cfq_prio_1_entry.attr, &cfq_prio_2_entry.attr, diff --git a/drivers/block/ckrm-io.c b/drivers/block/ckrm-io.c index 7edfce727..89910268f 100644 --- a/drivers/block/ckrm-io.c +++ b/drivers/block/ckrm-io.c @@ -35,14 +35,11 @@ #include #include -/* 
Tie to cfq priorities */ -#define CKI_IOPRIO_NORM IOPRIO_NORM +/* sectorate == 512 byte sectors served in CFQ_EPOCH ns*/ -/* Divisor to get fraction of bandwidth represented by an IOPRIO value */ -/* FIXME: Will not work if IOPRIO_NR > 100 */ -#define CKI_IOPRIO_DIV (IOPRIO_NR-1) -/* Minimum ioprio value to be assigned to a class */ -#define CKI_IOPRIO_MIN 1 +/* CKI_ROOTSECTORATE needs to be made configurable from outside */ +#define CKI_ROOTSECTORATE 100000 +#define CKI_MINSECTORATE 100 #define CKI_IOUSAGE_UNIT 512 @@ -52,7 +49,12 @@ typedef struct ckrm_io_stats{ unsigned long blksz; /* size of bandwidth unit */ atomic_t blkrd; /* read units submitted to DD */ atomic_t blkwr; /* write units submitted to DD */ - + + int nskip; /* # times q skipped */ + unsigned long navsec; /* avg sectors serviced */ + int timedout; /* # times gap > epoch */ + u64 sec[2]; /* sectors serviced in + prev & curr epochs */ } cki_stats_t; /* per class I/O statistics */ /* Note @@ -75,8 +77,12 @@ typedef struct ckrm_io_class { * in local units. */ + cfqlim_t cfqpriv; /* Data common with cfq priolvl's */ + + int cnt_guarantee; /* Allocation as parent */ int cnt_unused; /* Allocation to default subclass */ + int cnt_limit; /* Statistics, for class and default subclass */ cki_stats_t stats; @@ -85,19 +91,16 @@ typedef struct ckrm_io_class { } cki_icls_t; - /* Internal functions */ static inline void cki_reset_stats(cki_stats_t *usg); static inline void init_icls_one(cki_icls_t *icls); -static inline int cki_div(int *a, int b, int c); -//static inline int cki_recalc(cki_icls_t *icls, int rel2abs); static void cki_recalc_propagate(cki_icls_t *res, cki_icls_t *parres); /* External functions e.g. interface to ioscheduler */ void *cki_tsk_icls (struct task_struct *tsk); int cki_tsk_ioprio (struct task_struct *tsk); -extern void cki_cfq_set(icls_tsk_t tskicls, icls_ioprio_t tskioprio); +extern void cki_cfq_set(icls_tsk_t tskicls, icls_ioprio_t tskioprio, icls_tsk_t tskcfqpriv); /* CKRM Resource Controller API functions */ static void * cki_alloc(struct ckrm_core_class *this, @@ -139,45 +142,27 @@ static inline void init_icls_stats(cki_icls_t *icls) static inline void init_icls_one(cki_icls_t *icls) { - // Assign zero as initial guarantee otherwise creations - // could fail due to inadequate share - - //icls->shares.my_guarantee = - // (CKI_IOPRIO_MIN * CKRM_SHARE_DFLT_TOTAL_GUARANTEE) / - // CKI_IOPRIO_DIV ; - icls->shares.my_guarantee = 0; - icls->shares.my_limit = CKRM_SHARE_DFLT_TOTAL_GUARANTEE; - icls->shares.total_guarantee = CKRM_SHARE_DFLT_TOTAL_GUARANTEE; - icls->shares.max_limit = CKRM_SHARE_DFLT_TOTAL_GUARANTEE; + /* Zero initial guarantee for scalable creation of + multiple classes */ - icls->shares.unused_guarantee = icls->shares.total_guarantee - - icls->shares.my_guarantee; - icls->shares.cur_max_limit = CKRM_SHARE_DFLT_TOTAL_GUARANTEE; - - - icls->cnt_guarantee = icls->cnt_unused = IOPRIO_IDLE; + /* Try out a new set */ + + icls->shares.my_guarantee = CKRM_SHARE_DONTCARE; + icls->shares.my_limit = CKRM_SHARE_DONTCARE; + icls->shares.total_guarantee = CKRM_SHARE_DFLT_TOTAL_GUARANTEE; + icls->shares.max_limit = CKRM_SHARE_DFLT_MAX_LIMIT; + icls->shares.unused_guarantee = icls->shares.total_guarantee; + icls->shares.cur_max_limit = 0; - //Same rationale icls->ioprio = CKI_IOPRIO_MIN; - //IOPRIO_IDLE equivalence to zero my_guarantee (set above) relies - //on former being zero. 
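+	/* Absolute values (and hence the class' sectorate) are not known
+	 * until shares are set and cki_recalc_propagate() runs, so leave
+	 * them as DONTCARE here. */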
+ icls->cnt_guarantee = CKRM_SHARE_DONTCARE; + icls->cnt_unused = CKRM_SHARE_DONTCARE; + icls->cnt_limit = CKRM_SHARE_DONTCARE; init_icls_stats(icls); } - -static inline int cki_div(int *a, int b, int c) -{ - u64 temp = (u64) b * c ; - do_div(temp,CKI_IOPRIO_DIV); - *a = (int) temp; - - return 0; -} - - -/* Recalculate absolute shares from relative (rel2abs=1) - * or vice versa (rel2abs=0) - * Caller should have a lock on icls +/* Recalculate absolute shares from relative + * Caller should hold a lock on icls */ static void cki_recalc_propagate(cki_icls_t *res, cki_icls_t *parres) @@ -186,17 +171,17 @@ static void cki_recalc_propagate(cki_icls_t *res, cki_icls_t *parres) ckrm_core_class_t *child = NULL; cki_icls_t *childres; int resid = cki_rcbs.resid; + u64 temp; if (parres) { struct ckrm_shares *par = &parres->shares; struct ckrm_shares *self = &res->shares; - if (parres->cnt_guarantee == CKRM_SHARE_DONTCARE) { res->cnt_guarantee = CKRM_SHARE_DONTCARE; } else if (par->total_guarantee) { - u64 temp = (u64) self->my_guarantee * + temp = (u64) self->my_guarantee * parres->cnt_guarantee; do_div(temp, par->total_guarantee); res->cnt_guarantee = (int) temp; @@ -204,16 +189,36 @@ static void cki_recalc_propagate(cki_icls_t *res, cki_icls_t *parres) res->cnt_guarantee = 0; } + + if (parres->cnt_limit == CKRM_SHARE_DONTCARE) { + res->cnt_limit = CKRM_SHARE_DONTCARE; + atomic_set(&res->cfqpriv.sectorate,CKI_MINSECTORATE); + } else { + if (par->max_limit) { + temp = (u64) self->my_limit * + parres->cnt_limit; + do_div(temp, par->max_limit); + res->cnt_limit = (int) temp; + } else { + res->cnt_limit = 0; + } + atomic_set(&res->cfqpriv.sectorate,res->cnt_limit); + } + if (res->cnt_guarantee == CKRM_SHARE_DONTCARE) { res->cnt_unused = CKRM_SHARE_DONTCARE; - } else if (self->total_guarantee) { - u64 temp = (u64) self->unused_guarantee * - res->cnt_guarantee; - do_div(temp, self->total_guarantee); - res->cnt_unused = (int) temp; } else { - res->cnt_unused = 0; + if (self->total_guarantee) { + temp = (u64) self->unused_guarantee * + res->cnt_guarantee; + do_div(temp, self->total_guarantee); + res->cnt_unused = (int) temp; + } else { + res->cnt_unused = 0; + } + } + } // propagate to children ckrm_lock_hier(res->core); @@ -228,50 +233,6 @@ static void cki_recalc_propagate(cki_icls_t *res, cki_icls_t *parres) ckrm_unlock_hier(res->core); } -#if 0 -static inline int cki_recalc(cki_icls_t *icls, int rel2abs) -{ - u64 temp; - - if (icls->parent == NULL) { - /* Root, as parent, always gets all */ - - temp = icls->shares.my_guarantee * (IOPRIO_NR-1); - do_div(temp, icls->shares.total_guarantee); - - icls->total = IOPRIO_NR-1; - icls->ioprio = temp ; - icls->unused = icls->total - icls->ioprio; -// icls->unused = (IOPRIO_NR-1)-icls->ioprio; - - } else { - cki_icls_t *parres; - int partot ; - - parres = ckrm_get_res_class(icls->parent, - cki_rcbs.resid, - cki_icls_t); - if (!parres) { - printk(KERN_ERR "cki_recalc: error getting " - "resclass from core \n"); - return -EINVAL; - } - - - temp = (icls->shares.my_guarantee * - parres->total); - do_div(temp, parres->shares.total_guarantee); - - icls->ioprio = temp; - icls->unused = 0; - - } - - return 0; - -} -#endif - void *cki_tsk_icls(struct task_struct *tsk) { return (void *) ckrm_get_res_class(class_core(tsk->taskclass), @@ -279,12 +240,19 @@ void *cki_tsk_icls(struct task_struct *tsk) } int cki_tsk_ioprio(struct task_struct *tsk) +{ + /* Don't use I/O priorities for now */ + return IOPRIO_NORM; +} + +void *cki_tsk_cfqpriv(struct task_struct *tsk) { cki_icls_t 
*icls = ckrm_get_res_class(class_core(tsk->taskclass), cki_rcbs.resid, cki_icls_t); - return icls->cnt_unused; + return (void *)&(icls->cfqpriv); } + static void *cki_alloc(struct ckrm_core_class *core, struct ckrm_core_class *parent) { @@ -301,43 +269,13 @@ static void *cki_alloc(struct ckrm_core_class *core, icls->parent = parent; icls->shares_lock = SPIN_LOCK_UNLOCKED; - if (parent == NULL) { - - /* Root class gets same as "normal" CFQ priorities to - * retain compatibility of behaviour in the absence of - * other classes - */ - - icls->cnt_guarantee = icls->cnt_unused = IOPRIO_NR-1; - - /* Default gets normal, not minimum */ - //icls->unused = IOPRIO_NORM; - //icls->unused = icls->guarantee-icls->myguarantee; - //icls->limit = icls->mylimit = IOPRIO_NR; - - /* Compute shares in abstract units */ - icls->shares.total_guarantee = CKRM_SHARE_DFLT_TOTAL_GUARANTEE; - - // my_guarantee for root is meaningless. Set to default - icls->shares.my_guarantee = CKRM_SHARE_DFLT_TOTAL_GUARANTEE; + init_icls_one(icls); - icls->shares.unused_guarantee = - CKRM_SHARE_DFLT_TOTAL_GUARANTEE; - - //temp = (u64) icls->cnt_unused * icls->shares.total_guarantee; - //do_div(temp, CKI_IOPRIO_DIV); - // temp now has root's default's share - //icls->shares.unused_guarantee = - // icls->shares.total_guarantee - temp; - - icls->shares.my_limit = CKRM_SHARE_DFLT_TOTAL_GUARANTEE; - icls->shares.max_limit = CKRM_SHARE_DFLT_TOTAL_GUARANTEE; - icls->shares.cur_max_limit = CKRM_SHARE_DFLT_TOTAL_GUARANTEE; - - } else { - init_icls_one(icls); - /* No propagation to parent needed if icls' - initial share is zero */ + if (parent == NULL) { + icls->cnt_guarantee = CKI_ROOTSECTORATE; + icls->cnt_unused = CKI_ROOTSECTORATE; + icls->cnt_limit = CKI_ROOTSECTORATE; + atomic_set(&(icls->cfqpriv.sectorate),icls->cnt_limit); } try_module_get(THIS_MODULE); return icls; @@ -345,7 +283,10 @@ static void *cki_alloc(struct ckrm_core_class *core, static void cki_free(void *res) { - cki_icls_t *icls = res, *parres; + cki_icls_t *icls = res, *parres, *childres; + ckrm_core_class_t *child = NULL; + int maxlimit, resid = cki_rcbs.resid; + if (!res) return; @@ -361,9 +302,7 @@ static void cki_free(void *res) * */ - parres = ckrm_get_res_class(icls->parent, - cki_rcbs.resid, - cki_icls_t); + parres = ckrm_get_res_class(icls->parent, resid, cki_icls_t); if (!parres) { printk(KERN_ERR "cki_free: error getting " "resclass from core \n"); @@ -372,8 +311,23 @@ static void cki_free(void *res) /* Update parent's shares */ spin_lock(&parres->shares_lock); + child_guarantee_changed(&parres->shares, icls->shares.my_guarantee, 0); parres->cnt_unused += icls->cnt_guarantee; + + // run thru parent's children and get the new max_limit of the parent + ckrm_lock_hier(parres->core); + maxlimit = 0; + while ((child = ckrm_get_next_child(parres->core, child)) != NULL) { + childres = ckrm_get_res_class(child, resid, cki_icls_t); + if (maxlimit < childres->shares.my_limit) { + maxlimit = childres->shares.my_limit; + } + } + ckrm_unlock_hier(parres->core); + if (parres->shares.cur_max_limit < maxlimit) { + parres->shares.cur_max_limit = maxlimit; + } spin_unlock(&parres->shares_lock); kfree(res); @@ -388,26 +342,15 @@ static int cki_setshare(void *res, struct ckrm_shares *new) struct ckrm_shares *cur, *par; int rc = -EINVAL, resid = cki_rcbs.resid; - if (!icls) { - printk(KERN_ERR "No class\n"); + if (!icls) return rc; - } cur = &icls->shares; - - /* limits not supported */ - if ((new->max_limit != CKRM_SHARE_UNCHANGED) - || (new->my_limit != CKRM_SHARE_UNCHANGED)) { 
- printk(KERN_ERR "limits not supported\n"); - return -EINVAL; - } - if (icls->parent) { parres = ckrm_get_res_class(icls->parent, resid, cki_icls_t); if (!parres) { - printk(KERN_ERR "cki_setshare: error getting " - "resclass from core \n"); + pr_debug("cki_setshare: invalid resclass\n"); return -EINVAL; } spin_lock(&parres->shares_lock); @@ -420,10 +363,8 @@ static int cki_setshare(void *res, struct ckrm_shares *new) } rc = set_shares(new, cur, par); - printk(KERN_ERR "rc from set_shares %d\n", rc); if ((!rc) && parres) { - if (parres->cnt_guarantee == CKRM_SHARE_DONTCARE) { parres->cnt_unused = CKRM_SHARE_DONTCARE; } else if (par->total_guarantee) { @@ -435,17 +376,6 @@ static int cki_setshare(void *res, struct ckrm_shares *new) parres->cnt_unused = 0; } cki_recalc_propagate(res, parres); - -#if 0 - int old = icls->ioprio; - - rc = cki_recalc(icls,0); - - if (!rc && parres) { - int raise_tot = icls->ioprio - old ; - parres->unused -= raise_tot ; - } -#endif } spin_unlock(&icls->shares_lock); if (icls->parent) { @@ -471,15 +401,15 @@ static int cki_getstats(void *res, struct seq_file *sfile) if (!icls) return -EINVAL; -/* - seq_printf(sfile, "%d my_read\n",atomic_read(&icls->mystats.blkrd)); - seq_printf(sfile, "%d my_write\n",atomic_read(&icls->mystats.blkwr)); - seq_printf(sfile, "%d total_read\n",atomic_read(&icls->stats.blkrd)); - seq_printf(sfile, "%d total_write\n",atomic_read(&icls->stats.blkwr)); -*/ - - seq_printf(sfile, "%d total ioprio\n",icls->cnt_guarantee); - seq_printf(sfile, "%d unused/default ioprio\n",icls->cnt_unused); + seq_printf(sfile, "abs limit %d\n",icls->cnt_limit); + seq_printf(sfile, "skip %d timdout %d avsec %lu rate %ld " + " sec0 %ld sec1 %ld\n", + icls->cfqpriv.nskip, + icls->cfqpriv.timedout, + icls->cfqpriv.navsec, + atomic_read(&(icls->cfqpriv.sectorate)), + (unsigned long)icls->cfqpriv.sec[0], + (unsigned long)icls->cfqpriv.sec[1]); return 0; } @@ -554,7 +484,7 @@ int __init cki_init(void) resid = ckrm_register_res_ctlr(clstype, &cki_rcbs); if (resid != -1) { cki_rcbs.classtype = clstype; - cki_cfq_set(cki_tsk_icls,cki_tsk_ioprio); + cki_cfq_set(cki_tsk_icls,cki_tsk_ioprio,cki_tsk_cfqpriv); } } @@ -566,7 +496,7 @@ void __exit cki_exit(void) ckrm_unregister_res_ctlr(&cki_rcbs); cki_rcbs.resid = -1; cki_rcbs.classtype = NULL; - cki_cfq_set(NULL,NULL); + cki_cfq_set(NULL,NULL,NULL); } module_init(cki_init) diff --git a/drivers/block/ckrm-iostub.c b/drivers/block/ckrm-iostub.c index c325d8e8d..f4012545b 100644 --- a/drivers/block/ckrm-iostub.c +++ b/drivers/block/ckrm-iostub.c @@ -25,13 +25,14 @@ static spinlock_t stub_lock = SPIN_LOCK_UNLOCKED; static icls_tsk_t tskiclstub; static icls_ioprio_t tskiopriostub; +static icls_tsk_t tskcfqprivstub; - -void cki_cfq_set(icls_tsk_t tskicls, icls_ioprio_t tskioprio) +void cki_cfq_set(icls_tsk_t tskicls, icls_ioprio_t tskioprio, icls_tsk_t tskcfqpriv) { spin_lock(&stub_lock); tskiclstub = tskicls; tskiopriostub = tskioprio; + tskcfqprivstub = tskcfqpriv; spin_unlock(&stub_lock); } @@ -59,6 +60,19 @@ int cki_ioprio(struct task_struct *tsk) return ret; } +void *cki_cfqpriv(struct task_struct *tsk) +{ + void *ret; + spin_lock(&stub_lock); + if (tskiclstub) + ret = (*tskcfqprivstub)(tsk); + else + ret = NULL; + spin_unlock(&stub_lock); + return ret; +} + EXPORT_SYMBOL(cki_cfq_set); EXPORT_SYMBOL(cki_hash_key); EXPORT_SYMBOL(cki_ioprio); +EXPORT_SYMBOL(cki_cfqpriv); diff --git a/drivers/char/.cvsignore b/drivers/char/.cvsignore new file mode 100644 index 000000000..83683a2d8 --- /dev/null +++ b/drivers/char/.cvsignore 
@@ -0,0 +1,2 @@ +consolemap_deftbl.c +defkeymap.c diff --git a/drivers/pci/.cvsignore b/drivers/pci/.cvsignore new file mode 100644 index 000000000..d5b21d9ee --- /dev/null +++ b/drivers/pci/.cvsignore @@ -0,0 +1,3 @@ +classlist.h +devlist.h +gen-devlist diff --git a/drivers/scsi/aic7xxx/.cvsignore b/drivers/scsi/aic7xxx/.cvsignore new file mode 100644 index 000000000..a1a7fcd04 --- /dev/null +++ b/drivers/scsi/aic7xxx/.cvsignore @@ -0,0 +1,4 @@ +aic79xx_reg.h +aic79xx_seq.h +aic7xxx_reg.h +aic7xxx_seq.h diff --git a/fs/aio.c b/fs/aio.c index 9e7b5928e..2335a0756 100644 --- a/fs/aio.c +++ b/fs/aio.c @@ -543,7 +543,7 @@ struct kioctx *lookup_ioctx(unsigned long ctx_id) return ioctx; } -static void use_mm(struct mm_struct *mm) +void use_mm(struct mm_struct *mm) { struct mm_struct *active_mm; diff --git a/include/.cvsignore b/include/.cvsignore new file mode 100644 index 000000000..04204c7c9 --- /dev/null +++ b/include/.cvsignore @@ -0,0 +1 @@ +config diff --git a/include/asm-i386/.cvsignore b/include/asm-i386/.cvsignore new file mode 100644 index 000000000..4ec57ad5b --- /dev/null +++ b/include/asm-i386/.cvsignore @@ -0,0 +1 @@ +asm_offsets.h diff --git a/include/asm-i386/apicdef.h b/include/asm-i386/apicdef.h index c689554ad..9513dd889 100644 --- a/include/asm-i386/apicdef.h +++ b/include/asm-i386/apicdef.h @@ -86,6 +86,7 @@ #define APIC_LVT_REMOTE_IRR (1<<14) #define APIC_INPUT_POLARITY (1<<13) #define APIC_SEND_PENDING (1<<12) +#define APIC_MODE_MASK 0x700 #define GET_APIC_DELIVERY_MODE(x) (((x)>>8)&0x7) #define SET_APIC_DELIVERY_MODE(x,y) (((x)&~0x700)|((y)<<8)) #define APIC_MODE_FIXED 0x0 diff --git a/include/asm-i386/irq.h b/include/asm-i386/irq.h index d1a4dd68f..43917d930 100644 --- a/include/asm-i386/irq.h +++ b/include/asm-i386/irq.h @@ -39,6 +39,7 @@ union irq_ctx { u32 stack[THREAD_SIZE/sizeof(u32)]; }; +#ifdef CONFIG_IRQSTACKS extern union irq_ctx *hardirq_ctx[NR_CPUS]; extern union irq_ctx *softirq_ctx[NR_CPUS]; @@ -46,6 +47,10 @@ extern void irq_ctx_init(int cpu); #define __ARCH_HAS_DO_SOFTIRQ +#else +#define irq_ctx_init(cpu) do { ; } while (0) +#endif + struct irqaction; struct pt_regs; asmlinkage int handle_IRQ_event(unsigned int, struct pt_regs *, diff --git a/include/asm-i386/kexec.h b/include/asm-i386/kexec.h new file mode 100644 index 000000000..eb8fd9868 --- /dev/null +++ b/include/asm-i386/kexec.h @@ -0,0 +1,25 @@ +#ifndef _I386_KEXEC_H +#define _I386_KEXEC_H + +#include + +/* + * KEXEC_SOURCE_MEMORY_LIMIT maximum page get_free_page can return. + * I.e. Maximum page that is mapped directly into kernel memory, + * and kmap is not required. + * + * Someone correct me if FIXADDR_START - PAGEOFFSET is not the correct + * calculation for the amount of memory directly mappable into the + * kernel memory space. 
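+ *
+ * As defined below for i386, the source and destination limits are
+ * left unrestricted (-1UL); only the control code buffer is capped,
+ * at TASK_SIZE.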
+ */ + +/* Maximum physical address we can use pages from */ +#define KEXEC_SOURCE_MEMORY_LIMIT (-1UL) +/* Maximum address we can reach in physical address mode */ +#define KEXEC_DESTINATION_MEMORY_LIMIT (-1UL) +/* Maximum address we can use for the control code buffer */ +#define KEXEC_CONTROL_MEMORY_LIMIT TASK_SIZE + +#define KEXEC_CONTROL_CODE_SIZE 4096 + +#endif /* _I386_KEXEC_H */ diff --git a/include/asm-i386/module.h b/include/asm-i386/module.h index 614d05f27..263c6f752 100644 --- a/include/asm-i386/module.h +++ b/include/asm-i386/module.h @@ -60,7 +60,19 @@ struct mod_arch_specific #define MODULE_REGPARM "" #endif +#if (CONFIG_STACK_SIZE_SHIFT < 12) +#define MODULE_STACKSIZE "TINYSTACKS " +#elif (CONFIG_STACK_SIZE_SHIFT == 12) #define MODULE_STACKSIZE "4KSTACKS " +#elif (CONFIG_STACK_SIZE_SHIFT == 13) +#define MODULE_STACKSIZE "8KSTACKS " +#elif (CONFIG_STACK_SIZE_SHIFT == 14) +#define MODULE_STACKSIZE "16KSTACKS " +#elif (CONFIG_STACK_SIZE_SHIFT > 14) +#define MODULE_STACKSIZE "HUGESTACKS " +#else +#define MODULE_STACKSIZE "" +#endif #define MODULE_ARCH_VERMAGIC MODULE_PROC_FAMILY MODULE_REGPARM MODULE_STACKSIZE diff --git a/include/asm-i386/processor.h b/include/asm-i386/processor.h index cd8708b42..3651a3bb0 100644 --- a/include/asm-i386/processor.h +++ b/include/asm-i386/processor.h @@ -400,10 +400,10 @@ struct tss_struct { #define ARCH_MIN_TASKALIGN 16 - -#define STACK_PAGE_COUNT (4096/PAGE_SIZE) - - +#if ((1< -#define CLASSQUEUE_SIZE 1024 // acb: changed from 128 -//#define CLASSQUEUE_SIZE 128 +#warning mef: is classqueue_size big enough for PlanetLab +#define CLASSQUEUE_SIZE_SHIFT 7 +#define CLASSQUEUE_SIZE ( 1 << CLASSQUEUE_SIZE_SHIFT ) #define CQ_BITMAP_SIZE ((((CLASSQUEUE_SIZE+1+7)/8)+sizeof(long)-1)/sizeof(long)) /** * struct cq_prio_array: duplicates prio_array defined in sched.c - * - * I duplicate this data structure to make ckrm_classqueue implementation more modular */ struct cq_prio_array { int nr_active; @@ -49,42 +58,50 @@ struct cq_prio_array { * @base: base priority * @base_offset: index in array for the base * - * classqueue can be thought of as runqueue of classes (instead of runqueue of tasks) - * as task runqueue, each processor has a classqueue - * a class enters the classqueue when the first task in this class local runqueue shows up - * a class enters the classqueue when the last task in the local runqueue leaves - * class local runqueues are ordered based their priority - * - * status: - * hzheng: is 32bit base long enough? + * classqueue can be thought of as runqueue of lrq's (per cpu object of + * a CKRM class as task runqueue (instead of runqueue of tasks) + * - a class's local lrq is enqueued into the local classqueue when a + * first task is enqueued lrq. + * - a class's local lrq is removed from the local classqueue when the + * last task is dequeued from the lrq. 
+ * - lrq's are ordered based on their priority (determined elsewhere) + * ( CKRM: caculated based on it's progress (cvt) and urgency (top_priority) */ + struct classqueue_struct { - struct cq_prio_array array; + int enabled; // support dynamic on/off unsigned long base; unsigned long base_offset; + struct cq_prio_array array; }; /** - * struct cq_node_struct - the link object between class local runqueue and classqueue + * struct cq_node_struct: + * - the link object between class local runqueue and classqueue * @list: links the class local runqueue to classqueue - * @prio: class priority, which is caculated based on it's progress (cvt) and urgency (top_priority) + * @prio: class priority * @index: real index into the classqueue array, calculated based on priority - * - * NOTE: make sure list is empty when it's not in classqueue */ struct cq_node_struct { struct list_head list; int prio; int index; + /* + * set when the class jump out of the class queue window + * class with this value set should be repositioned whenever classqueue slides window + * real_prio is valid when need_repos is set + */ + int real_prio; + int need_repos; }; typedef struct cq_node_struct cq_node_t; -typedef unsigned long long CVT_t; // cummulative virtual time - static inline void cq_node_init(cq_node_t * node) { node->prio = 0; node->index = -1; + node->real_prio = 0; + node->need_repos = 0; INIT_LIST_HEAD(&node->list); } @@ -95,23 +112,18 @@ static inline int cls_in_classqueue(cq_node_t * node) } /*initialize the data structure*/ -int classqueue_init(struct classqueue_struct *cq); +int classqueue_init(struct classqueue_struct *cq, int enabled); -/*add the class to classqueue*/ -void classqueue_enqueue(struct classqueue_struct *cq, cq_node_t * node, int prio); +/*add the class to classqueue at given priority */ +void classqueue_enqueue(struct classqueue_struct *cq, + cq_node_t * node, int prio); -/** - * classqueue_dequeue - remove the class from classqueue - * - * internal: - * called when the last task is removed from the queue - * checked on load balancing and schedule - * hzheng: why don't I call it on class_dequeue_task? 
- */ +/*remove the class from classqueue */ void classqueue_dequeue(struct classqueue_struct *cq, cq_node_t * node); /*change the position of the class in classqueue*/ -void classqueue_update_prio(struct classqueue_struct *cq, cq_node_t * node, int new_prio); +void classqueue_update_prio(struct classqueue_struct *cq, + cq_node_t * node, int new_prio); /*return the first class in classqueue*/ cq_node_t *classqueue_get_head(struct classqueue_struct *cq); @@ -122,7 +134,8 @@ void classqueue_update_base(struct classqueue_struct *cq); /** * class_compare_prio: compare the priority of this two nodes */ -static inline int class_compare_prio(struct cq_node_struct* node1, struct cq_node_struct* node2) +static inline int class_compare_prio(struct cq_node_struct* node1, + struct cq_node_struct* node2) { return ( node1->prio - node2->prio); } diff --git a/include/linux/ckrm_rc.h b/include/linux/ckrm_rc.h index 1bf2d07b5..a134dbc0d 100644 --- a/include/linux/ckrm_rc.h +++ b/include/linux/ckrm_rc.h @@ -113,7 +113,6 @@ typedef struct ckrm_res_ctlr { #define CKRM_MAX_TYPENAME_LEN 32 typedef struct ckrm_classtype { - /* Hubertus: Rearrange slots later for cache friendliness */ /* resource controllers */ spinlock_t res_ctlrs_lock; // protect res ctlr related data @@ -238,27 +237,6 @@ extern int ckrm_init_core_class(struct ckrm_classtype *clstype, struct ckrm_core_class *parent, const char *name); extern int ckrm_release_core_class(struct ckrm_core_class *); -// Hubertus .. can disappear after cls del debugging -extern struct ckrm_res_ctlr *ckrm_resctlr_lookup(struct ckrm_classtype *type, - const char *resname); - -#if 0 - -// Hubertus ... need to straighten out all these I don't think we will even -// call this or are we - -/* interface to the RCFS filesystem */ -extern struct ckrm_core_class *ckrm_alloc_core_class(struct ckrm_core_class *, - const char *, int); - -// Reclassify the given pid to the given core class by force -extern void ckrm_forced_reclassify_pid(int, struct ckrm_core_class *); - -// Reclassify the given net_struct to the given core class by force -extern void ckrm_forced_reclassify_laq(struct ckrm_net_struct *, - struct ckrm_core_class *); - -#endif extern void ckrm_lock_hier(struct ckrm_core_class *); extern void ckrm_unlock_hier(struct ckrm_core_class *); @@ -290,12 +268,6 @@ extern int ckrm_class_set_shares(struct ckrm_core_class *core, extern int ckrm_class_reset_stats(struct ckrm_core_class *core, const char *resname, const char *unused); -#if 0 -extern void ckrm_ns_hold(struct ckrm_net_struct *); -extern void ckrm_ns_put(struct ckrm_net_struct *); -extern void *ckrm_set_rootcore_byname(char *, void *); -#endif - static inline void ckrm_core_grab(struct ckrm_core_class *core) { if (core) @@ -329,7 +301,6 @@ static inline unsigned int ckrm_is_core_valid(ckrm_core_class_t * core) ) extern struct ckrm_classtype *ckrm_classtypes[]; -/* should provide a different interface */ /*----------------------------------------------------------------------------- * CKRM event callback specification for the classtypes or resource controllers diff --git a/include/linux/ckrm_sched.h b/include/linux/ckrm_sched.h index 3611c2d3e..dc00aeaa0 100644 --- a/include/linux/ckrm_sched.h +++ b/include/linux/ckrm_sched.h @@ -3,8 +3,6 @@ * Copyright (C) Haoqiang Zheng, IBM Corp. 2004 * Copyright (C) Hubertus Franke, IBM Corp. 
2004 * - * Latest version, more details at http://ckrm.sf.net - * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or @@ -12,6 +10,17 @@ * */ +/* + * Overview: + * --------- + * + * Please read Documentation/ckrm/cpu_sched for a general overview of + * how the O(1) CKRM scheduler. + * + * ckrm_sched.h provides the definition for the per class local runqueue. + * + */ + #ifndef _CKRM_SCHED_H #define _CKRM_SCHED_H @@ -27,18 +36,31 @@ struct prio_array { struct list_head queue[MAX_PRIO]; }; -#ifdef CONFIG_CKRM_CPU_SCHEDULE -#define rq_active(p,rq) (get_task_lrq(p)->active) -#define rq_expired(p,rq) (get_task_lrq(p)->expired) -int __init init_ckrm_sched_res(void); -#else + +#ifndef CONFIG_CKRM_CPU_SCHEDULE + #define rq_active(p,rq) (rq->active) #define rq_expired(p,rq) (rq->expired) static inline void init_ckrm_sched_res(void) {} static inline int ckrm_cpu_monitor_init(void) {return 0;} -#endif //CONFIG_CKRM_CPU_SCHEDULE -#ifdef CONFIG_CKRM_CPU_SCHEDULE +#else + +#define rq_active(p,rq) (get_task_lrq(p)->active) +#define rq_expired(p,rq) (get_task_lrq(p)->expired) + +enum ckrm_sched_mode { + CKRM_SCHED_MODE_DISABLED, /* always use default linux scheduling */ + /* effectively disables the ckrm scheduler */ + CKRM_SCHED_MODE_ENABLED /* always uses ckrm scheduling behavior */ +}; + +extern unsigned int ckrm_sched_mode; /* true internal sched_mode (DIS/EN ABLED) */ + +int __init init_ckrm_sched_res(void); + +typedef unsigned long long CVT_t; // cummulative virtual time + struct ckrm_runqueue { cq_node_t classqueue_linkobj; /*links in classqueue */ struct ckrm_cpu_class *cpu_class; // class it belongs to @@ -52,6 +74,7 @@ struct ckrm_runqueue { reset to jiffies if expires */ unsigned long expired_timestamp; + int best_expired_prio; /* * highest priority of tasks in active @@ -62,23 +85,38 @@ struct ckrm_runqueue { CVT_t local_cvt; unsigned long lrq_load; - int local_weight; + /* Three different weights are distinguished: + * local_weight, skewed_weight, over_weight: + * + * - local_weight: main weight to drive CVT progression + * - over_weight: weight to reduce savings when over its guarantee + * - skewed_weight: weight to use when local_weight to small + * avoids starvation problems. 
+ */ + int local_weight; + int over_weight; + int skewed_weight; /* - * unused CPU time accumulated while thoe class + * unused CPU time accumulated while the class * is inactive goes to savings * * initialized to be 0 * a class can't accumulate more than SAVING_THRESHOLD of savings */ - unsigned long long savings; + CVT_t savings; unsigned long magic; //for debugging -}; +} ____cacheline_aligned_in_smp; + +#define CKRM_LRQ_MAGIC (0xACDC0702) typedef struct ckrm_runqueue ckrm_lrq_t; +#define ckrm_cpu_disabled() (ckrm_sched_mode == CKRM_SCHED_MODE_DISABLED) +#define ckrm_cpu_enabled() (ckrm_sched_mode == CKRM_SCHED_MODE_ENABLED) + /** * ckrm_cpu_class_stat - cpu usage statistics maintained for each class * @@ -103,24 +141,31 @@ struct ckrm_cpu_class_stat { */ int eshare; int meshare; + + /* a boolean indicates if the class has savings or not */ + int has_savings; + + /* + * a temporary value used by reorder_surplus_queue + */ + int demand_per_share; }; #define CKRM_CPU_CLASS_MAGIC 0x7af2abe3 -#define USAGE_SAMPLE_FREQ HZ //sample every 1 seconds -#define NS_PER_SAMPLE (USAGE_SAMPLE_FREQ*(NSEC_PER_SEC/HZ)) -#define USAGE_WINDOW_SIZE 60 //keep the last 60 sample +#define USAGE_SAMPLE_FREQ (HZ) //sample every 1 seconds +#define USAGE_MAX_HISTORY (60) // keep the last 60 usage samples +#define NS_PER_SAMPLE (USAGE_SAMPLE_FREQ*(NSEC_PER_SEC/HZ)) struct ckrm_usage { - unsigned long samples[USAGE_WINDOW_SIZE]; //record usages - unsigned long sample_pointer; //pointer for the sliding window - unsigned long long last_ns; //ns for last sample - long long last_sample_jiffies; //in number of jiffies + unsigned long samples[USAGE_MAX_HISTORY]; //record usages + unsigned long sample_pointer; // pointer for the sliding window + unsigned long long last_ns; // ns for last sample + long long last_sample_jiffies; // in number of jiffies }; /* - * manages the class status - * there should be only one instance of this object for each class in the whole system + * CPU controller object allocated for each CLASS */ struct ckrm_cpu_class { struct ckrm_core_class *core; @@ -129,12 +174,16 @@ struct ckrm_cpu_class { spinlock_t cnt_lock; // always grab parent's lock first and then child's struct ckrm_cpu_class_stat stat; struct list_head links; // for linking up in cpu classes - ckrm_lrq_t local_queues[NR_CPUS]; // runqueues + struct list_head surplus_queue; //used for surplus allocation + ckrm_lrq_t* local_queues[NR_CPUS]; // runqueues struct ckrm_usage usage; unsigned long magic; //for debugging +#ifdef __SIMULATOR__ + int class_id; +#endif }; -#define cpu_class_weight(cls) (cls->stat.meshare) +#define cpu_class_weight(cls) (SHARE_TO_WEIGHT(cls->stat.meshare)) #define local_class_weight(lrq) (lrq->local_weight) static inline int valid_cpu_class(struct ckrm_cpu_class * cls) @@ -150,7 +199,7 @@ static inline void ckrm_usage_init(struct ckrm_usage* usage) { int i; - for (i=0; i < USAGE_WINDOW_SIZE; i++) + for (i=0; i < USAGE_MAX_HISTORY; i++) usage->samples[i] = 0; usage->sample_pointer = 0; usage->last_ns = 0; @@ -188,49 +237,21 @@ static inline void ckrm_sample_usage(struct ckrm_cpu_class* clsptr) // printk("sample = %llu jiffies=%lu \n",cur_sample, jiffies); usage->sample_pointer ++; - if (usage->sample_pointer >= USAGE_WINDOW_SIZE) + if (usage->sample_pointer >= USAGE_MAX_HISTORY) usage->sample_pointer = 0; } -//duration is specified in number of jiffies -//return the usage in percentage -static inline int get_ckrm_usage(struct ckrm_cpu_class* clsptr, int duration) -{ - int nr_samples = 
duration/USAGE_SAMPLE_FREQ?:1; - struct ckrm_usage* usage = &clsptr->usage; - unsigned long long total = 0; - int i, idx; - - if (nr_samples > USAGE_WINDOW_SIZE) - nr_samples = USAGE_WINDOW_SIZE; - - idx = usage->sample_pointer; - for (i = 0; i< nr_samples; i++) { - if (! idx) - idx = USAGE_WINDOW_SIZE; - idx --; - total += usage->samples[idx]; - } - total *= 100; - do_div(total,nr_samples); - do_div(total,NS_PER_SAMPLE); - do_div(total,cpus_weight(cpu_online_map)); - return total; -} - - #define lrq_nr_running(lrq) \ (lrq->active->nr_active + lrq->expired->nr_active) -static inline ckrm_lrq_t * -get_ckrm_lrq(struct ckrm_cpu_class*cls, int cpu) +static inline ckrm_lrq_t *get_ckrm_lrq(struct ckrm_cpu_class*cls, int cpu) { - return &(cls->local_queues[cpu]); + return cls->local_queues[cpu]; } static inline ckrm_lrq_t *get_task_lrq(struct task_struct *p) { - return &(p->cpu_class->local_queues[task_cpu(p)]); + return p->cpu_class->local_queues[task_cpu(p)]; } #define task_list_entry(list) list_entry(list,struct task_struct,run_list) @@ -247,16 +268,16 @@ void init_cpu_classes(void); void init_cpu_class(struct ckrm_cpu_class *cls,ckrm_shares_t* shares); void ckrm_cpu_change_class(void *task, void *old, void *new); - #define CPU_DEMAND_ENQUEUE 0 #define CPU_DEMAND_DEQUEUE 1 #define CPU_DEMAND_DESCHEDULE 2 #define CPU_DEMAND_INIT 3 /*functions exported by ckrm_cpu_monitor.c*/ +int update_effectives(void); void ckrm_cpu_monitor(int check_min); int ckrm_cpu_monitor_init(void); -void ckrm_cpu_stat_init(struct ckrm_cpu_class_stat *stat); +void ckrm_cpu_stat_init(struct ckrm_cpu_class_stat *stat, int eshares); void cpu_demand_event(struct ckrm_cpu_demand_stat* local_stat, int event, unsigned long long len); void adjust_local_weight(void); @@ -290,61 +311,53 @@ void adjust_local_weight(void); * *******************************************************************/ -#define CLASS_QUANTIZER 16 //shift from ns to increase class bonus -#define PRIORITY_QUANTIZER 2 //controls how much a high prio task can borrow - -#define CKRM_SHARE_ACCURACY 13 -#define NSEC_PER_MS 1000000 -#define NSEC_PER_JIFFIES (NSEC_PER_SEC/HZ) - - -#define MAX_SAVINGS_ABSOLUTE (10LLU*NSEC_PER_SEC) // 10 seconds - -#define CVT_UPDATE_TICK ((HZ/2)?:1) - -// ABSOLUTE_CKRM_TUNING determines whether classes can make up -// lost time in absolute time or in relative values - -#define ABSOLUTE_CKRM_TUNING // preferred due to more predictable behavior - -#ifdef ABSOLUTE_CKRM_TUNING - -#define MAX_SAVINGS MAX_SAVINGS_ABSOLUTE -//an absolute bonus of 200ms for classes when reactivated -#define INTERACTIVE_BONUS(lrq) ((200*NSEC_PER_MS)/local_class_weight(lrq)) -#define SAVINGS_LEAK_SPEED (CVT_UPDATE_TICK/10*NSEC_PER_JIFFIES) - -#define scale_cvt(val,lrq) ((val)*local_class_weight(lrq)) -#define unscale_cvt(val,lrq) (do_div(val,local_class_weight(lrq))) - -#else - -#define MAX_SAVINGS (MAX_SAVINGS_ABSOLUTE >> CKRM_SHARE_ACCURACY) /* - * to improve system responsiveness - * an inactive class is put a little bit ahead of the current class when it wakes up - * the amount is set in normalized term to simplify the calculation - * for class with 100% share, it can be 2s ahead - * while for class with 10% share, it can be 200ms ahead + * The class priority is biasd toward classes with high priority tasks. + * But we need to prevent this bias from starving other classes. + * If a class has nice value of -20, how much it can starve the default class? 
+ * priority bonus = (120-100) >> PRIORITY_QUANTIZER, + * if PRIORITY_QUANTIZER = 2, then it's 5 steps ahead + * A class without bonus thus can't get to run until: + * bonus * CKRM_MAX_WEIGHT * CVT_INC_PERSHARE = (120-100) >> PRIORITY_QUANTIZER + * (1 << CKRM_WEIGHT_SHIFT) + * (1 << CLASS_QUANTIZER) +*/ + +/* + * CKRM_WEIGHT_SHIFT and CLASS_QUANTIZER control how much a class with + * high priority task can starve a normal priority class, so it should + * be constant CLASS_QUANTIZER should not be too small otherwise we + * don't have enough bins in the classqueue. + * The ideal value of CLASS_QUANTIZER is 20, but a little smaller is acceptable */ -#define INTERACTIVE_BONUS(lrq) (2*NSEC_PER_MS) -/* - * normalized savings can't be more than MAX_NORMALIZED_SAVINGS - * based on the current configuration - * this means that a class with share 100% will accumulate 10s at most - * while a class with 1% of the share can only accumulate 100ms +#define CLASS_QUANTIZER (18)// shift from ns to increase class bonus +#define PRIORITY_QUANTIZER (2) // how much a high prio task can borrow +#define CKRM_WEIGHT_SHIFT (8) // 1/2^x == finest weight granularity +#define CKRM_MAX_WEIGHT (1<> CKRM_SHARE_ACCURACY) +#define SHARE_TO_WEIGHT(x) ((x) >> (CKRM_SHARE_SHIFT - CKRM_WEIGHT_SHIFT)) +#define WEIGHT_TO_SHARE(x) ((x) << (CKRM_SHARE_SHIFT - CKRM_WEIGHT_SHIFT)) -#define scale_cvt(val,lrq) (val) -#define unscale_cvt(val,lrq) (val) +/* Other constants */ -#endif +#define NSEC_PER_MS (1000000) +#define NSEC_PER_JIFFIES (NSEC_PER_SEC/HZ) +#define MAX_SAVINGS_ABSOLUTE (4LLU*NSEC_PER_SEC) // 4 seconds +#define CVT_UPDATE_TICK ((HZ/2)?:1) +#define MAX_SAVINGS MAX_SAVINGS_ABSOLUTE +#define SAVINGS_LEAK_SPEED (CVT_UPDATE_TICK/10*NSEC_PER_JIFFIES) /** * get_effective_prio: return the effective priority of a class local queue @@ -361,6 +374,7 @@ static inline int get_effective_prio(ckrm_lrq_t * lrq) int prio; prio = lrq->local_cvt >> CLASS_QUANTIZER; // cumulative usage +#define URGENCY_SUPPORT 1 #ifndef URGENCY_SUPPORT #warning "ACB removing urgency calculation from get_effective_prio" #else @@ -414,84 +428,11 @@ static inline unsigned long task_load(struct task_struct* p) } /* - * runqueue load is the local_weight of all the classes on this cpu - * must be called with class_list_lock held + * moved to ckrm_sched.c + * but may need to make it static inline to improve performance */ -static inline unsigned long ckrm_cpu_load(int cpu) -{ - struct ckrm_cpu_class *clsptr; - ckrm_lrq_t* lrq; - struct ckrm_cpu_demand_stat* l_stat; - int total_load = 0; - int load; - - list_for_each_entry(clsptr,&active_cpu_classes,links) { - lrq = get_ckrm_lrq(clsptr,cpu); - l_stat = get_cls_local_stat(clsptr,cpu); - load = lrq->local_weight; - if (l_stat->cpu_demand < load) - load = l_stat->cpu_demand; - total_load += load; - } - return total_load; -} - -static inline void class_enqueue_task(struct task_struct *p, - prio_array_t * array) -{ - ckrm_lrq_t *lrq; - int effective_prio; - - lrq = get_task_lrq(p); - - cpu_demand_event(&p->demand_stat,CPU_DEMAND_ENQUEUE,0); - lrq->lrq_load += task_load(p); - - if ((p->prio < lrq->top_priority) && (array == lrq->active)) - set_top_priority(lrq, p->prio); - - if (! 
cls_in_classqueue(&lrq->classqueue_linkobj)) { - cpu_demand_event(get_task_lrq_stat(p),CPU_DEMAND_ENQUEUE,0); - effective_prio = get_effective_prio(lrq); - classqueue_enqueue(lrq->classqueue, &lrq->classqueue_linkobj, effective_prio); - } - -} - -static inline void class_dequeue_task(struct task_struct *p, - prio_array_t * array) -{ - ckrm_lrq_t *lrq = get_task_lrq(p); - unsigned long load = task_load(p); - - BUG_ON(lrq->lrq_load < load); - lrq->lrq_load -= load; - - cpu_demand_event(&p->demand_stat,CPU_DEMAND_DEQUEUE,0); - - if ((array == lrq->active) && (p->prio == lrq->top_priority) - && list_empty(&(array->queue[p->prio]))) - set_top_priority(lrq, - find_next_bit(array->bitmap, MAX_PRIO, - p->prio)); -} - -/* - * called after a task is switched out. Update the local cvt accounting - * we need to stick with long instead of long long due to nonexistent 64-bit division - */ -static inline void update_local_cvt(struct task_struct *p, unsigned long nsec) -{ - ckrm_lrq_t * lrq = get_task_lrq(p); - - unsigned long cvt_inc = nsec / local_class_weight(lrq); - - lrq->local_cvt += cvt_inc; - lrq->uncounted_ns += nsec; - - update_class_priority(lrq); -} - +void update_local_cvt(struct task_struct *p, unsigned long nsec); + static inline int class_preempts_curr(struct task_struct * p, struct task_struct* curr) { struct cq_node_struct* node1 = &(get_task_lrq(p)->classqueue_linkobj); @@ -518,11 +459,14 @@ static inline int get_ckrm_rand(unsigned long val) return rand; } -void update_class_cputime(int this_cpu); +void update_class_cputime(int this_cpu, int idle); /**********************************************/ /* PID_LOAD_BALANCING */ /**********************************************/ + +#define CPU_PID_CTRL_TICK 32 + struct ckrm_load_struct { unsigned long load_p; /*propotional*/ unsigned long load_i; /*integral */ @@ -538,26 +482,12 @@ static inline void ckrm_load_init(ckrm_load_t* ckrm_load) { } void ckrm_load_sample(ckrm_load_t* ckrm_load,int cpu); -long pid_get_pressure(ckrm_load_t* ckrm_load, int local_group); +long ckrm_get_pressure(ckrm_load_t* ckrm_load, int local_group); #define rq_ckrm_load(rq) (&((rq)->ckrm_load)) -static inline void ckrm_sched_tick(unsigned long j,int this_cpu,struct ckrm_load_struct* ckrm_load) -{ - read_lock(&class_list_lock); - -#ifdef CONFIG_SMP - ckrm_load_sample(ckrm_load,this_cpu); -#endif - if (! (j % CVT_UPDATE_TICK)) { - // printk("ckrm_sched j=%lu\n",j); - classqueue_update_base(get_cpu_classqueue(this_cpu)); - update_class_cputime(this_cpu); - } +#endif /*CONFIG_CKRM_CPU_SCHEDULE */ - read_unlock(&class_list_lock); -} +#endif -#endif //CONFIG_CKRM_CPU_SCHEDULE -#endif diff --git a/include/linux/ckrm_tc.h b/include/linux/ckrm_tc.h index 5650dd3c3..0caa797e7 100644 --- a/include/linux/ckrm_tc.h +++ b/include/linux/ckrm_tc.h @@ -1,3 +1,17 @@ +/* include/linux/ckrm_tc.h - general definitions for the CKRM TaskClass + * + * Copyright (C) Hubertus Franke, IBM Corp. 2004 + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. 
+ * + */ + +#ifndef _CKRM_TC_H +#define _CKRM_TC_H + #include #define TASK_CLASS_TYPE_NAME "taskclass" @@ -11,3 +25,5 @@ typedef struct ckrm_task_class { #define TC_MF_IDX 0 extern int ckrm_forced_reclassify_pid(int pid, struct ckrm_task_class *cls); + +#endif // _CKRM_TC_H diff --git a/include/linux/fs.h b/include/linux/fs.h index ece31a727..11067b72d 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1603,6 +1603,15 @@ static inline void free_secdata(void *secdata) asmlinkage int sys_ioprio_set(int ioprio); asmlinkage int sys_ioprio_get(void); +/* common structure for cfq & ckrm I/O controller */ +typedef struct cfqlim { + int nskip; + unsigned long navsec; + int timedout; + atomic_t sectorate; + u64 sec[2]; +} cfqlim_t ; + #endif /* __KERNEL__ */ #endif /* _LINUX_FS_H */ diff --git a/include/linux/kexec.h b/include/linux/kexec.h new file mode 100644 index 000000000..8bd6c6b91 --- /dev/null +++ b/include/linux/kexec.h @@ -0,0 +1,56 @@ +#ifndef LINUX_KEXEC_H +#define LINUX_KEXEC_H + +#ifdef CONFIG_KEXEC +#include +#include +#include + +/* + * This structure is used to hold the arguments that are used when loading + * kernel binaries. + */ + +typedef unsigned long kimage_entry_t; +#define IND_DESTINATION 0x1 +#define IND_INDIRECTION 0x2 +#define IND_DONE 0x4 +#define IND_SOURCE 0x8 + +#define KEXEC_SEGMENT_MAX 8 +struct kexec_segment { + void *buf; + size_t bufsz; + void *mem; + size_t memsz; +}; + +struct kimage { + kimage_entry_t head; + kimage_entry_t *entry; + kimage_entry_t *last_entry; + + unsigned long destination; + + unsigned long start; + struct page *control_code_page; + + unsigned long nr_segments; + struct kexec_segment segment[KEXEC_SEGMENT_MAX]; + + struct list_head control_pages; + struct list_head dest_pages; + struct list_head unuseable_pages; +}; + + +/* kexec interface functions */ +extern void machine_kexec(struct kimage *image); +extern int machine_kexec_prepare(struct kimage *image); +extern void machine_kexec_cleanup(struct kimage *image); +extern asmlinkage long sys_kexec(unsigned long entry, long nr_segments, + struct kexec_segment *segments); +extern struct page *kimage_alloc_control_pages(struct kimage *image, unsigned int order); +extern struct kimage *kexec_image; +#endif +#endif /* LINUX_KEXEC_H */ diff --git a/include/linux/mm.h b/include/linux/mm.h index 3fb18934a..83c64bb32 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -581,7 +581,7 @@ int clear_page_dirty_for_io(struct page *page); */ typedef int (*shrinker_t)(int nr_to_scan, unsigned int gfp_mask); -extern long do_mprotect(struct mm_struct *mm, unsigned long start, +asmlinkage long do_mprotect(struct mm_struct *mm, unsigned long start, size_t len, unsigned long prot); /* diff --git a/include/linux/netfilter_ipv4/ip_conntrack.h b/include/linux/netfilter_ipv4/ip_conntrack.h index a325de54c..f2ded1156 100644 --- a/include/linux/netfilter_ipv4/ip_conntrack.h +++ b/include/linux/netfilter_ipv4/ip_conntrack.h @@ -52,19 +52,23 @@ enum ip_conntrack_status { #include #include +#include /* per conntrack: protocol private data */ union ip_conntrack_proto { /* insert conntrack proto private data here */ + struct ip_ct_gre gre; struct ip_ct_tcp tcp; struct ip_ct_icmp icmp; }; union ip_conntrack_expect_proto { /* insert expect proto private data here */ + struct ip_ct_gre_expect gre; }; /* Add protocol helper include file here */ +#include #include #include #include @@ -72,6 +76,7 @@ union ip_conntrack_expect_proto { /* per expectation: application helper private data */ union 
ip_conntrack_expect_help { /* insert conntrack helper private data (expect) here */ + struct ip_ct_pptp_expect exp_pptp_info; struct ip_ct_amanda_expect exp_amanda_info; struct ip_ct_ftp_expect exp_ftp_info; struct ip_ct_irc_expect exp_irc_info; @@ -86,16 +91,19 @@ union ip_conntrack_expect_help { /* per conntrack: application helper private data */ union ip_conntrack_help { /* insert conntrack helper private data (master) here */ + struct ip_ct_pptp_master ct_pptp_info; struct ip_ct_ftp_master ct_ftp_info; struct ip_ct_irc_master ct_irc_info; }; #ifdef CONFIG_IP_NF_NAT_NEEDED #include +#include /* per conntrack: nat application helper private data */ union ip_conntrack_nat_help { /* insert nat helper private data here */ + struct ip_nat_pptp nat_pptp_info; }; #endif @@ -157,6 +165,12 @@ struct ip_conntrack_expect union ip_conntrack_expect_help help; }; +struct ip_conntrack_counter +{ + u_int64_t packets; + u_int64_t bytes; +}; + struct ip_conntrack_helper; struct ip_conntrack @@ -174,6 +188,11 @@ struct ip_conntrack /* Timer function; drops refcnt when it goes off. */ struct timer_list timeout; +#ifdef CONFIG_IP_NF_CT_ACCT + /* Accounting Information (same cache line as other written members) */ + struct ip_conntrack_counter counters[IP_CT_DIR_MAX]; +#endif + /* If we're expecting another related connection, this will be in expected linked list */ struct list_head sibling_list; @@ -249,8 +268,10 @@ extern int invert_tuplepr(struct ip_conntrack_tuple *inverse, const struct ip_conntrack_tuple *orig); /* Refresh conntrack for this many jiffies */ -extern void ip_ct_refresh(struct ip_conntrack *ct, - unsigned long extra_jiffies); +extern void ip_ct_refresh_acct(struct ip_conntrack *ct, + enum ip_conntrack_info ctinfo, + const struct sk_buff *skb, + unsigned long extra_jiffies); /* These are for NAT. Icky. */ /* Call me when a conntrack is destroyed. */ diff --git a/include/linux/netfilter_ipv4/ip_conntrack_tuple.h b/include/linux/netfilter_ipv4/ip_conntrack_tuple.h index 1e7691189..d2bd0be99 100644 --- a/include/linux/netfilter_ipv4/ip_conntrack_tuple.h +++ b/include/linux/netfilter_ipv4/ip_conntrack_tuple.h @@ -14,7 +14,7 @@ union ip_conntrack_manip_proto { /* Add other protocols here. */ - u_int16_t all; + u_int32_t all; struct { u_int16_t port; @@ -25,6 +25,9 @@ union ip_conntrack_manip_proto struct { u_int16_t id; } icmp; + struct { + u_int32_t key; + } gre; }; /* The manipulable part of the tuple. */ @@ -44,7 +47,7 @@ struct ip_conntrack_tuple u_int32_t ip; union { /* Add other protocols here. */ - u_int16_t all; + u_int32_t all; struct { u_int16_t port; @@ -55,6 +58,9 @@ struct ip_conntrack_tuple struct { u_int8_t type, code; } icmp; + struct { + u_int32_t key; + } gre; } u; /* The protocol. */ @@ -80,10 +86,16 @@ enum ip_conntrack_dir #ifdef __KERNEL__ #define DUMP_TUPLE(tp) \ -DEBUGP("tuple %p: %u %u.%u.%u.%u:%hu -> %u.%u.%u.%u:%hu\n", \ +DEBUGP("tuple %p: %u %u.%u.%u.%u:%u -> %u.%u.%u.%u:%u\n", \ (tp), (tp)->dst.protonum, \ - NIPQUAD((tp)->src.ip), ntohs((tp)->src.u.all), \ - NIPQUAD((tp)->dst.ip), ntohs((tp)->dst.u.all)) + NIPQUAD((tp)->src.ip), ntohl((tp)->src.u.all), \ + NIPQUAD((tp)->dst.ip), ntohl((tp)->dst.u.all)) + +#define DUMP_TUPLE_RAW(x) \ + DEBUGP("tuple %p: %u %u.%u.%u.%u:0x%08x -> %u.%u.%u.%u:0x%08x\n",\ + (x), (x)->dst.protonum, \ + NIPQUAD((x)->src.ip), ntohl((x)->src.u.all), \ + NIPQUAD((x)->dst.ip), ntohl((x)->dst.u.all)) #define CTINFO2DIR(ctinfo) ((ctinfo) >= IP_CT_IS_REPLY ? 
IP_CT_DIR_REPLY : IP_CT_DIR_ORIGINAL) diff --git a/include/linux/reboot.h b/include/linux/reboot.h index d60fafc8b..5460e94a1 100644 --- a/include/linux/reboot.h +++ b/include/linux/reboot.h @@ -51,6 +51,8 @@ extern void machine_restart(char *cmd); extern void machine_halt(void); extern void machine_power_off(void); +extern void machine_shutdown(void); + #endif #endif /* _LINUX_REBOOT_H */ diff --git a/include/linux/sched.h b/include/linux/sched.h index dd5005295..eda93cb65 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -607,7 +607,6 @@ struct task_struct { spinlock_t ckrm_tsklock; void *ce_data; #ifdef CONFIG_CKRM_TYPE_TASKCLASS - // .. Hubertus should change to CONFIG_CKRM_TYPE_TASKCLASS struct ckrm_task_class *taskclass; struct list_head taskclass_link; #ifdef CONFIG_CKRM_CPU_SCHEDULE diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 111bb7367..5156e432d 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -1106,6 +1106,20 @@ extern void skb_copy_and_csum_dev(const struct sk_buff *skb, u8 *to); extern void skb_split(struct sk_buff *skb, struct sk_buff *skb1, const u32 len); +static inline void *skb_header_pointer(const struct sk_buff *skb, int offset, + int len, void *buffer) +{ + int hlen = skb_headlen(skb); + + if (offset + len <= hlen) + return skb->data + offset; + + if (skb_copy_bits(skb, offset, buffer, len) < 0) + return NULL; + + return buffer; +} + extern void skb_init(void); extern void skb_add_mtu(int mtu); diff --git a/init/Kconfig b/init/Kconfig index 64ca2fcb7..5d28bb7df 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -214,6 +214,18 @@ config CKRM_MEM_LRUORDER_CHANGE Changing this to yes reduces the checking overhead but violates the approximate LRU order that is maintained by the paging subsystem. +config CKRM_CPU_SCHEDULE_AT_BOOT + bool "Turn on at boot time" + depends on CKRM_CPU_SCHEDULE + default n + help + Enable CKRM CPU Scheduler at boot time. Otherwise + it can be turned on dynamically at runtime. If not + turned on the default Linux Scheduler behavior + will be obtained. 
+ + Say N if unsure, Y to use this feature + config CKRM_TYPE_SOCKETCLASS bool "Class Manager for socket groups" depends on CKRM diff --git a/kernel/.cvsignore b/kernel/.cvsignore new file mode 100644 index 000000000..21426e906 --- /dev/null +++ b/kernel/.cvsignore @@ -0,0 +1,2 @@ +config_data.gz +config_data.h diff --git a/kernel/Makefile b/kernel/Makefile index ec5001052..455ec1eae 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -23,6 +23,7 @@ obj-$(CONFIG_MODULE_SIG) += module-verify.o obj-$(CONFIG_KALLSYMS) += kallsyms.o obj-$(CONFIG_PM) += power/ obj-$(CONFIG_BSD_PROCESS_ACCT) += acct.o +obj-$(CONFIG_KEXEC) += kexec.o obj-$(CONFIG_COMPAT) += compat.o obj-$(CONFIG_IKCONFIG) += configs.o obj-$(CONFIG_IKCONFIG_PROC) += configs.o diff --git a/kernel/ckrm/Makefile b/kernel/ckrm/Makefile index b32530977..4956dcb3a 100644 --- a/kernel/ckrm/Makefile +++ b/kernel/ckrm/Makefile @@ -8,6 +8,6 @@ endif obj-$(CONFIG_CKRM_TYPE_TASKCLASS) += ckrm_tc.o obj-$(CONFIG_CKRM_RES_NUMTASKS) += ckrm_numtasks.o obj-$(CONFIG_CKRM_TYPE_SOCKETCLASS) += ckrm_sockc.o - obj-$(CONFIG_CKRM_RES_LISTENAQ) += ckrm_laq.o + obj-$(CONFIG_CKRM_RES_LISTENAQ) += ckrm_listenaq.o obj-$(CONFIG_CKRM_CPU_SCHEDULE) += ckrm_cpu_class.o ckrm_cpu_monitor.o obj-$(CONFIG_CKRM_RES_MEM) += ckrm_mem.o diff --git a/kernel/ckrm/ckrm.c b/kernel/ckrm/ckrm.c index f1cfb268c..e732fdf53 100644 --- a/kernel/ckrm/ckrm.c +++ b/kernel/ckrm/ckrm.c @@ -82,6 +82,7 @@ inline unsigned int is_res_regd(struct ckrm_classtype *clstype, int resid) ); } +static struct ckrm_res_ctlr *ckrm_resctlr_lookup(struct ckrm_classtype *clstype, const char *resname) { @@ -101,10 +102,8 @@ struct ckrm_res_ctlr *ckrm_resctlr_lookup(struct ckrm_classtype *clstype, return NULL; } -EXPORT_SYMBOL(ckrm_resctlr_lookup); - /* given a classname return the class handle and its classtype*/ -void *ckrm_classobj(char *classname, int *classTypeID) +void *ckrm_classobj(const char *classname, int *classTypeID) { int i; @@ -864,7 +863,10 @@ int ckrm_class_show_shares(struct ckrm_core_class *core, struct seq_file *seq) atomic_inc(&clstype->nr_resusers[i]); rcbs = clstype->res_ctlrs[i]; if (rcbs && rcbs->get_share_values) { - (*rcbs->get_share_values) (core->res_class[i], &shares); + int rc = (*rcbs->get_share_values)(core->res_class[i], + &shares); + if (rc == -ENOSYS) + continue; seq_printf(seq,"res=%s,guarantee=%d,limit=%d," "total_guarantee=%d,max_limit=%d\n", rcbs->res_name, shares.my_guarantee, diff --git a/kernel/ckrm/ckrm_cpu_class.c b/kernel/ckrm/ckrm_cpu_class.c index 917875b18..1bf482f21 100644 --- a/kernel/ckrm/ckrm_cpu_class.c +++ b/kernel/ckrm/ckrm_cpu_class.c @@ -22,9 +22,35 @@ #include #include #include +#include + +#define CPU_CTRL_NAME "cpu" struct ckrm_res_ctlr cpu_rcbs; +#define CKRM_CPU_USAGE_DETAIL_MAX 3 +static int usage_detail = 3; /* 0: show usage + * 1: show settings + * 2: show effectives + * 3: show per runqueue stats + */ + +static int ckrm_cpu_set_mode(enum ckrm_sched_mode mode); + +/* + * update effective share setting after: + * -- remove class + * -- change class share + * we don't need to call update_effectives() when add new class since + * the defaults grt of new class is 0 + * CAUTION: might need a lock here + */ +static inline void update_class_effectives(void) +{ + // update_effectives(); + ckrm_cpu_monitor(0); +} + /** * insert_cpu_class - insert a class to active_cpu_class list * @@ -38,49 +64,81 @@ static inline void insert_cpu_class(struct ckrm_cpu_class *cls) /* * initialize a class object and its local queues */ + +CVT_t 
get_min_cvt_locking(int cpu); +ckrm_lrq_t *rq_get_dflt_lrq(int cpu); + +static void init_cpu_class_lrq(struct ckrm_cpu_class *cls, + int cpu, int isdflt) +{ + int j,k; + ckrm_lrq_t *queue = cls->local_queues[cpu]; + + queue->active = queue->arrays; + queue->expired = queue->arrays+1; + + for (j = 0; j < 2; j++) { + prio_array_t *array = queue->arrays + j; + for (k = 0; k < MAX_PRIO; k++) { + INIT_LIST_HEAD(array->queue + k); + __clear_bit(k, array->bitmap); + } + // delimiter for bitsearch + __set_bit(MAX_PRIO, array->bitmap); + array->nr_active = 0; + } + + queue->expired_timestamp = 0; + queue->best_expired_prio = MAX_PRIO; + + queue->cpu_class = cls; + queue->classqueue = get_cpu_classqueue(cpu); + queue->top_priority = MAX_PRIO; + cq_node_init(&queue->classqueue_linkobj); + queue->local_cvt = isdflt ? 0 : get_min_cvt_locking(cpu); + queue->lrq_load = 0; + queue->local_weight = cpu_class_weight(cls); + if (queue->local_weight == 0) + queue->local_weight = 1; + queue->over_weight = 0; + queue->skewed_weight = CKRM_MAX_WEIGHT/2; /*otherwise class might starve on start*/ + queue->uncounted_ns = 0; + queue->savings = 0; + queue->magic = CKRM_LRQ_MAGIC; +} + void init_cpu_class(struct ckrm_cpu_class *cls,ckrm_shares_t* shares) { - int i,j,k; - prio_array_t *array; - ckrm_lrq_t* queue; + int i; + int isdflt; + struct ckrm_cpu_class *dfltcls; + + dfltcls = get_default_cpu_class(); + + isdflt = (cls==dfltcls); cls->shares = *shares; cls->cnt_lock = SPIN_LOCK_UNLOCKED; - ckrm_cpu_stat_init(&cls->stat); + ckrm_cpu_stat_init(&cls->stat,isdflt ? CKRM_SHARE_MAX : 1); ckrm_usage_init(&cls->usage); cls->magic = CKRM_CPU_CLASS_MAGIC; - for (i = 0 ; i < NR_CPUS ; i++) { - queue = &cls->local_queues[i]; - queue->active = queue->arrays; - queue->expired = queue->arrays+1; - - for (j = 0; j < 2; j++) { - array = queue->arrays + j; - for (k = 0; k < MAX_PRIO; k++) { - INIT_LIST_HEAD(array->queue + k); - __clear_bit(k, array->bitmap); - } - // delimiter for bitsearch - __set_bit(MAX_PRIO, array->bitmap); - array->nr_active = 0; + memset(cls->local_queues,0,NR_CPUS*sizeof(ckrm_lrq_t*)); + + if (isdflt) { + for (i=0; i< NR_CPUS; i++) { + cls->local_queues[i] = rq_get_dflt_lrq(i); + init_cpu_class_lrq(cls,i,1); + } + } else { + for_each_cpu(i) { + cls->local_queues[i] = kmalloc(sizeof(ckrm_lrq_t), + GFP_KERNEL); + BUG_ON(cls->local_queues[i]==NULL); + init_cpu_class_lrq(cls,i,0); } - - queue->expired_timestamp = 0; - - queue->cpu_class = cls; - queue->classqueue = get_cpu_classqueue(i); - queue->top_priority = MAX_PRIO; - cq_node_init(&queue->classqueue_linkobj); - queue->local_cvt = 0; - queue->lrq_load = 0; - queue->local_weight = cpu_class_weight(cls); - queue->uncounted_ns = 0; - queue->savings = 0; - queue->magic = 0x43FF43D7; } - // add to class list write_lock(&class_list_lock); insert_cpu_class(cls); write_unlock(&class_list_lock); @@ -100,14 +158,14 @@ struct ckrm_cpu_class * ckrm_get_cpu_class(struct ckrm_core_class *core) { struct ckrm_cpu_class * cls; cls = ckrm_get_res_class(core, cpu_rcbs.resid, struct ckrm_cpu_class); - if (valid_cpu_class(cls)) - return cls; + if (valid_cpu_class(cls)) + return (ckrm_cpu_enabled() ? 
cls : get_default_cpu_class()); else return NULL; } - -void* ckrm_alloc_cpu_class(struct ckrm_core_class *core, struct ckrm_core_class *parent) +void* ckrm_alloc_cpu_class(struct ckrm_core_class *core, + struct ckrm_core_class *parent) { struct ckrm_cpu_class *cls; @@ -128,7 +186,7 @@ void* ckrm_alloc_cpu_class(struct ckrm_core_class *core, struct ckrm_core_class set_default_share(&shares); init_cpu_class(cls,&shares); cls->core = core; - cls->parent = parent; + cls->parent = parent; } } else printk(KERN_ERR"alloc_cpu_class failed\n"); @@ -136,15 +194,14 @@ void* ckrm_alloc_cpu_class(struct ckrm_core_class *core, struct ckrm_core_class return cls; } -/* - * hzheng: this is not a stable implementation - * need to check race condition issue here - */ +void ckrm_cpu_class_queue_delete_sync(struct ckrm_cpu_class *clsptr); + static void ckrm_free_cpu_class(void *my_res) { struct ckrm_cpu_class *cls = my_res, *parres, *childres; ckrm_core_class_t *child = NULL; int maxlimit; + int i; if (!cls) return; @@ -179,10 +236,19 @@ static void ckrm_free_cpu_class(void *my_res) list_del(&cls->links); write_unlock(&class_list_lock); + ckrm_cpu_class_queue_delete_sync(cls); + + for_each_cpu(i) { + ckrm_lrq_t *lrq = get_ckrm_lrq(cls,i); + if (!lrq) continue; + lrq->magic = -99; + kfree(lrq); + } kfree(cls); - //call ckrm_cpu_monitor after class removed - ckrm_cpu_monitor(0); + //call ckrm_cpu_monitor after class is removed + if (ckrm_cpu_enabled()) + update_class_effectives(); } /* @@ -194,8 +260,12 @@ int ckrm_cpu_set_share(void *my_res, struct ckrm_shares *new_share) struct ckrm_shares *cur = &cls->shares, *par; int rc = -EINVAL; - if (!cls) - return rc; + if (ckrm_cpu_disabled()) + return -ENOSYS; + if (!cls) + return rc; + if (new_share->total_guarantee > CKRM_SHARE_MAX) + return -E2BIG; if (cls->parent) { parres = ckrm_get_cpu_class(cls->parent); @@ -215,7 +285,7 @@ int ckrm_cpu_set_share(void *my_res, struct ckrm_shares *new_share) new_share->my_guarantee = 0; rc = set_shares(new_share, cur, par); - if (cur->my_limit == CKRM_SHARE_DONTCARE) + if (!rc && cur->my_limit == CKRM_SHARE_DONTCARE) cur->my_limit = cur->max_limit; @@ -225,7 +295,7 @@ int ckrm_cpu_set_share(void *my_res, struct ckrm_shares *new_share) } //call ckrm_cpu_monitor after changes are changed - ckrm_cpu_monitor(0); + update_class_effectives(); return rc; } @@ -235,22 +305,90 @@ static int ckrm_cpu_get_share(void *my_res, { struct ckrm_cpu_class *cls = my_res; - if (!cls) + if (ckrm_cpu_disabled()) + return -ENOSYS; + if (!cls) return -EINVAL; + *shares = cls->shares; return 0; } +/* + * get_ckrm_usage(): + * obtain a sequence of usage informations + * returns number of usages reported. + * + * report IN: specifies the sequence of jiffies for which to report + * must be ordered (smallest first) + * OUT: returns the usage in each field + * + */ + + +int ckrm_cpu_get_usage(struct ckrm_cpu_class* clsptr, + int num, ulong report[]) +{ + struct ckrm_usage* usage = &clsptr->usage; + unsigned long long total = 0; + int i, idx, cur, num_ofs; + + num_ofs = cur = i = 0; + idx = usage->sample_pointer; + + for ( num_ofs = 0; num_ofs < num ; num_ofs++ ) { + int nr_samples; + int duration = report[num_ofs]; + unsigned long long totval = 0; + + nr_samples = duration/USAGE_SAMPLE_FREQ?:1; + + if (nr_samples > USAGE_MAX_HISTORY) + nr_samples = USAGE_MAX_HISTORY; + + for ( ; i< nr_samples; i++) { + if (! 
idx) + idx = USAGE_MAX_HISTORY; + idx --; + total += usage->samples[idx]; + } + totval = total * 1000; + do_div(totval,NS_PER_SAMPLE); + do_div(totval,nr_samples * cpus_weight(cpu_online_map)); + report[num_ofs] = totval; + } + + return num; +} + int ckrm_cpu_get_stats(void *my_res, struct seq_file * sfile) { struct ckrm_cpu_class *cls = my_res; struct ckrm_cpu_class_stat* stat = &cls->stat; ckrm_lrq_t* lrq; int i; + ulong usage[3] = { 2*HZ, 10*HZ, 60*HZ }; - if (!cls) + if (!cls || ckrm_cpu_disabled()) return -EINVAL; + ckrm_cpu_get_usage(cls,3,usage); + + /* this will after full stabilization become the only cpu usage stats + */ + + seq_printf(sfile, "cpu-usage(2,10,60)= %lu %lu %lu\n", + usage[0],usage[1],usage[2]); + + if (usage_detail < 1) + return 0; + + /* the extended statistics we can decide whether we want to make the + * additional statistics available over config options + * eitherway they should be reported in a more concised form + * during stabilization, this is OK + */ + seq_printf(sfile, "-------- CPU Class Status Start---------\n"); seq_printf(sfile, "Share:\n\tgrt= %d limit= %d total_grt= %d max_limit= %d\n", cls->shares.my_guarantee, @@ -261,26 +399,35 @@ int ckrm_cpu_get_stats(void *my_res, struct seq_file * sfile) cls->shares.unused_guarantee, cls->shares.cur_max_limit); + if (usage_detail < 2) + goto out; + seq_printf(sfile, "Effective:\n\tegrt= %d\n",stat->egrt); seq_printf(sfile, "\tmegrt= %d\n",stat->megrt); seq_printf(sfile, "\tehl= %d\n",stat->ehl); seq_printf(sfile, "\tmehl= %d\n",stat->mehl); seq_printf(sfile, "\teshare= %d\n",stat->eshare); - seq_printf(sfile, "\tmeshare= %d\n",cpu_class_weight(cls)); + seq_printf(sfile, "\tmeshare= %d\n",stat->meshare); seq_printf(sfile, "\tmax_demand= %lu\n",stat->max_demand); seq_printf(sfile, "\ttotal_ns= %llu\n",stat->total_ns); - seq_printf(sfile, "\tusage(2,10,60)= %d %d %d\n", - get_ckrm_usage(cls,2*HZ), - get_ckrm_usage(cls,10*HZ), - get_ckrm_usage(cls,60*HZ) - ); + seq_printf(sfile, "\tusage(2,10,60)= %lu %lu %lu\n", + usage[0],usage[1],usage[2]); + + if (usage_detail < 3) + goto out; + + /* provide per run queue information */ for_each_online_cpu(i) { lrq = get_ckrm_lrq(cls,i); - seq_printf(sfile, "\tlrq %d demand= %lu weight= %d lrq_load= %lu cvt= %llu sav= %llu\n",i,stat->local_stats[i].cpu_demand,local_class_weight(lrq),lrq->lrq_load,lrq->local_cvt,lrq->savings); + seq_printf(sfile, "\tlrq %d demand= %lu weight= %d " + "lrq_load= %lu cvt= %llu sav= %llu\n", + i,stat->local_stats[i].cpu_demand, + local_class_weight(lrq),lrq->lrq_load, + lrq->local_cvt,lrq->savings); } +out: seq_printf(sfile, "-------- CPU Class Status END ---------\n"); - return 0; } @@ -296,10 +443,34 @@ void ckrm_cpu_change_class(void *task, void *old, void *new) if (!task || ! 
old || !new) return; + if (ckrm_cpu_disabled()) + newcls = get_default_cpu_class(); _ckrm_cpu_change_class(tsk,newcls); } -/*dummy function, not used*/ +enum config_token_t { + config_usage_detail, /* define usage level */ + config_disable, /* always use default linux scheduling */ + /* effectively disables the ckrm scheduler */ + config_enable, /* always uses ckrm scheduling behavior */ + config_err /* parsing error */ +}; + +#define CKRM_SCHED_MODE_DISABLED_STR "disabled" +#define CKRM_SCHED_MODE_ENABLED_STR "enabled" + +static char *ckrm_sched_mode_str[] = { + CKRM_SCHED_MODE_DISABLED_STR, + CKRM_SCHED_MODE_ENABLED_STR +}; + +static match_table_t config_tokens = { + { config_disable, "mode="CKRM_SCHED_MODE_DISABLED_STR }, + { config_enable, "mode="CKRM_SCHED_MODE_ENABLED_STR }, + { config_usage_detail, "usage_detail=%u" }, + { config_err, NULL } +}; + static int ckrm_cpu_show_config(void *my_res, struct seq_file *sfile) { struct ckrm_cpu_class *cls = my_res; @@ -307,23 +478,61 @@ static int ckrm_cpu_show_config(void *my_res, struct seq_file *sfile) if (!cls) return -EINVAL; - seq_printf(sfile, "cls=%s,parameter=somevalue\n","ckrm_cpu class"); + seq_printf(sfile, "res=%s,mode=%s", + CPU_CTRL_NAME,ckrm_sched_mode_str[ckrm_sched_mode]); + if (!ckrm_cpu_disabled()) /* enabled || mixed */ + seq_printf(sfile, ",usage_detail=%u",usage_detail); + seq_printf(sfile,"\n"); return 0; } -/*dummy function, not used*/ static int ckrm_cpu_set_config(void *my_res, const char *cfgstr) { struct ckrm_cpu_class *cls = my_res; + char *p; + char **cfgstr_p = (char**)&cfgstr; + substring_t args[MAX_OPT_ARGS]; + int option,rc; + enum ckrm_sched_mode new_sched_mode; if (!cls) return -EINVAL; - printk(KERN_DEBUG "ckrm_cpu config='%s'\n",cfgstr); - return 0; + + new_sched_mode = ckrm_sched_mode; + rc = 0; + + while ((p = strsep(cfgstr_p, ",")) != NULL) { + int token; + if (!*p) + continue; + + token = match_token(p, config_tokens, args); + switch (token) { + case config_usage_detail: + if (ckrm_cpu_disabled() || + (match_int(&args[0], &option)) || + (option > CKRM_CPU_USAGE_DETAIL_MAX)) + { + return -EINVAL; + } + usage_detail = option; + break; + case config_disable: + new_sched_mode = CKRM_SCHED_MODE_DISABLED; + break; + case config_enable: + new_sched_mode = CKRM_SCHED_MODE_ENABLED; + break; + case config_err: + return -EINVAL; + } + } + rc = ckrm_cpu_set_mode(new_sched_mode); + return rc; } struct ckrm_res_ctlr cpu_rcbs = { - .res_name = "cpu", + .res_name = CPU_CTRL_NAME, .res_hdepth = 1, .resid = -1, .res_alloc = ckrm_alloc_cpu_class, @@ -364,14 +573,69 @@ void init_cpu_classes(void) //init classqueues for each processor for (i=0; i < NR_CPUS; i++) - classqueue_init(get_cpu_classqueue(i)); + classqueue_init(get_cpu_classqueue(i),ckrm_cpu_enabled()); - /* - * hzheng: initialize the default cpu class - * required for E14/E15 since ckrm_init is called after sched_init - */ ckrm_alloc_cpu_class(NULL,NULL); } +void ckrm_cpu_class_queue_update(int on); +void ckrm_cpu_start_monitor(void); +void ckrm_cpu_kill_monitor(void); + +static int ckrm_cpu_set_mode(enum ckrm_sched_mode mode) +{ + struct task_struct *proc, *tsk; + struct ckrm_cpu_class *new_cls = NULL; + int i; + + if (mode == ckrm_sched_mode) + return 0; + + printk("ckrm_cpu_set_mode from <%s> to <%s> pid=%d\n", + ckrm_sched_mode_str[ckrm_sched_mode], + ckrm_sched_mode_str[mode], + current->pid); + + if (mode == CKRM_SCHED_MODE_DISABLED) { + ckrm_cpu_kill_monitor(); + new_cls = get_default_cpu_class(); + } else { + ckrm_cpu_class_queue_update(1); + } + + /* 
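Flipping the scheduling mode at run time means every existing task has to be moved to the matching class (the default class when disabling, the task's own CPU class when re-enabling), so we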
run twice through the list to catch everyone, + * current and transient once + */ + + read_lock(&tasklist_lock); + + ckrm_sched_mode = mode; + /* we have to run through the list twice + * first catch all existing tasks + * and then deal with some potential race condition + */ + for ( i=2 ; i-- ; ) { + /* lock class_list_lock ? */ + + do_each_thread(proc, tsk) { + if (mode == CKRM_SCHED_MODE_ENABLED) { + new_cls = ckrm_get_res_class(class_core(tsk->taskclass), + cpu_rcbs.resid, + struct ckrm_cpu_class); + } + _ckrm_cpu_change_class(tsk,new_cls); + } while_each_thread(proc, tsk); + } + read_unlock(&tasklist_lock); + + if (mode == CKRM_SCHED_MODE_DISABLED) + ckrm_cpu_class_queue_update(0); + else + ckrm_cpu_start_monitor(); + return 0; +} EXPORT_SYMBOL(ckrm_get_cpu_class); + + + diff --git a/kernel/ckrm/ckrm_cpu_monitor.c b/kernel/ckrm/ckrm_cpu_monitor.c index d8c199a20..d8d6bd307 100644 --- a/kernel/ckrm/ckrm_cpu_monitor.c +++ b/kernel/ckrm/ckrm_cpu_monitor.c @@ -28,21 +28,30 @@ #include #include +// #define CONFIG_CKRM_SUPPORT_MAXLIMITS + #define CPU_MONITOR_INTERVAL (HZ) /*how often do we adjust the shares*/ -#define CKRM_SHARE_MAX (1<shares.unused_guarantee; +} + static inline int get_soft_limit(struct ckrm_cpu_class *cls) { return cls->shares.my_limit; @@ -63,6 +72,57 @@ static inline int get_myhard_limit(struct ckrm_cpu_class *cls) return cls->shares.total_guarantee; } +static inline void set_eshare(struct ckrm_cpu_class_stat *stat, + int new_share) +{ + if (!new_share) + new_share = 1; + + BUG_ON(new_share < 0); + stat->eshare = new_share; +} + +static inline void set_meshare(struct ckrm_cpu_class_stat *stat, + int new_share) +{ + if (!new_share) + new_share = 1; + + BUG_ON(new_share < 0); + stat->meshare = new_share; +} + +/** + *get_self_cpu_demand - get cpu demand of the class itself (excluding children) + * + * self_cpu_demand = sum(cpu demand of all local queues) + */ +static inline unsigned long get_self_cpu_demand(struct ckrm_cpu_class_stat *stat) +{ + int cpu_demand = 0; + int i; + int cpuonline = 0; + + for_each_online_cpu(i) { + cpu_demand_check_sleep(stat,i); + cpu_demand += stat->local_stats[i].cpu_demand; + cpuonline ++; + } + + return (cpu_demand/cpuonline); +} + +/* + * my max demand = min(cpu_demand, my effective hard limit) + */ +static inline unsigned long get_mmax_demand(struct ckrm_cpu_class_stat* stat) +{ + unsigned long mmax_demand = get_self_cpu_demand(stat); + if (mmax_demand > stat->mehl) + mmax_demand = stat->mehl; + + return mmax_demand; +} static inline void cpu_demand_stat_init(struct ckrm_cpu_demand_stat* local_stat, int type) { @@ -85,7 +145,7 @@ static inline void cpu_demand_stat_init(struct ckrm_cpu_demand_stat* local_stat, } } -void ckrm_cpu_stat_init(struct ckrm_cpu_class_stat *stat) +void ckrm_cpu_stat_init(struct ckrm_cpu_class_stat *stat, int eshares) { int i; @@ -93,7 +153,7 @@ void ckrm_cpu_stat_init(struct ckrm_cpu_class_stat *stat) stat->total_ns = 0; stat->max_demand = 0; - for (i=0; i< NR_CPUS; i++) { + for (i=0; ilocal_stats[i],CPU_DEMAND_TP_CLASS); } @@ -102,10 +162,517 @@ void ckrm_cpu_stat_init(struct ckrm_cpu_class_stat *stat) stat->ehl = CKRM_SHARE_MAX; /*default: no limit*/ stat->mehl = CKRM_SHARE_MAX; /*default: no limit */ - stat->eshare = CKRM_SHARE_MAX; - stat->meshare = CKRM_SHARE_MAX; + stat->eshare = eshares; + stat->meshare = eshares; + + stat->has_savings = 0; + stat->demand_per_share = 0; + +} + +#if 0 // keep handy for debugging if necessary +void ckrm_cpu_class_dump(struct ckrm_cpu_class *clsptr,int num) +{ + struct 
ckrm_cpu_class_stat* stat = &clsptr->stat; + printk("%d> %p[%d] mg=%d lim=%d tg=%d maxlim=%d ug=%d\n",num, + clsptr, (clsptr == get_default_cpu_class()), + clsptr->shares.my_guarantee, + clsptr->shares.my_limit, + clsptr->shares.total_guarantee, + clsptr->shares.max_limit, + clsptr->shares.unused_guarantee); + printk(" egrt=%d megrt=%d ehl=%d mehl=%d esh=%d mesh=%d\n", + stat->egrt,stat->megrt,stat->ehl,stat->mehl, + stat->eshare,stat->meshare); +} +#endif + +/**********************************************/ +/* surplus allocation */ +/**********************************************/ + +/* + * surplus = egrt - demand + * if surplus < 0, surplus = 0 + */ +static inline int get_node_surplus(struct ckrm_cpu_class *cls) +{ + int surplus = cls->stat.egrt - cls->stat.max_demand; + + if (surplus < 0) + surplus = 0; + + return surplus; +} + +/* + * consume savings in advance because this class give surplus to others + * this is a quick hack, should be integrated with balance_savings() + */ +static inline void consumed_surplus_savings(struct ckrm_cpu_class *clsptr, + int savings_consumed) +{ + long long total_savings; + ckrm_lrq_t* lrq; + int i; + int cpu_online = 0; + + total_savings = 0; + for_each_online_cpu(i) { + lrq = get_ckrm_lrq(clsptr,i); + total_savings += lrq->savings; + cpu_online ++; + } + + total_savings -= savings_consumed; + if (total_savings < 0) + total_savings = 0; + + //get the average savings + do_div(total_savings,cpu_online); + for_each_online_cpu(i) { + lrq = get_ckrm_lrq(clsptr,i); + lrq->savings = total_savings; + } +} + +static inline int get_my_node_surplus(struct ckrm_cpu_class *cls) +{ + int surplus = cls->stat.megrt - get_mmax_demand(&cls->stat); + int savings_consumed; + + if (surplus < 0) + surplus = 0; + + /* + * a quick hack about the hierarchy savings distribution + * may not be the right way to do + * + * since this node give its surplus to other nodes, + * it's savings should be consumed + * suppose CPU_MONITOR_INTERVAL = (HZ) + * savings_consumed is roughly how much savings will be consumed for the next second + */ + if (surplus) { + savings_consumed = surplus * HZ * (NSEC_PER_MS >> CKRM_SHARE_SHIFT); + consumed_surplus_savings(cls, savings_consumed) ; + } + + return surplus; +} + +/* + * all the class in the queue consume the surplus in order + * each class consume the amount propotional to its egrt + */ +static int consume_surplus_in_order(struct list_head* queue, + struct ckrm_cpu_class *p_cls, + int total_surplus) +{ + int total_grt = 0; + struct ckrm_cpu_class *clsptr; + + /* + * get total_grt of the classes in the queue + * total_grt can be maintained instead of re-calcuated each time + */ + list_for_each_entry(clsptr,queue,surplus_queue) { + if (unlikely(clsptr == p_cls)) + total_grt += clsptr->stat.megrt; + else + total_grt += clsptr->stat.egrt; + } + + if (! total_grt) + goto consume_out; + + //allocate in order + list_for_each_entry(clsptr,queue,surplus_queue) { + int surplus_per_share; + int consumed, my_grt; + + BUG_ON(! 
total_grt); + surplus_per_share = + (total_surplus << CKRM_SHARE_SHIFT) / total_grt; + + if (surplus_per_share <= 0) + break; + + if (unlikely(clsptr == p_cls)) //self_node consuming + my_grt = clsptr->stat.megrt; + else + my_grt = clsptr->stat.egrt; + + BUG_ON(clsptr->stat.demand_per_share <= 0); + + if (clsptr->stat.demand_per_share < surplus_per_share) + surplus_per_share = clsptr->stat.demand_per_share; + + consumed = surplus_per_share * my_grt; + consumed >>= CKRM_SHARE_SHIFT; + total_surplus -= consumed; + BUG_ON(total_surplus < 0); + total_grt -= my_grt; + + if (unlikely(clsptr == p_cls)) + set_meshare(&clsptr->stat,clsptr->stat.meshare + consumed); + else + set_eshare(&clsptr->stat,clsptr->stat.eshare + consumed); + } + consume_out: + if (total_surplus <= 1) //if total_suplus too small, no need to allocate again + total_surplus = 0; + return total_surplus; +} + +/* + * link all the children of parent and the parent itself using their surplus_queue field + * link the whole queue using src_queue + * if anything wrong return -1 + */ +static int get_class_surplus_queue(struct ckrm_core_class *parent, + struct list_head* src_queue) +{ + struct ckrm_core_class *child_core = NULL; + struct ckrm_cpu_class *p_cls,*c_cls; + int ret = -1; + + p_cls = ckrm_get_cpu_class(parent); + if (! p_cls) + goto link_out; + + INIT_LIST_HEAD(src_queue); + + //add the parent node itself + list_add(&p_cls->surplus_queue,src_queue); + do { + child_core = ckrm_get_next_child(parent, child_core); + if (child_core) { + c_cls = ckrm_get_cpu_class(child_core); + if (! c_cls) + goto link_out; + list_add(&c_cls->surplus_queue,src_queue); + } + } while (child_core); + + ret = 0; + + link_out: + return ret; +} + +/* + * insert the class to queue based on stat->demand_per_share + * status: tested + */ +static void insert_surplus_queue(struct list_head* queue, struct ckrm_cpu_class *clsptr) +{ + struct ckrm_cpu_class *cur_cls = NULL; + int end_of_queue = 1; + + list_for_each_entry(cur_cls,queue,surplus_queue) { + if (cur_cls->stat.demand_per_share >= clsptr->stat.demand_per_share) { + end_of_queue = 0; + break; + } + } + + //insert the clsptr + if (! cur_cls || end_of_queue) + list_add_tail(&clsptr->surplus_queue,queue); + else + list_add_tail(&clsptr->surplus_queue,&cur_cls->surplus_queue); +} + +/* + * copy all classes in src_queue to dst_queue, + * reorder the classes based on their normalized demand + * if a class already saturate (eshare >= demand), also remove it from src_queue + * return the total guarantee of the selected classes + * + * @src_queue: source queue + * @dst_queue: destination queue + * @check_sl: check soft limit + * @check_savings: only class has savings should be considered + */ + +static unsigned long reorder_surplus_queue(struct list_head* src_queue, + struct list_head* dst_queue, + int check_sl, int check_savings, + struct ckrm_cpu_class *p_cls) +{ + struct ckrm_cpu_class *clsptr, *tmp; + + INIT_LIST_HEAD(dst_queue); + + list_for_each_entry_safe(clsptr,tmp,src_queue,surplus_queue) { + struct ckrm_cpu_class_stat* stat = &clsptr->stat; + int inc_limit; + int max_demand, eshare, esl,grt; + + if (unlikely(clsptr == p_cls)) { + max_demand = get_mmax_demand(stat); + eshare = stat->meshare; + esl = get_mysoft_limit(clsptr); + grt = stat->megrt; + } else { + max_demand = stat->max_demand; + eshare = stat->eshare; + esl = get_soft_limit(clsptr); + grt = stat->egrt; + } + + //hard limit and demand limit + inc_limit = max_demand - eshare; + + //no additional share needed + if (inc_limit <= 0 || ! 
grt) { + list_del(&clsptr->surplus_queue); + continue; + } + + //or no more savings + if (check_savings && ! stat->has_savings) + continue; + + //check soft limit + if (check_sl) { + int soft_limit; + + soft_limit = p_cls->stat.eshare * esl + / p_cls->shares.total_guarantee; + + if (soft_limit < max_demand) + inc_limit = soft_limit - eshare; + if ( inc_limit <= 0) /* can turn negative */ + continue; + } + + BUG_ON(! grt); + //get the stat->demand_per_share + stat->demand_per_share = + (inc_limit << CKRM_SHARE_SHIFT) / grt; + + list_del_init(&clsptr->surplus_queue); + //insert the class to the queue + insert_surplus_queue(dst_queue,clsptr); + } + return 0; +} + +/* + * get all the surplus that should be reallocated to the children + */ +static inline int get_total_surplus(struct ckrm_cpu_class *p_cls, + struct ckrm_core_class *parent) +{ + struct ckrm_cpu_class *c_cls; + int total_surplus; + struct ckrm_core_class *child_core = NULL; + + //additional share assigned to this sub node from parent + total_surplus = p_cls->stat.eshare - p_cls->stat.egrt; + BUG_ON(total_surplus < 0); + + //surplus of this node + total_surplus += get_my_node_surplus(p_cls); + do { + child_core = ckrm_get_next_child(parent, child_core); + if (child_core) { + c_cls = ckrm_get_cpu_class(child_core); + if (! c_cls) { + total_surplus = 0; + break; + } + + total_surplus += get_node_surplus(c_cls); + } + } while (child_core); + + return total_surplus; +} +/** + * alloc_surplus_node: re-allocate the shares for a single level + * @parent: parent node + * return the remaining surplus + * + * The surplus reallocation policy is like below. + * -- the classes that have eshare >= demand don't need any additional share. + * So they don't participate the surplus allocation. + * -- all the other classes received share in this order: + * 1. has savings, not over soft limit + * 2. has savings, but over soft limit + * 3. no savings, not over soft limit + * 4. no savings, over soft limit + * + * In each of the 4 levels above, classes get surplus propotionally to its guarantee + */ +static int alloc_surplus_node(struct ckrm_core_class *parent) +{ + struct ckrm_cpu_class *p_cls; + int total_surplus; + int ret = -1; + struct list_head src_queue, dst_queue; + + p_cls = ckrm_get_cpu_class(parent); + if (! p_cls) //safty check + goto realloc_out; + + ret = 0; + total_surplus = get_total_surplus(p_cls,parent); + + if (! total_surplus) //no surplus to be allocated + goto realloc_out; + + /* + * first round, allocated to tasks with savings, check_sl + */ + get_class_surplus_queue(parent,&src_queue); + reorder_surplus_queue(&src_queue, &dst_queue, 1, 1,p_cls); + if (! list_empty(&dst_queue)) { + total_surplus = consume_surplus_in_order(&dst_queue,p_cls,total_surplus); + if (! total_surplus) + goto realloc_out; + } + + /* + * second round, check savings, but no check_sl + */ + //merge the src_queue and dst_queue and reorder + list_splice(&dst_queue, &src_queue); + reorder_surplus_queue(&src_queue, &dst_queue, 0, 1,p_cls); + if (! list_empty(&dst_queue)) { + total_surplus = consume_surplus_in_order(&dst_queue,p_cls,total_surplus); + if (! total_surplus) + goto realloc_out; + } + + /* + * third round, no check savings, but check_sl + */ + //merge the src_queue and dst_queue and reorder + list_splice(&dst_queue, &src_queue); + reorder_surplus_queue(&src_queue, &dst_queue, 1, 0,p_cls); + if (! list_empty(&dst_queue)) { + total_surplus = consume_surplus_in_order(&dst_queue,p_cls,total_surplus); + if (! 
total_surplus) + goto realloc_out; + } + /* + * fourth round, no check savings, no check_sl + */ + //merge the src_queue and dst_queue and reorder + list_splice(&dst_queue, &src_queue); + reorder_surplus_queue(&src_queue, &dst_queue, 0, 0,p_cls); + if (! list_empty(&dst_queue)) + total_surplus = consume_surplus_in_order(&dst_queue,p_cls,total_surplus); + + realloc_out: + return ret; +} + +/* + * return true if the class total savings > MIN_SAVINGS + */ +static int balance_local_savings(struct ckrm_cpu_class *clsptr, int cpu_online) +{ + unsigned long long total_savings; + ckrm_lrq_t* lrq; + int i; +#define CLASS_MIN_SAVINGS (10 * NSEC_PER_MS) + + total_savings = 0; + for_each_online_cpu(i) { + lrq = get_ckrm_lrq(clsptr,i); + total_savings += lrq->savings; + } + + if (total_savings < CLASS_MIN_SAVINGS) + return 0; + + //get the average savings + do_div(total_savings,cpu_online); + for_each_online_cpu(i) { + lrq = get_ckrm_lrq(clsptr,i); + lrq->savings = total_savings; + } + + /* + * hzheng: this is another quick hack + * only say I have savings when this node has more demand + * ignoring the requirement of child classes + */ + if (clsptr->stat.megrt < get_mmax_demand(&clsptr->stat)) + return 1; + else + return 0; +} + +/* + * check savings status + * set has_savings field if the class or its sub class has savings + */ +static void check_savings_status(struct ckrm_core_class *root_core) +{ + struct ckrm_cpu_class *clsptr; + int cpu_online; + + cpu_online = cpus_weight(cpu_online_map); + + //class status: demand, share,total_ns prio, index + list_for_each_entry(clsptr,&active_cpu_classes,links) + clsptr->stat.has_savings = balance_local_savings(clsptr,cpu_online); +} + +/** + * alloc_surplus - reallocate unused shares + * + * class A's usused share should be allocated to its siblings + * the re-allocation goes downward from the top + */ +int alloc_surplus(struct ckrm_core_class *root_core) +{ + struct ckrm_core_class *cur_core, *child_core; + // struct ckrm_cpu_class *cls; + int ret = -1; + + check_savings_status(root_core); + + /*initialize*/ + cur_core = root_core; + child_core = NULL; + // cls = ckrm_get_cpu_class(cur_core); + + /*the ckrm idle tasks get all what's remaining*/ + /*hzheng: uncomment the following like for hard limit support */ + // update_ckrm_idle(CKRM_SHARE_MAX - cls->stat.max_demand); + + repeat: + //check exit + if (!cur_core) + return 0; + + //visit this node only once + if (! 
child_core) + if ( alloc_surplus_node(cur_core) < 0 ) + return ret; + + //next child + child_core = ckrm_get_next_child(cur_core, child_core); + if (child_core) { + //go down + cur_core = child_core; + child_core = NULL; + goto repeat; + } else { //no more child, go back + child_core = cur_core; + cur_core = child_core->hnode.parent; + } + goto repeat; } + + /**********************************************/ /* cpu demand */ /**********************************************/ @@ -134,27 +701,29 @@ void ckrm_cpu_stat_init(struct ckrm_cpu_class_stat *stat) * how often should we recalculate the cpu demand * the number is in ns */ -static inline void update_cpu_demand_stat(struct ckrm_cpu_demand_stat* local_stat,int state, unsigned long long len) +static inline void update_cpu_demand_stat(struct ckrm_cpu_demand_stat* local_stat, + int state, unsigned long long len) { local_stat->total += len; if (state == CKRM_CPU_DEMAND_RUN) local_stat->run += len; if (local_stat->total >= local_stat->recalc_interval) { - local_stat->total >>= CKRM_SHARE_ACCURACY; - if (unlikely(local_stat->run > 0xFFFFFFFF)) - local_stat->run = 0xFFFFFFFF; + local_stat->total >>= CKRM_SHARE_SHIFT; + if (unlikely(local_stat->run > ULONG_MAX)) + local_stat->run = ULONG_MAX; - if (local_stat->total > 0xFFFFFFFF) - local_stat->total = 0xFFFFFFFF; + if (unlikely(local_stat->total > ULONG_MAX)) + local_stat->total = ULONG_MAX; do_div(local_stat->run,(unsigned long)local_stat->total); - if (local_stat->total > 0xFFFFFFFF) //happens after very long sleep + if (unlikely(local_stat->total > ULONG_MAX)) { + //happens after very long sleep local_stat->cpu_demand = local_stat->run; - else { - local_stat->cpu_demand += local_stat->run; - local_stat->cpu_demand >>= 1; + } else { + local_stat->cpu_demand = + (local_stat->cpu_demand + local_stat->run) >> 1; } local_stat->total = 0; local_stat->run = 0; @@ -190,57 +759,25 @@ void cpu_demand_event(struct ckrm_cpu_demand_stat* local_stat, int event, unsign break; default: BUG(); - } -} - -/** - * check all the class local queue - * - * to deal with excessive long run/sleep state - * -- whenever the the ckrm_cpu_monitor is called, check if the class is in sleep state, if yes, then update sleep record - */ -static inline void cpu_demand_check_sleep(struct ckrm_cpu_class_stat *stat, int cpu) -{ - struct ckrm_cpu_demand_stat * local_stat = &stat->local_stats[cpu]; - unsigned long long sleep,now; - if (local_stat->last_sleep) { - now = sched_clock(); - sleep = now - local_stat->last_sleep; - local_stat->last_sleep = now; - update_cpu_demand_stat(local_stat,CKRM_CPU_DEMAND_SLEEP,sleep); - } -} - -/** - *get_self_cpu_demand - get cpu demand of the class itself (excluding children) - * - * self_cpu_demand = sum(cpu demand of all local queues) - */ -static inline unsigned long get_self_cpu_demand(struct ckrm_cpu_class_stat *stat) -{ - int cpu_demand = 0; - int i; - int cpuonline = 0; - - for_each_online_cpu(i) { - cpu_demand_check_sleep(stat,i); - cpu_demand += stat->local_stats[i].cpu_demand; - cpuonline ++; - } - - return (cpu_demand/cpuonline); + } } -/* - * my max demand = min(cpu_demand, my effective hard limit) +/** + * check all the class local queue + * + * to deal with excessive long run/sleep state + * -- whenever the the ckrm_cpu_monitor is called, check if the class is in sleep state, if yes, then update sleep record */ -static inline unsigned long get_mmax_demand(struct ckrm_cpu_class_stat* stat) +void cpu_demand_check_sleep(struct ckrm_cpu_class_stat *stat, int cpu) { - unsigned long 
mmax_demand = get_self_cpu_demand(stat); - if (mmax_demand > stat->mehl) - mmax_demand = stat->mehl; - - return mmax_demand; + struct ckrm_cpu_demand_stat * local_stat = &stat->local_stats[cpu]; + unsigned long long sleep,now; + if (local_stat->last_sleep) { + now = sched_clock(); + sleep = now - local_stat->last_sleep; + local_stat->last_sleep = now; + update_cpu_demand_stat(local_stat,CKRM_CPU_DEMAND_SLEEP,sleep); + } } /** @@ -301,26 +838,6 @@ static int update_max_demand(struct ckrm_core_class *root_core) /**********************************************/ /* effective guarantee & limit */ /**********************************************/ -static inline void set_eshare(struct ckrm_cpu_class_stat *stat, - int new_share) -{ - if (!new_share) - new_share = 1; - - BUG_ON(new_share < 0); - stat->eshare = new_share; -} - -static inline void set_meshare(struct ckrm_cpu_class_stat *stat, - int new_share) -{ - if (!new_share) - new_share = 1; - - BUG_ON(new_share < 0); - stat->meshare = new_share; -} - /** *update_child_effective - update egrt, ehl, mehl for all children of parent *@parent: the parent node @@ -346,7 +863,7 @@ static int update_child_effective(struct ckrm_core_class *parent) p_cls->stat.egrt * c_cls->shares.my_guarantee / p_cls->shares.total_guarantee; - c_cls->stat.megrt = c_cls->stat.egrt * c_cls->shares.unused_guarantee + c_cls->stat.megrt = c_cls->stat.egrt * get_my_grt(c_cls) / c_cls->shares.total_guarantee; c_cls->stat.ehl = @@ -372,8 +889,9 @@ static int update_child_effective(struct ckrm_core_class *parent) * * return -1 if anything wrong happened (eg: the structure changed during the process) */ -static int update_effectives(struct ckrm_core_class *root_core) +int update_effectives(void) { + struct ckrm_core_class *root_core = get_default_cpu_class()->core; struct ckrm_core_class *cur_core, *child_core; struct ckrm_cpu_class *cls; int ret = -1; @@ -384,7 +902,7 @@ static int update_effectives(struct ckrm_core_class *root_core) //initialize the effectives for root cls->stat.egrt = CKRM_SHARE_MAX; /*egrt of the root is always 100% */ - cls->stat.megrt = cls->stat.egrt * cls->shares.unused_guarantee + cls->stat.megrt = cls->stat.egrt * get_my_grt(cls) / cls->shares.total_guarantee; cls->stat.ehl = CKRM_SHARE_MAX * get_hard_limit(cls) / cls->shares.total_guarantee; @@ -418,288 +936,11 @@ static int update_effectives(struct ckrm_core_class *root_core) } /**********************************************/ -/* surplus allocation */ +/* CKRM Idle Tasks */ /**********************************************/ -/* - * surplus = egrt - demand - * if surplus < 0, surplus = 0 - */ -static inline int get_node_surplus(struct ckrm_cpu_class *cls) -{ - int surplus = cls->stat.egrt - cls->stat.max_demand; - - if (surplus < 0) - surplus = 0; - - return surplus; -} - -static inline int get_my_node_surplus(struct ckrm_cpu_class *cls) -{ - int surplus = cls->stat.megrt - get_mmax_demand(&cls->stat); - - if (surplus < 0) - surplus = 0; - - return surplus; -} - -/** - * consume_surplus: decides how much surplus a node can consume - * @ckeck_sl: if check_sl is set, then check soft_limitx - * return how much consumed - * - * implements all the CKRM Scheduling Requirement - * assume c_cls is valid - */ -static inline int consume_surplus(int surplus, - struct ckrm_cpu_class *c_cls, - struct ckrm_cpu_class *p_cls, - int check_sl - ) -{ - int consumed = 0; - int inc_limit; - int total_grt = p_cls->shares.total_guarantee; - - BUG_ON(surplus < 0); - - /*can't consume more than demand or hard limit*/ - if 
(c_cls->stat.eshare >= c_cls->stat.max_demand) - goto out; - - //the surplus allocation is propotional to grt - consumed = - surplus * c_cls->shares.my_guarantee / total_grt; - - if (! consumed) //no more share - goto out; - - //hard limit and demand limit - inc_limit = c_cls->stat.max_demand - c_cls->stat.eshare; - - if (check_sl) { - int esl = p_cls->stat.eshare * get_soft_limit(c_cls) - /total_grt; - if (esl < c_cls->stat.max_demand) - inc_limit = esl - c_cls->stat.eshare; - } - - if (consumed > inc_limit) - consumed = inc_limit; - - BUG_ON(consumed < 0); - out: - return consumed; -} - -/* - * how much a node can consume for itself? - */ -static inline int consume_self_surplus(int surplus, - struct ckrm_cpu_class *p_cls, - int check_sl - ) -{ - int consumed = 0; - int inc_limit; - int total_grt = p_cls->shares.total_guarantee; - int max_demand = get_mmax_demand(&p_cls->stat); - - BUG_ON(surplus < 0); - - /*can't consume more than demand or hard limit*/ - if (p_cls->stat.meshare >= max_demand) - goto out; - - //the surplus allocation is propotional to grt - consumed = - surplus * p_cls->shares.unused_guarantee / total_grt; - - if (! consumed) //no more share - goto out; - - //hard limit and demand limit - inc_limit = max_demand - p_cls->stat.meshare; - - if (check_sl) { - int mesl = p_cls->stat.eshare * get_mysoft_limit(p_cls) - /total_grt; - if (mesl < max_demand) - inc_limit = mesl - p_cls->stat.meshare; - } - - if (consumed > inc_limit) - consumed = inc_limit; - - BUG_ON(consumed < 0); - out: - return consumed; -} - - -/* - * allocate surplus to all its children and also its default class - */ -static int alloc_surplus_single_round( - int surplus, - struct ckrm_core_class *parent, - struct ckrm_cpu_class *p_cls, - int check_sl) -{ - struct ckrm_cpu_class *c_cls; - struct ckrm_core_class *child_core = NULL; - int total_consumed = 0,consumed; - - //first allocate to the default class - consumed = - consume_self_surplus(surplus,p_cls,check_sl); - - if (consumed > 0) { - set_meshare(&p_cls->stat,p_cls->stat.meshare + consumed); - total_consumed += consumed; - } - - do { - child_core = ckrm_get_next_child(parent, child_core); - if (child_core) { - c_cls = ckrm_get_cpu_class(child_core); - if (! c_cls) - return -1; - - consumed = - consume_surplus(surplus, c_cls, - p_cls,check_sl); - if (consumed > 0) { - set_eshare(&c_cls->stat,c_cls->stat.eshare + consumed); - total_consumed += consumed; - } - } - } while (child_core); - - return total_consumed; -} - -/** - * alloc_surplus_node: re-allocate the shares for children under parent - * @parent: parent node - * return the remaining surplus - * - * task: - * 1. get total surplus - * 2. allocate surplus - * 3. set the effective_share of each node - */ -static int alloc_surplus_node(struct ckrm_core_class *parent) -{ - struct ckrm_cpu_class *p_cls,*c_cls; - int total_surplus,consumed; - int check_sl; - int ret = -1; - struct ckrm_core_class *child_core = NULL; - - p_cls = ckrm_get_cpu_class(parent); - if (! p_cls) - goto realloc_out; - - /* - * get total surplus - */ - total_surplus = p_cls->stat.eshare - p_cls->stat.egrt; - BUG_ON(total_surplus < 0); - total_surplus += get_my_node_surplus(p_cls); - - do { - child_core = ckrm_get_next_child(parent, child_core); - if (child_core) { - c_cls = ckrm_get_cpu_class(child_core); - if (! c_cls) - goto realloc_out; - - total_surplus += get_node_surplus(c_cls); - } - } while (child_core); - - - if (! 
total_surplus) { - ret = 0; - goto realloc_out; - } - - /* - * distributing the surplus - * first with the check_sl enabled - * once all the tasks has research the soft limit, disable check_sl and try again - */ - - check_sl = 1; - do { - consumed = alloc_surplus_single_round(total_surplus,parent,p_cls,check_sl); - if (consumed < 0) //something is wrong - goto realloc_out; - - if (! consumed) - check_sl = 0; - else - total_surplus -= consumed; - - } while ((total_surplus > 0) && (consumed || check_sl) ); - - ret = 0; - - realloc_out: - return ret; -} - -/** - * alloc_surplus - reallocate unused shares - * - * class A's usused share should be allocated to its siblings - * the re-allocation goes downward from the top - */ -static int alloc_surplus(struct ckrm_core_class *root_core) -{ - struct ckrm_core_class *cur_core, *child_core; - // struct ckrm_cpu_class *cls; - int ret = -1; - - /*initialize*/ - cur_core = root_core; - child_core = NULL; - // cls = ckrm_get_cpu_class(cur_core); - - /*the ckrm idle tasks get all what's remaining*/ - /*hzheng: uncomment the following like for hard limit support */ - // update_ckrm_idle(CKRM_SHARE_MAX - cls->stat.max_demand); - - repeat: - //check exit - if (!cur_core) - return 0; - - //visit this node only once - if (! child_core) - if ( alloc_surplus_node(cur_core) < 0 ) - return ret; - - //next child - child_core = ckrm_get_next_child(cur_core, child_core); - if (child_core) { - //go down - cur_core = child_core; - child_core = NULL; - goto repeat; - } else { //no more child, go back - child_core = cur_core; - cur_core = child_core->hnode.parent; - } - goto repeat; -} +#ifdef CONFIG_CKRM_SUPPORT_MAXLIMITS -/**********************************************/ -/* CKRM Idle Tasks */ -/**********************************************/ struct ckrm_cpu_class ckrm_idle_class_obj, *ckrm_idle_class; struct task_struct* ckrm_idle_tasks[NR_CPUS]; @@ -710,7 +951,7 @@ static inline int get_nr_idle(unsigned long surplus) int nr_idle = 0; nr_idle = surplus * cpu_online; - nr_idle >>= CKRM_SHARE_ACCURACY; + nr_idle >>= CKRM_SHARE_SHIFT; if (surplus) nr_idle ++; @@ -722,7 +963,8 @@ static inline int get_nr_idle(unsigned long surplus) } /** - * update_ckrm_idle: update the status of the idle class according to the new surplus + * update_ckrm_idle: update the status of the idle class according + * to the new surplus * surplus: new system surplus * * Task: @@ -816,6 +1058,20 @@ void ckrm_start_ckrm_idle(void) } } +void ckrm_stop_ckrm_idle(void) +{ + BUG_ON(1); // not yet implemented +} + +#else + +static inline void ckrm_start_ckrm_idle(void) { }; +static inline void ckrm_stop_ckrm_idle(void) { }; +static inline void update_ckrm_idle(unsigned long surplus) { }; + +#endif + + /**********************************************/ /* Local Weight */ /**********************************************/ @@ -831,8 +1087,19 @@ static void adjust_lrq_weight(struct ckrm_cpu_class *clsptr, int cpu_online) int i; unsigned long class_weight; unsigned long long lw; - - //get total pressure + struct ckrm_cpu_class_stat *stat; + unsigned long oweight; + unsigned long skewed_limit; + /* + * if a local queue gets less than 1/SKEWED_SHARE_RATIO of the eshare + * then we set the skewed_share + */ +#define SKEWED_SHARE_RATIO 8 +#define SKEWED_WEIGHT_MIN 3 + + /* get total pressure of the class, if there is not pressure (.. 
class is + * idle, then leave the weights as is + */ for_each_online_cpu(i) { lrq = get_ckrm_lrq(clsptr,i); total_pressure += lrq->lrq_load; @@ -841,32 +1108,61 @@ static void adjust_lrq_weight(struct ckrm_cpu_class *clsptr, int cpu_online) if (! total_pressure) return; + stat = &clsptr->stat; + class_weight = cpu_class_weight(clsptr) * cpu_online; + /* calculate or skewed limit weight */ + skewed_limit = SHARE_TO_WEIGHT(stat->meshare/SKEWED_SHARE_RATIO); + if (skewed_limit < SKEWED_WEIGHT_MIN) + skewed_limit = SKEWED_WEIGHT_MIN; + + /* calculate over_weight */ + BUG_ON(stat->meshare < stat->megrt); + oweight = ((stat->meshare - stat->megrt) << CKRM_SHARE_SHIFT) / stat->meshare; + oweight = SHARE_TO_WEIGHT(oweight); + /* * update weight for each cpu, minimun is 1 */ for_each_online_cpu(i) { lrq = get_ckrm_lrq(clsptr,i); - if (! lrq->lrq_load) - /*give idle class a high share to boost interactiveness */ + lrq->over_weight = oweight; + if (! lrq->lrq_load) { + /* give idle class a high share to boost + * interactiveness + */ lw = cpu_class_weight(clsptr); - else { - lw = lrq->lrq_load * class_weight; + if (unlikely(lw==0)) + lw = 1; + } else { + lw = lrq->lrq_load; + lw *= class_weight; do_div(lw,total_pressure); - if (!lw) + if (unlikely(lw==0)) lw = 1; - else if (lw > CKRM_SHARE_MAX) - lw = CKRM_SHARE_MAX; - } - + else if (unlikely(lw > CKRM_MAX_WEIGHT)) + lw = CKRM_MAX_WEIGHT; + } + BUG_ON(lw > CKRM_MAX_WEIGHT); + + /* + * set is_skewed and local_weight in proper order + * to avoid race condition + */ lrq->local_weight = lw; + if (lw < skewed_limit) + lrq->skewed_weight = skewed_limit; + else + lrq->skewed_weight = 0; + BUG_ON((local_class_weight(lrq) == 1) && (! lrq->skewed_weight)); } } /* * assume called with class_list_lock read lock held */ + void adjust_local_weight(void) { static spinlock_t lock = SPIN_LOCK_UNLOCKED; @@ -904,9 +1200,11 @@ void ckrm_cpu_monitor(int check_min) static unsigned long long last_check = 0; struct ckrm_core_class *root_core = get_default_cpu_class()->core; unsigned long long now; -#define MIN_CPU_MONITOR_INTERVAL 100000000UL + int loc; + +#define MIN_CPU_MONITOR_INTERVAL (100*1000*1000) /* 100 MSEC */ - if (!root_core) + if (ckrm_cpu_disabled() || !root_core) return; //do nothing if someone already holding the lock @@ -918,29 +1216,37 @@ void ckrm_cpu_monitor(int check_min) now = sched_clock(); //consecutive check should be at least 100ms apart - if (check_min && ((now - last_check) < MIN_CPU_MONITOR_INTERVAL)) - goto outunlock; + if (check_min && (now - last_check < MIN_CPU_MONITOR_INTERVAL)) + goto outunlock_np; last_check = now; - if (update_effectives(root_core) != 0) + if (update_effectives() != 0) { + loc = 0; goto outunlock; + } - if (update_max_demand(root_core) != 0) + if (update_max_demand(root_core) != 0) { + loc = 1; goto outunlock; + } -#ifndef ALLOC_SURPLUS_SUPPORT -#warning "MEF taking out alloc_surplus" -#else - if (alloc_surplus(root_core) != 0) +#warning mef: alloc_surplus call back in system; + if (alloc_surplus(root_core) != 0) { + loc = 2; goto outunlock; -#endif + } adjust_local_weight(); - outunlock: + outunlock_np: read_unlock(&class_list_lock); spin_unlock(&lock); + return; + + outunlock: + printk("ckrm_cpu_monitor(%d) exits prematurely cause=%d\n",check_min,loc); + goto outunlock_np; } /*****************************************************/ @@ -952,6 +1258,8 @@ static int thread_exit = 0; static int ckrm_cpu_monitord(void *nothing) { daemonize("ckrm_cpu_ctrld"); + printk("cpu_monitord started\n"); + thread_exit = 0; for (;;) { 
/*sleep for sometime before next try*/ set_current_state(TASK_INTERRUPTIBLE); @@ -967,15 +1275,19 @@ static int ckrm_cpu_monitord(void *nothing) return 0; } -void ckrm_start_monitor(void) +void ckrm_cpu_start_monitor(void) { + if (cpu_monitor_pid != -1) { + /* already started ... */ + return; + } cpu_monitor_pid = kernel_thread(ckrm_cpu_monitord, 0, CLONE_KERNEL); if (cpu_monitor_pid < 0) { printk(KERN_DEBUG "ckrm_cpu_monitord for failed\n"); } } -void ckrm_kill_monitor(void) +void ckrm_cpu_kill_monitor(void) { printk(KERN_DEBUG "killing process %d\n", cpu_monitor_pid); if (cpu_monitor_pid > 0) { @@ -987,22 +1299,12 @@ void ckrm_kill_monitor(void) } } -int ckrm_cpu_monitor_init(void) +static int __init ckrm_cpu_init_monitor(void) { - ckrm_start_monitor(); - /*hzheng: uncomment the following like for hard limit support */ - // ckrm_start_ckrm_idle(); + if (ckrm_cpu_enabled()) + ckrm_cpu_start_monitor(); return 0; } -void ckrm_cpu_monitor_exit(void) -{ - ckrm_kill_monitor(); -} - -module_init(ckrm_cpu_monitor_init); -module_exit(ckrm_cpu_monitor_exit); +__initcall(ckrm_cpu_init_monitor); -MODULE_AUTHOR("Haoqiang Zheng "); -MODULE_DESCRIPTION("Hierarchical CKRM CPU Resource Monitor"); -MODULE_LICENSE("GPL"); diff --git a/kernel/ckrm/ckrm_laq.c b/kernel/ckrm/ckrm_laq.c deleted file mode 100644 index b64205a06..000000000 --- a/kernel/ckrm/ckrm_laq.c +++ /dev/null @@ -1,495 +0,0 @@ -/* ckrm_socketaq.c - accept queue resource controller - * - * Copyright (C) Vivek Kashyap, IBM Corp. 2004 - * - * Latest version, more details at http://ckrm.sf.net - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - */ - -/* Changes - * Initial version - */ - -/* Code Description: TBD - * - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include - -#define hnode_2_core(ptr) \ - ((ptr) ? 
container_of(ptr, struct ckrm_core_class, hnode) : NULL) - -#define CKRM_SAQ_MAX_DEPTH 3 // 0 => /rcfs - // 1 => socket_aq - // 2 => socket_aq/listen_class - // 3 => socket_aq/listen_class/accept_queues - // 4 => Not allowed - -typedef struct ckrm_laq_res { - spinlock_t reslock; - atomic_t refcnt; - struct ckrm_shares shares; - struct ckrm_core_class *core; - struct ckrm_core_class *pcore; - int my_depth; - int my_id; - unsigned int min_ratio; -} ckrm_laq_res_t; - -static int my_resid = -1; - -extern struct ckrm_core_class *rcfs_create_under_netroot(char *, int, int); -extern struct ckrm_core_class *rcfs_make_core(struct dentry *, - struct ckrm_core_class *); - -void laq_res_hold(struct ckrm_laq_res *res) -{ - atomic_inc(&res->refcnt); - return; -} - -void laq_res_put(struct ckrm_laq_res *res) -{ - if (atomic_dec_and_test(&res->refcnt)) - kfree(res); - return; -} - -/* Initialize rescls values - */ -static void laq_res_initcls(void *my_res) -{ - ckrm_laq_res_t *res = my_res; - - res->shares.my_guarantee = CKRM_SHARE_DONTCARE; - res->shares.my_limit = CKRM_SHARE_DONTCARE; - res->shares.total_guarantee = CKRM_SHARE_DFLT_TOTAL_GUARANTEE; - res->shares.max_limit = CKRM_SHARE_DFLT_MAX_LIMIT; - res->shares.unused_guarantee = CKRM_SHARE_DFLT_TOTAL_GUARANTEE; - res->shares.cur_max_limit = 0; -} - -static int atoi(char *s) -{ - int k = 0; - while (*s) - k = *s++ - '0' + (k * 10); - return k; -} - -static char *laq_get_name(struct ckrm_core_class *c) -{ - char *p = (char *)c->name; - - while (*p) - p++; - while (*p != '/' && p != c->name) - p--; - - return ++p; -} - -static void *laq_res_alloc(struct ckrm_core_class *core, - struct ckrm_core_class *parent) -{ - ckrm_laq_res_t *res, *pres; - int pdepth; - - if (parent) - pres = ckrm_get_res_class(parent, my_resid, ckrm_laq_res_t); - else - pres = NULL; - - if (core == core->classtype->default_class) - pdepth = 1; - else { - if (!parent) - return NULL; - pdepth = 1 + pres->my_depth; - } - - res = kmalloc(sizeof(ckrm_laq_res_t), GFP_ATOMIC); - if (res) { - memset(res, 0, sizeof(res)); - spin_lock_init(&res->reslock); - laq_res_hold(res); - res->my_depth = pdepth; - if (pdepth == 2) // listen class - res->my_id = 0; - else if (pdepth == 3) - res->my_id = atoi(laq_get_name(core)); - res->core = core; - res->pcore = parent; - - // rescls in place, now initialize contents other than - // hierarchy pointers - laq_res_initcls(res); // acts as initialising value - } - - return res; -} - -static void laq_res_free(void *my_res) -{ - ckrm_laq_res_t *res = (ckrm_laq_res_t *) my_res; - ckrm_laq_res_t *parent; - - if (!res) - return; - - if (res->my_depth != 3) { - kfree(res); - return; - } - - parent = ckrm_get_res_class(res->pcore, my_resid, ckrm_laq_res_t); - if (!parent) // Should never happen - return; - - spin_lock(&parent->reslock); - spin_lock(&res->reslock); - - // return child's guarantee to parent node - // Limits have no meaning for accept queue control - child_guarantee_changed(&parent->shares, res->shares.my_guarantee, 0); - - spin_unlock(&res->reslock); - laq_res_put(res); - spin_unlock(&parent->reslock); - return; -} - -/************************************************************************** - * SHARES *** - **************************************************************************/ - -void laq_set_aq_value(struct ckrm_net_struct *ns, unsigned int *aq_ratio) -{ - int i; - struct tcp_opt *tp; - - tp = tcp_sk(ns->ns_sk); - for (i = 0; i < NUM_ACCEPT_QUEUES; i++) - tp->acceptq[i].aq_ratio = aq_ratio[i]; - return; -} -void 
laq_set_aq_values(ckrm_laq_res_t * parent, unsigned int *aq_ratio) -{ - - struct ckrm_net_struct *ns; - struct ckrm_core_class *core = parent->core; - - class_lock(core); - list_for_each_entry(ns, &core->objlist, ckrm_link) { - laq_set_aq_value(ns, aq_ratio); - } - class_unlock(core); - return; -} - -static void calculate_aq_ratios(ckrm_laq_res_t * res, unsigned int *aq_ratio) -{ - struct ckrm_hnode *chnode; - ckrm_laq_res_t *child; - unsigned int min; - int i; - - min = aq_ratio[0] = (unsigned int)res->shares.unused_guarantee; - - list_for_each_entry(chnode, &res->core->hnode.children, siblings) { - child = hnode_2_core(chnode)->res_class[my_resid]; - - aq_ratio[child->my_id] = - (unsigned int)child->shares.my_guarantee; - if (aq_ratio[child->my_id] == CKRM_SHARE_DONTCARE) - aq_ratio[child->my_id] = 0; - if (aq_ratio[child->my_id] && - ((unsigned int)aq_ratio[child->my_id] < min)) - min = (unsigned int)child->shares.my_guarantee; - } - - if (min == 0) { - min = 1; - // default takes all if nothing specified - aq_ratio[0] = 1; - } - res->min_ratio = min; - - for (i = 0; i < NUM_ACCEPT_QUEUES; i++) - aq_ratio[i] = aq_ratio[i] / min; -} - -static int laq_set_share_values(void *my_res, struct ckrm_shares *shares) -{ - ckrm_laq_res_t *res = my_res; - ckrm_laq_res_t *parent; - unsigned int aq_ratio[NUM_ACCEPT_QUEUES]; - int rc = 0; - - if (!res) - return -EINVAL; - - if (!res->pcore) { - // something is badly wrong - printk(KERN_ERR "socketaq internal inconsistency\n"); - return -EBADF; - } - - parent = ckrm_get_res_class(res->pcore, my_resid, ckrm_laq_res_t); - if (!parent) // socketclass does not have a share interface - return -EINVAL; - - // Ensure that we ignore limit values - shares->my_limit = CKRM_SHARE_DONTCARE; - shares->max_limit = CKRM_SHARE_UNCHANGED; - - if (res->my_depth == 0) { - printk(KERN_ERR "socketaq bad entry\n"); - return -EBADF; - } else if (res->my_depth == 1) { - // can't be written to. This is an internal default. - return -EINVAL; - } else if (res->my_depth == 2) { - //nothin to inherit - if (!shares->total_guarantee) { - return -EINVAL; - } - parent = res; - shares->my_guarantee = CKRM_SHARE_DONTCARE; - } else if (res->my_depth == 3) { - // accept queue itself. - shares->total_guarantee = CKRM_SHARE_UNCHANGED; - } - - ckrm_lock_hier(parent->pcore); - spin_lock(&parent->reslock); - rc = set_shares(shares, &res->shares, - (parent == res) ? 
NULL : &parent->shares); - if (rc) { - spin_unlock(&res->reslock); - ckrm_unlock_hier(res->pcore); - return rc; - } - calculate_aq_ratios(parent, aq_ratio); - laq_set_aq_values(parent, aq_ratio); - spin_unlock(&parent->reslock); - ckrm_unlock_hier(parent->pcore); - - return rc; -} - -static int laq_get_share_values(void *my_res, struct ckrm_shares *shares) -{ - ckrm_laq_res_t *res = my_res; - - if (!res) - return -EINVAL; - *shares = res->shares; - return 0; -} - -/************************************************************************** - * STATS *** - **************************************************************************/ - -void -laq_print_aq_stats(struct seq_file *sfile, struct tcp_acceptq_info *taq, int i) -{ - seq_printf(sfile, "Class %d connections:\n\taccepted: %u\n\t" - "queued: %u\n\twait_time: %u\n", - i, taq->acceptq_count, taq->acceptq_qcount, - jiffies_to_msecs(taq->acceptq_wait_time)); - - if (i) - return; - - for (i = 1; i < NUM_ACCEPT_QUEUES; i++) { - taq[0].acceptq_wait_time += taq[i].acceptq_wait_time; - taq[0].acceptq_qcount += taq[i].acceptq_qcount; - taq[0].acceptq_count += taq[i].acceptq_count; - } - - seq_printf(sfile, "Totals :\n\taccepted: %u\n\t" - "queued: %u\n\twait_time: %u\n", - taq->acceptq_count, taq->acceptq_qcount, - jiffies_to_msecs(taq->acceptq_wait_time)); - - return; -} - -void -laq_get_aq_stats(ckrm_laq_res_t * pres, ckrm_laq_res_t * mres, - struct tcp_acceptq_info *taq) -{ - struct ckrm_net_struct *ns; - struct ckrm_core_class *core = pres->core; - struct tcp_opt *tp; - int a = mres->my_id; - int z; - - if (a == 0) - z = NUM_ACCEPT_QUEUES; - else - z = a + 1; - - // XXX Instead of holding a class_lock introduce a rw - // lock to be write locked by listen callbacks and read locked here. - // - VK - class_lock(pres->core); - list_for_each_entry(ns, &core->objlist, ckrm_link) { - tp = tcp_sk(ns->ns_sk); - for (; a < z; a++) { - taq->acceptq_wait_time += tp->acceptq[a].aq_wait_time; - taq->acceptq_qcount += tp->acceptq[a].aq_qcount; - taq->acceptq_count += tp->acceptq[a].aq_count; - taq++; - } - } - class_unlock(pres->core); -} - -static int laq_get_stats(void *my_res, struct seq_file *sfile) -{ - ckrm_laq_res_t *res = my_res; - ckrm_laq_res_t *parent; - struct tcp_acceptq_info taq[NUM_ACCEPT_QUEUES]; - int rc = 0; - - if (!res) - return -EINVAL; - - if (!res->pcore) { - // something is badly wrong - printk(KERN_ERR "socketaq internal inconsistency\n"); - return -EBADF; - } - - parent = ckrm_get_res_class(res->pcore, my_resid, ckrm_laq_res_t); - if (!parent) { // socketclass does not have a stat interface - printk(KERN_ERR "socketaq internal fs inconsistency\n"); - return -EINVAL; - } - - memset(taq, 0, sizeof(struct tcp_acceptq_info) * NUM_ACCEPT_QUEUES); - - switch (res->my_depth) { - - default: - case 0: - printk(KERN_ERR "socket class bad entry\n"); - rc = -EBADF; - break; - - case 1: // can't be read from. this is internal default. - // return -EINVAL - rc = -EINVAL; - break; - - case 2: // return the default and total - ckrm_lock_hier(res->core); // block any deletes - laq_get_aq_stats(res, res, &taq[0]); - laq_print_aq_stats(sfile, &taq[0], 0); - ckrm_unlock_hier(res->core); // block any deletes - break; - - case 3: - ckrm_lock_hier(parent->core); // block any deletes - laq_get_aq_stats(parent, res, &taq[res->my_id]); - laq_print_aq_stats(sfile, &taq[res->my_id], res->my_id); - ckrm_unlock_hier(parent->core); // block any deletes - break; - } - - return rc; -} - -/* - * The network connection is reclassified to this class. Update its shares. 
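For illustration, the share-to-ratio mapping applied when shares change (calculate_aq_ratios() above, consumed by laq_set_aq_value() and laq_change_resclass() below) can be sketched in isolation. The guarantee values here are hypothetical; in the kernel they come from ckrm_shares and the result is written into tp->acceptq[i].aq_ratio:

    /* standalone sketch with made-up guarantees; mirrors calculate_aq_ratios() */
    static void sketch_aq_ratios(void)
    {
            /* [0] stands for the listen class's unused_guarantee,
             * [1..3] for three accept classes' my_guarantee */
            unsigned int guarantee[4] = { 40, 30, 20, 10 };
            unsigned int ratio[4];
            unsigned int min = guarantee[0];
            int i;

            for (i = 1; i < 4; i++)
                    if (guarantee[i] && guarantee[i] < min)
                            min = guarantee[i];
            if (!min)
                    min = 1;        /* default takes all if nothing specified */
            for (i = 0; i < 4; i++)
                    ratio[i] = guarantee[i] / min;  /* -> 4, 3, 2, 1 */
    }

With these hypothetical numbers the default accept queue is weighted 4 and the three accept classes 3, 2 and 1, so accepted connections are serviced in roughly that proportion.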
- * The socket lock is held. - */ -static void laq_change_resclass(void *n, void *old, void *r) -{ - struct ckrm_net_struct *ns = (struct ckrm_net_struct *)n; - struct ckrm_laq_res *res = (struct ckrm_laq_res *)r; - unsigned int aq_ratio[NUM_ACCEPT_QUEUES]; - - if (res->my_depth != 2) - return; - - // a change to my_depth == 3 ie. the accept classes cannot happen. - // there is no target file - if (res->my_depth == 2) { // it is one of the socket classes - ckrm_lock_hier(res->pcore); - // share rule: hold parent resource lock. then self. - // However, since my_depth == 1 is a generic class it is not - // needed here. Self lock is enough. - spin_lock(&res->reslock); - calculate_aq_ratios(res, aq_ratio); - class_lock(res->pcore); - laq_set_aq_value(ns, aq_ratio); - class_unlock(res->pcore); - spin_unlock(&res->reslock); - ckrm_unlock_hier(res->pcore); - } - - return; -} - -struct ckrm_res_ctlr laq_rcbs = { - .res_name = "laq", - .resid = -1, // dynamically assigned - .res_alloc = laq_res_alloc, - .res_free = laq_res_free, - .set_share_values = laq_set_share_values, - .get_share_values = laq_get_share_values, - .get_stats = laq_get_stats, - .change_resclass = laq_change_resclass, - //.res_initcls = laq_res_initcls, //HUBERTUS: unnecessary !! -}; - -int __init init_ckrm_laq_res(void) -{ - struct ckrm_classtype *clstype; - int resid; - - clstype = ckrm_find_classtype_by_name("socketclass"); - if (clstype == NULL) { - printk(KERN_INFO " Unknown ckrm classtype"); - return -ENOENT; - } - - if (my_resid == -1) { - resid = ckrm_register_res_ctlr(clstype, &laq_rcbs); - if (resid >= 0) - my_resid = resid; - printk(KERN_DEBUG "........init_ckrm_listen_aq_res -> %d\n", my_resid); - } - return 0; - -} - -void __exit exit_ckrm_laq_res(void) -{ - ckrm_unregister_res_ctlr(&laq_rcbs); - my_resid = -1; -} - -module_init(init_ckrm_laq_res) - module_exit(exit_ckrm_laq_res) - - MODULE_LICENSE("GPL"); diff --git a/kernel/ckrm/ckrm_listenaq.c b/kernel/ckrm/ckrm_listenaq.c index 0fe858633..103e3f957 100644 --- a/kernel/ckrm/ckrm_listenaq.c +++ b/kernel/ckrm/ckrm_listenaq.c @@ -1,4 +1,4 @@ -/* ckrm_socketaq.c - accept queue resource controller +/* ckrm_listenaq.c - accept queue resource controller * * Copyright (C) Vivek Kashyap, IBM Corp. 
2004 * @@ -251,7 +251,7 @@ static int laq_set_share_values(void *my_res, struct ckrm_shares *shares) } parent = ckrm_get_res_class(res->pcore, my_resid, ckrm_laq_res_t); - if (!parent) // socket_class does not have a share interface + if (!parent) // socketclass does not have a share interface return -EINVAL; // Ensure that we ignore limit values @@ -380,7 +380,7 @@ static int laq_get_stats(void *my_res, struct seq_file *sfile) } parent = ckrm_get_res_class(res->pcore, my_resid, ckrm_laq_res_t); - if (!parent) { // socket_class does not have a stat interface + if (!parent) { // socketclass does not have a stat interface printk(KERN_ERR "socketaq internal fs inconsistency\n"); return -EINVAL; } @@ -451,7 +451,7 @@ static void laq_change_resclass(void *n, void *old, void *r) } struct ckrm_res_ctlr laq_rcbs = { - .res_name = "laq", + .res_name = "listenaq", .resid = -1, // dynamically assigned .res_alloc = laq_res_alloc, .res_free = laq_res_free, @@ -467,9 +467,9 @@ int __init init_ckrm_laq_res(void) struct ckrm_classtype *clstype; int resid; - clstype = ckrm_find_classtype_by_name("socket_class"); + clstype = ckrm_find_classtype_by_name("socketclass"); if (clstype == NULL) { - printk(KERN_INFO " Unknown ckrm classtype"); + printk(KERN_INFO " Unknown ckrm classtype"); return -ENOENT; } diff --git a/kernel/ckrm/rbce/rbcemod.c b/kernel/ckrm/rbce/rbcemod.c index 555ba0a4e..143b259e8 100644 --- a/kernel/ckrm/rbce/rbcemod.c +++ b/kernel/ckrm/rbce/rbcemod.c @@ -422,7 +422,7 @@ static struct rbce_class *create_rbce_class(const char *classname, return cls; } -static struct rbce_class *get_class(char *classname, int *classtype) +static struct rbce_class *get_class(const char *classname, int *classtype) { struct rbce_class *cls; void *classobj; diff --git a/kernel/ckrm_classqueue.c b/kernel/ckrm_classqueue.c index 0400844a3..fd7f8a2b4 100644 --- a/kernel/ckrm_classqueue.c +++ b/kernel/ckrm_classqueue.c @@ -27,14 +27,19 @@ #include #define cq_nr_member(cq) (cq->array.nr_active) +#define CLASSQUEUE_MASK (CLASSQUEUE_SIZE - 1) /** - * get_index - translate the logical priority to the real index in the queue + * get_node_index - + * translate the logical priority to the real index in the queue * * validate the position * a valid prio is [cq->base,cq->base + size -1] + * check whether node is supposed to be enqeued beyond above window and + * if so set the need_repos flag */ -static inline unsigned long get_index(struct classqueue_struct *cq, int *prio) +static inline unsigned long get_node_index(struct classqueue_struct *cq, + cq_node_t * node) { unsigned long index; int max_prio; @@ -43,22 +48,24 @@ static inline unsigned long get_index(struct classqueue_struct *cq, int *prio) return 0; max_prio = cq->base + (CLASSQUEUE_SIZE - 1); - if (*prio > max_prio) - *prio = max_prio; - if (*prio < cq->base) - *prio = cq->base; + if (unlikely(node->prio > max_prio)) { + node->real_prio = node->prio; + node->prio = max_prio; + node->need_repos = 1; + } else + node->need_repos = 0; - index = (cq->base_offset + (*prio - cq->base)) ; - if (index >= CLASSQUEUE_SIZE) - index -= CLASSQUEUE_SIZE; + if (unlikely(node->prio < cq->base)) + node->prio = cq->base; - return index; + index = (cq->base_offset + (node->prio - cq->base)) ; + return ( index & CLASSQUEUE_MASK ); // ensure its in limits } /** * initialize a class queue object */ -int classqueue_init(struct classqueue_struct *cq) +int classqueue_init(struct classqueue_struct *cq, int enabled) { int i; struct cq_prio_array *array; @@ -73,7 +80,8 @@ int 
classqueue_init(struct classqueue_struct *cq) array->nr_active = 0; cq->base = 0; - cq->base_offset = -1; //not valid yet + cq->base_offset = 0; + cq->enabled = enabled; return 0; } @@ -87,8 +95,8 @@ void classqueue_enqueue(struct classqueue_struct *cq, int index; //get real index - if (cq_nr_member(cq)) { - index = get_index(cq, &prio); + if (cq_nr_member(cq)) { + index = get_node_index(cq, node); } else { //the first one cq->base = prio; cq->base_offset = 0; @@ -123,8 +131,8 @@ void classqueue_update_prio(struct classqueue_struct *cq, if (! cls_in_classqueue(node)) return; - index = get_index(cq, &new_pos); node->prio = new_pos; + index = get_node_index(cq, node); //remove from the original position list_del_init(&(node->list)); @@ -137,10 +145,32 @@ void classqueue_update_prio(struct classqueue_struct *cq, node->index = index; } + +static inline void __classqueue_update_base(struct classqueue_struct *cq, + int new_base) +{ + int max_prio; + if (unlikely(new_base <= cq->base)) // base will never move back + return; + if (unlikely(!cq_nr_member(cq))) { + cq->base_offset = 0; + cq->base = new_base; // is this necessary ?? + return; + } + + max_prio = cq->base + (CLASSQUEUE_SIZE - 1); + if (unlikely(new_base > max_prio)) + new_base = max_prio; + + cq->base_offset = (cq->base_offset + (new_base - cq->base)) & CLASSQUEUE_MASK; + cq->base = new_base; +} + /** *classqueue_get_min_prio: return the priority of the last node in queue * * this function can be called without runqueue lock held + * return 0 if there's nothing in the queue */ static inline int classqueue_get_min_prio(struct classqueue_struct *cq) { @@ -171,9 +201,13 @@ static inline int classqueue_get_min_prio(struct classqueue_struct *cq) */ cq_node_t *classqueue_get_head(struct classqueue_struct *cq) { - cq_node_t *result = NULL; + cq_node_t *node; int pos; + int index; + int new_base; +search_again: + node = NULL; /* * search over the bitmap to get the first class in the queue */ @@ -183,10 +217,38 @@ cq_node_t *classqueue_get_head(struct classqueue_struct *cq) pos = find_first_bit(cq->array.bitmap, CLASSQUEUE_SIZE); if (pos < CLASSQUEUE_SIZE) { - BUG_ON(list_empty(&cq->array.queue[pos])); - result = list_entry(cq->array.queue[pos].next, cq_node_t, list); + //BUG_ON(list_empty(&cq->array.queue[pos])); + node = list_entry(cq->array.queue[pos].next, cq_node_t, list); } - return result; + + //check if the node need to be repositioned + if (likely(! node || ! node->need_repos)) + return node; + + // We need to reposition this node in the class queue + // BUG_ON(node->prio == node->real_prio); + + //remove from the original position + list_del_init(&(node->list)); + if (list_empty(&cq->array.queue[node->index])) + __clear_bit(node->index, cq->array.bitmap); + + new_base = classqueue_get_min_prio(cq); + node->prio = node->real_prio; + + if (! new_base) + new_base = node->real_prio; + else if (node->real_prio < new_base) + new_base = node->real_prio; + __classqueue_update_base(cq,new_base); + + index = get_node_index(cq, node); + //add to new positon, round robin for classes with same priority + list_add_tail(&(node->list), &cq->array.queue[index]); + __set_bit(index, cq->array.bitmap); + node->index = index; + + goto search_again; } /** @@ -198,14 +260,11 @@ void classqueue_update_base(struct classqueue_struct *cq) int new_base; if (! 
cq_nr_member(cq)) { - cq->base_offset = -1; //not defined + cq->base = 0; + cq->base_offset = 0; return; } new_base = classqueue_get_min_prio(cq); - - if (new_base > cq->base) { - cq->base_offset = get_index(cq, &new_base); - cq->base = new_base; - } + __classqueue_update_base(cq,new_base); } diff --git a/kernel/ckrm_sched.c b/kernel/ckrm_sched.c index 5142b2eaa..26ffc69d8 100644 --- a/kernel/ckrm_sched.c +++ b/kernel/ckrm_sched.c @@ -20,6 +20,28 @@ LIST_HEAD(active_cpu_classes); // list of active cpu classes; anchor struct ckrm_cpu_class default_cpu_class_obj; +unsigned int ckrm_sched_mode __cacheline_aligned_in_smp = +#ifdef CONFIG_CKRM_CPU_SCHEDULE_AT_BOOT + CKRM_SCHED_MODE_ENABLED; +#else + CKRM_SCHED_MODE_DISABLED; +#endif + +static int __init ckrm_cpu_enabled_setup(char *str) +{ + ckrm_sched_mode = CKRM_SCHED_MODE_ENABLED; + return 1; +} + +static int __init ckrm_cpu_disabled_setup(char *str) +{ + ckrm_sched_mode = CKRM_SCHED_MODE_DISABLED; + return 1; +} + +__setup("ckrmcpu", ckrm_cpu_enabled_setup); +__setup("nockrmcpu",ckrm_cpu_disabled_setup); + struct ckrm_cpu_class * get_default_cpu_class(void) { return (&default_cpu_class_obj); } @@ -28,7 +50,10 @@ struct ckrm_cpu_class * get_default_cpu_class(void) { /* CVT Management */ /*******************************************************/ -static inline void check_inactive_class(ckrm_lrq_t * lrq,CVT_t cur_cvt) +//an absolute bonus of 200ms for classes when reactivated +#define INTERACTIVE_BONUS(lrq) ((200*NSEC_PER_MS)/local_class_weight(lrq)) + +static void check_inactive_class(ckrm_lrq_t * lrq,CVT_t cur_cvt) { CVT_t min_cvt; CVT_t bonus; @@ -37,6 +62,7 @@ static inline void check_inactive_class(ckrm_lrq_t * lrq,CVT_t cur_cvt) if (unlikely(! cur_cvt)) return; +#define INTERACTIVE_BONUS_SUPPORT 1 #ifndef INTERACTIVE_BONUS_SUPPORT #warning "ACB taking out interactive bonus calculation" bonus = 0; @@ -50,51 +76,40 @@ static inline void check_inactive_class(ckrm_lrq_t * lrq,CVT_t cur_cvt) #endif //cvt can't be negative - if (cur_cvt > bonus) + if (likely(cur_cvt > bonus)) min_cvt = cur_cvt - bonus; else min_cvt = 0; - - if (lrq->local_cvt < min_cvt) { + + if (lrq->local_cvt < min_cvt) { + // if (lrq->local_cvt < min_cvt && ! 
lrq_nr_running(lrq)) { CVT_t lost_cvt; - lost_cvt = scale_cvt(min_cvt - lrq->local_cvt,lrq); + if (unlikely(lrq->local_cvt == 0)) { + lrq->local_cvt = cur_cvt; + return; + } + lost_cvt = min_cvt - lrq->local_cvt; + lost_cvt *= local_class_weight(lrq); lrq->local_cvt = min_cvt; + BUG_ON(lost_cvt < 0); /* add what the class lost to its savings*/ - lrq->savings += lost_cvt; +#if 1 /*zhq debugging*/ + lrq->savings += lost_cvt; +#endif if (lrq->savings > MAX_SAVINGS) lrq->savings = MAX_SAVINGS; - } else if (lrq->savings) { - /* - *if a class saving and falling behind - * then start to use it saving in a leaking bucket way - */ - CVT_t savings_used; - - savings_used = scale_cvt((lrq->local_cvt - min_cvt),lrq); - if (savings_used > lrq->savings) - savings_used = lrq->savings; - - if (savings_used > SAVINGS_LEAK_SPEED) - savings_used = SAVINGS_LEAK_SPEED; - - BUG_ON(lrq->savings < savings_used); - lrq->savings -= savings_used; - unscale_cvt(savings_used,lrq); - BUG_ON(lrq->local_cvt < savings_used); -#ifndef CVT_SAVINGS_SUPPORT -#warning "ACB taking out cvt saving" -#else - lrq->local_cvt -= savings_used; +#if 0 /* zhq debugging*/ + printk("lrq= %x savings: %llu lost= %llu\n",(int)lrq,lrq->savings,lost_cvt); #endif - } + } } /* * return the max_cvt of all the classes */ -static inline CVT_t get_max_cvt(int this_cpu) +CVT_t get_max_cvt(int this_cpu) { struct ckrm_cpu_class *clsptr; ckrm_lrq_t * lrq; @@ -102,7 +117,6 @@ static inline CVT_t get_max_cvt(int this_cpu) max_cvt = 0; - /*update class time, at the same time get max_cvt */ list_for_each_entry(clsptr, &active_cpu_classes, links) { lrq = get_ckrm_lrq(clsptr, this_cpu); if (lrq->local_cvt > max_cvt) @@ -112,6 +126,23 @@ static inline CVT_t get_max_cvt(int this_cpu) return max_cvt; } +CVT_t get_min_cvt(int this_cpu) +{ + struct ckrm_cpu_class *clsptr; + ckrm_lrq_t * lrq; + CVT_t max_cvt; + + max_cvt = 0xFFFFFFFFFFFFFLLU; + + list_for_each_entry(clsptr, &active_cpu_classes, links) { + lrq = get_ckrm_lrq(clsptr, this_cpu); + if (lrq->local_cvt < max_cvt) + max_cvt = lrq->local_cvt; + } + + return max_cvt; +} + /** * update_class_cputime - updates cvt of inactive classes * -- an inactive class shouldn't starve others when it comes back @@ -120,7 +151,7 @@ static inline CVT_t get_max_cvt(int this_cpu) * * class_list_lock must have been acquired */ -void update_class_cputime(int this_cpu) +void update_class_cputime(int this_cpu, int idle) { struct ckrm_cpu_class *clsptr; ckrm_lrq_t * lrq; @@ -178,24 +209,45 @@ void update_class_cputime(int this_cpu) /*******************************************************/ /* PID load balancing stuff */ /*******************************************************/ -#define PID_SAMPLE_T 32 #define PID_KP 20 #define PID_KI 60 #define PID_KD 20 +/* + * runqueue load is the local_weight of all the classes on this cpu + * must be called with class_list_lock held + */ +static unsigned long ckrm_cpu_load(int cpu) +{ + struct ckrm_cpu_class *clsptr; + ckrm_lrq_t* lrq; + struct ckrm_cpu_demand_stat* l_stat; + int total_load = 0; + int load; + + list_for_each_entry(clsptr,&active_cpu_classes,links) { + lrq = get_ckrm_lrq(clsptr,cpu); + l_stat = get_cls_local_stat(clsptr,cpu); + + load = WEIGHT_TO_SHARE(lrq->local_weight); + + if (l_stat->cpu_demand < load) + load = l_stat->cpu_demand; + total_load += load; + } + return total_load; +} + + /** * sample pid load periodically */ + void ckrm_load_sample(ckrm_load_t* pid,int cpu) { long load; long err; - if (jiffies % PID_SAMPLE_T) - return; - - adjust_local_weight(); - load = 
ckrm_cpu_load(cpu); err = load - pid->load_p; pid->load_d = err; @@ -205,7 +257,7 @@ void ckrm_load_sample(ckrm_load_t* pid,int cpu) pid->load_i /= 10; } -long pid_get_pressure(ckrm_load_t* ckrm_load, int local_group) +long ckrm_get_pressure(ckrm_load_t* ckrm_load, int local_group) { long pressure; pressure = ckrm_load->load_p * PID_KP; @@ -214,3 +266,58 @@ long pid_get_pressure(ckrm_load_t* ckrm_load, int local_group) pressure /= 100; return pressure; } + +/* + * called after a task is switched out. Update the local cvt accounting + * we need to stick with long instead of long long due to nonexistent + * 64-bit division + */ +void update_local_cvt(struct task_struct *p, unsigned long nsec) +{ + ckrm_lrq_t * lrq = get_task_lrq(p); + unsigned long cvt_inc; + + /* + * consume from savings if eshare is larger than egrt + */ + if (lrq->savings && lrq->over_weight) { + unsigned long savings_used; + + savings_used = nsec; + savings_used >>= CKRM_WEIGHT_SHIFT; + savings_used *= lrq->over_weight; + if (savings_used > lrq->savings) + savings_used = lrq->savings; + lrq->savings -= savings_used; + } + + //BUG_ON(local_class_weight(lrq) == 0); + cvt_inc = nsec / local_class_weight(lrq); + + /* + * For a certain processor, CKRM allocates CPU time propotional + * to the class's local_weight. So once a class consumed nsec, + * it will wait for X (nsec) for its next turn. + * + * X is calculated based on the following fomular + * nsec / local_weight < X / (CKRM_MAX_WEIGHT - local_weight) + * if local_weight is small, then approximated as + * nsec / local_weight < X / (CKRM_MAX_WEIGHT) + */ +#define CVT_STARVATION_LIMIT (200LL*NSEC_PER_MS) +#define CVT_STARVATION_INC_LIMIT (CVT_STARVATION_LIMIT >> CKRM_WEIGHT_SHIFT) + + if (unlikely(lrq->skewed_weight)) { + unsigned long long starvation_limit = CVT_STARVATION_INC_LIMIT; + + starvation_limit *= local_class_weight(lrq); + if (unlikely(cvt_inc > starvation_limit)) + cvt_inc = nsec / lrq->skewed_weight; + } + + /* now update the CVT accounting */ + + lrq->local_cvt += cvt_inc; + lrq->uncounted_ns += nsec; + update_class_priority(lrq); +} diff --git a/kernel/kexec.c b/kernel/kexec.c new file mode 100644 index 000000000..b59023fbf --- /dev/null +++ b/kernel/kexec.c @@ -0,0 +1,640 @@ +/* + * kexec.c - kexec system call + * Copyright (C) 2002-2004 Eric Biederman + * + * This source code is licensed under the GNU General Public License, + * Version 2. See the file COPYING for more details. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* + * When kexec transitions to the new kernel there is a one-to-one + * mapping between physical and virtual addresses. On processors + * where you can disable the MMU this is trivial, and easy. For + * others it is still a simple predictable page table to setup. + * + * In that environment kexec copies the new kernel to its final + * resting place. This means I can only support memory whose + * physical address can fit in an unsigned long. In particular + * addresses where (pfn << PAGE_SHIFT) > ULONG_MAX cannot be handled. + * If the assembly stub has more restrictive requirements + * KEXEC_SOURCE_MEMORY_LIMIT and KEXEC_DEST_MEMORY_LIMIT can be + * defined more restrictively in . + * + * The code for the transition from the current kernel to the + * the new kernel is placed in the control_code_buffer, whose size + * is given by KEXEC_CONTROL_CODE_SIZE. 
In the best case only a single + * page of memory is necessary, but some architectures require more. + * Because this memory must be identity mapped in the transition from + * virtual to physical addresses it must live in the range + * 0 - TASK_SIZE, as only the user space mappings are arbitrarily + * modifiable. + * + * The assembly stub in the control code buffer is passed a linked list + * of descriptor pages detailing the source pages of the new kernel, + * and the destination addresses of those source pages. As this data + * structure is not used in the context of the current OS, it must + * be self-contained. + * + * The code has been made to work with highmem pages and will use a + * destination page in its final resting place (if it happens + * to allocate it). The end product of this is that most of the + * physical address space, and most of RAM can be used. + * + * Future directions include: + * - allocating a page table with the control code buffer identity + * mapped, to simplify machine_kexec and make kexec_on_panic more + * reliable. + */ + +/* + * KIMAGE_NO_DEST is an impossible destination address..., for + * allocating pages whose destination address we do not care about. + */ +#define KIMAGE_NO_DEST (-1UL) + +static int kimage_is_destination_range( + struct kimage *image, unsigned long start, unsigned long end); +static struct page *kimage_alloc_page(struct kimage *image, unsigned int gfp_mask, unsigned long dest); + + +static int kimage_alloc(struct kimage **rimage, + unsigned long nr_segments, struct kexec_segment *segments) +{ + int result; + struct kimage *image; + size_t segment_bytes; + unsigned long i; + + /* Allocate a controlling structure */ + result = -ENOMEM; + image = kmalloc(sizeof(*image), GFP_KERNEL); + if (!image) { + goto out; + } + memset(image, 0, sizeof(*image)); + image->head = 0; + image->entry = &image->head; + image->last_entry = &image->head; + + /* Initialize the list of control pages */ + INIT_LIST_HEAD(&image->control_pages); + + /* Initialize the list of destination pages */ + INIT_LIST_HEAD(&image->dest_pages); + + /* Initialize the list of unuseable pages */ + INIT_LIST_HEAD(&image->unuseable_pages); + + /* Read in the segments */ + image->nr_segments = nr_segments; + segment_bytes = nr_segments * sizeof*segments; + result = copy_from_user(image->segment, segments, segment_bytes); + if (result) + goto out; + + /* + * Verify we have good destination addresses. The caller is + * responsible for making certain we don't attempt to load + * the new image into invalid or reserved areas of RAM. This + * just verifies it is an address we can use. + */ + result = -EADDRNOTAVAIL; + for (i = 0; i < nr_segments; i++) { + unsigned long mend; + mend = ((unsigned long)(image->segment[i].mem)) + + image->segment[i].memsz; + if (mend >= KEXEC_DESTINATION_MEMORY_LIMIT) + goto out; + } + + /* + * Find a location for the control code buffer, and add it + * the vector of segments so that it's pages will also be + * counted as destination pages. 
+ */ + result = -ENOMEM; + image->control_code_page = kimage_alloc_control_pages(image, + get_order(KEXEC_CONTROL_CODE_SIZE)); + if (!image->control_code_page) { + printk(KERN_ERR "Could not allocate control_code_buffer\n"); + goto out; + } + + result = 0; + out: + if (result == 0) { + *rimage = image; + } else { + kfree(image); + } + return result; +} + +static int kimage_is_destination_range( + struct kimage *image, unsigned long start, unsigned long end) +{ + unsigned long i; + + for (i = 0; i < image->nr_segments; i++) { + unsigned long mstart, mend; + mstart = (unsigned long)image->segment[i].mem; + mend = mstart + image->segment[i].memsz; + if ((end > mstart) && (start < mend)) { + return 1; + } + } + return 0; +} + +static struct page *kimage_alloc_pages(unsigned int gfp_mask, unsigned int order) +{ + struct page *pages; + pages = alloc_pages(gfp_mask, order); + if (pages) { + unsigned int count, i; + pages->mapping = NULL; + pages->private = order; + count = 1 << order; + for(i = 0; i < count; i++) { + SetPageReserved(pages + i); + } + } + return pages; +} + +static void kimage_free_pages(struct page *page) +{ + unsigned int order, count, i; + order = page->private; + count = 1 << order; + for(i = 0; i < count; i++) { + ClearPageReserved(page + i); + } + __free_pages(page, order); +} + +static void kimage_free_page_list(struct list_head *list) +{ + struct list_head *pos, *next; + list_for_each_safe(pos, next, list) { + struct page *page; + + page = list_entry(pos, struct page, lru); + list_del(&page->lru); + + kimage_free_pages(page); + } +} + +struct page *kimage_alloc_control_pages(struct kimage *image, unsigned int order) +{ + /* Control pages are special, they are the intermediaries + * that are needed while we copy the rest of the pages + * to their final resting place. As such they must + * not conflict with either the destination addresses + * or memory the kernel is already using. + * + * The only case where we really need more than one of + * these are for architectures where we cannot disable + * the MMU and must instead generate an identity mapped + * page table for all of the memory. + * + * At worst this runs in O(N) of the image size. + */ + struct list_head extra_pages; + struct page *pages; + unsigned int count; + + count = 1 << order; + INIT_LIST_HEAD(&extra_pages); + + /* Loop while I can allocate a page and the page allocated + * is a destination page. + */ + do { + unsigned long pfn, epfn, addr, eaddr; + pages = kimage_alloc_pages(GFP_KERNEL, order); + if (!pages) + break; + pfn = page_to_pfn(pages); + epfn = pfn + count; + addr = pfn << PAGE_SHIFT; + eaddr = epfn << PAGE_SHIFT; + if ((epfn >= (KEXEC_CONTROL_MEMORY_LIMIT >> PAGE_SHIFT)) || + kimage_is_destination_range(image, addr, eaddr)) + { + list_add(&pages->lru, &extra_pages); + pages = NULL; + } + } while(!pages); + if (pages) { + /* Remember the allocated page... */ + list_add(&pages->lru, &image->control_pages); + + /* Because the page is already in it's destination + * location we will never allocate another page at + * that address. Therefore kimage_alloc_pages + * will not return it (again) and we don't need + * to give it an entry in image->segment[]. + */ + } + /* Deal with the destination pages I have inadvertently allocated. + * + * Ideally I would convert multi-page allocations into single + * page allocations, and add everyting to image->dest_pages. + * + * For now it is simpler to just free the pages. 
+ */ + kimage_free_page_list(&extra_pages); + return pages; + +} + +static int kimage_add_entry(struct kimage *image, kimage_entry_t entry) +{ + if (*image->entry != 0) { + image->entry++; + } + if (image->entry == image->last_entry) { + kimage_entry_t *ind_page; + struct page *page; + page = kimage_alloc_page(image, GFP_KERNEL, KIMAGE_NO_DEST); + if (!page) { + return -ENOMEM; + } + ind_page = page_address(page); + *image->entry = virt_to_phys(ind_page) | IND_INDIRECTION; + image->entry = ind_page; + image->last_entry = + ind_page + ((PAGE_SIZE/sizeof(kimage_entry_t)) - 1); + } + *image->entry = entry; + image->entry++; + *image->entry = 0; + return 0; +} + +static int kimage_set_destination( + struct kimage *image, unsigned long destination) +{ + int result; + + destination &= PAGE_MASK; + result = kimage_add_entry(image, destination | IND_DESTINATION); + if (result == 0) { + image->destination = destination; + } + return result; +} + + +static int kimage_add_page(struct kimage *image, unsigned long page) +{ + int result; + + page &= PAGE_MASK; + result = kimage_add_entry(image, page | IND_SOURCE); + if (result == 0) { + image->destination += PAGE_SIZE; + } + return result; +} + + +static void kimage_free_extra_pages(struct kimage *image) +{ + /* Walk through and free any extra destination pages I may have */ + kimage_free_page_list(&image->dest_pages); + + /* Walk through and free any unuseable pages I have cached */ + kimage_free_page_list(&image->unuseable_pages); + +} +static int kimage_terminate(struct kimage *image) +{ + int result; + + result = kimage_add_entry(image, IND_DONE); + if (result == 0) { + /* Point at the terminating element */ + image->entry--; + kimage_free_extra_pages(image); + } + return result; +} + +#define for_each_kimage_entry(image, ptr, entry) \ + for (ptr = &image->head; (entry = *ptr) && !(entry & IND_DONE); \ + ptr = (entry & IND_INDIRECTION)? \ + phys_to_virt((entry & PAGE_MASK)): ptr +1) + +static void kimage_free_entry(kimage_entry_t entry) +{ + struct page *page; + + page = pfn_to_page(entry >> PAGE_SHIFT); + kimage_free_pages(page); +} + +static void kimage_free(struct kimage *image) +{ + kimage_entry_t *ptr, entry; + kimage_entry_t ind = 0; + + if (!image) + return; + kimage_free_extra_pages(image); + for_each_kimage_entry(image, ptr, entry) { + if (entry & IND_INDIRECTION) { + /* Free the previous indirection page */ + if (ind & IND_INDIRECTION) { + kimage_free_entry(ind); + } + /* Save this indirection page until we are + * done with it. + */ + ind = entry; + } + else if (entry & IND_SOURCE) { + kimage_free_entry(entry); + } + } + /* Free the final indirection page */ + if (ind & IND_INDIRECTION) { + kimage_free_entry(ind); + } + + /* Handle any machine specific cleanup */ + machine_kexec_cleanup(image); + + /* Free the kexec control pages... 
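The entry list built by kimage_set_destination(), kimage_add_page() and kimage_terminate() above is simply a sequence of page-aligned physical addresses with flag bits in the low bits. A conceptual sketch of the list for a small load follows; the addresses are invented, and the real list lives in kernel-allocated pages chained via IND_INDIRECTION rather than in a static array:

    /* hypothetical entry list for a two-page segment destined for 1 MB */
    kimage_entry_t sketch[] = {
            0x00100000 | IND_DESTINATION, /* following sources land at 1 MB upward */
            0x07a41000 | IND_SOURCE,      /* page currently holding data for 0x00100000 */
            0x07a42000 | IND_SOURCE,      /* page currently holding data for 0x00101000 */
            /* an entry of the form <page> | IND_INDIRECTION continues the list
             * in a fresh page whenever the current one fills up */
            IND_DONE,                     /* appended by kimage_terminate() */
    };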
*/ + kimage_free_page_list(&image->control_pages); + kfree(image); +} + +static kimage_entry_t *kimage_dst_used(struct kimage *image, unsigned long page) +{ + kimage_entry_t *ptr, entry; + unsigned long destination = 0; + + for_each_kimage_entry(image, ptr, entry) { + if (entry & IND_DESTINATION) { + destination = entry & PAGE_MASK; + } + else if (entry & IND_SOURCE) { + if (page == destination) { + return ptr; + } + destination += PAGE_SIZE; + } + } + return 0; +} + +static struct page *kimage_alloc_page(struct kimage *image, unsigned int gfp_mask, unsigned long destination) +{ + /* + * Here we implement safeguards to ensure that a source page + * is not copied to its destination page before the data on + * the destination page is no longer useful. + * + * To do this we maintain the invariant that a source page is + * either its own destination page, or it is not a + * destination page at all. + * + * That is slightly stronger than required, but the proof + * that no problems will not occur is trivial, and the + * implementation is simply to verify. + * + * When allocating all pages normally this algorithm will run + * in O(N) time, but in the worst case it will run in O(N^2) + * time. If the runtime is a problem the data structures can + * be fixed. + */ + struct page *page; + unsigned long addr; + + /* + * Walk through the list of destination pages, and see if I + * have a match. + */ + list_for_each_entry(page, &image->dest_pages, lru) { + addr = page_to_pfn(page) << PAGE_SHIFT; + if (addr == destination) { + list_del(&page->lru); + return page; + } + } + page = NULL; + while (1) { + kimage_entry_t *old; + + /* Allocate a page, if we run out of memory give up */ + page = kimage_alloc_pages(gfp_mask, 0); + if (!page) { + return 0; + } + /* If the page cannot be used file it away */ + if (page_to_pfn(page) > (KEXEC_SOURCE_MEMORY_LIMIT >> PAGE_SHIFT)) { + list_add(&page->lru, &image->unuseable_pages); + continue; + } + addr = page_to_pfn(page) << PAGE_SHIFT; + + /* If it is the destination page we want use it */ + if (addr == destination) + break; + + /* If the page is not a destination page use it */ + if (!kimage_is_destination_range(image, addr, addr + PAGE_SIZE)) + break; + + /* + * I know that the page is someones destination page. + * See if there is already a source page for this + * destination page. And if so swap the source pages. + */ + old = kimage_dst_used(image, addr); + if (old) { + /* If so move it */ + unsigned long old_addr; + struct page *old_page; + + old_addr = *old & PAGE_MASK; + old_page = pfn_to_page(old_addr >> PAGE_SHIFT); + copy_highpage(page, old_page); + *old = addr | (*old & ~PAGE_MASK); + + /* The old page I have found cannot be a + * destination page, so return it. + */ + addr = old_addr; + page = old_page; + break; + } + else { + /* Place the page on the destination list I + * will use it later. 
+ */ + list_add(&page->lru, &image->dest_pages); + } + } + return page; +} + +static int kimage_load_segment(struct kimage *image, + struct kexec_segment *segment) +{ + unsigned long mstart; + int result; + unsigned long offset; + unsigned long offset_end; + unsigned char *buf; + + result = 0; + buf = segment->buf; + mstart = (unsigned long)segment->mem; + + offset_end = segment->memsz; + + result = kimage_set_destination(image, mstart); + if (result < 0) { + goto out; + } + for (offset = 0; offset < segment->memsz; offset += PAGE_SIZE) { + struct page *page; + char *ptr; + size_t size, leader; + page = kimage_alloc_page(image, GFP_HIGHUSER, mstart + offset); + if (page == 0) { + result = -ENOMEM; + goto out; + } + result = kimage_add_page(image, page_to_pfn(page) << PAGE_SHIFT); + if (result < 0) { + goto out; + } + ptr = kmap(page); + if (segment->bufsz < offset) { + /* We are past the end zero the whole page */ + memset(ptr, 0, PAGE_SIZE); + kunmap(page); + continue; + } + size = PAGE_SIZE; + leader = 0; + if ((offset == 0)) { + leader = mstart & ~PAGE_MASK; + } + if (leader) { + /* We are on the first page zero the unused portion */ + memset(ptr, 0, leader); + size -= leader; + ptr += leader; + } + if (size > (segment->bufsz - offset)) { + size = segment->bufsz - offset; + } + if (size < (PAGE_SIZE - leader)) { + /* zero the trailing part of the page */ + memset(ptr + size, 0, (PAGE_SIZE - leader) - size); + } + result = copy_from_user(ptr, buf + offset, size); + kunmap(page); + if (result) { + result = (result < 0) ? result : -EIO; + goto out; + } + } + out: + return result; +} + +/* + * Exec Kernel system call: for obvious reasons only root may call it. + * + * This call breaks up into three pieces. + * - A generic part which loads the new kernel from the current + * address space, and very carefully places the data in the + * allocated pages. + * + * - A generic part that interacts with the kernel and tells all of + * the devices to shut down. Preventing on-going dmas, and placing + * the devices in a consistent state so a later kernel can + * reinitialize them. + * + * - A machine specific part that includes the syscall number + * and the copies the image to it's final destination. And + * jumps into the image at entry. + * + * kexec does not sync, or unmount filesystems so if you need + * that to happen you need to do that yourself. + */ +struct kimage *kexec_image = NULL; + +asmlinkage long sys_kexec_load(unsigned long entry, unsigned long nr_segments, + struct kexec_segment *segments, unsigned long flags) +{ + struct kimage *image; + int result; + + /* We only trust the superuser with rebooting the system. */ + if (!capable(CAP_SYS_BOOT)) + return -EPERM; + + /* + * In case we need just a little bit of special behavior for + * reboot on panic. 
+ */ + if (flags != 0) + return -EINVAL; + + if (nr_segments > KEXEC_SEGMENT_MAX) + return -EINVAL; + + image = NULL; + result = 0; + + if (nr_segments > 0) { + unsigned long i; + result = kimage_alloc(&image, nr_segments, segments); + if (result) { + goto out; + } + result = machine_kexec_prepare(image); + if (result) { + goto out; + } + image->start = entry; + for (i = 0; i < nr_segments; i++) { + result = kimage_load_segment(image, &image->segment[i]); + if (result) { + goto out; + } + } + result = kimage_terminate(image); + if (result) { + goto out; + } + } + + image = xchg(&kexec_image, image); + + out: + kimage_free(image); + return result; +} diff --git a/kernel/sched.c b/kernel/sched.c index 20b09215e..42af615a2 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -45,6 +45,8 @@ #include #include +#include +#include #ifdef CONFIG_NUMA #define cpu_to_node_mask(cpu) node_to_cpumask(cpu_to_node(cpu)) @@ -205,8 +207,6 @@ unsigned int task_timeslice(task_t *p) */ typedef struct runqueue runqueue_t; -#include -#include /* * This is the main, per-CPU runqueue data structure. @@ -227,17 +227,19 @@ struct runqueue { unsigned long cpu_load; #endif unsigned long long nr_switches, nr_preempt; - unsigned long expired_timestamp, nr_uninterruptible; + unsigned long nr_uninterruptible; unsigned long long timestamp_last_tick; task_t *curr, *idle; struct mm_struct *prev_mm; #ifdef CONFIG_CKRM_CPU_SCHEDULE struct classqueue_struct classqueue; ckrm_load_t ckrm_load; + ckrm_lrq_t dflt_lrq; /* local runqueue of the default class */ #else prio_array_t *active, *expired, arrays[2]; -#endif + unsigned long expired_timestamp; int best_expired_prio; +#endif atomic_t nr_iowait; #ifdef CONFIG_SMP @@ -320,10 +322,72 @@ static inline void rq_unlock(runqueue_t *rq) spin_unlock_irq(&rq->lock); } +static inline void idle_balance(int this_cpu, runqueue_t *this_rq); +static inline void wake_sleeping_dependent(int cpu, runqueue_t *rq); + #ifdef CONFIG_CKRM_CPU_SCHEDULE + +#define ckrm_rq_cpu_disabled(rq) (!rq->classqueue.enabled) +#define ckrm_rq_cpu_enabled(rq) ( rq->classqueue.enabled) + +static inline void class_enqueue_task(struct task_struct *p, + prio_array_t * array) +{ + ckrm_lrq_t *lrq; + int effective_prio; + + if (ckrm_rq_cpu_disabled(task_rq(p))) + return; + + lrq = get_task_lrq(p); + // BUG_ON(lrq==NULL); + + cpu_demand_event(&p->demand_stat,CPU_DEMAND_ENQUEUE,0); + lrq->lrq_load += task_load(p); + + if ((p->prio < lrq->top_priority) && (array == lrq->active)) + set_top_priority(lrq, p->prio); + + if (! 
cls_in_classqueue(&lrq->classqueue_linkobj)) { + cpu_demand_event(get_task_lrq_stat(p),CPU_DEMAND_ENQUEUE,0); + effective_prio = get_effective_prio(lrq); + classqueue_enqueue(lrq->classqueue, &lrq->classqueue_linkobj, + effective_prio); + } + +} + +static inline void class_dequeue_task(struct task_struct *p, + prio_array_t * array) +{ + ckrm_lrq_t *lrq; + unsigned long load; + + if (ckrm_rq_cpu_disabled(task_rq(p))) + return; + + lrq = get_task_lrq(p); + load = task_load(p); + + // BUG_ON(lrq->lrq_load < load); + + lrq->lrq_load -= load; + + cpu_demand_event(&p->demand_stat,CPU_DEMAND_DEQUEUE,0); + + if ((array == lrq->active) && (p->prio == lrq->top_priority) + && list_empty(&(array->queue[p->prio]))) + set_top_priority(lrq,find_next_bit(array->bitmap, MAX_PRIO, + p->prio)); +} + static inline ckrm_lrq_t *rq_get_next_class(struct runqueue *rq) { - cq_node_t *node = classqueue_get_head(&rq->classqueue); + cq_node_t *node; + + if (ckrm_rq_cpu_disabled(rq)) + return &rq->dflt_lrq; + node = classqueue_get_head(&rq->classqueue); return ((node) ? class_list_entry(node) : NULL); } @@ -342,51 +406,189 @@ CVT_t get_local_cur_cvt(int cpu) return 0; } -static inline struct task_struct * rq_get_next_task(struct runqueue* rq) +static inline struct task_struct * rq_get_next_task(struct runqueue* rq, + int cpu) { prio_array_t *array; struct task_struct *next; ckrm_lrq_t *queue; int idx; - int cpu = smp_processor_id(); - // it is guaranteed be the ( rq->nr_running > 0 ) check in - // schedule that a task will be found. + if (ckrm_rq_cpu_disabled(rq)) { + /* original code from schedule(void) + * see also code in non CKRM configuration + */ + struct list_head *array_queue; + ckrm_lrq_t *lrq = get_ckrm_lrq(get_default_cpu_class(),cpu); + + if (unlikely(!rq->nr_running)) { + idle_balance(cpu, rq); + if (!rq->nr_running) { + rq->dflt_lrq.expired_timestamp = 0; + wake_sleeping_dependent(cpu, rq); + return NULL; + } + } + + array = lrq->active; + if (unlikely(!array->nr_active)) { + /* + * Switch the active and expired arrays. + */ + lrq->active = lrq->expired; + lrq->expired = array; + array = lrq->active; + lrq->expired_timestamp = 0; + lrq->best_expired_prio = MAX_PRIO; + } + + idx = sched_find_first_bit(array->bitmap); + array_queue = array->queue + idx; + next = list_entry(array_queue->next, task_t, run_list); + return next; + } + /*-- CKRM SCHEDULER --*/ + retry_next_class: + /* we can't use (rq->nr_running == 0) to declare idleness + * first we have to make sure that the class runqueue is properly + * processed. This is due to two facts/requirements: + * (a) when the last task is removed form an lrq we do not remove + * the lrq from the class runqueue. As a result the lrq is + * selected again and we can perform necessary + * expired switches. 
+ * (b) perform outstanding expired switches + * + */ + queue = rq_get_next_class(rq); - // BUG_ON( !queue ); + if (unlikely(queue == NULL)) { + idle_balance(cpu, rq); + if (!rq->nr_running) { + rq->dflt_lrq.expired_timestamp = 0; + wake_sleeping_dependent(cpu, rq); + return NULL; + } + goto retry_next_class; // try again + } array = queue->active; if (unlikely(!array->nr_active)) { queue->active = queue->expired; queue->expired = array; + array = queue->active; queue->expired_timestamp = 0; - if (queue->active->nr_active) + if (array->nr_active) set_top_priority(queue, - find_first_bit(queue->active->bitmap, MAX_PRIO)); + find_first_bit(array->bitmap,MAX_PRIO)); else { + /* since we do not dequeue a lrq when it becomes empty + * but rely on the switching mechanism, we must dequeue + * at this point + */ classqueue_dequeue(queue->classqueue, &queue->classqueue_linkobj); - cpu_demand_event(get_rq_local_stat(queue,cpu),CPU_DEMAND_DEQUEUE,0); + cpu_demand_event(get_rq_local_stat(queue,cpu), + CPU_DEMAND_DEQUEUE,0); } goto retry_next_class; } - // BUG_ON(!array->nr_active); idx = queue->top_priority; - // BUG_ON (idx == MAX_PRIO); + //BUG_ON(!array->nr_active); + //BUG_ON(idx == MAX_PRIO); + //BUG_ON(list_empty(array->queue+idx)); next = task_list_entry(array->queue[idx].next); return next; } + +static inline void ckrm_account_task(struct runqueue* rq, + struct task_struct *prev, + unsigned long long now) +{ + if ((prev != rq->idle) && ckrm_rq_cpu_enabled(rq) ) { + unsigned long long run = now - prev->timestamp; + ckrm_lrq_t * lrq = get_task_lrq(prev); + + lrq->lrq_load -= task_load(prev); + cpu_demand_event(&prev->demand_stat,CPU_DEMAND_DESCHEDULE,run); + lrq->lrq_load += task_load(prev); + + cpu_demand_event(get_task_lrq_stat(prev),CPU_DEMAND_DESCHEDULE,run); + update_local_cvt(prev, run); + } + +} + +#ifdef CONFIG_SMP +#define COND_SMP(dflt,cond) (cond) +#else +#define COND_SMP(dflt,cond) (dflt) +#endif + +static inline void ckrm_sched_tick(unsigned long j,int this_cpu, int idle, + runqueue_t *rq) +{ + /* first determine whether we have to do anything + * without grabing the global lock + */ + + int sample, update; + +#ifdef __SIMULATOR__ + if ((this_cpu == 0) && (j % 1000) == 0) { + ckrm_cpu_monitor(1); + } +#endif + + if (ckrm_rq_cpu_disabled(rq)) + return; + + update = (j % CVT_UPDATE_TICK); + sample = COND_SMP(1,(j % CPU_PID_CTRL_TICK)); + +// avoid taking the global class_list lock on every tick + if (likely(update && sample)) + return; // nothing to be done; + + read_lock(&class_list_lock); + +#ifdef CONFIG_SMP + if (sample==0) { + ckrm_load_sample(rq_ckrm_load(rq),this_cpu); + } +#endif + + if (update==0) { + classqueue_update_base(get_cpu_classqueue(this_cpu)); + update_class_cputime(this_cpu,idle); + // occasionally we need to call the weight adjustment + // for SMP systems + if (COND_SMP(0,(this_cpu==0))) + adjust_local_weight(); + } + + read_unlock(&class_list_lock); +} + #else /*! 
CONFIG_CKRM_CPU_SCHEDULE*/ -static inline struct task_struct * rq_get_next_task(struct runqueue* rq) +static inline struct task_struct * rq_get_next_task(struct runqueue* rq, + int cpu) { prio_array_t *array; struct list_head *queue; int idx; + if (unlikely(!rq->nr_running)) { + idle_balance(cpu, rq); + if (!rq->nr_running) { + rq->expired_timestamp = 0; + wake_sleeping_dependent(cpu, rq); + return NULL; + } + } array = rq->active; if (unlikely(!array->nr_active)) { /* @@ -404,11 +606,17 @@ static inline struct task_struct * rq_get_next_task(struct runqueue* rq) return list_entry(queue->next, task_t, run_list); } -static inline void class_enqueue_task(struct task_struct* p, prio_array_t *array) { } -static inline void class_dequeue_task(struct task_struct* p, prio_array_t *array) { } +static inline void class_enqueue_task(struct task_struct* p, + prio_array_t *array) { } +static inline void class_dequeue_task(struct task_struct* p, + prio_array_t *array) { } static inline void init_cpu_classes(void) { } +static inline void ckrm_sched_tick(int j,int this_cpu,int idle, void* arg) {} +static inline void ckrm_account_task(struct runqueue* rq, struct + task_struct *prev, + unsigned long long now) { } #define rq_ckrm_load(rq) NULL -static inline void ckrm_sched_tick(int j,int this_cpu,void* name) {} + #endif /* CONFIG_CKRM_CPU_SCHEDULE */ /* @@ -1558,261 +1766,129 @@ int can_migrate_task(task_t *p, runqueue_t *rq, int this_cpu, return 1; } -#ifdef CONFIG_CKRM_CPU_SCHEDULE -static inline int ckrm_preferred_task(task_t *tmp,long min, long max, - int phase, enum idle_type idle) -{ - long pressure = task_load(tmp); - - if (pressure > max) - return 0; - - if ((idle == NOT_IDLE) && ! phase && (pressure <= min)) - return 0; - return 1; -} - /* - * move tasks for a specic local class - * return number of tasks pulled + * move_tasks tries to move up to max_nr_move tasks from busiest to this_rq, + * as part of a balancing operation within "domain". Returns the number of + * tasks moved. + * + * Called with both runqueues locked. */ -static inline int ckrm_cls_move_tasks(ckrm_lrq_t* src_lrq,ckrm_lrq_t*dst_lrq, - runqueue_t *this_rq, - runqueue_t *busiest, - struct sched_domain *sd, - int this_cpu, - enum idle_type idle, - long* pressure_imbalance) +static int move_tasks(runqueue_t *this_rq, int this_cpu, runqueue_t *busiest, + unsigned long max_nr_move, struct sched_domain *sd, + enum idle_type idle) { prio_array_t *array, *dst_array; struct list_head *head, *curr; + int idx, pulled = 0; task_t *tmp; - int idx; - int pulled = 0; - int phase = -1; - long pressure_min, pressure_max; - /*hzheng: magic : 90% balance is enough*/ - long balance_min = *pressure_imbalance / 10; -/* - * we don't want to migrate tasks that will reverse the balance - * or the tasks that make too small difference - */ -#define CKRM_BALANCE_MAX_RATIO 100 -#define CKRM_BALANCE_MIN_RATIO 1 - start: - phase ++; +#if CONFIG_CKRM_CPU_SCHEDULE + /* need to distinguish between the runqueues and the class + * local runqueues. + * we know we can get here only if the dflt class is present + */ + ckrm_lrq_t *l_this_rq = &this_rq->dflt_lrq; + ckrm_lrq_t *l_busiest = &busiest->dflt_lrq; +#else +#define l_busiest busiest +#define l_this_rq this_rq +#endif + + if (max_nr_move <= 0 || busiest->nr_running <= 1) + goto out; + /* * We first consider expired tasks. Those will likely not be * executed in the near future, and they are most likely to * be cache-cold, thus switching CPUs has the least effect * on them. 
*/ - if (src_lrq->expired->nr_active) { - array = src_lrq->expired; - dst_array = dst_lrq->expired; + if (l_busiest->expired->nr_active) { + array = l_busiest->expired; + dst_array = l_this_rq->expired; } else { - array = src_lrq->active; - dst_array = dst_lrq->active; + array = l_busiest->active; + dst_array = l_this_rq->active; } - - new_array: + +new_array: /* Start searching at priority 0: */ idx = 0; - skip_bitmap: +skip_bitmap: if (!idx) idx = sched_find_first_bit(array->bitmap); else idx = find_next_bit(array->bitmap, MAX_PRIO, idx); if (idx >= MAX_PRIO) { - if (array == src_lrq->expired && src_lrq->active->nr_active) { - array = src_lrq->active; - dst_array = dst_lrq->active; + if (array == l_busiest->expired && l_busiest->active->nr_active) { + array = l_busiest->active; + dst_array = l_this_rq->active; goto new_array; } - if ((! phase) && (! pulled) && (idle != IDLE)) - goto start; //try again - else - goto out; //finished search for this lrq + goto out; } - + head = array->queue + idx; curr = head->prev; - skip_queue: +skip_queue: tmp = list_entry(curr, task_t, run_list); - + curr = curr->prev; - + if (!can_migrate_task(tmp, busiest, this_cpu, sd, idle)) { if (curr != head) goto skip_queue; idx++; goto skip_bitmap; } + pull_task(busiest, array, tmp, this_rq, dst_array, this_cpu); + pulled++; - pressure_min = *pressure_imbalance * CKRM_BALANCE_MIN_RATIO/100; - pressure_max = *pressure_imbalance * CKRM_BALANCE_MAX_RATIO/100; - /* - * skip the tasks that will reverse the balance too much - */ - if (ckrm_preferred_task(tmp,pressure_min,pressure_max,phase,idle)) { - *pressure_imbalance -= task_load(tmp); - pull_task(busiest, array, tmp, - this_rq, dst_array, this_cpu); - pulled++; - - if (*pressure_imbalance <= balance_min) - goto out; + /* We only want to steal up to the prescribed number of tasks. */ + if (pulled < max_nr_move) { + if (curr != head) + goto skip_queue; + idx++; + goto skip_bitmap; } - - if (curr != head) - goto skip_queue; - idx++; - goto skip_bitmap; - out: +out: return pulled; } -static inline long ckrm_rq_imbalance(runqueue_t *this_rq,runqueue_t *dst_rq) -{ - long imbalance; - /* - * make sure after balance, imbalance' > - imbalance/2 - * we don't want the imbalance be reversed too much - */ - imbalance = pid_get_pressure(rq_ckrm_load(dst_rq),0) - - pid_get_pressure(rq_ckrm_load(this_rq),1); - imbalance /= 2; - return imbalance; -} - /* - * try to balance the two runqueues - * - * Called with both runqueues locked. - * if move_tasks is called, it will try to move at least one task over + * find_busiest_group finds and returns the busiest CPU group within the + * domain. It calculates and returns the number of tasks which should be + * moved to restore balance via the imbalance parameter. 
*/ -static int move_tasks(runqueue_t *this_rq, int this_cpu, runqueue_t *busiest, - unsigned long max_nr_move, struct sched_domain *sd, - enum idle_type idle) +static struct sched_group * +find_busiest_group(struct sched_domain *sd, int this_cpu, + unsigned long *imbalance, enum idle_type idle) { - struct ckrm_cpu_class *clsptr,*vip_cls = NULL; - ckrm_lrq_t* src_lrq,*dst_lrq; - long pressure_imbalance, pressure_imbalance_old; - int src_cpu = task_cpu(busiest->curr); - struct list_head *list; - int pulled = 0; - long imbalance; - - imbalance = ckrm_rq_imbalance(this_rq,busiest); + struct sched_group *busiest = NULL, *this = NULL, *group = sd->groups; + unsigned long max_load, avg_load, total_load, this_load, total_pwr; - if ((idle == NOT_IDLE && imbalance <= 0) || busiest->nr_running <= 1) - goto out; + max_load = this_load = total_load = total_pwr = 0; - //try to find the vip class - list_for_each_entry(clsptr,&active_cpu_classes,links) { - src_lrq = get_ckrm_lrq(clsptr,src_cpu); + do { + cpumask_t tmp; + unsigned long load; + int local_group; + int i, nr_cpus = 0; - if (! lrq_nr_running(src_lrq)) - continue; - - if (! vip_cls || cpu_class_weight(vip_cls) < cpu_class_weight(clsptr) ) - { - vip_cls = clsptr; - } - } - - /* - * do search from the most significant class - * hopefully, less tasks will be migrated this way - */ - clsptr = vip_cls; - - move_class: - if (! clsptr) - goto out; - - - src_lrq = get_ckrm_lrq(clsptr,src_cpu); - if (! lrq_nr_running(src_lrq)) - goto other_class; - - dst_lrq = get_ckrm_lrq(clsptr,this_cpu); - - //how much pressure for this class should be transferred - pressure_imbalance = src_lrq->lrq_load * imbalance/src_lrq->local_weight; - if (pulled && ! pressure_imbalance) - goto other_class; - - pressure_imbalance_old = pressure_imbalance; - - //move tasks - pulled += - ckrm_cls_move_tasks(src_lrq,dst_lrq, - this_rq, - busiest, - sd,this_cpu,idle, - &pressure_imbalance); - - /* - * hzheng: 2 is another magic number - * stop balancing if the imbalance is less than 25% of the orig - */ - if (pressure_imbalance <= (pressure_imbalance_old >> 2)) - goto out; - - //update imbalance - imbalance *= pressure_imbalance / pressure_imbalance_old; - other_class: - //who is next? - list = clsptr->links.next; - if (list == &active_cpu_classes) - list = list->next; - clsptr = list_entry(list, typeof(*clsptr), links); - if (clsptr != vip_cls) - goto move_class; - out: - return pulled; -} - -/** - * ckrm_check_balance - is load balancing necessary? 
- * return 0 if load balancing is not necessary - * otherwise return the average load of the system - * also, update nr_group - * - * heuristics: - * no load balancing if it's load is over average - * no load balancing if it's load is far more than the min - * task: - * read the status of all the runqueues - */ -static unsigned long ckrm_check_balance(struct sched_domain *sd, int this_cpu, - enum idle_type idle, int* nr_group) -{ - struct sched_group *group = sd->groups; - unsigned long min_load, max_load, avg_load; - unsigned long total_load, this_load, total_pwr; - - max_load = this_load = total_load = total_pwr = 0; - min_load = 0xFFFFFFFF; - *nr_group = 0; - - do { - cpumask_t tmp; - unsigned long load; - int local_group; - int i, nr_cpus = 0; + local_group = cpu_isset(this_cpu, group->cpumask); /* Tally up the load of all CPUs in the group */ + avg_load = 0; cpus_and(tmp, group->cpumask, cpu_online_map); if (unlikely(cpus_empty(tmp))) goto nextgroup; - avg_load = 0; - local_group = cpu_isset(this_cpu, group->cpumask); - for_each_cpu_mask(i, tmp) { - load = pid_get_pressure(rq_ckrm_load(cpu_rq(i)),local_group); + /* Bias balancing toward cpus of our domain */ + if (local_group) + load = target_load(i); + else + load = source_load(i); + nr_cpus++; avg_load += load; } @@ -1828,386 +1904,86 @@ static unsigned long ckrm_check_balance(struct sched_domain *sd, int this_cpu, if (local_group) { this_load = avg_load; + this = group; goto nextgroup; } else if (avg_load > max_load) { max_load = avg_load; - } - if (avg_load < min_load) { - min_load = avg_load; + busiest = group; } nextgroup: group = group->next; - *nr_group = *nr_group + 1; } while (group != sd->groups); - if (!max_load || this_load >= max_load) + if (!busiest || this_load >= max_load) goto out_balanced; avg_load = (SCHED_LOAD_SCALE * total_load) / total_pwr; - /* hzheng: debugging: 105 is a magic number - * 100*max_load <= sd->imbalance_pct*this_load) - * should use imbalance_pct instead - */ - if (this_load > avg_load - || 100*max_load < 105*this_load - || 100*min_load < 70*this_load - ) + if (this_load >= avg_load || + 100*max_load <= sd->imbalance_pct*this_load) goto out_balanced; - return avg_load; - out_balanced: - return 0; -} - -/** - * any group that has above average load is considered busy - * find the busiest queue from any of busy group - */ -static runqueue_t * -ckrm_find_busy_queue(struct sched_domain *sd, int this_cpu, - unsigned long avg_load, enum idle_type idle, - int nr_group) -{ - struct sched_group *group; - runqueue_t * busiest=NULL; - unsigned long rand; - - group = sd->groups; - rand = get_ckrm_rand(nr_group); - nr_group = 0; + /* + * We're trying to get all the cpus to the average_load, so we don't + * want to push ourselves above the average load, nor do we wish to + * reduce the max loaded cpu below the average load, as either of these + * actions would just result in more rebalancing later, and ping-pong + * tasks around. Thus we look for the minimum possible imbalance. + * Negative imbalances (*we* are more loaded than anyone else) will + * be counted as no imbalance for these purposes -- we can't fix that + * by pulling tasks to us. Be careful of negative numbers as they'll + * appear as very large values with unsigned longs. 
+ */ + *imbalance = min(max_load - avg_load, avg_load - this_load); - do { - unsigned long load,total_load,max_load; - cpumask_t tmp; - int i; - runqueue_t * grp_busiest; + /* How much load to actually move to equalise the imbalance */ + *imbalance = (*imbalance * min(busiest->cpu_power, this->cpu_power)) + / SCHED_LOAD_SCALE; - cpus_and(tmp, group->cpumask, cpu_online_map); - if (unlikely(cpus_empty(tmp))) - goto find_nextgroup; + if (*imbalance < SCHED_LOAD_SCALE - 1) { + unsigned long pwr_now = 0, pwr_move = 0; + unsigned long tmp; - total_load = 0; - max_load = 0; - grp_busiest = NULL; - for_each_cpu_mask(i, tmp) { - load = pid_get_pressure(rq_ckrm_load(cpu_rq(i)),0); - total_load += load; - if (load > max_load) { - max_load = load; - grp_busiest = cpu_rq(i); - } + if (max_load - this_load >= SCHED_LOAD_SCALE*2) { + *imbalance = 1; + return busiest; } - total_load = (total_load * SCHED_LOAD_SCALE) / group->cpu_power; - if (total_load > avg_load) { - busiest = grp_busiest; - if (nr_group >= rand) - break; - } - find_nextgroup: - group = group->next; - nr_group ++; - } while (group != sd->groups); + /* + * OK, we don't have enough imbalance to justify moving tasks, + * however we may be able to increase total CPU power used by + * moving them. + */ - return busiest; -} + pwr_now += busiest->cpu_power*min(SCHED_LOAD_SCALE, max_load); + pwr_now += this->cpu_power*min(SCHED_LOAD_SCALE, this_load); + pwr_now /= SCHED_LOAD_SCALE; -/** - * load_balance - pressure based load balancing algorithm used by ckrm - */ -static int ckrm_load_balance(int this_cpu, runqueue_t *this_rq, - struct sched_domain *sd, enum idle_type idle) -{ - runqueue_t *busiest; - unsigned long avg_load; - int nr_moved,nr_group; + /* Amount of load we'd subtract */ + tmp = SCHED_LOAD_SCALE*SCHED_LOAD_SCALE/busiest->cpu_power; + if (max_load > tmp) + pwr_move += busiest->cpu_power*min(SCHED_LOAD_SCALE, + max_load - tmp); - avg_load = ckrm_check_balance(sd, this_cpu, idle, &nr_group); - if (! avg_load) - goto out_balanced; + /* Amount of load we'd add */ + tmp = SCHED_LOAD_SCALE*SCHED_LOAD_SCALE/this->cpu_power; + if (max_load < tmp) + tmp = max_load; + pwr_move += this->cpu_power*min(SCHED_LOAD_SCALE, this_load + tmp); + pwr_move /= SCHED_LOAD_SCALE; - busiest = ckrm_find_busy_queue(sd,this_cpu,avg_load,idle,nr_group); - if (! busiest) - goto out_balanced; - /* - * This should be "impossible", but since load - * balancing is inherently racy and statistical, - * it could happen in theory. - */ - if (unlikely(busiest == this_rq)) { - WARN_ON(1); - goto out_balanced; - } + /* Move if we gain another 8th of a CPU worth of throughput */ + if (pwr_move < pwr_now + SCHED_LOAD_SCALE / 8) + goto out_balanced; - nr_moved = 0; - if (busiest->nr_running > 1) { - /* - * Attempt to move tasks. If find_busiest_group has found - * an imbalance but busiest->nr_running <= 1, the group is - * still unbalanced. nr_moved simply stays zero, so it is - * correctly treated as an imbalance. 
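
The comment and arithmetic above describe how find_busiest_group() sizes the amount of load to pull: take the smaller of the two distances to the average, weight it by the weaker group's cpu_power, and only then strip the SCHED_LOAD_SCALE factor. A minimal user-space sketch of that arithmetic follows; the group loads, and SCHED_LOAD_SCALE being 128, are example assumptions for illustration, not values taken from this patch.

/*
 * Minimal sketch (user-space C, not kernel code) of the imbalance
 * calculation in find_busiest_group().  SCHED_LOAD_SCALE is assumed to
 * be 128 here and the group loads are invented example values.
 */
#include <stdio.h>

#define SCHED_LOAD_SCALE 128UL

static unsigned long min_ul(unsigned long a, unsigned long b)
{
        return a < b ? a : b;
}

int main(void)
{
        /* per-group loads already scaled by cpu_power, as in the kernel */
        unsigned long this_load = 2 * SCHED_LOAD_SCALE; /* our group      */
        unsigned long max_load  = 5 * SCHED_LOAD_SCALE; /* busiest group  */
        unsigned long avg_load  = 3 * SCHED_LOAD_SCALE; /* system average */
        unsigned long this_power = 128, busiest_power = 128;
        unsigned long imbalance;

        /*
         * Pull only enough to reach the average from both sides; the
         * caller has already bailed out when this_load >= avg_load,
         * which keeps these unsigned subtractions from wrapping.
         */
        imbalance = min_ul(max_load - avg_load, avg_load - this_load);

        /* weight by the weaker group's cpu_power */
        imbalance = (imbalance * min_ul(busiest_power, this_power))
                        / SCHED_LOAD_SCALE;

        if (imbalance >= SCHED_LOAD_SCALE - 1)
                /* big enough: drop the scale factor, rounding down */
                imbalance = (imbalance + 1) / SCHED_LOAD_SCALE;
        else
                /* small-imbalance path: kernel moves one task or balks */
                imbalance = 1;

        printf("ask move_tasks() for about %lu task(s)\n", imbalance);
        return 0;
}

With these example loads the routine ends up requesting roughly one task's worth of load, matching the rounding-down behaviour noted in the hunk.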
- */ - double_lock_balance(this_rq, busiest); - nr_moved = move_tasks(this_rq, this_cpu, busiest, - 0,sd, idle); - spin_unlock(&busiest->lock); - if (nr_moved) { - adjust_local_weight(); - } + *imbalance = 1; + return busiest; } - if (!nr_moved) - sd->nr_balance_failed ++; - else - sd->nr_balance_failed = 0; - - /* We were unbalanced, so reset the balancing interval */ - sd->balance_interval = sd->min_interval; + /* Get rid of the scaling factor, rounding down as we divide */ + *imbalance = (*imbalance + 1) / SCHED_LOAD_SCALE; - return nr_moved; - -out_balanced: - /* tune up the balancing interval */ - if (sd->balance_interval < sd->max_interval) - sd->balance_interval *= 2; - - return 0; -} - -/* - * this_rq->lock is already held - */ -static inline int load_balance_newidle(int this_cpu, runqueue_t *this_rq, - struct sched_domain *sd) -{ - int ret; - read_lock(&class_list_lock); - ret = ckrm_load_balance(this_cpu,this_rq,sd,NEWLY_IDLE); - read_unlock(&class_list_lock); - return ret; -} - -static inline int load_balance(int this_cpu, runqueue_t *this_rq, - struct sched_domain *sd, enum idle_type idle) -{ - int ret; - - spin_lock(&this_rq->lock); - read_lock(&class_list_lock); - ret= ckrm_load_balance(this_cpu,this_rq,sd,NEWLY_IDLE); - read_unlock(&class_list_lock); - spin_unlock(&this_rq->lock); - return ret; -} -#else /*! CONFIG_CKRM_CPU_SCHEDULE */ -/* - * move_tasks tries to move up to max_nr_move tasks from busiest to this_rq, - * as part of a balancing operation within "domain". Returns the number of - * tasks moved. - * - * Called with both runqueues locked. - */ -static int move_tasks(runqueue_t *this_rq, int this_cpu, runqueue_t *busiest, - unsigned long max_nr_move, struct sched_domain *sd, - enum idle_type idle) -{ - prio_array_t *array, *dst_array; - struct list_head *head, *curr; - int idx, pulled = 0; - task_t *tmp; - - if (max_nr_move <= 0 || busiest->nr_running <= 1) - goto out; - - /* - * We first consider expired tasks. Those will likely not be - * executed in the near future, and they are most likely to - * be cache-cold, thus switching CPUs has the least effect - * on them. - */ - if (busiest->expired->nr_active) { - array = busiest->expired; - dst_array = this_rq->expired; - } else { - array = busiest->active; - dst_array = this_rq->active; - } - -new_array: - /* Start searching at priority 0: */ - idx = 0; -skip_bitmap: - if (!idx) - idx = sched_find_first_bit(array->bitmap); - else - idx = find_next_bit(array->bitmap, MAX_PRIO, idx); - if (idx >= MAX_PRIO) { - if (array == busiest->expired && busiest->active->nr_active) { - array = busiest->active; - dst_array = this_rq->active; - goto new_array; - } - goto out; - } - - head = array->queue + idx; - curr = head->prev; -skip_queue: - tmp = list_entry(curr, task_t, run_list); - - curr = curr->prev; - - if (!can_migrate_task(tmp, busiest, this_cpu, sd, idle)) { - if (curr != head) - goto skip_queue; - idx++; - goto skip_bitmap; - } - pull_task(busiest, array, tmp, this_rq, dst_array, this_cpu); - pulled++; - - /* We only want to steal up to the prescribed number of tasks. */ - if (pulled < max_nr_move) { - if (curr != head) - goto skip_queue; - idx++; - goto skip_bitmap; - } -out: - return pulled; -} - -/* - * find_busiest_group finds and returns the busiest CPU group within the - * domain. It calculates and returns the number of tasks which should be - * moved to restore balance via the imbalance parameter. 
- */ -static struct sched_group * -find_busiest_group(struct sched_domain *sd, int this_cpu, - unsigned long *imbalance, enum idle_type idle) -{ - struct sched_group *busiest = NULL, *this = NULL, *group = sd->groups; - unsigned long max_load, avg_load, total_load, this_load, total_pwr; - - max_load = this_load = total_load = total_pwr = 0; - - do { - cpumask_t tmp; - unsigned long load; - int local_group; - int i, nr_cpus = 0; - - local_group = cpu_isset(this_cpu, group->cpumask); - - /* Tally up the load of all CPUs in the group */ - avg_load = 0; - cpus_and(tmp, group->cpumask, cpu_online_map); - if (unlikely(cpus_empty(tmp))) - goto nextgroup; - - for_each_cpu_mask(i, tmp) { - /* Bias balancing toward cpus of our domain */ - if (local_group) - load = target_load(i); - else - load = source_load(i); - - nr_cpus++; - avg_load += load; - } - - if (!nr_cpus) - goto nextgroup; - - total_load += avg_load; - total_pwr += group->cpu_power; - - /* Adjust by relative CPU power of the group */ - avg_load = (avg_load * SCHED_LOAD_SCALE) / group->cpu_power; - - if (local_group) { - this_load = avg_load; - this = group; - goto nextgroup; - } else if (avg_load > max_load) { - max_load = avg_load; - busiest = group; - } -nextgroup: - group = group->next; - } while (group != sd->groups); - - if (!busiest || this_load >= max_load) - goto out_balanced; - - avg_load = (SCHED_LOAD_SCALE * total_load) / total_pwr; - - if (this_load >= avg_load || - 100*max_load <= sd->imbalance_pct*this_load) - goto out_balanced; - - /* - * We're trying to get all the cpus to the average_load, so we don't - * want to push ourselves above the average load, nor do we wish to - * reduce the max loaded cpu below the average load, as either of these - * actions would just result in more rebalancing later, and ping-pong - * tasks around. Thus we look for the minimum possible imbalance. - * Negative imbalances (*we* are more loaded than anyone else) will - * be counted as no imbalance for these purposes -- we can't fix that - * by pulling tasks to us. Be careful of negative numbers as they'll - * appear as very large values with unsigned longs. - */ - *imbalance = min(max_load - avg_load, avg_load - this_load); - - /* How much load to actually move to equalise the imbalance */ - *imbalance = (*imbalance * min(busiest->cpu_power, this->cpu_power)) - / SCHED_LOAD_SCALE; - - if (*imbalance < SCHED_LOAD_SCALE - 1) { - unsigned long pwr_now = 0, pwr_move = 0; - unsigned long tmp; - - if (max_load - this_load >= SCHED_LOAD_SCALE*2) { - *imbalance = 1; - return busiest; - } - - /* - * OK, we don't have enough imbalance to justify moving tasks, - * however we may be able to increase total CPU power used by - * moving them. 
- */ - - pwr_now += busiest->cpu_power*min(SCHED_LOAD_SCALE, max_load); - pwr_now += this->cpu_power*min(SCHED_LOAD_SCALE, this_load); - pwr_now /= SCHED_LOAD_SCALE; - - /* Amount of load we'd subtract */ - tmp = SCHED_LOAD_SCALE*SCHED_LOAD_SCALE/busiest->cpu_power; - if (max_load > tmp) - pwr_move += busiest->cpu_power*min(SCHED_LOAD_SCALE, - max_load - tmp); - - /* Amount of load we'd add */ - tmp = SCHED_LOAD_SCALE*SCHED_LOAD_SCALE/this->cpu_power; - if (max_load < tmp) - tmp = max_load; - pwr_move += this->cpu_power*min(SCHED_LOAD_SCALE, this_load + tmp); - pwr_move /= SCHED_LOAD_SCALE; - - /* Move if we gain another 8th of a CPU worth of throughput */ - if (pwr_move < pwr_now + SCHED_LOAD_SCALE / 8) - goto out_balanced; - - *imbalance = 1; - return busiest; - } - - /* Get rid of the scaling factor, rounding down as we divide */ - *imbalance = (*imbalance + 1) / SCHED_LOAD_SCALE; - - return busiest; + return busiest; out_balanced: if (busiest && (idle == NEWLY_IDLE || @@ -2249,6 +2025,17 @@ static runqueue_t *find_busiest_queue(struct sched_group *group) * * Called with this_rq unlocked. */ + +static inline int ckrm_load_balance(int this_cpu, runqueue_t *this_rq, + struct sched_domain *sd, + enum idle_type idle) +#ifndef CONFIG_CKRM_CPU_SCHEDULE +{ + return -1; +} +#endif +; + static int load_balance(int this_cpu, runqueue_t *this_rq, struct sched_domain *sd, enum idle_type idle) { @@ -2259,6 +2046,9 @@ static int load_balance(int this_cpu, runqueue_t *this_rq, spin_lock(&this_rq->lock); + if ((nr_moved = ckrm_load_balance(this_cpu,this_rq,sd,idle)) != -1) + goto out_balanced; + group = find_busiest_group(sd, this_cpu, &imbalance, idle); if (!group) goto out_balanced; @@ -2344,8 +2134,12 @@ static int load_balance_newidle(int this_cpu, runqueue_t *this_rq, struct sched_group *group; runqueue_t *busiest = NULL; unsigned long imbalance; - int nr_moved = 0; + int nr_moved; + + if ((nr_moved = ckrm_load_balance(this_cpu,this_rq,sd,NEWLY_IDLE)) != -1) + goto out; + nr_moved = 0; group = find_busiest_group(sd, this_cpu, &imbalance, NEWLY_IDLE); if (!group) goto out; @@ -2365,8 +2159,6 @@ static int load_balance_newidle(int this_cpu, runqueue_t *this_rq, out: return nr_moved; } -#endif /* CONFIG_CKRM_CPU_SCHEDULE*/ - /* * idle_balance is called by schedule() if this_cpu is about to become @@ -2472,6 +2264,8 @@ static void rebalance_tick(int this_cpu, runqueue_t *this_rq, unsigned long j = jiffies + CPU_OFFSET(this_cpu); struct sched_domain *sd; + ckrm_sched_tick(j,this_cpu,(idle != NOT_IDLE),this_rq); + /* Update our load */ old_load = this_rq->cpu_load; this_load = this_rq->nr_running * SCHED_LOAD_SCALE; @@ -2510,7 +2304,9 @@ static void rebalance_tick(int this_cpu, runqueue_t *this_rq, */ static inline void rebalance_tick(int cpu, runqueue_t *rq, enum idle_type idle) { + ckrm_sched_tick(jiffies,cpu,(idle != NOT_IDLE),rq); } + static inline void idle_balance(int cpu, runqueue_t *rq) { } @@ -2547,15 +2343,19 @@ EXPORT_PER_CPU_SYMBOL(kstat); #ifndef CONFIG_CKRM_CPU_SCHEDULE #define EXPIRED_STARVING(rq) \ - ((STARVATION_LIMIT && ((rq)->expired_timestamp && \ + ((STARVATION_LIMIT && ((rq)->expired_timestamp && \ (jiffies - (rq)->expired_timestamp >= \ STARVATION_LIMIT * ((rq)->nr_running) + 1))) || \ ((rq)->curr->static_prio > (rq)->best_expired_prio)) #else +/* we need to scale the starvation based on weight + * classes with small weight have longer expiration starvation + */ #define EXPIRED_STARVING(rq) \ - (STARVATION_LIMIT && ((rq)->expired_timestamp && \ + ((STARVATION_LIMIT && 
((rq)->expired_timestamp && \ (jiffies - (rq)->expired_timestamp >= \ - STARVATION_LIMIT * (lrq_nr_running(rq)) + 1))) + (((STARVATION_LIMIT * (lrq_nr_running(rq)) + 1)*CKRM_MAX_WEIGHT)/rq->local_weight)))) || \ + (this_rq()->curr->static_prio > (rq)->best_expired_prio)) #endif /* @@ -2598,7 +2398,6 @@ void scheduler_tick(int user_ticks, int sys_ticks) cpustat->idle += sys_ticks; if (wake_priority_sleeper(rq)) goto out; - ckrm_sched_tick(jiffies,cpu,rq_ckrm_load(rq)); rebalance_tick(cpu, rq, IDLE); return; } @@ -2639,8 +2438,11 @@ void scheduler_tick(int user_ticks, int sys_ticks) } if (vx_need_resched(p)) { #ifdef CONFIG_CKRM_CPU_SCHEDULE - /* Hubertus ... we can abstract this out */ - ckrm_lrq_t* rq = get_task_lrq(p); + /* we redefine RQ to be a local runqueue */ + ckrm_lrq_t* rq; + runqueue_t *cpu_rq = this_rq(); + rq = ckrm_rq_cpu_enabled(cpu_rq) ? get_task_lrq(p) + : &(cpu_rq->dflt_lrq); #endif dequeue_task(p, rq->active); set_tsk_need_resched(p); @@ -2652,8 +2454,8 @@ void scheduler_tick(int user_ticks, int sys_ticks) rq->expired_timestamp = jiffies; if (!TASK_INTERACTIVE(p) || EXPIRED_STARVING(rq)) { enqueue_task(p, rq->expired); - if (p->static_prio < this_rq()->best_expired_prio) - this_rq()->best_expired_prio = p->static_prio; + if (p->static_prio < rq->best_expired_prio) + rq->best_expired_prio = p->static_prio; } else enqueue_task(p, rq->active); } else { @@ -2687,7 +2489,6 @@ void scheduler_tick(int user_ticks, int sys_ticks) out_unlock: spin_unlock(&rq->lock); out: - ckrm_sched_tick(jiffies,cpu,rq_ckrm_load(rq)); rebalance_tick(cpu, rq, NOT_IDLE); } @@ -2788,21 +2589,17 @@ asmlinkage void __sched schedule(void) unsigned long long now; unsigned long run_time; int cpu; -#ifdef CONFIG_VSERVER_HARDCPU - struct vx_info *vxi; - int maxidle = -HZ; -#endif - /* + + /* * If crash dump is in progress, this other cpu's * need to wait until it completes. * NB: this code is optimized away for kernels without * dumping enabled. */ - if (unlikely(dump_oncpu)) - goto dump_scheduling_disabled; + if (unlikely(dump_oncpu)) + goto dump_scheduling_disabled; - //WARN_ON(system_state == SYSTEM_BOOTING); /* * Test if we are atomic. Since do_exit() needs to call into * schedule() atomically, we ignore that path for now. @@ -2837,19 +2634,8 @@ need_resched: spin_lock_irq(&rq->lock); -#ifdef CONFIG_CKRM_CPU_SCHEDULE - if (prev != rq->idle) { - unsigned long long run = now - prev->timestamp; - ckrm_lrq_t * lrq = get_task_lrq(prev); - - lrq->lrq_load -= task_load(prev); - cpu_demand_event(&prev->demand_stat,CPU_DEMAND_DESCHEDULE,run); - lrq->lrq_load += task_load(prev); + ckrm_account_task(rq,prev,now); - cpu_demand_event(get_task_lrq_stat(prev),CPU_DEMAND_DESCHEDULE,run); - update_local_cvt(prev, run); - } -#endif /* * if entering off of a kernel preemption go straight * to picking the next task. 
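
The reworked EXPIRED_STARVING macro above stretches the starvation window of a local runqueue in inverse proportion to its class weight, so lightly weighted classes are not forced to rotate their expired array as quickly as the stock scheduler would. A small stand-alone sketch of that scaling follows; STARVATION_LIMIT and CKRM_MAX_WEIGHT are given example values here, not the kernel's.

/*
 * Sketch of the weighted EXPIRED_STARVING test: a class with a small
 * local_weight waits proportionally longer before its expired array is
 * declared starving.  The constants below are illustrative only.
 */
#include <stdio.h>

#define STARVATION_LIMIT 100UL   /* example, in jiffies         */
#define CKRM_MAX_WEIGHT  1000UL  /* example full-machine weight */

static unsigned long starvation_threshold(unsigned long nr_running,
                                          unsigned long local_weight)
{
        return ((STARVATION_LIMIT * nr_running + 1) * CKRM_MAX_WEIGHT)
                        / local_weight;
}

int main(void)
{
        unsigned long weights[] = { 1000, 500, 100 };
        int i;

        for (i = 0; i < 3; i++)
                printf("weight %4lu -> expired starving after %lu jiffies"
                       " (4 runnable tasks)\n",
                       weights[i], starvation_threshold(4, weights[i]));
        return 0;
}

With these numbers a class holding one tenth of the maximum weight waits ten times longer before its expired tasks are treated as starving.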
@@ -2865,8 +2651,9 @@ need_resched: } cpu = smp_processor_id(); + #ifdef CONFIG_VSERVER_HARDCPU - if (!list_empty(&rq->hold_queue)) { + if (!list_empty(&rq->hold_queue)) { struct list_head *l, *n; int ret; @@ -2875,7 +2662,7 @@ need_resched: next = list_entry(l, task_t, run_list); if (vxi == next->vx_info) continue; - + vxi = next->vx_info; ret = vx_tokens_recalc(vxi); // tokens = vx_tokens_avail(next); @@ -2885,51 +2672,43 @@ need_resched: next->state &= ~TASK_ONHOLD; recalc_task_prio(next, now); __activate_task(next, rq); - // printk("··· unhold %p\n", next); + // printk("×·· unhold %p\n", next); break; } if ((ret < 0) && (maxidle < ret)) maxidle = ret; - } + } } - rq->idle_tokens = -maxidle; - -pick_next: -#endif - if (unlikely(!rq->nr_running)) { - idle_balance(cpu, rq); - if (!rq->nr_running) { - next = rq->idle; -#ifdef CONFIG_CKRM_CPU_SCHEDULE - rq->expired_timestamp = 0; + rq->idle_tokens = -maxidle; + + pick_next: #endif - wake_sleeping_dependent(cpu, rq); - goto switch_tasks; - } + next = rq_get_next_task(rq,cpu); + if (unlikely(next == NULL)) { + next = rq->idle; + goto switch_tasks; } - next = rq_get_next_task(rq); - if (dependent_sleeper(cpu, rq, next)) { next = rq->idle; goto switch_tasks; } #ifdef CONFIG_VSERVER_HARDCPU - vxi = next->vx_info; - if (vxi && __vx_flags(vxi->vx_flags, - VXF_SCHED_PAUSE|VXF_SCHED_HARD, 0)) { - int ret = vx_tokens_recalc(vxi); - - if (unlikely(ret <= 0)) { - if (ret && (rq->idle_tokens > -ret)) - rq->idle_tokens = -ret; - deactivate_task(next, rq); - list_add_tail(&next->run_list, &rq->hold_queue); - next->state |= TASK_ONHOLD; - goto pick_next; - } - } + vxi = next->vx_info; + if (vxi && __vx_flags(vxi->vx_flags, + VXF_SCHED_PAUSE|VXF_SCHED_HARD, 0)) { + int ret = vx_tokens_recalc(vxi); + + if (unlikely(ret <= 0)) { + if (ret && (rq->idle_tokens > -ret)) + rq->idle_tokens = -ret; + deactivate_task(next, rq); + list_add_tail(&next->run_list, &rq->hold_queue); + next->state |= TASK_ONHOLD; + goto pick_next; + } + } #endif if (!rt_task(next) && next->activated > 0) { @@ -2980,15 +2759,16 @@ switch_tasks: if (test_thread_flag(TIF_NEED_RESCHED)) goto need_resched; - return; - + + return; + dump_scheduling_disabled: - /* allow scheduling only if this is the dumping cpu */ - if (dump_oncpu != smp_processor_id()+1) { - while (dump_oncpu) - cpu_relax(); - } - return; + /* allow scheduling only if this is the dumping cpu */ + if (dump_oncpu != smp_processor_id()+1) { + while (dump_oncpu) + cpu_relax(); + } + return; } EXPORT_SYMBOL(schedule); @@ -3175,11 +2955,11 @@ EXPORT_SYMBOL(wait_for_completion); spin_unlock_irqrestore(&q->lock, flags); #define SLEEP_ON_BKLCHECK \ - if (unlikely(!kernel_locked()) && \ - sleep_on_bkl_warnings < 10) { \ - sleep_on_bkl_warnings++; \ - WARN_ON(1); \ - } + if (unlikely(!kernel_locked()) && \ + sleep_on_bkl_warnings < 10) { \ + sleep_on_bkl_warnings++; \ + WARN_ON(1); \ + } static int sleep_on_bkl_warnings; @@ -3202,7 +2982,7 @@ long fastcall __sched interruptible_sleep_on_timeout(wait_queue_head_t *q, long { SLEEP_ON_VAR - SLEEP_ON_BKLCHECK + SLEEP_ON_BKLCHECK current->state = TASK_INTERRUPTIBLE; @@ -3215,11 +2995,26 @@ long fastcall __sched interruptible_sleep_on_timeout(wait_queue_head_t *q, long EXPORT_SYMBOL(interruptible_sleep_on_timeout); +void fastcall __sched sleep_on(wait_queue_head_t *q) +{ + SLEEP_ON_VAR + + SLEEP_ON_BKLCHECK + + current->state = TASK_UNINTERRUPTIBLE; + + SLEEP_ON_HEAD + schedule(); + SLEEP_ON_TAIL +} + +EXPORT_SYMBOL(sleep_on); + long fastcall __sched sleep_on_timeout(wait_queue_head_t *q, long 
timeout) { SLEEP_ON_VAR - SLEEP_ON_BKLCHECK + SLEEP_ON_BKLCHECK current->state = TASK_UNINTERRUPTIBLE; @@ -3346,7 +3141,6 @@ int task_nice(const task_t *p) { return TASK_NICE(p); } - EXPORT_SYMBOL(task_nice); /** @@ -3969,8 +3763,6 @@ void show_state(void) read_unlock(&tasklist_lock); } -EXPORT_SYMBOL_GPL(show_state); - void __devinit init_idle(task_t *idle, int cpu) { runqueue_t *idle_rq = cpu_rq(cpu), *rq = cpu_rq(task_cpu(idle)); @@ -4657,13 +4449,12 @@ void __init sched_init(void) rq->active = rq->arrays; rq->expired = rq->arrays + 1; + rq->best_expired_prio = MAX_PRIO; #else rq = cpu_rq(i); spin_lock_init(&rq->lock); #endif - rq->best_expired_prio = MAX_PRIO; - #ifdef CONFIG_SMP rq->sd = &sched_domain_init; rq->cpu_load = 0; @@ -4676,7 +4467,7 @@ void __init sched_init(void) INIT_LIST_HEAD(&rq->migration_queue); #endif #ifdef CONFIG_VSERVER_HARDCPU - INIT_LIST_HEAD(&rq->hold_queue); + INIT_LIST_HEAD(&rq->hold_queue); #endif atomic_set(&rq->nr_iowait, 0); } @@ -4712,15 +4503,15 @@ void __might_sleep(char *file, int line, int atomic_depth) #ifndef CONFIG_PREEMPT atomic_depth = 0; #endif - if (((in_atomic() != atomic_depth) || irqs_disabled()) && + if ((in_atomic() || irqs_disabled()) && system_state == SYSTEM_RUNNING) { if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy) return; prev_jiffy = jiffies; printk(KERN_ERR "Debug: sleeping function called from invalid" " context at %s:%d\n", file, line); - printk("in_atomic():%d[expected: %d], irqs_disabled():%d\n", - in_atomic(), atomic_depth, irqs_disabled()); + printk("in_atomic():%d, irqs_disabled():%d\n", + in_atomic(), irqs_disabled()); dump_stack(); } #endif @@ -4783,6 +4574,20 @@ EXPORT_SYMBOL(task_running_sys); #endif #ifdef CONFIG_CKRM_CPU_SCHEDULE + +/******************************************************************** + * + * CKRM Scheduler additions + * + * (a) helper functions + * (b) load balancing code + * + * These are required here to avoid having to externalize many + * of the definitions in sched.c + * + * + ********************************************************************/ + /** * return the classqueue object of a certain processor */ @@ -4811,4 +4616,559 @@ void _ckrm_cpu_change_class(task_t *tsk, struct ckrm_cpu_class *newcls) task_rq_unlock(rq,&flags); } + +/** + * get_min_cvt_locking - get the mininum cvt on a particular cpu under rqlock + */ + +CVT_t get_min_cvt(int cpu); + +CVT_t get_min_cvt_locking(int cpu) +{ + CVT_t cvt; + struct runqueue *rq = cpu_rq(cpu); + spin_lock(&rq->lock); + cvt = get_min_cvt(cpu); + spin_unlock(&rq->lock); + return cvt; +} + +ckrm_lrq_t *rq_get_dflt_lrq(int cpu) +{ + return &(cpu_rq(cpu)->dflt_lrq); +} + +#ifdef CONFIG_SMP + +/************** CKRM Load Balancing code ************************/ + +static inline int ckrm_preferred_task(task_t *tmp,long min, long max, + int phase, enum idle_type idle) +{ + long pressure = task_load(tmp); + + if (pressure > max) + return 0; + + if ((idle == NOT_IDLE) && ! 
phase && (pressure <= min)) + return 0; + return 1; +} + +/* + * move tasks for a specic local class + * return number of tasks pulled + */ +static inline int ckrm_cls_move_tasks(ckrm_lrq_t* src_lrq,ckrm_lrq_t*dst_lrq, + runqueue_t *this_rq, + runqueue_t *busiest, + struct sched_domain *sd, + int this_cpu, + enum idle_type idle, + long* pressure_imbalance) +{ + prio_array_t *array, *dst_array; + struct list_head *head, *curr; + task_t *tmp; + int idx; + int pulled = 0; + int phase = -1; + long pressure_min, pressure_max; + /*hzheng: magic : 90% balance is enough*/ + long balance_min = *pressure_imbalance / 10; +/* + * we don't want to migrate tasks that will reverse the balance + * or the tasks that make too small difference + */ +#define CKRM_BALANCE_MAX_RATIO 100 +#define CKRM_BALANCE_MIN_RATIO 1 + start: + phase ++; + /* + * We first consider expired tasks. Those will likely not be + * executed in the near future, and they are most likely to + * be cache-cold, thus switching CPUs has the least effect + * on them. + */ + if (src_lrq->expired->nr_active) { + array = src_lrq->expired; + dst_array = dst_lrq->expired; + } else { + array = src_lrq->active; + dst_array = dst_lrq->active; + } + + new_array: + /* Start searching at priority 0: */ + idx = 0; + skip_bitmap: + if (!idx) + idx = sched_find_first_bit(array->bitmap); + else + idx = find_next_bit(array->bitmap, MAX_PRIO, idx); + if (idx >= MAX_PRIO) { + if (array == src_lrq->expired && src_lrq->active->nr_active) { + array = src_lrq->active; + dst_array = dst_lrq->active; + goto new_array; + } + if ((! phase) && (! pulled) && (idle != IDLE)) + goto start; //try again + else + goto out; //finished search for this lrq + } + + head = array->queue + idx; + curr = head->prev; + skip_queue: + tmp = list_entry(curr, task_t, run_list); + + curr = curr->prev; + + if (!can_migrate_task(tmp, busiest, this_cpu, sd, idle)) { + if (curr != head) + goto skip_queue; + idx++; + goto skip_bitmap; + } + + pressure_min = *pressure_imbalance * CKRM_BALANCE_MIN_RATIO/100; + pressure_max = *pressure_imbalance * CKRM_BALANCE_MAX_RATIO/100; + /* + * skip the tasks that will reverse the balance too much + */ + if (ckrm_preferred_task(tmp,pressure_min,pressure_max,phase,idle)) { + *pressure_imbalance -= task_load(tmp); + pull_task(busiest, array, tmp, + this_rq, dst_array, this_cpu); + pulled++; + + if (*pressure_imbalance <= balance_min) + goto out; + } + + if (curr != head) + goto skip_queue; + idx++; + goto skip_bitmap; + out: + return pulled; +} + +static inline long ckrm_rq_imbalance(runqueue_t *this_rq,runqueue_t *dst_rq) +{ + long imbalance; + /* + * make sure after balance, imbalance' > - imbalance/2 + * we don't want the imbalance be reversed too much + */ + imbalance = ckrm_get_pressure(rq_ckrm_load(dst_rq),0) + - ckrm_get_pressure(rq_ckrm_load(this_rq),1); + imbalance /= 2; + return imbalance; +} + +/* + * try to balance the two runqueues + * + * Called with both runqueues locked. 
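
ckrm_cls_move_tasks() above pulls only tasks whose load falls inside a window derived from the pressure still to be transferred, and it stops once roughly 90% of the imbalance has been moved (balance_min is one tenth of the original). The following stand-alone sketch mimics that filter with invented task loads; the ratios mirror CKRM_BALANCE_MIN_RATIO/CKRM_BALANCE_MAX_RATIO, everything else is illustrative.

/*
 * Sketch of the ckrm_preferred_task() pressure window.  Loads and the
 * starting imbalance are made-up example values.
 */
#include <stdio.h>

#define CKRM_BALANCE_MAX_RATIO 100
#define CKRM_BALANCE_MIN_RATIO 1

static int preferred_task(long pressure, long min, long max, int phase,
                          int not_idle)
{
        if (pressure > max)
                return 0;       /* would overshoot / reverse the balance */
        if (not_idle && !phase && pressure <= min)
                return 0;       /* too small to matter in the first pass */
        return 1;
}

int main(void)
{
        long imbalance = 1000, balance_min = 1000 / 10; /* 90% is enough */
        long task_loads[] = { 5, 300, 1500, 400, 250 };
        int i, phase = 0;

        for (i = 0; i < 5 && imbalance > balance_min; i++) {
                /* window recomputed from the remaining imbalance */
                long min = imbalance * CKRM_BALANCE_MIN_RATIO / 100;
                long max = imbalance * CKRM_BALANCE_MAX_RATIO / 100;

                if (!preferred_task(task_loads[i], min, max, phase, 1))
                        continue;
                imbalance -= task_loads[i];
                printf("pulled task with load %ld, imbalance now %ld\n",
                       task_loads[i], imbalance);
        }
        return 0;
}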
+ * if move_tasks is called, it will try to move at least one task over + */ +static int ckrm_move_tasks(runqueue_t *this_rq, int this_cpu, + runqueue_t *busiest, + unsigned long max_nr_move, struct sched_domain *sd, + enum idle_type idle) +{ + struct ckrm_cpu_class *clsptr,*vip_cls = NULL; + ckrm_lrq_t* src_lrq,*dst_lrq; + long pressure_imbalance, pressure_imbalance_old; + int src_cpu = task_cpu(busiest->curr); + struct list_head *list; + int pulled = 0; + long imbalance; + + imbalance = ckrm_rq_imbalance(this_rq,busiest); + + if ((idle == NOT_IDLE && imbalance <= 0) || busiest->nr_running <= 1) + goto out; + + //try to find the vip class + list_for_each_entry(clsptr,&active_cpu_classes,links) { + src_lrq = get_ckrm_lrq(clsptr,src_cpu); + + if (! lrq_nr_running(src_lrq)) + continue; + + if (! vip_cls || cpu_class_weight(vip_cls) < cpu_class_weight(clsptr) ) + { + vip_cls = clsptr; + } + } + + /* + * do search from the most significant class + * hopefully, less tasks will be migrated this way + */ + clsptr = vip_cls; + + move_class: + if (! clsptr) + goto out; + + + src_lrq = get_ckrm_lrq(clsptr,src_cpu); + if (! lrq_nr_running(src_lrq)) + goto other_class; + + dst_lrq = get_ckrm_lrq(clsptr,this_cpu); + + //how much pressure for this class should be transferred + pressure_imbalance = (src_lrq->lrq_load * imbalance)/WEIGHT_TO_SHARE(src_lrq->local_weight); + if (pulled && ! pressure_imbalance) + goto other_class; + + pressure_imbalance_old = pressure_imbalance; + + //move tasks + pulled += + ckrm_cls_move_tasks(src_lrq,dst_lrq, + this_rq, + busiest, + sd,this_cpu,idle, + &pressure_imbalance); + + /* + * hzheng: 2 is another magic number + * stop balancing if the imbalance is less than 25% of the orig + */ + if (pressure_imbalance <= (pressure_imbalance_old >> 2)) + goto out; + + //update imbalance + imbalance *= pressure_imbalance / pressure_imbalance_old; + other_class: + //who is next? + list = clsptr->links.next; + if (list == &active_cpu_classes) + list = list->next; + clsptr = list_entry(list, typeof(*clsptr), links); + if (clsptr != vip_cls) + goto move_class; + out: + return pulled; +} + +/** + * ckrm_check_balance - is load balancing necessary? 
+ * return 0 if load balancing is not necessary + * otherwise return the average load of the system + * also, update nr_group + * + * heuristics: + * no load balancing if it's load is over average + * no load balancing if it's load is far more than the min + * task: + * read the status of all the runqueues + */ +static unsigned long ckrm_check_balance(struct sched_domain *sd, int this_cpu, + enum idle_type idle, int* nr_group) +{ + struct sched_group *group = sd->groups; + unsigned long min_load, max_load, avg_load; + unsigned long total_load, this_load, total_pwr; + + max_load = this_load = total_load = total_pwr = 0; + min_load = 0xFFFFFFFF; + *nr_group = 0; + + do { + cpumask_t tmp; + unsigned long load; + int local_group; + int i, nr_cpus = 0; + + /* Tally up the load of all CPUs in the group */ + cpus_and(tmp, group->cpumask, cpu_online_map); + if (unlikely(cpus_empty(tmp))) + goto nextgroup; + + avg_load = 0; + local_group = cpu_isset(this_cpu, group->cpumask); + + for_each_cpu_mask(i, tmp) { + load = ckrm_get_pressure(rq_ckrm_load(cpu_rq(i)),local_group); + nr_cpus++; + avg_load += load; + } + + if (!nr_cpus) + goto nextgroup; + + total_load += avg_load; + total_pwr += group->cpu_power; + + /* Adjust by relative CPU power of the group */ + avg_load = (avg_load * SCHED_LOAD_SCALE) / group->cpu_power; + + if (local_group) { + this_load = avg_load; + goto nextgroup; + } else if (avg_load > max_load) { + max_load = avg_load; + } + if (avg_load < min_load) { + min_load = avg_load; + } +nextgroup: + group = group->next; + *nr_group = *nr_group + 1; + } while (group != sd->groups); + + if (!max_load || this_load >= max_load) + goto out_balanced; + + avg_load = (SCHED_LOAD_SCALE * total_load) / total_pwr; + + /* hzheng: debugging: 105 is a magic number + * 100*max_load <= sd->imbalance_pct*this_load) + * should use imbalance_pct instead + */ + if (this_load > avg_load + || 100*max_load < 105*this_load + || 100*min_load < 70*this_load + ) + goto out_balanced; + + return avg_load; + out_balanced: + return 0; +} + +/** + * any group that has above average load is considered busy + * find the busiest queue from any of busy group + */ +static runqueue_t * +ckrm_find_busy_queue(struct sched_domain *sd, int this_cpu, + unsigned long avg_load, enum idle_type idle, + int nr_group) +{ + struct sched_group *group; + runqueue_t * busiest=NULL; + unsigned long rand; + + group = sd->groups; + rand = get_ckrm_rand(nr_group); + nr_group = 0; + + do { + unsigned long load,total_load,max_load; + cpumask_t tmp; + int i; + runqueue_t * grp_busiest; + + cpus_and(tmp, group->cpumask, cpu_online_map); + if (unlikely(cpus_empty(tmp))) + goto find_nextgroup; + + total_load = 0; + max_load = 0; + grp_busiest = NULL; + for_each_cpu_mask(i, tmp) { + load = ckrm_get_pressure(rq_ckrm_load(cpu_rq(i)),0); + total_load += load; + if (load > max_load) { + max_load = load; + grp_busiest = cpu_rq(i); + } + } + + total_load = (total_load * SCHED_LOAD_SCALE) / group->cpu_power; + if (total_load > avg_load) { + busiest = grp_busiest; + if (nr_group >= rand) + break; + } + find_nextgroup: + group = group->next; + nr_group ++; + } while (group != sd->groups); + + return busiest; +} + +/** + * load_balance - pressure based load balancing algorithm used by ckrm + */ +static int ckrm_load_balance_locked(int this_cpu, runqueue_t *this_rq, + struct sched_domain *sd, + enum idle_type idle) +{ + runqueue_t *busiest; + unsigned long avg_load; + int nr_moved,nr_group; + + avg_load = ckrm_check_balance(sd, this_cpu, idle, &nr_group); + if 
(! avg_load) + goto out_balanced; + + busiest = ckrm_find_busy_queue(sd,this_cpu,avg_load,idle,nr_group); + if (! busiest) + goto out_balanced; + /* + * This should be "impossible", but since load + * balancing is inherently racy and statistical, + * it could happen in theory. + */ + if (unlikely(busiest == this_rq)) { + WARN_ON(1); + goto out_balanced; + } + + nr_moved = 0; + if (busiest->nr_running > 1) { + /* + * Attempt to move tasks. If find_busiest_group has found + * an imbalance but busiest->nr_running <= 1, the group is + * still unbalanced. nr_moved simply stays zero, so it is + * correctly treated as an imbalance. + */ + double_lock_balance(this_rq, busiest); + nr_moved = ckrm_move_tasks(this_rq, this_cpu, busiest, + 0,sd, idle); + spin_unlock(&busiest->lock); + if (nr_moved) { + adjust_local_weight(); + } + } + + if (!nr_moved) + sd->nr_balance_failed ++; + else + sd->nr_balance_failed = 0; + + /* We were unbalanced, so reset the balancing interval */ + sd->balance_interval = sd->min_interval; + + return nr_moved; + +out_balanced: + /* tune up the balancing interval */ + if (sd->balance_interval < sd->max_interval) + sd->balance_interval *= 2; + + return 0; +} + +static inline int ckrm_load_balance(int this_cpu, runqueue_t *this_rq, + struct sched_domain *sd, + enum idle_type idle) +{ + int ret; + + if (ckrm_rq_cpu_disabled(this_rq)) + return -1; + //spin_lock(&this_rq->lock); + read_lock(&class_list_lock); + ret = ckrm_load_balance_locked(this_cpu,this_rq,sd,idle); + // ret = ckrm_load_balance_locked(this_cpu,this_rq,sd,NEWLY_IDLE); + read_unlock(&class_list_lock); + //spin_unlock(&this_rq->lock); + return ret; +} + +#endif // CONFIG_SMP + + +void ckrm_cpu_class_queue_update(int on) +{ + /* This is called when the mode changes from disabled + * to enabled (on=1) or vice versa (on=0). + * we make sure that all classqueues on all cpus + * either have the default class enqueued (on=1) or + * all classes dequeued (on=0). + * if not done a race condition will persist + * when flipping the ckrm_sched_mode. + * Otherwise will lead to more complicated code + * in rq_get_next_task, where we despite knowing of + * runnable tasks can not find an enqueued class. 
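
ckrm_load_balance() above acts as a gate: when the class scheduler is disabled for the runqueue it returns -1, and the callers in load_balance()/load_balance_newidle() fall back to the stock find_busiest_group() path. A compressed sketch of that dispatch pattern follows, using simplified stand-in functions rather than the kernel's types.

/*
 * Sketch of the -1 sentinel dispatch between the CKRM balancer and the
 * default O(1) balancer.  Names and types are simplified stand-ins.
 */
#include <stdio.h>

static int ckrm_scheduler_enabled = 0;  /* e.g. the "ckrmcpu" boot flag */

static int ckrm_load_balance_sketch(void)
{
        if (!ckrm_scheduler_enabled)
                return -1;              /* fall through to the default  */
        /* ... pressure based balancing would run here ... */
        return 0;                       /* tasks moved (possibly none)  */
}

static int default_load_balance_sketch(void)
{
        /* ... find_busiest_group()/move_tasks() equivalent ... */
        return 1;
}

int main(void)
{
        int nr_moved = ckrm_load_balance_sketch();

        if (nr_moved == -1)             /* CKRM declined: use stock path */
                nr_moved = default_load_balance_sketch();
        printf("nr_moved = %d\n", nr_moved);
        return 0;
}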
+ */ + + int i; + runqueue_t *rq; + ckrm_lrq_t *lrq; + struct ckrm_cpu_class *clsptr; + + if (on) { + BUG_ON(ckrm_cpu_enabled()); + for_each_cpu(i) { + rq = cpu_rq(i); + BUG_ON(ckrm_rq_cpu_enabled(rq)); + lrq = &rq->dflt_lrq; + spin_lock(&rq->lock); + + BUG_ON(cls_in_classqueue(&lrq->classqueue_linkobj)); + + classqueue_init(&rq->classqueue,1); + lrq->top_priority = find_first_bit(lrq->active->bitmap, + MAX_PRIO), + classqueue_enqueue(lrq->classqueue, + &lrq->classqueue_linkobj, 0); + spin_unlock(&rq->lock); +#if 0 + printk("UPDATE(%d) run=%lu:%d:%d %d:%d->%d\n", i, + rq->nr_running,lrq->active->nr_active, + lrq->expired->nr_active, + find_first_bit(lrq->active->bitmap,MAX_PRIO), + find_first_bit(lrq->expired->bitmap,MAX_PRIO), + lrq->top_priority); #endif + } + } else { + for_each_cpu(i) { + rq = cpu_rq(i); + spin_lock(&rq->lock); + + /* walk through all classes and make sure they + * are not enqueued + */ + write_lock(&class_list_lock); + list_for_each_entry(clsptr,&active_cpu_classes,links) { + lrq = get_ckrm_lrq(clsptr,i); + BUG_ON((lrq != &rq->dflt_lrq) && lrq_nr_running(lrq)); // must be empty + if (cls_in_classqueue(&lrq->classqueue_linkobj)) + classqueue_dequeue(lrq->classqueue, + &lrq->classqueue_linkobj); + } + rq->classqueue.enabled = 0; + write_unlock(&class_list_lock); + spin_unlock(&rq->lock); + } + } +} + +/* + * callback when a class is getting deleted + * need to remove it from the class runqueue. see (class_queue_update) + */ + +void ckrm_cpu_class_queue_delete_sync(struct ckrm_cpu_class *clsptr) +{ + int i; + + for_each_cpu(i) { + runqueue_t *rq = cpu_rq(i); + ckrm_lrq_t *lrq = get_ckrm_lrq(clsptr,i); + + spin_lock(&rq->lock); + write_lock(&class_list_lock); + BUG_ON(lrq_nr_running(lrq)); // must be empty + if (cls_in_classqueue(&lrq->classqueue_linkobj)) + classqueue_dequeue(lrq->classqueue, + &lrq->classqueue_linkobj); + write_unlock(&class_list_lock); + spin_unlock(&rq->lock); + } +} + +#endif // CONFIG_CKRM_CPU_SCHEDULE diff --git a/kernel/sys.c b/kernel/sys.c index c69f6ed82..6e8b073bc 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -18,6 +18,8 @@ #include #include #include +#include +#include #include #include #include @@ -511,6 +513,25 @@ asmlinkage long sys_reboot(int magic1, int magic2, unsigned int cmd, void __user machine_restart(buffer); break; +#ifdef CONFIG_KEXEC + case LINUX_REBOOT_CMD_KEXEC: + { + struct kimage *image; + image = xchg(&kexec_image, 0); + if (!image) { + unlock_kernel(); + return -EINVAL; + } + notifier_call_chain(&reboot_notifier_list, SYS_RESTART, NULL); + system_state = SYSTEM_RESTART; + device_shutdown(); + system_state = SYSTEM_BOOTING; + printk(KERN_EMERG "Starting new kernel\n"); + machine_shutdown(); + machine_kexec(image); + break; + } +#endif #ifdef CONFIG_SOFTWARE_SUSPEND case LINUX_REBOOT_CMD_SW_SUSPEND: { diff --git a/lib/.cvsignore b/lib/.cvsignore new file mode 100644 index 000000000..30d38180f --- /dev/null +++ b/lib/.cvsignore @@ -0,0 +1,2 @@ +crc32table.h +gen_crc32table diff --git a/net/ipv4/netfilter/Kconfig b/net/ipv4/netfilter/Kconfig index b58141ead..c4bae8c2f 100644 --- a/net/ipv4/netfilter/Kconfig +++ b/net/ipv4/netfilter/Kconfig @@ -628,5 +628,50 @@ config IP_NF_MATCH_REALM If you want to compile it as a module, say M here and read Documentation/modules.txt. If unsure, say `N'. 
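
The LINUX_REBOOT_CMD_KEXEC case added to sys_reboot() further up shuts devices down and jumps into a previously loaded image via machine_kexec(). The fragment below only sketches how user space would trigger that path through reboot(2) once an image has been loaded; it assumes the patched <linux/reboot.h> defines LINUX_REBOOT_CMD_KEXEC and is not the kexec-tools implementation.

/*
 * Sketch: trigger the kexec reboot path from user space.  Assumes the
 * headers from this patch; illustrative only.
 */
#include <stdio.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <linux/reboot.h>

int main(void)
{
#ifdef LINUX_REBOOT_CMD_KEXEC
        /* magic1/magic2 are the usual reboot(2) guard values */
        if (syscall(SYS_reboot, LINUX_REBOOT_MAGIC1, LINUX_REBOOT_MAGIC2,
                    LINUX_REBOOT_CMD_KEXEC, NULL) < 0)
                perror("reboot(LINUX_REBOOT_CMD_KEXEC)");
#else
        fprintf(stderr, "headers lack LINUX_REBOOT_CMD_KEXEC; "
                        "kernel not patched for kexec?\n");
#endif
        return 0;
}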
+config IP_NF_CT_ACCT + bool "Connection tracking flow accounting" + depends on IP_NF_CONNTRACK + +config IP_NF_CT_PROTO_GRE + tristate ' GRE protocol support' + depends on IP_NF_CONNTRACK + help + This module adds generic support for connection tracking and NAT of the + GRE protocol (RFC1701, RFC2784). Please note that this will only work + with GRE connections using the key field of the GRE header. + + You will need GRE support to enable PPTP support. + + If you want to compile it as a module, say `M' here and read + Documentation/modules.txt. If unsire, say `N'. + +config IP_NF_PPTP + tristate 'PPTP protocol support' + depends on IP_NF_CT_PROTO_GRE + help + This module adds support for PPTP (Point to Point Tunnelling Protocol, + RFC2637) conncection tracking and NAT. + + If you are running PPTP sessions over a stateful firewall or NAT box, + you may want to enable this feature. + + Please note that not all PPTP modes of operation are supported yet. + For more info, read top of the file net/ipv4/netfilter/ip_conntrack_pptp.c + + If you want to compile it as a module, say M here and read + Documentation/modules.txt. If unsure, say `N'. + +config IP_NF_NAT_PPTP + tristate + depends on IP_NF_NAT!=n && IP_NF_PPTP!=n + default IP_NF_NAT if IP_NF_PPTP=y + default m if IP_NF_PPTP=m + +config IP_NF_NAT_PROTO_GRE + tristate + depends on IP_NF_NAT!=n && IP_NF_CT_PROTO_GRE!=n + default IP_NF_NAT if IP_NF_CT_PROTO_GRE=y + default m if IP_NF_CT_PROTO_GRE=m + endmenu diff --git a/net/ipv4/netfilter/Makefile b/net/ipv4/netfilter/Makefile index bdb23fde1..f54887b48 100644 --- a/net/ipv4/netfilter/Makefile +++ b/net/ipv4/netfilter/Makefile @@ -19,17 +19,25 @@ ipchains-objs := $(ip_nf_compat-objs) ipchains_core.o # connection tracking obj-$(CONFIG_IP_NF_CONNTRACK) += ip_conntrack.o +# connection tracking protocol helpers +obj-$(CONFIG_IP_NF_CT_PROTO_GRE) += ip_conntrack_proto_gre.o + +# NAT protocol helpers +obj-$(CONFIG_IP_NF_NAT_PROTO_GRE) += ip_nat_proto_gre.o + # connection tracking helpers obj-$(CONFIG_IP_NF_AMANDA) += ip_conntrack_amanda.o obj-$(CONFIG_IP_NF_TFTP) += ip_conntrack_tftp.o obj-$(CONFIG_IP_NF_FTP) += ip_conntrack_ftp.o obj-$(CONFIG_IP_NF_IRC) += ip_conntrack_irc.o +obj-$(CONFIG_IP_NF_PPTP) += ip_conntrack_pptp.o # NAT helpers obj-$(CONFIG_IP_NF_NAT_AMANDA) += ip_nat_amanda.o obj-$(CONFIG_IP_NF_NAT_TFTP) += ip_nat_tftp.o obj-$(CONFIG_IP_NF_NAT_FTP) += ip_nat_ftp.o obj-$(CONFIG_IP_NF_NAT_IRC) += ip_nat_irc.o +obj-$(CONFIG_IP_NF_NAT_PPTP) += ip_nat_pptp.o # generic IP tables obj-$(CONFIG_IP_NF_IPTABLES) += ip_tables.o diff --git a/net/ipv4/netfilter/ip_conntrack_amanda.c b/net/ipv4/netfilter/ip_conntrack_amanda.c index 4e8f4d83b..40ed4474d 100644 --- a/net/ipv4/netfilter/ip_conntrack_amanda.c +++ b/net/ipv4/netfilter/ip_conntrack_amanda.c @@ -58,7 +58,7 @@ static int help(struct sk_buff *skb, /* increase the UDP timeout of the master connection as replies from * Amanda clients to the server can be quite delayed */ - ip_ct_refresh(ct, master_timeout * HZ); + ip_ct_refresh_acct(ct, ctinfo, NULL, master_timeout * HZ); /* No data? 
*/ dataoff = skb->nh.iph->ihl*4 + sizeof(struct udphdr); diff --git a/net/ipv4/netfilter/ip_conntrack_core.c b/net/ipv4/netfilter/ip_conntrack_core.c index 05fbb43cc..757af6893 100644 --- a/net/ipv4/netfilter/ip_conntrack_core.c +++ b/net/ipv4/netfilter/ip_conntrack_core.c @@ -143,6 +143,7 @@ get_tuple(const struct iphdr *iph, tuple->src.ip = iph->saddr; tuple->dst.ip = iph->daddr; tuple->dst.protonum = iph->protocol; + tuple->src.u.all = tuple->dst.u.all = 0; return protocol->pkt_to_tuple(skb, dataoff, tuple); } @@ -156,6 +157,8 @@ invert_tuple(struct ip_conntrack_tuple *inverse, inverse->dst.ip = orig->src.ip; inverse->dst.protonum = orig->dst.protonum; + inverse->src.u.all = inverse->dst.u.all = 0; + return protocol->invert_tuple(inverse, orig); } @@ -976,8 +979,8 @@ int ip_conntrack_expect_related(struct ip_conntrack_expect *expect, * so there is no need to use the tuple lock too */ DEBUGP("ip_conntrack_expect_related %p\n", related_to); - DEBUGP("tuple: "); DUMP_TUPLE(&expect->tuple); - DEBUGP("mask: "); DUMP_TUPLE(&expect->mask); + DEBUGP("tuple: "); DUMP_TUPLE_RAW(&expect->tuple); + DEBUGP("mask: "); DUMP_TUPLE_RAW(&expect->mask); old = LIST_FIND(&ip_conntrack_expect_list, resent_expect, struct ip_conntrack_expect *, &expect->tuple, @@ -1070,15 +1073,14 @@ int ip_conntrack_change_expect(struct ip_conntrack_expect *expect, MUST_BE_READ_LOCKED(&ip_conntrack_lock); WRITE_LOCK(&ip_conntrack_expect_tuple_lock); - DEBUGP("change_expect:\n"); - DEBUGP("exp tuple: "); DUMP_TUPLE(&expect->tuple); - DEBUGP("exp mask: "); DUMP_TUPLE(&expect->mask); - DEBUGP("newtuple: "); DUMP_TUPLE(newtuple); + DEBUGP("exp tuple: "); DUMP_TUPLE_RAW(&expect->tuple); + DEBUGP("exp mask: "); DUMP_TUPLE_RAW(&expect->mask); + DEBUGP("newtuple: "); DUMP_TUPLE_RAW(newtuple); if (expect->ct_tuple.dst.protonum == 0) { /* Never seen before */ DEBUGP("change expect: never seen before\n"); - if (!ip_ct_tuple_equal(&expect->tuple, newtuple) + if (!ip_ct_tuple_mask_cmp(&expect->tuple, newtuple, &expect->mask) && LIST_FIND(&ip_conntrack_expect_list, expect_clash, struct ip_conntrack_expect *, newtuple, &expect->mask)) { /* Force NAT to find an unused tuple */ @@ -1166,21 +1168,39 @@ void ip_conntrack_helper_unregister(struct ip_conntrack_helper *me) synchronize_net(); } -/* Refresh conntrack for this many jiffies. */ -void ip_ct_refresh(struct ip_conntrack *ct, unsigned long extra_jiffies) +static inline void ct_add_counters(struct ip_conntrack *ct, + enum ip_conntrack_info ctinfo, + const struct sk_buff *skb) +{ +#ifdef CONFIG_IP_NF_CT_ACCT + if (skb) { + ct->counters[CTINFO2DIR(ctinfo)].packets++; + ct->counters[CTINFO2DIR(ctinfo)].bytes += + ntohs(skb->nh.iph->tot_len); + } +#endif +} + +/* Refresh conntrack for this many jiffies and do accounting (if skb != NULL) */ +void ip_ct_refresh_acct(struct ip_conntrack *ct, + enum ip_conntrack_info ctinfo, + const struct sk_buff *skb, + unsigned long extra_jiffies) { IP_NF_ASSERT(ct->timeout.data == (unsigned long)ct); /* If not in hash table, timer will not be active yet */ - if (!is_confirmed(ct)) + if (!is_confirmed(ct)) { ct->timeout.expires = extra_jiffies; - else { + ct_add_counters(ct, ctinfo, skb); + } else { WRITE_LOCK(&ip_conntrack_lock); /* Need del_timer for race avoidance (may already be dying). 
*/ if (del_timer(&ct->timeout)) { ct->timeout.expires = jiffies + extra_jiffies; add_timer(&ct->timeout); } + ct_add_counters(ct, ctinfo, skb); WRITE_UNLOCK(&ip_conntrack_lock); } } diff --git a/net/ipv4/netfilter/ip_conntrack_proto_generic.c b/net/ipv4/netfilter/ip_conntrack_proto_generic.c index 0df558a58..6a7db7754 100644 --- a/net/ipv4/netfilter/ip_conntrack_proto_generic.c +++ b/net/ipv4/netfilter/ip_conntrack_proto_generic.c @@ -50,9 +50,9 @@ static unsigned int generic_print_conntrack(char *buffer, /* Returns verdict for packet, or -1 for invalid. */ static int packet(struct ip_conntrack *conntrack, const struct sk_buff *skb, - enum ip_conntrack_info conntrackinfo) + enum ip_conntrack_info ctinfo) { - ip_ct_refresh(conntrack, ip_ct_generic_timeout); + ip_ct_refresh_acct(conntrack, ctinfo, skb, ip_ct_generic_timeout); return NF_ACCEPT; } diff --git a/net/ipv4/netfilter/ip_conntrack_proto_gre.c b/net/ipv4/netfilter/ip_conntrack_proto_gre.c index 013f759cc..edccfe843 100644 --- a/net/ipv4/netfilter/ip_conntrack_proto_gre.c +++ b/net/ipv4/netfilter/ip_conntrack_proto_gre.c @@ -130,13 +130,6 @@ int ip_ct_gre_keymap_add(struct ip_conntrack_expect *exp, void ip_ct_gre_keymap_change(struct ip_ct_gre_keymap *km, struct ip_conntrack_tuple *t) { - if (!km) - { - printk(KERN_WARNING - "NULL GRE conntrack keymap change requested\n"); - return; - } - DEBUGP("changing entry %p to: ", km); DUMP_TUPLE_GRE(t); @@ -188,8 +181,7 @@ static int gre_pkt_to_tuple(const struct sk_buff *skb, u_int32_t srckey; grehdr = skb_header_pointer(skb, dataoff, sizeof(_grehdr), &_grehdr); - /* PPTP header is variable length, only need up to the call_id field */ - pgrehdr = skb_header_pointer(skb, dataoff, 8, &_pgrehdr); + pgrehdr = skb_header_pointer(skb, dataoff, sizeof(_pgrehdr), &_pgrehdr); if (!grehdr || !pgrehdr) return 0; @@ -219,11 +211,11 @@ static int gre_pkt_to_tuple(const struct sk_buff *skb, srckey = gre_keymap_lookup(tuple); - tuple->src.u.gre.key = srckey; #if 0 DEBUGP("found src key %x for tuple ", ntohl(srckey)); DUMP_TUPLE_GRE(tuple); #endif + tuple->src.u.gre.key = srckey; return 1; } diff --git a/net/ipv4/netfilter/ip_conntrack_proto_icmp.c b/net/ipv4/netfilter/ip_conntrack_proto_icmp.c index 47114840f..e854193eb 100644 --- a/net/ipv4/netfilter/ip_conntrack_proto_icmp.c +++ b/net/ipv4/netfilter/ip_conntrack_proto_icmp.c @@ -94,7 +94,7 @@ static int icmp_packet(struct ip_conntrack *ct, ct->timeout.function((unsigned long)ct); } else { atomic_inc(&ct->proto.icmp.count); - ip_ct_refresh(ct, ip_ct_icmp_timeout); + ip_ct_refresh_acct(ct, ctinfo, skb, ip_ct_icmp_timeout); } return NF_ACCEPT; diff --git a/net/ipv4/netfilter/ip_conntrack_proto_tcp.c b/net/ipv4/netfilter/ip_conntrack_proto_tcp.c index 463cafa66..73fe0401d 100644 --- a/net/ipv4/netfilter/ip_conntrack_proto_tcp.c +++ b/net/ipv4/netfilter/ip_conntrack_proto_tcp.c @@ -225,7 +225,7 @@ static int tcp_packet(struct ip_conntrack *conntrack, set_bit(IPS_ASSURED_BIT, &conntrack->status); out: WRITE_UNLOCK(&tcp_lock); - ip_ct_refresh(conntrack, *tcp_timeouts[newconntrack]); + ip_ct_refresh_acct(conntrack, ctinfo, skb, *tcp_timeouts[newconntrack]); return NF_ACCEPT; } diff --git a/net/ipv4/netfilter/ip_conntrack_proto_udp.c b/net/ipv4/netfilter/ip_conntrack_proto_udp.c index a63c32d18..a69e14b5c 100644 --- a/net/ipv4/netfilter/ip_conntrack_proto_udp.c +++ b/net/ipv4/netfilter/ip_conntrack_proto_udp.c @@ -60,16 +60,17 @@ static unsigned int udp_print_conntrack(char *buffer, /* Returns verdict for packet, and may modify conntracktype */ static int 
udp_packet(struct ip_conntrack *conntrack, const struct sk_buff *skb, - enum ip_conntrack_info conntrackinfo) + enum ip_conntrack_info ctinfo) { /* If we've seen traffic both ways, this is some kind of UDP stream. Extend timeout. */ if (test_bit(IPS_SEEN_REPLY_BIT, &conntrack->status)) { - ip_ct_refresh(conntrack, ip_ct_udp_timeout_stream); + ip_ct_refresh_acct(conntrack, ctinfo, skb, + ip_ct_udp_timeout_stream); /* Also, more likely to be important, and not a probe */ set_bit(IPS_ASSURED_BIT, &conntrack->status); } else - ip_ct_refresh(conntrack, ip_ct_udp_timeout); + ip_ct_refresh_acct(conntrack, ctinfo, skb, ip_ct_udp_timeout); return NF_ACCEPT; } diff --git a/net/ipv4/netfilter/ip_conntrack_standalone.c b/net/ipv4/netfilter/ip_conntrack_standalone.c index fd688f4fe..76c827dcb 100644 --- a/net/ipv4/netfilter/ip_conntrack_standalone.c +++ b/net/ipv4/netfilter/ip_conntrack_standalone.c @@ -83,6 +83,17 @@ print_expect(char *buffer, const struct ip_conntrack_expect *expect) return len; } +#ifdef CONFIG_IP_NF_CT_ACCT +static unsigned int +print_counters(char *buffer, struct ip_conntrack_counter *counter) +{ + return sprintf(buffer, "packets=%llu bytes=%llu ", + counter->packets, counter->bytes); +} +#else +#define print_counters(x, y) 0 +#endif + static unsigned int print_conntrack(char *buffer, struct ip_conntrack *conntrack) { @@ -103,12 +114,16 @@ print_conntrack(char *buffer, struct ip_conntrack *conntrack) &conntrack->tuplehash[IP_CT_DIR_ORIGINAL].tuple, proto); len += sprintf(buffer + len, "xid=%d ", conntrack->xid[IP_CT_DIR_ORIGINAL]); + len += print_counters(buffer + len, + &conntrack->counters[IP_CT_DIR_ORIGINAL]); if (!(test_bit(IPS_SEEN_REPLY_BIT, &conntrack->status))) len += sprintf(buffer + len, "[UNREPLIED] "); len += print_tuple(buffer + len, &conntrack->tuplehash[IP_CT_DIR_REPLY].tuple, proto); len += sprintf(buffer + len, "xid=%d ", conntrack->xid[IP_CT_DIR_REPLY]); + len += print_counters(buffer + len, + &conntrack->counters[IP_CT_DIR_REPLY]); if (test_bit(IPS_ASSURED_BIT, &conntrack->status)) len += sprintf(buffer + len, "[ASSURED] "); len += sprintf(buffer + len, "use=%u ", @@ -640,7 +655,7 @@ EXPORT_SYMBOL(need_ip_conntrack); EXPORT_SYMBOL(ip_conntrack_helper_register); EXPORT_SYMBOL(ip_conntrack_helper_unregister); EXPORT_SYMBOL(ip_ct_selective_cleanup); -EXPORT_SYMBOL(ip_ct_refresh); +EXPORT_SYMBOL(ip_ct_refresh_acct); EXPORT_SYMBOL(ip_ct_find_proto); EXPORT_SYMBOL(__ip_ct_find_proto); EXPORT_SYMBOL(ip_ct_find_helper); diff --git a/net/ipv4/netfilter/ip_nat_core.c b/net/ipv4/netfilter/ip_nat_core.c index 1c6b78106..130b01c18 100644 --- a/net/ipv4/netfilter/ip_nat_core.c +++ b/net/ipv4/netfilter/ip_nat_core.c @@ -438,7 +438,7 @@ get_unique_tuple(struct ip_conntrack_tuple *tuple, *tuple = *orig_tuple; while ((rptr = find_best_ips_proto_fast(tuple, mr, conntrack, hooknum)) != NULL) { - DEBUGP("Found best for "); DUMP_TUPLE(tuple); + DEBUGP("Found best for "); DUMP_TUPLE_RAW(tuple); /* 3) The per-protocol part of the manip is made to map into the range to make a unique tuple. */ @@ -580,9 +580,9 @@ ip_nat_setup_info(struct ip_conntrack *conntrack, HOOK2MANIP(hooknum)==IP_NAT_MANIP_SRC ? 
"SRC" : "DST", conntrack); DEBUGP("Original: "); - DUMP_TUPLE(&orig_tp); + DUMP_TUPLE_RAW(&orig_tp); DEBUGP("New: "); - DUMP_TUPLE(&new_tuple); + DUMP_TUPLE_RAW(&new_tuple); #endif /* We now have two tuples (SRCIP/SRCPT/DSTIP/DSTPT): diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 23f8f511d..ad097f510 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -1107,6 +1107,75 @@ static int udp_checksum_init(struct sk_buff *skb, struct udphdr *uh, return 0; } +/* XXX (mef) need to generalize the IPOD stuff. Right now I am borrowing + from the ICMP infrastructure. */ +#ifdef CONFIG_ICMP_IPOD +#include + +extern int sysctl_icmp_ipod_version; +extern int sysctl_icmp_ipod_enabled; +extern u32 sysctl_icmp_ipod_host; +extern u32 sysctl_icmp_ipod_mask; +extern char sysctl_icmp_ipod_key[32+1]; +#define IPOD_CHECK_KEY \ + (sysctl_icmp_ipod_key[0] != 0) +#define IPOD_VALID_KEY(d) \ + (strncmp(sysctl_icmp_ipod_key, (char *)(d), strlen(sysctl_icmp_ipod_key)) == 0) + +static void udp_ping_of_death(struct sk_buff *skb, struct udphdr *uh, u32 saddr) +{ + int doit = 0; + + /* + * If IPOD not enabled or wrong UDP IPOD port, ignore. + */ + if (!sysctl_icmp_ipod_enabled || (ntohs(uh->dest) != 664)) + return; + +#if 0 + printk(KERN_INFO "IPOD: got udp pod request, host=%u.%u.%u.%u\n", NIPQUAD(saddr)); +#endif + + + /* + * First check the source address info. + * If host not set, ignore. + */ + if (sysctl_icmp_ipod_host != 0xffffffff && + (ntohl(saddr) & sysctl_icmp_ipod_mask) == sysctl_icmp_ipod_host) { + /* + * Now check the key if enabled. + * If packet doesn't contain enough data or key + * is otherwise invalid, ignore. + */ + if (IPOD_CHECK_KEY) { + if (pskb_may_pull(skb, sizeof(sysctl_icmp_ipod_key)+sizeof(struct udphdr)-1)){ +#if 0 + int i; + for (i=0;i<32+1;i++){ + printk("%c",((char*)skb->data)[i+sizeof(struct udphdr)]); + } + printk("\n"); +#endif + if (IPOD_VALID_KEY(skb->data+sizeof(struct udphdr))) + doit = 1; + } + } else { + doit = 1; + } + } + if (doit) { + sysctl_icmp_ipod_enabled = 0; + printk(KERN_CRIT "IPOD: reboot forced by %u.%u.%u.%u...\n", + NIPQUAD(saddr)); + machine_restart(NULL); + } else { + printk(KERN_WARNING "IPOD: from %u.%u.%u.%u rejected\n", + NIPQUAD(saddr)); + } +} +#endif + /* * All we need to do is get the socket, and then do a checksum. 
*/ @@ -1143,6 +1212,10 @@ int udp_rcv(struct sk_buff *skb) if(rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST)) return udp_v4_mcast_deliver(skb, uh, saddr, daddr); +#ifdef CONFIG_ICMP_IPOD + udp_ping_of_death(skb, uh, saddr); +#endif + sk = udp_v4_lookup(saddr, uh->source, daddr, uh->dest, skb->dev->ifindex); if (sk != NULL) { diff --git a/scripts/.cvsignore b/scripts/.cvsignore new file mode 100644 index 000000000..d95bc0ab8 --- /dev/null +++ b/scripts/.cvsignore @@ -0,0 +1,4 @@ +bin2c +conmakehash +kallsyms +pnmtologo diff --git a/scripts/basic/.cvsignore b/scripts/basic/.cvsignore new file mode 100644 index 000000000..fa6c88800 --- /dev/null +++ b/scripts/basic/.cvsignore @@ -0,0 +1,3 @@ +docproc +fixdep +split-include diff --git a/scripts/kconfig/.cvsignore b/scripts/kconfig/.cvsignore new file mode 100644 index 000000000..37981a9ca --- /dev/null +++ b/scripts/kconfig/.cvsignore @@ -0,0 +1,5 @@ +conf +lex.zconf.c +mconf +zconf.tab.c +zconf.tab.h diff --git a/scripts/kernel-2.6-planetlab.spec b/scripts/kernel-2.6-planetlab.spec index 4e2be569b..84f9f996d 100644 --- a/scripts/kernel-2.6-planetlab.spec +++ b/scripts/kernel-2.6-planetlab.spec @@ -22,7 +22,7 @@ Summary: The Linux kernel (the core of the Linux operating system) %define kversion 2.6.%{sublevel} %define rpmversion 2.6.%{sublevel} %define rhbsys %([ -r /etc/beehive-root ] && echo || echo .`whoami`) -%define release 1.521.2.6.planetlab%{?date:.%{date}} +%define release 1.521.3.planetlab%{?date:.%{date}} %define signmodules 0 %define KVERREL %{PACKAGE_VERSION}-%{PACKAGE_RELEASE} diff --git a/scripts/lxdialog/.cvsignore b/scripts/lxdialog/.cvsignore new file mode 100644 index 000000000..bebf29560 --- /dev/null +++ b/scripts/lxdialog/.cvsignore @@ -0,0 +1 @@ +lxdialog diff --git a/scripts/mod/.cvsignore b/scripts/mod/.cvsignore new file mode 100644 index 000000000..a6dd5e27e --- /dev/null +++ b/scripts/mod/.cvsignore @@ -0,0 +1,3 @@ +elfconfig.h +mk_elfconfig +modpost diff --git a/usr/.cvsignore b/usr/.cvsignore new file mode 100644 index 000000000..d06dfff84 --- /dev/null +++ b/usr/.cvsignore @@ -0,0 +1,3 @@ +gen_init_cpio +initramfs_data.cpio +initramfs_data.cpio.gz
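
The udp_rcv() hook above hands incoming datagrams to udp_ping_of_death(), which reboots the node only for destination port 664, a source address matching the configured host/mask, and, when a key is set, a payload that begins with that key. A user-space sketch of just that decision logic follows; the address, mask and key are made-up example values.

/*
 * Sketch of the IPOD acceptance checks.  Sysctl values are replaced by
 * example globals; no packet parsing or reboot is performed.
 */
#include <stdio.h>
#include <string.h>
#include <arpa/inet.h>

static unsigned int ipod_host = 0;          /* set in main(); 0xffffffff = unset */
static unsigned int ipod_mask = 0xffffff00;
static const char ipod_key[] = "example-key";

static int ipod_would_fire(unsigned int saddr, unsigned short dport,
                           const char *payload, unsigned int len)
{
        if (dport != 664)
                return 0;
        if (ipod_host == 0xffffffff || (saddr & ipod_mask) != ipod_host)
                return 0;
        if (ipod_key[0] != '\0' &&
            (len < strlen(ipod_key) ||
             strncmp(payload, ipod_key, strlen(ipod_key)) != 0))
                return 0;
        return 1;
}

int main(void)
{
        unsigned int src = ntohl(inet_addr("192.0.2.17"));

        ipod_host = src & ipod_mask;    /* pretend the sysctl was configured */
        printf("good key : %d\n",
               ipod_would_fire(src, 664, "example-key-and-more", 20));
        printf("bad key  : %d\n",
               ipod_would_fire(src, 664, "wrong", 5));
        printf("bad port : %d\n",
               ipod_would_fire(src, 53, "example-key", 11));
        return 0;
}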