From 9cf19aed75f49779d5df60e77c5f85de66a87ce5 Mon Sep 17 00:00:00 2001 From: Planet-Lab Support <support@planet-lab.org> Date: Mon, 8 Aug 2005 21:12:11 +0000 Subject: [PATCH] This commit was manufactured by cvs2svn to create tag 'after-ckrm_E17-mem'. --- Documentation/devices.txt | 1 - Documentation/kdump.txt | 105 --- MAINTAINERS | 11 - arch/h8300/kernel/ints.c | 2 +- arch/h8300/platform/h8s/ints.c | 2 +- arch/i386/Kconfig | 47 - arch/i386/boot/compressed/head.S | 6 +- arch/i386/boot/compressed/misc.c | 7 +- arch/i386/kernel/Makefile | 2 - arch/i386/kernel/apic.c | 30 - arch/i386/kernel/crash_dump.c | 105 --- arch/i386/kernel/entry.S | 2 +- arch/i386/kernel/i386_ksyms.c | 6 +- arch/i386/kernel/i8259.c | 12 - arch/i386/kernel/machine_kexec.c | 233 ----- arch/i386/kernel/reboot.c | 82 +- arch/i386/kernel/relocate_kernel.S | 118 --- arch/i386/kernel/setup.c | 13 - arch/i386/kernel/smp.c | 14 +- arch/i386/kernel/vmlinux.lds.S | 59 +- arch/i386/mm/discontig.c | 4 - arch/i386/mm/highmem.c | 19 +- arch/ppc/Kconfig | 20 - arch/ppc/kernel/Makefile | 1 - arch/ppc/kernel/machine_kexec.c | 114 --- arch/ppc/kernel/relocate_kernel.S | 135 --- arch/x86_64/Kconfig | 17 - arch/x86_64/kernel/Makefile | 1 - arch/x86_64/kernel/apic.c | 30 - arch/x86_64/kernel/e820.c | 2 + arch/x86_64/kernel/i8259.c | 14 - arch/x86_64/kernel/io_apic.c | 34 +- arch/x86_64/kernel/machine_kexec.c | 246 ----- arch/x86_64/kernel/reboot.c | 65 +- arch/x86_64/kernel/relocate_kernel.S | 141 --- configs/kernel-2.6.10-i686-planetlab.config | 267 +++++- drivers/char/mem.c | 74 -- drivers/dump/dump_fmt.c | 4 +- drivers/dump/dump_i386.c | 4 +- drivers/dump/dump_memdev.c | 18 +- drivers/dump/dump_overlay.c | 6 +- drivers/dump/dump_setup.c | 9 +- fs/aio.c | 2 +- fs/exec.c | 2 +- fs/ioctl.c | 13 - fs/posix_acl.c | 4 - fs/proc/Makefile | 1 - fs/proc/kcore.c | 10 +- fs/proc/proc_misc.c | 3 - fs/proc/vmcore.c | 239 ----- include/asm-generic/vmlinux.lds.h | 2 +- include/asm-i386/apicdef.h | 1 - 
include/asm-i386/crash_dump.h | 82 -- include/asm-i386/highmem.h | 1 - include/asm-i386/kexec.h | 25 - include/asm-i386/mach-default/irq_vectors.h | 2 +- include/asm-i386/smp.h | 1 - include/asm-ppc/kexec.h | 36 - include/asm-ppc/machdep.h | 25 - include/asm-x86_64/kexec.h | 25 - include/asm-x86_64/unistd.h | 2 +- include/linux/bootmem.h | 1 - include/linux/ckrm_mem.h | 98 +- include/linux/ckrm_mem_inline.h | 347 ++++--- include/linux/ckrm_tsk.h | 18 +- include/linux/crash_dump.h | 34 - include/linux/dump.h | 2 +- include/linux/ext2_fs.h | 4 +- include/linux/ext3_fs.h | 4 +- include/linux/highmem.h | 1 - include/linux/kexec.h | 57 -- include/linux/mm.h | 2 +- include/linux/page-flags.h | 11 +- include/linux/reboot.h | 2 - include/linux/sched.h | 107 ++- include/linux/vserver/inode.h | 7 - init/Kconfig | 13 +- kernel/Makefile | 2 - kernel/ckrm/Makefile | 2 +- kernel/ckrm/ckrm_cpu_class.c | 11 - kernel/ckrm/ckrm_cpu_monitor.c | 13 +- kernel/ckrm/ckrm_mem.c | 981 -------------------- kernel/ckrm/ckrm_numtasks.c | 404 +++++--- kernel/ckrm/ckrm_numtasks_stub.c | 10 +- kernel/crash.c | 117 --- kernel/exit.c | 2 +- kernel/fork.c | 6 +- kernel/kexec.c | 637 ------------- kernel/panic.c | 8 +- kernel/sys.c | 2 - kernel/sys_ni.c | 1 - kernel/vserver/inode.c | 32 - mm/bootmem.c | 5 - mm/page_alloc.c | 20 +- mm/swap.c | 6 +- mm/vmscan.c | 318 ++----- scripts/kernel-2.6-planetlab.spec | 2 +- 97 files changed, 1177 insertions(+), 4668 deletions(-) delete mode 100644 Documentation/kdump.txt delete mode 100644 arch/i386/kernel/crash_dump.c delete mode 100644 arch/i386/kernel/machine_kexec.c delete mode 100644 arch/i386/kernel/relocate_kernel.S delete mode 100644 arch/ppc/kernel/machine_kexec.c delete mode 100644 arch/ppc/kernel/relocate_kernel.S delete mode 100644 arch/x86_64/kernel/machine_kexec.c delete mode 100644 arch/x86_64/kernel/relocate_kernel.S delete mode 100644 fs/proc/vmcore.c delete mode 100644 include/asm-i386/crash_dump.h delete mode 100644 include/asm-i386/kexec.h 
delete mode 100644 include/asm-ppc/kexec.h delete mode 100644 include/asm-x86_64/kexec.h delete mode 100644 include/linux/crash_dump.h delete mode 100644 include/linux/kexec.h delete mode 100644 kernel/ckrm/ckrm_mem.c delete mode 100644 kernel/crash.c delete mode 100644 kernel/kexec.c diff --git a/Documentation/devices.txt b/Documentation/devices.txt index 60ce4ae9d..f115145e5 100644 --- a/Documentation/devices.txt +++ b/Documentation/devices.txt @@ -100,7 +100,6 @@ Your cooperation is appreciated. 9 = /dev/urandom Faster, less secure random number gen. 10 = /dev/aio Asyncronous I/O notification interface 11 = /dev/kmsg Writes to this come out as printk's - 12 = /dev/oldmem Access to kexec-ed crash dump 1 block RAM disk 0 = /dev/ram0 First RAM disk 1 = /dev/ram1 Second RAM disk diff --git a/Documentation/kdump.txt b/Documentation/kdump.txt deleted file mode 100644 index 8fc3d68ae..000000000 --- a/Documentation/kdump.txt +++ /dev/null @@ -1,105 +0,0 @@ -Documentation for kdump - the kexec based crash dumping solution -================================================================ - -DESIGN -====== - -We use kexec to reboot to a second kernel whenever a dump needs to be taken. -This second kernel is booted with with very little memory (configurable -at compile time). The first kernel reserves the section of memory that the -second kernel uses. This ensures that on-going DMA from the first kernel -does not corrupt the second kernel. The first 640k of physical memory is -needed irrespective of where the kernel loads at. Hence, this region is -backed up before reboot. - -In the second kernel, "old memory" can be accessed in two ways. The -first one is through a device interface. We can create a /dev/oldmem or -whatever and write out the memory in raw format. The second interface is -through /proc/vmcore. This exports the dump as an ELF format file which -can be written out using any file copy command (cp, scp, etc). 
Further, gdb -can be used to perform some minimal debugging on the dump file. Both these -methods ensure that there is correct ordering of the dump pages (corresponding -to the first 640k that has been relocated). - -SETUP -===== - -1) Obtain the appropriate -mm tree patch and apply it on to the vanilla - kernel tree. - -2) Two kernels need to be built in order to get this feature working. - - For the first kernel, choose the default values for the following options. - - a) Physical address where the kernel is loaded - b) kexec system call - c) kernel crash dumps - - All the options are under "Processor type and features" - - For the second kernel, change (a) to 16MB. If you want to choose another - value here, ensure "location from where the crash dumping kernel will boot - (MB)" under (c) reflects the same value. - - Also ensure you have CONFIG_HIGHMEM on. - -3) Boot into the first kernel. You are now ready to try out kexec based crash - dumps. - -4) Load the second kernel to be booted using - - kexec -p <second-kernel> --args-linux --append="root=<root-dev> dump - init 1 memmap=exactmap memmap=640k@0 memmap=32M@16M" - - Note that <second-kernel> has to be a vmlinux image. bzImage will not - work, as of now. - -5) Enable kexec based dumping by - - echo 1 > /proc/kexec-dump - - If this is not set, the system will not do a kexec reboot in the event - of a panic. - -6) System reboots into the second kernel when a panic occurs. - You could write a module to call panic, for testing purposes. - -7) Write out the dump file using - - cp /proc/vmcore <dump-file> - -You can also access the dump as a device for a linear/raw view. To do this, -you will need the kd-oldmem-<version>.patch built into the kernel. To create -the device, type - - mknod /dev/oldmem c 1 12 - -Use "dd" with suitable options for count, bs and skip to access specific -portions of the dump. - -ANALYSIS -======== - -You can run gdb on the dump file copied out of /proc/vmcore. 
Use vmlinux built -with -g and run - - gdb vmlinux <dump-file> - -Stack trace for the task on processor 0, register display, memory display -work fine. - -TODO -==== - -1) Provide a kernel-pages only view for the dump. This could possibly turn up - as /proc/vmcore-kern. -2) Provide register contents of all processors (similar to what multi-threaded - core dumps does). -3) Modify "crash" to make it recognize this dump. -4) Make the i386 kernel boot from any location so we can run the second kernel - from the reserved location instead of the current approach. - -CONTACT -======= - -Hariprasad Nellitheertha - hari at in dot ibm dot com diff --git a/MAINTAINERS b/MAINTAINERS index 5f81698b5..66275d498 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -1259,17 +1259,6 @@ M: rml@novell.com L: linux-kernel@vger.kernel.org S: Maintained -KEXEC -P: Eric Biederman -P: Randy Dunlap -M: ebiederm@xmission.com -M: rddunlap@osdl.org -W: http://www.xmission.com/~ebiederm/files/kexec/ -W: http://developer.osdl.org/rddunlap/kexec/ -L: linux-kernel@vger.kernel.org -L: fastboot@osdl.org -S: Maintained - LANMEDIA WAN CARD DRIVER P: Andrew Stanley-Jones M: asj@lanmedia.com diff --git a/arch/h8300/kernel/ints.c b/arch/h8300/kernel/ints.c index 0b9ddba3d..edb3c4170 100644 --- a/arch/h8300/kernel/ints.c +++ b/arch/h8300/kernel/ints.c @@ -114,7 +114,7 @@ void __init init_IRQ(void) } } interrupt_redirect_table = ramvec; -#ifdef CRASH_DUMP_VECTOR +#ifdef DUMP_VECTOR ramvec_p = ramvec; for (i = 0; i < NR_IRQS; i++) { if ((i % 8) == 0) diff --git a/arch/h8300/platform/h8s/ints.c b/arch/h8300/platform/h8s/ints.c index 6b27e5ac5..5441cdd12 100644 --- a/arch/h8300/platform/h8s/ints.c +++ b/arch/h8300/platform/h8s/ints.c @@ -134,7 +134,7 @@ void __init init_IRQ(void) ramvec[TRAP0_VEC] = VECTOR(system_call); ramvec[TRAP3_VEC] = break_vec; interrupt_redirect_table = ramvec; -#ifdef CRASH_DUMP_VECTOR +#ifdef DUMP_VECTOR ramvec_p = ramvec; for (i = 0; i < NR_IRQS; i++) { if ((i % 8) == 0) diff --git 
a/arch/i386/Kconfig b/arch/i386/Kconfig index 1ed5b3831..1e4f78c0a 100644 --- a/arch/i386/Kconfig +++ b/arch/i386/Kconfig @@ -922,53 +922,6 @@ config REGPARM generate incorrect output with certain kernel constructs when -mregparm=3 is used. -config KERN_PHYS_OFFSET - int "Physical address where the kernel is loaded (1-112)MB" - range 1 112 - default "1" - help - This gives the physical address where the kernel is loaded. - Primarily used in the case of kexec on panic where the - recovery kernel needs to run at a different address than - the panic-ed kernel. - -config KEXEC - bool "kexec system call (EXPERIMENTAL)" - depends on EXPERIMENTAL - help - kexec is a system call that implements the ability to shutdown your - current kernel, and to start another kernel. It is like a reboot - but it is indepedent of the system firmware. And like a reboot - you can start any kernel with it, not just Linux. - - The name comes from the similiarity to the exec system call. - - It is an ongoing process to be certain the hardware in a machine - is properly shutdown, so do not be surprised if this code does not - initially work for you. It may help to enable device hotplugging - support. As of this writing the exact hardware interface is - strongly in flux, so no good recommendation can be made. - -config CRASH_DUMP - bool "kernel crash dumps (EXPERIMENTAL)" - depends on KEXEC - help - Generate crash dump using kexec. - -config BACKUP_BASE - int "location from where the crash dumping kernel will boot (MB)" - depends on CRASH_DUMP - default 16 - help - This is the location where the second kernel will boot from. - -config BACKUP_SIZE - int "Size of memory used by the crash dumping kernel (MB)" - depends on CRASH_DUMP - range 16 64 - default 32 - help - The size of the second kernel's memory. 
endmenu diff --git a/arch/i386/boot/compressed/head.S b/arch/i386/boot/compressed/head.S index 4f41af3a5..c5e80b69e 100644 --- a/arch/i386/boot/compressed/head.S +++ b/arch/i386/boot/compressed/head.S @@ -74,7 +74,7 @@ startup_32: popl %esi # discard address popl %esi # real mode pointer xorl %ebx,%ebx - ljmp $(__BOOT_CS), $KERN_PHYS_OFFSET + ljmp $(__BOOT_CS), $0x100000 /* * We come here, if we were loaded high. @@ -99,7 +99,7 @@ startup_32: popl %ecx # lcount popl %edx # high_buffer_start popl %eax # hcount - movl $KERN_PHYS_OFFSET,%edi + movl $0x100000,%edi cli # make sure we don't get interrupted ljmp $(__BOOT_CS), $0x1000 # and jump to the move routine @@ -124,5 +124,5 @@ move_routine_start: movsl movl %ebx,%esi # Restore setup pointer xorl %ebx,%ebx - ljmp $(__BOOT_CS), $KERN_PHYS_OFFSET + ljmp $(__BOOT_CS), $0x100000 move_routine_end: diff --git a/arch/i386/boot/compressed/misc.c b/arch/i386/boot/compressed/misc.c index 9805b3730..874568330 100644 --- a/arch/i386/boot/compressed/misc.c +++ b/arch/i386/boot/compressed/misc.c @@ -14,7 +14,6 @@ #include <linux/tty.h> #include <video/edid.h> #include <asm/io.h> -#include <asm/segment.h> /* * gzip declarations @@ -310,7 +309,7 @@ static void setup_normal_output_buffer(void) #else if ((RM_ALT_MEM_K > RM_EXT_MEM_K ? 
RM_ALT_MEM_K : RM_EXT_MEM_K) < 1024) error("Less than 2MB of memory"); #endif - output_data = (char *)KERN_PHYS_OFFSET; /* Points to 1M */ + output_data = (char *)0x100000; /* Points to 1M */ free_mem_end_ptr = (long)real_mode; } @@ -335,8 +334,8 @@ static void setup_output_buffer_if_we_run_high(struct moveparams *mv) low_buffer_size = low_buffer_end - LOW_BUFFER_START; high_loaded = 1; free_mem_end_ptr = (long)high_buffer_start; - if ( (KERN_PHYS_OFFSET + low_buffer_size) > ((ulg)high_buffer_start)) { - high_buffer_start = (uch *)(KERN_PHYS_OFFSET + low_buffer_size); + if ( (0x100000 + low_buffer_size) > ((ulg)high_buffer_start)) { + high_buffer_start = (uch *)(0x100000 + low_buffer_size); mv->hcount = 0; /* say: we need not to move high_buffer */ } else mv->hcount = -1; diff --git a/arch/i386/kernel/Makefile b/arch/i386/kernel/Makefile index 501c21afe..8ec7eac47 100644 --- a/arch/i386/kernel/Makefile +++ b/arch/i386/kernel/Makefile @@ -23,8 +23,6 @@ obj-$(CONFIG_X86_TRAMPOLINE) += trampoline.o obj-$(CONFIG_X86_MPPARSE) += mpparse.o obj-$(CONFIG_X86_LOCAL_APIC) += apic.o nmi.o obj-$(CONFIG_X86_IO_APIC) += io_apic.o -obj-$(CONFIG_KEXEC) += machine_kexec.o relocate_kernel.o -obj-$(CONFIG_CRASH_DUMP) += crash_dump.o obj-$(CONFIG_X86_NUMAQ) += numaq.o obj-$(CONFIG_X86_SUMMIT_NUMA) += summit.o obj-$(CONFIG_KPROBES) += kprobes.o diff --git a/arch/i386/kernel/apic.c b/arch/i386/kernel/apic.c index de085522e..9d5590c2d 100644 --- a/arch/i386/kernel/apic.c +++ b/arch/i386/kernel/apic.c @@ -219,36 +219,6 @@ void disconnect_bsp_APIC(void) outb(0x70, 0x22); outb(0x00, 0x23); } - else { - /* Go back to Virtual Wire compatibility mode */ - unsigned long value; - - /* For the spurious interrupt use vector F, and enable it */ - value = apic_read(APIC_SPIV); - value &= ~APIC_VECTOR_MASK; - value |= APIC_SPIV_APIC_ENABLED; - value |= 0xf; - apic_write_around(APIC_SPIV, value); - - /* For LVT0 make it edge triggered, active high, external and enabled */ - value = 
apic_read(APIC_LVT0); - value &= ~(APIC_MODE_MASK | APIC_SEND_PENDING | - APIC_INPUT_POLARITY | APIC_LVT_REMOTE_IRR | - APIC_LVT_LEVEL_TRIGGER | APIC_LVT_MASKED ); - value |= APIC_LVT_REMOTE_IRR | APIC_SEND_PENDING; - value = SET_APIC_DELIVERY_MODE(value, APIC_MODE_EXINT); - apic_write_around(APIC_LVT0, value); - - /* For LVT1 make it edge triggered, active high, nmi and enabled */ - value = apic_read(APIC_LVT1); - value &= ~( - APIC_MODE_MASK | APIC_SEND_PENDING | - APIC_INPUT_POLARITY | APIC_LVT_REMOTE_IRR | - APIC_LVT_LEVEL_TRIGGER | APIC_LVT_MASKED); - value |= APIC_LVT_REMOTE_IRR | APIC_SEND_PENDING; - value = SET_APIC_DELIVERY_MODE(value, APIC_MODE_NMI); - apic_write_around(APIC_LVT1, value); - } } void disable_local_APIC(void) diff --git a/arch/i386/kernel/crash_dump.c b/arch/i386/kernel/crash_dump.c deleted file mode 100644 index 1c9bdd2a8..000000000 --- a/arch/i386/kernel/crash_dump.c +++ /dev/null @@ -1,105 +0,0 @@ -/* - * Architecture specific (i386) functions for kexec based crash dumps. - * - * Created by: Hariprasad Nellitheertha (hari@in.ibm.com) - * - * Copyright (C) IBM Corporation, 2004. All rights reserved. 
- * - */ - -#include <linux/init.h> -#include <linux/types.h> -#include <linux/kernel.h> -#include <linux/smp.h> -#include <linux/irq.h> - -#include <asm/crash_dump.h> -#include <asm/processor.h> -#include <asm/hardirq.h> -#include <asm/nmi.h> -#include <asm/hw_irq.h> - -struct pt_regs crash_smp_regs[NR_CPUS]; -long crash_smp_current_task[NR_CPUS]; - -#ifdef CONFIG_SMP -static atomic_t waiting_for_dump_ipi; -static int crash_dump_expect_ipi[NR_CPUS]; -extern void crash_dump_send_ipi(void); -extern void stop_this_cpu(void *); - -static int crash_dump_nmi_callback(struct pt_regs *regs, int cpu) -{ - if (!crash_dump_expect_ipi[cpu]) - return 0; - - crash_dump_expect_ipi[cpu] = 0; - crash_dump_save_this_cpu(regs, cpu); - atomic_dec(&waiting_for_dump_ipi); - - stop_this_cpu(NULL); - - return 1; -} - -void __crash_dump_stop_cpus(void) -{ - int i, cpu, other_cpus; - - preempt_disable(); - cpu = smp_processor_id(); - other_cpus = num_online_cpus()-1; - - if (other_cpus > 0) { - atomic_set(&waiting_for_dump_ipi, other_cpus); - - for (i = 0; i < NR_CPUS; i++) - crash_dump_expect_ipi[i] = (i != cpu && cpu_online(i)); - - set_nmi_callback(crash_dump_nmi_callback); - /* Ensure the new callback function is set before sending - * out the IPI - */ - wmb(); - - crash_dump_send_ipi(); - while (atomic_read(&waiting_for_dump_ipi) > 0) - cpu_relax(); - - unset_nmi_callback(); - } else { - local_irq_disable(); - disable_local_APIC(); - local_irq_enable(); - } - preempt_enable(); -} -#else -void __crash_dump_stop_cpus(void) {} -#endif - -void crash_get_current_regs(struct pt_regs *regs) -{ - __asm__ __volatile__("movl %%ebx,%0" : "=m"(regs->ebx)); - __asm__ __volatile__("movl %%ecx,%0" : "=m"(regs->ecx)); - __asm__ __volatile__("movl %%edx,%0" : "=m"(regs->edx)); - __asm__ __volatile__("movl %%esi,%0" : "=m"(regs->esi)); - __asm__ __volatile__("movl %%edi,%0" : "=m"(regs->edi)); - __asm__ __volatile__("movl %%ebp,%0" : "=m"(regs->ebp)); - __asm__ __volatile__("movl %%eax,%0" : 
"=m"(regs->eax)); - __asm__ __volatile__("movl %%esp,%0" : "=m"(regs->esp)); - __asm__ __volatile__("movw %%ss, %%ax;" :"=a"(regs->xss)); - __asm__ __volatile__("movw %%cs, %%ax;" :"=a"(regs->xcs)); - __asm__ __volatile__("movw %%ds, %%ax;" :"=a"(regs->xds)); - __asm__ __volatile__("movw %%es, %%ax;" :"=a"(regs->xes)); - __asm__ __volatile__("pushfl; popl %0" :"=m"(regs->eflags)); - - regs->eip = (unsigned long)current_text_addr(); -} - -void crash_dump_save_this_cpu(struct pt_regs *regs, int cpu) -{ - crash_smp_current_task[cpu] = (long)current; - crash_smp_regs[cpu] = *regs; -} - diff --git a/arch/i386/kernel/entry.S b/arch/i386/kernel/entry.S index 006a19af1..02a2e7ce4 100644 --- a/arch/i386/kernel/entry.S +++ b/arch/i386/kernel/entry.S @@ -871,7 +871,7 @@ ENTRY(sys_call_table) .long sys_mq_timedreceive /* 280 */ .long sys_mq_notify .long sys_mq_getsetattr - .long sys_kexec_load + .long sys_ni_syscall /* reserved for kexec */ .long sys_waitid .long sys_ni_syscall /* 285 */ /* available */ .long sys_add_key diff --git a/arch/i386/kernel/i386_ksyms.c b/arch/i386/kernel/i386_ksyms.c index 9967fb368..685b7a25c 100644 --- a/arch/i386/kernel/i386_ksyms.c +++ b/arch/i386/kernel/i386_ksyms.c @@ -200,7 +200,7 @@ EXPORT_SYMBOL(ist_info); EXPORT_SYMBOL(csum_partial); -#ifdef CONFIG_CRASH_DUMP +#ifdef CONFIG_CRASH_DUMP_MODULE #ifdef CONFIG_SMP extern irq_desc_t irq_desc[NR_IRQS]; extern unsigned long irq_affinity[NR_IRQS]; @@ -210,8 +210,8 @@ EXPORT_SYMBOL(irq_affinity); EXPORT_SYMBOL(stop_this_cpu); EXPORT_SYMBOL(dump_send_ipi); #endif -extern int page_is_ram(unsigned long); -EXPORT_SYMBOL(page_is_ram); +extern int pfn_is_ram(unsigned long); +EXPORT_SYMBOL(pfn_is_ram); #ifdef ARCH_HAS_NMI_WATCHDOG EXPORT_SYMBOL(touch_nmi_watchdog); #endif diff --git a/arch/i386/kernel/i8259.c b/arch/i386/kernel/i8259.c index 12c1fb9f5..686a95bd0 100644 --- a/arch/i386/kernel/i8259.c +++ b/arch/i386/kernel/i8259.c @@ -269,22 +269,10 @@ static int i8259A_suspend(struct sys_device *dev, u32 
state) return 0; } -static int i8259A_shutdown(struct sys_device *dev) -{ - /* Put the i8259A into a quiescent state that - * the kernel initialization code can get it - * out of. - */ - outb(0xff, 0x21); /* mask all of 8259A-1 */ - outb(0xff, 0xA1); /* mask all of 8259A-1 */ - return 0; -} - static struct sysdev_class i8259_sysdev_class = { set_kset_name("i8259"), .suspend = i8259A_suspend, .resume = i8259A_resume, - .shutdown = i8259A_shutdown, }; static struct sys_device device_i8259A = { diff --git a/arch/i386/kernel/machine_kexec.c b/arch/i386/kernel/machine_kexec.c deleted file mode 100644 index ff59e77ac..000000000 --- a/arch/i386/kernel/machine_kexec.c +++ /dev/null @@ -1,233 +0,0 @@ -/* - * machine_kexec.c - handle transition of Linux booting another kernel - * Copyright (C) 2002-2004 Eric Biederman <ebiederm@xmission.com> - * - * This source code is licensed under the GNU General Public License, - * Version 2. See the file COPYING for more details. - */ - -#include <linux/mm.h> -#include <linux/kexec.h> -#include <linux/delay.h> -#include <asm/pgtable.h> -#include <asm/pgalloc.h> -#include <asm/tlbflush.h> -#include <asm/mmu_context.h> -#include <asm/io.h> -#include <asm/apic.h> -#include <asm/cpufeature.h> -#include <asm/crash_dump.h> - -static inline unsigned long read_cr3(void) -{ - unsigned long cr3; - asm volatile("movl %%cr3,%0": "=r"(cr3)); - return cr3; -} - -#define PAGE_ALIGNED __attribute__ ((__aligned__(PAGE_SIZE))) - -#define L0_ATTR (_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | _PAGE_DIRTY) -#define L1_ATTR (_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | _PAGE_DIRTY) -#define L2_ATTR (_PAGE_PRESENT) - -#define LEVEL0_SIZE (1UL << 12UL) - -#ifndef CONFIG_X86_PAE -#define LEVEL1_SIZE (1UL << 22UL) -static u32 pgtable_level1[1024] PAGE_ALIGNED; - -static void identity_map_page(unsigned long address) -{ - unsigned long level1_index, level2_index; - u32 *pgtable_level2; - - /* Find the current page table */ - pgtable_level2 = __va(read_cr3()); - - /* 
Find the indexes of the physical address to identity map */ - level1_index = (address % LEVEL1_SIZE)/LEVEL0_SIZE; - level2_index = address / LEVEL1_SIZE; - - /* Identity map the page table entry */ - pgtable_level1[level1_index] = address | L0_ATTR; - pgtable_level2[level2_index] = __pa(pgtable_level1) | L1_ATTR; - - /* Flush the tlb so the new mapping takes effect. - * Global tlb entries are not flushed but that is not an issue. - */ - load_cr3(pgtable_level2); -} - -#else -#define LEVEL1_SIZE (1UL << 21UL) -#define LEVEL2_SIZE (1UL << 30UL) -static u64 pgtable_level1[512] PAGE_ALIGNED; -static u64 pgtable_level2[512] PAGE_ALIGNED; - -static void identity_map_page(unsigned long address) -{ - unsigned long level1_index, level2_index, level3_index; - u64 *pgtable_level3; - - /* Find the current page table */ - pgtable_level3 = __va(read_cr3()); - - /* Find the indexes of the physical address to identity map */ - level1_index = (address % LEVEL1_SIZE)/LEVEL0_SIZE; - level2_index = (address % LEVEL2_SIZE)/LEVEL1_SIZE; - level3_index = address / LEVEL2_SIZE; - - /* Identity map the page table entry */ - pgtable_level1[level1_index] = address | L0_ATTR; - pgtable_level2[level2_index] = __pa(pgtable_level1) | L1_ATTR; - set_64bit(&pgtable_level3[level3_index], __pa(pgtable_level2) | L2_ATTR); - - /* Flush the tlb so the new mapping takes effect. - * Global tlb entries are not flushed but that is not an issue. 
- */ - load_cr3(pgtable_level3); -} -#endif - - -static void set_idt(void *newidt, __u16 limit) -{ - unsigned char curidt[6]; - - /* ia32 supports unaliged loads & stores */ - (*(__u16 *)(curidt)) = limit; - (*(__u32 *)(curidt +2)) = (unsigned long)(newidt); - - __asm__ __volatile__ ( - "lidt %0\n" - : "=m" (curidt) - ); -}; - - -static void set_gdt(void *newgdt, __u16 limit) -{ - unsigned char curgdt[6]; - - /* ia32 supports unaligned loads & stores */ - (*(__u16 *)(curgdt)) = limit; - (*(__u32 *)(curgdt +2)) = (unsigned long)(newgdt); - - __asm__ __volatile__ ( - "lgdt %0\n" - : "=m" (curgdt) - ); -}; - -static void load_segments(void) -{ -#define __STR(X) #X -#define STR(X) __STR(X) - - __asm__ __volatile__ ( - "\tljmp $"STR(__KERNEL_CS)",$1f\n" - "\t1:\n" - "\tmovl $"STR(__KERNEL_DS)",%eax\n" - "\tmovl %eax,%ds\n" - "\tmovl %eax,%es\n" - "\tmovl %eax,%fs\n" - "\tmovl %eax,%gs\n" - "\tmovl %eax,%ss\n" - ); -#undef STR -#undef __STR -} - -typedef asmlinkage void (*relocate_new_kernel_t)( - unsigned long indirection_page, unsigned long reboot_code_buffer, - unsigned long start_address, unsigned int has_pae); - -const extern unsigned char relocate_new_kernel[]; -extern void relocate_new_kernel_end(void); -const extern unsigned int relocate_new_kernel_size; - -/* - * Do what every setup is needed on image and the - * reboot code buffer to allow us to avoid allocations - * later. Currently nothing. - */ -int machine_kexec_prepare(struct kimage *image) -{ - return 0; -} - -void machine_kexec_cleanup(struct kimage *image) -{ -} - -/* - * We are going to do a memory preserving reboot. So, we copy over the - * first 640k of memory into a backup location. Though the second kernel - * boots from a different location, it still requires the first 640k. - * Hence this backup. 
- */ -void __crash_relocate_mem(unsigned long backup_addr, unsigned long backup_size) -{ - unsigned long pfn, pfn_max; - void *src_addr, *dest_addr; - struct page *page; - - pfn_max = backup_size >> PAGE_SHIFT; - for (pfn = 0; pfn < pfn_max; pfn++) { - src_addr = phys_to_virt(pfn << PAGE_SHIFT); - dest_addr = backup_addr + src_addr; - if (!pfn_valid(pfn)) - continue; - page = pfn_to_page(pfn); - if (PageReserved(page)) - copy_page(dest_addr, src_addr); - } -} - -/* - * Do not allocate memory (or fail in any way) in machine_kexec(). - * We are past the point of no return, committed to rebooting now. - */ -void machine_kexec(struct kimage *image) -{ - unsigned long indirection_page; - unsigned long reboot_code_buffer; - relocate_new_kernel_t rnk; - - /* Interrupts aren't acceptable while we reboot */ - local_irq_disable(); - - /* Compute some offsets */ - reboot_code_buffer = page_to_pfn(image->control_code_page) << PAGE_SHIFT; - indirection_page = image->head & PAGE_MASK; - - /* Set up an identity mapping for the reboot_code_buffer */ - identity_map_page(reboot_code_buffer); - - /* copy it out */ - memcpy((void *)reboot_code_buffer, relocate_new_kernel, relocate_new_kernel_size); - - /* The segment registers are funny things, they are - * automatically loaded from a table, in memory wherever you - * set them to a specific selector, but this table is never - * accessed again you set the segment to a different selector. - * - * The more common model is are caches where the behide - * the scenes work is done, but is also dropped at arbitrary - * times. - * - * I take advantage of this here by force loading the - * segments, before I zap the gdt with an invalid value. - */ - load_segments(); - /* The gdt & idt are now invalid. - * If you want to load them you must set up your own idt & gdt. 
- */ - set_gdt(phys_to_virt(0),0); - set_idt(phys_to_virt(0),0); - - /* now call it */ - rnk = (relocate_new_kernel_t) reboot_code_buffer; - (*rnk)(indirection_page, reboot_code_buffer, image->start, cpu_has_pae); -} diff --git a/arch/i386/kernel/reboot.c b/arch/i386/kernel/reboot.c index c4d62d9a8..dd06362cd 100644 --- a/arch/i386/kernel/reboot.c +++ b/arch/i386/kernel/reboot.c @@ -23,6 +23,7 @@ static int reboot_mode; int reboot_thru_bios; #ifdef CONFIG_SMP +int reboot_smp = 0; static int reboot_cpu = -1; /* shamelessly grabbed from lib/vsprintf.c for readability */ #define is_digit(c) ((c) >= '0' && (c) <= '9') @@ -45,6 +46,7 @@ static int __init reboot_setup(char *str) break; #ifdef CONFIG_SMP case 's': /* "smp" reboot by executing reset on BSP or other CPU*/ + reboot_smp = 1; if (is_digit(*(str+1))) { reboot_cpu = (int) (*(str+1) - '0'); if (is_digit(*(str+2))) @@ -83,9 +85,33 @@ static int __init set_bios_reboot(struct dmi_system_id *d) return 0; } +/* + * Some machines require the "reboot=s" commandline option, this quirk makes that automatic. + */ +static int __init set_smp_reboot(struct dmi_system_id *d) +{ +#ifdef CONFIG_SMP + if (!reboot_smp) { + reboot_smp = 1; + printk(KERN_INFO "%s series board detected. Selecting SMP-method for reboots.\n", d->ident); + } +#endif + return 0; +} + +/* + * Some machines require the "reboot=b,s" commandline option, this quirk makes that automatic. 
+ */ +static int __init set_smp_bios_reboot(struct dmi_system_id *d) +{ + set_smp_reboot(d); + set_bios_reboot(d); + return 0; +} + static struct dmi_system_id __initdata reboot_dmi_table[] = { { /* Handle problems with rebooting on Dell 1300's */ - .callback = set_bios_reboot, + .callback = set_smp_bios_reboot, .ident = "Dell PowerEdge 1300", .matches = { DMI_MATCH(DMI_SYS_VENDOR, "Dell Computer Corporation"), @@ -269,32 +295,41 @@ void machine_real_restart(unsigned char *code, int length) : "i" ((void *) (0x1000 - sizeof (real_mode_switch) - 100))); } -void machine_shutdown(void) +void machine_restart(char * __unused) { #ifdef CONFIG_SMP - int reboot_cpu_id; - - /* The boot cpu is always logical cpu 0 */ - reboot_cpu_id = 0; - - /* See if there has been given a command line override */ - if ((reboot_cpu_id != -1) && (reboot_cpu < NR_CPUS) && - cpu_isset(reboot_cpu, cpu_online_map)) { - reboot_cpu_id = reboot_cpu; + int cpuid; + + cpuid = GET_APIC_ID(apic_read(APIC_ID)); + + if (reboot_smp) { + + /* check to see if reboot_cpu is valid + if its not, default to the BSP */ + if ((reboot_cpu == -1) || + (reboot_cpu > (NR_CPUS -1)) || + !physid_isset(cpuid, phys_cpu_present_map)) + reboot_cpu = boot_cpu_physical_apicid; + + reboot_smp = 0; /* use this as a flag to only go through this once*/ + /* re-run this function on the other CPUs + it will fall though this section since we have + cleared reboot_smp, and do the reboot if it is the + correct CPU, otherwise it halts. 
*/ + if (reboot_cpu != cpuid) + smp_call_function((void *)machine_restart , NULL, 1, 0); } - /* Make certain the cpu I'm rebooting on is online */ - if (!cpu_isset(reboot_cpu_id, cpu_online_map)) { - reboot_cpu_id = smp_processor_id(); + /* if reboot_cpu is still -1, then we want a tradional reboot, + and if we are not running on the reboot_cpu,, halt */ + if ((reboot_cpu != -1) && (cpuid != reboot_cpu)) { + for (;;) + __asm__ __volatile__ ("hlt"); } - - /* Make certain I only run on the appropriate processor */ - set_cpus_allowed(current, cpumask_of_cpu(reboot_cpu_id)); - - /* O.K. Now that I'm on the appropriate processor, stop - * all of the others, and disable their local APICs. + /* + * Stop all CPUs and turn off local APICs and the IO-APIC, so + * other OSs see a clean IRQ state. */ - smp_send_stop(); #endif /* CONFIG_SMP */ @@ -303,11 +338,6 @@ void machine_shutdown(void) #ifdef CONFIG_X86_IO_APIC disable_IO_APIC(); #endif -} - -void machine_restart(char * __unused) -{ - machine_shutdown(); if (!reboot_thru_bios) { if (efi_enabled) { diff --git a/arch/i386/kernel/relocate_kernel.S b/arch/i386/kernel/relocate_kernel.S deleted file mode 100644 index 54be4c2ae..000000000 --- a/arch/i386/kernel/relocate_kernel.S +++ /dev/null @@ -1,118 +0,0 @@ -/* - * relocate_kernel.S - put the kernel image in place to boot - * Copyright (C) 2002-2004 Eric Biederman <ebiederm@xmission.com> - * - * This source code is licensed under the GNU General Public License, - * Version 2. See the file COPYING for more details. - */ - -#include <linux/linkage.h> - - /* - * Must be relocatable PIC code callable as a C function, that once - * it starts can not use the previous processes stack. 
- */ - .globl relocate_new_kernel -relocate_new_kernel: - /* read the arguments and say goodbye to the stack */ - movl 4(%esp), %ebx /* indirection_page */ - movl 8(%esp), %ebp /* reboot_code_buffer */ - movl 12(%esp), %edx /* start address */ - movl 16(%esp), %ecx /* cpu_has_pae */ - - /* zero out flags, and disable interrupts */ - pushl $0 - popfl - - /* set a new stack at the bottom of our page... */ - lea 4096(%ebp), %esp - - /* store the parameters back on the stack */ - pushl %edx /* store the start address */ - - /* Set cr0 to a known state: - * 31 0 == Paging disabled - * 18 0 == Alignment check disabled - * 16 0 == Write protect disabled - * 3 0 == No task switch - * 2 0 == Don't do FP software emulation. - * 0 1 == Proctected mode enabled - */ - movl %cr0, %eax - andl $~((1<<31)|(1<<18)|(1<<16)|(1<<3)|(1<<2)), %eax - orl $(1<<0), %eax - movl %eax, %cr0 - - /* clear cr4 if applicable */ - testl %ecx, %ecx - jz 1f - /* Set cr4 to a known state: - * Setting everything to zero seems safe. - */ - movl %cr4, %eax - andl $0, %eax - movl %eax, %cr4 - - jmp 1f -1: - - /* Flush the TLB (needed?) */ - xorl %eax, %eax - movl %eax, %cr3 - - /* Do the copies */ - cld -0: /* top, read another word for the indirection page */ - movl %ebx, %ecx - movl (%ebx), %ecx - addl $4, %ebx - testl $0x1, %ecx /* is it a destination page */ - jz 1f - movl %ecx, %edi - andl $0xfffff000, %edi - jmp 0b -1: - testl $0x2, %ecx /* is it an indirection page */ - jz 1f - movl %ecx, %ebx - andl $0xfffff000, %ebx - jmp 0b -1: - testl $0x4, %ecx /* is it the done indicator */ - jz 1f - jmp 2f -1: - testl $0x8, %ecx /* is it the source indicator */ - jz 0b /* Ignore it otherwise */ - movl %ecx, %esi /* For every source page do a copy */ - andl $0xfffff000, %esi - - movl $1024, %ecx - rep ; movsl - jmp 0b - -2: - - /* To be certain of avoiding problems with self-modifying code - * I need to execute a serializing instruction here. - * So I flush the TLB, it's handy, and not processor dependent. 
- */ - xorl %eax, %eax - movl %eax, %cr3 - - /* set all of the registers to known values */ - /* leave %esp alone */ - - xorl %eax, %eax - xorl %ebx, %ebx - xorl %ecx, %ecx - xorl %edx, %edx - xorl %esi, %esi - xorl %edi, %edi - xorl %ebp, %ebp - ret -relocate_new_kernel_end: - - .globl relocate_new_kernel_size -relocate_new_kernel_size: - .long relocate_new_kernel_end - relocate_new_kernel diff --git a/arch/i386/kernel/setup.c b/arch/i386/kernel/setup.c index 7ccdf028c..6910009bc 100644 --- a/arch/i386/kernel/setup.c +++ b/arch/i386/kernel/setup.c @@ -48,7 +48,6 @@ #include <asm/io_apic.h> #include <asm/ist.h> #include <asm/io.h> -#include <asm/crash_dump.h> #include "setup_arch_pre.h" #include <bios_ebda.h> @@ -58,7 +57,6 @@ unsigned long init_pg_tables_end __initdata = ~0UL; int disable_pse __initdata = 0; -unsigned int dump_enabled; /* * Machine setup.. @@ -712,11 +710,6 @@ static void __init parse_cmdline_early (char ** cmdline_p) if (to != command_line) to--; if (!memcmp(from+7, "exactmap", 8)) { - /* If we are doing a crash dump, we - * still need to know the real mem - * size. 
- */ - set_saved_max_pfn(); from += 8+7; e820.nr_map = 0; userdef = 1; @@ -823,9 +816,6 @@ static void __init parse_cmdline_early (char ** cmdline_p) */ if (c == ' ' && !memcmp(from, "highmem=", 8)) highmem_pages = memparse(from+8, &from) >> PAGE_SHIFT; - - if (!memcmp(from, "dump", 4)) - dump_enabled = 1; if (c == ' ' && !memcmp(from, "crashdump=", 10)) crashdump_addr = memparse(from+10, &from); @@ -1125,9 +1115,6 @@ static unsigned long __init setup_memory(void) } } #endif - - crash_reserve_bootmem(); - return max_low_pfn; } #else diff --git a/arch/i386/kernel/smp.c b/arch/i386/kernel/smp.c index 131ed47f2..812b50a1f 100644 --- a/arch/i386/kernel/smp.c +++ b/arch/i386/kernel/smp.c @@ -139,15 +139,12 @@ void __send_IPI_shortcut(unsigned int shortcut, int vector) */ apic_wait_icr_idle(); - if (vector == CRASH_DUMP_VECTOR) - cfg = (cfg&~APIC_VECTOR_MASK)|APIC_DM_NMI; - /* * No need to touch the target chip field */ cfg = __prepare_ICR(shortcut, vector); - if (vector == CRASH_DUMP_VECTOR) { + if (vector == DUMP_VECTOR) { /* * Setup DUMP IPI to be delivered as an NMI */ @@ -232,7 +229,7 @@ inline void send_IPI_mask_sequence(cpumask_t mask, int vector) */ cfg = __prepare_ICR(0, vector); - if (vector == CRASH_DUMP_VECTOR) { + if (vector == DUMP_VECTOR) { /* * Setup DUMP IPI to be delivered as an NMI */ @@ -489,7 +486,7 @@ void flush_tlb_all(void) void dump_send_ipi(void) { - send_IPI_allbutself(CRASH_DUMP_VECTOR); + send_IPI_allbutself(DUMP_VECTOR); } /* @@ -502,11 +499,6 @@ void smp_send_reschedule(int cpu) send_IPI_mask(cpumask_of_cpu(cpu), RESCHEDULE_VECTOR); } -void crash_dump_send_ipi(void) -{ - send_IPI_allbutself(CRASH_DUMP_VECTOR); -} - /* * Structure and data for smp_call_function(). This is designed to minimise * static memory requirements. It also looks cleaner. 
diff --git a/arch/i386/kernel/vmlinux.lds.S b/arch/i386/kernel/vmlinux.lds.S index cfb1b180e..e0512cc8b 100644 --- a/arch/i386/kernel/vmlinux.lds.S +++ b/arch/i386/kernel/vmlinux.lds.S @@ -2,24 +2,20 @@ * Written by Martin Mares <mj@atrey.karlin.mff.cuni.cz>; */ -#define LOAD_OFFSET __PAGE_OFFSET - #include <asm-generic/vmlinux.lds.h> #include <asm/thread_info.h> #include <asm/page.h> -#include <asm/segment.h> OUTPUT_FORMAT("elf32-i386", "elf32-i386", "elf32-i386") OUTPUT_ARCH(i386) -ENTRY(phys_startup_32) +ENTRY(startup_32) jiffies = jiffies_64; SECTIONS { - . = LOAD_OFFSET + KERN_PHYS_OFFSET; - phys_startup_32 = startup_32 - LOAD_OFFSET; + . = __PAGE_OFFSET + 0x100000; /* read-only */ _text = .; /* Text and read-only data */ - .text : AT(ADDR(.text) - LOAD_OFFSET) { + .text : { *(.text) SCHED_TEXT LOCK_TEXT @@ -31,51 +27,49 @@ SECTIONS . = ALIGN(16); /* Exception table */ __start___ex_table = .; - __ex_table : AT(ADDR(__ex_table) - LOAD_OFFSET) { *(__ex_table) } + __ex_table : { *(__ex_table) } __stop___ex_table = .; RODATA /* writeable */ - .data : AT(ADDR(.data) - LOAD_OFFSET) { /* Data */ + .data : { /* Data */ *(.data) CONSTRUCTORS } . = ALIGN(4096); __nosave_begin = .; - .data_nosave : AT(ADDR(.data_nosave) - LOAD_OFFSET) { *(.data.nosave) } + .data_nosave : { *(.data.nosave) } . = ALIGN(4096); __nosave_end = .; . = ALIGN(4096); - .data.page_aligned : AT(ADDR(.data.page_aligned) - LOAD_OFFSET) { *(.data.idt) } + .data.page_aligned : { *(.data.idt) } . = ALIGN(32); - .data.cacheline_aligned : AT(ADDR(.data.cacheline_aligned) - LOAD_OFFSET) { - *(.data.cacheline_aligned) - } + .data.cacheline_aligned : { *(.data.cacheline_aligned) } _edata = .; /* End of data section */ . = ALIGN(THREAD_SIZE); /* init_task */ - .data.init_task : AT(ADDR(.data.init_task) - LOAD_OFFSET) { *(.data.init_task) } + .data.init_task : { *(.data.init_task) } /* will be freed after init */ . 
= ALIGN(4096); /* Init code and data */ __init_begin = .; - .init.text : AT(ADDR(.init.text) - LOAD_OFFSET) { + .init.text : { _sinittext = .; *(.init.text) _einittext = .; } - .init.data : AT(ADDR(.init.data) - LOAD_OFFSET) { *(.init.data) } + .init.data : { *(.init.data) } . = ALIGN(16); __setup_start = .; - .init.setup : AT(ADDR(.init.setup) - LOAD_OFFSET) { *(.init.setup) } + .init.setup : { *(.init.setup) } __setup_end = .; __initcall_start = .; - .initcall.init : AT(ADDR(.initcall.init) - LOAD_OFFSET) { + .initcall.init : { *(.initcall1.init) *(.initcall2.init) *(.initcall3.init) @@ -86,40 +80,33 @@ SECTIONS } __initcall_end = .; __con_initcall_start = .; - .con_initcall.init : AT(ADDR(.con_initcall.init) - LOAD_OFFSET) { - *(.con_initcall.init) - } + .con_initcall.init : { *(.con_initcall.init) } __con_initcall_end = .; SECURITY_INIT . = ALIGN(4); __alt_instructions = .; - .altinstructions : AT(ADDR(.altinstructions) - LOAD_OFFSET) { - *(.altinstructions) - } - __alt_instructions_end = .; - .altinstr_replacement : AT(ADDR(.altinstr_replacement) - LOAD_OFFSET) { - *(.altinstr_replacement) - } + .altinstructions : { *(.altinstructions) } + __alt_instructions_end = .; + .altinstr_replacement : { *(.altinstr_replacement) } /* .exit.text is discard at runtime, not link time, to deal with references from .altinstructions and .eh_frame */ - .exit.text : AT(ADDR(.exit.text) - LOAD_OFFSET) { *(.exit.text) } - .exit.data : AT(ADDR(.exit.data) - LOAD_OFFSET) { *(.exit.data) } + .exit.text : { *(.exit.text) } + .exit.data : { *(.exit.data) } . = ALIGN(4096); __initramfs_start = .; - .init.ramfs : AT(ADDR(.init.ramfs) - LOAD_OFFSET) { *(.init.ramfs) } + .init.ramfs : { *(.init.ramfs) } __initramfs_end = .; . = ALIGN(32); __per_cpu_start = .; - .data.percpu : AT(ADDR(.data.percpu) - LOAD_OFFSET) { *(.data.percpu) } + .data.percpu : { *(.data.percpu) } __per_cpu_end = .; . 
= ALIGN(4096); __init_end = .; /* freed after init ends here */ __bss_start = .; /* BSS */ - .bss.page_aligned : AT(ADDR(.bss.page_aligned) - LOAD_OFFSET) { - *(.bss.page_aligned) } - .bss : AT(ADDR(.bss) - LOAD_OFFSET) { + .bss : { + *(.bss.page_aligned) *(.bss) } . = ALIGN(4); diff --git a/arch/i386/mm/discontig.c b/arch/i386/mm/discontig.c index 5bd9e6cff..33b81da92 100644 --- a/arch/i386/mm/discontig.c +++ b/arch/i386/mm/discontig.c @@ -32,7 +32,6 @@ #include <asm/e820.h> #include <asm/setup.h> #include <asm/mmzone.h> -#include <asm/crash_dump.h> #include <bios_ebda.h> struct pglist_data *node_data[MAX_NUMNODES]; @@ -364,9 +363,6 @@ unsigned long __init setup_memory(void) } } #endif - - crash_reserve_bootmem(); - return system_max_low_pfn; } diff --git a/arch/i386/mm/highmem.c b/arch/i386/mm/highmem.c index c5547476b..581753285 100644 --- a/arch/i386/mm/highmem.c +++ b/arch/i386/mm/highmem.c @@ -74,24 +74,6 @@ void kunmap_atomic(void *kvaddr, enum km_type type) preempt_check_resched(); } -/* This is the same as kmap_atomic() but can map memory that doesn't - * have a struct page associated with it. - */ -char *kmap_atomic_pfn(unsigned long pfn, enum km_type type) -{ - enum fixed_addresses idx; - unsigned long vaddr; - - inc_preempt_count(); - - idx = type + KM_TYPE_NR*smp_processor_id(); - vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx); - set_pte(kmap_pte-idx, pfn_pte(pfn, kmap_prot)); - __flush_tlb_one(vaddr); - - return (char *)vaddr; -} - struct page *kmap_atomic_to_page(void *ptr) { unsigned long idx, vaddr = (unsigned long)ptr; @@ -104,3 +86,4 @@ struct page *kmap_atomic_to_page(void *ptr) pte = kmap_pte - (idx - FIX_KMAP_BEGIN); return pte_page(*pte); } + diff --git a/arch/ppc/Kconfig b/arch/ppc/Kconfig index b460f005d..56afd54b7 100644 --- a/arch/ppc/Kconfig +++ b/arch/ppc/Kconfig @@ -189,26 +189,6 @@ config MATH_EMULATION here. Saying Y here will not hurt performance (on any machine) but will increase the size of the kernel. 
-config KEXEC - bool "kexec system call (EXPERIMENTAL)" - depends on EXPERIMENTAL - help - kexec is a system call that implements the ability to shutdown your - current kernel, and to start another kernel. It is like a reboot - but it is indepedent of the system firmware. And like a reboot - you can start any kernel with it, not just Linux. - - The name comes from the similiarity to the exec system call. - - It is an ongoing process to be certain the hardware in a machine - is properly shutdown, so do not be surprised if this code does not - initially work for you. It may help to enable device hotplugging - support. As of this writing the exact hardware interface is - strongly in flux, so no good recommendation can be made. - - In the GameCube implementation, kexec allows you to load and - run DOL files, including kernel and homebrew DOLs. - source "drivers/cpufreq/Kconfig" config CPU_FREQ_PMAC diff --git a/arch/ppc/kernel/Makefile b/arch/ppc/kernel/Makefile index 24845dbe4..7d0abfff4 100644 --- a/arch/ppc/kernel/Makefile +++ b/arch/ppc/kernel/Makefile @@ -24,7 +24,6 @@ obj-$(CONFIG_KGDB) += ppc-stub.o obj-$(CONFIG_SMP) += smp.o smp-tbsync.o obj-$(CONFIG_TAU) += temp.o obj-$(CONFIG_ALTIVEC) += vecemu.o vector.o -obj-$(CONFIG_KEXEC) += machine_kexec.o relocate_kernel.o ifndef CONFIG_MATH_EMULATION obj-$(CONFIG_8xx) += softemu8xx.o diff --git a/arch/ppc/kernel/machine_kexec.c b/arch/ppc/kernel/machine_kexec.c deleted file mode 100644 index caac3d456..000000000 --- a/arch/ppc/kernel/machine_kexec.c +++ /dev/null @@ -1,114 +0,0 @@ -/* - * machine_kexec.c - handle transition of Linux booting another kernel - * Copyright (C) 2002-2003 Eric Biederman <ebiederm@xmission.com> - * - * GameCube/ppc32 port Copyright (C) 2004 Albert Herranz - * - * This source code is licensed under the GNU General Public License, - * Version 2. See the file COPYING for more details. 
- */ - -#include <linux/mm.h> -#include <linux/kexec.h> -#include <linux/delay.h> -#include <linux/reboot.h> -#include <asm/pgtable.h> -#include <asm/pgalloc.h> -#include <asm/mmu_context.h> -#include <asm/io.h> -#include <asm/hw_irq.h> -#include <asm/cacheflush.h> -#include <asm/machdep.h> - -typedef void (*relocate_new_kernel_t)( - unsigned long indirection_page, unsigned long reboot_code_buffer, - unsigned long start_address); - -const extern unsigned char relocate_new_kernel[]; -const extern unsigned int relocate_new_kernel_size; - -void machine_shutdown(void) -{ - if (ppc_md.machine_shutdown) { - ppc_md.machine_shutdown(); - } -} - -/* - * Do what every setup is needed on image and the - * reboot code buffer to allow us to avoid allocations - * later. - */ -int machine_kexec_prepare(struct kimage *image) -{ - if (ppc_md.machine_kexec_prepare) { - return ppc_md.machine_kexec_prepare(image); - } - /* - * Fail if platform doesn't provide its own machine_kexec_prepare - * implementation. - */ - return -ENOSYS; -} - -void machine_kexec_cleanup(struct kimage *image) -{ - if (ppc_md.machine_kexec_cleanup) { - ppc_md.machine_kexec_cleanup(image); - } -} - -/* - * Do not allocate memory (or fail in any way) in machine_kexec(). - * We are past the point of no return, committed to rebooting now. - */ -void machine_kexec(struct kimage *image) -{ - if (ppc_md.machine_kexec) { - ppc_md.machine_kexec(image); - } else { - /* - * Fall back to normal restart if platform doesn't provide - * its own kexec function, and user insist to kexec... - */ - machine_restart(NULL); - } -} - - -/* - * This is a generic machine_kexec function suitable at least for - * non-OpenFirmware embedded platforms. - * It merely copies the image relocation code to the control page and - * jumps to it. - * A platform specific function may just call this one. 
- */ -void machine_kexec_simple(struct kimage *image) -{ - unsigned long indirection_page; - unsigned long reboot_code_buffer, reboot_code_buffer_phys; - relocate_new_kernel_t rnk; - - /* Interrupts aren't acceptable while we reboot */ - local_irq_disable(); - - indirection_page = image->head & PAGE_MASK; - - /* we need both effective and real address here */ - reboot_code_buffer = - (unsigned long)page_address(image->control_code_page); - reboot_code_buffer_phys = virt_to_phys((void *)reboot_code_buffer); - - /* copy our kernel relocation code to the control code page */ - memcpy((void *)reboot_code_buffer, - relocate_new_kernel, relocate_new_kernel_size); - - flush_icache_range(reboot_code_buffer, - reboot_code_buffer + KEXEC_CONTROL_CODE_SIZE); - printk(KERN_INFO "Bye!\n"); - - /* now call it */ - rnk = (relocate_new_kernel_t) reboot_code_buffer; - (*rnk)(indirection_page, reboot_code_buffer_phys, image->start); -} - diff --git a/arch/ppc/kernel/relocate_kernel.S b/arch/ppc/kernel/relocate_kernel.S deleted file mode 100644 index e170b13a6..000000000 --- a/arch/ppc/kernel/relocate_kernel.S +++ /dev/null @@ -1,135 +0,0 @@ -/* - * relocate_kernel.S - put the kernel image in place to boot - * Copyright (C) 2002-2003 Eric Biederman <ebiederm@xmission.com> - * - * GameCube/ppc32 port Copyright (C) 2004 Albert Herranz - * - * This source code is licensed under the GNU General Public License, - * Version 2. See the file COPYING for more details. - */ - -#include <asm/reg.h> -#include <asm/ppc_asm.h> -#include <asm/processor.h> - -#include <asm/kexec.h> - -#define PAGE_SIZE 4096 /* must be same value as in <asm/page.h> */ - -/* returns r3 = relocated address of sym */ -/* modifies r0 */ -#define RELOC_SYM(sym) \ - mflr r3; \ - bl 1f; \ -1: mflr r0; \ - mtlr r3; \ - lis r3, 1b@ha; \ - ori r3, r3, 1b@l; \ - subf r0, r3, r0; \ - lis r3, sym@ha; \ - ori r3, r3, sym@l; \ - add r3, r3, r0 - - /* - * Must be relocatable PIC code callable as a C function. 
- */ - .globl relocate_new_kernel -relocate_new_kernel: - /* r3 = indirection_page */ - /* r4 = reboot_code_buffer */ - /* r5 = start_address */ - - li r0, 0 - - /* - * Set Machine Status Register to a known status, - * switch the MMU off and jump to 1: in a single step. - */ - - mr r8, r0 - ori r8, r8, MSR_RI|MSR_ME - mtspr SRR1, r8 - addi r8, r4, 1f - relocate_new_kernel - mtspr SRR0, r8 - sync - rfi - -1: - /* from this point address translation is turned off */ - /* and interrupts are disabled */ - - /* set a new stack at the bottom of our page... */ - /* (not really needed now) */ - addi r1, r4, KEXEC_CONTROL_CODE_SIZE - 8 /* for LR Save+Back Chain */ - stw r0, 0(r1) - - /* Do the copies */ - li r6, 0 /* checksum */ - subi r3, r3, 4 - -0: /* top, read another word for the indirection page */ - lwzu r0, 4(r3) - - /* is it a destination page? (r8) */ - rlwinm. r7, r0, 0, 31, 31 /* IND_DESTINATION (1<<0) */ - beq 1f - - rlwinm r8, r0, 0, 0, 19 /* clear kexec flags, page align */ - b 0b - -1: /* is it an indirection page? (r3) */ - rlwinm. r7, r0, 0, 30, 30 /* IND_INDIRECTION (1<<1) */ - beq 1f - - rlwinm r3, r0, 0, 0, 19 /* clear kexec flags, page align */ - subi r3, r3, 4 - b 0b - -1: /* are we done? */ - rlwinm. r7, r0, 0, 29, 29 /* IND_DONE (1<<2) */ - beq 1f - b 2f - -1: /* is it a source page? (r9) */ - rlwinm. r7, r0, 0, 28, 28 /* IND_SOURCE (1<<3) */ - beq 0b - - rlwinm r9, r0, 0, 0, 19 /* clear kexec flags, page align */ - - li r7, PAGE_SIZE / 4 - mtctr r7 - subi r9, r9, 4 - subi r8, r8, 4 -9: - lwzu r0, 4(r9) /* do the copy */ - xor r6, r6, r0 - stwu r0, 4(r8) - dcbst 0, r8 - sync - icbi 0, r8 - bdnz 9b - - addi r9, r9, 4 - addi r8, r8, 4 - b 0b - -2: - - /* To be certain of avoiding problems with self-modifying code - * execute a serializing instruction here. 
- */ - isync - sync - - /* jump to the entry point, usually the setup routine */ - mtlr r5 - blrl - -1: b 1b - -relocate_new_kernel_end: - - .globl relocate_new_kernel_size -relocate_new_kernel_size: - .long relocate_new_kernel_end - relocate_new_kernel - diff --git a/arch/x86_64/Kconfig b/arch/x86_64/Kconfig index 23db325e6..fd2eb1833 100644 --- a/arch/x86_64/Kconfig +++ b/arch/x86_64/Kconfig @@ -428,23 +428,6 @@ config UID16 depends on IA32_EMULATION default y -config KEXEC - bool "kexec system call (EXPERIMENTAL)" - depends on EXPERIMENTAL - help - kexec is a system call that implements the ability to shutdown your - current kernel, and to start another kernel. It is like a reboot - but it is indepedent of the system firmware. And like a reboot - you can start any kernel with it, not just Linux. - - The name comes from the similiarity to the exec system call. - - It is an ongoing process to be certain the hardware in a machine - is properly shutdown, so do not be surprised if this code does not - initially work for you. It may help to enable device hotplugging - support. As of this writing the exact hardware interface is - strongly in flux, so no good recommendation can be made. 
- endmenu source drivers/Kconfig diff --git a/arch/x86_64/kernel/Makefile b/arch/x86_64/kernel/Makefile index e0405ab07..2c0f3af82 100644 --- a/arch/x86_64/kernel/Makefile +++ b/arch/x86_64/kernel/Makefile @@ -19,7 +19,6 @@ obj-$(CONFIG_SMP) += smp.o smpboot.o trampoline.o obj-$(CONFIG_X86_LOCAL_APIC) += apic.o nmi.o obj-$(CONFIG_X86_IO_APIC) += io_apic.o mpparse.o \ genapic.o genapic_cluster.o genapic_flat.o -obj-$(CONFIG_KEXEC) += machine_kexec.o relocate_kernel.o obj-$(CONFIG_PM) += suspend.o obj-$(CONFIG_SOFTWARE_SUSPEND) += suspend_asm.o obj-$(CONFIG_CPU_FREQ) += cpufreq/ diff --git a/arch/x86_64/kernel/apic.c b/arch/x86_64/kernel/apic.c index 5aac099db..cd37a0aa6 100644 --- a/arch/x86_64/kernel/apic.c +++ b/arch/x86_64/kernel/apic.c @@ -145,36 +145,6 @@ void disconnect_bsp_APIC(void) outb(0x70, 0x22); outb(0x00, 0x23); } - else { - /* Go back to Virtual Wire compatibility mode */ - unsigned long value; - - /* For the spurious interrupt use vector F, and enable it */ - value = apic_read(APIC_SPIV); - value &= ~APIC_VECTOR_MASK; - value |= APIC_SPIV_APIC_ENABLED; - value |= 0xf; - apic_write_around(APIC_SPIV, value); - - /* For LVT0 make it edge triggered, active high, external and enabled */ - value = apic_read(APIC_LVT0); - value &= ~(APIC_MODE_MASK | APIC_SEND_PENDING | - APIC_INPUT_POLARITY | APIC_LVT_REMOTE_IRR | - APIC_LVT_LEVEL_TRIGGER | APIC_LVT_MASKED ); - value |= APIC_LVT_REMOTE_IRR | APIC_SEND_PENDING; - value = SET_APIC_DELIVERY_MODE(value, APIC_MODE_EXINT); - apic_write_around(APIC_LVT0, value); - - /* For LVT1 make it edge triggered, active high, nmi and enabled */ - value = apic_read(APIC_LVT1); - value &= ~( - APIC_MODE_MASK | APIC_SEND_PENDING | - APIC_INPUT_POLARITY | APIC_LVT_REMOTE_IRR | - APIC_LVT_LEVEL_TRIGGER | APIC_LVT_MASKED); - value |= APIC_LVT_REMOTE_IRR | APIC_SEND_PENDING; - value = SET_APIC_DELIVERY_MODE(value, APIC_MODE_NMI); - apic_write_around(APIC_LVT1, value); - } } void disable_local_APIC(void) diff --git 
a/arch/x86_64/kernel/e820.c b/arch/x86_64/kernel/e820.c index 72f7a7f93..5b65992b8 100644 --- a/arch/x86_64/kernel/e820.c +++ b/arch/x86_64/kernel/e820.c @@ -185,6 +185,8 @@ void __init e820_reserve_resources(void) int i; for (i = 0; i < e820.nr_map; i++) { struct resource *res; + if (e820.map[i].addr + e820.map[i].size > 0x100000000ULL) + continue; res = alloc_bootmem_low(sizeof(struct resource)); switch (e820.map[i].type) { case E820_RAM: res->name = "System RAM"; break; diff --git a/arch/x86_64/kernel/i8259.c b/arch/x86_64/kernel/i8259.c index f8c10a6a0..7929a2e53 100644 --- a/arch/x86_64/kernel/i8259.c +++ b/arch/x86_64/kernel/i8259.c @@ -415,24 +415,10 @@ static int i8259A_suspend(struct sys_device *dev, u32 state) return 0; } - - -static int i8259A_shutdown(struct sys_device *dev) -{ - /* Put the i8259A into a quiescent state that - * the kernel initialization code can get it - * out of. - */ - outb(0xff, 0x21); /* mask all of 8259A-1 */ - outb(0xff, 0xA1); /* mask all of 8259A-1 */ - return 0; -} - static struct sysdev_class i8259_sysdev_class = { set_kset_name("i8259"), .suspend = i8259A_suspend, .resume = i8259A_resume, - .shutdown = i8259A_shutdown, }; static struct sys_device device_i8259A = { diff --git a/arch/x86_64/kernel/io_apic.c b/arch/x86_64/kernel/io_apic.c index 28c1cb3b5..c22a8f045 100644 --- a/arch/x86_64/kernel/io_apic.c +++ b/arch/x86_64/kernel/io_apic.c @@ -327,7 +327,7 @@ static int find_irq_entry(int apic, int pin, int type) /* * Find the pin to which IRQ[irq] (ISA) is connected */ -static int find_isa_irq_pin(int irq, int type) +static int __init find_isa_irq_pin(int irq, int type) { int i; @@ -1125,43 +1125,11 @@ static void __init enable_IO_APIC(void) */ void disable_IO_APIC(void) { - int pin; /* * Clear the IO-APIC before rebooting: */ clear_IO_APIC(); - /* - * If the i82559 is routed through an IOAPIC - * Put that IOAPIC in virtual wire mode - * so legacy interrups can be delivered. 
- */ - pin = find_isa_irq_pin(0, mp_ExtINT); - if (pin != -1) { - struct IO_APIC_route_entry entry; - unsigned long flags; - - memset(&entry, 0, sizeof(entry)); - entry.mask = 0; /* Enabled */ - entry.trigger = 0; /* Edge */ - entry.irr = 0; - entry.polarity = 0; /* High */ - entry.delivery_status = 0; - entry.dest_mode = 0; /* Physical */ - entry.delivery_mode = 7; /* ExtInt */ - entry.vector = 0; - entry.dest.physical.physical_dest = 0; - - - /* - * Add it to the IO-APIC irq-routing table: - */ - spin_lock_irqsave(&ioapic_lock, flags); - io_apic_write(0, 0x11+2*pin, *(((int *)&entry)+1)); - io_apic_write(0, 0x10+2*pin, *(((int *)&entry)+0)); - spin_unlock_irqrestore(&ioapic_lock, flags); - } - disconnect_bsp_APIC(); } diff --git a/arch/x86_64/kernel/machine_kexec.c b/arch/x86_64/kernel/machine_kexec.c deleted file mode 100644 index 974d2352b..000000000 --- a/arch/x86_64/kernel/machine_kexec.c +++ /dev/null @@ -1,246 +0,0 @@ -/* - * machine_kexec.c - handle transition of Linux booting another kernel - * Copyright (C) 2002-2004 Eric Biederman <ebiederm@xmission.com> - * - * This source code is licensed under the GNU General Public License, - * Version 2. See the file COPYING for more details. 
- */ - -#include <linux/mm.h> -#include <linux/kexec.h> -#include <linux/delay.h> -#include <linux/string.h> -#include <linux/reboot.h> -#include <asm/pda.h> -#include <asm/pgtable.h> -#include <asm/pgalloc.h> -#include <asm/tlbflush.h> -#include <asm/mmu_context.h> -#include <asm/io.h> -#include <asm/apic.h> -#include <asm/cpufeature.h> -#include <asm/hw_irq.h> - -#define LEVEL0_SIZE (1UL << 12UL) -#define LEVEL1_SIZE (1UL << 21UL) -#define LEVEL2_SIZE (1UL << 30UL) -#define LEVEL3_SIZE (1UL << 39UL) -#define LEVEL4_SIZE (1UL << 48UL) - -#define L0_ATTR (_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | _PAGE_DIRTY) -#define L1_ATTR (_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | _PAGE_DIRTY | _PAGE_PSE) -#define L2_ATTR (_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | _PAGE_DIRTY) -#define L3_ATTR (_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | _PAGE_DIRTY) - -static void init_level2_page( - uint64_t *level2p, unsigned long addr) -{ - unsigned long end_addr; - addr &= PAGE_MASK; - end_addr = addr + LEVEL2_SIZE; - while(addr < end_addr) { - *(level2p++) = addr | L1_ATTR; - addr += LEVEL1_SIZE; - } -} - -static int init_level3_page(struct kimage *image, - uint64_t *level3p, unsigned long addr, unsigned long last_addr) -{ - unsigned long end_addr; - int result; - result = 0; - addr &= PAGE_MASK; - end_addr = addr + LEVEL3_SIZE; - while((addr < last_addr) && (addr < end_addr)) { - struct page *page; - uint64_t *level2p; - page = kimage_alloc_control_pages(image, 0); - if (!page) { - result = -ENOMEM; - goto out; - } - level2p = (uint64_t *)page_address(page); - init_level2_page(level2p, addr); - *(level3p++) = __pa(level2p) | L2_ATTR; - addr += LEVEL2_SIZE; - } - /* clear the unused entries */ - while(addr < end_addr) { - *(level3p++) = 0; - addr += LEVEL2_SIZE; - } -out: - return result; -} - - -static int init_level4_page(struct kimage *image, - uint64_t *level4p, unsigned long addr, unsigned long last_addr) -{ - unsigned long end_addr; - int result; - result = 0; - addr &= 
PAGE_MASK; - end_addr = addr + LEVEL4_SIZE; - while((addr < last_addr) && (addr < end_addr)) { - struct page *page; - uint64_t *level3p; - page = kimage_alloc_control_pages(image, 0); - if (!page) { - result = -ENOMEM; - goto out; - } - level3p = (uint64_t *)page_address(page); - result = init_level3_page(image, level3p, addr, last_addr); - if (result) { - goto out; - } - *(level4p++) = __pa(level3p) | L3_ATTR; - addr += LEVEL3_SIZE; - } - /* clear the unused entries */ - while(addr < end_addr) { - *(level4p++) = 0; - addr += LEVEL3_SIZE; - } - out: - return result; -} - - -static int init_pgtable(struct kimage *image, unsigned long start_pgtable) -{ - uint64_t *level4p; - level4p = (uint64_t *)__va(start_pgtable); - return init_level4_page(image, level4p, 0, end_pfn << PAGE_SHIFT); -} - -static void set_idt(void *newidt, __u16 limit) -{ - unsigned char curidt[10]; - - /* x86-64 supports unaliged loads & stores */ - (*(__u16 *)(curidt)) = limit; - (*(__u64 *)(curidt +2)) = (unsigned long)(newidt); - - __asm__ __volatile__ ( - "lidt %0\n" - : "=m" (curidt) - ); -}; - - -static void set_gdt(void *newgdt, __u16 limit) -{ - unsigned char curgdt[10]; - - /* x86-64 supports unaligned loads & stores */ - (*(__u16 *)(curgdt)) = limit; - (*(__u64 *)(curgdt +2)) = (unsigned long)(newgdt); - - __asm__ __volatile__ ( - "lgdt %0\n" - : "=m" (curgdt) - ); -}; - -static void load_segments(void) -{ - __asm__ __volatile__ ( - "\tmovl $"STR(__KERNEL_DS)",%eax\n" - "\tmovl %eax,%ds\n" - "\tmovl %eax,%es\n" - "\tmovl %eax,%ss\n" - "\tmovl %eax,%fs\n" - "\tmovl %eax,%gs\n" - ); -#undef STR -#undef __STR -} - -typedef void (*relocate_new_kernel_t)( - unsigned long indirection_page, unsigned long control_code_buffer, - unsigned long start_address, unsigned long pgtable); - -const extern unsigned char relocate_new_kernel[]; -extern void relocate_new_kernel_end(void); -const extern unsigned long relocate_new_kernel_size; - -int machine_kexec_prepare(struct kimage *image) -{ - unsigned long 
start_pgtable, control_code_buffer; - int result; - - /* Calculate the offsets */ - start_pgtable = page_to_pfn(image->control_code_page) << PAGE_SHIFT; - control_code_buffer = start_pgtable + 4096UL; - - /* Setup the identity mapped 64bit page table */ - result = init_pgtable(image, start_pgtable); - if (result) { - return result; - } - - /* Place the code in the reboot code buffer */ - memcpy(__va(control_code_buffer), relocate_new_kernel, relocate_new_kernel_size); - - return 0; -} - -void machine_kexec_cleanup(struct kimage *image) -{ - return; -} - -/* - * Do not allocate memory (or fail in any way) in machine_kexec(). - * We are past the point of no return, committed to rebooting now. - */ -void machine_kexec(struct kimage *image) -{ - unsigned long indirection_page; - unsigned long control_code_buffer; - unsigned long start_pgtable; - relocate_new_kernel_t rnk; - - /* Interrupts aren't acceptable while we reboot */ - local_irq_disable(); - - /* Calculate the offsets */ - indirection_page = image->head & PAGE_MASK; - start_pgtable = page_to_pfn(image->control_code_page) << PAGE_SHIFT; - control_code_buffer = start_pgtable + 4096UL; - - /* Set the low half of the page table to my identity mapped - * page table for kexec. Leave the high half pointing at the - * kernel pages. Don't bother to flush the global pages - * as that will happen when I fully switch to my identity mapped - * page table anyway. - */ -// memcpy(current->active_mm->pml4, __va(start_pgtable), PAGE_SIZE/2); - __flush_tlb(); - - - /* The segment registers are funny things, they are - * automatically loaded from a table, in memory wherever you - * set them to a specific selector, but this table is never - * accessed again unless you set the segment to a different selector. - * - * The more common model are caches where the behide - * the scenes work is done, but is also dropped at arbitrary - * times. 
- * - * I take advantage of this here by force loading the - * segments, before I zap the gdt with an invalid value. - */ - load_segments(); - /* The gdt & idt are now invalid. - * If you want to load them you must set up your own idt & gdt. - */ - set_gdt(phys_to_virt(0),0); - set_idt(phys_to_virt(0),0); - /* now call it */ - rnk = (relocate_new_kernel_t) control_code_buffer; - (*rnk)(indirection_page, control_code_buffer, image->start, start_pgtable); -} diff --git a/arch/x86_64/kernel/reboot.c b/arch/x86_64/kernel/reboot.c index ce789144f..3dac33924 100644 --- a/arch/x86_64/kernel/reboot.c +++ b/arch/x86_64/kernel/reboot.c @@ -91,54 +91,47 @@ static void reboot_warm(void) [target] "b" (WARMBOOT_TRAMP)); } -static inline void kb_wait(void) -{ - int i; - - for (i=0; i<0x10000; i++) - if ((inb_p(0x64) & 0x02) == 0) - break; -} - -void machine_shutdown(void) -{ - /* Stop the cpus and apics */ #ifdef CONFIG_SMP - int reboot_cpu_id; - - /* The boot cpu is always logical cpu 0 */ - reboot_cpu_id = 0; - - /* Make certain the cpu I'm about to reboot on is online */ - if (!cpu_isset(reboot_cpu_id, cpu_online_map)) { - reboot_cpu_id = smp_processor_id(); +static void smp_halt(void) +{ + int cpuid = safe_smp_processor_id(); + static int first_entry = 1; + + if (first_entry) { + first_entry = 0; + smp_call_function((void *)machine_restart, NULL, 1, 0); + } + + smp_stop_cpu(); + + /* AP calling this. Just halt */ + if (cpuid != boot_cpu_id) { + for (;;) + asm("hlt"); } - /* Make certain I only run on the appropriate processor */ - set_cpus_allowed(current, cpumask_of_cpu(reboot_cpu_id)); - - /* O.K Now that I'm on the appropriate processor, - * stop all of the others. 
- */ - smp_send_stop(); -#endif - - local_irq_disable(); - -#ifndef CONFIG_SMP - disable_local_APIC(); + /* Wait for all other CPUs to have run smp_stop_cpu */ + while (!cpus_empty(cpu_online_map)) + rep_nop(); +} #endif - disable_IO_APIC(); +static inline void kb_wait(void) +{ + int i; - local_irq_enable(); + for (i=0; i<0x10000; i++) + if ((inb_p(0x64) & 0x02) == 0) + break; } void machine_restart(char * __unused) { int i; - machine_shutdown(); +#ifdef CONFIG_SMP + smp_halt(); +#endif local_irq_disable(); diff --git a/arch/x86_64/kernel/relocate_kernel.S b/arch/x86_64/kernel/relocate_kernel.S deleted file mode 100644 index c944e5958..000000000 --- a/arch/x86_64/kernel/relocate_kernel.S +++ /dev/null @@ -1,141 +0,0 @@ -/* - * relocate_kernel.S - put the kernel image in place to boot - * Copyright (C) 2002-2004 Eric Biederman <ebiederm@xmission.com> - * - * This source code is licensed under the GNU General Public License, - * Version 2. See the file COPYING for more details. - */ - -#include <linux/linkage.h> - - /* - * Must be relocatable PIC code callable as a C function, that once - * it starts can not use the previous processes stack. - */ - .globl relocate_new_kernel - .code64 -relocate_new_kernel: - /* %rdi indirection_page - * %rsi reboot_code_buffer - * %rdx start address - * %rcx page_table - * %r8 arg5 - * %r9 arg6 - */ - - /* zero out flags, and disable interrupts */ - pushq $0 - popfq - - /* set a new stack at the bottom of our page... */ - lea 4096(%rsi), %rsp - - /* store the parameters back on the stack */ - pushq %rdx /* store the start address */ - - /* Set cr0 to a known state: - * 31 1 == Paging enabled - * 18 0 == Alignment check disabled - * 16 0 == Write protect disabled - * 3 0 == No task switch - * 2 0 == Don't do FP software emulation. 
- * 0 1 == Proctected mode enabled - */ - movq %cr0, %rax - andq $~((1<<18)|(1<<16)|(1<<3)|(1<<2)), %rax - orl $((1<<31)|(1<<0)), %eax - movq %rax, %cr0 - - /* Set cr4 to a known state: - * 10 0 == xmm exceptions disabled - * 9 0 == xmm registers instructions disabled - * 8 0 == performance monitoring counter disabled - * 7 0 == page global disabled - * 6 0 == machine check exceptions disabled - * 5 1 == physical address extension enabled - * 4 0 == page size extensions disabled - * 3 0 == Debug extensions disabled - * 2 0 == Time stamp disable (disabled) - * 1 0 == Protected mode virtual interrupts disabled - * 0 0 == VME disabled - */ - - movq $((1<<5)), %rax - movq %rax, %cr4 - - jmp 1f -1: - - /* Switch to the identity mapped page tables, - * and flush the TLB. - */ - movq %rcx, %cr3 - - /* Do the copies */ - movq %rdi, %rbx /* Put the indirection page in %rbx */ - xorq %rdi, %rdi - xorq %rsi, %rsi - -0: /* top, read another word for the indirection page */ - - movq (%rbx), %rcx - addq $8, %rbx - testq $0x1, %rcx /* is it a destination page? */ - jz 1f - movq %rcx, %rdi - andq $0xfffffffffffff000, %rdi - jmp 0b -1: - testq $0x2, %rcx /* is it an indirection page? */ - jz 1f - movq %rcx, %rbx - andq $0xfffffffffffff000, %rbx - jmp 0b -1: - testq $0x4, %rcx /* is it the done indicator? */ - jz 1f - jmp 2f -1: - testq $0x8, %rcx /* is it the source indicator? */ - jz 0b /* Ignore it otherwise */ - movq %rcx, %rsi /* For ever source page do a copy */ - andq $0xfffffffffffff000, %rsi - - movq $512, %rcx - rep ; movsq - jmp 0b -2: - - /* To be certain of avoiding problems with self-modifying code - * I need to execute a serializing instruction here. - * So I flush the TLB by reloading %cr3 here, it's handy, - * and not processor dependent. 
- */ - movq %cr3, %rax - movq %rax, %cr3 - - /* set all of the registers to known values */ - /* leave %rsp alone */ - - xorq %rax, %rax - xorq %rbx, %rbx - xorq %rcx, %rcx - xorq %rdx, %rdx - xorq %rsi, %rsi - xorq %rdi, %rdi - xorq %rbp, %rbp - xorq %r8, %r8 - xorq %r9, %r9 - xorq %r10, %r9 - xorq %r11, %r11 - xorq %r12, %r12 - xorq %r13, %r13 - xorq %r14, %r14 - xorq %r15, %r15 - - ret -relocate_new_kernel_end: - - .globl relocate_new_kernel_size -relocate_new_kernel_size: - .quad relocate_new_kernel_end - relocate_new_kernel diff --git a/configs/kernel-2.6.10-i686-planetlab.config b/configs/kernel-2.6.10-i686-planetlab.config index bd63671f1..b7dc0a7d6 100644 --- a/configs/kernel-2.6.10-i686-planetlab.config +++ b/configs/kernel-2.6.10-i686-planetlab.config @@ -1,7 +1,7 @@ # # Automatically generated make config: don't edit -# Linux kernel version: 2.6.10-1.14_FC2.1.planetlab.2005.04.14 -# Sat May 7 01:45:01 2005 +# Linux kernel version: 2.6.10-1.14_FC2.1.planetlab.2005.03.31 +# Thu Mar 31 11:50:25 2005 # CONFIG_X86=y CONFIG_MMU=y @@ -33,10 +33,9 @@ CONFIG_CKRM=y CONFIG_RCFS_FS=y CONFIG_CKRM_TYPE_TASKCLASS=y CONFIG_CKRM_RES_NULL=m -# CONFIG_CKRM_RES_MEM is not set +CONFIG_CKRM_RES_MEM=y # CONFIG_CKRM_TYPE_SOCKETCLASS is not set CONFIG_CKRM_RES_NUMTASKS=y -# CONFIG_CKRM_RES_NUMTASKS_FORKRATE is not set CONFIG_CKRM_CPU_SCHEDULE=y # CONFIG_CKRM_RES_BLKIO is not set CONFIG_CKRM_CPU_SCHEDULE_AT_BOOT=y @@ -156,9 +155,6 @@ CONFIG_HIGHPTE=y CONFIG_MTRR=y # CONFIG_EFI is not set CONFIG_REGPARM=y -CONFIG_KERN_PHYS_OFFSET=1 -CONFIG_KEXEC=y -# CONFIG_CRASH_DUMP is not set # # Power management options (ACPI, APM) @@ -702,7 +698,7 @@ CONFIG_MD_RAID5=m CONFIG_MD_RAID6=m CONFIG_MD_MULTIPATH=m CONFIG_MD_FAULTY=m -CONFIG_BLK_DEV_DM=y +CONFIG_BLK_DEV_DM=m CONFIG_DM_CRYPT=m CONFIG_DM_SNAPSHOT=m CONFIG_DM_MIRROR=m @@ -791,7 +787,7 @@ CONFIG_INET_IPCOMP=m CONFIG_INET_TUNNEL=m # CONFIG_ACCEPT_QUEUES is not set CONFIG_IP_TCPDIAG=m -# CONFIG_IP_TCPDIAG_IPV6 is not set 
+CONFIG_IP_TCPDIAG_IPV6=y # # IP: Virtual Server Configuration @@ -827,7 +823,13 @@ CONFIG_IP_VS_NQ=m # CONFIG_IP_VS_FTP=m CONFIG_ICMP_IPOD=y -# CONFIG_IPV6 is not set +CONFIG_IPV6=m +CONFIG_IPV6_PRIVACY=y +CONFIG_INET6_AH=m +CONFIG_INET6_ESP=m +CONFIG_INET6_IPCOMP=m +CONFIG_INET6_TUNNEL=m +CONFIG_IPV6_TUNNEL=m CONFIG_NETFILTER=y # CONFIG_NETFILTER_DEBUG is not set CONFIG_BRIDGE_NETFILTER=y @@ -904,6 +906,31 @@ CONFIG_IP_NF_ARP_MANGLE=m # CONFIG_IP_NF_COMPAT_IPFWADM is not set # CONFIG_IP_NF_CT_PROTO_GRE is not set +# +# IPv6: Netfilter Configuration +# +# CONFIG_IP6_NF_QUEUE is not set +CONFIG_IP6_NF_IPTABLES=m +CONFIG_IP6_NF_MATCH_LIMIT=m +CONFIG_IP6_NF_MATCH_MAC=m +CONFIG_IP6_NF_MATCH_RT=m +CONFIG_IP6_NF_MATCH_OPTS=m +CONFIG_IP6_NF_MATCH_FRAG=m +CONFIG_IP6_NF_MATCH_HL=m +CONFIG_IP6_NF_MATCH_MULTIPORT=m +CONFIG_IP6_NF_MATCH_OWNER=m +CONFIG_IP6_NF_MATCH_MARK=m +CONFIG_IP6_NF_MATCH_IPV6HEADER=m +CONFIG_IP6_NF_MATCH_AHESP=m +CONFIG_IP6_NF_MATCH_LENGTH=m +CONFIG_IP6_NF_MATCH_EUI64=m +CONFIG_IP6_NF_MATCH_PHYSDEV=m +CONFIG_IP6_NF_FILTER=m +CONFIG_IP6_NF_TARGET_LOG=m +CONFIG_IP6_NF_MANGLE=m +CONFIG_IP6_NF_TARGET_MARK=m +CONFIG_IP6_NF_RAW=m + # # Bridge: Netfilter Configuration # @@ -949,7 +976,7 @@ CONFIG_ATM_BR2684=m CONFIG_BRIDGE=m CONFIG_VLAN_8021Q=m # CONFIG_DECNET is not set -CONFIG_LLC=m +CONFIG_LLC=y # CONFIG_LLC2 is not set CONFIG_IPX=m # CONFIG_IPX_INTERN is not set @@ -1008,9 +1035,98 @@ CONFIG_NETPOLL=y CONFIG_NETPOLL_TRAP=y CONFIG_NET_POLL_CONTROLLER=y # CONFIG_HAMRADIO is not set -# CONFIG_IRDA is not set -# CONFIG_BT is not set -# CONFIG_TUX is not set +CONFIG_IRDA=m + +# +# IrDA protocols +# +CONFIG_IRLAN=m +CONFIG_IRNET=m +CONFIG_IRCOMM=m +# CONFIG_IRDA_ULTRA is not set + +# +# IrDA options +# +CONFIG_IRDA_CACHE_LAST_LSAP=y +CONFIG_IRDA_FAST_RR=y +# CONFIG_IRDA_DEBUG is not set + +# +# Infrared-port device drivers +# + +# +# SIR device drivers +# +CONFIG_IRTTY_SIR=m + +# +# Dongle support +# +CONFIG_DONGLE=y +CONFIG_ESI_DONGLE=m +CONFIG_ACTISYS_DONGLE=m 
+CONFIG_TEKRAM_DONGLE=m +CONFIG_LITELINK_DONGLE=m +CONFIG_MA600_DONGLE=m +CONFIG_GIRBIL_DONGLE=m +CONFIG_MCP2120_DONGLE=m +CONFIG_OLD_BELKIN_DONGLE=m +CONFIG_ACT200L_DONGLE=m + +# +# Old SIR device drivers +# +CONFIG_IRPORT_SIR=m + +# +# Old Serial dongle support +# +# CONFIG_DONGLE_OLD is not set + +# +# FIR device drivers +# +CONFIG_USB_IRDA=m +CONFIG_SIGMATEL_FIR=m +CONFIG_TOSHIBA_FIR=m +CONFIG_VLSI_FIR=m +CONFIG_BT=m +CONFIG_BT_L2CAP=m +CONFIG_BT_SCO=m +CONFIG_BT_RFCOMM=m +CONFIG_BT_RFCOMM_TTY=y +CONFIG_BT_BNEP=m +CONFIG_BT_BNEP_MC_FILTER=y +CONFIG_BT_BNEP_PROTO_FILTER=y +CONFIG_BT_CMTP=m +CONFIG_BT_HIDP=m + +# +# Bluetooth device drivers +# +CONFIG_BT_HCIUSB=m +CONFIG_BT_HCIUSB_SCO=y +CONFIG_BT_HCIUART=m +CONFIG_BT_HCIUART_H4=y +CONFIG_BT_HCIUART_BCSP=y +CONFIG_BT_HCIUART_BCSP_TXCRC=y +CONFIG_BT_HCIBCM203X=m +CONFIG_BT_HCIBFUSB=m +CONFIG_BT_HCIDTL1=m +CONFIG_BT_HCIBT3C=m +CONFIG_BT_HCIBLUECARD=m +CONFIG_BT_HCIBTUART=m +CONFIG_BT_HCIVHCI=m +CONFIG_TUX=m + +# +# TUX options +# +CONFIG_TUX_EXTCGI=y +CONFIG_TUX_EXTENDED_LOG=y +# CONFIG_TUX_DEBUG is not set CONFIG_NETDEVICES=y CONFIG_DUMMY=m CONFIG_BONDING=m @@ -1108,7 +1224,13 @@ CONFIG_S2IO_NAPI=y # # Token Ring devices # -# CONFIG_TR is not set +CONFIG_TR=y +CONFIG_IBMOL=m +CONFIG_IBMLS=m +CONFIG_3C359=m +CONFIG_TMS380TR=m +CONFIG_TMSPCI=m +CONFIG_ABYSS=m # # Wireless LAN (non-hamradio) @@ -1174,6 +1296,7 @@ CONFIG_PCMCIA_NMCLAN=m CONFIG_PCMCIA_SMC91C92=m CONFIG_PCMCIA_XIRC2PS=m CONFIG_PCMCIA_AXNET=m +CONFIG_PCMCIA_IBMTR=m # # Wan interfaces @@ -1210,9 +1333,20 @@ CONFIG_FDDI=y # CONFIG_DEFXX is not set CONFIG_SKFP=m # CONFIG_HIPPI is not set -# CONFIG_PLIP is not set -# CONFIG_PPP is not set -# CONFIG_SLIP is not set +CONFIG_PLIP=m +CONFIG_PPP=m +CONFIG_PPP_MULTILINK=y +CONFIG_PPP_FILTER=y +CONFIG_PPP_ASYNC=m +CONFIG_PPP_SYNC_TTY=m +CONFIG_PPP_DEFLATE=m +# CONFIG_PPP_BSDCOMP is not set +CONFIG_PPPOE=m +CONFIG_PPPOATM=m +CONFIG_SLIP=m +CONFIG_SLIP_COMPRESSED=y +CONFIG_SLIP_SMART=y +# CONFIG_SLIP_MODE_SLIP6 is 
not set CONFIG_NET_FC=y # CONFIG_SHAPER is not set CONFIG_NETCONSOLE=m @@ -1886,7 +2020,95 @@ CONFIG_LOGO_LINUX_CLUT224=y # # Sound # -# CONFIG_SOUND is not set +CONFIG_SOUND=m + +# +# Advanced Linux Sound Architecture +# +CONFIG_SND=m +CONFIG_SND_TIMER=m +CONFIG_SND_PCM=m +CONFIG_SND_HWDEP=m +CONFIG_SND_RAWMIDI=m +CONFIG_SND_SEQUENCER=m +CONFIG_SND_SEQ_DUMMY=m +CONFIG_SND_OSSEMUL=y +CONFIG_SND_MIXER_OSS=m +CONFIG_SND_PCM_OSS=m +CONFIG_SND_SEQUENCER_OSS=y +CONFIG_SND_RTCTIMER=m +# CONFIG_SND_VERBOSE_PRINTK is not set +# CONFIG_SND_DEBUG is not set + +# +# Generic devices +# +CONFIG_SND_MPU401_UART=m +CONFIG_SND_OPL3_LIB=m +CONFIG_SND_VX_LIB=m +CONFIG_SND_DUMMY=m +CONFIG_SND_VIRMIDI=m +CONFIG_SND_MTPAV=m +# CONFIG_SND_SERIAL_U16550 is not set +CONFIG_SND_MPU401=m + +# +# PCI devices +# +CONFIG_SND_AC97_CODEC=m +CONFIG_SND_ALI5451=m +CONFIG_SND_ATIIXP=m +CONFIG_SND_ATIIXP_MODEM=m +CONFIG_SND_AU8810=m +CONFIG_SND_AU8820=m +CONFIG_SND_AU8830=m +CONFIG_SND_AZT3328=m +CONFIG_SND_BT87X=m +# CONFIG_SND_BT87X_OVERCLOCK is not set +CONFIG_SND_CS46XX=m +CONFIG_SND_CS46XX_NEW_DSP=y +CONFIG_SND_CS4281=m +CONFIG_SND_EMU10K1=m +CONFIG_SND_KORG1212=m +CONFIG_SND_MIXART=m +CONFIG_SND_NM256=m +CONFIG_SND_RME32=m +CONFIG_SND_RME96=m +CONFIG_SND_RME9652=m +CONFIG_SND_HDSP=m +CONFIG_SND_TRIDENT=m +CONFIG_SND_YMFPCI=m +CONFIG_SND_ALS4000=m +CONFIG_SND_CMIPCI=m +CONFIG_SND_ENS1370=m +CONFIG_SND_ENS1371=m +CONFIG_SND_ES1938=m +CONFIG_SND_ES1968=m +CONFIG_SND_MAESTRO3=m +CONFIG_SND_FM801=m +CONFIG_SND_FM801_TEA575X=m +CONFIG_SND_ICE1712=m +CONFIG_SND_ICE1724=m +CONFIG_SND_INTEL8X0=m +CONFIG_SND_INTEL8X0M=m +CONFIG_SND_SONICVIBES=m +CONFIG_SND_VIA82XX=m +CONFIG_SND_VX222=m + +# +# USB devices +# +CONFIG_SND_USB_AUDIO=m +CONFIG_SND_USB_USX2Y=m + +# +# PCMCIA devices +# + +# +# Open Sound System +# +# CONFIG_SOUND_PRIME is not set # # USB support @@ -1918,7 +2140,12 @@ CONFIG_USB_SL811_HCD=m # # USB Device Class drivers # -# CONFIG_USB_BLUETOOTH_TTY is not set +# CONFIG_USB_AUDIO is not set + 
+# +# USB Bluetooth TTY can only be used with disabled Bluetooth subsystem +# +CONFIG_USB_MIDI=m CONFIG_USB_ACM=m CONFIG_USB_PRINTER=m diff --git a/drivers/char/mem.c b/drivers/char/mem.c index aeeb1a766..e5245cb95 100644 --- a/drivers/char/mem.c +++ b/drivers/char/mem.c @@ -23,8 +23,6 @@ #include <linux/devfs_fs_kernel.h> #include <linux/ptrace.h> #include <linux/device.h> -#include <linux/highmem.h> -#include <linux/crash_dump.h> #include <asm/uaccess.h> #include <asm/io.h> @@ -229,62 +227,6 @@ static int mmap_mem(struct file * file, struct vm_area_struct * vma) return 0; } -#ifdef CONFIG_CRASH_DUMP -/* - * Read memory corresponding to the old kernel. - * If we are reading from the reserved section, which is - * actually used by the current kernel, we just return zeroes. - * Or if we are reading from the first 640k, we return from the - * backed up area. - */ -static ssize_t read_oldmem(struct file * file, char * buf, - size_t count, loff_t *ppos) -{ - unsigned long pfn; - unsigned backup_start, backup_end, relocate_start; - size_t read=0, csize; - - backup_start = CRASH_BACKUP_BASE / PAGE_SIZE; - backup_end = backup_start + (CRASH_BACKUP_SIZE / PAGE_SIZE); - relocate_start = (CRASH_BACKUP_BASE + CRASH_BACKUP_SIZE) / PAGE_SIZE; - - while(count) { - pfn = *ppos / PAGE_SIZE; - - csize = (count > PAGE_SIZE) ? 
PAGE_SIZE : count; - - /* Perform translation (see comment above) */ - if ((pfn >= backup_start) && (pfn < backup_end)) { - if (clear_user(buf, csize)) { - read = -EFAULT; - goto done; - } - - goto copy_done; - } else if (pfn < (CRASH_RELOCATE_SIZE / PAGE_SIZE)) - pfn += relocate_start; - - if (pfn > saved_max_pfn) { - read = 0; - goto done; - } - - if (copy_oldmem_page(pfn, buf, csize, 1)) { - read = -EFAULT; - goto done; - } - -copy_done: - buf += csize; - *ppos += csize; - read += csize; - count -= csize; - } -done: - return read; -} -#endif - extern long vread(char *buf, char *addr, unsigned long count); extern long vwrite(char *buf, char *addr, unsigned long count); @@ -591,7 +533,6 @@ static int open_port(struct inode * inode, struct file * filp) #define read_full read_zero #define open_mem open_port #define open_kmem open_mem -#define open_oldmem open_mem static struct file_operations mem_fops = { .llseek = memory_lseek, @@ -636,13 +577,6 @@ static struct file_operations full_fops = { .write = write_full, }; -#ifdef CONFIG_CRASH_DUMP -static struct file_operations oldmem_fops = { - .read = read_oldmem, - .open = open_oldmem, -}; -#endif - static ssize_t kmsg_write(struct file * file, const char __user * buf, size_t count, loff_t *ppos) { @@ -697,11 +631,6 @@ static int memory_open(struct inode * inode, struct file * filp) case 11: filp->f_op = &kmsg_fops; break; -#ifdef CONFIG_CRASH_DUMP - case 12: - filp->f_op = &oldmem_fops; - break; -#endif default: return -ENXIO; } @@ -730,9 +659,6 @@ static const struct { {8, "random", S_IRUGO | S_IWUSR, &random_fops}, {9, "urandom", S_IRUGO | S_IWUSR, &urandom_fops}, {11,"kmsg", S_IRUGO | S_IWUSR, &kmsg_fops}, -#ifdef CONFIG_CRASH_DUMP - {12,"oldmem", S_IRUSR | S_IWUSR | S_IRGRP, &oldmem_fops}, -#endif }; static struct class_simple *mem_class; diff --git a/drivers/dump/dump_fmt.c b/drivers/dump/dump_fmt.c index 1ab035466..afa0aed56 100644 --- a/drivers/dump/dump_fmt.c +++ b/drivers/dump/dump_fmt.c @@ -305,7 +305,7 @@ 
int dump_lcrash_add_data(unsigned long loc, unsigned long len) buf += sizeof(struct __dump_page); while (len) { - addr = kmap_atomic(page, KM_CRASHDUMP); + addr = kmap_atomic(page, KM_DUMP); size = bytes = (len > PAGE_SIZE) ? PAGE_SIZE : len; /* check for compression */ if (dump_allow_compress(page, bytes)) { @@ -321,7 +321,7 @@ int dump_lcrash_add_data(unsigned long loc, unsigned long len) size = bytes; } /* memset(buf, 'A', size); temporary: testing only !! */ - kunmap_atomic(addr, KM_CRASHDUMP); + kunmap_atomic(addr, KM_DUMP); dp->dp_size += size; buf += size; len -= bytes; diff --git a/drivers/dump/dump_i386.c b/drivers/dump/dump_i386.c index 991db8eb1..5a01e0f6f 100644 --- a/drivers/dump/dump_i386.c +++ b/drivers/dump/dump_i386.c @@ -314,7 +314,7 @@ __dump_cleanup(void) free_dha_stack(); } -extern int page_is_ram(unsigned long); +extern int pfn_is_ram(unsigned long); /* * Name: __dump_page_valid() @@ -326,7 +326,7 @@ __dump_page_valid(unsigned long index) if (!pfn_valid(index)) return 0; - return page_is_ram(index); + return pfn_is_ram(index); } /* diff --git a/drivers/dump/dump_memdev.c b/drivers/dump/dump_memdev.c index b2bb64255..1cd700d31 100644 --- a/drivers/dump/dump_memdev.c +++ b/drivers/dump/dump_memdev.c @@ -146,7 +146,7 @@ void dump_mark_map(struct dump_memdev *dev) pr_debug("indirect map[%d] = 0x%lx\n", i, map1[i]); page = pfn_to_page(map1[i]); set_page_count(page, 1); - map2 = kmap_atomic(page, KM_CRASHDUMP); + map2 = kmap_atomic(page, KM_DUMP); for (j = 0 ; (j < DUMP_MAP_SZ) && map2[j] && (off + j < last); j++) { pr_debug("\t map[%d][%d] = 0x%lx\n", i, j, @@ -198,7 +198,7 @@ struct page *dump_mem_lookup(struct dump_memdev *dump_mdev, unsigned long loc) } if (page) - map = kmap_atomic(page, KM_CRASHDUMP); + map = kmap_atomic(page, KM_DUMP); else return NULL; @@ -213,7 +213,7 @@ struct page *dump_mem_lookup(struct dump_memdev *dump_mdev, unsigned long loc) } else { page = NULL; } - kunmap_atomic(map, KM_CRASHDUMP); + kunmap_atomic(map, KM_DUMP); 
return page; } @@ -248,10 +248,10 @@ struct page *dump_mem_next_page(struct dump_memdev *dev) }; if (*dev->curr_map) { - map = kmap_atomic(pfn_to_page(*dev->curr_map), KM_CRASHDUMP); + map = kmap_atomic(pfn_to_page(*dev->curr_map), KM_DUMP); if (map[i]) page = pfn_to_page(map[i]); - kunmap_atomic(map, KM_CRASHDUMP); + kunmap_atomic(map, KM_DUMP); dev->ddev.curr_offset += PAGE_SIZE; }; @@ -308,9 +308,9 @@ int dump_mem_add_space(struct dump_memdev *dev, struct page *page) /* add data space */ i = dev->curr_map_offset; map_page = pfn_to_page(*dev->curr_map); - map = (unsigned long *)kmap_atomic(map_page, KM_CRASHDUMP); + map = (unsigned long *)kmap_atomic(map_page, KM_DUMP); map[i] = page_to_pfn(page); - kunmap_atomic(map, KM_CRASHDUMP); + kunmap_atomic(map, KM_DUMP); dev->curr_map_offset = ++i; dev->last_offset += PAGE_SIZE; if (i >= DUMP_MAP_SZ) { @@ -572,10 +572,10 @@ int dump_mem_write(struct dump_dev *dev, void *buf, unsigned long len) page = dump_mem_lookup(dump_mdev, dev->curr_offset >> PAGE_SHIFT); for (n = len; (n > 0) && page; n -= PAGE_SIZE, buf += PAGE_SIZE ) { - addr = kmap_atomic(page, KM_CRASHDUMP); + addr = kmap_atomic(page, KM_DUMP); /* memset(addr, 'x', PAGE_SIZE); */ memcpy(addr, buf, PAGE_SIZE); - kunmap_atomic(addr, KM_CRASHDUMP); + kunmap_atomic(addr, KM_DUMP); /* dev->curr_offset += PAGE_SIZE; */ page = dump_mem_next_page(dump_mdev); } diff --git a/drivers/dump/dump_overlay.c b/drivers/dump/dump_overlay.c index a23f1b2b5..8e10b7827 100644 --- a/drivers/dump/dump_overlay.c +++ b/drivers/dump/dump_overlay.c @@ -481,7 +481,7 @@ int dump_saved_data_iterator(int pass, int (*action)(unsigned long, else count++; /* clear the contents of page */ - /* fixme: consider using KM_CRASHDUMP instead */ + /* fixme: consider using KM_DUMP instead */ clear_highpage(page); } @@ -679,10 +679,10 @@ int dump_copy_pages(void *buf, struct page *page, unsigned long sz) void *addr; while (len < sz) { - addr = kmap_atomic(page, KM_CRASHDUMP); + addr = kmap_atomic(page, 
KM_DUMP); bytes = (sz > len + PAGE_SIZE) ? PAGE_SIZE : sz - len; memcpy(buf, addr, bytes); - kunmap_atomic(addr, KM_CRASHDUMP); + kunmap_atomic(addr, KM_DUMP); buf += bytes; len += bytes; page++; diff --git a/drivers/dump/dump_setup.c b/drivers/dump/dump_setup.c index 338f7235d..668b2d052 100644 --- a/drivers/dump/dump_setup.c +++ b/drivers/dump/dump_setup.c @@ -740,7 +740,9 @@ static inline void dump_sysrq_register(void) { #ifdef CONFIG_MAGIC_SYSRQ - register_sysrq_key(DUMP_SYSRQ_KEY, &sysrq_crashdump_op); + __sysrq_lock_table(); + __sysrq_put_key_op(DUMP_SYSRQ_KEY, &sysrq_crashdump_op); + __sysrq_unlock_table(); #endif } @@ -748,7 +750,10 @@ static inline void dump_sysrq_unregister(void) { #ifdef CONFIG_MAGIC_SYSRQ - unregister_sysrq_key(DUMP_SYSRQ_KEY, &sysrq_crashdump_op); + __sysrq_lock_table(); + if (__sysrq_get_key_op(DUMP_SYSRQ_KEY) == &sysrq_crashdump_op) + __sysrq_put_key_op(DUMP_SYSRQ_KEY, NULL); + __sysrq_unlock_table(); #endif } diff --git a/fs/aio.c b/fs/aio.c index 6b523da1e..7a9c7a12a 100644 --- a/fs/aio.c +++ b/fs/aio.c @@ -564,7 +564,7 @@ struct kioctx *lookup_ioctx(unsigned long ctx_id) * (Note: this routine is intended to be called only * from a kernel thread context) */ -void use_mm(struct mm_struct *mm) +static void use_mm(struct mm_struct *mm) { struct mm_struct *active_mm; struct task_struct *tsk = current; diff --git a/fs/exec.c b/fs/exec.c index 95ae49ba1..5f7f09222 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -564,7 +564,7 @@ static int exec_mmap(struct mm_struct *mm) activate_mm(active_mm, mm); task_unlock(tsk); arch_pick_mmap_layout(mm); - ckrm_task_change_mm(tsk, old_mm, mm); + ckrm_task_mm_change(tsk, old_mm, mm); if (old_mm) { if (active_mm != old_mm) BUG(); mmput(old_mm); diff --git a/fs/ioctl.c b/fs/ioctl.c index 6af7a74c8..19e902dc3 100644 --- a/fs/ioctl.c +++ b/fs/ioctl.c @@ -174,19 +174,6 @@ asmlinkage long sys_ioctl(unsigned int fd, unsigned int cmd, unsigned long arg) error = vx_proc_ioctl(filp->f_dentry->d_inode, filp, cmd, 
arg); break; #endif - case FIOC_SETIATTR: - case FIOC_GETIATTR: - /* - * Verify that this filp is a file object, - * not (say) a socket. - */ - error = -ENOTTY; - if (S_ISREG(filp->f_dentry->d_inode->i_mode) || - S_ISDIR(filp->f_dentry->d_inode->i_mode)) - error = vc_iattr_ioctl(filp->f_dentry, - cmd, arg); - break; - default: error = -ENOTTY; if (S_ISREG(filp->f_dentry->d_inode->i_mode)) diff --git a/fs/posix_acl.c b/fs/posix_acl.c index 9c676901a..97fbb8619 100644 --- a/fs/posix_acl.c +++ b/fs/posix_acl.c @@ -215,10 +215,6 @@ posix_acl_permission(struct inode *inode, const struct posix_acl *acl, int want) const struct posix_acl_entry *pa, *pe, *mask_obj; int found = 0; - /* Prevent vservers from escaping chroot() barriers */ - if (IS_BARRIER(inode) && !vx_check(0, VX_ADMIN)) - return -EACCES; - FOREACH_ACL_ENTRY(pa, acl, pe) { switch(pa->e_tag) { case ACL_USER_OBJ: diff --git a/fs/proc/Makefile b/fs/proc/Makefile index b86869896..abdd91d9d 100644 --- a/fs/proc/Makefile +++ b/fs/proc/Makefile @@ -11,5 +11,4 @@ proc-y += inode.o root.o base.o generic.o array.o \ kmsg.o proc_tty.o proc_misc.o proc-$(CONFIG_PROC_KCORE) += kcore.o -proc-$(CONFIG_CRASH_DUMP) += vmcore.o proc-$(CONFIG_PROC_DEVICETREE) += proc_devtree.o diff --git a/fs/proc/kcore.c b/fs/proc/kcore.c index 379804c57..ee1b56bfb 100644 --- a/fs/proc/kcore.c +++ b/fs/proc/kcore.c @@ -114,7 +114,7 @@ static size_t get_kcore_size(int *nphdr, size_t *elf_buflen) /* * determine size of ELF note */ -int notesize(struct memelfnote *en) +static int notesize(struct memelfnote *en) { int sz; @@ -129,7 +129,7 @@ int notesize(struct memelfnote *en) /* * store a note in the header buffer */ -char *storenote(struct memelfnote *men, char *bufp) +static char *storenote(struct memelfnote *men, char *bufp) { struct elf_note en; @@ -156,7 +156,7 @@ char *storenote(struct memelfnote *men, char *bufp) * store an ELF coredump header in the supplied buffer * nphdr is the number of elf_phdr to insert */ -void 
elf_kcore_store_hdr(char *bufp, int nphdr, int dataoff, struct kcore_list *clist) +static void elf_kcore_store_hdr(char *bufp, int nphdr, int dataoff) { struct elf_prstatus prstatus; /* NT_PRSTATUS */ struct elf_prpsinfo prpsinfo; /* NT_PRPSINFO */ @@ -208,7 +208,7 @@ void elf_kcore_store_hdr(char *bufp, int nphdr, int dataoff, struct kcore_list * nhdr->p_align = 0; /* setup ELF PT_LOAD program header for every area */ - for (m=clist; m; m=m->next) { + for (m=kclist; m; m=m->next) { phdr = (struct elf_phdr *) bufp; bufp += sizeof(struct elf_phdr); offset += sizeof(struct elf_phdr); @@ -305,7 +305,7 @@ read_kcore(struct file *file, char __user *buffer, size_t buflen, loff_t *fpos) return -ENOMEM; } memset(elf_buf, 0, elf_buflen); - elf_kcore_store_hdr(elf_buf, nphdr, elf_buflen, kclist); + elf_kcore_store_hdr(elf_buf, nphdr, elf_buflen); read_unlock(&kclist_lock); if (copy_to_user(buffer, elf_buf + *fpos, tsz)) { kfree(elf_buf); diff --git a/fs/proc/proc_misc.c b/fs/proc/proc_misc.c index 1c4f019cd..e042c2083 100644 --- a/fs/proc/proc_misc.c +++ b/fs/proc/proc_misc.c @@ -44,7 +44,6 @@ #include <linux/jiffies.h> #include <linux/sysrq.h> #include <linux/vmalloc.h> -#include <linux/crash_dump.h> #include <linux/vs_base.h> #include <linux/vs_cvirt.h> @@ -658,13 +657,11 @@ void __init proc_misc_init(void) (size_t)high_memory - PAGE_OFFSET + PAGE_SIZE; } #endif - crash_create_proc_entry(); #ifdef CONFIG_MAGIC_SYSRQ entry = create_proc_entry("sysrq-trigger", S_IWUSR, NULL); if (entry) entry->proc_fops = &proc_sysrq_trigger_operations; #endif - crash_enable_by_proc(); #ifdef CONFIG_PPC32 { extern struct file_operations ppc_htab_operations; diff --git a/fs/proc/vmcore.c b/fs/proc/vmcore.c deleted file mode 100644 index 0c057dc15..000000000 --- a/fs/proc/vmcore.c +++ /dev/null @@ -1,239 +0,0 @@ -/* - * fs/proc/vmcore.c Interface for accessing the crash - * dump from the system's previous life. 
- * Heavily borrowed from fs/proc/kcore.c - * Created by: Hariprasad Nellitheertha (hari@in.ibm.com) - * Copyright (C) IBM Corporation, 2004. All rights reserved - */ - -#include <linux/config.h> -#include <linux/mm.h> -#include <linux/proc_fs.h> -#include <linux/user.h> -#include <linux/a.out.h> -#include <linux/elf.h> -#include <linux/elfcore.h> -#include <linux/vmalloc.h> -#include <linux/proc_fs.h> -#include <linux/highmem.h> -#include <linux/bootmem.h> -#include <linux/init.h> -#include <linux/crash_dump.h> -#include <asm/uaccess.h> -#include <asm/io.h> - -/* This is to re-use the kcore header creation code */ -static struct kcore_list vmcore_mem; - -static int open_vmcore(struct inode * inode, struct file * filp) -{ - return 0; -} - -static ssize_t read_vmcore(struct file *,char __user *,size_t, loff_t *); - -#define BACKUP_START CRASH_BACKUP_BASE -#define BACKUP_END CRASH_BACKUP_BASE + CRASH_BACKUP_SIZE -#define REG_SIZE sizeof(elf_gregset_t) - -struct file_operations proc_vmcore_operations = { - .read = read_vmcore, - .open = open_vmcore, -}; - -struct proc_dir_entry *proc_vmcore; - -struct memelfnote -{ - const char *name; - int type; - unsigned int datasz; - void *data; -}; - -static size_t get_vmcore_size(int *nphdr, size_t *elf_buflen) -{ - size_t size; - - /* We need 1 PT_LOAD segment headers - * In addition, we need one PT_NOTE header - */ - *nphdr = 2; - size = (size_t)(saved_max_pfn << PAGE_SHIFT); - - *elf_buflen = sizeof(struct elfhdr) + - (*nphdr + 2)*sizeof(struct elf_phdr) + - 3 * sizeof(struct memelfnote) + - sizeof(struct elf_prstatus) + - sizeof(struct elf_prpsinfo) + - sizeof(struct task_struct); - *elf_buflen = PAGE_ALIGN(*elf_buflen); - return size + *elf_buflen; -} - -/* - * Reads a page from the oldmem device from given offset. 
- */ -static ssize_t read_from_oldmem(char *buf, size_t count, - loff_t *ppos, int userbuf) -{ - unsigned long pfn; - size_t read = 0; - - pfn = (unsigned long)(*ppos / PAGE_SIZE); - - if (pfn > saved_max_pfn) { - read = -EINVAL; - goto done; - } - - count = (count > PAGE_SIZE) ? PAGE_SIZE : count; - - if (copy_oldmem_page(pfn, buf, count, userbuf)) { - read = -EFAULT; - goto done; - } - - *ppos += count; -done: - return read; -} - -/* - * store an ELF crash dump header in the supplied buffer - * nphdr is the number of elf_phdr to insert - */ -static void elf_vmcore_store_hdr(char *bufp, int nphdr, int dataoff) -{ - struct elf_prstatus prstatus; /* NT_PRSTATUS */ - struct memelfnote notes[1]; - char reg_buf[REG_SIZE]; - loff_t reg_ppos; - char *buf = bufp; - - vmcore_mem.addr = (unsigned long)__va(0); - vmcore_mem.size = saved_max_pfn << PAGE_SHIFT; - vmcore_mem.next = NULL; - - /* Re-use the kcore code */ - elf_kcore_store_hdr(bufp, nphdr, dataoff, &vmcore_mem); - buf += sizeof(struct elfhdr) + 2*sizeof(struct elf_phdr); - - /* set up the process status */ - notes[0].name = "CORE"; - notes[0].type = NT_PRSTATUS; - notes[0].datasz = sizeof(struct elf_prstatus); - notes[0].data = &prstatus; - - memset(&prstatus, 0, sizeof(struct elf_prstatus)); - - /* 1 - Get the registers from the reserved memory area */ - reg_ppos = BACKUP_END + CRASH_RELOCATE_SIZE; - read_from_oldmem(reg_buf, REG_SIZE, ®_ppos, 0); - elf_core_copy_regs(&prstatus.pr_reg, (struct pt_regs *)reg_buf); - buf = storenote(¬es[0], buf); -} - -/* - * read from the ELF header and then the crash dump - */ -static ssize_t read_vmcore( -struct file *file, char __user *buffer, size_t buflen, loff_t *fpos) -{ - ssize_t acc = 0; - size_t size, tsz; - size_t elf_buflen; - int nphdr; - unsigned long start; - - tsz = get_vmcore_size(&nphdr, &elf_buflen); - proc_vmcore->size = size = tsz + elf_buflen; - if (buflen == 0 || *fpos >= size) { - goto done; - } - - /* trim buflen to not go beyond EOF */ - if (buflen > size 
- *fpos) - buflen = size - *fpos; - - /* construct an ELF core header if we'll need some of it */ - if (*fpos < elf_buflen) { - char * elf_buf; - - tsz = elf_buflen - *fpos; - if (buflen < tsz) - tsz = buflen; - elf_buf = kmalloc(elf_buflen, GFP_ATOMIC); - if (!elf_buf) { - acc = -ENOMEM; - goto done; - } - memset(elf_buf, 0, elf_buflen); - elf_vmcore_store_hdr(elf_buf, nphdr, elf_buflen); - if (copy_to_user(buffer, elf_buf + *fpos, tsz)) { - kfree(elf_buf); - acc = -EFAULT; - goto done; - } - kfree(elf_buf); - buflen -= tsz; - *fpos += tsz; - buffer += tsz; - acc += tsz; - - /* leave now if filled buffer already */ - if (buflen == 0) { - goto done; - } - } - - start = *fpos - elf_buflen; - if ((tsz = (PAGE_SIZE - (start & ~PAGE_MASK))) > buflen) - tsz = buflen; - - while (buflen) { - unsigned long p; - loff_t pdup; - - if ((start < 0) || (start >= size)) - if (clear_user(buffer, tsz)) { - acc = -EFAULT; - goto done; - } - - /* tsz contains actual len of dump to be read. - * buflen is the total len that was requested. - * This may contain part of ELF header. start - * is the fpos for the oldmem region - * If the file position corresponds to the second - * kernel's memory, we just return zeroes - */ - p = start; - if ((p >= BACKUP_START) && (p < BACKUP_END)) { - if (clear_user(buffer, tsz)) { - acc = -EFAULT; - goto done; - } - - goto read_done; - } else if (p < CRASH_RELOCATE_SIZE) - p += BACKUP_END; - - pdup = p; - if (read_from_oldmem(buffer, tsz, &pdup, 1)) { - acc = -EINVAL; - goto done; - } - -read_done: - buflen -= tsz; - *fpos += tsz; - buffer += tsz; - acc += tsz; - start += tsz; - tsz = (buflen > PAGE_SIZE ? 
PAGE_SIZE : buflen); - } - -done: - return acc; -} diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h index c4e7d98b9..99cef06a3 100644 --- a/include/asm-generic/vmlinux.lds.h +++ b/include/asm-generic/vmlinux.lds.h @@ -73,7 +73,7 @@ } #define SECURITY_INIT \ - .security_initcall.init : AT(ADDR(.security_initcall.init) - LOAD_OFFSET) {\ + .security_initcall.init : { \ VMLINUX_SYMBOL(__security_initcall_start) = .; \ *(.security_initcall.init) \ VMLINUX_SYMBOL(__security_initcall_end) = .; \ diff --git a/include/asm-i386/apicdef.h b/include/asm-i386/apicdef.h index 9513dd889..c689554ad 100644 --- a/include/asm-i386/apicdef.h +++ b/include/asm-i386/apicdef.h @@ -86,7 +86,6 @@ #define APIC_LVT_REMOTE_IRR (1<<14) #define APIC_INPUT_POLARITY (1<<13) #define APIC_SEND_PENDING (1<<12) -#define APIC_MODE_MASK 0x700 #define GET_APIC_DELIVERY_MODE(x) (((x)>>8)&0x7) #define SET_APIC_DELIVERY_MODE(x,y) (((x)&~0x700)|((y)<<8)) #define APIC_MODE_FIXED 0x0 diff --git a/include/asm-i386/crash_dump.h b/include/asm-i386/crash_dump.h deleted file mode 100644 index a13e4b6b6..000000000 --- a/include/asm-i386/crash_dump.h +++ /dev/null @@ -1,82 +0,0 @@ -/* asm-i386/crash_dump.h */ -#include <linux/bootmem.h> -#include <linux/irq.h> -#include <asm/apic.h> - -#ifdef CONFIG_CRASH_DUMP -extern unsigned int dump_enabled; -extern unsigned int crashed; - -extern void __crash_relocate_mem(unsigned long, unsigned long); -extern unsigned long __init find_max_low_pfn(void); -extern void __init find_max_pfn(void); - -extern struct pt_regs crash_smp_regs[NR_CPUS]; -extern long crash_smp_current_task[NR_CPUS]; -extern void crash_dump_save_this_cpu(struct pt_regs *, int); -extern void __crash_dump_stop_cpus(void); -extern void crash_get_current_regs(struct pt_regs *regs); - -#define CRASH_BACKUP_BASE ((unsigned long)CONFIG_BACKUP_BASE * 0x100000) -#define CRASH_BACKUP_SIZE ((unsigned long)CONFIG_BACKUP_SIZE * 0x100000) -#define CRASH_RELOCATE_SIZE 0xa0000 - -static 
inline void crash_relocate_mem(void) -{ - if (crashed) - __crash_relocate_mem(CRASH_BACKUP_BASE + CRASH_BACKUP_SIZE, - CRASH_RELOCATE_SIZE); -} - -static inline void set_saved_max_pfn(void) -{ - find_max_pfn(); - saved_max_pfn = find_max_low_pfn(); -} - -static inline void crash_reserve_bootmem(void) -{ - if (!dump_enabled) { - reserve_bootmem(CRASH_BACKUP_BASE, - CRASH_BACKUP_SIZE + CRASH_RELOCATE_SIZE + PAGE_SIZE); - } -} - -static inline void crash_dump_stop_cpus(void) -{ - int cpu; - - if (!crashed) - return; - - cpu = smp_processor_id(); - - crash_smp_current_task[cpu] = (long)current; - crash_get_current_regs(&crash_smp_regs[cpu]); - - /* This also captures the register states of the other cpus */ - __crash_dump_stop_cpus(); -#if defined(CONFIG_X86_IO_APIC) - disable_IO_APIC(); -#endif -#if defined(CONFIG_X86_LOCAL_APIC) - disconnect_bsp_APIC(); -#endif -} - -static inline void crash_dump_save_registers(void) -{ - void *addr; - - addr = __va(CRASH_BACKUP_BASE + CRASH_BACKUP_SIZE + CRASH_RELOCATE_SIZE); - memcpy(addr, crash_smp_regs, (sizeof(struct pt_regs)*NR_CPUS)); - addr += sizeof(struct pt_regs)*NR_CPUS; - memcpy(addr, crash_smp_current_task, (sizeof(long)*NR_CPUS)); -} -#else -#define crash_relocate_mem() do { } while(0) -#define set_saved_max_pfn() do { } while(0) -#define crash_reserve_bootmem() do { } while(0) -#define crash_dump_stop_cpus() do { } while(0) -#define crash_dump_save_registers() do { } while(0) -#endif diff --git a/include/asm-i386/highmem.h b/include/asm-i386/highmem.h index a2525ba94..454b23ffd 100644 --- a/include/asm-i386/highmem.h +++ b/include/asm-i386/highmem.h @@ -61,7 +61,6 @@ void *kmap(struct page *page); void kunmap(struct page *page); void *kmap_atomic(struct page *page, enum km_type type); void kunmap_atomic(void *kvaddr, enum km_type type); -char *kmap_atomic_pfn(unsigned long pfn, enum km_type type); struct page *kmap_atomic_to_page(void *ptr); #define flush_cache_kmaps() do { } while (0) diff --git 
a/include/asm-i386/kexec.h b/include/asm-i386/kexec.h deleted file mode 100644 index eb8fd9868..000000000 --- a/include/asm-i386/kexec.h +++ /dev/null @@ -1,25 +0,0 @@ -#ifndef _I386_KEXEC_H -#define _I386_KEXEC_H - -#include <asm/fixmap.h> - -/* - * KEXEC_SOURCE_MEMORY_LIMIT maximum page get_free_page can return. - * I.e. Maximum page that is mapped directly into kernel memory, - * and kmap is not required. - * - * Someone correct me if FIXADDR_START - PAGEOFFSET is not the correct - * calculation for the amount of memory directly mappable into the - * kernel memory space. - */ - -/* Maximum physical address we can use pages from */ -#define KEXEC_SOURCE_MEMORY_LIMIT (-1UL) -/* Maximum address we can reach in physical address mode */ -#define KEXEC_DESTINATION_MEMORY_LIMIT (-1UL) -/* Maximum address we can use for the control code buffer */ -#define KEXEC_CONTROL_MEMORY_LIMIT TASK_SIZE - -#define KEXEC_CONTROL_CODE_SIZE 4096 - -#endif /* _I386_KEXEC_H */ diff --git a/include/asm-i386/mach-default/irq_vectors.h b/include/asm-i386/mach-default/irq_vectors.h index 27e18a25a..0bcc6f1ad 100644 --- a/include/asm-i386/mach-default/irq_vectors.h +++ b/include/asm-i386/mach-default/irq_vectors.h @@ -48,7 +48,7 @@ #define INVALIDATE_TLB_VECTOR 0xfd #define RESCHEDULE_VECTOR 0xfc #define CALL_FUNCTION_VECTOR 0xfb -#define CRASH_DUMP_VECTOR 0xfa +#define DUMP_VECTOR 0xfa #define THERMAL_APIC_VECTOR 0xf0 /* diff --git a/include/asm-i386/smp.h b/include/asm-i386/smp.h index 5fa792b1d..047cd2322 100644 --- a/include/asm-i386/smp.h +++ b/include/asm-i386/smp.h @@ -42,7 +42,6 @@ extern void smp_message_irq(int cpl, void *dev_id, struct pt_regs *regs); extern void smp_invalidate_rcv(void); /* Process an NMI */ extern void (*mtrr_hook) (void); extern void zap_low_mappings (void); -extern void stop_this_cpu(void *); #define MAX_APICID 256 extern u8 x86_cpu_to_apicid[]; diff --git a/include/asm-ppc/kexec.h b/include/asm-ppc/kexec.h deleted file mode 100644 index 3531b6daf..000000000 
--- a/include/asm-ppc/kexec.h +++ /dev/null @@ -1,36 +0,0 @@ -#ifndef _PPC_KEXEC_H -#define _PPC_KEXEC_H - -#ifdef CONFIG_KEXEC - -/* - * KEXEC_SOURCE_MEMORY_LIMIT maximum page get_free_page can return. - * I.e. Maximum page that is mapped directly into kernel memory, - * and kmap is not required. - * - * Someone correct me if FIXADDR_START - PAGEOFFSET is not the correct - * calculation for the amount of memory directly mappable into the - * kernel memory space. - */ - -/* Maximum physical address we can use pages from */ -#define KEXEC_SOURCE_MEMORY_LIMIT (-1UL) -/* Maximum address we can reach in physical address mode */ -#define KEXEC_DESTINATION_MEMORY_LIMIT (-1UL) -/* Maximum address we can use for the control code buffer */ -#define KEXEC_CONTROL_MEMORY_LIMIT TASK_SIZE - -#define KEXEC_CONTROL_CODE_SIZE 4096 - - -#ifndef __ASSEMBLY__ - -struct kimage; - -extern void machine_kexec_simple(struct kimage *image); - -#endif /* __ASSEMBLY__ */ - -#endif /* CONFIG_KEXEC */ - -#endif /* _PPC_KEXEC_H */ diff --git a/include/asm-ppc/machdep.h b/include/asm-ppc/machdep.h index 61d385d5a..87a550163 100644 --- a/include/asm-ppc/machdep.h +++ b/include/asm-ppc/machdep.h @@ -4,7 +4,6 @@ #include <linux/config.h> #include <linux/init.h> -#include <linux/kexec.h> #include <asm/setup.h> @@ -107,30 +106,6 @@ struct machdep_calls { /* functions for dealing with other cpus */ struct smp_ops_t *smp_ops; #endif /* CONFIG_SMP */ - -#ifdef CONFIG_KEXEC - /* Called to shutdown machine specific hardware not already controlled - * by other drivers. - * XXX Should we move this one out of kexec scope? - */ - void (*machine_shutdown)(void); - - /* Called to do what every setup is needed on image and the - * reboot code buffer. Returns 0 on success. - * Provide your own (maybe dummy) implementation if your platform - * claims to support kexec. 
- */ - int (*machine_kexec_prepare)(struct kimage *image); - - /* Called to handle any machine specific cleanup on image */ - void (*machine_kexec_cleanup)(struct kimage *image); - - /* Called to perform the _real_ kexec. - * Do NOT allocate memory or fail here. We are past the point of - * no return. - */ - void (*machine_kexec)(struct kimage *image); -#endif /* CONFIG_KEXEC */ }; extern struct machdep_calls ppc_md; diff --git a/include/asm-x86_64/kexec.h b/include/asm-x86_64/kexec.h deleted file mode 100644 index b0531c514..000000000 --- a/include/asm-x86_64/kexec.h +++ /dev/null @@ -1,25 +0,0 @@ -#ifndef _X86_64_KEXEC_H -#define _X86_64_KEXEC_H - -#include <asm/page.h> -#include <asm/proto.h> - -/* - * KEXEC_SOURCE_MEMORY_LIMIT maximum page get_free_page can return. - * I.e. Maximum page that is mapped directly into kernel memory, - * and kmap is not required. - * - * So far x86_64 is limited to 40 physical address bits. - */ - -/* Maximum physical address we can use pages from */ -#define KEXEC_SOURCE_MEMORY_LIMIT (0xFFFFFFFFFFUL) -/* Maximum address we can reach in physical address mode */ -#define KEXEC_DESTINATION_MEMORY_LIMIT (0xFFFFFFFFFFUL) -/* Maximum address we can use for the control pages */ -#define KEXEC_CONTROL_MEMORY_LIMIT (0xFFFFFFFFFFUL) - -/* Allocate one page for the pdp and the second for the code */ -#define KEXEC_CONTROL_CODE_SIZE (4096UL + 4096UL) - -#endif /* _X86_64_KEXEC_H */ diff --git a/include/asm-x86_64/unistd.h b/include/asm-x86_64/unistd.h index e16118309..cfecee5d1 100644 --- a/include/asm-x86_64/unistd.h +++ b/include/asm-x86_64/unistd.h @@ -559,7 +559,7 @@ __SYSCALL(__NR_mq_notify, sys_mq_notify) #define __NR_mq_getsetattr 245 __SYSCALL(__NR_mq_getsetattr, sys_mq_getsetattr) #define __NR_kexec_load 246 -__SYSCALL(__NR_kexec_load, sys_kexec_load) +__SYSCALL(__NR_kexec_load, sys_ni_syscall) #define __NR_waitid 247 __SYSCALL(__NR_waitid, sys_waitid) #define __NR_syscall_max __NR_waitid diff --git a/include/linux/bootmem.h 
b/include/linux/bootmem.h index aeabe07af..376a5500a 100644 --- a/include/linux/bootmem.h +++ b/include/linux/bootmem.h @@ -21,7 +21,6 @@ extern unsigned long min_low_pfn; * highest page */ extern unsigned long max_pfn; -extern unsigned long saved_max_pfn; /* * node_bootmem_map is a map pointer - the bits represent all physical diff --git a/include/linux/ckrm_mem.h b/include/linux/ckrm_mem.h index 3712aefb9..1e4c70fc1 100644 --- a/include/linux/ckrm_mem.h +++ b/include/linux/ckrm_mem.h @@ -29,8 +29,8 @@ struct ckrm_zone { struct list_head active_list; struct list_head inactive_list; - unsigned long nr_active; // # of pages in the active list - unsigned long nr_inactive; // # of pages in the inactive list + unsigned long nr_active; + unsigned long nr_inactive; unsigned long active_over; unsigned long inactive_over; @@ -38,68 +38,72 @@ struct ckrm_zone { unsigned long shrink_inactive; long shrink_weight; unsigned long shrink_flag; - - struct list_head victim_list; // list of ckrm_zones chosen for shrinking + struct list_head victim_list; /* list of ckrm_zones chosen for + * shrinking. These are over their + * 'guarantee' + */ struct zone *zone; struct ckrm_mem_res *memcls; }; struct ckrm_mem_res { unsigned long flags; - struct ckrm_core_class *core; // the core i am part of... - struct ckrm_core_class *parent; // parent of the core i am part of.... - struct ckrm_shares shares; - struct list_head mcls_list; // list of all 1-level classes - struct list_head shrink_list; // list of classes need to be shrunk - struct kref nr_users; // # of references to this class/data structure - atomic_t pg_total; // # of pages used by this class - int pg_guar; // # of pages this class is guaranteed - int pg_limit; // max # of pages this class can get - int pg_borrowed; // # of pages this class borrowed from its parent - int pg_lent; // # of pages this class lent to its children - int pg_unused; // # of pages left to this class (after giving the - // guarantees to children. 
need to borrow from parent if - // more than this is needed. - int impl_guar; // implicit guarantee for class with don't care guar - int nr_dontcare; // # of children with don't care guarantee + struct ckrm_core_class *core; /* the core i am part of... */ + struct ckrm_core_class *parent; /* parent of the core i am part of */ + struct ckrm_shares shares; + struct list_head mcls_list; /* list of all 1-level classes */ + struct kref nr_users; /* ref count */ + atomic_t pg_total; /* # of pages used by this class */ + int pg_guar; /* absolute # of guarantee */ + int pg_limit; /* absolute # of limit */ + int pg_borrowed; /* # of pages borrowed from parent */ + int pg_lent; /* # of pages lent to children */ + int pg_unused; /* # of pages left to this class + * (after giving the guarantees to + * children. need to borrow from + * parent if more than this is needed. + */ + int hier; /* hiearchy level, root = 0 */ + int impl_guar; /* for classes with don't care guar */ + int nr_dontcare; /* # of dont care children */ + struct ckrm_zone ckrm_zone[MAX_NR_ZONES]; + + struct list_head shrink_list; /* list of classes that are near + * limit and need to be shrunk + */ int shrink_count; unsigned long last_shrink; - int over_limit_failures; - int shrink_pages; // # of pages to free in this class - int hier; // hiearchy, root = 0 }; +#define CLS_SHRINK_BIT (1) + +#define CLS_AT_LIMIT (1) + extern atomic_t ckrm_mem_real_count; -extern unsigned int ckrm_tot_lru_pages; -extern int ckrm_nr_mem_classes; -extern struct list_head ckrm_shrink_list; -extern struct list_head ckrm_memclass_list; -extern spinlock_t ckrm_mem_lock; extern struct ckrm_res_ctlr mem_rcbs; extern struct ckrm_mem_res *ckrm_mem_root_class; +extern struct list_head ckrm_memclass_list; +extern struct list_head ckrm_shrink_list; +extern spinlock_t ckrm_mem_lock; +extern int ckrm_nr_mem_classes; +extern unsigned int ckrm_tot_lru_pages; +extern int ckrm_mem_shrink_count; +extern int ckrm_mem_shrink_to; +extern int 
ckrm_mem_shrink_interval ; -#define page_ckrmzone(page) ((page)->ckrm_zone) - -#define CLS_SHRINK_BIT (1) - -// used in flags. set when a class is more than 90% of its maxlimit -#define MEM_AT_LIMIT 1 - -extern void ckrm_init_mm_to_task(struct mm_struct *, struct task_struct *); -extern void ckrm_mem_evaluate_mm(struct mm_struct *, struct ckrm_mem_res *); -extern void ckrm_at_limit(struct ckrm_mem_res *); -extern int ckrm_memclass_valid(struct ckrm_mem_res *); -extern int ckrm_mem_get_shrink_to(void); -extern void check_memclass(struct ckrm_mem_res *, char *); +extern void ckrm_mem_migrate_mm(struct mm_struct *, struct ckrm_mem_res *); +extern void ckrm_mem_migrate_all_pages(struct ckrm_mem_res *, + struct ckrm_mem_res *); extern void memclass_release(struct kref *); - +extern void shrink_get_victims(struct zone *, unsigned long , + unsigned long, struct list_head *); +extern void ckrm_shrink_atlimit(struct ckrm_mem_res *); #else -#define ckrm_init_mm_to_current(a) do {} while (0) -#define ckrm_mem_evaluate_mm(a) do {} while (0) -#define ckrm_init_mm_to_task(a,b) do {} while (0) +#define ckrm_mem_migrate_mm(a, b) do {} while (0) +#define ckrm_mem_migrate_all_pages(a, b) do {} while (0) -#endif // CONFIG_CKRM_RES_MEM +#endif /* CONFIG_CKRM_RES_MEM */ -#endif //_LINUX_CKRM_MEM_H +#endif /* _LINUX_CKRM_MEM_H */ diff --git a/include/linux/ckrm_mem_inline.h b/include/linux/ckrm_mem_inline.h index 1166956b7..fe752277b 100644 --- a/include/linux/ckrm_mem_inline.h +++ b/include/linux/ckrm_mem_inline.h @@ -26,8 +26,7 @@ #ifdef CONFIG_CKRM_RES_MEM -#define INACTIVE 0 -#define ACTIVE 1 +#define ckrm_shrink_list_empty() list_empty(&ckrm_shrink_list) static inline struct ckrm_mem_res * ckrm_get_mem_class(struct task_struct *tsk) @@ -36,8 +35,6 @@ ckrm_get_mem_class(struct task_struct *tsk) struct ckrm_mem_res); } -#define ckrm_shrink_list_empty() list_empty(&ckrm_shrink_list) - static inline void ckrm_set_shrink(struct ckrm_zone *cz) { @@ -56,6 +53,18 @@ 
ckrm_clear_shrink(struct ckrm_zone *cz) clear_bit(CLS_SHRINK_BIT, &cz->shrink_flag); } +static inline void +set_page_ckrmzone( struct page *page, struct ckrm_zone *cz) +{ + page->ckrm_zone = cz; +} + +static inline struct ckrm_zone * +page_ckrmzone(struct page *page) +{ + return page->ckrm_zone; +} + /* * Currently, a shared page that is shared by multiple classes is charged * to a class with max available guarantee. Simply replace this function @@ -67,7 +76,7 @@ ckrm_mem_share_compare(struct ckrm_mem_res *a, struct ckrm_mem_res *b) if (a == NULL) return -(b != NULL); if (b == NULL) - return 0; + return 1; if (a->pg_guar == b->pg_guar) return 0; if (a->pg_guar == CKRM_SHARE_DONTCARE) @@ -81,29 +90,30 @@ static inline void incr_use_count(struct ckrm_mem_res *cls, int borrow) { extern int ckrm_mem_shrink_at; - if (unlikely(!cls)) + struct ckrm_mem_res *parcls = ckrm_get_res_class(cls->parent, + mem_rcbs.resid, struct ckrm_mem_res); + + if (!cls) return; - BUG_ON(!ckrm_memclass_valid(cls)); - atomic_inc(&cls->pg_total); + atomic_inc(&cls->pg_total); if (borrow) cls->pg_lent++; - if ((cls->pg_guar == CKRM_SHARE_DONTCARE) || - (atomic_read(&cls->pg_total) > cls->pg_unused)) { - struct ckrm_mem_res *parcls = ckrm_get_res_class(cls->parent, + + parcls = ckrm_get_res_class(cls->parent, mem_rcbs.resid, struct ckrm_mem_res); - if (parcls) { - incr_use_count(parcls, 1); - cls->pg_borrowed++; - } - } else { + if (parcls && ((cls->pg_guar == CKRM_SHARE_DONTCARE) || + (atomic_read(&cls->pg_total) > cls->pg_unused))) { + incr_use_count(parcls, 1); + cls->pg_borrowed++; + } else atomic_inc(&ckrm_mem_real_count); - } - if (unlikely((cls->pg_limit != CKRM_SHARE_DONTCARE) && + + if ((cls->pg_limit != CKRM_SHARE_DONTCARE) && (atomic_read(&cls->pg_total) >= ((ckrm_mem_shrink_at * cls->pg_limit) / 100)) && - ((cls->flags & MEM_AT_LIMIT) != MEM_AT_LIMIT))) { - ckrm_at_limit(cls); + ((cls->flags & CLS_AT_LIMIT) != CLS_AT_LIMIT)) { + ckrm_shrink_atlimit(cls); } return; } @@ -111,9 +121,8 
@@ incr_use_count(struct ckrm_mem_res *cls, int borrow) static inline void decr_use_count(struct ckrm_mem_res *cls, int borrowed) { - if (unlikely(!cls)) + if (!cls) return; - BUG_ON(!ckrm_memclass_valid(cls)); atomic_dec(&cls->pg_total); if (borrowed) cls->pg_lent--; @@ -132,64 +141,50 @@ decr_use_count(struct ckrm_mem_res *cls, int borrowed) static inline void ckrm_set_page_class(struct page *page, struct ckrm_mem_res *cls) { - if (unlikely(cls == NULL)) { - cls = ckrm_mem_root_class; - } - if (likely(cls != NULL)) { - struct ckrm_zone *czone = &cls->ckrm_zone[page_zonenum(page)]; - if (unlikely(page->ckrm_zone)) { - kref_put(&cls->nr_users, memclass_release); - } - page->ckrm_zone = czone; - kref_get(&cls->nr_users); - } else { - page->ckrm_zone = NULL; - } -} + struct ckrm_zone *new_czone, *old_czone; -static inline void -ckrm_set_pages_class(struct page *pages, int numpages, struct ckrm_mem_res *cls) -{ - int i; - for (i = 0; i < numpages; pages++, i++) { - ckrm_set_page_class(pages, cls); - } -} - -static inline void -ckrm_clear_page_class(struct page *page) -{ - if (likely(page->ckrm_zone != NULL)) { - if (CkrmAccount(page)) { - decr_use_count(page->ckrm_zone->memcls, 0); - ClearCkrmAccount(page); + if (!cls) { + if (!ckrm_mem_root_class) { + set_page_ckrmzone(page, NULL); + return; } - kref_put(&page->ckrm_zone->memcls->nr_users, memclass_release); - page->ckrm_zone = NULL; + cls = ckrm_mem_root_class; } + new_czone = &cls->ckrm_zone[page_zonenum(page)]; + old_czone = page_ckrmzone(page); + + if (old_czone) + kref_put(&old_czone->memcls->nr_users, memclass_release); + + set_page_ckrmzone(page, new_czone); + kref_get(&cls->nr_users); + incr_use_count(cls, 0); + SetPageCkrmAccount(page); } static inline void ckrm_change_page_class(struct page *page, struct ckrm_mem_res *newcls) { - struct ckrm_zone *old_czone = page->ckrm_zone, *new_czone; + struct ckrm_zone *old_czone = page_ckrmzone(page), *new_czone; struct ckrm_mem_res *oldcls; - if (unlikely(!old_czone 
|| !newcls)) { - BUG_ON(CkrmAccount(page)); - return; + if (!newcls) { + if (!ckrm_mem_root_class) + return; + newcls = ckrm_mem_root_class; } - BUG_ON(!CkrmAccount(page)); oldcls = old_czone->memcls; - if (oldcls == NULL || (oldcls == newcls)) + if (oldcls == newcls) return; - kref_put(&oldcls->nr_users, memclass_release); - decr_use_count(oldcls, 0); - - page->ckrm_zone = new_czone = &newcls->ckrm_zone[page_zonenum(page)]; + if (oldcls) { + kref_put(&oldcls->nr_users, memclass_release); + decr_use_count(oldcls, 0); + } + new_czone = &newcls->ckrm_zone[page_zonenum(page)]; + set_page_ckrmzone(page, new_czone); kref_get(&newcls->nr_users); incr_use_count(newcls, 0); @@ -205,34 +200,45 @@ ckrm_change_page_class(struct page *page, struct ckrm_mem_res *newcls) } } +static inline void +ckrm_clear_page_class(struct page *page) +{ + struct ckrm_zone *czone = page_ckrmzone(page); + if (czone != NULL) { + if (PageCkrmAccount(page)) { + decr_use_count(czone->memcls, 0); + ClearPageCkrmAccount(page); + } + kref_put(&czone->memcls->nr_users, memclass_release); + set_page_ckrmzone(page, NULL); + } +} + static inline void ckrm_mem_inc_active(struct page *page) { - struct ckrm_mem_res *cls = ckrm_get_mem_class(current) ?: ckrm_mem_root_class; + struct ckrm_mem_res *cls = ckrm_get_mem_class(current) + ?: ckrm_mem_root_class; + struct ckrm_zone *czone; if (cls == NULL) return; - BUG_ON(CkrmAccount(page)); - BUG_ON(page->ckrm_zone != NULL); ckrm_set_page_class(page, cls); - incr_use_count(cls, 0); - SetCkrmAccount(page); - BUG_ON(page->ckrm_zone == NULL); - page->ckrm_zone->nr_active++; - list_add(&page->lru, &page->ckrm_zone->active_list); + czone = page_ckrmzone(page); + czone->nr_active++; + list_add(&page->lru, &czone->active_list); } static inline void ckrm_mem_dec_active(struct page *page) { - if (page->ckrm_zone == NULL) + struct ckrm_zone *czone = page_ckrmzone(page); + if (czone == NULL) return; - BUG_ON(page->ckrm_zone->memcls == NULL); - BUG_ON(!CkrmAccount(page)); 
list_del(&page->lru); - page->ckrm_zone->nr_active--; + czone->nr_active--; ckrm_clear_page_class(page); } @@ -240,39 +246,59 @@ ckrm_mem_dec_active(struct page *page) static inline void ckrm_mem_inc_inactive(struct page *page) { - struct ckrm_mem_res *cls = ckrm_get_mem_class(current) ?: ckrm_mem_root_class; + struct ckrm_mem_res *cls = ckrm_get_mem_class(current) + ?: ckrm_mem_root_class; + struct ckrm_zone *czone; if (cls == NULL) return; - BUG_ON(CkrmAccount(page)); - BUG_ON(page->ckrm_zone != NULL); ckrm_set_page_class(page, cls); - incr_use_count(cls, 0); - SetCkrmAccount(page); - BUG_ON(page->ckrm_zone == NULL); - page->ckrm_zone->nr_inactive++; - list_add(&page->lru, &page->ckrm_zone->inactive_list); + czone = page_ckrmzone(page); + czone->nr_inactive++; + list_add(&page->lru, &czone->inactive_list); } static inline void ckrm_mem_dec_inactive(struct page *page) { - if (page->ckrm_zone == NULL) + struct ckrm_zone *czone = page_ckrmzone(page); + if (czone == NULL) return; - BUG_ON(page->ckrm_zone->memcls == NULL); - BUG_ON(!CkrmAccount(page)); - page->ckrm_zone->nr_inactive--; + czone->nr_inactive--; list_del(&page->lru); ckrm_clear_page_class(page); } +static inline void +ckrm_zone_add_active(struct ckrm_zone *czone, int cnt) +{ + czone->nr_active += cnt; +} + +static inline void +ckrm_zone_add_inactive(struct ckrm_zone *czone, int cnt) +{ + czone->nr_inactive += cnt; +} + +static inline void +ckrm_zone_sub_active(struct ckrm_zone *czone, int cnt) +{ + czone->nr_active -= cnt; +} + +static inline void +ckrm_zone_sub_inactive(struct ckrm_zone *czone, int cnt) +{ + czone->nr_inactive -= cnt; +} + static inline int ckrm_class_limit_ok(struct ckrm_mem_res *cls) { int ret; - extern int ckrm_mem_fail_over; if ((mem_rcbs.resid == -1) || !cls) { return 1; @@ -281,19 +307,25 @@ ckrm_class_limit_ok(struct ckrm_mem_res *cls) struct ckrm_mem_res *parcls = ckrm_get_res_class(cls->parent, mem_rcbs.resid, struct ckrm_mem_res); ret = (parcls ? 
ckrm_class_limit_ok(parcls) : 0); - } else { - ret = (atomic_read(&cls->pg_total) <= - ((ckrm_mem_fail_over * cls->pg_limit) / 100)); - } + } else + ret = (atomic_read(&cls->pg_total) <= cls->pg_limit); + + /* If we are failing, just nudge the back end */ + if (ret == 0) + ckrm_shrink_atlimit(cls); - if (ret == 0) { - // if we are failing... just nudge the back end - ckrm_at_limit(cls); - } return ret; } -// task/mm initializations/cleanup +static inline void +ckrm_page_init(struct page *page) +{ + page->flags &= ~(1 << PG_ckrm_account); + set_page_ckrmzone(page, NULL); +} + + +/* task/mm initializations/cleanup */ static inline void ckrm_task_mm_init(struct task_struct *tsk) @@ -302,26 +334,42 @@ ckrm_task_mm_init(struct task_struct *tsk) } static inline void -ckrm_task_change_mm(struct task_struct *tsk, struct mm_struct *oldmm, struct mm_struct *newmm) +ckrm_task_mm_set(struct mm_struct * mm, struct task_struct *task) +{ + spin_lock(&mm->peertask_lock); + if (!list_empty(&task->mm_peers)) { + printk(KERN_ERR "MEM_RC: Task list NOT empty!! 
emptying...\n"); + list_del_init(&task->mm_peers); + } + list_add_tail(&task->mm_peers, &mm->tasklist); + spin_unlock(&mm->peertask_lock); + if (mm->memclass != ckrm_get_mem_class(task)) + ckrm_mem_migrate_mm(mm, NULL); + return; +} + +static inline void +ckrm_task_mm_change(struct task_struct *tsk, + struct mm_struct *oldmm, struct mm_struct *newmm) { if (oldmm) { spin_lock(&oldmm->peertask_lock); list_del(&tsk->mm_peers); - ckrm_mem_evaluate_mm(oldmm, NULL); + ckrm_mem_migrate_mm(oldmm, NULL); spin_unlock(&oldmm->peertask_lock); } spin_lock(&newmm->peertask_lock); list_add_tail(&tsk->mm_peers, &newmm->tasklist); - ckrm_mem_evaluate_mm(newmm, NULL); + ckrm_mem_migrate_mm(newmm, NULL); spin_unlock(&newmm->peertask_lock); } static inline void -ckrm_task_clear_mm(struct task_struct *tsk, struct mm_struct *mm) +ckrm_task_mm_clear(struct task_struct *tsk, struct mm_struct *mm) { spin_lock(&mm->peertask_lock); list_del_init(&tsk->mm_peers); - ckrm_mem_evaluate_mm(mm, NULL); + ckrm_mem_migrate_mm(mm, NULL); spin_unlock(&mm->peertask_lock); } @@ -348,56 +396,65 @@ ckrm_mm_clearclass(struct mm_struct *mm) } } -static inline void -ckrm_zone_inc_active(struct ckrm_zone *czone, int cnt) +static inline void ckrm_init_lists(struct zone *zone) {} + +static inline void ckrm_add_tail_inactive(struct page *page) { - czone->nr_active += cnt; + struct ckrm_zone *ckrm_zone = page_ckrmzone(page); + list_add_tail(&page->lru, &ckrm_zone->inactive_list); } -static inline void -ckrm_zone_inc_inactive(struct ckrm_zone *czone, int cnt) +#else + +#define ckrm_shrink_list_empty() (1) + +static inline void * +ckrm_get_memclass(struct task_struct *tsk) { - czone->nr_inactive += cnt; + return NULL; } -static inline void -ckrm_zone_dec_active(struct ckrm_zone *czone, int cnt) +static inline void ckrm_clear_page_class(struct page *p) {} + +static inline void ckrm_mem_inc_active(struct page *p) {} +static inline void ckrm_mem_dec_active(struct page *p) {} +static inline void 
ckrm_mem_inc_inactive(struct page *p) {} +static inline void ckrm_mem_dec_inactive(struct page *p) {} + +#define ckrm_zone_add_active(a, b) do {} while (0) +#define ckrm_zone_add_inactive(a, b) do {} while (0) +#define ckrm_zone_sub_active(a, b) do {} while (0) +#define ckrm_zone_sub_inactive(a, b) do {} while (0) + +#define ckrm_class_limit_ok(a) (1) + +static inline void ckrm_page_init(struct page *p) {} +static inline void ckrm_task_mm_init(struct task_struct *tsk) {} +static inline void ckrm_task_mm_set(struct mm_struct * mm, + struct task_struct *task) {} +static inline void ckrm_task_mm_change(struct task_struct *tsk, + struct mm_struct *oldmm, struct mm_struct *newmm) {} +static inline void ckrm_task_mm_clear(struct task_struct *tsk, + struct mm_struct *mm) {} + +static inline void ckrm_mm_init(struct mm_struct *mm) {} + +/* using #define instead of static inline as the prototype requires * + * data structures that is available only with the controller enabled */ +#define ckrm_mm_setclass(a, b) do {} while(0) + +static inline void ckrm_mm_clearclass(struct mm_struct *mm) {} + +static inline void ckrm_init_lists(struct zone *zone) { - czone->nr_active -= cnt; + INIT_LIST_HEAD(&zone->active_list); + INIT_LIST_HEAD(&zone->inactive_list); } -static inline void -ckrm_zone_dec_inactive(struct ckrm_zone *czone, int cnt) +static inline void ckrm_add_tail_inactive(struct page *page) { - czone->nr_inactive -= cnt; + struct zone *zone = page_zone(page); + list_add_tail(&page->lru, &zone->inactive_list); } - -#else // !CONFIG_CKRM_RES_MEM - -#define ckrm_set_page_class(a,b) do{}while(0) -#define ckrm_set_pages_class(a,b,c) do{}while(0) -#define ckrm_clear_page_class(a) do{}while(0) -#define ckrm_clear_pages_class(a,b) do{}while(0) -#define ckrm_change_page_class(a,b) do{}while(0) -#define ckrm_change_pages_class(a,b,c) do{}while(0) -#define ckrm_mem_inc_active(a) do{}while(0) -#define ckrm_mem_dec_active(a) do{}while(0) -#define ckrm_mem_inc_inactive(a) do{}while(0) 
-#define ckrm_mem_dec_inactive(a) do{}while(0) -#define ckrm_shrink_list_empty() (1) -#define ckrm_kick_page(a,b) (0) -#define ckrm_class_limit_ok(a) (1) -#define ckrm_task_mm_init(a) do{}while(0) -#define ckrm_task_clear_mm(a, b) do{}while(0) -#define ckrm_task_change_mm(a, b, c) do{}while(0) -#define ckrm_mm_init(a) do{}while(0) -#define ckrm_mm_setclass(a, b) do{}while(0) -#define ckrm_mm_clearclass(a) do{}while(0) -#define ckrm_zone_inc_active(a, b) do{}while(0) -#define ckrm_zone_inc_inactive(a, b) do{}while(0) -#define ckrm_zone_dec_active(a, b) do{}while(0) -#define ckrm_zone_dec_inactive(a, b) do{}while(0) - -#endif // CONFIG_CKRM_RES_MEM - -#endif // _LINUX_CKRM_MEM_INLINE_H_ +#endif +#endif /* _LINUX_CKRM_MEM_INLINE_H_ */ diff --git a/include/linux/ckrm_tsk.h b/include/linux/ckrm_tsk.h index f61453901..9ef07a2c5 100644 --- a/include/linux/ckrm_tsk.h +++ b/include/linux/ckrm_tsk.h @@ -13,23 +13,29 @@ * */ +/* Changes + * + * 31 Mar 2004 + * Created. + */ + #ifndef _LINUX_CKRM_TSK_H #define _LINUX_CKRM_TSK_H #ifdef CONFIG_CKRM_TYPE_TASKCLASS #include <linux/ckrm_rc.h> -typedef int (*get_ref_t) (struct ckrm_core_class *, int); -typedef void (*put_ref_t) (struct ckrm_core_class *); +typedef int (*get_ref_t) (void *, int); +typedef void (*put_ref_t) (void *); -extern int numtasks_get_ref(struct ckrm_core_class *, int); -extern void numtasks_put_ref(struct ckrm_core_class *); +extern int numtasks_get_ref(void *, int); +extern void numtasks_put_ref(void *); extern void ckrm_numtasks_register(get_ref_t, put_ref_t); #else /* CONFIG_CKRM_TYPE_TASKCLASS */ -#define numtasks_get_ref(core_class, ref) (1) -#define numtasks_put_ref(core_class) do {} while (0) +#define numtasks_get_ref(a, b) (1) +#define numtasks_put_ref(a) do {} while(0) #endif /* CONFIG_CKRM_TYPE_TASKCLASS */ #endif /* _LINUX_CKRM_RES_H */ diff --git a/include/linux/crash_dump.h b/include/linux/crash_dump.h deleted file mode 100644 index 11c65e908..000000000 --- a/include/linux/crash_dump.h +++ 
/dev/null @@ -1,34 +0,0 @@ -#include <linux/kexec.h> -#include <linux/smp_lock.h> -#include <linux/device.h> -#include <linux/proc_fs.h> -#ifdef CONFIG_CRASH_DUMP -#include <asm/crash_dump.h> -#endif - -extern unsigned long saved_max_pfn; -extern struct memelfnote memelfnote; -extern int notesize(struct memelfnote *); -extern char *storenote(struct memelfnote *, char *); -extern void elf_kcore_store_hdr(char *, int, int, struct kcore_list *); - -#ifdef CONFIG_CRASH_DUMP -extern ssize_t copy_oldmem_page(unsigned long, char *, size_t, int); -extern void __crash_machine_kexec(void); -extern int crash_dump_on; -static inline void crash_machine_kexec(void) -{ - __crash_machine_kexec(); -} -#else -#define crash_machine_kexec() do { } while(0) -#endif - - -#if defined(CONFIG_CRASH_DUMP) && defined(CONFIG_PROC_FS) -extern void crash_enable_by_proc(void); -extern void crash_create_proc_entry(void); -#else -#define crash_enable_by_proc() do { } while(0) -#define crash_create_proc_entry() do { } while(0) -#endif diff --git a/include/linux/dump.h b/include/linux/dump.h index df06c889c..00c690f44 100644 --- a/include/linux/dump.h +++ b/include/linux/dump.h @@ -20,7 +20,7 @@ #ifndef _DUMP_H #define _DUMP_H -#if defined(CONFIG_CRASH_DUMP) +#if defined(CONFIG_CRASH_DUMP) || defined (CONFIG_CRASH_DUMP_MODULE) #include <linux/list.h> #include <linux/notifier.h> diff --git a/include/linux/ext2_fs.h b/include/linux/ext2_fs.h index 12788c896..a9858024b 100644 --- a/include/linux/ext2_fs.h +++ b/include/linux/ext2_fs.h @@ -197,8 +197,8 @@ struct ext2_group_desc #define EXT2_RESERVED_FL 0x80000000 /* reserved for ext2 lib */ #ifdef CONFIG_VSERVER_LEGACY -#define EXT2_FL_USER_VISIBLE 0x0C03DFFF /* User visible flags */ -#define EXT2_FL_USER_MODIFIABLE 0x0C0380FF /* User modifiable flags */ +#define EXT2_FL_USER_VISIBLE 0x0803DFFF /* User visible flags */ +#define EXT2_FL_USER_MODIFIABLE 0x080380FF /* User modifiable flags */ #else #define EXT2_FL_USER_VISIBLE 0x0003DFFF /* User visible 
flags */ #define EXT2_FL_USER_MODIFIABLE 0x000380FF /* User modifiable flags */ diff --git a/include/linux/ext3_fs.h b/include/linux/ext3_fs.h index 818516b81..f2d1cd9fa 100644 --- a/include/linux/ext3_fs.h +++ b/include/linux/ext3_fs.h @@ -190,8 +190,8 @@ struct ext3_group_desc #define EXT3_RESERVED_FL 0x80000000 /* reserved for ext3 lib */ #ifdef CONFIG_VSERVER_LEGACY -#define EXT3_FL_USER_VISIBLE 0x0C03DFFF /* User visible flags */ -#define EXT3_FL_USER_MODIFIABLE 0x0C0380FF /* User modifiable flags */ +#define EXT3_FL_USER_VISIBLE 0x0803DFFF /* User visible flags */ +#define EXT3_FL_USER_MODIFIABLE 0x080380FF /* User modifiable flags */ #else #define EXT3_FL_USER_VISIBLE 0x0003DFFF /* User visible flags */ #define EXT3_FL_USER_MODIFIABLE 0x000380FF /* User modifiable flags */ diff --git a/include/linux/highmem.h b/include/linux/highmem.h index d3950fcf5..7153aef34 100644 --- a/include/linux/highmem.h +++ b/include/linux/highmem.h @@ -30,7 +30,6 @@ static inline void *kmap(struct page *page) #define kmap_atomic(page, idx) page_address(page) #define kunmap_atomic(addr, idx) do { } while (0) -#define kmap_atomic_pfn(pfn, idx) ((char *)page_address(pfn_to_page(pfn))) #define kmap_atomic_to_page(ptr) virt_to_page(ptr) #endif /* CONFIG_HIGHMEM */ diff --git a/include/linux/kexec.h b/include/linux/kexec.h deleted file mode 100644 index 523c45ab9..000000000 --- a/include/linux/kexec.h +++ /dev/null @@ -1,57 +0,0 @@ -#ifndef LINUX_KEXEC_H -#define LINUX_KEXEC_H - -#ifdef CONFIG_KEXEC -#include <linux/types.h> -#include <linux/list.h> -#include <asm/kexec.h> - -/* - * This structure is used to hold the arguments that are used when loading - * kernel binaries. 
- */ - -typedef unsigned long kimage_entry_t; -#define IND_DESTINATION 0x1 -#define IND_INDIRECTION 0x2 -#define IND_DONE 0x4 -#define IND_SOURCE 0x8 - -#define KEXEC_SEGMENT_MAX 8 -struct kexec_segment { - void *buf; - size_t bufsz; - void *mem; - size_t memsz; -}; - -struct kimage { - kimage_entry_t head; - kimage_entry_t *entry; - kimage_entry_t *last_entry; - - unsigned long destination; - - unsigned long start; - struct page *control_code_page; - - unsigned long nr_segments; - struct kexec_segment segment[KEXEC_SEGMENT_MAX]; - - struct list_head control_pages; - struct list_head dest_pages; - struct list_head unuseable_pages; -}; - - -/* kexec interface functions */ -extern void machine_kexec(struct kimage *image); -extern int machine_kexec_prepare(struct kimage *image); -extern void machine_kexec_cleanup(struct kimage *image); -extern asmlinkage long sys_kexec(unsigned long entry, long nr_segments, - struct kexec_segment *segments); -extern struct page *kimage_alloc_control_pages(struct kimage *image, unsigned int order); -extern struct kimage *kexec_image; -extern struct kimage *kexec_crash_image; -#endif -#endif /* LINUX_KEXEC_H */ diff --git a/include/linux/mm.h b/include/linux/mm.h index d025bcbc6..447e46994 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -240,7 +240,7 @@ struct page { #endif /* WANT_PAGE_VIRTUAL */ #ifdef CONFIG_CKRM_RES_MEM struct ckrm_zone *ckrm_zone; -#endif // CONFIG_CKRM_RES_MEM +#endif }; /* diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index c99f570b7..282141e43 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -75,10 +75,7 @@ #define PG_mappedtodisk 17 /* Has blocks allocated on-disk */ #define PG_reclaim 18 /* To be reclaimed asap */ -#ifdef CONFIG_CKRM_RES_MEM -#define PG_ckrm_account 19 /* This page is accounted by CKRM */ -#endif - +#define PG_ckrm_account 20 /* CKRM accounting */ /* * Global page accounting. One instance per CPU. 
Only unsigned longs are @@ -303,9 +300,9 @@ extern unsigned long __read_page_state(unsigned offset); #endif #ifdef CONFIG_CKRM_RES_MEM -#define CkrmAccount(page) test_bit(PG_ckrm_account, &(page)->flags) -#define SetCkrmAccount(page) set_bit(PG_ckrm_account, &(page)->flags) -#define ClearCkrmAccount(page) clear_bit(PG_ckrm_account, &(page)->flags) +#define PageCkrmAccount(page) test_bit(PG_ckrm_account, &(page)->flags) +#define SetPageCkrmAccount(page) set_bit(PG_ckrm_account, &(page)->flags) +#define ClearPageCkrmAccount(page) clear_bit(PG_ckrm_account, &(page)->flags) #endif struct page; /* forward declaration */ diff --git a/include/linux/reboot.h b/include/linux/reboot.h index 5460e94a1..d60fafc8b 100644 --- a/include/linux/reboot.h +++ b/include/linux/reboot.h @@ -51,8 +51,6 @@ extern void machine_restart(char *cmd); extern void machine_halt(void); extern void machine_power_off(void); -extern void machine_shutdown(void); - #endif #endif /* _LINUX_REBOOT_H */ diff --git a/include/linux/sched.h b/include/linux/sched.h index 9cb07d16b..74719a938 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -31,6 +31,7 @@ #include <linux/percpu.h> #include <linux/topology.h> #include <linux/vs_base.h> +#include <linux/taskdelays.h> struct exec_domain; extern int exec_shield; @@ -267,8 +268,8 @@ struct mm_struct { struct kioctx default_kioctx; #ifdef CONFIG_CKRM_RES_MEM struct ckrm_mem_res *memclass; - struct list_head tasklist; /* list of all tasks sharing this address space */ - spinlock_t peertask_lock; /* protect above tasklist */ + struct list_head tasklist; /* tasks sharing this address space */ + spinlock_t peertask_lock; /* protect tasklist above */ #endif }; @@ -718,25 +719,25 @@ struct task_struct { struct mempolicy *mempolicy; short il_next; /* could be shared with used_math */ #endif - #ifdef CONFIG_CKRM - spinlock_t ckrm_tsklock; + spinlock_t ckrm_tsklock; void *ce_data; #ifdef CONFIG_CKRM_TYPE_TASKCLASS - // .. 
Hubertus should change to CONFIG_CKRM_TYPE_TASKCLASS struct ckrm_task_class *taskclass; - struct list_head taskclass_link; + struct list_head taskclass_link; #ifdef CONFIG_CKRM_CPU_SCHEDULE struct ckrm_cpu_class *cpu_class; - //track cpu demand of this task + /* track cpu demand of this task */ struct ckrm_cpu_demand_stat demand_stat; -#endif //CONFIG_CKRM_CPU_SCHEDULE -#endif // CONFIG_CKRM_TYPE_TASKCLASS +#endif /* CONFIG_CKRM_CPU_SCHEDULE */ +#endif /* CONFIG_CKRM_TYPE_TASKCLASS */ #ifdef CONFIG_CKRM_RES_MEM - struct list_head mm_peers; // list of tasks using same mm_struct -#endif // CONFIG_CKRM_RES_MEM -#endif // CONFIG_CKRM - struct task_delay_info delays; + struct list_head mm_peers; /* list of tasks using same mm_struct */ +#endif +#endif /* CONFIG_CKRM */ +#ifdef CONFIG_DELAY_ACCT + struct task_delay_info delays; +#endif }; static inline pid_t process_group(struct task_struct *tsk) @@ -1303,6 +1304,86 @@ extern void normalize_rt_tasks(void); #endif +/* API for registering delay info */ +#ifdef CONFIG_DELAY_ACCT + +#define test_delay_flag(tsk,flg) ((tsk)->flags & (flg)) +#define set_delay_flag(tsk,flg) ((tsk)->flags |= (flg)) +#define clear_delay_flag(tsk,flg) ((tsk)->flags &= ~(flg)) + +#define def_delay_var(var) unsigned long long var +#define get_delay(tsk,field) ((tsk)->delays.field) + +#define start_delay(var) ((var) = sched_clock()) +#define start_delay_set(var,flg) (set_delay_flag(current,flg),(var) = sched_clock()) + +#define inc_delay(tsk,field) (((tsk)->delays.field)++) + +/* Because of hardware timer drift on SMP, a task may continue on a different cpu + * than where the start_ts was taken, so there is a possibility that + * end_ts < start_ts by some usecs. In this case we ignore the diff + * and add nothing to the total. 
+ */ +#ifdef CONFIG_SMP +#define test_ts_integrity(start_ts,end_ts) (likely((end_ts) > (start_ts))) +#else +#define test_ts_integrity(start_ts,end_ts) (1) +#endif + +#define add_delay_ts(tsk,field,start_ts,end_ts) \ + do { if (test_ts_integrity(start_ts,end_ts)) (tsk)->delays.field += ((end_ts)-(start_ts)); } while (0) + +#define add_delay_clear(tsk,field,start_ts,flg) \ + do { \ + unsigned long long now = sched_clock();\ + add_delay_ts(tsk,field,start_ts,now); \ + clear_delay_flag(tsk,flg); \ + } while (0) + +static inline void add_io_delay(unsigned long long dstart) +{ + struct task_struct * tsk = current; + unsigned long long now = sched_clock(); + unsigned long long val; + + if (test_ts_integrity(dstart,now)) + val = now - dstart; + else + val = 0; + if (test_delay_flag(tsk,PF_MEMIO)) { + tsk->delays.mem_iowait_total += val; + tsk->delays.num_memwaits++; + } else { + tsk->delays.iowait_total += val; + tsk->delays.num_iowaits++; + } + clear_delay_flag(tsk,PF_IOWAIT); +} + +inline static void init_delays(struct task_struct *tsk) +{ + memset((void*)&tsk->delays,0,sizeof(tsk->delays)); +} + +#else + +#define test_delay_flag(tsk,flg) (0) +#define set_delay_flag(tsk,flg) do { } while (0) +#define clear_delay_flag(tsk,flg) do { } while (0) + +#define def_delay_var(var) +#define get_delay(tsk,field) (0) + +#define start_delay(var) do { } while (0) +#define start_delay_set(var,flg) do { } while (0) + +#define inc_delay(tsk,field) do { } while (0) +#define add_delay_ts(tsk,field,start_ts,now) do { } while (0) +#define add_delay_clear(tsk,field,start_ts,flg) do { } while (0) +#define add_io_delay(dstart) do { } while (0) +#define init_delays(tsk) do { } while (0) +#endif + #endif /* __KERNEL__ */ #endif diff --git a/include/linux/vserver/inode.h b/include/linux/vserver/inode.h index d9587f219..a1054e831 100644 --- a/include/linux/vserver/inode.h +++ b/include/linux/vserver/inode.h @@ -57,10 +57,6 @@ extern int vc_set_iattr_v0(uint32_t, void __user *); extern int 
vc_get_iattr(uint32_t, void __user *); extern int vc_set_iattr(uint32_t, void __user *); -extern int vc_iattr_ioctl(struct dentry *de, - unsigned int cmd, - unsigned long arg); - #endif /* __KERNEL__ */ /* inode ioctls */ @@ -68,9 +64,6 @@ extern int vc_iattr_ioctl(struct dentry *de, #define FIOC_GETXFLG _IOR('x', 5, long) #define FIOC_SETXFLG _IOW('x', 6, long) -#define FIOC_GETIATTR _IOR('x', 7, long) -#define FIOC_SETIATTR _IOR('x', 8, long) - #else /* _VX_INODE_H */ #warning duplicate inclusion #endif /* _VX_INODE_H */ diff --git a/init/Kconfig b/init/Kconfig index 509119525..b425cfb2c 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -202,22 +202,11 @@ config CKRM_RES_NUMTASKS depends on CKRM_TYPE_TASKCLASS default m help - Provides a Resource Controller for CKRM that allows limiting number of + Provides a Resource Controller for CKRM that allows limiting no of tasks a task class can have. Say N if unsure, Y to use the feature. -config CKRM_RES_NUMTASKS_FORKRATE - tristate "Number of Tasks Resource Manager for Fork Rate" - depends on CKRM_RES_NUMTASKS - default y - help - Provides a Resource Controller for CKRM that allows limiting the rate - of tasks a task class can fork per hour. - - Say N if unsure, Y to use the feature. 
- - config CKRM_CPU_SCHEDULE bool "CKRM CPU scheduler" depends on CKRM_TYPE_TASKCLASS diff --git a/kernel/Makefile b/kernel/Makefile index 2096afd0c..23dc38fa1 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -26,7 +26,6 @@ obj-$(CONFIG_MODULE_SIG) += module-verify-sig.o obj-$(CONFIG_KALLSYMS) += kallsyms.o obj-$(CONFIG_PM) += power/ obj-$(CONFIG_BSD_PROCESS_ACCT) += acct.o -obj-$(CONFIG_KEXEC) += kexec.o obj-$(CONFIG_COMPAT) += compat.o obj-$(CONFIG_IKCONFIG) += configs.o obj-$(CONFIG_IKCONFIG_PROC) += configs.o @@ -37,7 +36,6 @@ obj-$(CONFIG_AUDITSYSCALL) += auditsc.o obj-$(CONFIG_KPROBES) += kprobes.o obj-$(CONFIG_SYSFS) += ksysfs.o obj-$(CONFIG_GENERIC_HARDIRQS) += irq/ -obj-$(CONFIG_CRASH_DUMP) += crash.o ifneq ($(CONFIG_IA64),y) # According to Alan Modra <alan@linuxcare.com.au>, the -fno-omit-frame-pointer is diff --git a/kernel/ckrm/Makefile b/kernel/ckrm/Makefile index 0c3c98036..7ee24fb07 100644 --- a/kernel/ckrm/Makefile +++ b/kernel/ckrm/Makefile @@ -11,5 +11,5 @@ obj-$(CONFIG_CKRM_TYPE_SOCKETCLASS) += ckrm_sockc.o obj-$(CONFIG_CKRM_RES_NUMTASKS) += ckrm_numtasks.o obj-$(CONFIG_CKRM_RES_LISTENAQ) += ckrm_listenaq.o obj-$(CONFIG_CKRM_CPU_SCHEDULE) += ckrm_cpu_class.o ckrm_cpu_monitor.o -obj-$(CONFIG_CKRM_RES_MEM) += ckrm_mem.o +obj-$(CONFIG_CKRM_RES_MEM) += ckrm_memcore.o ckrm_memctlr.o obj-$(CONFIG_CKRM_RES_NULL) += ckrm_null_class.o diff --git a/kernel/ckrm/ckrm_cpu_class.c b/kernel/ckrm/ckrm_cpu_class.c index 929c22d97..301ccbb89 100644 --- a/kernel/ckrm/ckrm_cpu_class.c +++ b/kernel/ckrm/ckrm_cpu_class.c @@ -145,8 +145,6 @@ static void ckrm_free_cpu_class(void *my_res) struct ckrm_cpu_class *cls = my_res, *parres, *childres; ckrm_core_class_t *child = NULL; int maxlimit; - ckrm_lrq_t* queue; - int i; if (!cls) return; @@ -154,15 +152,6 @@ static void ckrm_free_cpu_class(void *my_res) /*the default class can't be freed*/ if (cls == get_default_cpu_class()) return; -#if 1 -#warning "ACB: Remove freed class from any classqueues [PL #4233]" - for (i 
= 0 ; i < NR_CPUS ; i++) { - queue = &cls->local_queues[i]; - if (cls_in_classqueue(&queue->classqueue_linkobj)) - classqueue_dequeue(queue->classqueue, - &queue->classqueue_linkobj); - } -#endif // Assuming there will be no children when this function is called parres = ckrm_get_cpu_class(cls->parent); diff --git a/kernel/ckrm/ckrm_cpu_monitor.c b/kernel/ckrm/ckrm_cpu_monitor.c index 5f59b375e..23f48ec02 100644 --- a/kernel/ckrm/ckrm_cpu_monitor.c +++ b/kernel/ckrm/ckrm_cpu_monitor.c @@ -841,9 +841,8 @@ static void adjust_lrq_weight(struct ckrm_cpu_class *clsptr, int cpu_online) total_pressure += lrq->lrq_load; } -#define FIX_SHARES -#ifdef FIX_SHARES -#warning "ACB: fix share initialization problem [PL #4227]" +#if 1 +#warning "ACB taking out suspicious early return" #else if (! total_pressure) return; @@ -860,10 +859,6 @@ static void adjust_lrq_weight(struct ckrm_cpu_class *clsptr, int cpu_online) /*give idle class a high share to boost interactiveness */ lw = cpu_class_weight(clsptr); else { -#ifdef FIX_SHARES - if (! total_pressure) - return; -#endif lw = lrq->lrq_load * class_weight; do_div(lw,total_pressure); if (!lw) @@ -965,11 +960,9 @@ static int thread_exit = 0; static int ckrm_cpu_monitord(void *nothing) { daemonize("ckrm_cpu_ctrld"); - current->flags |= PF_NOFREEZE; - for (;;) { /*sleep for sometime before next try*/ - set_current_state(TASK_INTERRUPTIBLE); + set_current_state(TASK_UNINTERRUPTIBLE); schedule_timeout(CPU_MONITOR_INTERVAL); ckrm_cpu_monitor(1); if (thread_exit) { diff --git a/kernel/ckrm/ckrm_mem.c b/kernel/ckrm/ckrm_mem.c deleted file mode 100644 index 736b579c7..000000000 --- a/kernel/ckrm/ckrm_mem.c +++ /dev/null @@ -1,981 +0,0 @@ -/* ckrm_mem.c - Memory Resource Manager for CKRM - * - * Copyright (C) Chandra Seetharaman, IBM Corp. 
2004 - * - * Provides a Memory Resource controller for CKRM - * - * Latest version, more details at http://ckrm.sf.net - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - */ - -#include <linux/module.h> -#include <linux/init.h> -#include <linux/slab.h> -#include <linux/list.h> -#include <linux/spinlock.h> -#include <linux/pagemap.h> -#include <linux/swap.h> -#include <linux/swapops.h> -#include <linux/cache.h> -#include <linux/percpu.h> -#include <linux/pagevec.h> -#include <linux/parser.h> -#include <linux/ckrm_mem_inline.h> - -#include <asm/uaccess.h> -#include <asm/pgtable.h> -#include <asm/errno.h> - -#define MEM_NAME "mem" - -#define CKRM_MEM_MAX_HIERARCHY 2 // allows only upto 2 levels - 0, 1 & 2 - -/* all 1-level memory_share_class are chained together */ -LIST_HEAD(ckrm_memclass_list); -LIST_HEAD(ckrm_shrink_list); -spinlock_t ckrm_mem_lock; // protects both lists above -unsigned int ckrm_tot_lru_pages; // total # of pages in the system - // currently doesn't handle memory add/remove -struct ckrm_mem_res *ckrm_mem_root_class; -atomic_t ckrm_mem_real_count = ATOMIC_INIT(0); -static void ckrm_mem_evaluate_all_pages(struct ckrm_mem_res *); -int ckrm_nr_mem_classes = 0; - -EXPORT_SYMBOL_GPL(ckrm_memclass_list); -EXPORT_SYMBOL_GPL(ckrm_shrink_list); -EXPORT_SYMBOL_GPL(ckrm_mem_lock); -EXPORT_SYMBOL_GPL(ckrm_tot_lru_pages); -EXPORT_SYMBOL_GPL(ckrm_mem_root_class); -EXPORT_SYMBOL_GPL(ckrm_mem_real_count); -EXPORT_SYMBOL_GPL(ckrm_nr_mem_classes); - -/* Initialize rescls values - * May be called on each rcfs unmount or as part of error recovery - * to make share values sane. - * Does not traverse hierarchy reinitializing children. 
- */ - -void -memclass_release(struct kref *kref) -{ - struct ckrm_mem_res *cls = container_of(kref, struct ckrm_mem_res, nr_users); - BUG_ON(ckrm_memclass_valid(cls)); - kfree(cls); -} -EXPORT_SYMBOL_GPL(memclass_release); - -static void -set_ckrm_tot_pages(void) -{ - struct zone *zone; - int tot_lru_pages = 0; - - for_each_zone(zone) { - tot_lru_pages += zone->nr_active; - tot_lru_pages += zone->nr_inactive; - tot_lru_pages += zone->free_pages; - } - ckrm_tot_lru_pages = tot_lru_pages; -} - -static void -mem_res_initcls_one(struct ckrm_mem_res *res) -{ - int zindex = 0; - struct zone *zone; - - memset(res, 0, sizeof(struct ckrm_mem_res)); - - res->shares.my_guarantee = CKRM_SHARE_DONTCARE; - res->shares.my_limit = CKRM_SHARE_DONTCARE; - res->shares.total_guarantee = CKRM_SHARE_DFLT_TOTAL_GUARANTEE; - res->shares.max_limit = CKRM_SHARE_DFLT_MAX_LIMIT; - res->shares.unused_guarantee = CKRM_SHARE_DFLT_TOTAL_GUARANTEE; - res->shares.cur_max_limit = 0; - - res->pg_guar = CKRM_SHARE_DONTCARE; - res->pg_limit = CKRM_SHARE_DONTCARE; - - INIT_LIST_HEAD(&res->shrink_list); - INIT_LIST_HEAD(&res->mcls_list); - - for_each_zone(zone) { - INIT_LIST_HEAD(&res->ckrm_zone[zindex].active_list); - INIT_LIST_HEAD(&res->ckrm_zone[zindex].inactive_list); - INIT_LIST_HEAD(&res->ckrm_zone[zindex].victim_list); - res->ckrm_zone[zindex].nr_active = 0; - res->ckrm_zone[zindex].nr_inactive = 0; - res->ckrm_zone[zindex].zone = zone; - res->ckrm_zone[zindex].memcls = res; - zindex++; - } - - res->pg_unused = 0; - res->nr_dontcare = 1; // for default class - kref_init(&res->nr_users); -} - -static void -set_impl_guar_children(struct ckrm_mem_res *parres) -{ - ckrm_core_class_t *child = NULL; - struct ckrm_mem_res *cres; - int nr_dontcare = 1; // for defaultclass - int guar, impl_guar; - int resid = mem_rcbs.resid; - - ckrm_lock_hier(parres->core); - while ((child = ckrm_get_next_child(parres->core, child)) != NULL) { - cres = ckrm_get_res_class(child, resid, struct ckrm_mem_res); - // treat 
NULL cres as don't care as that child is just being - // created. - // FIXME: need a better way to handle this case. - if (!cres || cres->pg_guar == CKRM_SHARE_DONTCARE) { - nr_dontcare++; - } - } - - parres->nr_dontcare = nr_dontcare; - guar = (parres->pg_guar == CKRM_SHARE_DONTCARE) ? - parres->impl_guar : parres->pg_unused; - impl_guar = guar / parres->nr_dontcare; - - while ((child = ckrm_get_next_child(parres->core, child)) != NULL) { - cres = ckrm_get_res_class(child, resid, struct ckrm_mem_res); - if (cres && cres->pg_guar == CKRM_SHARE_DONTCARE) { - cres->impl_guar = impl_guar; - set_impl_guar_children(cres); - } - } - ckrm_unlock_hier(parres->core); - -} - -void -check_memclass(struct ckrm_mem_res *res, char *str) -{ - int i, act = 0, inact = 0; - struct zone *zone; - struct ckrm_zone *ckrm_zone; - struct list_head *pos; - struct page *page; - -#if 0 - printk("Check<%s> %s: total=%d\n", - str, res->core->name, atomic_read(&res->pg_total)); -#endif - for (i = 0; i < MAX_NR_ZONES; i++) { - act = 0; inact = 0; - ckrm_zone = &res->ckrm_zone[i]; - zone = ckrm_zone->zone; - spin_lock_irq(&zone->lru_lock); - pos = ckrm_zone->inactive_list.next; - while (pos != &ckrm_zone->inactive_list) { - page = list_entry(pos, struct page, lru); - pos = pos->next; - inact++; - } - pos = ckrm_zone->active_list.next; - while (pos != &ckrm_zone->active_list) { - page = list_entry(pos, struct page, lru); - pos = pos->next; - act++; - } - spin_unlock_irq(&zone->lru_lock); -#if 0 - printk("Check<%s>(zone=%d): act %ld, inae %ld lact %d lina %d\n", - str, i, ckrm_zone->nr_active, ckrm_zone->nr_inactive, - act, inact); -#endif - } -} -EXPORT_SYMBOL_GPL(check_memclass); - -static void * -mem_res_alloc(struct ckrm_core_class *core, struct ckrm_core_class *parent) -{ - struct ckrm_mem_res *res, *pres; - - if (mem_rcbs.resid == -1) { - return NULL; - } - - pres = ckrm_get_res_class(parent, mem_rcbs.resid, struct ckrm_mem_res); - if (pres && (pres->hier == CKRM_MEM_MAX_HIERARCHY)) { - 
printk(KERN_ERR "MEM_RC: only allows hieararchy of %d\n", - CKRM_MEM_MAX_HIERARCHY); - return NULL; - } - - if (unlikely((parent == NULL) && (ckrm_mem_root_class != NULL))) { - printk(KERN_ERR "MEM_RC: Only one root class is allowed\n"); - return NULL; - } - - if (unlikely((parent != NULL) && (ckrm_mem_root_class == NULL))) { - printk(KERN_ERR "MEM_RC: child class with no root class!!"); - return NULL; - } - - res = kmalloc(sizeof(struct ckrm_mem_res), GFP_ATOMIC); - - if (res) { - mem_res_initcls_one(res); - res->core = core; - res->parent = parent; - spin_lock_irq(&ckrm_mem_lock); - list_add(&res->mcls_list, &ckrm_memclass_list); - spin_unlock_irq(&ckrm_mem_lock); - if (parent == NULL) { - // I am part of the root class. So, set the max to - // number of pages available - res->pg_guar = ckrm_tot_lru_pages; - res->pg_unused = ckrm_tot_lru_pages; - res->pg_limit = ckrm_tot_lru_pages; - res->hier = 0; - ckrm_mem_root_class = res; - } else { - int guar; - res->hier = pres->hier + 1; - set_impl_guar_children(pres); - guar = (pres->pg_guar == CKRM_SHARE_DONTCARE) ? - pres->impl_guar : pres->pg_unused; - res->impl_guar = guar / pres->nr_dontcare; - } - ckrm_nr_mem_classes++; - } - else - printk(KERN_ERR "MEM_RC: alloc: GFP_ATOMIC failed\n"); - return res; -} - -/* - * It is the caller's responsibility to make sure that the parent only - * has chilren that are to be accounted. i.e if a new child is added - * this function should be called after it has been added, and if a - * child is deleted this should be called after the child is removed. 
- */ -static void -child_maxlimit_changed_local(struct ckrm_mem_res *parres) -{ - int maxlimit = 0; - struct ckrm_mem_res *childres; - ckrm_core_class_t *child = NULL; - - // run thru parent's children and get the new max_limit of the parent - ckrm_lock_hier(parres->core); - while ((child = ckrm_get_next_child(parres->core, child)) != NULL) { - childres = ckrm_get_res_class(child, mem_rcbs.resid, - struct ckrm_mem_res); - if (maxlimit < childres->shares.my_limit) { - maxlimit = childres->shares.my_limit; - } - } - ckrm_unlock_hier(parres->core); - parres->shares.cur_max_limit = maxlimit; -} - -/* - * Recalculate the guarantee and limit in # of pages... and propagate the - * same to children. - * Caller is responsible for protecting res and for the integrity of parres - */ -static void -recalc_and_propagate(struct ckrm_mem_res * res, struct ckrm_mem_res * parres) -{ - ckrm_core_class_t *child = NULL; - struct ckrm_mem_res *cres; - int resid = mem_rcbs.resid; - struct ckrm_shares *self = &res->shares; - - if (parres) { - struct ckrm_shares *par = &parres->shares; - - // calculate pg_guar and pg_limit - // - if (parres->pg_guar == CKRM_SHARE_DONTCARE || - self->my_guarantee == CKRM_SHARE_DONTCARE) { - res->pg_guar = CKRM_SHARE_DONTCARE; - } else if (par->total_guarantee) { - u64 temp = (u64) self->my_guarantee * parres->pg_guar; - do_div(temp, par->total_guarantee); - res->pg_guar = (int) temp; - res->impl_guar = CKRM_SHARE_DONTCARE; - } else { - res->pg_guar = 0; - res->impl_guar = CKRM_SHARE_DONTCARE; - } - - if (parres->pg_limit == CKRM_SHARE_DONTCARE || - self->my_limit == CKRM_SHARE_DONTCARE) { - res->pg_limit = CKRM_SHARE_DONTCARE; - } else if (par->max_limit) { - u64 temp = (u64) self->my_limit * parres->pg_limit; - do_div(temp, par->max_limit); - res->pg_limit = (int) temp; - } else { - res->pg_limit = 0; - } - } - - // Calculate unused units - if (res->pg_guar == CKRM_SHARE_DONTCARE) { - res->pg_unused = CKRM_SHARE_DONTCARE; - } else if 
(self->total_guarantee) { - u64 temp = (u64) self->unused_guarantee * res->pg_guar; - do_div(temp, self->total_guarantee); - res->pg_unused = (int) temp; - } else { - res->pg_unused = 0; - } - - // propagate to children - ckrm_lock_hier(res->core); - while ((child = ckrm_get_next_child(res->core, child)) != NULL) { - cres = ckrm_get_res_class(child, resid, struct ckrm_mem_res); - recalc_and_propagate(cres, res); - } - ckrm_unlock_hier(res->core); - return; -} - -static void -mem_res_free(void *my_res) -{ - struct ckrm_mem_res *res = my_res; - struct ckrm_mem_res *pres; - - if (!res) - return; - - ckrm_mem_evaluate_all_pages(res); - - pres = ckrm_get_res_class(res->parent, mem_rcbs.resid, - struct ckrm_mem_res); - - if (pres) { - child_guarantee_changed(&pres->shares, - res->shares.my_guarantee, 0); - child_maxlimit_changed_local(pres); - recalc_and_propagate(pres, NULL); - set_impl_guar_children(pres); - } - - res->shares.my_guarantee = 0; - res->shares.my_limit = 0; - res->pg_guar = 0; - res->pg_limit = 0; - res->pg_unused = 0; - - spin_lock_irq(&ckrm_mem_lock); - list_del_init(&res->mcls_list); - spin_unlock_irq(&ckrm_mem_lock); - - res->core = NULL; - res->parent = NULL; - kref_put(&res->nr_users, memclass_release); - ckrm_nr_mem_classes--; - return; -} - -static int -mem_set_share_values(void *my_res, struct ckrm_shares *shares) -{ - struct ckrm_mem_res *res = my_res; - struct ckrm_mem_res *parres; - int rc; - - if (!res) - return -EINVAL; - - parres = ckrm_get_res_class(res->parent, mem_rcbs.resid, - struct ckrm_mem_res); - - rc = set_shares(shares, &res->shares, parres ? 
&parres->shares : NULL); - - if ((rc == 0) && (parres != NULL)) { - child_maxlimit_changed_local(parres); - recalc_and_propagate(parres, NULL); - set_impl_guar_children(parres); - } - - return rc; -} - -static int -mem_get_share_values(void *my_res, struct ckrm_shares *shares) -{ - struct ckrm_mem_res *res = my_res; - - if (!res) - return -EINVAL; - *shares = res->shares; - return 0; -} - -static int -mem_get_stats(void *my_res, struct seq_file *sfile) -{ - struct ckrm_mem_res *res = my_res; - struct zone *zone; - int active = 0, inactive = 0, fr = 0; - - if (!res) - return -EINVAL; - - seq_printf(sfile, "--------- Memory Resource stats start ---------\n"); - if (res == ckrm_mem_root_class) { - int i = 0; - for_each_zone(zone) { - active += zone->nr_active; - inactive += zone->nr_inactive; - fr += zone->free_pages; - i++; - } - seq_printf(sfile,"System: tot_pages=%d,active=%d,inactive=%d" - ",free=%d\n", ckrm_tot_lru_pages, - active, inactive, fr); - } - seq_printf(sfile, "Number of pages used(including pages lent to" - " children): %d\n", atomic_read(&res->pg_total)); - seq_printf(sfile, "Number of pages guaranteed: %d\n", - res->pg_guar); - seq_printf(sfile, "Maximum limit of pages: %d\n", - res->pg_limit); - seq_printf(sfile, "Total number of pages available" - "(after serving guarantees to children): %d\n", - res->pg_unused); - seq_printf(sfile, "Number of pages lent to children: %d\n", - res->pg_lent); - seq_printf(sfile, "Number of pages borrowed from the parent: %d\n", - res->pg_borrowed); - seq_printf(sfile, "---------- Memory Resource stats end ----------\n"); - - return 0; -} - -static void -mem_change_resclass(void *tsk, void *old, void *new) -{ - struct mm_struct *mm; - struct task_struct *task = tsk, *t1; - struct ckrm_mem_res *prev_mmcls; - - if (!task->mm || (new == old) || (old == (void *) -1)) - return; - - mm = task->active_mm; - spin_lock(&mm->peertask_lock); - prev_mmcls = mm->memclass; - - if (new == NULL) { - list_del_init(&task->mm_peers); - 
} else { - int found = 0; - list_for_each_entry(t1, &mm->tasklist, mm_peers) { - if (t1 == task) { - found++; - break; - } - } - if (!found) { - list_del_init(&task->mm_peers); - list_add_tail(&task->mm_peers, &mm->tasklist); - } - } - - spin_unlock(&mm->peertask_lock); - ckrm_mem_evaluate_mm(mm, (struct ckrm_mem_res *) new); - return; -} - -#define MEM_FAIL_OVER "fail_over" -#define MEM_SHRINK_AT "shrink_at" -#define MEM_SHRINK_TO "shrink_to" -#define MEM_SHRINK_COUNT "num_shrinks" -#define MEM_SHRINK_INTERVAL "shrink_interval" - -int ckrm_mem_fail_over = 110; -int ckrm_mem_shrink_at = 90; -static int ckrm_mem_shrink_to = 80; -static int ckrm_mem_shrink_count = 10; -static int ckrm_mem_shrink_interval = 10; - -EXPORT_SYMBOL_GPL(ckrm_mem_fail_over); -EXPORT_SYMBOL_GPL(ckrm_mem_shrink_at); - -static int -mem_show_config(void *my_res, struct seq_file *sfile) -{ - struct ckrm_mem_res *res = my_res; - - if (!res) - return -EINVAL; - - seq_printf(sfile, "res=%s,%s=%d,%s=%d,%s=%d,%s=%d,%s=%d\n", - MEM_NAME, - MEM_FAIL_OVER, ckrm_mem_fail_over, - MEM_SHRINK_AT, ckrm_mem_shrink_at, - MEM_SHRINK_TO, ckrm_mem_shrink_to, - MEM_SHRINK_COUNT, ckrm_mem_shrink_count, - MEM_SHRINK_INTERVAL, ckrm_mem_shrink_interval); - - return 0; -} - -// config file is available only at the root level, -// so assuming my_res to be the system level class -enum memclass_token { - mem_fail_over, - mem_shrink_at, - mem_shrink_to, - mem_shrink_count, - mem_shrink_interval, - mem_err -}; - -static match_table_t mem_tokens = { - {mem_fail_over, MEM_FAIL_OVER "=%d"}, - {mem_shrink_at, MEM_SHRINK_AT "=%d"}, - {mem_shrink_to, MEM_SHRINK_TO "=%d"}, - {mem_shrink_count, MEM_SHRINK_COUNT "=%d"}, - {mem_shrink_interval, MEM_SHRINK_INTERVAL "=%d"}, - {mem_err, NULL}, -}; - -static int -mem_set_config(void *my_res, const char *cfgstr) -{ - char *p; - struct ckrm_mem_res *res = my_res; - int err = 0, val; - - if (!res) - return -EINVAL; - - while ((p = strsep((char**)&cfgstr, ",")) != NULL) { - substring_t 
args[MAX_OPT_ARGS]; - int token; - if (!*p) - continue; - - token = match_token(p, mem_tokens, args); - switch (token) { - case mem_fail_over: - if (match_int(args, &val) || (val <= 0)) { - err = -EINVAL; - } else { - ckrm_mem_fail_over = val; - } - break; - case mem_shrink_at: - if (match_int(args, &val) || (val <= 0)) { - err = -EINVAL; - } else { - ckrm_mem_shrink_at = val; - } - break; - case mem_shrink_to: - if (match_int(args, &val) || (val < 0) || (val > 100)) { - err = -EINVAL; - } else { - ckrm_mem_shrink_to = val; - } - break; - case mem_shrink_count: - if (match_int(args, &val) || (val <= 0)) { - err = -EINVAL; - } else { - ckrm_mem_shrink_count = val; - } - break; - case mem_shrink_interval: - if (match_int(args, &val) || (val <= 0)) { - err = -EINVAL; - } else { - ckrm_mem_shrink_interval = val; - } - break; - default: - err = -EINVAL; - } - } - return err; -} - -static int -mem_reset_stats(void *my_res) -{ - struct ckrm_mem_res *res = my_res; - printk(KERN_INFO "MEM_RC: reset stats called for class %s\n", - res->core->name); - return 0; -} - -struct ckrm_res_ctlr mem_rcbs = { - .res_name = MEM_NAME, - .res_hdepth = CKRM_MEM_MAX_HIERARCHY, - .resid = -1, - .res_alloc = mem_res_alloc, - .res_free = mem_res_free, - .set_share_values = mem_set_share_values, - .get_share_values = mem_get_share_values, - .get_stats = mem_get_stats, - .change_resclass = mem_change_resclass, - .show_config = mem_show_config, - .set_config = mem_set_config, - .reset_stats = mem_reset_stats, -}; - -EXPORT_SYMBOL_GPL(mem_rcbs); - -int __init -init_ckrm_mem_res(void) -{ - struct ckrm_classtype *clstype; - int resid = mem_rcbs.resid; - - set_ckrm_tot_pages(); - spin_lock_init(&ckrm_mem_lock); - clstype = ckrm_find_classtype_by_name("taskclass"); - if (clstype == NULL) { - printk(KERN_INFO " Unknown ckrm classtype<taskclass>"); - return -ENOENT; - } - - if (resid == -1) { - resid = ckrm_register_res_ctlr(clstype, &mem_rcbs); - if (resid != -1) { - mem_rcbs.classtype = clstype; - } 
- } - return ((resid < 0) ? resid : 0); -} - -void __exit -exit_ckrm_mem_res(void) -{ - ckrm_unregister_res_ctlr(&mem_rcbs); - mem_rcbs.resid = -1; -} - -module_init(init_ckrm_mem_res) -module_exit(exit_ckrm_mem_res) - -int -ckrm_mem_get_shrink_to(void) -{ - return ckrm_mem_shrink_to; -} - -void -ckrm_at_limit(struct ckrm_mem_res *cls) -{ - struct zone *zone; - unsigned long now = jiffies; - - if (!cls || (cls->pg_limit == CKRM_SHARE_DONTCARE) || - ((cls->flags & MEM_AT_LIMIT) == MEM_AT_LIMIT)) { - return; - } - if ((cls->last_shrink > now) /* jiffies wrapped around */ || - (cls->last_shrink + (ckrm_mem_shrink_interval * HZ)) < now) { - cls->last_shrink = now; - cls->shrink_count = 0; - } - cls->shrink_count++; - if (cls->shrink_count > ckrm_mem_shrink_count) { - return; - } - spin_lock_irq(&ckrm_mem_lock); - list_add(&cls->shrink_list, &ckrm_shrink_list); - spin_unlock_irq(&ckrm_mem_lock); - cls->flags |= MEM_AT_LIMIT; - for_each_zone(zone) { - wakeup_kswapd(zone); - break; // only once is enough - } -} - -static int -ckrm_mem_evaluate_page_anon(struct page* page) -{ - struct ckrm_mem_res* pgcls = page_ckrmzone(page)->memcls; - struct ckrm_mem_res* maxshareclass = NULL; - struct anon_vma *anon_vma = (struct anon_vma *) page->mapping; - struct vm_area_struct *vma; - struct mm_struct* mm; - int ret = 0; - - spin_lock(&anon_vma->lock); - BUG_ON(list_empty(&anon_vma->head)); - list_for_each_entry(vma, &anon_vma->head, anon_vma_node) { - mm = vma->vm_mm; - if (!maxshareclass || ckrm_mem_share_compare(maxshareclass, - mm->memclass) < 0) { - maxshareclass = mm->memclass; - } - } - spin_unlock(&anon_vma->lock); - - if (!maxshareclass) { - maxshareclass = ckrm_mem_root_class; - } - if (pgcls != maxshareclass) { - ckrm_change_page_class(page, maxshareclass); - ret = 1; - } - return ret; -} - -static int -ckrm_mem_evaluate_page_file(struct page* page) -{ - struct ckrm_mem_res* pgcls = page_ckrmzone(page)->memcls; - struct ckrm_mem_res* maxshareclass = NULL; - struct 
address_space *mapping = page->mapping; - struct vm_area_struct *vma = NULL; - pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); - struct prio_tree_iter iter; - struct mm_struct* mm; - int ret = 0; - - if (!mapping) - return 0; - - if (!spin_trylock(&mapping->i_mmap_lock)) - return 0; - - vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, - pgoff, pgoff) { - mm = vma->vm_mm; - if (!maxshareclass || ckrm_mem_share_compare(maxshareclass, - mm->memclass)<0) - maxshareclass = mm->memclass; - } - spin_unlock(&mapping->i_mmap_lock); - - if (!maxshareclass) { - maxshareclass = ckrm_mem_root_class; - } - if (pgcls != maxshareclass) { - ckrm_change_page_class(page, maxshareclass); - ret = 1; - } - return ret; -} - -static int -ckrm_mem_evaluate_page(struct page* page) -{ - int ret = 0; - BUG_ON(page->ckrm_zone == NULL); - if (page->mapping) { - if (PageAnon(page)) - ret = ckrm_mem_evaluate_page_anon(page); - else - ret = ckrm_mem_evaluate_page_file(page); - } - return ret; -} - -static void -ckrm_mem_evaluate_all_pages(struct ckrm_mem_res* res) -{ - struct page *page; - struct ckrm_zone *ckrm_zone; - struct zone *zone; - struct list_head *pos, *next; - int i; - - check_memclass(res, "bef_eval_all_pgs"); - for (i = 0; i < MAX_NR_ZONES; i++) { - ckrm_zone = &res->ckrm_zone[i]; - zone = ckrm_zone->zone; - spin_lock_irq(&zone->lru_lock); - pos = ckrm_zone->inactive_list.next; - while (pos != &ckrm_zone->inactive_list) { - next = pos->next; - page = list_entry(pos, struct page, lru); - if (!ckrm_mem_evaluate_page(page)) - ckrm_change_page_class(page, - ckrm_mem_root_class); - pos = next; - } - pos = ckrm_zone->active_list.next; - while (pos != &ckrm_zone->active_list) { - next = pos->next; - page = list_entry(pos, struct page, lru); - if (!ckrm_mem_evaluate_page(page)) - ckrm_change_page_class(page, - ckrm_mem_root_class); - pos = next; - } - spin_unlock_irq(&zone->lru_lock); - } - check_memclass(res, "aft_eval_all_pgs"); - return; -} - -static inline int 
-class_migrate_pmd(struct mm_struct* mm, struct vm_area_struct* vma, - pmd_t* pmdir, unsigned long address, unsigned long end) -{ - pte_t *pte; - unsigned long pmd_end; - - if (pmd_none(*pmdir)) - return 0; - BUG_ON(pmd_bad(*pmdir)); - - pmd_end = (address+PMD_SIZE)&PMD_MASK; - if (end>pmd_end) - end = pmd_end; - - do { - pte = pte_offset_map(pmdir,address); - if (pte_present(*pte)) { - struct page *page = pte_page(*pte); - BUG_ON(mm->memclass == NULL); - if (page->mapping && page->ckrm_zone) { - struct zone *zone = page->ckrm_zone->zone; - spin_lock_irq(&zone->lru_lock); - ckrm_change_page_class(page, mm->memclass); - spin_unlock_irq(&zone->lru_lock); - } - } - address += PAGE_SIZE; - pte_unmap(pte); - pte++; - } while(address && (address<end)); - return 0; -} - -static inline int -class_migrate_pgd(struct mm_struct* mm, struct vm_area_struct* vma, - pgd_t* pgdir, unsigned long address, unsigned long end) -{ - pmd_t* pmd; - unsigned long pgd_end; - - if (pgd_none(*pgdir)) - return 0; - BUG_ON(pgd_bad(*pgdir)); - - pmd = pmd_offset(pgdir,address); - pgd_end = (address+PGDIR_SIZE)&PGDIR_MASK; - - if (pgd_end && (end>pgd_end)) - end = pgd_end; - - do { - class_migrate_pmd(mm,vma,pmd,address,end); - address = (address+PMD_SIZE)&PMD_MASK; - pmd++; - } while (address && (address<end)); - return 0; -} - -static inline int -class_migrate_vma(struct mm_struct* mm, struct vm_area_struct* vma) -{ - pgd_t* pgdir; - unsigned long address, end; - - address = vma->vm_start; - end = vma->vm_end; - - pgdir = pgd_offset(vma->vm_mm, address); - do { - class_migrate_pgd(mm,vma,pgdir,address,end); - address = (address + PGDIR_SIZE) & PGDIR_MASK; - pgdir++; - } while(address && (address<end)); - return 0; -} - -/* this function is called with mm->peertask_lock hold */ -void -ckrm_mem_evaluate_mm(struct mm_struct* mm, struct ckrm_mem_res *def) -{ - struct task_struct *task; - struct ckrm_mem_res *maxshareclass = def; - struct vm_area_struct *vma; - - if (list_empty(&mm->tasklist)) { - 
/* We leave the mm->memclass untouched since we believe that one - * mm with no task associated will be deleted soon or attach - * with another task later. - */ - return; - } - - list_for_each_entry(task, &mm->tasklist, mm_peers) { - struct ckrm_mem_res* cls = ckrm_get_mem_class(task); - if (!cls) - continue; - if (!maxshareclass || - ckrm_mem_share_compare(maxshareclass,cls)<0 ) - maxshareclass = cls; - } - - if (maxshareclass && (mm->memclass != maxshareclass)) { - if (mm->memclass) { - kref_put(&mm->memclass->nr_users, memclass_release); - } - mm->memclass = maxshareclass; - kref_get(&maxshareclass->nr_users); - - /* Go through all VMA to migrate pages */ - down_read(&mm->mmap_sem); - vma = mm->mmap; - while(vma) { - class_migrate_vma(mm, vma); - vma = vma->vm_next; - } - up_read(&mm->mmap_sem); - } - return; -} - -void -ckrm_init_mm_to_task(struct mm_struct * mm, struct task_struct *task) -{ - spin_lock(&mm->peertask_lock); - if (!list_empty(&task->mm_peers)) { - printk(KERN_ERR "MEM_RC: Task list NOT empty!! 
emptying...\n"); - list_del_init(&task->mm_peers); - } - list_add_tail(&task->mm_peers, &mm->tasklist); - spin_unlock(&mm->peertask_lock); - if (mm->memclass != ckrm_get_mem_class(task)) - ckrm_mem_evaluate_mm(mm, NULL); - return; -} - -int -ckrm_memclass_valid(struct ckrm_mem_res *cls) -{ - struct ckrm_mem_res *tmp; - unsigned long flags; - - if (!cls || list_empty(&cls->mcls_list)) { - return 0; - } - spin_lock_irqsave(&ckrm_mem_lock, flags); - list_for_each_entry(tmp, &ckrm_memclass_list, mcls_list) { - if (tmp == cls) { - spin_unlock(&ckrm_mem_lock); - return 1; - } - } - spin_unlock_irqrestore(&ckrm_mem_lock, flags); - return 0; -} - -MODULE_LICENSE("GPL"); diff --git a/kernel/ckrm/ckrm_numtasks.c b/kernel/ckrm/ckrm_numtasks.c index c0583055d..21d8f9b17 100644 --- a/kernel/ckrm/ckrm_numtasks.c +++ b/kernel/ckrm/ckrm_numtasks.c @@ -11,8 +11,14 @@ * */ +/* Changes + * + * 31 Mar 2004: Created + * + */ + /* - * CKRM Resource controller for tracking number of tasks in a class. + * Code Description: TBD */ #include <linux/module.h> @@ -22,43 +28,59 @@ #include <asm/div64.h> #include <linux/list.h> #include <linux/spinlock.h> +#include <linux/parser.h> #include <linux/ckrm_rc.h> #include <linux/ckrm_tc.h> #include <linux/ckrm_tsk.h> -#define TOTAL_NUM_TASKS (131072) /* 128 K */ +#define DEF_TOTAL_NUM_TASKS (131072) // 128 K +#define DEF_FORKRATE (1000000) // 1 million tasks +#define DEF_FORKRATE_INTERVAL (3600) // per hour #define NUMTASKS_DEBUG #define NUMTASKS_NAME "numtasks" - -struct ckrm_numtasks { - struct ckrm_core_class *core; /* the core i am part of... */ - struct ckrm_core_class *parent; /* parent of the core above. 
*/ +#define SYS_TOTAL_TASKS "sys_total_tasks" +#define FORKRATE "forkrate" +#define FORKRATE_INTERVAL "forkrate_interval" + +static int total_numtasks = DEF_TOTAL_NUM_TASKS; +static int total_cnt_alloc = 0; +static int forkrate = DEF_FORKRATE; +static int forkrate_interval = DEF_FORKRATE_INTERVAL; +static ckrm_core_class_t *root_core; + +typedef struct ckrm_numtasks { + struct ckrm_core_class *core; // the core i am part of... + struct ckrm_core_class *parent; // parent of the core above. struct ckrm_shares shares; - spinlock_t cnt_lock; /* always grab parent's lock before child's */ - int cnt_guarantee; /* num_tasks guarantee in local units */ - int cnt_unused; /* has to borrow if more than this is needed */ - int cnt_limit; /* no tasks over this limit. */ - atomic_t cnt_cur_alloc; /* current alloc from self */ - atomic_t cnt_borrowed; /* borrowed from the parent */ - - int over_guarantee; /* turn on/off when cur_alloc goes */ - /* over/under guarantee */ - - /* internally maintained statictics to compare with max numbers */ - int limit_failures; /* # failures as request was over the limit */ - int borrow_sucesses; /* # successful borrows */ - int borrow_failures; /* # borrow failures */ - - /* Maximum the specific statictics has reached. */ + spinlock_t cnt_lock; // always grab parent's lock before child's + int cnt_guarantee; // num_tasks guarantee in local units + int cnt_unused; // has to borrow if more than this is needed + int cnt_limit; // no tasks over this limit. + atomic_t cnt_cur_alloc; // current alloc from self + atomic_t cnt_borrowed; // borrowed from the parent + + int over_guarantee; // turn on/off when cur_alloc goes + // over/under guarantee + + // internally maintained statictics to compare with max numbers + int limit_failures; // # failures as request was over the limit + int borrow_sucesses; // # successful borrows + int borrow_failures; // # borrow failures + + // Maximum the specific statictics has reached. 
int max_limit_failures; int max_borrow_sucesses; int max_borrow_failures; - /* Total number of specific statistics */ + // Total number of specific statistics int tot_limit_failures; int tot_borrow_sucesses; int tot_borrow_failures; -}; + + // fork rate fields + int forks_in_period; + unsigned long period_start; +} ckrm_numtasks_t; struct ckrm_res_ctlr numtasks_rcbs; @@ -67,7 +89,7 @@ struct ckrm_res_ctlr numtasks_rcbs; * to make share values sane. * Does not traverse hierarchy reinitializing children. */ -static void numtasks_res_initcls_one(struct ckrm_numtasks * res) +static void numtasks_res_initcls_one(ckrm_numtasks_t * res) { res->shares.my_guarantee = CKRM_SHARE_DONTCARE; res->shares.my_limit = CKRM_SHARE_DONTCARE; @@ -94,23 +116,58 @@ static void numtasks_res_initcls_one(struct ckrm_numtasks * res) res->tot_borrow_sucesses = 0; res->tot_borrow_failures = 0; + res->forks_in_period = 0; + res->period_start = jiffies; + atomic_set(&res->cnt_cur_alloc, 0); atomic_set(&res->cnt_borrowed, 0); return; } -static int numtasks_get_ref_local(struct ckrm_core_class *core, int force) +#if 0 +static void numtasks_res_initcls(void *my_res) { - int rc, resid = numtasks_rcbs.resid; - struct ckrm_numtasks *res; + ckrm_numtasks_t *res = my_res; + + /* Write a version which propagates values all the way down + and replace rcbs callback with that version */ + +} +#endif + +static int numtasks_get_ref_local(void *arg, int force) +{ + int rc, resid = numtasks_rcbs.resid, borrowed = 0; + unsigned long now = jiffies, chg_at; + ckrm_numtasks_t *res; + ckrm_core_class_t *core = arg; if ((resid < 0) || (core == NULL)) return 1; - res = ckrm_get_res_class(core, resid, struct ckrm_numtasks); + res = ckrm_get_res_class(core, resid, ckrm_numtasks_t); if (res == NULL) return 1; + // force is not associated with fork. So, if force is specified + // we don't have to bother about forkrate. 
+ if (!force) { + // Take care of wraparound situation + chg_at = res->period_start + forkrate_interval * HZ; + if (chg_at < res->period_start) { + chg_at += forkrate_interval * HZ; + now += forkrate_interval * HZ; + } + if (chg_at <= now) { + res->period_start = now; + res->forks_in_period = 0; + } + + if (res->forks_in_period >= forkrate) { + return 0; + } + } + atomic_inc(&res->cnt_cur_alloc); rc = 1; @@ -129,76 +186,91 @@ static int numtasks_get_ref_local(struct ckrm_core_class *core, int force) res->borrow_sucesses++; res->tot_borrow_sucesses++; res->over_guarantee = 1; + borrowed++; } else { res->borrow_failures++; res->tot_borrow_failures++; } - } else + } else { rc = force; + } } else if (res->over_guarantee) { res->over_guarantee = 0; - if (res->max_limit_failures < res->limit_failures) + if (res->max_limit_failures < res->limit_failures) { res->max_limit_failures = res->limit_failures; - if (res->max_borrow_sucesses < res->borrow_sucesses) + } + if (res->max_borrow_sucesses < res->borrow_sucesses) { res->max_borrow_sucesses = res->borrow_sucesses; - if (res->max_borrow_failures < res->borrow_failures) + } + if (res->max_borrow_failures < res->borrow_failures) { res->max_borrow_failures = res->borrow_failures; + } res->limit_failures = 0; res->borrow_sucesses = 0; res->borrow_failures = 0; } - if (!rc) + if (!rc) { atomic_dec(&res->cnt_cur_alloc); + } else if (!borrowed) { + total_cnt_alloc++; + if (!force) { // force is not associated with a real fork. 
+ res->forks_in_period++; + } + } return rc; } -static void numtasks_put_ref_local(struct ckrm_core_class *core) +static void numtasks_put_ref_local(void *arg) { int resid = numtasks_rcbs.resid; - struct ckrm_numtasks *res; + ckrm_numtasks_t *res; + ckrm_core_class_t *core = arg; - if ((resid == -1) || (core == NULL)) + if ((resid == -1) || (core == NULL)) { return; + } - res = ckrm_get_res_class(core, resid, struct ckrm_numtasks); + res = ckrm_get_res_class(core, resid, ckrm_numtasks_t); if (res == NULL) return; - - if (atomic_read(&res->cnt_cur_alloc)==0) + if (unlikely(atomic_read(&res->cnt_cur_alloc) == 0)) { + printk(KERN_WARNING "numtasks_put_ref: Trying to decrement " + "counter below 0\n"); return; - + } atomic_dec(&res->cnt_cur_alloc); - if (atomic_read(&res->cnt_borrowed) > 0) { atomic_dec(&res->cnt_borrowed); numtasks_put_ref_local(res->parent); + } else { + total_cnt_alloc--; } + return; } static void *numtasks_res_alloc(struct ckrm_core_class *core, struct ckrm_core_class *parent) { - struct ckrm_numtasks *res; + ckrm_numtasks_t *res; - res = kmalloc(sizeof(struct ckrm_numtasks), GFP_ATOMIC); + res = kmalloc(sizeof(ckrm_numtasks_t), GFP_ATOMIC); if (res) { - memset(res, 0, sizeof(struct ckrm_numtasks)); + memset(res, 0, sizeof(ckrm_numtasks_t)); res->core = core; res->parent = parent; numtasks_res_initcls_one(res); res->cnt_lock = SPIN_LOCK_UNLOCKED; if (parent == NULL) { - /* - * I am part of root class. So set the max tasks - * to available default. - */ - res->cnt_guarantee = TOTAL_NUM_TASKS; - res->cnt_unused = TOTAL_NUM_TASKS; - res->cnt_limit = TOTAL_NUM_TASKS; + // I am part of root class. So set the max tasks + // to available default + res->cnt_guarantee = total_numtasks; + res->cnt_unused = total_numtasks; + res->cnt_limit = total_numtasks; + root_core = core; // store the root core. 
} try_module_get(THIS_MODULE); } else { @@ -214,36 +286,47 @@ static void *numtasks_res_alloc(struct ckrm_core_class *core, */ static void numtasks_res_free(void *my_res) { - struct ckrm_numtasks *res = my_res, *parres, *childres; - struct ckrm_core_class *child = NULL; + ckrm_numtasks_t *res = my_res, *parres, *childres; + ckrm_core_class_t *child = NULL; int i, borrowed, maxlimit, resid = numtasks_rcbs.resid; if (!res) return; - /* Assuming there will be no children when this function is called */ + // Assuming there will be no children when this function is called - parres = ckrm_get_res_class(res->parent, resid, struct ckrm_numtasks); + parres = ckrm_get_res_class(res->parent, resid, ckrm_numtasks_t); - if ((borrowed = atomic_read(&res->cnt_borrowed)) > 0) - for (i = 0; i < borrowed; i++) - numtasks_put_ref_local(parres->core); - - /* return child's limit/guarantee to parent node */ + if (unlikely(atomic_read(&res->cnt_cur_alloc) < 0)) { + printk(KERN_WARNING "numtasks_res: counter below 0\n"); + } + if (unlikely(atomic_read(&res->cnt_cur_alloc) > 0 || + atomic_read(&res->cnt_borrowed) > 0)) { + printk(KERN_WARNING "numtasks_res_free: resource still " + "alloc'd %p\n", res); + if ((borrowed = atomic_read(&res->cnt_borrowed)) > 0) { + for (i = 0; i < borrowed; i++) { + numtasks_put_ref_local(parres->core); + } + } + } + // return child's limit/guarantee to parent node spin_lock(&parres->cnt_lock); child_guarantee_changed(&parres->shares, res->shares.my_guarantee, 0); - /* run thru parent's children and get the new max_limit of the parent */ + // run thru parent's children and get the new max_limit of the parent ckrm_lock_hier(parres->core); maxlimit = 0; while ((child = ckrm_get_next_child(parres->core, child)) != NULL) { - childres = ckrm_get_res_class(child, resid, struct ckrm_numtasks); - if (maxlimit < childres->shares.my_limit) + childres = ckrm_get_res_class(child, resid, ckrm_numtasks_t); + if (maxlimit < childres->shares.my_limit) { maxlimit = 
childres->shares.my_limit; + } } ckrm_unlock_hier(parres->core); - if (parres->shares.cur_max_limit < maxlimit) + if (parres->shares.cur_max_limit < maxlimit) { parres->shares.cur_max_limit = maxlimit; + } spin_unlock(&parres->cnt_lock); kfree(res); @@ -251,63 +334,67 @@ static void numtasks_res_free(void *my_res) return; } + /* * Recalculate the guarantee and limit in real units... and propagate the * same to children. * Caller is responsible for protecting res and for the integrity of parres */ static void -recalc_and_propagate(struct ckrm_numtasks * res, struct ckrm_numtasks * parres) +recalc_and_propagate(ckrm_numtasks_t * res, ckrm_numtasks_t * parres) { - struct ckrm_core_class *child = NULL; - struct ckrm_numtasks *childres; + ckrm_core_class_t *child = NULL; + ckrm_numtasks_t *childres; int resid = numtasks_rcbs.resid; if (parres) { struct ckrm_shares *par = &parres->shares; struct ckrm_shares *self = &res->shares; - /* calculate cnt_guarantee and cnt_limit */ - if ((parres->cnt_guarantee == CKRM_SHARE_DONTCARE) || - (self->my_guarantee == CKRM_SHARE_DONTCARE)) + // calculate cnt_guarantee and cnt_limit + // + if (parres->cnt_guarantee == CKRM_SHARE_DONTCARE) { res->cnt_guarantee = CKRM_SHARE_DONTCARE; - else if (par->total_guarantee) { + } else if (par->total_guarantee) { u64 temp = (u64) self->my_guarantee * parres->cnt_guarantee; do_div(temp, par->total_guarantee); res->cnt_guarantee = (int) temp; - } else + } else { res->cnt_guarantee = 0; + } - if ((parres->cnt_limit == CKRM_SHARE_DONTCARE) || - (self->my_limit == CKRM_SHARE_DONTCARE)) + if (parres->cnt_limit == CKRM_SHARE_DONTCARE) { res->cnt_limit = CKRM_SHARE_DONTCARE; - else if (par->max_limit) { + } else if (par->max_limit) { u64 temp = (u64) self->my_limit * parres->cnt_limit; do_div(temp, par->max_limit); res->cnt_limit = (int) temp; - } else + } else { res->cnt_limit = 0; + } - /* Calculate unused units */ - if ((res->cnt_guarantee == CKRM_SHARE_DONTCARE) || - (self->my_guarantee == 
CKRM_SHARE_DONTCARE)) + // Calculate unused units + if (res->cnt_guarantee == CKRM_SHARE_DONTCARE) { res->cnt_unused = CKRM_SHARE_DONTCARE; - else if (self->total_guarantee) { + } else if (self->total_guarantee) { u64 temp = (u64) self->unused_guarantee * res->cnt_guarantee; do_div(temp, self->total_guarantee); res->cnt_unused = (int) temp; - } else + } else { res->cnt_unused = 0; + } } - - /* propagate to children */ + // propagate to children ckrm_lock_hier(res->core); while ((child = ckrm_get_next_child(res->core, child)) != NULL) { - childres = ckrm_get_res_class(child, resid, struct ckrm_numtasks); - - spin_lock(&childres->cnt_lock); - recalc_and_propagate(childres, res); - spin_unlock(&childres->cnt_lock); + childres = ckrm_get_res_class(child, resid, ckrm_numtasks_t); + if (childres) { + spin_lock(&childres->cnt_lock); + recalc_and_propagate(childres, res); + spin_unlock(&childres->cnt_lock); + } else { + printk(KERN_ERR "%s: numtasks resclass missing\n",__FUNCTION__); + } } ckrm_unlock_hier(res->core); return; @@ -315,7 +402,7 @@ recalc_and_propagate(struct ckrm_numtasks * res, struct ckrm_numtasks * parres) static int numtasks_set_share_values(void *my_res, struct ckrm_shares *new) { - struct ckrm_numtasks *parres, *res = my_res; + ckrm_numtasks_t *parres, *res = my_res; struct ckrm_shares *cur = &res->shares, *par; int rc = -EINVAL, resid = numtasks_rcbs.resid; @@ -324,7 +411,7 @@ static int numtasks_set_share_values(void *my_res, struct ckrm_shares *new) if (res->parent) { parres = - ckrm_get_res_class(res->parent, resid, struct ckrm_numtasks); + ckrm_get_res_class(res->parent, resid, ckrm_numtasks_t); spin_lock(&parres->cnt_lock); spin_lock(&res->cnt_lock); par = &parres->shares; @@ -337,26 +424,28 @@ static int numtasks_set_share_values(void *my_res, struct ckrm_shares *new) rc = set_shares(new, cur, par); if ((rc == 0) && parres) { - /* Calculate parent's unused units */ - if (parres->cnt_guarantee == CKRM_SHARE_DONTCARE) + // Calculate parent's 
unused units + if (parres->cnt_guarantee == CKRM_SHARE_DONTCARE) { parres->cnt_unused = CKRM_SHARE_DONTCARE; - else if (par->total_guarantee) { + } else if (par->total_guarantee) { u64 temp = (u64) par->unused_guarantee * parres->cnt_guarantee; do_div(temp, par->total_guarantee); parres->cnt_unused = (int) temp; - } else + } else { parres->cnt_unused = 0; + } recalc_and_propagate(res, parres); } spin_unlock(&res->cnt_lock); - if (res->parent) + if (res->parent) { spin_unlock(&parres->cnt_lock); + } return rc; } static int numtasks_get_share_values(void *my_res, struct ckrm_shares *shares) { - struct ckrm_numtasks *res = my_res; + ckrm_numtasks_t *res = my_res; if (!res) return -EINVAL; @@ -366,12 +455,12 @@ static int numtasks_get_share_values(void *my_res, struct ckrm_shares *shares) static int numtasks_get_stats(void *my_res, struct seq_file *sfile) { - struct ckrm_numtasks *res = my_res; + ckrm_numtasks_t *res = my_res; if (!res) return -EINVAL; - seq_printf(sfile, "---------Number of tasks stats start---------\n"); + seq_printf(sfile, "Number of tasks resource:\n"); seq_printf(sfile, "Total Over limit failures: %d\n", res->tot_limit_failures); seq_printf(sfile, "Total Over guarantee sucesses: %d\n", @@ -385,7 +474,6 @@ static int numtasks_get_stats(void *my_res, struct seq_file *sfile) res->max_borrow_sucesses); seq_printf(sfile, "Maximum Over guarantee failures: %d\n", res->max_borrow_failures); - seq_printf(sfile, "---------Number of tasks stats end---------\n"); #ifdef NUMTASKS_DEBUG seq_printf(sfile, "cur_alloc %d; borrowed %d; cnt_guar %d; cnt_limit %d " @@ -402,29 +490,114 @@ static int numtasks_get_stats(void *my_res, struct seq_file *sfile) static int numtasks_show_config(void *my_res, struct seq_file *sfile) { - struct ckrm_numtasks *res = my_res; + ckrm_numtasks_t *res = my_res; if (!res) return -EINVAL; - seq_printf(sfile, "res=%s,parameter=somevalue\n", NUMTASKS_NAME); + seq_printf(sfile, "res=%s,%s=%d,%s=%d,%s=%d\n", NUMTASKS_NAME, + 
SYS_TOTAL_TASKS, total_numtasks, + FORKRATE, forkrate, + FORKRATE_INTERVAL, forkrate_interval); return 0; } +enum numtasks_token_t { + numtasks_token_total, + numtasks_token_forkrate, + numtasks_token_interval, + numtasks_token_err +}; + +static match_table_t numtasks_tokens = { + {numtasks_token_total, SYS_TOTAL_TASKS "=%d"}, + {numtasks_token_forkrate, FORKRATE "=%d"}, + {numtasks_token_interval, FORKRATE_INTERVAL "=%d"}, + {numtasks_token_err, NULL}, +}; + +static void reset_forkrates(ckrm_core_class_t *parent, unsigned long now) +{ + ckrm_numtasks_t *parres; + ckrm_core_class_t *child = NULL; + + parres = ckrm_get_res_class(parent, numtasks_rcbs.resid, + ckrm_numtasks_t); + if (!parres) { + return; + } + parres->forks_in_period = 0; + parres->period_start = now; + + ckrm_lock_hier(parent); + while ((child = ckrm_get_next_child(parent, child)) != NULL) { + reset_forkrates(child, now); + } + ckrm_unlock_hier(parent); +} + static int numtasks_set_config(void *my_res, const char *cfgstr) { - struct ckrm_numtasks *res = my_res; + char *p; + ckrm_numtasks_t *res = my_res; + int new_total, fr = 0, itvl = 0, err = 0; if (!res) return -EINVAL; - printk("numtasks config='%s'\n", cfgstr); - return 0; + + while ((p = strsep((char**)&cfgstr, ",")) != NULL) { + substring_t args[MAX_OPT_ARGS]; + int token; + if (!*p) + continue; + + token = match_token(p, numtasks_tokens, args); + switch (token) { + case numtasks_token_total: + if (match_int(args, &new_total) || + (new_total < total_cnt_alloc)) { + err = -EINVAL; + } else { + total_numtasks = new_total; + + // res is the default class, as config is present only + // in that directory + spin_lock(&res->cnt_lock); + res->cnt_guarantee = total_numtasks; + res->cnt_unused = total_numtasks; + res->cnt_limit = total_numtasks; + recalc_and_propagate(res, NULL); + spin_unlock(&res->cnt_lock); + } + break; + case numtasks_token_forkrate: + if (match_int(args, &fr) || (fr <= 0)) { + err = -EINVAL; + } else { + forkrate = fr; + } + 
break; + case numtasks_token_interval: + if (match_int(args, &itvl) || (itvl <= 0)) { + err = -EINVAL; + } else { + forkrate_interval = itvl; + } + break; + default: + err = -EINVAL; + } + } + if ((fr > 0) || (itvl > 0)) { + reset_forkrates(root_core, jiffies); + } + return err; } static void numtasks_change_resclass(void *task, void *old, void *new) { - struct ckrm_numtasks *oldres = old; - struct ckrm_numtasks *newres = new; + ckrm_numtasks_t *oldres = old; + ckrm_numtasks_t *newres = new; if (oldres != (void *)-1) { struct task_struct *tsk = task; @@ -433,13 +606,13 @@ static void numtasks_change_resclass(void *task, void *old, void *new) &(tsk->parent->taskclass->core); oldres = ckrm_get_res_class(old_core, numtasks_rcbs.resid, - struct ckrm_numtasks); + ckrm_numtasks_t); } - if (oldres) - numtasks_put_ref_local(oldres->core); + numtasks_put_ref_local(oldres->core); } - if (newres) + if (newres) { (void)numtasks_get_ref_local(newres->core, 1); + } } struct ckrm_res_ctlr numtasks_rcbs = { @@ -469,7 +642,7 @@ int __init init_ckrm_numtasks_res(void) if (resid == -1) { resid = ckrm_register_res_ctlr(clstype, &numtasks_rcbs); - printk("........init_ckrm_numtasks_res -> %d\n", resid); + printk(KERN_DEBUG "........init_ckrm_numtasks_res -> %d\n", resid); if (resid != -1) { ckrm_numtasks_register(numtasks_get_ref_local, numtasks_put_ref_local); @@ -481,13 +654,14 @@ int __init init_ckrm_numtasks_res(void) void __exit exit_ckrm_numtasks_res(void) { - if (numtasks_rcbs.resid != -1) + if (numtasks_rcbs.resid != -1) { ckrm_numtasks_register(NULL, NULL); + } ckrm_unregister_res_ctlr(&numtasks_rcbs); numtasks_rcbs.resid = -1; } module_init(init_ckrm_numtasks_res) -module_exit(exit_ckrm_numtasks_res) + module_exit(exit_ckrm_numtasks_res) -MODULE_LICENSE("GPL"); + MODULE_LICENSE("GPL"); diff --git a/kernel/ckrm/ckrm_numtasks_stub.c b/kernel/ckrm/ckrm_numtasks_stub.c index d9f15c98b..179e6b5d6 100644 --- a/kernel/ckrm/ckrm_numtasks_stub.c +++ b/kernel/ckrm/ckrm_numtasks_stub.c 
@@ -11,6 +11,12 @@ * */ +/* Changes + * + * 16 May 2004: Created + * + */ + #include <linux/spinlock.h> #include <linux/module.h> #include <linux/ckrm_tsk.h> @@ -28,7 +34,7 @@ void ckrm_numtasks_register(get_ref_t gr, put_ref_t pr) spin_unlock(&stub_lock); } -int numtasks_get_ref(struct ckrm_core_class *arg, int force) +int numtasks_get_ref(void *arg, int force) { int ret = 1; spin_lock(&stub_lock); @@ -39,7 +45,7 @@ int numtasks_get_ref(struct ckrm_core_class *arg, int force) return ret; } -void numtasks_put_ref(struct ckrm_core_class *arg) +void numtasks_put_ref(void *arg) { spin_lock(&stub_lock); if (real_put_ref) { diff --git a/kernel/crash.c b/kernel/crash.c deleted file mode 100644 index 885def927..000000000 --- a/kernel/crash.c +++ /dev/null @@ -1,117 +0,0 @@ -/* - * kernel/crash.c - Memory preserving reboot related code. - * - * Created by: Hariprasad Nellitheertha (hari@in.ibm.com) - * Copyright (C) IBM Corporation, 2004. All rights reserved - */ - -#include <linux/smp_lock.h> -#include <linux/kexec.h> -#include <linux/errno.h> -#include <linux/proc_fs.h> -#include <linux/bootmem.h> -#include <linux/highmem.h> -#include <linux/crash_dump.h> - -#include <asm/io.h> -#include <asm/uaccess.h> - -#ifdef CONFIG_PROC_FS -/* - * Enable kexec reboot upon panic; for dumping - */ -static ssize_t write_crash_dump_on(struct file *file, const char __user *buf, - size_t count, loff_t *ppos) -{ - if (count) { - if (get_user(crash_dump_on, buf)) - return -EFAULT; - } - return count; -} - -static struct file_operations proc_crash_dump_on_operations = { - .write = write_crash_dump_on, -}; - -extern struct file_operations proc_vmcore_operations; -extern struct proc_dir_entry *proc_vmcore; - -void crash_enable_by_proc(void) -{ - struct proc_dir_entry *entry; - - entry = create_proc_entry("kexec-dump", S_IWUSR, NULL); - if (entry) - entry->proc_fops = &proc_crash_dump_on_operations; -} - -void crash_create_proc_entry(void) -{ - if (dump_enabled) { - proc_vmcore = 
create_proc_entry("vmcore", S_IRUSR, NULL); - if (proc_vmcore) { - proc_vmcore->proc_fops = &proc_vmcore_operations; - proc_vmcore->size = - (size_t)(saved_max_pfn << PAGE_SHIFT); - } - } -} - -#endif /* CONFIG_PROC_FS */ - -void __crash_machine_kexec(void) -{ - struct kimage *image; - - if ((!crash_dump_on) || (crashed)) - return; - - image = xchg(&kexec_crash_image, 0); - if (image) { - crashed = 1; - printk(KERN_EMERG "kexec: opening parachute\n"); - crash_dump_stop_cpus(); - crash_dump_save_registers(); - - /* If we are here to do a crash dump, save the memory from - * 0-640k before we copy over the kexec kernel image. Otherwise - * our dump will show the wrong kernel entirely. - */ - crash_relocate_mem(); - - machine_kexec(image); - } else { - printk(KERN_EMERG "kexec: No kernel image loaded!\n"); - } -} - -/* - * Copy a page from "oldmem". For this page, there is no pte mapped - * in the current kernel. We stitch up a pte, similar to kmap_atomic. - */ -ssize_t copy_oldmem_page(unsigned long pfn, char *buf, - size_t csize, int userbuf) -{ - void *page, *vaddr; - - if (!csize) - return 0; - - page = kmalloc(PAGE_SIZE, GFP_KERNEL); - - vaddr = kmap_atomic_pfn(pfn, KM_PTE0); - copy_page(page, vaddr); - kunmap_atomic(vaddr, KM_PTE0); - - if (userbuf) { - if (copy_to_user(buf, page, csize)) { - kfree(page); - return -EFAULT; - } - } else - memcpy(buf, page, csize); - kfree(page); - - return 0; -} diff --git a/kernel/exit.c b/kernel/exit.c index 8ca3c1711..0d55d3842 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -514,7 +514,7 @@ static inline void __exit_mm(struct task_struct * tsk) task_lock(tsk); tsk->mm = NULL; up_read(&mm->mmap_sem); - ckrm_task_clear_mm(tsk, mm); + ckrm_task_mm_clear(tsk, mm); enter_lazy_tlb(mm, current); task_unlock(tsk); mmput(mm); diff --git a/kernel/fork.c b/kernel/fork.c index 1902e9d2e..20e10311f 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -41,6 +41,7 @@ #include <linux/rmap.h> #include <linux/ckrm_events.h> #include 
<linux/ckrm_tsk.h> +#include <linux/ckrm_tc.h> #include <linux/ckrm_mem_inline.h> #include <linux/vs_network.h> #include <linux/vs_limit.h> @@ -309,7 +310,7 @@ static struct mm_struct * mm_init(struct mm_struct * mm) mm->ioctx_list = NULL; mm->default_kioctx = (struct kioctx)INIT_KIOCTX(mm->default_kioctx, *mm); mm->free_area_cache = TASK_UNMAPPED_BASE; - ckrm_mm_init(mm); + ckrm_mm_init(mm); if (likely(!mm_alloc_pgd(mm))) { mm->def_flags = 0; @@ -489,7 +490,8 @@ good_mm: ckrm_mm_setclass(mm, oldmm->memclass); tsk->mm = mm; tsk->active_mm = mm; - ckrm_init_mm_to_task(mm, tsk); + ckrm_mm_setclass(mm, oldmm->memclass); + ckrm_task_mm_set(mm, tsk); return 0; free_pt: diff --git a/kernel/kexec.c b/kernel/kexec.c deleted file mode 100644 index e83887511..000000000 --- a/kernel/kexec.c +++ /dev/null @@ -1,637 +0,0 @@ -/* - * kexec.c - kexec system call - * Copyright (C) 2002-2004 Eric Biederman <ebiederm@xmission.com> - * - * This source code is licensed under the GNU General Public License, - * Version 2. See the file COPYING for more details. - */ - -#include <linux/mm.h> -#include <linux/file.h> -#include <linux/slab.h> -#include <linux/fs.h> -#include <linux/kexec.h> -#include <linux/spinlock.h> -#include <linux/list.h> -#include <linux/highmem.h> -#include <net/checksum.h> -#include <asm/page.h> -#include <asm/uaccess.h> -#include <asm/io.h> -#include <asm/system.h> - -/* - * When kexec transitions to the new kernel there is a one-to-one - * mapping between physical and virtual addresses. On processors - * where you can disable the MMU this is trivial, and easy. For - * others it is still a simple predictable page table to setup. - * - * In that environment kexec copies the new kernel to its final - * resting place. This means I can only support memory whose - * physical address can fit in an unsigned long. In particular - * addresses where (pfn << PAGE_SHIFT) > ULONG_MAX cannot be handled. 
- * If the assembly stub has more restrictive requirements - * KEXEC_SOURCE_MEMORY_LIMIT and KEXEC_DEST_MEMORY_LIMIT can be - * defined more restrictively in <asm/kexec.h>. - * - * The code for the transition from the current kernel to the - * the new kernel is placed in the control_code_buffer, whose size - * is given by KEXEC_CONTROL_CODE_SIZE. In the best case only a single - * page of memory is necessary, but some architectures require more. - * Because this memory must be identity mapped in the transition from - * virtual to physical addresses it must live in the range - * 0 - TASK_SIZE, as only the user space mappings are arbitrarily - * modifiable. - * - * The assembly stub in the control code buffer is passed a linked list - * of descriptor pages detailing the source pages of the new kernel, - * and the destination addresses of those source pages. As this data - * structure is not used in the context of the current OS, it must - * be self-contained. - * - * The code has been made to work with highmem pages and will use a - * destination page in its final resting place (if it happens - * to allocate it). The end product of this is that most of the - * physical address space, and most of RAM can be used. - * - * Future directions include: - * - allocating a page table with the control code buffer identity - * mapped, to simplify machine_kexec and make kexec_on_panic more - * reliable. - */ - -/* - * KIMAGE_NO_DEST is an impossible destination address..., for - * allocating pages whose destination address we do not care about. 
- */ -#define KIMAGE_NO_DEST (-1UL) - -static int kimage_is_destination_range( - struct kimage *image, unsigned long start, unsigned long end); -static struct page *kimage_alloc_page(struct kimage *image, unsigned int gfp_mask, unsigned long dest); - - -static int kimage_alloc(struct kimage **rimage, - unsigned long nr_segments, struct kexec_segment *segments) -{ - int result; - struct kimage *image; - size_t segment_bytes; - unsigned long i; - - /* Allocate a controlling structure */ - result = -ENOMEM; - image = kmalloc(sizeof(*image), GFP_KERNEL); - if (!image) { - goto out; - } - memset(image, 0, sizeof(*image)); - image->head = 0; - image->entry = &image->head; - image->last_entry = &image->head; - - /* Initialize the list of control pages */ - INIT_LIST_HEAD(&image->control_pages); - - /* Initialize the list of destination pages */ - INIT_LIST_HEAD(&image->dest_pages); - - /* Initialize the list of unuseable pages */ - INIT_LIST_HEAD(&image->unuseable_pages); - - /* Read in the segments */ - image->nr_segments = nr_segments; - segment_bytes = nr_segments * sizeof*segments; - result = copy_from_user(image->segment, segments, segment_bytes); - if (result) - goto out; - - /* - * Verify we have good destination addresses. The caller is - * responsible for making certain we don't attempt to load - * the new image into invalid or reserved areas of RAM. This - * just verifies it is an address we can use. - */ - result = -EADDRNOTAVAIL; - for (i = 0; i < nr_segments; i++) { - unsigned long mend; - mend = ((unsigned long)(image->segment[i].mem)) + - image->segment[i].memsz; - if (mend >= KEXEC_DESTINATION_MEMORY_LIMIT) - goto out; - } - - /* - * Find a location for the control code buffer, and add it - * the vector of segments so that it's pages will also be - * counted as destination pages. 
- */ - result = -ENOMEM; - image->control_code_page = kimage_alloc_control_pages(image, - get_order(KEXEC_CONTROL_CODE_SIZE)); - if (!image->control_code_page) { - printk(KERN_ERR "Could not allocate control_code_buffer\n"); - goto out; - } - - result = 0; - out: - if (result == 0) { - *rimage = image; - } else { - kfree(image); - } - return result; -} - -static int kimage_is_destination_range( - struct kimage *image, unsigned long start, unsigned long end) -{ - unsigned long i; - - for (i = 0; i < image->nr_segments; i++) { - unsigned long mstart, mend; - mstart = (unsigned long)image->segment[i].mem; - mend = mstart + image->segment[i].memsz; - if ((end > mstart) && (start < mend)) { - return 1; - } - } - return 0; -} - -static struct page *kimage_alloc_pages(unsigned int gfp_mask, unsigned int order) -{ - struct page *pages; - pages = alloc_pages(gfp_mask, order); - if (pages) { - unsigned int count, i; - pages->mapping = NULL; - pages->private = order; - count = 1 << order; - for(i = 0; i < count; i++) { - SetPageReserved(pages + i); - } - } - return pages; -} - -static void kimage_free_pages(struct page *page) -{ - unsigned int order, count, i; - order = page->private; - count = 1 << order; - for(i = 0; i < count; i++) { - ClearPageReserved(page + i); - } - __free_pages(page, order); -} - -static void kimage_free_page_list(struct list_head *list) -{ - struct list_head *pos, *next; - list_for_each_safe(pos, next, list) { - struct page *page; - - page = list_entry(pos, struct page, lru); - list_del(&page->lru); - - kimage_free_pages(page); - } -} - -struct page *kimage_alloc_control_pages(struct kimage *image, unsigned int order) -{ - /* Control pages are special, they are the intermediaries - * that are needed while we copy the rest of the pages - * to their final resting place. As such they must - * not conflict with either the destination addresses - * or memory the kernel is already using. 
- * - * The only case where we really need more than one of - * these are for architectures where we cannot disable - * the MMU and must instead generate an identity mapped - * page table for all of the memory. - * - * At worst this runs in O(N) of the image size. - */ - struct list_head extra_pages; - struct page *pages; - unsigned int count; - - count = 1 << order; - INIT_LIST_HEAD(&extra_pages); - - /* Loop while I can allocate a page and the page allocated - * is a destination page. - */ - do { - unsigned long pfn, epfn, addr, eaddr; - pages = kimage_alloc_pages(GFP_KERNEL, order); - if (!pages) - break; - pfn = page_to_pfn(pages); - epfn = pfn + count; - addr = pfn << PAGE_SHIFT; - eaddr = epfn << PAGE_SHIFT; - if ((epfn >= (KEXEC_CONTROL_MEMORY_LIMIT >> PAGE_SHIFT)) || - kimage_is_destination_range(image, addr, eaddr)) - { - list_add(&pages->lru, &extra_pages); - pages = NULL; - } - } while(!pages); - if (pages) { - /* Remember the allocated page... */ - list_add(&pages->lru, &image->control_pages); - - /* Because the page is already in it's destination - * location we will never allocate another page at - * that address. Therefore kimage_alloc_pages - * will not return it (again) and we don't need - * to give it an entry in image->segment[]. - */ - } - /* Deal with the destination pages I have inadvertently allocated. - * - * Ideally I would convert multi-page allocations into single - * page allocations, and add everyting to image->dest_pages. - * - * For now it is simpler to just free the pages. 
- */ - kimage_free_page_list(&extra_pages); - return pages; - -} - -static int kimage_add_entry(struct kimage *image, kimage_entry_t entry) -{ - if (*image->entry != 0) { - image->entry++; - } - if (image->entry == image->last_entry) { - kimage_entry_t *ind_page; - struct page *page; - page = kimage_alloc_page(image, GFP_KERNEL, KIMAGE_NO_DEST); - if (!page) { - return -ENOMEM; - } - ind_page = page_address(page); - *image->entry = virt_to_phys(ind_page) | IND_INDIRECTION; - image->entry = ind_page; - image->last_entry = - ind_page + ((PAGE_SIZE/sizeof(kimage_entry_t)) - 1); - } - *image->entry = entry; - image->entry++; - *image->entry = 0; - return 0; -} - -static int kimage_set_destination( - struct kimage *image, unsigned long destination) -{ - int result; - - destination &= PAGE_MASK; - result = kimage_add_entry(image, destination | IND_DESTINATION); - if (result == 0) { - image->destination = destination; - } - return result; -} - - -static int kimage_add_page(struct kimage *image, unsigned long page) -{ - int result; - - page &= PAGE_MASK; - result = kimage_add_entry(image, page | IND_SOURCE); - if (result == 0) { - image->destination += PAGE_SIZE; - } - return result; -} - - -static void kimage_free_extra_pages(struct kimage *image) -{ - /* Walk through and free any extra destination pages I may have */ - kimage_free_page_list(&image->dest_pages); - - /* Walk through and free any unuseable pages I have cached */ - kimage_free_page_list(&image->unuseable_pages); - -} -static int kimage_terminate(struct kimage *image) -{ - int result; - - result = kimage_add_entry(image, IND_DONE); - if (result == 0) { - /* Point at the terminating element */ - image->entry--; - kimage_free_extra_pages(image); - } - return result; -} - -#define for_each_kimage_entry(image, ptr, entry) \ - for (ptr = &image->head; (entry = *ptr) && !(entry & IND_DONE); \ - ptr = (entry & IND_INDIRECTION)? 
\ - phys_to_virt((entry & PAGE_MASK)): ptr +1) - -static void kimage_free_entry(kimage_entry_t entry) -{ - struct page *page; - - page = pfn_to_page(entry >> PAGE_SHIFT); - kimage_free_pages(page); -} - -static void kimage_free(struct kimage *image) -{ - kimage_entry_t *ptr, entry; - kimage_entry_t ind = 0; - - if (!image) - return; - kimage_free_extra_pages(image); - for_each_kimage_entry(image, ptr, entry) { - if (entry & IND_INDIRECTION) { - /* Free the previous indirection page */ - if (ind & IND_INDIRECTION) { - kimage_free_entry(ind); - } - /* Save this indirection page until we are - * done with it. - */ - ind = entry; - } - else if (entry & IND_SOURCE) { - kimage_free_entry(entry); - } - } - /* Free the final indirection page */ - if (ind & IND_INDIRECTION) { - kimage_free_entry(ind); - } - - /* Handle any machine specific cleanup */ - machine_kexec_cleanup(image); - - /* Free the kexec control pages... */ - kimage_free_page_list(&image->control_pages); - kfree(image); -} - -static kimage_entry_t *kimage_dst_used(struct kimage *image, unsigned long page) -{ - kimage_entry_t *ptr, entry; - unsigned long destination = 0; - - for_each_kimage_entry(image, ptr, entry) { - if (entry & IND_DESTINATION) { - destination = entry & PAGE_MASK; - } - else if (entry & IND_SOURCE) { - if (page == destination) { - return ptr; - } - destination += PAGE_SIZE; - } - } - return 0; -} - -static struct page *kimage_alloc_page(struct kimage *image, unsigned int gfp_mask, unsigned long destination) -{ - /* - * Here we implement safeguards to ensure that a source page - * is not copied to its destination page before the data on - * the destination page is no longer useful. - * - * To do this we maintain the invariant that a source page is - * either its own destination page, or it is not a - * destination page at all. - * - * That is slightly stronger than required, but the proof - * that no problems will not occur is trivial, and the - * implementation is simply to verify. 
- * - * When allocating all pages normally this algorithm will run - * in O(N) time, but in the worst case it will run in O(N^2) - * time. If the runtime is a problem the data structures can - * be fixed. - */ - struct page *page; - unsigned long addr; - - /* - * Walk through the list of destination pages, and see if I - * have a match. - */ - list_for_each_entry(page, &image->dest_pages, lru) { - addr = page_to_pfn(page) << PAGE_SHIFT; - if (addr == destination) { - list_del(&page->lru); - return page; - } - } - page = NULL; - while (1) { - kimage_entry_t *old; - - /* Allocate a page, if we run out of memory give up */ - page = kimage_alloc_pages(gfp_mask, 0); - if (!page) { - return 0; - } - /* If the page cannot be used file it away */ - if (page_to_pfn(page) > (KEXEC_SOURCE_MEMORY_LIMIT >> PAGE_SHIFT)) { - list_add(&page->lru, &image->unuseable_pages); - continue; - } - addr = page_to_pfn(page) << PAGE_SHIFT; - - /* If it is the destination page we want use it */ - if (addr == destination) - break; - - /* If the page is not a destination page use it */ - if (!kimage_is_destination_range(image, addr, addr + PAGE_SIZE)) - break; - - /* - * I know that the page is someones destination page. - * See if there is already a source page for this - * destination page. And if so swap the source pages. - */ - old = kimage_dst_used(image, addr); - if (old) { - /* If so move it */ - unsigned long old_addr; - struct page *old_page; - - old_addr = *old & PAGE_MASK; - old_page = pfn_to_page(old_addr >> PAGE_SHIFT); - copy_highpage(page, old_page); - *old = addr | (*old & ~PAGE_MASK); - - /* The old page I have found cannot be a - * destination page, so return it. - */ - addr = old_addr; - page = old_page; - break; - } - else { - /* Place the page on the destination list I - * will use it later. 
- */ - list_add(&page->lru, &image->dest_pages); - } - } - return page; -} - -static int kimage_load_segment(struct kimage *image, - struct kexec_segment *segment) -{ - unsigned long mstart; - int result; - unsigned long offset; - unsigned long offset_end; - unsigned char *buf; - - result = 0; - buf = segment->buf; - mstart = (unsigned long)segment->mem; - - offset_end = segment->memsz; - - result = kimage_set_destination(image, mstart); - if (result < 0) { - goto out; - } - for (offset = 0; offset < segment->memsz; offset += PAGE_SIZE) { - struct page *page; - char *ptr; - size_t size, leader; - page = kimage_alloc_page(image, GFP_HIGHUSER, mstart + offset); - if (page == 0) { - result = -ENOMEM; - goto out; - } - result = kimage_add_page(image, page_to_pfn(page) << PAGE_SHIFT); - if (result < 0) { - goto out; - } - ptr = kmap(page); - if (segment->bufsz < offset) { - /* We are past the end zero the whole page */ - memset(ptr, 0, PAGE_SIZE); - kunmap(page); - continue; - } - size = PAGE_SIZE; - leader = 0; - if ((offset == 0)) { - leader = mstart & ~PAGE_MASK; - } - if (leader) { - /* We are on the first page zero the unused portion */ - memset(ptr, 0, leader); - size -= leader; - ptr += leader; - } - if (size > (segment->bufsz - offset)) { - size = segment->bufsz - offset; - } - if (size < (PAGE_SIZE - leader)) { - /* zero the trailing part of the page */ - memset(ptr + size, 0, (PAGE_SIZE - leader) - size); - } - result = copy_from_user(ptr, buf + offset, size); - kunmap(page); - if (result) { - result = (result < 0) ? result : -EIO; - goto out; - } - } - out: - return result; -} - -/* - * Exec Kernel system call: for obvious reasons only root may call it. - * - * This call breaks up into three pieces. - * - A generic part which loads the new kernel from the current - * address space, and very carefully places the data in the - * allocated pages. - * - * - A generic part that interacts with the kernel and tells all of - * the devices to shut down. 
Preventing on-going dmas, and placing - * the devices in a consistent state so a later kernel can - * reinitialize them. - * - * - A machine specific part that includes the syscall number - * and the copies the image to it's final destination. And - * jumps into the image at entry. - * - * kexec does not sync, or unmount filesystems so if you need - * that to happen you need to do that yourself. - */ -struct kimage *kexec_image = NULL; -struct kimage *kexec_crash_image = NULL; - -asmlinkage long sys_kexec_load(unsigned long entry, unsigned long nr_segments, - struct kexec_segment *segments, unsigned long flags) -{ - struct kimage *image; - int result; - - /* We only trust the superuser with rebooting the system. */ - if (!capable(CAP_SYS_BOOT)) - return -EPERM; - - if (nr_segments > KEXEC_SEGMENT_MAX) - return -EINVAL; - - image = NULL; - result = 0; - - if (nr_segments > 0) { - unsigned long i; - result = kimage_alloc(&image, nr_segments, segments); - if (result) { - goto out; - } - result = machine_kexec_prepare(image); - if (result) { - goto out; - } - image->start = entry; - for (i = 0; i < nr_segments; i++) { - result = kimage_load_segment(image, &image->segment[i]); - if (result) { - goto out; - } - } - result = kimage_terminate(image); - if (result) { - goto out; - } - } - - if (!flags) - image = xchg(&kexec_image, image); - else - image = xchg(&kexec_crash_image, image); - - out: - kimage_free(image); - return result; -} diff --git a/kernel/panic.c b/kernel/panic.c index 3fea0f21a..2bdd2cf25 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -18,14 +18,13 @@ #include <linux/sysrq.h> #include <linux/interrupt.h> #include <linux/nmi.h> +#ifdef CONFIG_KEXEC #include <linux/kexec.h> -#include <linux/crash_dump.h> +#endif int panic_timeout = 900; int panic_on_oops = 1; int tainted; -unsigned int crashed; -int crash_dump_on; void (*dump_function_ptr)(const char *, const struct pt_regs *) = 0; EXPORT_SYMBOL(panic_timeout); @@ -79,9 +78,6 @@ NORET_TYPE void 
panic(const char * fmt, ...) BUG(); bust_spinlocks(0); - /* If we have crashed, perform a kexec reboot, for dump write-out */ - crash_machine_kexec(); - notifier_call_chain(&panic_notifier_list, 0, buf); #ifdef CONFIG_SMP diff --git a/kernel/sys.c b/kernel/sys.c index 85a448959..cbdc01971 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -17,8 +17,6 @@ #include <linux/init.h> #include <linux/highuid.h> #include <linux/fs.h> -#include <linux/kernel.h> -#include <linux/kexec.h> #include <linux/workqueue.h> #include <linux/device.h> #include <linux/key.h> diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c index 41a327ddd..9f36b40ca 100644 --- a/kernel/sys_ni.c +++ b/kernel/sys_ni.c @@ -18,7 +18,6 @@ cond_syscall(sys_acct) cond_syscall(sys_lookup_dcookie) cond_syscall(sys_swapon) cond_syscall(sys_swapoff) -cond_syscall(sys_kexec_load) cond_syscall(sys_init_module) cond_syscall(sys_delete_module) cond_syscall(sys_socketpair) diff --git a/kernel/vserver/inode.c b/kernel/vserver/inode.c index ca16e0cd4..8fdd30c62 100644 --- a/kernel/vserver/inode.c +++ b/kernel/vserver/inode.c @@ -12,7 +12,6 @@ #include <linux/config.h> #include <linux/sched.h> #include <linux/vs_context.h> -#include <linux/fs.h> #include <linux/proc_fs.h> #include <linux/devpts_fs.h> #include <linux/namei.h> @@ -189,37 +188,6 @@ int vc_set_iattr(uint32_t id, void __user *data) return ret; } -int vc_iattr_ioctl(struct dentry *de, unsigned int cmd, unsigned long arg) -{ - void __user *data = (void __user *)arg; - struct vcmd_ctx_iattr_v1 vc_data; - int ret; - - /* - * I don't think we need any dget/dput pairs in here as long as - * this function is always called from sys_ioctl i.e., de is - * a field of a struct file that is guaranteed not to be freed. 
- */ - if (cmd == FIOC_SETIATTR) { - if (!capable(CAP_SYS_ADMIN) || !capable(CAP_LINUX_IMMUTABLE)) - return -EPERM; - if (copy_from_user (&vc_data, data, sizeof(vc_data))) - return -EFAULT; - ret = __vc_set_iattr(de, - &vc_data.xid, &vc_data.flags, &vc_data.mask); - } - else { - if (!vx_check(0, VX_ADMIN)) - return -ENOSYS; - ret = __vc_get_iattr(de->d_inode, - &vc_data.xid, &vc_data.flags, &vc_data.mask); - } - - if (!ret && copy_to_user (data, &vc_data, sizeof(vc_data))) - ret = -EFAULT; - return ret; -} - #ifdef CONFIG_VSERVER_LEGACY diff --git a/mm/bootmem.c b/mm/bootmem.c index da73a9de4..8d7ff9bf6 100644 --- a/mm/bootmem.c +++ b/mm/bootmem.c @@ -28,11 +28,6 @@ unsigned long max_low_pfn; unsigned long min_low_pfn; EXPORT_SYMBOL(min_low_pfn); unsigned long max_pfn; -/* - * If we have booted due to a crash, max_pfn will be a very low value. We need - * to know the amount of memory that the previous kernel used. - */ -unsigned long saved_max_pfn; EXPORT_SYMBOL(max_pfn); /* This is exported so * dma_get_required_mask(), which uses diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 8c206e407..d484a5d11 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -35,6 +35,7 @@ #include <linux/vs_base.h> #include <linux/vs_limit.h> #include <linux/nodemask.h> +#include <linux/ckrm_mem_inline.h> #include <asm/tlbflush.h> @@ -50,7 +51,7 @@ int sysctl_lower_zone_protection = 0; EXPORT_SYMBOL(totalram_pages); EXPORT_SYMBOL(nr_swap_pages); -#ifdef CONFIG_CRASH_DUMP +#ifdef CONFIG_CRASH_DUMP_MODULE /* This symbol has to be exported to use 'for_each_pgdat' macro by modules. 
*/ EXPORT_SYMBOL(pgdat_list); #endif @@ -105,7 +106,8 @@ static void bad_page(const char *function, struct page *page) tainted |= TAINT_BAD_PAGE; } -#if !defined(CONFIG_HUGETLB_PAGE) && !defined(CONFIG_CRASH_DUMP) +#if !defined(CONFIG_HUGETLB_PAGE) && !defined(CONFIG_CRASH_DUMP) \ + && !defined(CONFIG_CRASH_DUMP_MODULE) #define prep_compound_page(page, order) do { } while (0) #define destroy_compound_page(page, order) do { } while (0) #else @@ -275,7 +277,7 @@ free_pages_bulk(struct zone *zone, int count, /* have to delete it as __free_pages_bulk list manipulates */ list_del(&page->lru); __free_pages_bulk(page, base, zone, area, order); - ckrm_clear_page_class(page); + ckrm_clear_page_class(page); ret++; } spin_unlock_irqrestore(&zone->lock, flags); @@ -371,9 +373,7 @@ static void prep_new_page(struct page *page, int order) #endif 1 << PG_checked | 1 << PG_mappedtodisk); page->private = 0; -#ifdef CONFIG_CKRM_RES_MEM - page->ckrm_zone = NULL; -#endif + ckrm_page_init(page); set_page_refs(page, order); } @@ -636,9 +636,8 @@ __alloc_pages(unsigned int gfp_mask, unsigned int order, */ can_try_harder = (unlikely(rt_task(p)) && !in_interrupt()) || !wait; - if (!ckrm_class_limit_ok((ckrm_get_mem_class(current)))) { + if (!in_interrupt() && !ckrm_class_limit_ok(ckrm_get_mem_class(p))) return NULL; - } zones = zonelist->zones; /* the list of zones suitable for gfp_mask */ @@ -1573,10 +1572,7 @@ static void __init free_area_init_core(struct pglist_data *pgdat, } printk(KERN_DEBUG " %s zone: %lu pages, LIFO batch:%lu\n", zone_names[j], realsize, batch); -#ifndef CONFIG_CKRM_RES_MEM - INIT_LIST_HEAD(&zone->active_list); - INIT_LIST_HEAD(&zone->inactive_list); -#endif + ckrm_init_lists(zone); zone->nr_scan_active = 0; zone->nr_scan_inactive = 0; zone->nr_active = 0; diff --git a/mm/swap.c b/mm/swap.c index a7eb64921..015dc5e81 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -92,11 +92,7 @@ int rotate_reclaimable_page(struct page *page) spin_lock_irqsave(&zone->lru_lock, flags); if 
(PageLRU(page) && !PageActive(page)) { list_del(&page->lru); -#ifdef CONFIG_CKRM_RES_MEM - list_add_tail(&page->lru, &ckrm_zone->inactive_list); -#else - list_add_tail(&page->lru, &zone->inactive_list); -#endif + ckrm_add_tail_inactive(page); inc_page_state(pgrotated); } if (!test_clear_page_writeback(page)) diff --git a/mm/vmscan.c b/mm/vmscan.c index 6f7fba513..8fc4a3d5d 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -33,6 +33,7 @@ #include <linux/cpu.h> #include <linux/notifier.h> #include <linux/rwsem.h> +#include <linux/ckrm_mem.h> #include <asm/tlbflush.h> #include <asm/div64.h> @@ -589,7 +590,7 @@ static void shrink_cache(struct zone *zone, struct scan_control *sc) nr_taken++; } zone->nr_inactive -= nr_taken; - ckrm_zone_dec_inactive(ckrm_zone, nr_taken); + ckrm_zone_sub_inactive(ckrm_zone, nr_taken); spin_unlock_irq(&zone->lru_lock); if (nr_taken == 0) @@ -616,11 +617,11 @@ static void shrink_cache(struct zone *zone, struct scan_control *sc) BUG(); list_del(&page->lru); if (PageActive(page)) { - ckrm_zone_inc_active(ckrm_zone, 1); + ckrm_zone_add_active(ckrm_zone, 1); zone->nr_active++; list_add(&page->lru, active_list); } else { - ckrm_zone_inc_inactive(ckrm_zone, 1); + ckrm_zone_add_inactive(ckrm_zone, 1); zone->nr_inactive++; list_add(&page->lru, inactive_list); } @@ -709,7 +710,7 @@ refill_inactive_zone(struct zone *zone, struct scan_control *sc) } zone->pages_scanned += pgscanned; zone->nr_active -= pgmoved; - ckrm_zone_dec_active(ckrm_zone, pgmoved); + ckrm_zone_sub_active(ckrm_zone, pgmoved); spin_unlock_irq(&zone->lru_lock); /* @@ -770,8 +771,8 @@ refill_inactive_zone(struct zone *zone, struct scan_control *sc) list_move(&page->lru, inactive_list); pgmoved++; if (!pagevec_add(&pvec, page)) { - ckrm_zone_inc_inactive(ckrm_zone, pgmoved); zone->nr_inactive += pgmoved; + ckrm_zone_add_inactive(ckrm_zone, pgmoved); spin_unlock_irq(&zone->lru_lock); pgdeactivate += pgmoved; pgmoved = 0; @@ -781,8 +782,8 @@ refill_inactive_zone(struct zone *zone, struct 
scan_control *sc) spin_lock_irq(&zone->lru_lock); } } - ckrm_zone_inc_inactive(ckrm_zone, pgmoved); zone->nr_inactive += pgmoved; + ckrm_zone_add_inactive(ckrm_zone, pgmoved); pgdeactivate += pgmoved; if (buffer_heads_over_limit) { spin_unlock_irq(&zone->lru_lock); @@ -800,16 +801,16 @@ refill_inactive_zone(struct zone *zone, struct scan_control *sc) list_move(&page->lru, active_list); pgmoved++; if (!pagevec_add(&pvec, page)) { - ckrm_zone_inc_active(ckrm_zone, pgmoved); zone->nr_active += pgmoved; + ckrm_zone_add_active(ckrm_zone, pgmoved); pgmoved = 0; spin_unlock_irq(&zone->lru_lock); __pagevec_release(&pvec); spin_lock_irq(&zone->lru_lock); } } - ckrm_zone_inc_active(ckrm_zone, pgmoved); zone->nr_active += pgmoved; + ckrm_zone_add_active(ckrm_zone, pgmoved); spin_unlock_irq(&zone->lru_lock); pagevec_release(&pvec); @@ -818,45 +819,6 @@ refill_inactive_zone(struct zone *zone, struct scan_control *sc) } #ifdef CONFIG_CKRM_RES_MEM -static int -shrink_weight(struct ckrm_zone *czone) -{ - u64 temp; - struct zone *zone = czone->zone; - struct ckrm_mem_res *cls = czone->memcls; - int zone_usage, zone_guar, zone_total, guar, ret, cnt; - - zone_usage = czone->nr_active + czone->nr_inactive; - czone->active_over = czone->inactive_over = 0; - - if (zone_usage < SWAP_CLUSTER_MAX * 4) - return 0; - - if (cls->pg_guar == CKRM_SHARE_DONTCARE) { - // no guarantee for this class. use implicit guarantee - guar = cls->impl_guar / cls->nr_dontcare; - } else { - guar = cls->pg_unused / cls->nr_dontcare; - } - zone_total = zone->nr_active + zone->nr_inactive + zone->free_pages; - temp = (u64) guar * zone_total; - do_div(temp, ckrm_tot_lru_pages); - zone_guar = (int) temp; - - ret = ((zone_usage - zone_guar) > SWAP_CLUSTER_MAX) ? 
- (zone_usage - zone_guar) : 0; - if (ret) { - cnt = czone->nr_active - (2 * zone_guar / 3); - if (cnt > 0) - czone->active_over = cnt; - cnt = czone->active_over + czone->nr_inactive - - zone_guar / 3; - if (cnt > 0) - czone->inactive_over = cnt; - } - return ret; -} - static void shrink_ckrmzone(struct ckrm_zone *czone, struct scan_control *sc) { @@ -878,121 +840,96 @@ shrink_ckrmzone(struct ckrm_zone *czone, struct scan_control *sc) break; } } - - throttle_vm_writeout(); } } -/* insert an entry to the list and sort decendently*/ +/* FIXME: This function needs to be given more thought. */ static void -list_add_sort(struct list_head *entry, struct list_head *head) +ckrm_shrink_class(struct ckrm_mem_res *cls) { - struct ckrm_zone *czone, *new = - list_entry(entry, struct ckrm_zone, victim_list); - struct list_head* pos = head->next; - - while (pos != head) { - czone = list_entry(pos, struct ckrm_zone, victim_list); - if (new->shrink_weight > czone->shrink_weight) { - __list_add(entry, pos->prev, pos); - return; - } - pos = pos->next; - } - list_add_tail(entry, head); - return; -} + struct scan_control sc; + struct zone *zone; + int zindex = 0, cnt, act_credit = 0, inact_credit = 0; -static void -shrink_choose_victims(struct list_head *victims, - unsigned long nr_active, unsigned long nr_inactive) -{ - unsigned long nr; - struct ckrm_zone* czone; - struct list_head *pos, *next; - - pos = victims->next; - while ((pos != victims) && (nr_active || nr_inactive)) { - czone = list_entry(pos, struct ckrm_zone, victim_list); - - if (nr_active && czone->active_over) { - nr = min(nr_active, czone->active_over); - czone->shrink_active += nr; - czone->active_over -= nr; - nr_active -= nr; + sc.nr_mapped = read_page_state(nr_mapped); + sc.nr_scanned = 0; + sc.nr_reclaimed = 0; + sc.priority = 0; // always very high priority + + for_each_zone(zone) { + int zone_total, zone_limit, active_limit, + inactive_limit, clszone_limit; + struct ckrm_zone *czone; + u64 temp; + + czone = 
&cls->ckrm_zone[zindex]; + if (ckrm_test_set_shrink(czone)) + continue; + + zone->temp_priority = zone->prev_priority; + zone->prev_priority = sc.priority; + + zone_total = zone->nr_active + zone->nr_inactive + + zone->free_pages; + + temp = (u64) cls->pg_limit * zone_total; + do_div(temp, ckrm_tot_lru_pages); + zone_limit = (int) temp; + clszone_limit = (ckrm_mem_shrink_to * zone_limit) / 100; + active_limit = (2 * clszone_limit) / 3; // 2/3rd in active list + inactive_limit = clszone_limit / 3; // 1/3rd in inactive list + + czone->shrink_active = 0; + cnt = czone->nr_active + act_credit - active_limit; + if (cnt > 0) { + czone->shrink_active = (unsigned long) cnt; + act_credit = 0; + } else { + act_credit += cnt; } - if (nr_inactive && czone->inactive_over) { - nr = min(nr_inactive, czone->inactive_over); - czone->shrink_inactive += nr; - czone->inactive_over -= nr; - nr_inactive -= nr; + czone->shrink_inactive = 0; + cnt = czone->shrink_active + inact_credit + + (czone->nr_inactive - inactive_limit); + if (cnt > 0) { + czone->shrink_inactive = (unsigned long) cnt; + inact_credit = 0; + } else { + inact_credit += cnt; } - pos = pos->next; - } - pos = victims->next; - while (pos != victims) { - czone = list_entry(pos, struct ckrm_zone, victim_list); - next = pos->next; - if (czone->shrink_active == 0 && czone->shrink_inactive == 0) { - list_del_init(pos); - ckrm_clear_shrink(czone); + + if (czone->shrink_active || czone->shrink_inactive) { + sc.nr_to_reclaim = czone->shrink_inactive; + shrink_ckrmzone(czone, &sc); } - pos = next; - } - return; + zone->prev_priority = zone->temp_priority; + zindex++; + ckrm_clear_shrink(czone); + } } static void -shrink_get_victims(struct zone *zone, unsigned long nr_active, - unsigned long nr_inactive, struct list_head *victims) +ckrm_shrink_classes(void) { - struct list_head *pos; struct ckrm_mem_res *cls; - struct ckrm_zone *czone; - int zoneindex = zone_idx(zone); - - if (ckrm_nr_mem_classes <= 1) { - if (ckrm_mem_root_class) { 
- czone = ckrm_mem_root_class->ckrm_zone + zoneindex; - if (!ckrm_test_set_shrink(czone)) { - list_add(&czone->victim_list, victims); - czone->shrink_active = nr_active; - czone->shrink_inactive = nr_inactive; - } - } - return; - } - spin_lock_irq(&ckrm_mem_lock); - list_for_each_entry(cls, &ckrm_memclass_list, mcls_list) { - czone = cls->ckrm_zone + zoneindex; - if (ckrm_test_set_shrink(czone)) - continue; - czone->shrink_active = 0; - czone->shrink_inactive = 0; - czone->shrink_weight = shrink_weight(czone); - if (czone->shrink_weight) { - list_add_sort(&czone->victim_list, victims); - } else { - ckrm_clear_shrink(czone); - } - } - pos = victims->next; - while (pos != victims) { - czone = list_entry(pos, struct ckrm_zone, victim_list); - pos = pos->next; - } - shrink_choose_victims(victims, nr_active, nr_inactive); - spin_unlock_irq(&ckrm_mem_lock); - pos = victims->next; - while (pos != victims) { - czone = list_entry(pos, struct ckrm_zone, victim_list); - pos = pos->next; + spin_lock(&ckrm_mem_lock); + while (!ckrm_shrink_list_empty()) { + cls = list_entry(ckrm_shrink_list.next, struct ckrm_mem_res, + shrink_list); + list_del(&cls->shrink_list); + cls->flags &= ~CLS_AT_LIMIT; + spin_unlock(&ckrm_mem_lock); + ckrm_shrink_class(cls); + spin_lock(&ckrm_mem_lock); } + spin_unlock(&ckrm_mem_lock); } -#endif /* CONFIG_CKRM_RES_MEM */ + +#else +#define ckrm_shrink_classes() do { } while(0) +#endif /* * This is a basic per-zone page freer. Used by both kswapd and direct reclaim. @@ -1037,9 +974,9 @@ shrink_zone(struct zone *zone, struct scan_control *sc) czone = list_entry(pos, struct ckrm_zone, victim_list); next = pos->next; list_del_init(pos); - ckrm_clear_shrink(czone); sc->nr_to_reclaim = czone->shrink_inactive; shrink_ckrmzone(czone, sc); + ckrm_clear_shrink(czone); pos = next; } } @@ -1064,97 +1001,6 @@ shrink_zone(struct zone *zone, struct scan_control *sc) #endif } -#ifdef CONFIG_CKRM_RES_MEM -// This function needs to be given more thought. 
-// Shrink the class to be at shrink_to%" of its limit -static void -ckrm_shrink_class(struct ckrm_mem_res *cls) -{ - struct scan_control sc; - struct zone *zone; - int zindex = 0, cnt, act_credit = 0, inact_credit = 0; - int shrink_to = ckrm_mem_get_shrink_to(); - - sc.nr_mapped = read_page_state(nr_mapped); - sc.nr_scanned = 0; - sc.nr_reclaimed = 0; - sc.priority = 0; // always very high priority - - check_memclass(cls, "bef_shnk_cls"); - for_each_zone(zone) { - int zone_total, zone_limit, active_limit, - inactive_limit, clszone_limit; - struct ckrm_zone *czone; - u64 temp; - - czone = &cls->ckrm_zone[zindex]; - if (ckrm_test_set_shrink(czone)) - continue; - - zone->temp_priority = zone->prev_priority; - zone->prev_priority = sc.priority; - - zone_total = zone->nr_active + zone->nr_inactive - + zone->free_pages; - - temp = (u64) cls->pg_limit * zone_total; - do_div(temp, ckrm_tot_lru_pages); - zone_limit = (int) temp; - clszone_limit = (shrink_to * zone_limit) / 100; - active_limit = (2 * clszone_limit) / 3; // 2/3rd in active list - inactive_limit = clszone_limit / 3; // 1/3rd in inactive list - - czone->shrink_active = 0; - cnt = czone->nr_active + act_credit - active_limit; - if (cnt > 0) { - czone->shrink_active = (unsigned long) cnt; - } else { - act_credit += cnt; - } - - czone->shrink_inactive = 0; - cnt = czone->shrink_active + inact_credit + - (czone->nr_inactive - inactive_limit); - if (cnt > 0) { - czone->shrink_inactive = (unsigned long) cnt; - } else { - inact_credit += cnt; - } - - - if (czone->shrink_active || czone->shrink_inactive) { - sc.nr_to_reclaim = czone->shrink_inactive; - shrink_ckrmzone(czone, &sc); - } - zone->prev_priority = zone->temp_priority; - zindex++; - ckrm_clear_shrink(czone); - } - check_memclass(cls, "aft_shnk_cls"); -} - -static void -ckrm_shrink_classes(void) -{ - struct ckrm_mem_res *cls; - - spin_lock_irq(&ckrm_mem_lock); - while (!ckrm_shrink_list_empty()) { - cls = list_entry(ckrm_shrink_list.next, struct ckrm_mem_res, 
- shrink_list); - list_del(&cls->shrink_list); - cls->flags &= ~MEM_AT_LIMIT; - spin_unlock_irq(&ckrm_mem_lock); - ckrm_shrink_class(cls); - spin_lock_irq(&ckrm_mem_lock); - } - spin_unlock_irq(&ckrm_mem_lock); -} - -#else -#define ckrm_shrink_classes() do { } while(0) -#endif - /* * This is the direct reclaim path, for page-allocating processes. We only * try to reclaim pages from zones which will satisfy the caller's allocation @@ -1492,7 +1338,7 @@ static int kswapd(void *p) if (!ckrm_shrink_list_empty()) ckrm_shrink_classes(); - else + else balance_pgdat(pgdat, 0); } return 0; diff --git a/scripts/kernel-2.6-planetlab.spec b/scripts/kernel-2.6-planetlab.spec index e516b27d7..ca2935d13 100644 --- a/scripts/kernel-2.6-planetlab.spec +++ b/scripts/kernel-2.6-planetlab.spec @@ -22,7 +22,7 @@ Summary: The Linux kernel (the core of the Linux operating system) %define kversion 2.6.%{sublevel} %define rpmversion 2.6.%{sublevel} %define rhbsys %([ -r /etc/beehive-root ] && echo || echo .`whoami`) -%define release 1.14_FC2.2.planetlab%{?date:.%{date}} +%define release 1.14_FC2.1.planetlab%{?date:.%{date}} %define signmodules 0 %define KVERREL %{PACKAGE_VERSION}-%{PACKAGE_RELEASE} -- 2.47.0