http://www.kernel.org/pub/linux/kernel/people/akpm/patches/2.6/2.6.10/2.6.10-mm3/
9 = /dev/urandom Faster, less secure random number gen.
10 = /dev/aio Asyncronous I/O notification interface
11 = /dev/kmsg Writes to this come out as printk's
+ 12 = /dev/oldmem Access to kexec-ed crash dump
1 block RAM disk
0 = /dev/ram0 First RAM disk
1 = /dev/ram1 Second RAM disk
--- /dev/null
+Documentation for kdump - the kexec based crash dumping solution
+================================================================
+
+DESIGN
+======
+
+We use kexec to reboot to a second kernel whenever a dump needs to be taken.
+This second kernel is booted with with very little memory (configurable
+at compile time). The first kernel reserves the section of memory that the
+second kernel uses. This ensures that on-going DMA from the first kernel
+does not corrupt the second kernel. The first 640k of physical memory is
+needed irrespective of where the kernel loads at. Hence, this region is
+backed up before reboot.
+
+In the second kernel, "old memory" can be accessed in two ways. The
+first one is through a device interface. We can create a /dev/oldmem or
+whatever and write out the memory in raw format. The second interface is
+through /proc/vmcore. This exports the dump as an ELF format file which
+can be written out using any file copy command (cp, scp, etc). Further, gdb
+can be used to perform some minimal debugging on the dump file. Both these
+methods ensure that there is correct ordering of the dump pages (corresponding
+to the first 640k that has been relocated).
+
+SETUP
+=====
+
+1) Obtain the appropriate -mm tree patch and apply it on to the vanilla
+ kernel tree.
+
+2) Two kernels need to be built in order to get this feature working.
+
+ For the first kernel, choose the default values for the following options.
+
+ a) Physical address where the kernel is loaded
+ b) kexec system call
+ c) kernel crash dumps
+
+ All the options are under "Processor type and features"
+
+ For the second kernel, change (a) to 16MB. If you want to choose another
+ value here, ensure "location from where the crash dumping kernel will boot
+ (MB)" under (c) reflects the same value.
+
+ Also ensure you have CONFIG_HIGHMEM on.
+
+3) Boot into the first kernel. You are now ready to try out kexec based crash
+ dumps.
+
+4) Load the second kernel to be booted using
+
+ kexec -p <second-kernel> --args-linux --append="root=<root-dev> dump
+ init 1 memmap=exactmap memmap=640k@0 memmap=32M@16M"
+
+ Note that <second-kernel> has to be a vmlinux image. bzImage will not
+ work, as of now.
+
+5) Enable kexec based dumping by
+
+ echo 1 > /proc/kexec-dump
+
+ If this is not set, the system will not do a kexec reboot in the event
+ of a panic.
+
+6) System reboots into the second kernel when a panic occurs.
+ You could write a module to call panic, for testing purposes.
+
+7) Write out the dump file using
+
+ cp /proc/vmcore <dump-file>
+
+You can also access the dump as a device for a linear/raw view. To do this,
+you will need the kd-oldmem-<version>.patch built into the kernel. To create
+the device, type
+
+ mknod /dev/oldmem c 1 12
+
+Use "dd" with suitable options for count, bs and skip to access specific
+portions of the dump.
+
+ANALYSIS
+========
+
+You can run gdb on the dump file copied out of /proc/vmcore. Use vmlinux built
+with -g and run
+
+ gdb vmlinux <dump-file>
+
+Stack trace for the task on processor 0, register display, memory display
+work fine.
+
+TODO
+====
+
+1) Provide a kernel-pages only view for the dump. This could possibly turn up
+ as /proc/vmcore-kern.
+2) Provide register contents of all processors (similar to what multi-threaded
+ core dumps does).
+3) Modify "crash" to make it recognize this dump.
+4) Make the i386 kernel boot from any location so we can run the second kernel
+ from the reserved location instead of the current approach.
+
+CONTACT
+=======
+
+Hariprasad Nellitheertha - hari at in dot ibm dot com
L: linux-kernel@vger.kernel.org
S: Maintained
+KEXEC
+P: Eric Biederman
+P: Randy Dunlap
+M: ebiederm@xmission.com
+M: rddunlap@osdl.org
+W: http://www.xmission.com/~ebiederm/files/kexec/
+W: http://developer.osdl.org/rddunlap/kexec/
+L: linux-kernel@vger.kernel.org
+L: fastboot@osdl.org
+S: Maintained
+
LANMEDIA WAN CARD DRIVER
P: Andrew Stanley-Jones
M: asj@lanmedia.com
}
}
interrupt_redirect_table = ramvec;
-#ifdef DUMP_VECTOR
+#ifdef CRASH_DUMP_VECTOR
ramvec_p = ramvec;
for (i = 0; i < NR_IRQS; i++) {
if ((i % 8) == 0)
ramvec[TRAP0_VEC] = VECTOR(system_call);
ramvec[TRAP3_VEC] = break_vec;
interrupt_redirect_table = ramvec;
-#ifdef DUMP_VECTOR
+#ifdef CRASH_DUMP_VECTOR
ramvec_p = ramvec;
for (i = 0; i < NR_IRQS; i++) {
if ((i % 8) == 0)
generate incorrect output with certain kernel constructs when
-mregparm=3 is used.
+config KERN_PHYS_OFFSET
+ int "Physical address where the kernel is loaded (1-112)MB"
+ range 1 112
+ default "1"
+ help
+ This gives the physical address where the kernel is loaded.
+ Primarily used in the case of kexec on panic where the
+ recovery kernel needs to run at a different address than
+ the panic-ed kernel.
+
+config KEXEC
+ bool "kexec system call (EXPERIMENTAL)"
+ depends on EXPERIMENTAL
+ help
+ kexec is a system call that implements the ability to shutdown your
+ current kernel, and to start another kernel. It is like a reboot
+ but it is indepedent of the system firmware. And like a reboot
+ you can start any kernel with it, not just Linux.
+
+ The name comes from the similiarity to the exec system call.
+
+ It is an ongoing process to be certain the hardware in a machine
+ is properly shutdown, so do not be surprised if this code does not
+ initially work for you. It may help to enable device hotplugging
+ support. As of this writing the exact hardware interface is
+ strongly in flux, so no good recommendation can be made.
+
+config CRASH_DUMP
+ bool "kernel crash dumps (EXPERIMENTAL)"
+ depends on KEXEC
+ help
+ Generate crash dump using kexec.
+
+config BACKUP_BASE
+ int "location from where the crash dumping kernel will boot (MB)"
+ depends on CRASH_DUMP
+ default 16
+ help
+ This is the location where the second kernel will boot from.
+
+config BACKUP_SIZE
+ int "Size of memory used by the crash dumping kernel (MB)"
+ depends on CRASH_DUMP
+ range 16 64
+ default 32
+ help
+ The size of the second kernel's memory.
endmenu
popl %esi # discard address
popl %esi # real mode pointer
xorl %ebx,%ebx
- ljmp $(__BOOT_CS), $0x100000
+ ljmp $(__BOOT_CS), $KERN_PHYS_OFFSET
/*
* We come here, if we were loaded high.
popl %ecx # lcount
popl %edx # high_buffer_start
popl %eax # hcount
- movl $0x100000,%edi
+ movl $KERN_PHYS_OFFSET,%edi
cli # make sure we don't get interrupted
ljmp $(__BOOT_CS), $0x1000 # and jump to the move routine
movsl
movl %ebx,%esi # Restore setup pointer
xorl %ebx,%ebx
- ljmp $(__BOOT_CS), $0x100000
+ ljmp $(__BOOT_CS), $KERN_PHYS_OFFSET
move_routine_end:
#include <linux/tty.h>
#include <video/edid.h>
#include <asm/io.h>
+#include <asm/segment.h>
/*
* gzip declarations
#else
if ((RM_ALT_MEM_K > RM_EXT_MEM_K ? RM_ALT_MEM_K : RM_EXT_MEM_K) < 1024) error("Less than 2MB of memory");
#endif
- output_data = (char *)0x100000; /* Points to 1M */
+ output_data = (char *)KERN_PHYS_OFFSET; /* Points to 1M */
free_mem_end_ptr = (long)real_mode;
}
low_buffer_size = low_buffer_end - LOW_BUFFER_START;
high_loaded = 1;
free_mem_end_ptr = (long)high_buffer_start;
- if ( (0x100000 + low_buffer_size) > ((ulg)high_buffer_start)) {
- high_buffer_start = (uch *)(0x100000 + low_buffer_size);
+ if ( (KERN_PHYS_OFFSET + low_buffer_size) > ((ulg)high_buffer_start)) {
+ high_buffer_start = (uch *)(KERN_PHYS_OFFSET + low_buffer_size);
mv->hcount = 0; /* say: we need not to move high_buffer */
}
else mv->hcount = -1;
obj-$(CONFIG_X86_MPPARSE) += mpparse.o
obj-$(CONFIG_X86_LOCAL_APIC) += apic.o nmi.o
obj-$(CONFIG_X86_IO_APIC) += io_apic.o
+obj-$(CONFIG_KEXEC) += machine_kexec.o relocate_kernel.o
+obj-$(CONFIG_CRASH_DUMP) += crash_dump.o
obj-$(CONFIG_X86_NUMAQ) += numaq.o
obj-$(CONFIG_X86_SUMMIT_NUMA) += summit.o
obj-$(CONFIG_KPROBES) += kprobes.o
outb(0x70, 0x22);
outb(0x00, 0x23);
}
+ else {
+ /* Go back to Virtual Wire compatibility mode */
+ unsigned long value;
+
+ /* For the spurious interrupt use vector F, and enable it */
+ value = apic_read(APIC_SPIV);
+ value &= ~APIC_VECTOR_MASK;
+ value |= APIC_SPIV_APIC_ENABLED;
+ value |= 0xf;
+ apic_write_around(APIC_SPIV, value);
+
+ /* For LVT0 make it edge triggered, active high, external and enabled */
+ value = apic_read(APIC_LVT0);
+ value &= ~(APIC_MODE_MASK | APIC_SEND_PENDING |
+ APIC_INPUT_POLARITY | APIC_LVT_REMOTE_IRR |
+ APIC_LVT_LEVEL_TRIGGER | APIC_LVT_MASKED );
+ value |= APIC_LVT_REMOTE_IRR | APIC_SEND_PENDING;
+ value = SET_APIC_DELIVERY_MODE(value, APIC_MODE_EXINT);
+ apic_write_around(APIC_LVT0, value);
+
+ /* For LVT1 make it edge triggered, active high, nmi and enabled */
+ value = apic_read(APIC_LVT1);
+ value &= ~(
+ APIC_MODE_MASK | APIC_SEND_PENDING |
+ APIC_INPUT_POLARITY | APIC_LVT_REMOTE_IRR |
+ APIC_LVT_LEVEL_TRIGGER | APIC_LVT_MASKED);
+ value |= APIC_LVT_REMOTE_IRR | APIC_SEND_PENDING;
+ value = SET_APIC_DELIVERY_MODE(value, APIC_MODE_NMI);
+ apic_write_around(APIC_LVT1, value);
+ }
}
void disable_local_APIC(void)
--- /dev/null
+/*
+ * Architecture specific (i386) functions for kexec based crash dumps.
+ *
+ * Created by: Hariprasad Nellitheertha (hari@in.ibm.com)
+ *
+ * Copyright (C) IBM Corporation, 2004. All rights reserved.
+ *
+ */
+
+#include <linux/init.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/smp.h>
+#include <linux/irq.h>
+
+#include <asm/crash_dump.h>
+#include <asm/processor.h>
+#include <asm/hardirq.h>
+#include <asm/nmi.h>
+#include <asm/hw_irq.h>
+
+struct pt_regs crash_smp_regs[NR_CPUS];
+long crash_smp_current_task[NR_CPUS];
+
+#ifdef CONFIG_SMP
+static atomic_t waiting_for_dump_ipi;
+static int crash_dump_expect_ipi[NR_CPUS];
+extern void crash_dump_send_ipi(void);
+extern void stop_this_cpu(void *);
+
+static int crash_dump_nmi_callback(struct pt_regs *regs, int cpu)
+{
+ if (!crash_dump_expect_ipi[cpu])
+ return 0;
+
+ crash_dump_expect_ipi[cpu] = 0;
+ crash_dump_save_this_cpu(regs, cpu);
+ atomic_dec(&waiting_for_dump_ipi);
+
+ stop_this_cpu(NULL);
+
+ return 1;
+}
+
+void __crash_dump_stop_cpus(void)
+{
+ int i, cpu, other_cpus;
+
+ preempt_disable();
+ cpu = smp_processor_id();
+ other_cpus = num_online_cpus()-1;
+
+ if (other_cpus > 0) {
+ atomic_set(&waiting_for_dump_ipi, other_cpus);
+
+ for (i = 0; i < NR_CPUS; i++)
+ crash_dump_expect_ipi[i] = (i != cpu && cpu_online(i));
+
+ set_nmi_callback(crash_dump_nmi_callback);
+ /* Ensure the new callback function is set before sending
+ * out the IPI
+ */
+ wmb();
+
+ crash_dump_send_ipi();
+ while (atomic_read(&waiting_for_dump_ipi) > 0)
+ cpu_relax();
+
+ unset_nmi_callback();
+ } else {
+ local_irq_disable();
+ disable_local_APIC();
+ local_irq_enable();
+ }
+ preempt_enable();
+}
+#else
+void __crash_dump_stop_cpus(void) {}
+#endif
+
+void crash_get_current_regs(struct pt_regs *regs)
+{
+ __asm__ __volatile__("movl %%ebx,%0" : "=m"(regs->ebx));
+ __asm__ __volatile__("movl %%ecx,%0" : "=m"(regs->ecx));
+ __asm__ __volatile__("movl %%edx,%0" : "=m"(regs->edx));
+ __asm__ __volatile__("movl %%esi,%0" : "=m"(regs->esi));
+ __asm__ __volatile__("movl %%edi,%0" : "=m"(regs->edi));
+ __asm__ __volatile__("movl %%ebp,%0" : "=m"(regs->ebp));
+ __asm__ __volatile__("movl %%eax,%0" : "=m"(regs->eax));
+ __asm__ __volatile__("movl %%esp,%0" : "=m"(regs->esp));
+ __asm__ __volatile__("movw %%ss, %%ax;" :"=a"(regs->xss));
+ __asm__ __volatile__("movw %%cs, %%ax;" :"=a"(regs->xcs));
+ __asm__ __volatile__("movw %%ds, %%ax;" :"=a"(regs->xds));
+ __asm__ __volatile__("movw %%es, %%ax;" :"=a"(regs->xes));
+ __asm__ __volatile__("pushfl; popl %0" :"=m"(regs->eflags));
+
+ regs->eip = (unsigned long)current_text_addr();
+}
+
+void crash_dump_save_this_cpu(struct pt_regs *regs, int cpu)
+{
+ crash_smp_current_task[cpu] = (long)current;
+ crash_smp_regs[cpu] = *regs;
+}
+
.long sys_mq_timedreceive /* 280 */
.long sys_mq_notify
.long sys_mq_getsetattr
- .long sys_ni_syscall /* reserved for kexec */
+ .long sys_kexec_load
.long sys_waitid
.long sys_ni_syscall /* 285 */ /* available */
.long sys_add_key
EXPORT_SYMBOL(csum_partial);
-#ifdef CONFIG_CRASH_DUMP_MODULE
+#ifdef CONFIG_CRASH_DUMP
#ifdef CONFIG_SMP
extern irq_desc_t irq_desc[NR_IRQS];
extern unsigned long irq_affinity[NR_IRQS];
EXPORT_SYMBOL(stop_this_cpu);
EXPORT_SYMBOL(dump_send_ipi);
#endif
-extern int pfn_is_ram(unsigned long);
-EXPORT_SYMBOL(pfn_is_ram);
+extern int page_is_ram(unsigned long);
+EXPORT_SYMBOL(page_is_ram);
#ifdef ARCH_HAS_NMI_WATCHDOG
EXPORT_SYMBOL(touch_nmi_watchdog);
#endif
return 0;
}
+static int i8259A_shutdown(struct sys_device *dev)
+{
+ /* Put the i8259A into a quiescent state that
+ * the kernel initialization code can get it
+ * out of.
+ */
+ outb(0xff, 0x21); /* mask all of 8259A-1 */
+ outb(0xff, 0xA1); /* mask all of 8259A-1 */
+ return 0;
+}
+
static struct sysdev_class i8259_sysdev_class = {
set_kset_name("i8259"),
.suspend = i8259A_suspend,
.resume = i8259A_resume,
+ .shutdown = i8259A_shutdown,
};
static struct sys_device device_i8259A = {
--- /dev/null
+/*
+ * machine_kexec.c - handle transition of Linux booting another kernel
+ * Copyright (C) 2002-2004 Eric Biederman <ebiederm@xmission.com>
+ *
+ * This source code is licensed under the GNU General Public License,
+ * Version 2. See the file COPYING for more details.
+ */
+
+#include <linux/mm.h>
+#include <linux/kexec.h>
+#include <linux/delay.h>
+#include <asm/pgtable.h>
+#include <asm/pgalloc.h>
+#include <asm/tlbflush.h>
+#include <asm/mmu_context.h>
+#include <asm/io.h>
+#include <asm/apic.h>
+#include <asm/cpufeature.h>
+#include <asm/crash_dump.h>
+
+static inline unsigned long read_cr3(void)
+{
+ unsigned long cr3;
+ asm volatile("movl %%cr3,%0": "=r"(cr3));
+ return cr3;
+}
+
+#define PAGE_ALIGNED __attribute__ ((__aligned__(PAGE_SIZE)))
+
+#define L0_ATTR (_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | _PAGE_DIRTY)
+#define L1_ATTR (_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | _PAGE_DIRTY)
+#define L2_ATTR (_PAGE_PRESENT)
+
+#define LEVEL0_SIZE (1UL << 12UL)
+
+#ifndef CONFIG_X86_PAE
+#define LEVEL1_SIZE (1UL << 22UL)
+static u32 pgtable_level1[1024] PAGE_ALIGNED;
+
+static void identity_map_page(unsigned long address)
+{
+ unsigned long level1_index, level2_index;
+ u32 *pgtable_level2;
+
+ /* Find the current page table */
+ pgtable_level2 = __va(read_cr3());
+
+ /* Find the indexes of the physical address to identity map */
+ level1_index = (address % LEVEL1_SIZE)/LEVEL0_SIZE;
+ level2_index = address / LEVEL1_SIZE;
+
+ /* Identity map the page table entry */
+ pgtable_level1[level1_index] = address | L0_ATTR;
+ pgtable_level2[level2_index] = __pa(pgtable_level1) | L1_ATTR;
+
+ /* Flush the tlb so the new mapping takes effect.
+ * Global tlb entries are not flushed but that is not an issue.
+ */
+ load_cr3(pgtable_level2);
+}
+
+#else
+#define LEVEL1_SIZE (1UL << 21UL)
+#define LEVEL2_SIZE (1UL << 30UL)
+static u64 pgtable_level1[512] PAGE_ALIGNED;
+static u64 pgtable_level2[512] PAGE_ALIGNED;
+
+static void identity_map_page(unsigned long address)
+{
+ unsigned long level1_index, level2_index, level3_index;
+ u64 *pgtable_level3;
+
+ /* Find the current page table */
+ pgtable_level3 = __va(read_cr3());
+
+ /* Find the indexes of the physical address to identity map */
+ level1_index = (address % LEVEL1_SIZE)/LEVEL0_SIZE;
+ level2_index = (address % LEVEL2_SIZE)/LEVEL1_SIZE;
+ level3_index = address / LEVEL2_SIZE;
+
+ /* Identity map the page table entry */
+ pgtable_level1[level1_index] = address | L0_ATTR;
+ pgtable_level2[level2_index] = __pa(pgtable_level1) | L1_ATTR;
+ set_64bit(&pgtable_level3[level3_index], __pa(pgtable_level2) | L2_ATTR);
+
+ /* Flush the tlb so the new mapping takes effect.
+ * Global tlb entries are not flushed but that is not an issue.
+ */
+ load_cr3(pgtable_level3);
+}
+#endif
+
+
+static void set_idt(void *newidt, __u16 limit)
+{
+ unsigned char curidt[6];
+
+ /* ia32 supports unaliged loads & stores */
+ (*(__u16 *)(curidt)) = limit;
+ (*(__u32 *)(curidt +2)) = (unsigned long)(newidt);
+
+ __asm__ __volatile__ (
+ "lidt %0\n"
+ : "=m" (curidt)
+ );
+};
+
+
+static void set_gdt(void *newgdt, __u16 limit)
+{
+ unsigned char curgdt[6];
+
+ /* ia32 supports unaligned loads & stores */
+ (*(__u16 *)(curgdt)) = limit;
+ (*(__u32 *)(curgdt +2)) = (unsigned long)(newgdt);
+
+ __asm__ __volatile__ (
+ "lgdt %0\n"
+ : "=m" (curgdt)
+ );
+};
+
+static void load_segments(void)
+{
+#define __STR(X) #X
+#define STR(X) __STR(X)
+
+ __asm__ __volatile__ (
+ "\tljmp $"STR(__KERNEL_CS)",$1f\n"
+ "\t1:\n"
+ "\tmovl $"STR(__KERNEL_DS)",%eax\n"
+ "\tmovl %eax,%ds\n"
+ "\tmovl %eax,%es\n"
+ "\tmovl %eax,%fs\n"
+ "\tmovl %eax,%gs\n"
+ "\tmovl %eax,%ss\n"
+ );
+#undef STR
+#undef __STR
+}
+
+typedef asmlinkage void (*relocate_new_kernel_t)(
+ unsigned long indirection_page, unsigned long reboot_code_buffer,
+ unsigned long start_address, unsigned int has_pae);
+
+const extern unsigned char relocate_new_kernel[];
+extern void relocate_new_kernel_end(void);
+const extern unsigned int relocate_new_kernel_size;
+
+/*
+ * Do what every setup is needed on image and the
+ * reboot code buffer to allow us to avoid allocations
+ * later. Currently nothing.
+ */
+int machine_kexec_prepare(struct kimage *image)
+{
+ return 0;
+}
+
+void machine_kexec_cleanup(struct kimage *image)
+{
+}
+
+/*
+ * We are going to do a memory preserving reboot. So, we copy over the
+ * first 640k of memory into a backup location. Though the second kernel
+ * boots from a different location, it still requires the first 640k.
+ * Hence this backup.
+ */
+void __crash_relocate_mem(unsigned long backup_addr, unsigned long backup_size)
+{
+ unsigned long pfn, pfn_max;
+ void *src_addr, *dest_addr;
+ struct page *page;
+
+ pfn_max = backup_size >> PAGE_SHIFT;
+ for (pfn = 0; pfn < pfn_max; pfn++) {
+ src_addr = phys_to_virt(pfn << PAGE_SHIFT);
+ dest_addr = backup_addr + src_addr;
+ if (!pfn_valid(pfn))
+ continue;
+ page = pfn_to_page(pfn);
+ if (PageReserved(page))
+ copy_page(dest_addr, src_addr);
+ }
+}
+
+/*
+ * Do not allocate memory (or fail in any way) in machine_kexec().
+ * We are past the point of no return, committed to rebooting now.
+ */
+void machine_kexec(struct kimage *image)
+{
+ unsigned long indirection_page;
+ unsigned long reboot_code_buffer;
+ relocate_new_kernel_t rnk;
+
+ /* Interrupts aren't acceptable while we reboot */
+ local_irq_disable();
+
+ /* Compute some offsets */
+ reboot_code_buffer = page_to_pfn(image->control_code_page) << PAGE_SHIFT;
+ indirection_page = image->head & PAGE_MASK;
+
+ /* Set up an identity mapping for the reboot_code_buffer */
+ identity_map_page(reboot_code_buffer);
+
+ /* copy it out */
+ memcpy((void *)reboot_code_buffer, relocate_new_kernel, relocate_new_kernel_size);
+
+ /* The segment registers are funny things, they are
+ * automatically loaded from a table, in memory wherever you
+ * set them to a specific selector, but this table is never
+ * accessed again you set the segment to a different selector.
+ *
+ * The more common model is are caches where the behide
+ * the scenes work is done, but is also dropped at arbitrary
+ * times.
+ *
+ * I take advantage of this here by force loading the
+ * segments, before I zap the gdt with an invalid value.
+ */
+ load_segments();
+ /* The gdt & idt are now invalid.
+ * If you want to load them you must set up your own idt & gdt.
+ */
+ set_gdt(phys_to_virt(0),0);
+ set_idt(phys_to_virt(0),0);
+
+ /* now call it */
+ rnk = (relocate_new_kernel_t) reboot_code_buffer;
+ (*rnk)(indirection_page, reboot_code_buffer, image->start, cpu_has_pae);
+}
int reboot_thru_bios;
#ifdef CONFIG_SMP
-int reboot_smp = 0;
static int reboot_cpu = -1;
/* shamelessly grabbed from lib/vsprintf.c for readability */
#define is_digit(c) ((c) >= '0' && (c) <= '9')
break;
#ifdef CONFIG_SMP
case 's': /* "smp" reboot by executing reset on BSP or other CPU*/
- reboot_smp = 1;
if (is_digit(*(str+1))) {
reboot_cpu = (int) (*(str+1) - '0');
if (is_digit(*(str+2)))
return 0;
}
-/*
- * Some machines require the "reboot=s" commandline option, this quirk makes that automatic.
- */
-static int __init set_smp_reboot(struct dmi_system_id *d)
-{
-#ifdef CONFIG_SMP
- if (!reboot_smp) {
- reboot_smp = 1;
- printk(KERN_INFO "%s series board detected. Selecting SMP-method for reboots.\n", d->ident);
- }
-#endif
- return 0;
-}
-
-/*
- * Some machines require the "reboot=b,s" commandline option, this quirk makes that automatic.
- */
-static int __init set_smp_bios_reboot(struct dmi_system_id *d)
-{
- set_smp_reboot(d);
- set_bios_reboot(d);
- return 0;
-}
-
static struct dmi_system_id __initdata reboot_dmi_table[] = {
{ /* Handle problems with rebooting on Dell 1300's */
- .callback = set_smp_bios_reboot,
+ .callback = set_bios_reboot,
.ident = "Dell PowerEdge 1300",
.matches = {
DMI_MATCH(DMI_SYS_VENDOR, "Dell Computer Corporation"),
: "i" ((void *) (0x1000 - sizeof (real_mode_switch) - 100)));
}
-void machine_restart(char * __unused)
+void machine_shutdown(void)
{
#ifdef CONFIG_SMP
- int cpuid;
-
- cpuid = GET_APIC_ID(apic_read(APIC_ID));
-
- if (reboot_smp) {
-
- /* check to see if reboot_cpu is valid
- if its not, default to the BSP */
- if ((reboot_cpu == -1) ||
- (reboot_cpu > (NR_CPUS -1)) ||
- !physid_isset(cpuid, phys_cpu_present_map))
- reboot_cpu = boot_cpu_physical_apicid;
-
- reboot_smp = 0; /* use this as a flag to only go through this once*/
- /* re-run this function on the other CPUs
- it will fall though this section since we have
- cleared reboot_smp, and do the reboot if it is the
- correct CPU, otherwise it halts. */
- if (reboot_cpu != cpuid)
- smp_call_function((void *)machine_restart , NULL, 1, 0);
+ int reboot_cpu_id;
+
+ /* The boot cpu is always logical cpu 0 */
+ reboot_cpu_id = 0;
+
+ /* See if there has been given a command line override */
+ if ((reboot_cpu_id != -1) && (reboot_cpu < NR_CPUS) &&
+ cpu_isset(reboot_cpu, cpu_online_map)) {
+ reboot_cpu_id = reboot_cpu;
}
- /* if reboot_cpu is still -1, then we want a tradional reboot,
- and if we are not running on the reboot_cpu,, halt */
- if ((reboot_cpu != -1) && (cpuid != reboot_cpu)) {
- for (;;)
- __asm__ __volatile__ ("hlt");
+ /* Make certain the cpu I'm rebooting on is online */
+ if (!cpu_isset(reboot_cpu_id, cpu_online_map)) {
+ reboot_cpu_id = smp_processor_id();
}
- /*
- * Stop all CPUs and turn off local APICs and the IO-APIC, so
- * other OSs see a clean IRQ state.
+
+ /* Make certain I only run on the appropriate processor */
+ set_cpus_allowed(current, cpumask_of_cpu(reboot_cpu_id));
+
+ /* O.K. Now that I'm on the appropriate processor, stop
+ * all of the others, and disable their local APICs.
*/
+
smp_send_stop();
#endif /* CONFIG_SMP */
#ifdef CONFIG_X86_IO_APIC
disable_IO_APIC();
#endif
+}
+
+void machine_restart(char * __unused)
+{
+ machine_shutdown();
if (!reboot_thru_bios) {
if (efi_enabled) {
--- /dev/null
+/*
+ * relocate_kernel.S - put the kernel image in place to boot
+ * Copyright (C) 2002-2004 Eric Biederman <ebiederm@xmission.com>
+ *
+ * This source code is licensed under the GNU General Public License,
+ * Version 2. See the file COPYING for more details.
+ */
+
+#include <linux/linkage.h>
+
+ /*
+ * Must be relocatable PIC code callable as a C function, that once
+ * it starts can not use the previous processes stack.
+ */
+ .globl relocate_new_kernel
+relocate_new_kernel:
+ /* read the arguments and say goodbye to the stack */
+ movl 4(%esp), %ebx /* indirection_page */
+ movl 8(%esp), %ebp /* reboot_code_buffer */
+ movl 12(%esp), %edx /* start address */
+ movl 16(%esp), %ecx /* cpu_has_pae */
+
+ /* zero out flags, and disable interrupts */
+ pushl $0
+ popfl
+
+ /* set a new stack at the bottom of our page... */
+ lea 4096(%ebp), %esp
+
+ /* store the parameters back on the stack */
+ pushl %edx /* store the start address */
+
+ /* Set cr0 to a known state:
+ * 31 0 == Paging disabled
+ * 18 0 == Alignment check disabled
+ * 16 0 == Write protect disabled
+ * 3 0 == No task switch
+ * 2 0 == Don't do FP software emulation.
+ * 0 1 == Proctected mode enabled
+ */
+ movl %cr0, %eax
+ andl $~((1<<31)|(1<<18)|(1<<16)|(1<<3)|(1<<2)), %eax
+ orl $(1<<0), %eax
+ movl %eax, %cr0
+
+ /* clear cr4 if applicable */
+ testl %ecx, %ecx
+ jz 1f
+ /* Set cr4 to a known state:
+ * Setting everything to zero seems safe.
+ */
+ movl %cr4, %eax
+ andl $0, %eax
+ movl %eax, %cr4
+
+ jmp 1f
+1:
+
+ /* Flush the TLB (needed?) */
+ xorl %eax, %eax
+ movl %eax, %cr3
+
+ /* Do the copies */
+ cld
+0: /* top, read another word for the indirection page */
+ movl %ebx, %ecx
+ movl (%ebx), %ecx
+ addl $4, %ebx
+ testl $0x1, %ecx /* is it a destination page */
+ jz 1f
+ movl %ecx, %edi
+ andl $0xfffff000, %edi
+ jmp 0b
+1:
+ testl $0x2, %ecx /* is it an indirection page */
+ jz 1f
+ movl %ecx, %ebx
+ andl $0xfffff000, %ebx
+ jmp 0b
+1:
+ testl $0x4, %ecx /* is it the done indicator */
+ jz 1f
+ jmp 2f
+1:
+ testl $0x8, %ecx /* is it the source indicator */
+ jz 0b /* Ignore it otherwise */
+ movl %ecx, %esi /* For every source page do a copy */
+ andl $0xfffff000, %esi
+
+ movl $1024, %ecx
+ rep ; movsl
+ jmp 0b
+
+2:
+
+ /* To be certain of avoiding problems with self-modifying code
+ * I need to execute a serializing instruction here.
+ * So I flush the TLB, it's handy, and not processor dependent.
+ */
+ xorl %eax, %eax
+ movl %eax, %cr3
+
+ /* set all of the registers to known values */
+ /* leave %esp alone */
+
+ xorl %eax, %eax
+ xorl %ebx, %ebx
+ xorl %ecx, %ecx
+ xorl %edx, %edx
+ xorl %esi, %esi
+ xorl %edi, %edi
+ xorl %ebp, %ebp
+ ret
+relocate_new_kernel_end:
+
+ .globl relocate_new_kernel_size
+relocate_new_kernel_size:
+ .long relocate_new_kernel_end - relocate_new_kernel
#include <asm/io_apic.h>
#include <asm/ist.h>
#include <asm/io.h>
+#include <asm/crash_dump.h>
#include "setup_arch_pre.h"
#include <bios_ebda.h>
unsigned long init_pg_tables_end __initdata = ~0UL;
int disable_pse __initdata = 0;
+unsigned int dump_enabled;
/*
* Machine setup..
if (to != command_line)
to--;
if (!memcmp(from+7, "exactmap", 8)) {
+ /* If we are doing a crash dump, we
+ * still need to know the real mem
+ * size.
+ */
+ set_saved_max_pfn();
from += 8+7;
e820.nr_map = 0;
userdef = 1;
*/
if (c == ' ' && !memcmp(from, "highmem=", 8))
highmem_pages = memparse(from+8, &from) >> PAGE_SHIFT;
+
+ if (!memcmp(from, "dump", 4))
+ dump_enabled = 1;
if (c == ' ' && !memcmp(from, "crashdump=", 10))
crashdump_addr = memparse(from+10, &from);
}
}
#endif
+
+ crash_reserve_bootmem();
+
return max_low_pfn;
}
#else
*/
apic_wait_icr_idle();
+ if (vector == CRASH_DUMP_VECTOR)
+ cfg = (cfg&~APIC_VECTOR_MASK)|APIC_DM_NMI;
+
/*
* No need to touch the target chip field
*/
cfg = __prepare_ICR(shortcut, vector);
- if (vector == DUMP_VECTOR) {
+ if (vector == CRASH_DUMP_VECTOR) {
/*
* Setup DUMP IPI to be delivered as an NMI
*/
*/
cfg = __prepare_ICR(0, vector);
- if (vector == DUMP_VECTOR) {
+ if (vector == CRASH_DUMP_VECTOR) {
/*
* Setup DUMP IPI to be delivered as an NMI
*/
void dump_send_ipi(void)
{
- send_IPI_allbutself(DUMP_VECTOR);
+ send_IPI_allbutself(CRASH_DUMP_VECTOR);
}
/*
send_IPI_mask(cpumask_of_cpu(cpu), RESCHEDULE_VECTOR);
}
+void crash_dump_send_ipi(void)
+{
+ send_IPI_allbutself(CRASH_DUMP_VECTOR);
+}
+
/*
* Structure and data for smp_call_function(). This is designed to minimise
* static memory requirements. It also looks cleaner.
* Written by Martin Mares <mj@atrey.karlin.mff.cuni.cz>;
*/
+#define LOAD_OFFSET __PAGE_OFFSET
+
#include <asm-generic/vmlinux.lds.h>
#include <asm/thread_info.h>
#include <asm/page.h>
+#include <asm/segment.h>
OUTPUT_FORMAT("elf32-i386", "elf32-i386", "elf32-i386")
OUTPUT_ARCH(i386)
-ENTRY(startup_32)
+ENTRY(phys_startup_32)
jiffies = jiffies_64;
SECTIONS
{
- . = __PAGE_OFFSET + 0x100000;
+ . = LOAD_OFFSET + KERN_PHYS_OFFSET;
+ phys_startup_32 = startup_32 - LOAD_OFFSET;
/* read-only */
_text = .; /* Text and read-only data */
- .text : {
+ .text : AT(ADDR(.text) - LOAD_OFFSET) {
*(.text)
SCHED_TEXT
LOCK_TEXT
. = ALIGN(16); /* Exception table */
__start___ex_table = .;
- __ex_table : { *(__ex_table) }
+ __ex_table : AT(ADDR(__ex_table) - LOAD_OFFSET) { *(__ex_table) }
__stop___ex_table = .;
RODATA
/* writeable */
- .data : { /* Data */
+ .data : AT(ADDR(.data) - LOAD_OFFSET) { /* Data */
*(.data)
CONSTRUCTORS
}
. = ALIGN(4096);
__nosave_begin = .;
- .data_nosave : { *(.data.nosave) }
+ .data_nosave : AT(ADDR(.data_nosave) - LOAD_OFFSET) { *(.data.nosave) }
. = ALIGN(4096);
__nosave_end = .;
. = ALIGN(4096);
- .data.page_aligned : { *(.data.idt) }
+ .data.page_aligned : AT(ADDR(.data.page_aligned) - LOAD_OFFSET) { *(.data.idt) }
. = ALIGN(32);
- .data.cacheline_aligned : { *(.data.cacheline_aligned) }
+ .data.cacheline_aligned : AT(ADDR(.data.cacheline_aligned) - LOAD_OFFSET) {
+ *(.data.cacheline_aligned)
+ }
_edata = .; /* End of data section */
. = ALIGN(THREAD_SIZE); /* init_task */
- .data.init_task : { *(.data.init_task) }
+ .data.init_task : AT(ADDR(.data.init_task) - LOAD_OFFSET) { *(.data.init_task) }
/* will be freed after init */
. = ALIGN(4096); /* Init code and data */
__init_begin = .;
- .init.text : {
+ .init.text : AT(ADDR(.init.text) - LOAD_OFFSET) {
_sinittext = .;
*(.init.text)
_einittext = .;
}
- .init.data : { *(.init.data) }
+ .init.data : AT(ADDR(.init.data) - LOAD_OFFSET) { *(.init.data) }
. = ALIGN(16);
__setup_start = .;
- .init.setup : { *(.init.setup) }
+ .init.setup : AT(ADDR(.init.setup) - LOAD_OFFSET) { *(.init.setup) }
__setup_end = .;
__initcall_start = .;
- .initcall.init : {
+ .initcall.init : AT(ADDR(.initcall.init) - LOAD_OFFSET) {
*(.initcall1.init)
*(.initcall2.init)
*(.initcall3.init)
}
__initcall_end = .;
__con_initcall_start = .;
- .con_initcall.init : { *(.con_initcall.init) }
+ .con_initcall.init : AT(ADDR(.con_initcall.init) - LOAD_OFFSET) {
+ *(.con_initcall.init)
+ }
__con_initcall_end = .;
SECURITY_INIT
. = ALIGN(4);
__alt_instructions = .;
- .altinstructions : { *(.altinstructions) }
- __alt_instructions_end = .;
- .altinstr_replacement : { *(.altinstr_replacement) }
+ .altinstructions : AT(ADDR(.altinstructions) - LOAD_OFFSET) {
+ *(.altinstructions)
+ }
+ __alt_instructions_end = .;
+ .altinstr_replacement : AT(ADDR(.altinstr_replacement) - LOAD_OFFSET) {
+ *(.altinstr_replacement)
+ }
/* .exit.text is discard at runtime, not link time, to deal with references
from .altinstructions and .eh_frame */
- .exit.text : { *(.exit.text) }
- .exit.data : { *(.exit.data) }
+ .exit.text : AT(ADDR(.exit.text) - LOAD_OFFSET) { *(.exit.text) }
+ .exit.data : AT(ADDR(.exit.data) - LOAD_OFFSET) { *(.exit.data) }
. = ALIGN(4096);
__initramfs_start = .;
- .init.ramfs : { *(.init.ramfs) }
+ .init.ramfs : AT(ADDR(.init.ramfs) - LOAD_OFFSET) { *(.init.ramfs) }
__initramfs_end = .;
. = ALIGN(32);
__per_cpu_start = .;
- .data.percpu : { *(.data.percpu) }
+ .data.percpu : AT(ADDR(.data.percpu) - LOAD_OFFSET) { *(.data.percpu) }
__per_cpu_end = .;
. = ALIGN(4096);
__init_end = .;
/* freed after init ends here */
__bss_start = .; /* BSS */
- .bss : {
- *(.bss.page_aligned)
+ .bss.page_aligned : AT(ADDR(.bss.page_aligned) - LOAD_OFFSET) {
+ *(.bss.page_aligned) }
+ .bss : AT(ADDR(.bss) - LOAD_OFFSET) {
*(.bss)
}
. = ALIGN(4);
#include <asm/e820.h>
#include <asm/setup.h>
#include <asm/mmzone.h>
+#include <asm/crash_dump.h>
#include <bios_ebda.h>
struct pglist_data *node_data[MAX_NUMNODES];
}
}
#endif
+
+ crash_reserve_bootmem();
+
return system_max_low_pfn;
}
preempt_check_resched();
}
+/* This is the same as kmap_atomic() but can map memory that doesn't
+ * have a struct page associated with it.
+ */
+char *kmap_atomic_pfn(unsigned long pfn, enum km_type type)
+{
+ enum fixed_addresses idx;
+ unsigned long vaddr;
+
+ inc_preempt_count();
+
+ idx = type + KM_TYPE_NR*smp_processor_id();
+ vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx);
+ set_pte(kmap_pte-idx, pfn_pte(pfn, kmap_prot));
+ __flush_tlb_one(vaddr);
+
+ return (char *)vaddr;
+}
+
struct page *kmap_atomic_to_page(void *ptr)
{
unsigned long idx, vaddr = (unsigned long)ptr;
pte = kmap_pte - (idx - FIX_KMAP_BEGIN);
return pte_page(*pte);
}
-
here. Saying Y here will not hurt performance (on any machine) but
will increase the size of the kernel.
+config KEXEC
+ bool "kexec system call (EXPERIMENTAL)"
+ depends on EXPERIMENTAL
+ help
+ kexec is a system call that implements the ability to shutdown your
+ current kernel, and to start another kernel. It is like a reboot
+ but it is indepedent of the system firmware. And like a reboot
+ you can start any kernel with it, not just Linux.
+
+ The name comes from the similiarity to the exec system call.
+
+ It is an ongoing process to be certain the hardware in a machine
+ is properly shutdown, so do not be surprised if this code does not
+ initially work for you. It may help to enable device hotplugging
+ support. As of this writing the exact hardware interface is
+ strongly in flux, so no good recommendation can be made.
+
+ In the GameCube implementation, kexec allows you to load and
+ run DOL files, including kernel and homebrew DOLs.
+
source "drivers/cpufreq/Kconfig"
config CPU_FREQ_PMAC
obj-$(CONFIG_SMP) += smp.o smp-tbsync.o
obj-$(CONFIG_TAU) += temp.o
obj-$(CONFIG_ALTIVEC) += vecemu.o vector.o
+obj-$(CONFIG_KEXEC) += machine_kexec.o relocate_kernel.o
ifndef CONFIG_MATH_EMULATION
obj-$(CONFIG_8xx) += softemu8xx.o
--- /dev/null
+/*
+ * machine_kexec.c - handle transition of Linux booting another kernel
+ * Copyright (C) 2002-2003 Eric Biederman <ebiederm@xmission.com>
+ *
+ * GameCube/ppc32 port Copyright (C) 2004 Albert Herranz
+ *
+ * This source code is licensed under the GNU General Public License,
+ * Version 2. See the file COPYING for more details.
+ */
+
+#include <linux/mm.h>
+#include <linux/kexec.h>
+#include <linux/delay.h>
+#include <linux/reboot.h>
+#include <asm/pgtable.h>
+#include <asm/pgalloc.h>
+#include <asm/mmu_context.h>
+#include <asm/io.h>
+#include <asm/hw_irq.h>
+#include <asm/cacheflush.h>
+#include <asm/machdep.h>
+
+typedef void (*relocate_new_kernel_t)(
+ unsigned long indirection_page, unsigned long reboot_code_buffer,
+ unsigned long start_address);
+
+const extern unsigned char relocate_new_kernel[];
+const extern unsigned int relocate_new_kernel_size;
+
+void machine_shutdown(void)
+{
+ if (ppc_md.machine_shutdown) {
+ ppc_md.machine_shutdown();
+ }
+}
+
+/*
+ * Do what every setup is needed on image and the
+ * reboot code buffer to allow us to avoid allocations
+ * later.
+ */
+int machine_kexec_prepare(struct kimage *image)
+{
+ if (ppc_md.machine_kexec_prepare) {
+ return ppc_md.machine_kexec_prepare(image);
+ }
+ /*
+ * Fail if platform doesn't provide its own machine_kexec_prepare
+ * implementation.
+ */
+ return -ENOSYS;
+}
+
+void machine_kexec_cleanup(struct kimage *image)
+{
+ if (ppc_md.machine_kexec_cleanup) {
+ ppc_md.machine_kexec_cleanup(image);
+ }
+}
+
+/*
+ * Do not allocate memory (or fail in any way) in machine_kexec().
+ * We are past the point of no return, committed to rebooting now.
+ */
+void machine_kexec(struct kimage *image)
+{
+ if (ppc_md.machine_kexec) {
+ ppc_md.machine_kexec(image);
+ } else {
+ /*
+ * Fall back to normal restart if platform doesn't provide
+ * its own kexec function, and user insist to kexec...
+ */
+ machine_restart(NULL);
+ }
+}
+
+
+/*
+ * This is a generic machine_kexec function suitable at least for
+ * non-OpenFirmware embedded platforms.
+ * It merely copies the image relocation code to the control page and
+ * jumps to it.
+ * A platform specific function may just call this one.
+ */
+void machine_kexec_simple(struct kimage *image)
+{
+ unsigned long indirection_page;
+ unsigned long reboot_code_buffer, reboot_code_buffer_phys;
+ relocate_new_kernel_t rnk;
+
+ /* Interrupts aren't acceptable while we reboot */
+ local_irq_disable();
+
+ indirection_page = image->head & PAGE_MASK;
+
+ /* we need both effective and real address here */
+ reboot_code_buffer =
+ (unsigned long)page_address(image->control_code_page);
+ reboot_code_buffer_phys = virt_to_phys((void *)reboot_code_buffer);
+
+ /* copy our kernel relocation code to the control code page */
+ memcpy((void *)reboot_code_buffer,
+ relocate_new_kernel, relocate_new_kernel_size);
+
+ flush_icache_range(reboot_code_buffer,
+ reboot_code_buffer + KEXEC_CONTROL_CODE_SIZE);
+ printk(KERN_INFO "Bye!\n");
+
+ /* now call it */
+ rnk = (relocate_new_kernel_t) reboot_code_buffer;
+ (*rnk)(indirection_page, reboot_code_buffer_phys, image->start);
+}
+
--- /dev/null
+/*
+ * relocate_kernel.S - put the kernel image in place to boot
+ * Copyright (C) 2002-2003 Eric Biederman <ebiederm@xmission.com>
+ *
+ * GameCube/ppc32 port Copyright (C) 2004 Albert Herranz
+ *
+ * This source code is licensed under the GNU General Public License,
+ * Version 2. See the file COPYING for more details.
+ */
+
+#include <asm/reg.h>
+#include <asm/ppc_asm.h>
+#include <asm/processor.h>
+
+#include <asm/kexec.h>
+
+#define PAGE_SIZE 4096 /* must be same value as in <asm/page.h> */
+
+/* returns r3 = relocated address of sym */
+/* modifies r0 */
+#define RELOC_SYM(sym) \
+ mflr r3; \
+ bl 1f; \
+1: mflr r0; \
+ mtlr r3; \
+ lis r3, 1b@ha; \
+ ori r3, r3, 1b@l; \
+ subf r0, r3, r0; \
+ lis r3, sym@ha; \
+ ori r3, r3, sym@l; \
+ add r3, r3, r0
+
+ /*
+ * Must be relocatable PIC code callable as a C function.
+ */
+ .globl relocate_new_kernel
+relocate_new_kernel:
+ /* r3 = indirection_page */
+ /* r4 = reboot_code_buffer */
+ /* r5 = start_address */
+
+ li r0, 0
+
+ /*
+ * Set Machine Status Register to a known status,
+ * switch the MMU off and jump to 1: in a single step.
+ */
+
+ mr r8, r0
+ ori r8, r8, MSR_RI|MSR_ME
+ mtspr SRR1, r8
+ addi r8, r4, 1f - relocate_new_kernel
+ mtspr SRR0, r8
+ sync
+ rfi
+
+1:
+ /* from this point address translation is turned off */
+ /* and interrupts are disabled */
+
+ /* set a new stack at the bottom of our page... */
+ /* (not really needed now) */
+ addi r1, r4, KEXEC_CONTROL_CODE_SIZE - 8 /* for LR Save+Back Chain */
+ stw r0, 0(r1)
+
+ /* Do the copies */
+ li r6, 0 /* checksum */
+ subi r3, r3, 4
+
+0: /* top, read another word for the indirection page */
+ lwzu r0, 4(r3)
+
+ /* is it a destination page? (r8) */
+ rlwinm. r7, r0, 0, 31, 31 /* IND_DESTINATION (1<<0) */
+ beq 1f
+
+ rlwinm r8, r0, 0, 0, 19 /* clear kexec flags, page align */
+ b 0b
+
+1: /* is it an indirection page? (r3) */
+ rlwinm. r7, r0, 0, 30, 30 /* IND_INDIRECTION (1<<1) */
+ beq 1f
+
+ rlwinm r3, r0, 0, 0, 19 /* clear kexec flags, page align */
+ subi r3, r3, 4
+ b 0b
+
+1: /* are we done? */
+ rlwinm. r7, r0, 0, 29, 29 /* IND_DONE (1<<2) */
+ beq 1f
+ b 2f
+
+1: /* is it a source page? (r9) */
+ rlwinm. r7, r0, 0, 28, 28 /* IND_SOURCE (1<<3) */
+ beq 0b
+
+ rlwinm r9, r0, 0, 0, 19 /* clear kexec flags, page align */
+
+ li r7, PAGE_SIZE / 4
+ mtctr r7
+ subi r9, r9, 4
+ subi r8, r8, 4
+9:
+ lwzu r0, 4(r9) /* do the copy */
+ xor r6, r6, r0
+ stwu r0, 4(r8)
+ dcbst 0, r8
+ sync
+ icbi 0, r8
+ bdnz 9b
+
+ addi r9, r9, 4
+ addi r8, r8, 4
+ b 0b
+
+2:
+
+ /* To be certain of avoiding problems with self-modifying code
+ * execute a serializing instruction here.
+ */
+ isync
+ sync
+
+ /* jump to the entry point, usually the setup routine */
+ mtlr r5
+ blrl
+
+1: b 1b
+
+relocate_new_kernel_end:
+
+ .globl relocate_new_kernel_size
+relocate_new_kernel_size:
+ .long relocate_new_kernel_end - relocate_new_kernel
+
depends on IA32_EMULATION
default y
+config KEXEC
+ bool "kexec system call (EXPERIMENTAL)"
+ depends on EXPERIMENTAL
+ help
+ kexec is a system call that implements the ability to shutdown your
+ current kernel, and to start another kernel. It is like a reboot
+ but it is indepedent of the system firmware. And like a reboot
+ you can start any kernel with it, not just Linux.
+
+ The name comes from the similiarity to the exec system call.
+
+ It is an ongoing process to be certain the hardware in a machine
+ is properly shutdown, so do not be surprised if this code does not
+ initially work for you. It may help to enable device hotplugging
+ support. As of this writing the exact hardware interface is
+ strongly in flux, so no good recommendation can be made.
+
endmenu
source drivers/Kconfig
obj-$(CONFIG_X86_LOCAL_APIC) += apic.o nmi.o
obj-$(CONFIG_X86_IO_APIC) += io_apic.o mpparse.o \
genapic.o genapic_cluster.o genapic_flat.o
+obj-$(CONFIG_KEXEC) += machine_kexec.o relocate_kernel.o
obj-$(CONFIG_PM) += suspend.o
obj-$(CONFIG_SOFTWARE_SUSPEND) += suspend_asm.o
obj-$(CONFIG_CPU_FREQ) += cpufreq/
outb(0x70, 0x22);
outb(0x00, 0x23);
}
+ else {
+ /* Go back to Virtual Wire compatibility mode */
+ unsigned long value;
+
+ /* For the spurious interrupt use vector F, and enable it */
+ value = apic_read(APIC_SPIV);
+ value &= ~APIC_VECTOR_MASK;
+ value |= APIC_SPIV_APIC_ENABLED;
+ value |= 0xf;
+ apic_write_around(APIC_SPIV, value);
+
+ /* For LVT0 make it edge triggered, active high, external and enabled */
+ value = apic_read(APIC_LVT0);
+ value &= ~(APIC_MODE_MASK | APIC_SEND_PENDING |
+ APIC_INPUT_POLARITY | APIC_LVT_REMOTE_IRR |
+ APIC_LVT_LEVEL_TRIGGER | APIC_LVT_MASKED );
+ value |= APIC_LVT_REMOTE_IRR | APIC_SEND_PENDING;
+ value = SET_APIC_DELIVERY_MODE(value, APIC_MODE_EXINT);
+ apic_write_around(APIC_LVT0, value);
+
+ /* For LVT1 make it edge triggered, active high, nmi and enabled */
+ value = apic_read(APIC_LVT1);
+ value &= ~(
+ APIC_MODE_MASK | APIC_SEND_PENDING |
+ APIC_INPUT_POLARITY | APIC_LVT_REMOTE_IRR |
+ APIC_LVT_LEVEL_TRIGGER | APIC_LVT_MASKED);
+ value |= APIC_LVT_REMOTE_IRR | APIC_SEND_PENDING;
+ value = SET_APIC_DELIVERY_MODE(value, APIC_MODE_NMI);
+ apic_write_around(APIC_LVT1, value);
+ }
}
void disable_local_APIC(void)
int i;
for (i = 0; i < e820.nr_map; i++) {
struct resource *res;
- if (e820.map[i].addr + e820.map[i].size > 0x100000000ULL)
- continue;
res = alloc_bootmem_low(sizeof(struct resource));
switch (e820.map[i].type) {
case E820_RAM: res->name = "System RAM"; break;
return 0;
}
+
+
+static int i8259A_shutdown(struct sys_device *dev)
+{
+ /* Put the i8259A into a quiescent state that
+ * the kernel initialization code can get it
+ * out of.
+ */
+ outb(0xff, 0x21); /* mask all of 8259A-1 */
+ outb(0xff, 0xA1); /* mask all of 8259A-1 */
+ return 0;
+}
+
static struct sysdev_class i8259_sysdev_class = {
set_kset_name("i8259"),
.suspend = i8259A_suspend,
.resume = i8259A_resume,
+ .shutdown = i8259A_shutdown,
};
static struct sys_device device_i8259A = {
/*
* Find the pin to which IRQ[irq] (ISA) is connected
*/
-static int __init find_isa_irq_pin(int irq, int type)
+static int find_isa_irq_pin(int irq, int type)
{
int i;
*/
void disable_IO_APIC(void)
{
+ int pin;
/*
* Clear the IO-APIC before rebooting:
*/
clear_IO_APIC();
+ /*
+ * If the i82559 is routed through an IOAPIC
+ * Put that IOAPIC in virtual wire mode
+ * so legacy interrups can be delivered.
+ */
+ pin = find_isa_irq_pin(0, mp_ExtINT);
+ if (pin != -1) {
+ struct IO_APIC_route_entry entry;
+ unsigned long flags;
+
+ memset(&entry, 0, sizeof(entry));
+ entry.mask = 0; /* Enabled */
+ entry.trigger = 0; /* Edge */
+ entry.irr = 0;
+ entry.polarity = 0; /* High */
+ entry.delivery_status = 0;
+ entry.dest_mode = 0; /* Physical */
+ entry.delivery_mode = 7; /* ExtInt */
+ entry.vector = 0;
+ entry.dest.physical.physical_dest = 0;
+
+
+ /*
+ * Add it to the IO-APIC irq-routing table:
+ */
+ spin_lock_irqsave(&ioapic_lock, flags);
+ io_apic_write(0, 0x11+2*pin, *(((int *)&entry)+1));
+ io_apic_write(0, 0x10+2*pin, *(((int *)&entry)+0));
+ spin_unlock_irqrestore(&ioapic_lock, flags);
+ }
+
disconnect_bsp_APIC();
}
--- /dev/null
+/*
+ * machine_kexec.c - handle transition of Linux booting another kernel
+ * Copyright (C) 2002-2004 Eric Biederman <ebiederm@xmission.com>
+ *
+ * This source code is licensed under the GNU General Public License,
+ * Version 2. See the file COPYING for more details.
+ */
+
+#include <linux/mm.h>
+#include <linux/kexec.h>
+#include <linux/delay.h>
+#include <linux/string.h>
+#include <linux/reboot.h>
+#include <asm/pda.h>
+#include <asm/pgtable.h>
+#include <asm/pgalloc.h>
+#include <asm/tlbflush.h>
+#include <asm/mmu_context.h>
+#include <asm/io.h>
+#include <asm/apic.h>
+#include <asm/cpufeature.h>
+#include <asm/hw_irq.h>
+
+#define LEVEL0_SIZE (1UL << 12UL)
+#define LEVEL1_SIZE (1UL << 21UL)
+#define LEVEL2_SIZE (1UL << 30UL)
+#define LEVEL3_SIZE (1UL << 39UL)
+#define LEVEL4_SIZE (1UL << 48UL)
+
+#define L0_ATTR (_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | _PAGE_DIRTY)
+#define L1_ATTR (_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | _PAGE_DIRTY | _PAGE_PSE)
+#define L2_ATTR (_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | _PAGE_DIRTY)
+#define L3_ATTR (_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | _PAGE_DIRTY)
+
+static void init_level2_page(
+ uint64_t *level2p, unsigned long addr)
+{
+ unsigned long end_addr;
+ addr &= PAGE_MASK;
+ end_addr = addr + LEVEL2_SIZE;
+ while(addr < end_addr) {
+ *(level2p++) = addr | L1_ATTR;
+ addr += LEVEL1_SIZE;
+ }
+}
+
+static int init_level3_page(struct kimage *image,
+ uint64_t *level3p, unsigned long addr, unsigned long last_addr)
+{
+ unsigned long end_addr;
+ int result;
+ result = 0;
+ addr &= PAGE_MASK;
+ end_addr = addr + LEVEL3_SIZE;
+ while((addr < last_addr) && (addr < end_addr)) {
+ struct page *page;
+ uint64_t *level2p;
+ page = kimage_alloc_control_pages(image, 0);
+ if (!page) {
+ result = -ENOMEM;
+ goto out;
+ }
+ level2p = (uint64_t *)page_address(page);
+ init_level2_page(level2p, addr);
+ *(level3p++) = __pa(level2p) | L2_ATTR;
+ addr += LEVEL2_SIZE;
+ }
+ /* clear the unused entries */
+ while(addr < end_addr) {
+ *(level3p++) = 0;
+ addr += LEVEL2_SIZE;
+ }
+out:
+ return result;
+}
+
+
+static int init_level4_page(struct kimage *image,
+ uint64_t *level4p, unsigned long addr, unsigned long last_addr)
+{
+ unsigned long end_addr;
+ int result;
+ result = 0;
+ addr &= PAGE_MASK;
+ end_addr = addr + LEVEL4_SIZE;
+ while((addr < last_addr) && (addr < end_addr)) {
+ struct page *page;
+ uint64_t *level3p;
+ page = kimage_alloc_control_pages(image, 0);
+ if (!page) {
+ result = -ENOMEM;
+ goto out;
+ }
+ level3p = (uint64_t *)page_address(page);
+ result = init_level3_page(image, level3p, addr, last_addr);
+ if (result) {
+ goto out;
+ }
+ *(level4p++) = __pa(level3p) | L3_ATTR;
+ addr += LEVEL3_SIZE;
+ }
+ /* clear the unused entries */
+ while(addr < end_addr) {
+ *(level4p++) = 0;
+ addr += LEVEL3_SIZE;
+ }
+ out:
+ return result;
+}
+
+
+static int init_pgtable(struct kimage *image, unsigned long start_pgtable)
+{
+ uint64_t *level4p;
+ level4p = (uint64_t *)__va(start_pgtable);
+ return init_level4_page(image, level4p, 0, end_pfn << PAGE_SHIFT);
+}
+
+static void set_idt(void *newidt, __u16 limit)
+{
+ unsigned char curidt[10];
+
+ /* x86-64 supports unaliged loads & stores */
+ (*(__u16 *)(curidt)) = limit;
+ (*(__u64 *)(curidt +2)) = (unsigned long)(newidt);
+
+ __asm__ __volatile__ (
+ "lidt %0\n"
+ : "=m" (curidt)
+ );
+};
+
+
+static void set_gdt(void *newgdt, __u16 limit)
+{
+ unsigned char curgdt[10];
+
+ /* x86-64 supports unaligned loads & stores */
+ (*(__u16 *)(curgdt)) = limit;
+ (*(__u64 *)(curgdt +2)) = (unsigned long)(newgdt);
+
+ __asm__ __volatile__ (
+ "lgdt %0\n"
+ : "=m" (curgdt)
+ );
+};
+
+static void load_segments(void)
+{
+ __asm__ __volatile__ (
+ "\tmovl $"STR(__KERNEL_DS)",%eax\n"
+ "\tmovl %eax,%ds\n"
+ "\tmovl %eax,%es\n"
+ "\tmovl %eax,%ss\n"
+ "\tmovl %eax,%fs\n"
+ "\tmovl %eax,%gs\n"
+ );
+#undef STR
+#undef __STR
+}
+
+typedef void (*relocate_new_kernel_t)(
+ unsigned long indirection_page, unsigned long control_code_buffer,
+ unsigned long start_address, unsigned long pgtable);
+
+const extern unsigned char relocate_new_kernel[];
+extern void relocate_new_kernel_end(void);
+const extern unsigned long relocate_new_kernel_size;
+
+int machine_kexec_prepare(struct kimage *image)
+{
+ unsigned long start_pgtable, control_code_buffer;
+ int result;
+
+ /* Calculate the offsets */
+ start_pgtable = page_to_pfn(image->control_code_page) << PAGE_SHIFT;
+ control_code_buffer = start_pgtable + 4096UL;
+
+ /* Setup the identity mapped 64bit page table */
+ result = init_pgtable(image, start_pgtable);
+ if (result) {
+ return result;
+ }
+
+ /* Place the code in the reboot code buffer */
+ memcpy(__va(control_code_buffer), relocate_new_kernel, relocate_new_kernel_size);
+
+ return 0;
+}
+
+void machine_kexec_cleanup(struct kimage *image)
+{
+ return;
+}
+
+/*
+ * Do not allocate memory (or fail in any way) in machine_kexec().
+ * We are past the point of no return, committed to rebooting now.
+ */
+void machine_kexec(struct kimage *image)
+{
+ unsigned long indirection_page;
+ unsigned long control_code_buffer;
+ unsigned long start_pgtable;
+ relocate_new_kernel_t rnk;
+
+ /* Interrupts aren't acceptable while we reboot */
+ local_irq_disable();
+
+ /* Calculate the offsets */
+ indirection_page = image->head & PAGE_MASK;
+ start_pgtable = page_to_pfn(image->control_code_page) << PAGE_SHIFT;
+ control_code_buffer = start_pgtable + 4096UL;
+
+ /* Set the low half of the page table to my identity mapped
+ * page table for kexec. Leave the high half pointing at the
+ * kernel pages. Don't bother to flush the global pages
+ * as that will happen when I fully switch to my identity mapped
+ * page table anyway.
+ */
+// memcpy(current->active_mm->pml4, __va(start_pgtable), PAGE_SIZE/2);
+ __flush_tlb();
+
+
+ /* The segment registers are funny things, they are
+ * automatically loaded from a table, in memory wherever you
+ * set them to a specific selector, but this table is never
+ * accessed again unless you set the segment to a different selector.
+ *
+ * The more common model are caches where the behide
+ * the scenes work is done, but is also dropped at arbitrary
+ * times.
+ *
+ * I take advantage of this here by force loading the
+ * segments, before I zap the gdt with an invalid value.
+ */
+ load_segments();
+ /* The gdt & idt are now invalid.
+ * If you want to load them you must set up your own idt & gdt.
+ */
+ set_gdt(phys_to_virt(0),0);
+ set_idt(phys_to_virt(0),0);
+ /* now call it */
+ rnk = (relocate_new_kernel_t) control_code_buffer;
+ (*rnk)(indirection_page, control_code_buffer, image->start, start_pgtable);
+}
[target] "b" (WARMBOOT_TRAMP));
}
-#ifdef CONFIG_SMP
-static void smp_halt(void)
-{
- int cpuid = safe_smp_processor_id();
- static int first_entry = 1;
-
- if (first_entry) {
- first_entry = 0;
- smp_call_function((void *)machine_restart, NULL, 1, 0);
- }
-
- smp_stop_cpu();
-
- /* AP calling this. Just halt */
- if (cpuid != boot_cpu_id) {
- for (;;)
- asm("hlt");
- }
-
- /* Wait for all other CPUs to have run smp_stop_cpu */
- while (!cpus_empty(cpu_online_map))
- rep_nop();
-}
-#endif
-
static inline void kb_wait(void)
{
int i;
-
+
for (i=0; i<0x10000; i++)
if ((inb_p(0x64) & 0x02) == 0)
break;
}
+
+void machine_shutdown(void)
+{
+ /* Stop the cpus and apics */
+#ifdef CONFIG_SMP
+ int reboot_cpu_id;
+
+ /* The boot cpu is always logical cpu 0 */
+ reboot_cpu_id = 0;
+
+ /* Make certain the cpu I'm about to reboot on is online */
+ if (!cpu_isset(reboot_cpu_id, cpu_online_map)) {
+ reboot_cpu_id = smp_processor_id();
+ }
+
+ /* Make certain I only run on the appropriate processor */
+ set_cpus_allowed(current, cpumask_of_cpu(reboot_cpu_id));
+
+ /* O.K Now that I'm on the appropriate processor,
+ * stop all of the others.
+ */
+ smp_send_stop();
+#endif
+
+ local_irq_disable();
+
+#ifndef CONFIG_SMP
+ disable_local_APIC();
+#endif
+
+ disable_IO_APIC();
+
+ local_irq_enable();
+}
void machine_restart(char * __unused)
{
int i;
-#ifdef CONFIG_SMP
- smp_halt();
-#endif
+ machine_shutdown();
local_irq_disable();
--- /dev/null
+/*
+ * relocate_kernel.S - put the kernel image in place to boot
+ * Copyright (C) 2002-2004 Eric Biederman <ebiederm@xmission.com>
+ *
+ * This source code is licensed under the GNU General Public License,
+ * Version 2. See the file COPYING for more details.
+ */
+
+#include <linux/linkage.h>
+
+ /*
+ * Must be relocatable PIC code callable as a C function, that once
+ * it starts can not use the previous processes stack.
+ */
+ .globl relocate_new_kernel
+ .code64
+relocate_new_kernel:
+ /* %rdi indirection_page
+ * %rsi reboot_code_buffer
+ * %rdx start address
+ * %rcx page_table
+ * %r8 arg5
+ * %r9 arg6
+ */
+
+ /* zero out flags, and disable interrupts */
+ pushq $0
+ popfq
+
+ /* set a new stack at the bottom of our page... */
+ lea 4096(%rsi), %rsp
+
+ /* store the parameters back on the stack */
+ pushq %rdx /* store the start address */
+
+ /* Set cr0 to a known state:
+ * 31 1 == Paging enabled
+ * 18 0 == Alignment check disabled
+ * 16 0 == Write protect disabled
+ * 3 0 == No task switch
+ * 2 0 == Don't do FP software emulation.
+ * 0 1 == Proctected mode enabled
+ */
+ movq %cr0, %rax
+ andq $~((1<<18)|(1<<16)|(1<<3)|(1<<2)), %rax
+ orl $((1<<31)|(1<<0)), %eax
+ movq %rax, %cr0
+
+ /* Set cr4 to a known state:
+ * 10 0 == xmm exceptions disabled
+ * 9 0 == xmm registers instructions disabled
+ * 8 0 == performance monitoring counter disabled
+ * 7 0 == page global disabled
+ * 6 0 == machine check exceptions disabled
+ * 5 1 == physical address extension enabled
+ * 4 0 == page size extensions disabled
+ * 3 0 == Debug extensions disabled
+ * 2 0 == Time stamp disable (disabled)
+ * 1 0 == Protected mode virtual interrupts disabled
+ * 0 0 == VME disabled
+ */
+
+ movq $((1<<5)), %rax
+ movq %rax, %cr4
+
+ jmp 1f
+1:
+
+ /* Switch to the identity mapped page tables,
+ * and flush the TLB.
+ */
+ movq %rcx, %cr3
+
+ /* Do the copies */
+ movq %rdi, %rbx /* Put the indirection page in %rbx */
+ xorq %rdi, %rdi
+ xorq %rsi, %rsi
+
+0: /* top, read another word for the indirection page */
+
+ movq (%rbx), %rcx
+ addq $8, %rbx
+ testq $0x1, %rcx /* is it a destination page? */
+ jz 1f
+ movq %rcx, %rdi
+ andq $0xfffffffffffff000, %rdi
+ jmp 0b
+1:
+ testq $0x2, %rcx /* is it an indirection page? */
+ jz 1f
+ movq %rcx, %rbx
+ andq $0xfffffffffffff000, %rbx
+ jmp 0b
+1:
+ testq $0x4, %rcx /* is it the done indicator? */
+ jz 1f
+ jmp 2f
+1:
+ testq $0x8, %rcx /* is it the source indicator? */
+ jz 0b /* Ignore it otherwise */
+ movq %rcx, %rsi /* For ever source page do a copy */
+ andq $0xfffffffffffff000, %rsi
+
+ movq $512, %rcx
+ rep ; movsq
+ jmp 0b
+2:
+
+ /* To be certain of avoiding problems with self-modifying code
+ * I need to execute a serializing instruction here.
+ * So I flush the TLB by reloading %cr3 here, it's handy,
+ * and not processor dependent.
+ */
+ movq %cr3, %rax
+ movq %rax, %cr3
+
+ /* set all of the registers to known values */
+ /* leave %rsp alone */
+
+ xorq %rax, %rax
+ xorq %rbx, %rbx
+ xorq %rcx, %rcx
+ xorq %rdx, %rdx
+ xorq %rsi, %rsi
+ xorq %rdi, %rdi
+ xorq %rbp, %rbp
+ xorq %r8, %r8
+ xorq %r9, %r9
+ xorq %r10, %r9
+ xorq %r11, %r11
+ xorq %r12, %r12
+ xorq %r13, %r13
+ xorq %r14, %r14
+ xorq %r15, %r15
+
+ ret
+relocate_new_kernel_end:
+
+ .globl relocate_new_kernel_size
+relocate_new_kernel_size:
+ .quad relocate_new_kernel_end - relocate_new_kernel
CONFIG_MTRR=y
# CONFIG_EFI is not set
CONFIG_REGPARM=y
+CONFIG_KERN_PHYS_OFFSET=1
+CONFIG_KEXEC=y
+# CONFIG_CRASH_DUMP is not set
#
# Power management options (ACPI, APM)
#include <linux/devfs_fs_kernel.h>
#include <linux/ptrace.h>
#include <linux/device.h>
+#include <linux/highmem.h>
+#include <linux/crash_dump.h>
#include <asm/uaccess.h>
#include <asm/io.h>
return 0;
}
+#ifdef CONFIG_CRASH_DUMP
+/*
+ * Read memory corresponding to the old kernel.
+ * If we are reading from the reserved section, which is
+ * actually used by the current kernel, we just return zeroes.
+ * Or if we are reading from the first 640k, we return from the
+ * backed up area.
+ */
+static ssize_t read_oldmem(struct file * file, char * buf,
+ size_t count, loff_t *ppos)
+{
+ unsigned long pfn;
+ unsigned backup_start, backup_end, relocate_start;
+ size_t read=0, csize;
+
+ backup_start = CRASH_BACKUP_BASE / PAGE_SIZE;
+ backup_end = backup_start + (CRASH_BACKUP_SIZE / PAGE_SIZE);
+ relocate_start = (CRASH_BACKUP_BASE + CRASH_BACKUP_SIZE) / PAGE_SIZE;
+
+ while(count) {
+ pfn = *ppos / PAGE_SIZE;
+
+ csize = (count > PAGE_SIZE) ? PAGE_SIZE : count;
+
+ /* Perform translation (see comment above) */
+ if ((pfn >= backup_start) && (pfn < backup_end)) {
+ if (clear_user(buf, csize)) {
+ read = -EFAULT;
+ goto done;
+ }
+
+ goto copy_done;
+ } else if (pfn < (CRASH_RELOCATE_SIZE / PAGE_SIZE))
+ pfn += relocate_start;
+
+ if (pfn > saved_max_pfn) {
+ read = 0;
+ goto done;
+ }
+
+ if (copy_oldmem_page(pfn, buf, csize, 1)) {
+ read = -EFAULT;
+ goto done;
+ }
+
+copy_done:
+ buf += csize;
+ *ppos += csize;
+ read += csize;
+ count -= csize;
+ }
+done:
+ return read;
+}
+#endif
+
extern long vread(char *buf, char *addr, unsigned long count);
extern long vwrite(char *buf, char *addr, unsigned long count);
#define read_full read_zero
#define open_mem open_port
#define open_kmem open_mem
+#define open_oldmem open_mem
static struct file_operations mem_fops = {
.llseek = memory_lseek,
.write = write_full,
};
+#ifdef CONFIG_CRASH_DUMP
+static struct file_operations oldmem_fops = {
+ .read = read_oldmem,
+ .open = open_oldmem,
+};
+#endif
+
static ssize_t kmsg_write(struct file * file, const char __user * buf,
size_t count, loff_t *ppos)
{
case 11:
filp->f_op = &kmsg_fops;
break;
+#ifdef CONFIG_CRASH_DUMP
+ case 12:
+ filp->f_op = &oldmem_fops;
+ break;
+#endif
default:
return -ENXIO;
}
{8, "random", S_IRUGO | S_IWUSR, &random_fops},
{9, "urandom", S_IRUGO | S_IWUSR, &urandom_fops},
{11,"kmsg", S_IRUGO | S_IWUSR, &kmsg_fops},
+#ifdef CONFIG_CRASH_DUMP
+ {12,"oldmem", S_IRUSR | S_IWUSR | S_IRGRP, &oldmem_fops},
+#endif
};
static struct class_simple *mem_class;
buf += sizeof(struct __dump_page);
while (len) {
- addr = kmap_atomic(page, KM_DUMP);
+ addr = kmap_atomic(page, KM_CRASHDUMP);
size = bytes = (len > PAGE_SIZE) ? PAGE_SIZE : len;
/* check for compression */
if (dump_allow_compress(page, bytes)) {
size = bytes;
}
/* memset(buf, 'A', size); temporary: testing only !! */
- kunmap_atomic(addr, KM_DUMP);
+ kunmap_atomic(addr, KM_CRASHDUMP);
dp->dp_size += size;
buf += size;
len -= bytes;
free_dha_stack();
}
-extern int pfn_is_ram(unsigned long);
+extern int page_is_ram(unsigned long);
/*
* Name: __dump_page_valid()
if (!pfn_valid(index))
return 0;
- return pfn_is_ram(index);
+ return page_is_ram(index);
}
/*
pr_debug("indirect map[%d] = 0x%lx\n", i, map1[i]);
page = pfn_to_page(map1[i]);
set_page_count(page, 1);
- map2 = kmap_atomic(page, KM_DUMP);
+ map2 = kmap_atomic(page, KM_CRASHDUMP);
for (j = 0 ; (j < DUMP_MAP_SZ) && map2[j] &&
(off + j < last); j++) {
pr_debug("\t map[%d][%d] = 0x%lx\n", i, j,
}
if (page)
- map = kmap_atomic(page, KM_DUMP);
+ map = kmap_atomic(page, KM_CRASHDUMP);
else
return NULL;
} else {
page = NULL;
}
- kunmap_atomic(map, KM_DUMP);
+ kunmap_atomic(map, KM_CRASHDUMP);
return page;
}
};
if (*dev->curr_map) {
- map = kmap_atomic(pfn_to_page(*dev->curr_map), KM_DUMP);
+ map = kmap_atomic(pfn_to_page(*dev->curr_map), KM_CRASHDUMP);
if (map[i])
page = pfn_to_page(map[i]);
- kunmap_atomic(map, KM_DUMP);
+ kunmap_atomic(map, KM_CRASHDUMP);
dev->ddev.curr_offset += PAGE_SIZE;
};
/* add data space */
i = dev->curr_map_offset;
map_page = pfn_to_page(*dev->curr_map);
- map = (unsigned long *)kmap_atomic(map_page, KM_DUMP);
+ map = (unsigned long *)kmap_atomic(map_page, KM_CRASHDUMP);
map[i] = page_to_pfn(page);
- kunmap_atomic(map, KM_DUMP);
+ kunmap_atomic(map, KM_CRASHDUMP);
dev->curr_map_offset = ++i;
dev->last_offset += PAGE_SIZE;
if (i >= DUMP_MAP_SZ) {
page = dump_mem_lookup(dump_mdev, dev->curr_offset >> PAGE_SHIFT);
for (n = len; (n > 0) && page; n -= PAGE_SIZE, buf += PAGE_SIZE ) {
- addr = kmap_atomic(page, KM_DUMP);
+ addr = kmap_atomic(page, KM_CRASHDUMP);
/* memset(addr, 'x', PAGE_SIZE); */
memcpy(addr, buf, PAGE_SIZE);
- kunmap_atomic(addr, KM_DUMP);
+ kunmap_atomic(addr, KM_CRASHDUMP);
/* dev->curr_offset += PAGE_SIZE; */
page = dump_mem_next_page(dump_mdev);
}
else
count++;
/* clear the contents of page */
- /* fixme: consider using KM_DUMP instead */
+ /* fixme: consider using KM_CRASHDUMP instead */
clear_highpage(page);
}
void *addr;
while (len < sz) {
- addr = kmap_atomic(page, KM_DUMP);
+ addr = kmap_atomic(page, KM_CRASHDUMP);
bytes = (sz > len + PAGE_SIZE) ? PAGE_SIZE : sz - len;
memcpy(buf, addr, bytes);
- kunmap_atomic(addr, KM_DUMP);
+ kunmap_atomic(addr, KM_CRASHDUMP);
buf += bytes;
len += bytes;
page++;
dump_sysrq_register(void)
{
#ifdef CONFIG_MAGIC_SYSRQ
- __sysrq_lock_table();
- __sysrq_put_key_op(DUMP_SYSRQ_KEY, &sysrq_crashdump_op);
- __sysrq_unlock_table();
+ register_sysrq_key(DUMP_SYSRQ_KEY, &sysrq_crashdump_op);
#endif
}
dump_sysrq_unregister(void)
{
#ifdef CONFIG_MAGIC_SYSRQ
- __sysrq_lock_table();
- if (__sysrq_get_key_op(DUMP_SYSRQ_KEY) == &sysrq_crashdump_op)
- __sysrq_put_key_op(DUMP_SYSRQ_KEY, NULL);
- __sysrq_unlock_table();
+ unregister_sysrq_key(DUMP_SYSRQ_KEY, &sysrq_crashdump_op);
#endif
}
* (Note: this routine is intended to be called only
* from a kernel thread context)
*/
-static void use_mm(struct mm_struct *mm)
+void use_mm(struct mm_struct *mm)
{
struct mm_struct *active_mm;
struct task_struct *tsk = current;
kmsg.o proc_tty.o proc_misc.o
proc-$(CONFIG_PROC_KCORE) += kcore.o
+proc-$(CONFIG_CRASH_DUMP) += vmcore.o
proc-$(CONFIG_PROC_DEVICETREE) += proc_devtree.o
/*
* determine size of ELF note
*/
-static int notesize(struct memelfnote *en)
+int notesize(struct memelfnote *en)
{
int sz;
/*
* store a note in the header buffer
*/
-static char *storenote(struct memelfnote *men, char *bufp)
+char *storenote(struct memelfnote *men, char *bufp)
{
struct elf_note en;
* store an ELF coredump header in the supplied buffer
* nphdr is the number of elf_phdr to insert
*/
-static void elf_kcore_store_hdr(char *bufp, int nphdr, int dataoff)
+void elf_kcore_store_hdr(char *bufp, int nphdr, int dataoff, struct kcore_list *clist)
{
struct elf_prstatus prstatus; /* NT_PRSTATUS */
struct elf_prpsinfo prpsinfo; /* NT_PRPSINFO */
nhdr->p_align = 0;
/* setup ELF PT_LOAD program header for every area */
- for (m=kclist; m; m=m->next) {
+ for (m=clist; m; m=m->next) {
phdr = (struct elf_phdr *) bufp;
bufp += sizeof(struct elf_phdr);
offset += sizeof(struct elf_phdr);
return -ENOMEM;
}
memset(elf_buf, 0, elf_buflen);
- elf_kcore_store_hdr(elf_buf, nphdr, elf_buflen);
+ elf_kcore_store_hdr(elf_buf, nphdr, elf_buflen, kclist);
read_unlock(&kclist_lock);
if (copy_to_user(buffer, elf_buf + *fpos, tsz)) {
kfree(elf_buf);
#include <linux/jiffies.h>
#include <linux/sysrq.h>
#include <linux/vmalloc.h>
+#include <linux/crash_dump.h>
#include <linux/vs_base.h>
#include <linux/vs_cvirt.h>
(size_t)high_memory - PAGE_OFFSET + PAGE_SIZE;
}
#endif
+ crash_create_proc_entry();
#ifdef CONFIG_MAGIC_SYSRQ
entry = create_proc_entry("sysrq-trigger", S_IWUSR, NULL);
if (entry)
entry->proc_fops = &proc_sysrq_trigger_operations;
#endif
+ crash_enable_by_proc();
#ifdef CONFIG_PPC32
{
extern struct file_operations ppc_htab_operations;
--- /dev/null
+/*
+ * fs/proc/vmcore.c Interface for accessing the crash
+ * dump from the system's previous life.
+ * Heavily borrowed from fs/proc/kcore.c
+ * Created by: Hariprasad Nellitheertha (hari@in.ibm.com)
+ * Copyright (C) IBM Corporation, 2004. All rights reserved
+ */
+
+#include <linux/config.h>
+#include <linux/mm.h>
+#include <linux/proc_fs.h>
+#include <linux/user.h>
+#include <linux/a.out.h>
+#include <linux/elf.h>
+#include <linux/elfcore.h>
+#include <linux/vmalloc.h>
+#include <linux/proc_fs.h>
+#include <linux/highmem.h>
+#include <linux/bootmem.h>
+#include <linux/init.h>
+#include <linux/crash_dump.h>
+#include <asm/uaccess.h>
+#include <asm/io.h>
+
+/* This is to re-use the kcore header creation code */
+static struct kcore_list vmcore_mem;
+
+static int open_vmcore(struct inode * inode, struct file * filp)
+{
+ return 0;
+}
+
+static ssize_t read_vmcore(struct file *,char __user *,size_t, loff_t *);
+
+#define BACKUP_START CRASH_BACKUP_BASE
+#define BACKUP_END CRASH_BACKUP_BASE + CRASH_BACKUP_SIZE
+#define REG_SIZE sizeof(elf_gregset_t)
+
+struct file_operations proc_vmcore_operations = {
+ .read = read_vmcore,
+ .open = open_vmcore,
+};
+
+struct proc_dir_entry *proc_vmcore;
+
+struct memelfnote
+{
+ const char *name;
+ int type;
+ unsigned int datasz;
+ void *data;
+};
+
+static size_t get_vmcore_size(int *nphdr, size_t *elf_buflen)
+{
+ size_t size;
+
+ /* We need 1 PT_LOAD segment headers
+ * In addition, we need one PT_NOTE header
+ */
+ *nphdr = 2;
+ size = (size_t)(saved_max_pfn << PAGE_SHIFT);
+
+ *elf_buflen = sizeof(struct elfhdr) +
+ (*nphdr + 2)*sizeof(struct elf_phdr) +
+ 3 * sizeof(struct memelfnote) +
+ sizeof(struct elf_prstatus) +
+ sizeof(struct elf_prpsinfo) +
+ sizeof(struct task_struct);
+ *elf_buflen = PAGE_ALIGN(*elf_buflen);
+ return size + *elf_buflen;
+}
+
+/*
+ * Reads a page from the oldmem device from given offset.
+ */
+static ssize_t read_from_oldmem(char *buf, size_t count,
+ loff_t *ppos, int userbuf)
+{
+ unsigned long pfn;
+ size_t read = 0;
+
+ pfn = (unsigned long)(*ppos / PAGE_SIZE);
+
+ if (pfn > saved_max_pfn) {
+ read = -EINVAL;
+ goto done;
+ }
+
+ count = (count > PAGE_SIZE) ? PAGE_SIZE : count;
+
+ if (copy_oldmem_page(pfn, buf, count, userbuf)) {
+ read = -EFAULT;
+ goto done;
+ }
+
+ *ppos += count;
+done:
+ return read;
+}
+
+/*
+ * store an ELF crash dump header in the supplied buffer
+ * nphdr is the number of elf_phdr to insert
+ */
+static void elf_vmcore_store_hdr(char *bufp, int nphdr, int dataoff)
+{
+ struct elf_prstatus prstatus; /* NT_PRSTATUS */
+ struct memelfnote notes[1];
+ char reg_buf[REG_SIZE];
+ loff_t reg_ppos;
+ char *buf = bufp;
+
+ vmcore_mem.addr = (unsigned long)__va(0);
+ vmcore_mem.size = saved_max_pfn << PAGE_SHIFT;
+ vmcore_mem.next = NULL;
+
+ /* Re-use the kcore code */
+ elf_kcore_store_hdr(bufp, nphdr, dataoff, &vmcore_mem);
+ buf += sizeof(struct elfhdr) + 2*sizeof(struct elf_phdr);
+
+ /* set up the process status */
+ notes[0].name = "CORE";
+ notes[0].type = NT_PRSTATUS;
+ notes[0].datasz = sizeof(struct elf_prstatus);
+ notes[0].data = &prstatus;
+
+ memset(&prstatus, 0, sizeof(struct elf_prstatus));
+
+ /* 1 - Get the registers from the reserved memory area */
+ reg_ppos = BACKUP_END + CRASH_RELOCATE_SIZE;
+ read_from_oldmem(reg_buf, REG_SIZE, ®_ppos, 0);
+ elf_core_copy_regs(&prstatus.pr_reg, (struct pt_regs *)reg_buf);
+ buf = storenote(¬es[0], buf);
+}
+
+/*
+ * read from the ELF header and then the crash dump
+ */
+static ssize_t read_vmcore(
+struct file *file, char __user *buffer, size_t buflen, loff_t *fpos)
+{
+ ssize_t acc = 0;
+ size_t size, tsz;
+ size_t elf_buflen;
+ int nphdr;
+ unsigned long start;
+
+ tsz = get_vmcore_size(&nphdr, &elf_buflen);
+ proc_vmcore->size = size = tsz + elf_buflen;
+ if (buflen == 0 || *fpos >= size) {
+ goto done;
+ }
+
+ /* trim buflen to not go beyond EOF */
+ if (buflen > size - *fpos)
+ buflen = size - *fpos;
+
+ /* construct an ELF core header if we'll need some of it */
+ if (*fpos < elf_buflen) {
+ char * elf_buf;
+
+ tsz = elf_buflen - *fpos;
+ if (buflen < tsz)
+ tsz = buflen;
+ elf_buf = kmalloc(elf_buflen, GFP_ATOMIC);
+ if (!elf_buf) {
+ acc = -ENOMEM;
+ goto done;
+ }
+ memset(elf_buf, 0, elf_buflen);
+ elf_vmcore_store_hdr(elf_buf, nphdr, elf_buflen);
+ if (copy_to_user(buffer, elf_buf + *fpos, tsz)) {
+ kfree(elf_buf);
+ acc = -EFAULT;
+ goto done;
+ }
+ kfree(elf_buf);
+ buflen -= tsz;
+ *fpos += tsz;
+ buffer += tsz;
+ acc += tsz;
+
+ /* leave now if filled buffer already */
+ if (buflen == 0) {
+ goto done;
+ }
+ }
+
+ start = *fpos - elf_buflen;
+ if ((tsz = (PAGE_SIZE - (start & ~PAGE_MASK))) > buflen)
+ tsz = buflen;
+
+ while (buflen) {
+ unsigned long p;
+ loff_t pdup;
+
+ if ((start < 0) || (start >= size))
+ if (clear_user(buffer, tsz)) {
+ acc = -EFAULT;
+ goto done;
+ }
+
+ /* tsz contains actual len of dump to be read.
+ * buflen is the total len that was requested.
+ * This may contain part of ELF header. start
+ * is the fpos for the oldmem region
+ * If the file position corresponds to the second
+ * kernel's memory, we just return zeroes
+ */
+ p = start;
+ if ((p >= BACKUP_START) && (p < BACKUP_END)) {
+ if (clear_user(buffer, tsz)) {
+ acc = -EFAULT;
+ goto done;
+ }
+
+ goto read_done;
+ } else if (p < CRASH_RELOCATE_SIZE)
+ p += BACKUP_END;
+
+ pdup = p;
+ if (read_from_oldmem(buffer, tsz, &pdup, 1)) {
+ acc = -EINVAL;
+ goto done;
+ }
+
+read_done:
+ buflen -= tsz;
+ *fpos += tsz;
+ buffer += tsz;
+ acc += tsz;
+ start += tsz;
+ tsz = (buflen > PAGE_SIZE ? PAGE_SIZE : buflen);
+ }
+
+done:
+ return acc;
+}
}
#define SECURITY_INIT \
- .security_initcall.init : { \
+ .security_initcall.init : AT(ADDR(.security_initcall.init) - LOAD_OFFSET) {\
VMLINUX_SYMBOL(__security_initcall_start) = .; \
*(.security_initcall.init) \
VMLINUX_SYMBOL(__security_initcall_end) = .; \
#define APIC_LVT_REMOTE_IRR (1<<14)
#define APIC_INPUT_POLARITY (1<<13)
#define APIC_SEND_PENDING (1<<12)
+#define APIC_MODE_MASK 0x700
#define GET_APIC_DELIVERY_MODE(x) (((x)>>8)&0x7)
#define SET_APIC_DELIVERY_MODE(x,y) (((x)&~0x700)|((y)<<8))
#define APIC_MODE_FIXED 0x0
--- /dev/null
+/* asm-i386/crash_dump.h */
+#include <linux/bootmem.h>
+#include <linux/irq.h>
+#include <asm/apic.h>
+
+#ifdef CONFIG_CRASH_DUMP
+extern unsigned int dump_enabled;
+extern unsigned int crashed;
+
+extern void __crash_relocate_mem(unsigned long, unsigned long);
+extern unsigned long __init find_max_low_pfn(void);
+extern void __init find_max_pfn(void);
+
+extern struct pt_regs crash_smp_regs[NR_CPUS];
+extern long crash_smp_current_task[NR_CPUS];
+extern void crash_dump_save_this_cpu(struct pt_regs *, int);
+extern void __crash_dump_stop_cpus(void);
+extern void crash_get_current_regs(struct pt_regs *regs);
+
+#define CRASH_BACKUP_BASE ((unsigned long)CONFIG_BACKUP_BASE * 0x100000)
+#define CRASH_BACKUP_SIZE ((unsigned long)CONFIG_BACKUP_SIZE * 0x100000)
+#define CRASH_RELOCATE_SIZE 0xa0000
+
+static inline void crash_relocate_mem(void)
+{
+ if (crashed)
+ __crash_relocate_mem(CRASH_BACKUP_BASE + CRASH_BACKUP_SIZE,
+ CRASH_RELOCATE_SIZE);
+}
+
+static inline void set_saved_max_pfn(void)
+{
+ find_max_pfn();
+ saved_max_pfn = find_max_low_pfn();
+}
+
+static inline void crash_reserve_bootmem(void)
+{
+ if (!dump_enabled) {
+ reserve_bootmem(CRASH_BACKUP_BASE,
+ CRASH_BACKUP_SIZE + CRASH_RELOCATE_SIZE + PAGE_SIZE);
+ }
+}
+
+static inline void crash_dump_stop_cpus(void)
+{
+ int cpu;
+
+ if (!crashed)
+ return;
+
+ cpu = smp_processor_id();
+
+ crash_smp_current_task[cpu] = (long)current;
+ crash_get_current_regs(&crash_smp_regs[cpu]);
+
+ /* This also captures the register states of the other cpus */
+ __crash_dump_stop_cpus();
+#if defined(CONFIG_X86_IO_APIC)
+ disable_IO_APIC();
+#endif
+#if defined(CONFIG_X86_LOCAL_APIC)
+ disconnect_bsp_APIC();
+#endif
+}
+
+static inline void crash_dump_save_registers(void)
+{
+ void *addr;
+
+ addr = __va(CRASH_BACKUP_BASE + CRASH_BACKUP_SIZE + CRASH_RELOCATE_SIZE);
+ memcpy(addr, crash_smp_regs, (sizeof(struct pt_regs)*NR_CPUS));
+ addr += sizeof(struct pt_regs)*NR_CPUS;
+ memcpy(addr, crash_smp_current_task, (sizeof(long)*NR_CPUS));
+}
+#else
+#define crash_relocate_mem() do { } while(0)
+#define set_saved_max_pfn() do { } while(0)
+#define crash_reserve_bootmem() do { } while(0)
+#define crash_dump_stop_cpus() do { } while(0)
+#define crash_dump_save_registers() do { } while(0)
+#endif
void kunmap(struct page *page);
void *kmap_atomic(struct page *page, enum km_type type);
void kunmap_atomic(void *kvaddr, enum km_type type);
+char *kmap_atomic_pfn(unsigned long pfn, enum km_type type);
struct page *kmap_atomic_to_page(void *ptr);
#define flush_cache_kmaps() do { } while (0)
--- /dev/null
+#ifndef _I386_KEXEC_H
+#define _I386_KEXEC_H
+
+#include <asm/fixmap.h>
+
+/*
+ * KEXEC_SOURCE_MEMORY_LIMIT maximum page get_free_page can return.
+ * I.e. Maximum page that is mapped directly into kernel memory,
+ * and kmap is not required.
+ *
+ * Someone correct me if FIXADDR_START - PAGEOFFSET is not the correct
+ * calculation for the amount of memory directly mappable into the
+ * kernel memory space.
+ */
+
+/* Maximum physical address we can use pages from */
+#define KEXEC_SOURCE_MEMORY_LIMIT (-1UL)
+/* Maximum address we can reach in physical address mode */
+#define KEXEC_DESTINATION_MEMORY_LIMIT (-1UL)
+/* Maximum address we can use for the control code buffer */
+#define KEXEC_CONTROL_MEMORY_LIMIT TASK_SIZE
+
+#define KEXEC_CONTROL_CODE_SIZE 4096
+
+#endif /* _I386_KEXEC_H */
#define INVALIDATE_TLB_VECTOR 0xfd
#define RESCHEDULE_VECTOR 0xfc
#define CALL_FUNCTION_VECTOR 0xfb
-#define DUMP_VECTOR 0xfa
+#define CRASH_DUMP_VECTOR 0xfa
#define THERMAL_APIC_VECTOR 0xf0
/*
extern void smp_invalidate_rcv(void); /* Process an NMI */
extern void (*mtrr_hook) (void);
extern void zap_low_mappings (void);
+extern void stop_this_cpu(void *);
#define MAX_APICID 256
extern u8 x86_cpu_to_apicid[];
--- /dev/null
+#ifndef _PPC_KEXEC_H
+#define _PPC_KEXEC_H
+
+#ifdef CONFIG_KEXEC
+
+/*
+ * KEXEC_SOURCE_MEMORY_LIMIT maximum page get_free_page can return.
+ * I.e. Maximum page that is mapped directly into kernel memory,
+ * and kmap is not required.
+ *
+ * Someone correct me if FIXADDR_START - PAGEOFFSET is not the correct
+ * calculation for the amount of memory directly mappable into the
+ * kernel memory space.
+ */
+
+/* Maximum physical address we can use pages from */
+#define KEXEC_SOURCE_MEMORY_LIMIT (-1UL)
+/* Maximum address we can reach in physical address mode */
+#define KEXEC_DESTINATION_MEMORY_LIMIT (-1UL)
+/* Maximum address we can use for the control code buffer */
+#define KEXEC_CONTROL_MEMORY_LIMIT TASK_SIZE
+
+#define KEXEC_CONTROL_CODE_SIZE 4096
+
+
+#ifndef __ASSEMBLY__
+
+struct kimage;
+
+extern void machine_kexec_simple(struct kimage *image);
+
+#endif /* __ASSEMBLY__ */
+
+#endif /* CONFIG_KEXEC */
+
+#endif /* _PPC_KEXEC_H */
#include <linux/config.h>
#include <linux/init.h>
+#include <linux/kexec.h>
#include <asm/setup.h>
/* functions for dealing with other cpus */
struct smp_ops_t *smp_ops;
#endif /* CONFIG_SMP */
+
+#ifdef CONFIG_KEXEC
+ /* Called to shutdown machine specific hardware not already controlled
+ * by other drivers.
+ * XXX Should we move this one out of kexec scope?
+ */
+ void (*machine_shutdown)(void);
+
+ /* Called to do what every setup is needed on image and the
+ * reboot code buffer. Returns 0 on success.
+ * Provide your own (maybe dummy) implementation if your platform
+ * claims to support kexec.
+ */
+ int (*machine_kexec_prepare)(struct kimage *image);
+
+ /* Called to handle any machine specific cleanup on image */
+ void (*machine_kexec_cleanup)(struct kimage *image);
+
+ /* Called to perform the _real_ kexec.
+ * Do NOT allocate memory or fail here. We are past the point of
+ * no return.
+ */
+ void (*machine_kexec)(struct kimage *image);
+#endif /* CONFIG_KEXEC */
};
extern struct machdep_calls ppc_md;
--- /dev/null
+#ifndef _X86_64_KEXEC_H
+#define _X86_64_KEXEC_H
+
+#include <asm/page.h>
+#include <asm/proto.h>
+
+/*
+ * KEXEC_SOURCE_MEMORY_LIMIT maximum page get_free_page can return.
+ * I.e. Maximum page that is mapped directly into kernel memory,
+ * and kmap is not required.
+ *
+ * So far x86_64 is limited to 40 physical address bits.
+ */
+
+/* Maximum physical address we can use pages from */
+#define KEXEC_SOURCE_MEMORY_LIMIT (0xFFFFFFFFFFUL)
+/* Maximum address we can reach in physical address mode */
+#define KEXEC_DESTINATION_MEMORY_LIMIT (0xFFFFFFFFFFUL)
+/* Maximum address we can use for the control pages */
+#define KEXEC_CONTROL_MEMORY_LIMIT (0xFFFFFFFFFFUL)
+
+/* Allocate one page for the pdp and the second for the code */
+#define KEXEC_CONTROL_CODE_SIZE (4096UL + 4096UL)
+
+#endif /* _X86_64_KEXEC_H */
#define __NR_mq_getsetattr 245
__SYSCALL(__NR_mq_getsetattr, sys_mq_getsetattr)
#define __NR_kexec_load 246
-__SYSCALL(__NR_kexec_load, sys_ni_syscall)
+__SYSCALL(__NR_kexec_load, sys_kexec_load)
#define __NR_waitid 247
__SYSCALL(__NR_waitid, sys_waitid)
#define __NR_syscall_max __NR_waitid
* highest page
*/
extern unsigned long max_pfn;
+extern unsigned long saved_max_pfn;
/*
* node_bootmem_map is a map pointer - the bits represent all physical
--- /dev/null
+#include <linux/kexec.h>
+#include <linux/smp_lock.h>
+#include <linux/device.h>
+#include <linux/proc_fs.h>
+#ifdef CONFIG_CRASH_DUMP
+#include <asm/crash_dump.h>
+#endif
+
+extern unsigned long saved_max_pfn;
+extern struct memelfnote memelfnote;
+extern int notesize(struct memelfnote *);
+extern char *storenote(struct memelfnote *, char *);
+extern void elf_kcore_store_hdr(char *, int, int, struct kcore_list *);
+
+#ifdef CONFIG_CRASH_DUMP
+extern ssize_t copy_oldmem_page(unsigned long, char *, size_t, int);
+extern void __crash_machine_kexec(void);
+extern int crash_dump_on;
+static inline void crash_machine_kexec(void)
+{
+ __crash_machine_kexec();
+}
+#else
+#define crash_machine_kexec() do { } while(0)
+#endif
+
+
+#if defined(CONFIG_CRASH_DUMP) && defined(CONFIG_PROC_FS)
+extern void crash_enable_by_proc(void);
+extern void crash_create_proc_entry(void);
+#else
+#define crash_enable_by_proc() do { } while(0)
+#define crash_create_proc_entry() do { } while(0)
+#endif
#ifndef _DUMP_H
#define _DUMP_H
-#if defined(CONFIG_CRASH_DUMP) || defined (CONFIG_CRASH_DUMP_MODULE)
+#if defined(CONFIG_CRASH_DUMP)
#include <linux/list.h>
#include <linux/notifier.h>
#define kmap_atomic(page, idx) page_address(page)
#define kunmap_atomic(addr, idx) do { } while (0)
+#define kmap_atomic_pfn(pfn, idx) ((char *)page_address(pfn_to_page(pfn)))
#define kmap_atomic_to_page(ptr) virt_to_page(ptr)
#endif /* CONFIG_HIGHMEM */
--- /dev/null
+#ifndef LINUX_KEXEC_H
+#define LINUX_KEXEC_H
+
+#ifdef CONFIG_KEXEC
+#include <linux/types.h>
+#include <linux/list.h>
+#include <asm/kexec.h>
+
+/*
+ * This structure is used to hold the arguments that are used when loading
+ * kernel binaries.
+ */
+
+typedef unsigned long kimage_entry_t;
+#define IND_DESTINATION 0x1
+#define IND_INDIRECTION 0x2
+#define IND_DONE 0x4
+#define IND_SOURCE 0x8
+
+#define KEXEC_SEGMENT_MAX 8
+struct kexec_segment {
+ void *buf;
+ size_t bufsz;
+ void *mem;
+ size_t memsz;
+};
+
+struct kimage {
+ kimage_entry_t head;
+ kimage_entry_t *entry;
+ kimage_entry_t *last_entry;
+
+ unsigned long destination;
+
+ unsigned long start;
+ struct page *control_code_page;
+
+ unsigned long nr_segments;
+ struct kexec_segment segment[KEXEC_SEGMENT_MAX];
+
+ struct list_head control_pages;
+ struct list_head dest_pages;
+ struct list_head unuseable_pages;
+};
+
+
+/* kexec interface functions */
+extern void machine_kexec(struct kimage *image);
+extern int machine_kexec_prepare(struct kimage *image);
+extern void machine_kexec_cleanup(struct kimage *image);
+extern asmlinkage long sys_kexec(unsigned long entry, long nr_segments,
+ struct kexec_segment *segments);
+extern struct page *kimage_alloc_control_pages(struct kimage *image, unsigned int order);
+extern struct kimage *kexec_image;
+extern struct kimage *kexec_crash_image;
+#endif
+#endif /* LINUX_KEXEC_H */
extern void machine_halt(void);
extern void machine_power_off(void);
+extern void machine_shutdown(void);
+
#endif
#endif /* _LINUX_REBOOT_H */
obj-$(CONFIG_KALLSYMS) += kallsyms.o
obj-$(CONFIG_PM) += power/
obj-$(CONFIG_BSD_PROCESS_ACCT) += acct.o
+obj-$(CONFIG_KEXEC) += kexec.o
obj-$(CONFIG_COMPAT) += compat.o
obj-$(CONFIG_IKCONFIG) += configs.o
obj-$(CONFIG_IKCONFIG_PROC) += configs.o
obj-$(CONFIG_KPROBES) += kprobes.o
obj-$(CONFIG_SYSFS) += ksysfs.o
obj-$(CONFIG_GENERIC_HARDIRQS) += irq/
+obj-$(CONFIG_CRASH_DUMP) += crash.o
ifneq ($(CONFIG_IA64),y)
# According to Alan Modra <alan@linuxcare.com.au>, the -fno-omit-frame-pointer is
--- /dev/null
+/*
+ * kernel/crash.c - Memory preserving reboot related code.
+ *
+ * Created by: Hariprasad Nellitheertha (hari@in.ibm.com)
+ * Copyright (C) IBM Corporation, 2004. All rights reserved
+ */
+
+#include <linux/smp_lock.h>
+#include <linux/kexec.h>
+#include <linux/errno.h>
+#include <linux/proc_fs.h>
+#include <linux/bootmem.h>
+#include <linux/highmem.h>
+#include <linux/crash_dump.h>
+
+#include <asm/io.h>
+#include <asm/uaccess.h>
+
+#ifdef CONFIG_PROC_FS
+/*
+ * Enable kexec reboot upon panic; for dumping
+ */
+static ssize_t write_crash_dump_on(struct file *file, const char __user *buf,
+ size_t count, loff_t *ppos)
+{
+ if (count) {
+ if (get_user(crash_dump_on, buf))
+ return -EFAULT;
+ }
+ return count;
+}
+
+static struct file_operations proc_crash_dump_on_operations = {
+ .write = write_crash_dump_on,
+};
+
+extern struct file_operations proc_vmcore_operations;
+extern struct proc_dir_entry *proc_vmcore;
+
+void crash_enable_by_proc(void)
+{
+ struct proc_dir_entry *entry;
+
+ entry = create_proc_entry("kexec-dump", S_IWUSR, NULL);
+ if (entry)
+ entry->proc_fops = &proc_crash_dump_on_operations;
+}
+
+void crash_create_proc_entry(void)
+{
+ if (dump_enabled) {
+ proc_vmcore = create_proc_entry("vmcore", S_IRUSR, NULL);
+ if (proc_vmcore) {
+ proc_vmcore->proc_fops = &proc_vmcore_operations;
+ proc_vmcore->size =
+ (size_t)(saved_max_pfn << PAGE_SHIFT);
+ }
+ }
+}
+
+#endif /* CONFIG_PROC_FS */
+
+void __crash_machine_kexec(void)
+{
+ struct kimage *image;
+
+ if ((!crash_dump_on) || (crashed))
+ return;
+
+ image = xchg(&kexec_crash_image, 0);
+ if (image) {
+ crashed = 1;
+ printk(KERN_EMERG "kexec: opening parachute\n");
+ crash_dump_stop_cpus();
+ crash_dump_save_registers();
+
+ /* If we are here to do a crash dump, save the memory from
+ * 0-640k before we copy over the kexec kernel image. Otherwise
+ * our dump will show the wrong kernel entirely.
+ */
+ crash_relocate_mem();
+
+ machine_kexec(image);
+ } else {
+ printk(KERN_EMERG "kexec: No kernel image loaded!\n");
+ }
+}
+
+/*
+ * Copy a page from "oldmem". For this page, there is no pte mapped
+ * in the current kernel. We stitch up a pte, similar to kmap_atomic.
+ */
+ssize_t copy_oldmem_page(unsigned long pfn, char *buf,
+ size_t csize, int userbuf)
+{
+ void *page, *vaddr;
+
+ if (!csize)
+ return 0;
+
+ page = kmalloc(PAGE_SIZE, GFP_KERNEL);
+
+ vaddr = kmap_atomic_pfn(pfn, KM_PTE0);
+ copy_page(page, vaddr);
+ kunmap_atomic(vaddr, KM_PTE0);
+
+ if (userbuf) {
+ if (copy_to_user(buf, page, csize)) {
+ kfree(page);
+ return -EFAULT;
+ }
+ } else
+ memcpy(buf, page, csize);
+ kfree(page);
+
+ return 0;
+}
--- /dev/null
+/*
+ * kexec.c - kexec system call
+ * Copyright (C) 2002-2004 Eric Biederman <ebiederm@xmission.com>
+ *
+ * This source code is licensed under the GNU General Public License,
+ * Version 2. See the file COPYING for more details.
+ */
+
+#include <linux/mm.h>
+#include <linux/file.h>
+#include <linux/slab.h>
+#include <linux/fs.h>
+#include <linux/kexec.h>
+#include <linux/spinlock.h>
+#include <linux/list.h>
+#include <linux/highmem.h>
+#include <net/checksum.h>
+#include <asm/page.h>
+#include <asm/uaccess.h>
+#include <asm/io.h>
+#include <asm/system.h>
+
+/*
+ * When kexec transitions to the new kernel there is a one-to-one
+ * mapping between physical and virtual addresses. On processors
+ * where you can disable the MMU this is trivial, and easy. For
+ * others it is still a simple predictable page table to setup.
+ *
+ * In that environment kexec copies the new kernel to its final
+ * resting place. This means I can only support memory whose
+ * physical address can fit in an unsigned long. In particular
+ * addresses where (pfn << PAGE_SHIFT) > ULONG_MAX cannot be handled.
+ * If the assembly stub has more restrictive requirements
+ * KEXEC_SOURCE_MEMORY_LIMIT and KEXEC_DEST_MEMORY_LIMIT can be
+ * defined more restrictively in <asm/kexec.h>.
+ *
+ * The code for the transition from the current kernel to the
+ * the new kernel is placed in the control_code_buffer, whose size
+ * is given by KEXEC_CONTROL_CODE_SIZE. In the best case only a single
+ * page of memory is necessary, but some architectures require more.
+ * Because this memory must be identity mapped in the transition from
+ * virtual to physical addresses it must live in the range
+ * 0 - TASK_SIZE, as only the user space mappings are arbitrarily
+ * modifiable.
+ *
+ * The assembly stub in the control code buffer is passed a linked list
+ * of descriptor pages detailing the source pages of the new kernel,
+ * and the destination addresses of those source pages. As this data
+ * structure is not used in the context of the current OS, it must
+ * be self-contained.
+ *
+ * The code has been made to work with highmem pages and will use a
+ * destination page in its final resting place (if it happens
+ * to allocate it). The end product of this is that most of the
+ * physical address space, and most of RAM can be used.
+ *
+ * Future directions include:
+ * - allocating a page table with the control code buffer identity
+ * mapped, to simplify machine_kexec and make kexec_on_panic more
+ * reliable.
+ */
+
+/*
+ * KIMAGE_NO_DEST is an impossible destination address..., for
+ * allocating pages whose destination address we do not care about.
+ */
+#define KIMAGE_NO_DEST (-1UL)
+
+static int kimage_is_destination_range(
+ struct kimage *image, unsigned long start, unsigned long end);
+static struct page *kimage_alloc_page(struct kimage *image, unsigned int gfp_mask, unsigned long dest);
+
+
+static int kimage_alloc(struct kimage **rimage,
+ unsigned long nr_segments, struct kexec_segment *segments)
+{
+ int result;
+ struct kimage *image;
+ size_t segment_bytes;
+ unsigned long i;
+
+ /* Allocate a controlling structure */
+ result = -ENOMEM;
+ image = kmalloc(sizeof(*image), GFP_KERNEL);
+ if (!image) {
+ goto out;
+ }
+ memset(image, 0, sizeof(*image));
+ image->head = 0;
+ image->entry = &image->head;
+ image->last_entry = &image->head;
+
+ /* Initialize the list of control pages */
+ INIT_LIST_HEAD(&image->control_pages);
+
+ /* Initialize the list of destination pages */
+ INIT_LIST_HEAD(&image->dest_pages);
+
+ /* Initialize the list of unuseable pages */
+ INIT_LIST_HEAD(&image->unuseable_pages);
+
+ /* Read in the segments */
+ image->nr_segments = nr_segments;
+ segment_bytes = nr_segments * sizeof*segments;
+ result = copy_from_user(image->segment, segments, segment_bytes);
+ if (result)
+ goto out;
+
+ /*
+ * Verify we have good destination addresses. The caller is
+ * responsible for making certain we don't attempt to load
+ * the new image into invalid or reserved areas of RAM. This
+ * just verifies it is an address we can use.
+ */
+ result = -EADDRNOTAVAIL;
+ for (i = 0; i < nr_segments; i++) {
+ unsigned long mend;
+ mend = ((unsigned long)(image->segment[i].mem)) +
+ image->segment[i].memsz;
+ if (mend >= KEXEC_DESTINATION_MEMORY_LIMIT)
+ goto out;
+ }
+
+ /*
+ * Find a location for the control code buffer, and add it
+ * the vector of segments so that it's pages will also be
+ * counted as destination pages.
+ */
+ result = -ENOMEM;
+ image->control_code_page = kimage_alloc_control_pages(image,
+ get_order(KEXEC_CONTROL_CODE_SIZE));
+ if (!image->control_code_page) {
+ printk(KERN_ERR "Could not allocate control_code_buffer\n");
+ goto out;
+ }
+
+ result = 0;
+ out:
+ if (result == 0) {
+ *rimage = image;
+ } else {
+ kfree(image);
+ }
+ return result;
+}
+
+static int kimage_is_destination_range(
+ struct kimage *image, unsigned long start, unsigned long end)
+{
+ unsigned long i;
+
+ for (i = 0; i < image->nr_segments; i++) {
+ unsigned long mstart, mend;
+ mstart = (unsigned long)image->segment[i].mem;
+ mend = mstart + image->segment[i].memsz;
+ if ((end > mstart) && (start < mend)) {
+ return 1;
+ }
+ }
+ return 0;
+}
+
+static struct page *kimage_alloc_pages(unsigned int gfp_mask, unsigned int order)
+{
+ struct page *pages;
+ pages = alloc_pages(gfp_mask, order);
+ if (pages) {
+ unsigned int count, i;
+ pages->mapping = NULL;
+ pages->private = order;
+ count = 1 << order;
+ for(i = 0; i < count; i++) {
+ SetPageReserved(pages + i);
+ }
+ }
+ return pages;
+}
+
+static void kimage_free_pages(struct page *page)
+{
+ unsigned int order, count, i;
+ order = page->private;
+ count = 1 << order;
+ for(i = 0; i < count; i++) {
+ ClearPageReserved(page + i);
+ }
+ __free_pages(page, order);
+}
+
+static void kimage_free_page_list(struct list_head *list)
+{
+ struct list_head *pos, *next;
+ list_for_each_safe(pos, next, list) {
+ struct page *page;
+
+ page = list_entry(pos, struct page, lru);
+ list_del(&page->lru);
+
+ kimage_free_pages(page);
+ }
+}
+
+struct page *kimage_alloc_control_pages(struct kimage *image, unsigned int order)
+{
+ /* Control pages are special, they are the intermediaries
+ * that are needed while we copy the rest of the pages
+ * to their final resting place. As such they must
+ * not conflict with either the destination addresses
+ * or memory the kernel is already using.
+ *
+ * The only case where we really need more than one of
+ * these are for architectures where we cannot disable
+ * the MMU and must instead generate an identity mapped
+ * page table for all of the memory.
+ *
+ * At worst this runs in O(N) of the image size.
+ */
+ struct list_head extra_pages;
+ struct page *pages;
+ unsigned int count;
+
+ count = 1 << order;
+ INIT_LIST_HEAD(&extra_pages);
+
+ /* Loop while I can allocate a page and the page allocated
+ * is a destination page.
+ */
+ do {
+ unsigned long pfn, epfn, addr, eaddr;
+ pages = kimage_alloc_pages(GFP_KERNEL, order);
+ if (!pages)
+ break;
+ pfn = page_to_pfn(pages);
+ epfn = pfn + count;
+ addr = pfn << PAGE_SHIFT;
+ eaddr = epfn << PAGE_SHIFT;
+ if ((epfn >= (KEXEC_CONTROL_MEMORY_LIMIT >> PAGE_SHIFT)) ||
+ kimage_is_destination_range(image, addr, eaddr))
+ {
+ list_add(&pages->lru, &extra_pages);
+ pages = NULL;
+ }
+ } while(!pages);
+ if (pages) {
+ /* Remember the allocated page... */
+ list_add(&pages->lru, &image->control_pages);
+
+ /* Because the page is already in it's destination
+ * location we will never allocate another page at
+ * that address. Therefore kimage_alloc_pages
+ * will not return it (again) and we don't need
+ * to give it an entry in image->segment[].
+ */
+ }
+ /* Deal with the destination pages I have inadvertently allocated.
+ *
+ * Ideally I would convert multi-page allocations into single
+ * page allocations, and add everyting to image->dest_pages.
+ *
+ * For now it is simpler to just free the pages.
+ */
+ kimage_free_page_list(&extra_pages);
+ return pages;
+
+}
+
+static int kimage_add_entry(struct kimage *image, kimage_entry_t entry)
+{
+ if (*image->entry != 0) {
+ image->entry++;
+ }
+ if (image->entry == image->last_entry) {
+ kimage_entry_t *ind_page;
+ struct page *page;
+ page = kimage_alloc_page(image, GFP_KERNEL, KIMAGE_NO_DEST);
+ if (!page) {
+ return -ENOMEM;
+ }
+ ind_page = page_address(page);
+ *image->entry = virt_to_phys(ind_page) | IND_INDIRECTION;
+ image->entry = ind_page;
+ image->last_entry =
+ ind_page + ((PAGE_SIZE/sizeof(kimage_entry_t)) - 1);
+ }
+ *image->entry = entry;
+ image->entry++;
+ *image->entry = 0;
+ return 0;
+}
+
+static int kimage_set_destination(
+ struct kimage *image, unsigned long destination)
+{
+ int result;
+
+ destination &= PAGE_MASK;
+ result = kimage_add_entry(image, destination | IND_DESTINATION);
+ if (result == 0) {
+ image->destination = destination;
+ }
+ return result;
+}
+
+
+static int kimage_add_page(struct kimage *image, unsigned long page)
+{
+ int result;
+
+ page &= PAGE_MASK;
+ result = kimage_add_entry(image, page | IND_SOURCE);
+ if (result == 0) {
+ image->destination += PAGE_SIZE;
+ }
+ return result;
+}
+
+
+static void kimage_free_extra_pages(struct kimage *image)
+{
+ /* Walk through and free any extra destination pages I may have */
+ kimage_free_page_list(&image->dest_pages);
+
+ /* Walk through and free any unuseable pages I have cached */
+ kimage_free_page_list(&image->unuseable_pages);
+
+}
+static int kimage_terminate(struct kimage *image)
+{
+ int result;
+
+ result = kimage_add_entry(image, IND_DONE);
+ if (result == 0) {
+ /* Point at the terminating element */
+ image->entry--;
+ kimage_free_extra_pages(image);
+ }
+ return result;
+}
+
+#define for_each_kimage_entry(image, ptr, entry) \
+ for (ptr = &image->head; (entry = *ptr) && !(entry & IND_DONE); \
+ ptr = (entry & IND_INDIRECTION)? \
+ phys_to_virt((entry & PAGE_MASK)): ptr +1)
+
+static void kimage_free_entry(kimage_entry_t entry)
+{
+ struct page *page;
+
+ page = pfn_to_page(entry >> PAGE_SHIFT);
+ kimage_free_pages(page);
+}
+
+static void kimage_free(struct kimage *image)
+{
+ kimage_entry_t *ptr, entry;
+ kimage_entry_t ind = 0;
+
+ if (!image)
+ return;
+ kimage_free_extra_pages(image);
+ for_each_kimage_entry(image, ptr, entry) {
+ if (entry & IND_INDIRECTION) {
+ /* Free the previous indirection page */
+ if (ind & IND_INDIRECTION) {
+ kimage_free_entry(ind);
+ }
+ /* Save this indirection page until we are
+ * done with it.
+ */
+ ind = entry;
+ }
+ else if (entry & IND_SOURCE) {
+ kimage_free_entry(entry);
+ }
+ }
+ /* Free the final indirection page */
+ if (ind & IND_INDIRECTION) {
+ kimage_free_entry(ind);
+ }
+
+ /* Handle any machine specific cleanup */
+ machine_kexec_cleanup(image);
+
+ /* Free the kexec control pages... */
+ kimage_free_page_list(&image->control_pages);
+ kfree(image);
+}
+
+static kimage_entry_t *kimage_dst_used(struct kimage *image, unsigned long page)
+{
+ kimage_entry_t *ptr, entry;
+ unsigned long destination = 0;
+
+ for_each_kimage_entry(image, ptr, entry) {
+ if (entry & IND_DESTINATION) {
+ destination = entry & PAGE_MASK;
+ }
+ else if (entry & IND_SOURCE) {
+ if (page == destination) {
+ return ptr;
+ }
+ destination += PAGE_SIZE;
+ }
+ }
+ return 0;
+}
+
+static struct page *kimage_alloc_page(struct kimage *image, unsigned int gfp_mask, unsigned long destination)
+{
+ /*
+ * Here we implement safeguards to ensure that a source page
+ * is not copied to its destination page before the data on
+ * the destination page is no longer useful.
+ *
+ * To do this we maintain the invariant that a source page is
+ * either its own destination page, or it is not a
+ * destination page at all.
+ *
+ * That is slightly stronger than required, but the proof
+ * that no problems will not occur is trivial, and the
+ * implementation is simply to verify.
+ *
+ * When allocating all pages normally this algorithm will run
+ * in O(N) time, but in the worst case it will run in O(N^2)
+ * time. If the runtime is a problem the data structures can
+ * be fixed.
+ */
+ struct page *page;
+ unsigned long addr;
+
+ /*
+ * Walk through the list of destination pages, and see if I
+ * have a match.
+ */
+ list_for_each_entry(page, &image->dest_pages, lru) {
+ addr = page_to_pfn(page) << PAGE_SHIFT;
+ if (addr == destination) {
+ list_del(&page->lru);
+ return page;
+ }
+ }
+ page = NULL;
+ while (1) {
+ kimage_entry_t *old;
+
+ /* Allocate a page, if we run out of memory give up */
+ page = kimage_alloc_pages(gfp_mask, 0);
+ if (!page) {
+ return 0;
+ }
+ /* If the page cannot be used file it away */
+ if (page_to_pfn(page) > (KEXEC_SOURCE_MEMORY_LIMIT >> PAGE_SHIFT)) {
+ list_add(&page->lru, &image->unuseable_pages);
+ continue;
+ }
+ addr = page_to_pfn(page) << PAGE_SHIFT;
+
+ /* If it is the destination page we want use it */
+ if (addr == destination)
+ break;
+
+ /* If the page is not a destination page use it */
+ if (!kimage_is_destination_range(image, addr, addr + PAGE_SIZE))
+ break;
+
+ /*
+ * I know that the page is someones destination page.
+ * See if there is already a source page for this
+ * destination page. And if so swap the source pages.
+ */
+ old = kimage_dst_used(image, addr);
+ if (old) {
+ /* If so move it */
+ unsigned long old_addr;
+ struct page *old_page;
+
+ old_addr = *old & PAGE_MASK;
+ old_page = pfn_to_page(old_addr >> PAGE_SHIFT);
+ copy_highpage(page, old_page);
+ *old = addr | (*old & ~PAGE_MASK);
+
+ /* The old page I have found cannot be a
+ * destination page, so return it.
+ */
+ addr = old_addr;
+ page = old_page;
+ break;
+ }
+ else {
+ /* Place the page on the destination list I
+ * will use it later.
+ */
+ list_add(&page->lru, &image->dest_pages);
+ }
+ }
+ return page;
+}
+
+static int kimage_load_segment(struct kimage *image,
+ struct kexec_segment *segment)
+{
+ unsigned long mstart;
+ int result;
+ unsigned long offset;
+ unsigned long offset_end;
+ unsigned char *buf;
+
+ result = 0;
+ buf = segment->buf;
+ mstart = (unsigned long)segment->mem;
+
+ offset_end = segment->memsz;
+
+ result = kimage_set_destination(image, mstart);
+ if (result < 0) {
+ goto out;
+ }
+ for (offset = 0; offset < segment->memsz; offset += PAGE_SIZE) {
+ struct page *page;
+ char *ptr;
+ size_t size, leader;
+ page = kimage_alloc_page(image, GFP_HIGHUSER, mstart + offset);
+ if (page == 0) {
+ result = -ENOMEM;
+ goto out;
+ }
+ result = kimage_add_page(image, page_to_pfn(page) << PAGE_SHIFT);
+ if (result < 0) {
+ goto out;
+ }
+ ptr = kmap(page);
+ if (segment->bufsz < offset) {
+ /* We are past the end zero the whole page */
+ memset(ptr, 0, PAGE_SIZE);
+ kunmap(page);
+ continue;
+ }
+ size = PAGE_SIZE;
+ leader = 0;
+ if ((offset == 0)) {
+ leader = mstart & ~PAGE_MASK;
+ }
+ if (leader) {
+ /* We are on the first page zero the unused portion */
+ memset(ptr, 0, leader);
+ size -= leader;
+ ptr += leader;
+ }
+ if (size > (segment->bufsz - offset)) {
+ size = segment->bufsz - offset;
+ }
+ if (size < (PAGE_SIZE - leader)) {
+ /* zero the trailing part of the page */
+ memset(ptr + size, 0, (PAGE_SIZE - leader) - size);
+ }
+ result = copy_from_user(ptr, buf + offset, size);
+ kunmap(page);
+ if (result) {
+ result = (result < 0) ? result : -EIO;
+ goto out;
+ }
+ }
+ out:
+ return result;
+}
+
+/*
+ * Exec Kernel system call: for obvious reasons only root may call it.
+ *
+ * This call breaks up into three pieces.
+ * - A generic part which loads the new kernel from the current
+ * address space, and very carefully places the data in the
+ * allocated pages.
+ *
+ * - A generic part that interacts with the kernel and tells all of
+ * the devices to shut down. Preventing on-going dmas, and placing
+ * the devices in a consistent state so a later kernel can
+ * reinitialize them.
+ *
+ * - A machine specific part that includes the syscall number
+ * and the copies the image to it's final destination. And
+ * jumps into the image at entry.
+ *
+ * kexec does not sync, or unmount filesystems so if you need
+ * that to happen you need to do that yourself.
+ */
+struct kimage *kexec_image = NULL;
+struct kimage *kexec_crash_image = NULL;
+
+asmlinkage long sys_kexec_load(unsigned long entry, unsigned long nr_segments,
+ struct kexec_segment *segments, unsigned long flags)
+{
+ struct kimage *image;
+ int result;
+
+ /* We only trust the superuser with rebooting the system. */
+ if (!capable(CAP_SYS_BOOT))
+ return -EPERM;
+
+ if (nr_segments > KEXEC_SEGMENT_MAX)
+ return -EINVAL;
+
+ image = NULL;
+ result = 0;
+
+ if (nr_segments > 0) {
+ unsigned long i;
+ result = kimage_alloc(&image, nr_segments, segments);
+ if (result) {
+ goto out;
+ }
+ result = machine_kexec_prepare(image);
+ if (result) {
+ goto out;
+ }
+ image->start = entry;
+ for (i = 0; i < nr_segments; i++) {
+ result = kimage_load_segment(image, &image->segment[i]);
+ if (result) {
+ goto out;
+ }
+ }
+ result = kimage_terminate(image);
+ if (result) {
+ goto out;
+ }
+ }
+
+ if (!flags)
+ image = xchg(&kexec_image, image);
+ else
+ image = xchg(&kexec_crash_image, image);
+
+ out:
+ kimage_free(image);
+ return result;
+}
#include <linux/sysrq.h>
#include <linux/interrupt.h>
#include <linux/nmi.h>
-#ifdef CONFIG_KEXEC
#include <linux/kexec.h>
-#endif
+#include <linux/crash_dump.h>
int panic_timeout = 900;
int panic_on_oops = 1;
int tainted;
+unsigned int crashed;
+int crash_dump_on;
void (*dump_function_ptr)(const char *, const struct pt_regs *) = 0;
EXPORT_SYMBOL(panic_timeout);
BUG();
bust_spinlocks(0);
+ /* If we have crashed, perform a kexec reboot, for dump write-out */
+ crash_machine_kexec();
+
notifier_call_chain(&panic_notifier_list, 0, buf);
#ifdef CONFIG_SMP
#include <linux/init.h>
#include <linux/highuid.h>
#include <linux/fs.h>
+#include <linux/kernel.h>
+#include <linux/kexec.h>
#include <linux/workqueue.h>
#include <linux/device.h>
#include <linux/key.h>
cond_syscall(sys_lookup_dcookie)
cond_syscall(sys_swapon)
cond_syscall(sys_swapoff)
+cond_syscall(sys_kexec_load)
cond_syscall(sys_init_module)
cond_syscall(sys_delete_module)
cond_syscall(sys_socketpair)
unsigned long min_low_pfn;
EXPORT_SYMBOL(min_low_pfn);
unsigned long max_pfn;
+/*
+ * If we have booted due to a crash, max_pfn will be a very low value. We need
+ * to know the amount of memory that the previous kernel used.
+ */
+unsigned long saved_max_pfn;
EXPORT_SYMBOL(max_pfn); /* This is exported so
* dma_get_required_mask(), which uses
EXPORT_SYMBOL(totalram_pages);
EXPORT_SYMBOL(nr_swap_pages);
-#ifdef CONFIG_CRASH_DUMP_MODULE
+#ifdef CONFIG_CRASH_DUMP
/* This symbol has to be exported to use 'for_each_pgdat' macro by modules. */
EXPORT_SYMBOL(pgdat_list);
#endif
tainted |= TAINT_BAD_PAGE;
}
-#if !defined(CONFIG_HUGETLB_PAGE) && !defined(CONFIG_CRASH_DUMP) \
- && !defined(CONFIG_CRASH_DUMP_MODULE)
+#if !defined(CONFIG_HUGETLB_PAGE) && !defined(CONFIG_CRASH_DUMP)
#define prep_compound_page(page, order) do { } while (0)
#define destroy_compound_page(page, order) do { } while (0)
#else