2 * kexec.c - kexec system call
3 * Copyright (C) 2002-2004 Eric Biederman <ebiederm@xmission.com>
5 * This source code is licensed under the GNU General Public License,
6 * Version 2. See the file COPYING for more details.
10 #include <linux/file.h>
11 #include <linux/slab.h>
13 #include <linux/kexec.h>
14 #include <linux/spinlock.h>
15 #include <linux/list.h>
16 #include <linux/highmem.h>
17 #include <net/checksum.h>
19 #include <asm/uaccess.h>
21 #include <asm/system.h>
24 * When kexec transitions to the new kernel there is a one-to-one
25 * mapping between physical and virtual addresses. On processors
26 * where you can disable the MMU this is trivial, and easy. For
27 * others it is still a simple predictable page table to setup.
29 * In that environment kexec copies the new kernel to its final
30 * resting place. This means I can only support memory whose
31 * physical address can fit in an unsigned long. In particular
32 * addresses where (pfn << PAGE_SHIFT) > ULONG_MAX cannot be handled.
33 * If the assembly stub has more restrictive requirements
34 * KEXEC_SOURCE_MEMORY_LIMIT and KEXEC_DEST_MEMORY_LIMIT can be
35 * defined more restrictively in <asm/kexec.h>.
37 * The code for the transition from the current kernel to the
38 * new kernel is placed in the control_code_buffer, whose size
39 * is given by KEXEC_CONTROL_CODE_SIZE. In the best case only a single
40 * page of memory is necessary, but some architectures require more.
41 * Because this memory must be identity mapped in the transition from
42 * virtual to physical addresses it must live in the range
43 * 0 - TASK_SIZE, as only the user space mappings are arbitrarily
46 * The assembly stub in the control code buffer is passed a linked list
47 * of descriptor pages detailing the source pages of the new kernel,
48 * and the destination addresses of those source pages. As this data
49 * structure is not used in the context of the current OS, it must
52 * The code has been made to work with highmem pages and will use a
53 * destination page in its final resting place (if it happens
54 * to allocate it). The end product of this is that most of the
55 * physical address space, and most of RAM can be used.
57 * Future directions include:
58 * - allocating a page table with the control code buffer identity
59 * mapped, to simplify machine_kexec and make kexec_on_panic more
64 * KIMAGE_NO_DEST is an impossible destination address..., for
65 * allocating pages whose destination address we do not care about.
67 #define KIMAGE_NO_DEST (-1UL)
69 static int kimage_is_destination_range(
70 struct kimage *image, unsigned long start, unsigned long end);
71 static struct page *kimage_alloc_page(struct kimage *image, unsigned int gfp_mask, unsigned long dest);
74 static int kimage_alloc(struct kimage **rimage,
75 unsigned long nr_segments, struct kexec_segment *segments)
82 /* Allocate a controlling structure */
84 image = kmalloc(sizeof(*image), GFP_KERNEL);
88 memset(image, 0, sizeof(*image));
90 image->entry = &image->head;
91 image->last_entry = &image->head;
93 /* Initialize the list of control pages */
94 INIT_LIST_HEAD(&image->control_pages);
96 /* Initialize the list of destination pages */
97 INIT_LIST_HEAD(&image->dest_pages);
99 /* Initialize the list of unuseable pages */
100 INIT_LIST_HEAD(&image->unuseable_pages);
102 /* Read in the segments */
103 image->nr_segments = nr_segments;
104 segment_bytes = nr_segments * sizeof*segments;
105 result = copy_from_user(image->segment, segments, segment_bytes);
110 * Verify we have good destination addresses. The caller is
111 * responsible for making certain we don't attempt to load
112 * the new image into invalid or reserved areas of RAM. This
113 * just verifies it is an address we can use.
115 result = -EADDRNOTAVAIL;
116 for (i = 0; i < nr_segments; i++) {
118 mend = ((unsigned long)(image->segment[i].mem)) +
119 image->segment[i].memsz;
120 if (mend >= KEXEC_DESTINATION_MEMORY_LIMIT)
125 * Find a location for the control code buffer, and add it
126 * the vector of segments so that it's pages will also be
127 * counted as destination pages.
130 image->control_code_page = kimage_alloc_control_pages(image,
131 get_order(KEXEC_CONTROL_CODE_SIZE));
132 if (!image->control_code_page) {
133 printk(KERN_ERR "Could not allocate control_code_buffer\n");
147 static int kimage_is_destination_range(
148 struct kimage *image, unsigned long start, unsigned long end)
152 for (i = 0; i < image->nr_segments; i++) {
153 unsigned long mstart, mend;
154 mstart = (unsigned long)image->segment[i].mem;
155 mend = mstart + image->segment[i].memsz;
156 if ((end > mstart) && (start < mend)) {
163 static struct page *kimage_alloc_pages(unsigned int gfp_mask, unsigned int order)
166 pages = alloc_pages(gfp_mask, order);
168 unsigned int count, i;
169 pages->mapping = NULL;
170 pages->private = order;
172 for(i = 0; i < count; i++) {
173 SetPageReserved(pages + i);
179 static void kimage_free_pages(struct page *page)
181 unsigned int order, count, i;
182 order = page->private;
184 for(i = 0; i < count; i++) {
185 ClearPageReserved(page + i);
187 __free_pages(page, order);
190 static void kimage_free_page_list(struct list_head *list)
192 struct list_head *pos, *next;
193 list_for_each_safe(pos, next, list) {
196 page = list_entry(pos, struct page, lru);
197 list_del(&page->lru);
199 kimage_free_pages(page);
203 struct page *kimage_alloc_control_pages(struct kimage *image, unsigned int order)
205 /* Control pages are special, they are the intermediaries
206 * that are needed while we copy the rest of the pages
207 * to their final resting place. As such they must
208 * not conflict with either the destination addresses
209 * or memory the kernel is already using.
211 * The only case where we really need more than one of
212 * these are for architectures where we cannot disable
213 * the MMU and must instead generate an identity mapped
214 * page table for all of the memory.
216 * At worst this runs in O(N) of the image size.
218 struct list_head extra_pages;
223 INIT_LIST_HEAD(&extra_pages);
225 /* Loop while I can allocate a page and the page allocated
226 * is a destination page.
229 unsigned long pfn, epfn, addr, eaddr;
230 pages = kimage_alloc_pages(GFP_KERNEL, order);
233 pfn = page_to_pfn(pages);
235 addr = pfn << PAGE_SHIFT;
236 eaddr = epfn << PAGE_SHIFT;
237 if ((epfn >= (KEXEC_CONTROL_MEMORY_LIMIT >> PAGE_SHIFT)) ||
238 kimage_is_destination_range(image, addr, eaddr))
240 list_add(&pages->lru, &extra_pages);
245 /* Remember the allocated page... */
246 list_add(&pages->lru, &image->control_pages);
248 /* Because the page is already in it's destination
249 * location we will never allocate another page at
250 * that address. Therefore kimage_alloc_pages
251 * will not return it (again) and we don't need
252 * to give it an entry in image->segment[].
255 /* Deal with the destination pages I have inadvertently allocated.
257 * Ideally I would convert multi-page allocations into single
258 * page allocations, and add everyting to image->dest_pages.
260 * For now it is simpler to just free the pages.
262 kimage_free_page_list(&extra_pages);
267 static int kimage_add_entry(struct kimage *image, kimage_entry_t entry)
269 if (*image->entry != 0) {
272 if (image->entry == image->last_entry) {
273 kimage_entry_t *ind_page;
275 page = kimage_alloc_page(image, GFP_KERNEL, KIMAGE_NO_DEST);
279 ind_page = page_address(page);
280 *image->entry = virt_to_phys(ind_page) | IND_INDIRECTION;
281 image->entry = ind_page;
283 ind_page + ((PAGE_SIZE/sizeof(kimage_entry_t)) - 1);
285 *image->entry = entry;
291 static int kimage_set_destination(
292 struct kimage *image, unsigned long destination)
296 destination &= PAGE_MASK;
297 result = kimage_add_entry(image, destination | IND_DESTINATION);
299 image->destination = destination;
305 static int kimage_add_page(struct kimage *image, unsigned long page)
310 result = kimage_add_entry(image, page | IND_SOURCE);
312 image->destination += PAGE_SIZE;
318 static void kimage_free_extra_pages(struct kimage *image)
320 /* Walk through and free any extra destination pages I may have */
321 kimage_free_page_list(&image->dest_pages);
323 /* Walk through and free any unuseable pages I have cached */
324 kimage_free_page_list(&image->unuseable_pages);
327 static int kimage_terminate(struct kimage *image)
331 result = kimage_add_entry(image, IND_DONE);
333 /* Point at the terminating element */
335 kimage_free_extra_pages(image);
340 #define for_each_kimage_entry(image, ptr, entry) \
341 for (ptr = &image->head; (entry = *ptr) && !(entry & IND_DONE); \
342 ptr = (entry & IND_INDIRECTION)? \
343 phys_to_virt((entry & PAGE_MASK)): ptr +1)
345 static void kimage_free_entry(kimage_entry_t entry)
349 page = pfn_to_page(entry >> PAGE_SHIFT);
350 kimage_free_pages(page);
353 static void kimage_free(struct kimage *image)
355 kimage_entry_t *ptr, entry;
356 kimage_entry_t ind = 0;
360 kimage_free_extra_pages(image);
361 for_each_kimage_entry(image, ptr, entry) {
362 if (entry & IND_INDIRECTION) {
363 /* Free the previous indirection page */
364 if (ind & IND_INDIRECTION) {
365 kimage_free_entry(ind);
367 /* Save this indirection page until we are
372 else if (entry & IND_SOURCE) {
373 kimage_free_entry(entry);
376 /* Free the final indirection page */
377 if (ind & IND_INDIRECTION) {
378 kimage_free_entry(ind);
381 /* Handle any machine specific cleanup */
382 machine_kexec_cleanup(image);
384 /* Free the kexec control pages... */
385 kimage_free_page_list(&image->control_pages);
389 static kimage_entry_t *kimage_dst_used(struct kimage *image, unsigned long page)
391 kimage_entry_t *ptr, entry;
392 unsigned long destination = 0;
394 for_each_kimage_entry(image, ptr, entry) {
395 if (entry & IND_DESTINATION) {
396 destination = entry & PAGE_MASK;
398 else if (entry & IND_SOURCE) {
399 if (page == destination) {
402 destination += PAGE_SIZE;
408 static struct page *kimage_alloc_page(struct kimage *image, unsigned int gfp_mask, unsigned long destination)
411 * Here we implement safeguards to ensure that a source page
412 * is not copied to its destination page before the data on
413 * the destination page is no longer useful.
415 * To do this we maintain the invariant that a source page is
416 * either its own destination page, or it is not a
417 * destination page at all.
419 * That is slightly stronger than required, but the proof
420 * that no problems will not occur is trivial, and the
421 * implementation is simply to verify.
423 * When allocating all pages normally this algorithm will run
424 * in O(N) time, but in the worst case it will run in O(N^2)
425 * time. If the runtime is a problem the data structures can
432 * Walk through the list of destination pages, and see if I
435 list_for_each_entry(page, &image->dest_pages, lru) {
436 addr = page_to_pfn(page) << PAGE_SHIFT;
437 if (addr == destination) {
438 list_del(&page->lru);
446 /* Allocate a page, if we run out of memory give up */
447 page = kimage_alloc_pages(gfp_mask, 0);
451 /* If the page cannot be used file it away */
452 if (page_to_pfn(page) > (KEXEC_SOURCE_MEMORY_LIMIT >> PAGE_SHIFT)) {
453 list_add(&page->lru, &image->unuseable_pages);
456 addr = page_to_pfn(page) << PAGE_SHIFT;
458 /* If it is the destination page we want use it */
459 if (addr == destination)
462 /* If the page is not a destination page use it */
463 if (!kimage_is_destination_range(image, addr, addr + PAGE_SIZE))
467 * I know that the page is someones destination page.
468 * See if there is already a source page for this
469 * destination page. And if so swap the source pages.
471 old = kimage_dst_used(image, addr);
474 unsigned long old_addr;
475 struct page *old_page;
477 old_addr = *old & PAGE_MASK;
478 old_page = pfn_to_page(old_addr >> PAGE_SHIFT);
479 copy_highpage(page, old_page);
480 *old = addr | (*old & ~PAGE_MASK);
482 /* The old page I have found cannot be a
483 * destination page, so return it.
490 /* Place the page on the destination list I
493 list_add(&page->lru, &image->dest_pages);
499 static int kimage_load_segment(struct kimage *image,
500 struct kexec_segment *segment)
502 unsigned long mstart;
504 unsigned long offset;
505 unsigned long offset_end;
510 mstart = (unsigned long)segment->mem;
512 offset_end = segment->memsz;
514 result = kimage_set_destination(image, mstart);
518 for (offset = 0; offset < segment->memsz; offset += PAGE_SIZE) {
522 page = kimage_alloc_page(image, GFP_HIGHUSER, mstart + offset);
527 result = kimage_add_page(image, page_to_pfn(page) << PAGE_SHIFT);
532 if (segment->bufsz < offset) {
533 /* We are past the end zero the whole page */
534 memset(ptr, 0, PAGE_SIZE);
541 leader = mstart & ~PAGE_MASK;
544 /* We are on the first page zero the unused portion */
545 memset(ptr, 0, leader);
549 if (size > (segment->bufsz - offset)) {
550 size = segment->bufsz - offset;
552 if (size < (PAGE_SIZE - leader)) {
553 /* zero the trailing part of the page */
554 memset(ptr + size, 0, (PAGE_SIZE - leader) - size);
556 result = copy_from_user(ptr, buf + offset, size);
559 result = (result < 0) ? result : -EIO;
568 * Exec Kernel system call: for obvious reasons only root may call it.
570 * This call breaks up into three pieces.
571 * - A generic part which loads the new kernel from the current
572 * address space, and very carefully places the data in the
575 * - A generic part that interacts with the kernel and tells all of
576 * the devices to shut down. Preventing on-going dmas, and placing
577 * the devices in a consistent state so a later kernel can
580 * - A machine specific part that includes the syscall number
581 * and the copies the image to it's final destination. And
582 * jumps into the image at entry.
584 * kexec does not sync, or unmount filesystems so if you need
585 * that to happen you need to do that yourself.
587 struct kimage *kexec_image = NULL;
588 struct kimage *kexec_crash_image = NULL;
590 asmlinkage long sys_kexec_load(unsigned long entry, unsigned long nr_segments,
591 struct kexec_segment *segments, unsigned long flags)
593 struct kimage *image;
596 /* We only trust the superuser with rebooting the system. */
597 if (!capable(CAP_SYS_BOOT))
600 if (nr_segments > KEXEC_SEGMENT_MAX)
606 if (nr_segments > 0) {
608 result = kimage_alloc(&image, nr_segments, segments);
612 result = machine_kexec_prepare(image);
616 image->start = entry;
617 for (i = 0; i < nr_segments; i++) {
618 result = kimage_load_segment(image, &image->segment[i]);
623 result = kimage_terminate(image);
630 image = xchg(&kexec_image, image);
632 image = xchg(&kexec_crash_image, image);