kernel/kexec.c

   1 /*
   2  * kexec.c - kexec system call
   3  * Copyright (C) 2002-2004 Eric Biederman  <ebiederm@xmission.com>
   4  *
   5  * This source code is licensed under the GNU General Public License,
   6  * Version 2.  See the file COPYING for more details.
   7  */
   8
   9 #include <linux/mm.h>
  10 #include <linux/file.h>
  11 #include <linux/slab.h>
  12 #include <linux/fs.h>
  13 #include <linux/kexec.h>
  14 #include <linux/spinlock.h>
  15 #include <linux/list.h>
  16 #include <linux/highmem.h>
  17 #include <net/checksum.h>
  18 #include <asm/page.h>
  19 #include <asm/uaccess.h>
  20 #include <asm/io.h>
  21 #include <asm/system.h>
  22
  23 /*
  24  * When kexec transitions to the new kernel there is a one-to-one
  25  * mapping between physical and virtual addresses.  On processors
  26  * where you can disable the MMU this is trivial, and easy.  For
  27  * others it is still a simple predictable page table to setup.
  28  *
  29  * In that environment kexec copies the new kernel to its final
  30  * resting place.  This means I can only support memory whose
  31  * physical address can fit in an unsigned long.  In particular
  32  * addresses where (pfn << PAGE_SHIFT) > ULONG_MAX cannot be handled.
  33  * If the assembly stub has more restrictive requirements
   34  * KEXEC_SOURCE_MEMORY_LIMIT and KEXEC_DESTINATION_MEMORY_LIMIT can be
  35  * defined more restrictively in <asm/kexec.h>.
  36  *
  37  * The code for the transition from the current kernel to the
   38  * new kernel is placed in the control_code_buffer, whose size
  39  * is given by KEXEC_CONTROL_CODE_SIZE.  In the best case only a single
  40  * page of memory is necessary, but some architectures require more.
  41  * Because this memory must be identity mapped in the transition from
  42  * virtual to physical addresses it must live in the range
  43  * 0 - TASK_SIZE, as only the user space mappings are arbitrarily
  44  * modifiable.
  45  *
  46  * The assembly stub in the control code buffer is passed a linked list
  47  * of descriptor pages detailing the source pages of the new kernel,
  48  * and the destination addresses of those source pages.  As this data
  49  * structure is not used in the context of the current OS, it must
  50  * be self-contained.
  51  *
  52  * The code has been made to work with highmem pages and will use a
  53  * destination page in its final resting place (if it happens
  54  * to allocate it).  The end product of this is that most of the
  55  * physical address space, and most of RAM can be used.
  56  *
  57  * Future directions include:
  58  *  - allocating a page table with the control code buffer identity
  59  *    mapped, to simplify machine_kexec and make kexec_on_panic more
  60  *    reliable.
  61  */
  62
  63 /*
  64  * KIMAGE_NO_DEST is an impossible destination address..., for
  65  * allocating pages whose destination address we do not care about.
  66  */
  67 #define KIMAGE_NO_DEST (-1UL)
  68
  69 static int kimage_is_destination_range(
  70         struct kimage *image, unsigned long start, unsigned long end);
  71 static struct page *kimage_alloc_page(struct kimage *image, unsigned int gfp_mask, unsigned long dest);
  72
  73
  74 static int kimage_alloc(struct kimage **rimage,
  75         unsigned long nr_segments, struct kexec_segment *segments)
  76 {
  77         int result;
  78         struct kimage *image;
  79         size_t segment_bytes;
  80         unsigned long i;
  81
  82         /* Allocate a controlling structure */
  83         result = -ENOMEM;
  84         image = kmalloc(sizeof(*image), GFP_KERNEL);
  85         if (!image) {
  86                 goto out;
  87         }
  88         memset(image, 0, sizeof(*image));
  89         image->head = 0;
  90         image->entry = &image->head;
  91         image->last_entry = &image->head;
  92
  93         /* Initialize the list of control pages */
  94         INIT_LIST_HEAD(&image->control_pages);
  95
  96         /* Initialize the list of destination pages */
  97         INIT_LIST_HEAD(&image->dest_pages);
  98
  99         /* Initialize the list of unuseable pages */
 100         INIT_LIST_HEAD(&image->unuseable_pages);
 101
 102         /* Read in the segments */
 103         image->nr_segments = nr_segments;
 104         segment_bytes = nr_segments * sizeof*segments;
 105         result = copy_from_user(image->segment, segments, segment_bytes);
 106         if (result)
 107                 goto out;
 108
 109         /*
 110          * Verify we have good destination addresses.  The caller is
 111          * responsible for making certain we don't attempt to load
 112          * the new image into invalid or reserved areas of RAM.  This
 113          * just verifies it is an address we can use.
 114          */
 115         result = -EADDRNOTAVAIL;
 116         for (i = 0; i < nr_segments; i++) {
 117                 unsigned long mend;
 118                 mend = ((unsigned long)(image->segment[i].mem)) +
 119                         image->segment[i].memsz;
 120                 if (mend >= KEXEC_DESTINATION_MEMORY_LIMIT)
 121                         goto out;
 122         }
 123
 124         /*
 125          * Find a location for the control code buffer, and add it
 126          * the vector of segments so that it's pages will also be
 127          * counted as destination pages.
 128          */
 129         result = -ENOMEM;
 130         image->control_code_page = kimage_alloc_control_pages(image,
 131                 get_order(KEXEC_CONTROL_CODE_SIZE));
 132         if (!image->control_code_page) {
 133                 printk(KERN_ERR "Could not allocate control_code_buffer\n");
 134                 goto out;
 135         }
 136
 137         result = 0;
 138  out:
 139         if (result == 0) {
 140                 *rimage = image;
 141         } else {
 142                 kfree(image);
 143         }
 144         return result;
 145 }
 146
 147 static int kimage_is_destination_range(
 148         struct kimage *image, unsigned long start, unsigned long end)
 149 {
 150         unsigned long i;
 151
 152         for (i = 0; i < image->nr_segments; i++) {
 153                 unsigned long mstart, mend;
 154                 mstart = (unsigned long)image->segment[i].mem;
 155                 mend   = mstart + image->segment[i].memsz;
 156                 if ((end > mstart) && (start < mend)) {
 157                         return 1;
 158                 }
 159         }
 160         return 0;
 161 }
 162
 163 static struct page *kimage_alloc_pages(unsigned int gfp_mask, unsigned int order)
 164 {
 165         struct page *pages;
 166         pages = alloc_pages(gfp_mask, order);
 167         if (pages) {
 168                 unsigned int count, i;
 169                 pages->mapping = NULL;
 170                 pages->private = order;
 171                 count = 1 << order;
 172                 for(i = 0; i < count; i++) {
 173                         SetPageReserved(pages + i);
 174                 }
 175         }
 176         return pages;
 177 }
 178
 179 static void kimage_free_pages(struct page *page)
 180 {
 181         unsigned int order, count, i;
 182         order = page->private;
 183         count = 1 << order;
 184         for(i = 0; i < count; i++) {
 185                 ClearPageReserved(page + i);
 186         }
 187         __free_pages(page, order);
 188 }
 189
 190 static void kimage_free_page_list(struct list_head *list)
 191 {
 192         struct list_head *pos, *next;
 193         list_for_each_safe(pos, next, list) {
 194                 struct page *page;
 195
 196                 page = list_entry(pos, struct page, lru);
 197                 list_del(&page->lru);
 198
 199                 kimage_free_pages(page);
 200         }
 201 }
 202
 203 struct page *kimage_alloc_control_pages(struct kimage *image, unsigned int order)
 204 {
 205         /* Control pages are special, they are the intermediaries
 206          * that are needed while we copy the rest of the pages
 207          * to their final resting place.  As such they must
 208          * not conflict with either the destination addresses
 209          * or memory the kernel is already using.
 210          *
 211          * The only case where we really need more than one of
 212          * these are for architectures where we cannot disable
 213          * the MMU and must instead generate an identity mapped
 214          * page table for all of the memory.
 215          *
 216          * At worst this runs in O(N) of the image size.
 217          */
 218         struct list_head extra_pages;
 219         struct page *pages;
 220         unsigned int count;
 221
 222         count = 1 << order;
 223         INIT_LIST_HEAD(&extra_pages);
 224
 225         /* Loop while I can allocate a page and the page allocated
 226          * is a destination page.
 227          */
 228         do {
 229                 unsigned long pfn, epfn, addr, eaddr;
 230                 pages = kimage_alloc_pages(GFP_KERNEL, order);
 231                 if (!pages)
 232                         break;
 233                 pfn   = page_to_pfn(pages);
 234                 epfn  = pfn + count;
 235                 addr  = pfn << PAGE_SHIFT;
 236                 eaddr = epfn << PAGE_SHIFT;
 237                 if ((epfn >= (KEXEC_CONTROL_MEMORY_LIMIT >> PAGE_SHIFT)) ||
 238                         kimage_is_destination_range(image, addr, eaddr))
 239                 {
 240                         list_add(&pages->lru, &extra_pages);
 241                         pages = NULL;
 242                 }
 243         } while(!pages);
 244         if (pages) {
 245                 /* Remember the allocated page... */
 246                 list_add(&pages->lru, &image->control_pages);
 247
 248                 /* Because the page is already in it's destination
 249                  * location we will never allocate another page at
 250                  * that address.  Therefore kimage_alloc_pages
 251                  * will not return it (again) and we don't need
 252                  * to give it an entry in image->segment[].
 253                  */
 254         }
 255         /* Deal with the destination pages I have inadvertently allocated.
 256          *
 257          * Ideally I would convert multi-page allocations into single
 258          * page allocations, and add everyting to image->dest_pages.
 259          *
 260          * For now it is simpler to just free the pages.
 261          */
 262         kimage_free_page_list(&extra_pages);
 263         return pages;
 264
 265 }
 266
 267 static int kimage_add_entry(struct kimage *image, kimage_entry_t entry)
 268 {
 269         if (*image->entry != 0) {
 270                 image->entry++;
 271         }
 272         if (image->entry == image->last_entry) {
 273                 kimage_entry_t *ind_page;
 274                 struct page *page;
 275                 page = kimage_alloc_page(image, GFP_KERNEL, KIMAGE_NO_DEST);
 276                 if (!page) {
 277                         return -ENOMEM;
 278                 }
 279                 ind_page = page_address(page);
 280                 *image->entry = virt_to_phys(ind_page) | IND_INDIRECTION;
 281                 image->entry = ind_page;
 282                 image->last_entry =
 283                         ind_page + ((PAGE_SIZE/sizeof(kimage_entry_t)) - 1);
 284         }
 285         *image->entry = entry;
 286         image->entry++;
 287         *image->entry = 0;
 288         return 0;
 289 }
 290
 291 static int kimage_set_destination(
 292         struct kimage *image, unsigned long destination)
 293 {
 294         int result;
 295
 296         destination &= PAGE_MASK;
 297         result = kimage_add_entry(image, destination | IND_DESTINATION);
 298         if (result == 0) {
 299                 image->destination = destination;
 300         }
 301         return result;
 302 }
 303
 304
 305 static int kimage_add_page(struct kimage *image, unsigned long page)
 306 {
 307         int result;
 308
 309         page &= PAGE_MASK;
 310         result = kimage_add_entry(image, page | IND_SOURCE);
 311         if (result == 0) {
 312                 image->destination += PAGE_SIZE;
 313         }
 314         return result;
 315 }
 316
 317
 318 static void kimage_free_extra_pages(struct kimage *image)
 319 {
 320         /* Walk through and free any extra destination pages I may have */
 321         kimage_free_page_list(&image->dest_pages);
 322
 323         /* Walk through and free any unuseable pages I have cached */
 324         kimage_free_page_list(&image->unuseable_pages);
 325
 326 }
 327 static int kimage_terminate(struct kimage *image)
 328 {
 329         int result;
 330
 331         result = kimage_add_entry(image, IND_DONE);
 332         if (result == 0) {
 333                 /* Point at the terminating element */
 334                 image->entry--;
 335                 kimage_free_extra_pages(image);
 336         }
 337         return result;
 338 }
 339
 340 #define for_each_kimage_entry(image, ptr, entry) \
 341         for (ptr = &image->head; (entry = *ptr) && !(entry & IND_DONE); \
 342                 ptr = (entry & IND_INDIRECTION)? \
 343                         phys_to_virt((entry & PAGE_MASK)): ptr +1)
 344
 345 static void kimage_free_entry(kimage_entry_t entry)
 346 {
 347         struct page *page;
 348
 349         page = pfn_to_page(entry >> PAGE_SHIFT);
 350         kimage_free_pages(page);
 351 }
 352
 353 static void kimage_free(struct kimage *image)
 354 {
 355         kimage_entry_t *ptr, entry;
 356         kimage_entry_t ind = 0;
 357
 358         if (!image)
 359                 return;
 360         kimage_free_extra_pages(image);
 361         for_each_kimage_entry(image, ptr, entry) {
 362                 if (entry & IND_INDIRECTION) {
 363                         /* Free the previous indirection page */
 364                         if (ind & IND_INDIRECTION) {
 365                                 kimage_free_entry(ind);
 366                         }
 367                         /* Save this indirection page until we are
 368                          * done with it.
 369                          */
 370                         ind = entry;
 371                 }
 372                 else if (entry & IND_SOURCE) {
 373                         kimage_free_entry(entry);
 374                 }
 375         }
 376         /* Free the final indirection page */
 377         if (ind & IND_INDIRECTION) {
 378                 kimage_free_entry(ind);
 379         }
 380
 381         /* Handle any machine specific cleanup */
 382         machine_kexec_cleanup(image);
 383
 384         /* Free the kexec control pages... */
 385         kimage_free_page_list(&image->control_pages);
 386         kfree(image);
 387 }
 388
 389 static kimage_entry_t *kimage_dst_used(struct kimage *image, unsigned long page)
 390 {
 391         kimage_entry_t *ptr, entry;
 392         unsigned long destination = 0;
 393
 394         for_each_kimage_entry(image, ptr, entry) {
 395                 if (entry & IND_DESTINATION) {
 396                         destination = entry & PAGE_MASK;
 397                 }
 398                 else if (entry & IND_SOURCE) {
 399                         if (page == destination) {
 400                                 return ptr;
 401                         }
 402                         destination += PAGE_SIZE;
 403                 }
 404         }
 405         return 0;
 406 }
 407
 408 static struct page *kimage_alloc_page(struct kimage *image, unsigned int gfp_mask, unsigned long destination)
 409 {
 410         /*
 411          * Here we implement safeguards to ensure that a source page
 412          * is not copied to its destination page before the data on
 413          * the destination page is no longer useful.
 414          *
 415          * To do this we maintain the invariant that a source page is
 416          * either its own destination page, or it is not a
 417          * destination page at all.
 418          *
 419          * That is slightly stronger than required, but the proof
 420          * that no problems will not occur is trivial, and the
 421          * implementation is simply to verify.
 422          *
 423          * When allocating all pages normally this algorithm will run
 424          * in O(N) time, but in the worst case it will run in O(N^2)
 425          * time.   If the runtime is a problem the data structures can
 426          * be fixed.
 427          */
 428         struct page *page;
 429         unsigned long addr;
 430
 431         /*
 432          * Walk through the list of destination pages, and see if I
 433          * have a match.
 434          */
 435         list_for_each_entry(page, &image->dest_pages, lru) {
 436                 addr = page_to_pfn(page) << PAGE_SHIFT;
 437                 if (addr == destination) {
 438                         list_del(&page->lru);
 439                         return page;
 440                 }
 441         }
 442         page = NULL;
 443         while (1) {
 444                 kimage_entry_t *old;
 445
 446                 /* Allocate a page, if we run out of memory give up */
 447                 page = kimage_alloc_pages(gfp_mask, 0);
 448                 if (!page) {
 449                         return 0;
 450                 }
 451                 /* If the page cannot be used file it away */
 452                 if (page_to_pfn(page) > (KEXEC_SOURCE_MEMORY_LIMIT >> PAGE_SHIFT)) {
 453                         list_add(&page->lru, &image->unuseable_pages);
 454                         continue;
 455                 }
 456                 addr = page_to_pfn(page) << PAGE_SHIFT;
 457
 458                 /* If it is the destination page we want use it */
 459                 if (addr == destination)
 460                         break;
 461
 462                 /* If the page is not a destination page use it */
 463                 if (!kimage_is_destination_range(image, addr, addr + PAGE_SIZE))
 464                         break;
 465
 466                 /*
 467                  * I know that the page is someones destination page.
 468                  * See if there is already a source page for this
 469                  * destination page.  And if so swap the source pages.
 470                  */
 471                 old = kimage_dst_used(image, addr);
 472                 if (old) {
 473                         /* If so move it */
 474                         unsigned long old_addr;
 475                         struct page *old_page;
 476
 477                         old_addr = *old & PAGE_MASK;
 478                         old_page = pfn_to_page(old_addr >> PAGE_SHIFT);
 479                         copy_highpage(page, old_page);
 480                         *old = addr | (*old & ~PAGE_MASK);
 481
 482                         /* The old page I have found cannot be a
 483                          * destination page, so return it.
 484                          */
 485                         addr = old_addr;
 486                         page = old_page;
 487                         break;
 488                 }
 489                 else {
 490                         /* Place the page on the destination list I
 491                          * will use it later.
 492                          */
 493                         list_add(&page->lru, &image->dest_pages);
 494                 }
 495         }
 496         return page;
 497 }
 498
 499 static int kimage_load_segment(struct kimage *image,
 500         struct kexec_segment *segment)
 501 {
 502         unsigned long mstart;
 503         int result;
 504         unsigned long offset;
 505         unsigned long offset_end;
 506         unsigned char *buf;
 507
 508         result = 0;
 509         buf = segment->buf;
 510         mstart = (unsigned long)segment->mem;
 511
 512         offset_end = segment->memsz;
 513
 514         result = kimage_set_destination(image, mstart);
 515         if (result < 0) {
 516                 goto out;
 517         }
 518         for (offset = 0;  offset < segment->memsz; offset += PAGE_SIZE) {
 519                 struct page *page;
 520                 char *ptr;
 521                 size_t size, leader;
 522                 page = kimage_alloc_page(image, GFP_HIGHUSER, mstart + offset);
 523                 if (page == 0) {
 524                         result  = -ENOMEM;
 525                         goto out;
 526                 }
 527                 result = kimage_add_page(image, page_to_pfn(page) << PAGE_SHIFT);
 528                 if (result < 0) {
 529                         goto out;
 530                 }
 531                 ptr = kmap(page);
 532                 if (segment->bufsz < offset) {
 533                         /* We are past the end zero the whole page */
 534                         memset(ptr, 0, PAGE_SIZE);
 535                         kunmap(page);
 536                         continue;
 537                 }
 538                 size = PAGE_SIZE;
 539                 leader = 0;
 540                 if ((offset == 0)) {
 541                         leader = mstart & ~PAGE_MASK;
 542                 }
 543                 if (leader) {
 544                         /* We are on the first page zero the unused portion */
 545                         memset(ptr, 0, leader);
 546                         size -= leader;
 547                         ptr += leader;
 548                 }
 549                 if (size > (segment->bufsz - offset)) {
 550                         size = segment->bufsz - offset;
 551                 }
 552                 if (size < (PAGE_SIZE - leader)) {
 553                         /* zero the trailing part of the page */
 554                         memset(ptr + size, 0, (PAGE_SIZE - leader) - size);
 555                 }
 556                 result = copy_from_user(ptr, buf + offset, size);
 557                 kunmap(page);
 558                 if (result) {
 559                         result = (result < 0) ? result : -EIO;
 560                         goto out;
 561                 }
 562         }
 563  out:
 564         return result;
 565 }
 566
 567 /*
 568  * Exec Kernel system call: for obvious reasons only root may call it.
 569  *
 570  * This call breaks up into three pieces.
 571  * - A generic part which loads the new kernel from the current
 572  *   address space, and very carefully places the data in the
 573  *   allocated pages.
 574  *
 575  * - A generic part that interacts with the kernel and tells all of
 576  *   the devices to shut down.  Preventing on-going dmas, and placing
 577  *   the devices in a consistent state so a later kernel can
 578  *   reinitialize them.
 579  *
 580  * - A machine specific part that includes the syscall number
 581  *   and the copies the image to it's final destination.  And
 582  *   jumps into the image at entry.
 583  *
 584  * kexec does not sync, or unmount filesystems so if you need
 585  * that to happen you need to do that yourself.
 586  */
 587 struct kimage *kexec_image = NULL;
 588
 589 asmlinkage long sys_kexec_load(unsigned long entry, unsigned long nr_segments,
 590         struct kexec_segment *segments, unsigned long flags)
 591 {
 592         struct kimage *image;
 593         int result;
 594
 595         /* We only trust the superuser with rebooting the system. */
 596         if (!capable(CAP_SYS_BOOT))
 597                 return -EPERM;
 598
 599         /*
 600          * In case we need just a little bit of special behavior for
 601          * reboot on panic.
 602          */
 603         if (flags != 0)
 604                 return -EINVAL;
 605
 606         if (nr_segments > KEXEC_SEGMENT_MAX)
 607                 return -EINVAL;
 608
 609         image = NULL;
 610         result = 0;
 611
 612         if (nr_segments > 0) {
 613                 unsigned long i;
 614                 result = kimage_alloc(&image, nr_segments, segments);
 615                 if (result) {
 616                         goto out;
 617                 }
 618                 result = machine_kexec_prepare(image);
 619                 if (result) {
 620                         goto out;
 621                 }
 622                 image->start = entry;
 623                 for (i = 0; i < nr_segments; i++) {
 624                         result = kimage_load_segment(image, &image->segment[i]);
 625                         if (result) {
 626                                 goto out;
 627                         }
 628                 }
 629                 result = kimage_terminate(image);
 630                 if (result) {
 631                         goto out;
 632                 }
 633         }
 634
 635         image = xchg(&kexec_image, image);
 636
 637  out:
 638         kimage_free(image);
 639         return result;
 640 }