/*
 * Implements the dump driver interface for saving a dump in available
 * memory areas. The saved pages may be written out to persistent storage
 * after a soft-boot.
 *
 * Started: Oct 2002 - Suparna Bhattacharya <suparna@in.ibm.com>
 *
 * Copyright (C) 2002 International Business Machines Corp.
 *
 * This code is released under version 2 of the GNU GPL.
 *
 * The approach of tracking pages containing saved dump using map pages
 * allocated as needed has been derived from the Mission Critical Linux
 * mcore dump implementation.
 *
 * Credits and a big thanks for letting the lkcd project make use of
 * the excellent piece of work and also helping with clarifications
 * and tips along the way are due to:
 *	Dave Winchell <winchell@mclx.com> (primary author of mcore)
 *	Jeff Moyer <moyer@mclx.com>
 *	Josh Huber <huber@mclx.com>
 *
 * For those familiar with the mcore code, the main differences worth
 * noting here (besides the dump device abstraction) result from enabling
 * "high" memory pages (pages not permanently mapped in the kernel
 * address space) to be used for saving dump data (because of which a
 * simple virtual address based linked list cannot be used anymore for
 * managing free pages), an added level of indirection for faster
 * lookups during the post-boot stage, and the idea of pages being
 * made available as they get freed up while dump to memory progresses
 * rather than one time before starting the dump. The last point enables
 * a full memory snapshot to be saved starting with an initial set of
 * bootstrap pages given a good compression ratio. (See dump_overlay.c)
 */
/*
 * ----------------- MEMORY LAYOUT ------------------
 * The memory space consists of a set of discontiguous pages, and
 * discontiguous map pages as well, rooted in a chain of indirect
 * map pages (also discontiguous). Except for the indirect maps
 * (which must be preallocated in advance), the rest of the pages
 * could be in high memory.
 *
 *  |    ---------     --------        --------
 *  -->| . . +|----->|  . +|-------->| . .  |      indirect
 *      --|--|---    ---|----        --|-|---      maps
 *        |  |           |             | |
 *     ------  ------  -------     ------  -------
 *    | .  | | .  |   | . .  |    | .  | | . .  |  maps
 *     --|---  --|---  --|-|--     --|---  --|-|--
 *     page    page   page page   page   page page    data
 *
 * Writes to the dump device happen sequentially in append mode.
 * The main reason for the existence of the indirect map is
 * to enable a quick way to look up a specific logical offset in
 * the saved data post-soft-boot, e.g. to write out pages
 * with more critical data first, even though such pages
 * would have been compressed and copied last, being the lowest
 * ranked candidates for reuse due to their criticality.
 * (See dump_overlay.c)
 */
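/*
 * Worked example of the layout above (assuming 4KB pages and 4-byte
 * longs, as on i386): DUMP_MAP_SZ = 1024 entries per direct map and
 * DUMP_IND_MAP_SZ = 1023 usable entries per indirect map, the last
 * entry being the link to the next indirect map. To find logical page
 * loc = 2500, compute index = 2500 / 1024 = 2; entry 2 lies within the
 * first indirect map (entries 0..1022) so no chain hop is needed, and
 * its value is the pfn of the direct map page, whose own entry
 * 2500 - 2*1024 = 452 holds the pfn of the data page. This is exactly
 * what dump_mem_lookup() below does.
 */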
#include <linux/highmem.h>
#include <linux/bootmem.h>
#include <linux/dump.h>
#include "dump_methods.h"

#define DUMP_MAP_SZ	(PAGE_SIZE / sizeof(unsigned long)) /* direct map size */
#define DUMP_IND_MAP_SZ	(DUMP_MAP_SZ - 1)	/* indirect map size */
#define DUMP_NR_BOOTSTRAP	64		/* no of bootstrap pages */
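/*
 * Capacity arithmetic (same 4KB page / 4-byte long assumption): one
 * direct map page covers 1024 data pages, i.e. 4MB of saved data, and
 * one indirect map page chains 1023 direct maps, i.e. about 4GB per
 * indirect map page. The DUMP_NR_BOOTSTRAP pages are merely the seed;
 * the rest of the space arrives as pages get freed up during the dump
 * (see dump_overlay.c).
 */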
extern int dump_low_page(struct page *);
/* check if the next entry crosses a page boundary */
static inline int is_last_map_entry(unsigned long *map)
{
        unsigned long addr = (unsigned long)(map + 1);

        return (!(addr & (PAGE_SIZE - 1)));
}
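/*
 * Example: map entries fill exactly one page, so for a (hypothetical)
 * map page at 0xc0001000 the entry at 0xc0001ffc is the last one;
 * map + 1 then evaluates to 0xc0002000, which is page aligned.
 */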
/* Todo: should have some validation checks */
/* The last entry in the indirect map points to the next indirect map */
/* Indirect maps are referred to directly by virtual address */
static inline unsigned long *next_indirect_map(unsigned long *map)
{
        return (unsigned long *)map[DUMP_IND_MAP_SZ];
}
#ifdef CONFIG_CRASH_DUMP_SOFTBOOT
/* Called during early bootup - fixme: make this __init */
void dump_early_reserve_map(struct dump_memdev *dev)
{
        unsigned long *map1, *map2;
        loff_t off = 0, last = dev->last_used_offset >> PAGE_SHIFT;
        int i, j;

        printk("Reserve bootmap space holding previous dump of %lld pages\n",
                last);
        map1 = (unsigned long *)dev->indirect_map_root;

        while (map1 && (off < last)) {
                reserve_bootmem(virt_to_phys((void *)map1), PAGE_SIZE);
                for (i = 0; (i < DUMP_MAP_SZ - 1) && map1[i] && (off < last);
                        i++, off += DUMP_MAP_SZ) {
                        pr_debug("indirect map[%d] = 0x%lx\n", i, map1[i]);
                        if (map1[i] >= max_low_pfn)
                                continue;
                        reserve_bootmem(map1[i] << PAGE_SHIFT, PAGE_SIZE);
                        map2 = pfn_to_kaddr(map1[i]);
                        for (j = 0; (j < DUMP_MAP_SZ) && map2[j] &&
                                (off + j < last); j++) {
                                pr_debug("\t map[%d][%d] = 0x%lx\n", i, j,
                                        map2[j]);
                                if (map2[j] < max_low_pfn) {
                                        reserve_bootmem(map2[j] << PAGE_SHIFT,
                                                PAGE_SIZE);
                                }
                        }
                }
                map1 = next_indirect_map(map1);
        }
        dev->nr_free = 0; /* these pages don't belong to this boot */
}
#endif
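/*
 * The reserve_bootmem() calls above keep the previous dump's low
 * memory pages out of this boot's bootmem allocator. Pfns beyond
 * max_low_pfn are skipped there; those highmem pages are protected
 * later, by dump_mark_map() below, once page structs are available.
 */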
/* mark dump pages so that they aren't used by this kernel */
void dump_mark_map(struct dump_memdev *dev)
{
        unsigned long *map1, *map2;
        loff_t off = 0, last = dev->last_used_offset >> PAGE_SHIFT;
        struct page *page;
        int i, j;

        printk("Dump: marking pages in use by previous dump\n");
        map1 = (unsigned long *)dev->indirect_map_root;

        while (map1 && (off < last)) {
                page = virt_to_page(map1);
                set_page_count(page, 1);
                for (i = 0; (i < DUMP_MAP_SZ - 1) && map1[i] && (off < last);
                        i++, off += DUMP_MAP_SZ) {
                        pr_debug("indirect map[%d] = 0x%lx\n", i, map1[i]);
                        page = pfn_to_page(map1[i]);
                        set_page_count(page, 1);
                        map2 = kmap_atomic(page, KM_DUMP);
                        for (j = 0; (j < DUMP_MAP_SZ) && map2[j] &&
                                (off + j < last); j++) {
                                pr_debug("\t map[%d][%d] = 0x%lx\n", i, j,
                                        map2[j]);
                                page = pfn_to_page(map2[j]);
                                set_page_count(page, 1);
                        }
                        kunmap_atomic(map2, KM_DUMP);
                }
                map1 = next_indirect_map(map1);
        }
}
/*
 * Given a logical offset into the mem device lookup the
 * corresponding page.
 *	loc is specified in units of pages
 * Note: affects curr_map (even in the case where lookup fails)
 */
struct page *dump_mem_lookup(struct dump_memdev *dump_mdev, unsigned long loc)
{
        unsigned long *map;
        unsigned long i, index = loc / DUMP_MAP_SZ;
        struct page *page = NULL;
        unsigned long curr_pfn, curr_map, *curr_map_ptr = NULL;

        map = (unsigned long *)dump_mdev->indirect_map_root;

        if (!map)
                return NULL;

        if (loc > dump_mdev->last_offset >> PAGE_SHIFT)
                return NULL;

        /*
         * first locate the right indirect map
         * in the chain of indirect maps
         */
        for (i = 0; i + DUMP_IND_MAP_SZ < index; i += DUMP_IND_MAP_SZ) {
                if (!(map = next_indirect_map(map)))
                        return NULL;
        }
        /* then the right direct map */
        /* map entries are referred to by page index */
        if ((curr_map = map[index - i])) {
                page = pfn_to_page(curr_map);
                /* update the current traversal index */
                /* dump_mdev->curr_map = &map[index - i]; */
                curr_map_ptr = &map[index - i];
        }

        if (!page)
                return NULL;

        map = kmap_atomic(page, KM_DUMP);

        /* and finally the right entry therein */
        /* data pages are referred to by page index */
        i = index * DUMP_MAP_SZ;
        if ((curr_pfn = map[loc - i])) {
                page = pfn_to_page(curr_pfn);
                dump_mdev->curr_map = curr_map_ptr;
                dump_mdev->curr_map_offset = loc - i;
                dump_mdev->ddev.curr_offset = loc << PAGE_SHIFT;
        } else {
                page = NULL;
        }
        kunmap_atomic(map, KM_DUMP);

        return page;
}
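/*
 * Typical use, as in dump_mem_write() below: convert a byte offset to
 * page units first, i.e.
 *	page = dump_mem_lookup(dump_mdev, dev->curr_offset >> PAGE_SHIFT);
 * and then walk sequentially using dump_mem_next_page().
 */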
/*
 * Retrieves a pointer to the next page in the dump device
 * Used during the lookup pass post-soft-reboot
 */
struct page *dump_mem_next_page(struct dump_memdev *dev)
{
        unsigned long i;
        unsigned long *map;
        struct page *page = NULL;

        if (dev->ddev.curr_offset + PAGE_SIZE >= dev->last_offset) {
                return NULL;
        }

        if ((i = (unsigned long)(++dev->curr_map_offset)) >= DUMP_MAP_SZ) {
                /* move to next map */
                if (is_last_map_entry(++dev->curr_map)) {
                        /* move to the next indirect map page */
                        printk("dump_mem_next_page: go to next indirect map\n");
                        dev->curr_map = (unsigned long *)*dev->curr_map;
                        if (!dev->curr_map)
                                return NULL;
                }
                i = dev->curr_map_offset = 0;
                pr_debug("dump_mem_next_page: next map 0x%lx, entry 0x%lx\n",
                        (unsigned long)dev->curr_map, *dev->curr_map);
        }

        if (*dev->curr_map) {
                map = kmap_atomic(pfn_to_page(*dev->curr_map), KM_DUMP);
                if (map[i])
                        page = pfn_to_page(map[i]);
                kunmap_atomic(map, KM_DUMP);
                dev->ddev.curr_offset += PAGE_SIZE;
        }

        return page;
}
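/*
 * The walk above mirrors the layout diagram at the top of this file:
 * curr_map_offset indexes entries within the current direct map,
 * curr_map points at the direct map's slot in the indirect map, and
 * running off the end of an indirect map follows its last entry to
 * the next indirect map in the chain.
 */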
/* Copied from dump_filters.c */
static inline int kernel_page(struct page *p)
{
        /* FIXME: Need to exclude hugetlb pages. Clue: reserved but inuse */
        return (PageReserved(p) && !PageInuse(p)) ||
                (!PageLRU(p) && PageInuse(p));
}

static inline int user_page(struct page *p)
{
        return PageInuse(p) && (!PageReserved(p) && PageLRU(p));
}
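/*
 * Pages that are neither kernel nor user pages by the above tests
 * (i.e. free pages) are the ones we are prepared to overwrite; see
 * the comment above dump_mem_silence() about limiting ourselves to
 * free pages for now.
 */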
int dump_reused_by_boot(struct page *page)
{
        /* Todo
         * Checks:
         *	if < __end + bootmem_bootmap_pages for this boot + allowance
         *	if overwritten by initrd (how to check ?)
         * Also, add more checks in early boot code
         * e.g. bootmem bootmap alloc verify not overwriting dump, and if
         * so then realloc or move the dump pages out accordingly.
         */

        /* Temporary proof of concept hack, avoid overwriting kern pages */

        return (kernel_page(page) || dump_low_page(page) || user_page(page));
}
/* Uses the free page passed in to expand available space */
int dump_mem_add_space(struct dump_memdev *dev, struct page *page)
{
        struct page *map_page;
        unsigned long *map;
        unsigned long i;

        if (!dev->curr_map)
                return -ENOMEM; /* must've exhausted indirect map */

        if (!*dev->curr_map || dev->curr_map_offset >= DUMP_MAP_SZ) {
                /* the page fills a direct map slot of its own */
                *dev->curr_map = page_to_pfn(page);
                dev->curr_map_offset = 0;
                return 0;
        }

        /* record the page as data in the current direct map */
        i = dev->curr_map_offset;
        map_page = pfn_to_page(*dev->curr_map);
        map = (unsigned long *)kmap_atomic(map_page, KM_DUMP);
        map[i] = page_to_pfn(page);
        kunmap_atomic(map, KM_DUMP);
        dev->curr_map_offset = ++i;
        dev->last_offset += PAGE_SIZE;
        if (i >= DUMP_MAP_SZ) {
                /* move to next map */
                if (is_last_map_entry(++dev->curr_map)) {
                        /* move to the next indirect map page */
                        pr_debug("dump_mem_add_space: using next"
                                " indirect map\n");
                        dev->curr_map = (unsigned long *)*dev->curr_map;
                }
        }
        return 0;
}
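/*
 * E.g. in dump_mem_open() below: the first bootstrap page handed in
 * becomes a direct map page (*curr_map is still zero), and every page
 * after that is recorded as a data page in it, bumping last_offset by
 * PAGE_SIZE each time; DUMP_NR_BOOTSTRAP = 64 pages thus yield 1 map
 * page plus 63 pages of bootstrap data space.
 */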
/* Caution: making a dest page invalidates existing contents of the page */
int dump_check_and_free_page(struct dump_memdev *dev, struct page *page)
{
        int err = 0;

        /*
         * the page can be used as a destination only if we are sure
         * it won't get overwritten by the soft-boot, and is not
         * critical for us right now.
         */
        if (dump_reused_by_boot(page))
                return 0;

        if ((err = dump_mem_add_space(dev, page))) {
                printk("Warning: Unable to extend memdev space. Err %d\n",
                        err);
                return 0;
        }

        dev->nr_free++;
        return 1;
}
/* Set up the initial maps and bootstrap space */
/* Must be called only after any previous dump is written out */
int dump_mem_open(struct dump_dev *dev, unsigned long devid)
{
        struct dump_memdev *dump_mdev = DUMP_MDEV(dev);
        unsigned long nr_maps, *map, *prev_map = &dump_mdev->indirect_map_root;
        void *addr;
        struct page *page;
        unsigned long i = 0;
        int err = 0;

        /* Todo: sanity check for unwritten previous dump */

        /* allocate pages for indirect map (non highmem area) */
        nr_maps = num_physpages / DUMP_MAP_SZ; /* maps to cover entire mem */
        for (i = 0; i < nr_maps; i += DUMP_IND_MAP_SZ) {
                if (!(map = (unsigned long *)dump_alloc_mem(PAGE_SIZE))) {
                        printk("Unable to alloc indirect map %ld\n",
                                i / DUMP_IND_MAP_SZ);
                        return -ENOMEM;
                }
                clear_page(map);
                *prev_map = (unsigned long)map;
                prev_map = &map[DUMP_IND_MAP_SZ];
        }
        dump_mdev->curr_map = (unsigned long *)dump_mdev->indirect_map_root;
        dump_mdev->curr_map_offset = 0;

        /*
         * allocate a few bootstrap pages: at least 1 map and 1 data page
         * plus enough to save the dump header
         */
        i = 0;
        do {
                if (!(addr = dump_alloc_mem(PAGE_SIZE))) {
                        printk("Unable to alloc bootstrap page %ld\n", i);
                        return -ENOMEM;
                }

                page = virt_to_page(addr);
                if (dump_low_page(page)) {
                        dump_free_mem(addr);
                        continue;
                }

                if ((err = dump_mem_add_space(dump_mdev, page))) {
                        printk("Warning: Unable to extend memdev "
                                "space. Err %d\n", err);
                        dump_free_mem(addr);
                        break;
                }
                i++;
        } while (i < DUMP_NR_BOOTSTRAP);

        printk("dump memdev init: %ld maps, %ld bootstrap pgs, %ld free pgs\n",
                nr_maps, i,
                (unsigned long)(dump_mdev->last_offset >> PAGE_SHIFT));

        dump_mdev->last_bs_offset = dump_mdev->last_offset;

        return 0;
}
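/*
 * Sizing example: with 1GB of RAM and 4KB pages, num_physpages is
 * 262144, so nr_maps = 262144 / 1024 = 256 direct maps are needed to
 * cover all of memory, and a single preallocated indirect map page
 * suffices (256 <= DUMP_IND_MAP_SZ). The direct maps themselves are
 * populated lazily, out of pages freed up as the dump progresses.
 */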
/* Releases all pre-alloc'd pages */
int dump_mem_release(struct dump_dev *dev)
{
        struct dump_memdev *dump_mdev = DUMP_MDEV(dev);
        struct page *page, *map_page;
        unsigned long *map, *prev_map;
        void *addr;
        int i;

        if (!dump_mdev->nr_free)
                return 0;

        pr_debug("dump_mem_release\n");
        page = dump_mem_lookup(dump_mdev, 0);
        for (i = 0; page && (i < DUMP_NR_BOOTSTRAP - 1); i++) {
                if (PageHighMem(page))
                        break;
                addr = page_address(page);
                if (!addr) {
                        printk("page_address(%p) = NULL\n", page);
                        break;
                }
                pr_debug("Freeing page at 0x%lx\n", (unsigned long)addr);
                dump_free_mem(addr);
                if (dump_mdev->curr_map_offset >= DUMP_MAP_SZ - 1) {
                        /* the current direct map page is fully consumed */
                        map_page = pfn_to_page(*dump_mdev->curr_map);
                        if (PageHighMem(map_page))
                                break;
                        page = dump_mem_next_page(dump_mdev);
                        addr = page_address(map_page);
                        if (!addr) {
                                printk("page_address(%p) = NULL\n",
                                        map_page);
                                break;
                        }
                        pr_debug("Freeing map page at 0x%lx\n",
                                (unsigned long)addr);
                        dump_free_mem(addr);
                        i++;
                } else {
                        page = dump_mem_next_page(dump_mdev);
                }
        }

        /* now for the last used bootstrap page used as a map page */
        if ((i < DUMP_NR_BOOTSTRAP) && (*dump_mdev->curr_map)) {
                map_page = pfn_to_page(*dump_mdev->curr_map);
                if ((map_page) && !PageHighMem(map_page)) {
                        addr = page_address(map_page);
                        if (!addr) {
                                printk("page_address(%p) = NULL\n", map_page);
                        } else {
                                pr_debug("Freeing map page at 0x%lx\n",
                                        (unsigned long)addr);
                                dump_free_mem(addr);
                                i++;
                        }
                }
        }

        printk("Freed %d bootstrap pages\n", i);

        /* free the indirect maps */
        map = (unsigned long *)dump_mdev->indirect_map_root;

        i = 0;
        while (map) {
                prev_map = map;
                map = next_indirect_map(map);
                dump_free_mem(prev_map);
                i++;
        }

        printk("Freed %d indirect map(s)\n", i);

        /* Reset the indirect map */
        dump_mdev->indirect_map_root = 0;
        dump_mdev->curr_map = 0;

        /* Reset the free list */
        dump_mdev->nr_free = 0;

        dump_mdev->last_offset = dump_mdev->ddev.curr_offset = 0;
        dump_mdev->last_used_offset = 0;
        dump_mdev->curr_map = NULL;
        dump_mdev->curr_map_offset = 0;
        return 0;
}
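/*
 * Note on the freeing order above: data pages are walked with
 * dump_mem_next_page(), and a direct map page can only be freed once
 * its last entry has been visited, which is why the final, partially
 * filled map page needs the special case after the loop.
 */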
/*
 * It is critical for this to be very strict. Cannot afford
 * to have anything running and accessing memory while we overwrite
 * memory (potential risk of data corruption).
 * If in doubt (e.g. if a cpu is hung and not responding) just give
 * up and refuse to proceed with this scheme.
 *
 * Note: I/O will only happen after soft-boot/switchover, so we can
 * safely disable interrupts and force stop other CPUs if this is
 * going to be a disruptive dump, no matter what they
 * are in the middle of.
 */
/*
 * ATM most of this is already taken care of in the nmi handler.
 * We may halt the cpus right away if we know this is going to be disruptive.
 * For now, since we've limited ourselves to overwriting free pages, we
 * aren't doing much here. Eventually, we'd have to wait to make sure other
 * cpus aren't using memory we could be overwriting.
 */
int dump_mem_silence(struct dump_dev *dev)
{
        struct dump_memdev *dump_mdev = DUMP_MDEV(dev);

        if (dump_mdev->last_offset > dump_mdev->last_bs_offset) {
                /* prefer to run lkcd config & start with a clean slate */
                return -EEXIST;
        }
        return 0;
}
extern int dump_overlay_resume(void);

/* Trigger the next stage of dumping */
int dump_mem_resume(struct dump_dev *dev)
{
        dump_overlay_resume();
        return 0;
}
/*
 * Allocate mem dev pages as required and copy buffer contents into them.
 * Fails if no free pages are available.
 * Keeping it simple and limited for starters (can modify this over time):
 * does not handle holes or a sparse layout; data must be in multiples
 * of PAGE_SIZE.
 */
int dump_mem_write(struct dump_dev *dev, void *buf, unsigned long len)
{
        struct dump_memdev *dump_mdev = DUMP_MDEV(dev);
        struct page *page;
        unsigned long n = 0;
        void *addr;
        unsigned long *saved_curr_map, saved_map_offset;
        int ret = 0;

        pr_debug("dump_mem_write: offset 0x%llx, size %ld\n",
                dev->curr_offset, len);

        if (dev->curr_offset + len > dump_mdev->last_offset) {
                printk("Out of space to write\n");
                return -ENOSPC;
        }

        if ((len & (PAGE_SIZE - 1)) || (dev->curr_offset & (PAGE_SIZE - 1)))
                return -EINVAL; /* not aligned in units of page size */

        saved_curr_map = dump_mdev->curr_map;
        saved_map_offset = dump_mdev->curr_map_offset;
        page = dump_mem_lookup(dump_mdev, dev->curr_offset >> PAGE_SHIFT);

        for (n = len; (n > 0) && page; n -= PAGE_SIZE, buf += PAGE_SIZE) {
                addr = kmap_atomic(page, KM_DUMP);
                /* memset(addr, 'x', PAGE_SIZE); */
                memcpy(addr, buf, PAGE_SIZE);
                kunmap_atomic(addr, KM_DUMP);
                /* dev->curr_offset += PAGE_SIZE; */
                page = dump_mem_next_page(dump_mdev);
        }

        dump_mdev->curr_map = saved_curr_map;
        dump_mdev->curr_map_offset = saved_map_offset;

        if (dump_mdev->last_used_offset < dev->curr_offset)
                dump_mdev->last_used_offset = dev->curr_offset;

        return (len - n) ? (len - n) : ret;
}
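/*
 * Returns the number of bytes actually copied (len - n), so a short
 * write is possible if the chain of allocated pages runs out before
 * len is exhausted. The per-page curr_offset increment is commented
 * out above because dump_mem_next_page() already advances
 * ddev.curr_offset as it walks.
 */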
/* dummy - always ready */
int dump_mem_ready(struct dump_dev *dev, void *buf)
{
        return 0;
}
/*
 * Should check for availability of space to write up to the offset;
 * affects only the curr_offset; last_offset untouched.
 * Keep it simple: only allow multiples of PAGE_SIZE for now.
 */
int dump_mem_seek(struct dump_dev *dev, loff_t offset)
{
        struct dump_memdev *dump_mdev = DUMP_MDEV(dev);

        if (offset & (PAGE_SIZE - 1))
                return -EINVAL; /* allow page size units only for now */

        /* Are we exceeding available space ? */
        if (offset > dump_mdev->last_offset) {
                printk("dump_mem_seek failed for offset 0x%llx\n",
                        offset);
                return -ENOSPC;
        }

        dump_mdev->ddev.curr_offset = offset;
        return 0;
}
struct dump_dev_ops dump_memdev_ops = {
        .open           = dump_mem_open,
        .release        = dump_mem_release,
        .silence        = dump_mem_silence,
        .resume         = dump_mem_resume,
        .seek           = dump_mem_seek,
        .write          = dump_mem_write,
        .read           = NULL, /* not implemented at the moment */
        .ready          = dump_mem_ready
};
static struct dump_memdev default_dump_memdev = {
        .ddev = {.type_name = "memdev", .ops = &dump_memdev_ops,
                },
        /* assume the rest of the fields are zeroed by default */
};

/* may be overwritten if a previous dump exists */
struct dump_memdev *dump_memdev = &default_dump_memdev;