/*
 * kexec_core.c - kexec system call core code.
 * Copyright (C) 2002-2004 Eric Biederman <ebiederm@xmission.com>
 *
 * This source code is licensed under the GNU General Public License,
 * Version 2. See the file COPYING for more details.
 */

#define pr_fmt(fmt) "kexec: " fmt

#include <linux/capability.h>
#include <linux/mm.h>
#include <linux/file.h>
#include <linux/slab.h>
#include <linux/fs.h>
#include <linux/kexec.h>
#include <linux/mutex.h>
#include <linux/list.h>
#include <linux/highmem.h>
#include <linux/syscalls.h>
#include <linux/reboot.h>
#include <linux/ioport.h>
#include <linux/hardirq.h>
#include <linux/elf.h>
#include <linux/elfcore.h>
#include <linux/utsname.h>
#include <linux/numa.h>
#include <linux/suspend.h>
#include <linux/device.h>
#include <linux/freezer.h>
#include <linux/pm.h>
#include <linux/cpu.h>
#include <linux/uaccess.h>
#include <linux/io.h>
#include <linux/console.h>
#include <linux/vmalloc.h>
#include <linux/swap.h>
#include <linux/syscore_ops.h>
#include <linux/compiler.h>
#include <linux/hugetlb.h>

#include <asm/page.h>
#include <asm/sections.h>

#include <crypto/hash.h>
#include <crypto/sha.h>
#include "kexec_internal.h"

DEFINE_MUTEX(kexec_mutex);

/* Per cpu memory for storing cpu states in case of system crash. */
note_buf_t __percpu *crash_notes;

/* vmcoreinfo stuff */
static unsigned char vmcoreinfo_data[VMCOREINFO_BYTES];
u32 vmcoreinfo_note[VMCOREINFO_NOTE_SIZE/4];
size_t vmcoreinfo_size;
size_t vmcoreinfo_max_size = sizeof(vmcoreinfo_data);

/* Flag to indicate we are going to kexec a new kernel */
bool kexec_in_progress = false;

/* Location of the reserved area for the crash kernel */
struct resource crashk_res = {
	.name  = "Crash kernel",
	.start = 0,
	.end   = 0,
	.flags = IORESOURCE_BUSY | IORESOURCE_MEM
};
struct resource crashk_low_res = {
	.name  = "Crash kernel",
	.start = 0,
	.end   = 0,
	.flags = IORESOURCE_BUSY | IORESOURCE_MEM
};

int kexec_should_crash(struct task_struct *p)
{
	/*
	 * If crash_kexec_post_notifiers is enabled, don't run
	 * crash_kexec() here yet, which must be run after panic
	 * notifiers in panic().
	 */
	if (crash_kexec_post_notifiers)
		return 0;
	/*
	 * There are four panic() calls in the do_exit() path, each of
	 * which corresponds to one of these four conditions.
	 */
	if (in_interrupt() || !p->pid || is_global_init(p) || panic_on_oops)
		return 1;
	return 0;
}

/*
 * When kexec transitions to the new kernel there is a one-to-one
 * mapping between physical and virtual addresses. On processors
 * where you can disable the MMU this is trivial and easy. For
 * others it is still a simple predictable page table to set up.
 *
 * In that environment kexec copies the new kernel to its final
 * resting place. This means I can only support memory whose
 * physical address can fit in an unsigned long. In particular
 * addresses where (pfn << PAGE_SHIFT) > ULONG_MAX cannot be handled.
 * If the assembly stub has more restrictive requirements
 * KEXEC_SOURCE_MEMORY_LIMIT and KEXEC_DEST_MEMORY_LIMIT can be
 * defined more restrictively in <asm/kexec.h>.
 *
 * The code for the transition from the current kernel to the
 * new kernel is placed in the control_code_buffer, whose size
 * is given by KEXEC_CONTROL_PAGE_SIZE. In the best case only a single
 * page of memory is necessary, but some architectures require more.
 * Because this memory must be identity mapped in the transition from
 * virtual to physical addresses it must live in the range
 * 0 - TASK_SIZE, as only the user space mappings are arbitrarily
 * modifiable.
 *
 * The assembly stub in the control code buffer is passed a linked list
 * of descriptor pages detailing the source pages of the new kernel,
 * and the destination addresses of those source pages. As this data
 * structure is not used in the context of the current OS, it must
 * be self-contained.
 *
 * The code has been made to work with highmem pages and will use a
 * destination page in its final resting place (if it happens
 * to allocate it). The end product of this is that most of the
 * physical address space, and most of RAM can be used.
 *
 * Future directions include:
 *  - allocating a page table with the control code buffer identity
 *    mapped, to simplify machine_kexec and make kexec_on_panic more
 *    reliable.
 */
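
/*
 * Example (illustrative, not part of the original source): each
 * kimage_entry_t in the descriptor list is a page-aligned physical
 * address with a flag in the low bits -- IND_DESTINATION marks where
 * the following source pages should land, IND_SOURCE names a source
 * page, IND_INDIRECTION chains to the next descriptor page, and
 * IND_DONE terminates the list. A minimal list copying two pages to
 * physical address 0x1000000 might look like:
 *
 *	0x1000000 | IND_DESTINATION
 *	<phys of source page 0> | IND_SOURCE
 *	<phys of source page 1> | IND_SOURCE
 *	IND_DONE
 */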

/*
 * KIMAGE_NO_DEST is an impossible destination address, used when
 * allocating pages whose destination address we do not care about.
 */
#define KIMAGE_NO_DEST (-1UL)

static struct page *kimage_alloc_page(struct kimage *image,
				       gfp_t gfp_mask,
				       unsigned long dest);

int sanity_check_segment_list(struct kimage *image)
{
	int result, i;
	unsigned long nr_segments = image->nr_segments;

	/*
	 * Verify we have good destination addresses. The caller is
	 * responsible for making certain we don't attempt to load
	 * the new image into invalid or reserved areas of RAM. This
	 * just verifies it is an address we can use.
	 *
	 * Since the kernel does everything in page size chunks ensure
	 * the destination addresses are page aligned. Too many
	 * special cases crop up when we don't do this. The most
	 * insidious is getting overlapping destination addresses
	 * simply because addresses are changed to page size
	 * granularity.
	 */
	result = -EADDRNOTAVAIL;
	for (i = 0; i < nr_segments; i++) {
		unsigned long mstart, mend;

		mstart = image->segment[i].mem;
		mend = mstart + image->segment[i].memsz;
		if ((mstart & ~PAGE_MASK) || (mend & ~PAGE_MASK))
			return result;
		if (mend >= KEXEC_DESTINATION_MEMORY_LIMIT)
			return result;
	}

	/* Verify our destination addresses do not overlap.
	 * If we allowed overlapping destination addresses
	 * through, very weird things can happen with no
	 * easy explanation as one segment stops on another.
	 */
	result = -EINVAL;
	for (i = 0; i < nr_segments; i++) {
		unsigned long mstart, mend;
		unsigned long j;

		mstart = image->segment[i].mem;
		mend = mstart + image->segment[i].memsz;
		for (j = 0; j < i; j++) {
			unsigned long pstart, pend;

			pstart = image->segment[j].mem;
			pend = pstart + image->segment[j].memsz;
			/* Do the segments overlap? */
			if ((mend > pstart) && (mstart < pend))
				return result;
		}
	}

	/* Ensure our buffer sizes are no larger than
	 * our memory sizes. This should always be the case,
	 * and it is easier to check up front than to be surprised
	 * later on.
	 */
	result = -EINVAL;
	for (i = 0; i < nr_segments; i++) {
		if (image->segment[i].bufsz > image->segment[i].memsz)
			return result;
	}

	/*
	 * Verify we have good destination addresses. Normally
	 * the caller is responsible for making certain we don't
	 * attempt to load the new image into invalid or reserved
	 * areas of RAM. But crash kernels are preloaded into a
	 * reserved area of RAM. We must ensure the addresses
	 * are in the reserved area, otherwise preloading the
	 * kernel could corrupt things.
	 */

	if (image->type == KEXEC_TYPE_CRASH) {
		result = -EADDRNOTAVAIL;
		for (i = 0; i < nr_segments; i++) {
			unsigned long mstart, mend;

			mstart = image->segment[i].mem;
			mend = mstart + image->segment[i].memsz - 1;
			/* Ensure we are within the crash kernel limits */
			if ((mstart < crashk_res.start) ||
			    (mend > crashk_res.end))
				return result;
		}
	}

	return 0;
}
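
/*
 * Example (illustrative): a segment such as { .mem = 0x1000000,
 * .memsz = 0x2000, .bufsz = 0x1800 } passes all of the checks above,
 * while .mem = 0x1000800 would fail the page-alignment test and
 * .bufsz = 0x3000 would fail the bufsz <= memsz test.
 */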

struct kimage *do_kimage_alloc_init(void)
{
	struct kimage *image;

	/* Allocate a controlling structure */
	image = kzalloc(sizeof(*image), GFP_KERNEL);
	if (!image)
		return NULL;

	image->head = 0;
	image->entry = &image->head;
	image->last_entry = &image->head;
	image->control_page = ~0; /* By default this does not apply */
	image->type = KEXEC_TYPE_DEFAULT;

	/* Initialize the list of control pages */
	INIT_LIST_HEAD(&image->control_pages);

	/* Initialize the list of destination pages */
	INIT_LIST_HEAD(&image->dest_pages);

	/* Initialize the list of unusable pages */
	INIT_LIST_HEAD(&image->unusable_pages);

	return image;
}

int kimage_is_destination_range(struct kimage *image,
					unsigned long start,
					unsigned long end)
{
	unsigned long i;

	for (i = 0; i < image->nr_segments; i++) {
		unsigned long mstart, mend;

		mstart = image->segment[i].mem;
		mend = mstart + image->segment[i].memsz;
		if ((end > mstart) && (start < mend))
			return 1;
	}

	return 0;
}

static struct page *kimage_alloc_pages(gfp_t gfp_mask, unsigned int order)
{
	struct page *pages;

	pages = alloc_pages(gfp_mask, order);
	if (pages) {
		unsigned int count, i;

		pages->mapping = NULL;
		set_page_private(pages, order);
		count = 1 << order;
		for (i = 0; i < count; i++)
			SetPageReserved(pages + i);
	}

	return pages;
}

static void kimage_free_pages(struct page *page)
{
	unsigned int order, count, i;

	order = page_private(page);
	count = 1 << order;
	for (i = 0; i < count; i++)
		ClearPageReserved(page + i);
	__free_pages(page, order);
}

void kimage_free_page_list(struct list_head *list)
{
	struct list_head *pos, *next;

	list_for_each_safe(pos, next, list) {
		struct page *page;

		page = list_entry(pos, struct page, lru);
		list_del(&page->lru);
		kimage_free_pages(page);
	}
}

static struct page *kimage_alloc_normal_control_pages(struct kimage *image,
							unsigned int order)
{
	/* Control pages are special, they are the intermediaries
	 * that are needed while we copy the rest of the pages
	 * to their final resting place. As such they must
	 * not conflict with either the destination addresses
	 * or memory the kernel is already using.
	 *
	 * The only case where we really need more than one of
	 * these is for architectures where we cannot disable
	 * the MMU and must instead generate an identity mapped
	 * page table for all of the memory.
	 *
	 * At worst this runs in O(N) of the image size.
	 */
	struct list_head extra_pages;
	struct page *pages;
	unsigned int count;

	count = 1 << order;
	INIT_LIST_HEAD(&extra_pages);

	/* Loop while I can allocate a page and the page allocated
	 * is a destination page.
	 */
	do {
		unsigned long pfn, epfn, addr, eaddr;

		pages = kimage_alloc_pages(KEXEC_CONTROL_MEMORY_GFP, order);
		if (!pages)
			break;
		pfn = page_to_pfn(pages);
		epfn = pfn + count;
		addr = pfn << PAGE_SHIFT;
		eaddr = epfn << PAGE_SHIFT;
		if ((epfn >= (KEXEC_CONTROL_MEMORY_LIMIT >> PAGE_SHIFT)) ||
			      kimage_is_destination_range(image, addr, eaddr)) {
			list_add(&pages->lru, &extra_pages);
			pages = NULL;
		}
	} while (!pages);

	if (pages) {
		/* Remember the allocated page... */
		list_add(&pages->lru, &image->control_pages);

		/* Because the page is already in its destination
		 * location we will never allocate another page at
		 * that address. Therefore kimage_alloc_pages
		 * will not return it (again) and we don't need
		 * to give it an entry in image->segment[].
		 */
	}
	/* Deal with the destination pages I have inadvertently allocated.
	 *
	 * Ideally I would convert multi-page allocations into single
	 * page allocations, and add everything to image->dest_pages.
	 *
	 * For now it is simpler to just free the pages.
	 */
	kimage_free_page_list(&extra_pages);

	return pages;
}

static struct page *kimage_alloc_crash_control_pages(struct kimage *image,
						      unsigned int order)
{
	/* Control pages are special, they are the intermediaries
	 * that are needed while we copy the rest of the pages
	 * to their final resting place. As such they must
	 * not conflict with either the destination addresses
	 * or memory the kernel is already using.
	 *
	 * Control pages are also the only pages we must allocate
	 * when loading a crash kernel. All of the other pages
	 * are specified by the segments and we just memcpy
	 * into them directly.
	 *
	 * The only case where we really need more than one of
	 * these is for architectures where we cannot disable
	 * the MMU and must instead generate an identity mapped
	 * page table for all of the memory.
	 *
	 * Given the low demand this implements a very simple
	 * allocator that finds the first hole of the appropriate
	 * size in the reserved memory region, and allocates all
	 * of the memory up to and including the hole.
	 */
	unsigned long hole_start, hole_end, size;
	struct page *pages;

	pages = NULL;
	size = (1 << order) << PAGE_SHIFT;
	hole_start = (image->control_page + (size - 1)) & ~(size - 1);
	hole_end = hole_start + size - 1;
	while (hole_end <= crashk_res.end) {
		unsigned long i;

		if (hole_end > KEXEC_CRASH_CONTROL_MEMORY_LIMIT)
			break;
		/* See if I overlap any of the segments */
		for (i = 0; i < image->nr_segments; i++) {
			unsigned long mstart, mend;

			mstart = image->segment[i].mem;
			mend = mstart + image->segment[i].memsz - 1;
			if ((hole_end >= mstart) && (hole_start <= mend)) {
				/* Advance the hole to the end of the segment */
				hole_start = (mend + (size - 1)) & ~(size - 1);
				hole_end = hole_start + size - 1;
				break;
			}
		}
		/* If I don't overlap any segments I have found my hole! */
		if (i == image->nr_segments) {
			pages = pfn_to_page(hole_start >> PAGE_SHIFT);
			break;
		}
	}
	if (pages)
		image->control_page = hole_end;

	return pages;
}

struct page *kimage_alloc_control_pages(struct kimage *image,
					 unsigned int order)
{
	struct page *pages = NULL;

	switch (image->type) {
	case KEXEC_TYPE_DEFAULT:
		pages = kimage_alloc_normal_control_pages(image, order);
		break;
	case KEXEC_TYPE_CRASH:
		pages = kimage_alloc_crash_control_pages(image, order);
		break;
	}

	return pages;
}

static int kimage_add_entry(struct kimage *image, kimage_entry_t entry)
{
	if (*image->entry != 0)
		image->entry++;

	if (image->entry == image->last_entry) {
		kimage_entry_t *ind_page;
		struct page *page;

		page = kimage_alloc_page(image, GFP_KERNEL, KIMAGE_NO_DEST);
		if (!page)
			return -ENOMEM;

		ind_page = page_address(page);
		*image->entry = virt_to_phys(ind_page) | IND_INDIRECTION;
		image->entry = ind_page;
		image->last_entry = ind_page +
				((PAGE_SIZE/sizeof(kimage_entry_t)) - 1);
	}
	*image->entry = entry;
	image->entry++;
	*image->entry = 0;

	return 0;
}
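
/*
 * Example (illustrative): when the current descriptor page fills up,
 * kimage_add_entry() above links in a fresh one. The last slot of the
 * full page is written as virt_to_phys(next_page) | IND_INDIRECTION,
 * and appending continues at the start of next_page, so the chained
 * pages read as one logical array of entries.
 */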

static int kimage_set_destination(struct kimage *image,
				   unsigned long destination)
{
	int result;

	destination &= PAGE_MASK;
	result = kimage_add_entry(image, destination | IND_DESTINATION);

	return result;
}

static int kimage_add_page(struct kimage *image, unsigned long page)
{
	int result;

	page &= PAGE_MASK;
	result = kimage_add_entry(image, page | IND_SOURCE);

	return result;
}

static void kimage_free_extra_pages(struct kimage *image)
{
	/* Walk through and free any extra destination pages I may have */
	kimage_free_page_list(&image->dest_pages);

	/* Walk through and free any unusable pages I have cached */
	kimage_free_page_list(&image->unusable_pages);
}

void kimage_terminate(struct kimage *image)
{
	if (*image->entry != 0)
		image->entry++;

	*image->entry = IND_DONE;
}

#define for_each_kimage_entry(image, ptr, entry) \
	for (ptr = &image->head; (entry = *ptr) && !(entry & IND_DONE); \
		ptr = (entry & IND_INDIRECTION) ? \
			phys_to_virt((entry & PAGE_MASK)) : ptr + 1)
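
/*
 * Example (illustrative): for_each_kimage_entry() walks the flat run of
 * entries starting at image->head, and whenever it hits an
 * IND_INDIRECTION entry it jumps to the descriptor page that entry
 * points at, stopping at IND_DONE. The IND_DESTINATION and IND_SOURCE
 * entries are therefore visited in exactly the order they were added.
 */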

static void kimage_free_entry(kimage_entry_t entry)
{
	struct page *page;

	page = pfn_to_page(entry >> PAGE_SHIFT);
	kimage_free_pages(page);
}

void kimage_free(struct kimage *image)
{
	kimage_entry_t *ptr, entry;
	kimage_entry_t ind = 0;

	if (!image)
		return;

	kimage_free_extra_pages(image);
	for_each_kimage_entry(image, ptr, entry) {
		if (entry & IND_INDIRECTION) {
			/* Free the previous indirection page */
			if (ind & IND_INDIRECTION)
				kimage_free_entry(ind);
			/* Save this indirection page until we are
			 * done with it.
			 */
			ind = entry;
		} else if (entry & IND_SOURCE)
			kimage_free_entry(entry);
	}
	/* Free the final indirection page */
	if (ind & IND_INDIRECTION)
		kimage_free_entry(ind);

	/* Handle any machine specific cleanup */
	machine_kexec_cleanup(image);

	/* Free the kexec control pages... */
	kimage_free_page_list(&image->control_pages);

	/*
	 * Free up any temporary buffers allocated. This path is taken
	 * if an error occurred long after the buffers were allocated.
	 */
	if (image->file_mode)
		kimage_file_post_load_cleanup(image);

	kfree(image);
}

static kimage_entry_t *kimage_dst_used(struct kimage *image,
					unsigned long page)
{
	kimage_entry_t *ptr, entry;
	unsigned long destination = 0;

	for_each_kimage_entry(image, ptr, entry) {
		if (entry & IND_DESTINATION)
			destination = entry & PAGE_MASK;
		else if (entry & IND_SOURCE) {
			if (page == destination)
				return ptr;
			destination += PAGE_SIZE;
		}
	}

	return NULL;
}

static struct page *kimage_alloc_page(struct kimage *image,
					gfp_t gfp_mask,
					unsigned long destination)
{
	/*
	 * Here we implement safeguards to ensure that a source page
	 * is not copied to its destination page before the data on
	 * the destination page is no longer useful.
	 *
	 * To do this we maintain the invariant that a source page is
	 * either its own destination page, or it is not a
	 * destination page at all.
	 *
	 * That is slightly stronger than required, but the proof
	 * that no problems will occur is trivial, and the
	 * implementation is simple to verify.
	 *
	 * When allocating all pages normally this algorithm will run
	 * in O(N) time, but in the worst case it will run in O(N^2)
	 * time. If the runtime is a problem the data structures can
	 * be fixed.
	 */
	struct page *page;
	unsigned long addr;

	/*
	 * Walk through the list of destination pages, and see if I
	 * have a match.
	 */
	list_for_each_entry(page, &image->dest_pages, lru) {
		addr = page_to_pfn(page) << PAGE_SHIFT;
		if (addr == destination) {
			list_del(&page->lru);
			return page;
		}
	}
	page = NULL;
	while (1) {
		kimage_entry_t *old;

		/* Allocate a page, if we run out of memory give up */
		page = kimage_alloc_pages(gfp_mask, 0);
		if (!page)
			return NULL;
		/* If the page cannot be used file it away */
		if (page_to_pfn(page) >
				(KEXEC_SOURCE_MEMORY_LIMIT >> PAGE_SHIFT)) {
			list_add(&page->lru, &image->unusable_pages);
			continue;
		}
		addr = page_to_pfn(page) << PAGE_SHIFT;

		/* If it is the destination page we want, use it */
		if (addr == destination)
			break;

		/* If the page is not a destination page use it */
		if (!kimage_is_destination_range(image, addr,
						  addr + PAGE_SIZE))
			break;

		/*
		 * I know that the page is someone's destination page.
		 * See if there is already a source page for this
		 * destination page. And if so swap the source pages.
		 */
		old = kimage_dst_used(image, addr);
		if (old) {
			/* If so move it */
			unsigned long old_addr;
			struct page *old_page;

			old_addr = *old & PAGE_MASK;
			old_page = pfn_to_page(old_addr >> PAGE_SHIFT);
			copy_highpage(page, old_page);
			*old = addr | (*old & ~PAGE_MASK);

			/* The old page I have found cannot be a
			 * destination page, so return it if its
			 * gfp_flags honor the ones passed in.
			 */
			if (!(gfp_mask & __GFP_HIGHMEM) &&
			    PageHighMem(old_page)) {
				kimage_free_pages(old_page);
				continue;
			}
			addr = old_addr;
			page = old_page;
			break;
		}
		/* Place the page on the destination list, to be used later */
		list_add(&page->lru, &image->dest_pages);
	}

	return page;
}
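
/*
 * Example (illustrative): suppose a fresh page lands at physical
 * address D, and an earlier source page S already holds data destined
 * for D. The loop above copies S into the page at D (the data is now
 * in its own destination), repoints S's IND_SOURCE entry at D, and
 * reuses S's old page for the current request, keeping the invariant
 * that a source page is never some other page's destination.
 */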

static int kimage_load_normal_segment(struct kimage *image,
					 struct kexec_segment *segment)
{
	unsigned long maddr;
	size_t ubytes, mbytes;
	int result;
	unsigned char __user *buf = NULL;
	unsigned char *kbuf = NULL;

	result = 0;
	if (image->file_mode)
		kbuf = segment->kbuf;
	else
		buf = segment->buf;
	ubytes = segment->bufsz;
	mbytes = segment->memsz;
	maddr = segment->mem;

	result = kimage_set_destination(image, maddr);
	if (result < 0)
		goto out;

	while (mbytes) {
		struct page *page;
		char *ptr;
		size_t uchunk, mchunk;

		page = kimage_alloc_page(image, GFP_HIGHUSER, maddr);
		if (!page) {
			result = -ENOMEM;
			goto out;
		}
		result = kimage_add_page(image, page_to_pfn(page)
								<< PAGE_SHIFT);
		if (result < 0)
			goto out;

		ptr = kmap(page);
		/* Start with a clear page */
		clear_page(ptr);
		ptr += maddr & ~PAGE_MASK;
		mchunk = min_t(size_t, mbytes,
				PAGE_SIZE - (maddr & ~PAGE_MASK));
		uchunk = min(ubytes, mchunk);

		/* For file based kexec, source pages are in kernel memory */
		if (image->file_mode)
			memcpy(ptr, kbuf, uchunk);
		else
			result = copy_from_user(ptr, buf, uchunk);
		kunmap(page);
		if (result) {
			result = -EFAULT;
			goto out;
		}
		ubytes -= uchunk;
		maddr += mchunk;
		if (image->file_mode)
			kbuf += mchunk;
		else
			buf += mchunk;
		mbytes -= mchunk;
	}
out:
	return result;
}

static int kimage_load_crash_segment(struct kimage *image,
					struct kexec_segment *segment)
{
	/* For crash dump kernels we simply copy the data from
	 * user space to its destination.
	 * We do things a page at a time for the sake of kmap.
	 */
	unsigned long maddr;
	size_t ubytes, mbytes;
	int result;
	unsigned char __user *buf = NULL;
	unsigned char *kbuf = NULL;

	result = 0;
	if (image->file_mode)
		kbuf = segment->kbuf;
	else
		buf = segment->buf;
	ubytes = segment->bufsz;
	mbytes = segment->memsz;
	maddr = segment->mem;
	while (mbytes) {
		struct page *page;
		char *ptr;
		size_t uchunk, mchunk;

		page = pfn_to_page(maddr >> PAGE_SHIFT);
		if (!page) {
			result = -ENOMEM;
			goto out;
		}
		ptr = kmap(page);
		ptr += maddr & ~PAGE_MASK;
		mchunk = min_t(size_t, mbytes,
				PAGE_SIZE - (maddr & ~PAGE_MASK));
		uchunk = min(ubytes, mchunk);
		if (mchunk > uchunk) {
			/* Zero the trailing part of the page */
			memset(ptr + uchunk, 0, mchunk - uchunk);
		}

		/* For file based kexec, source pages are in kernel memory */
		if (image->file_mode)
			memcpy(ptr, kbuf, uchunk);
		else
			result = copy_from_user(ptr, buf, uchunk);
		kexec_flush_icache_page(page);
		kunmap(page);
		if (result) {
			result = -EFAULT;
			goto out;
		}
		ubytes -= uchunk;
		maddr += mchunk;
		if (image->file_mode)
			kbuf += mchunk;
		else
			buf += mchunk;
		mbytes -= mchunk;
	}
out:
	return result;
}

int kimage_load_segment(struct kimage *image,
				struct kexec_segment *segment)
{
	int result = -ENOMEM;

	switch (image->type) {
	case KEXEC_TYPE_DEFAULT:
		result = kimage_load_normal_segment(image, segment);
		break;
	case KEXEC_TYPE_CRASH:
		result = kimage_load_crash_segment(image, segment);
		break;
	}

	return result;
}

struct kimage *kexec_image;
struct kimage *kexec_crash_image;
int kexec_load_disabled;

void crash_kexec(struct pt_regs *regs)
{
	/* Take the kexec_mutex here to prevent sys_kexec_load
	 * running on one cpu from replacing the crash kernel
	 * we are using after a panic on a different cpu.
	 *
	 * If the crash kernel was not located in a fixed area
	 * of memory the xchg(&kexec_crash_image) would be
	 * sufficient. But since I reuse the memory...
	 */
	if (mutex_trylock(&kexec_mutex)) {
		if (kexec_crash_image) {
			struct pt_regs fixed_regs;

			crash_setup_regs(&fixed_regs, regs);
			crash_save_vmcoreinfo();
			machine_crash_shutdown(&fixed_regs);
			machine_kexec(kexec_crash_image);
		}
		mutex_unlock(&kexec_mutex);
	}
}

size_t crash_get_memory_size(void)
{
	size_t size = 0;

	mutex_lock(&kexec_mutex);
	if (crashk_res.end != crashk_res.start)
		size = resource_size(&crashk_res);
	mutex_unlock(&kexec_mutex);
	return size;
}

void __weak crash_free_reserved_phys_range(unsigned long begin,
					   unsigned long end)
{
	unsigned long addr;

	for (addr = begin; addr < end; addr += PAGE_SIZE)
		free_reserved_page(pfn_to_page(addr >> PAGE_SHIFT));
}

int crash_shrink_memory(unsigned long new_size)
{
	int ret = 0;
	unsigned long start, end;
	unsigned long old_size;
	struct resource *ram_res;

	mutex_lock(&kexec_mutex);

	if (kexec_crash_image) {
		ret = -ENOENT;
		goto unlock;
	}
	start = crashk_res.start;
	end = crashk_res.end;
	old_size = (end == 0) ? 0 : end - start + 1;
	if (new_size >= old_size) {
		ret = (new_size == old_size) ? 0 : -EINVAL;
		goto unlock;
	}

	ram_res = kzalloc(sizeof(*ram_res), GFP_KERNEL);
	if (!ram_res) {
		ret = -ENOMEM;
		goto unlock;
	}

	start = roundup(start, KEXEC_CRASH_MEM_ALIGN);
	end = roundup(start + new_size, KEXEC_CRASH_MEM_ALIGN);

	crash_map_reserved_pages();
	crash_free_reserved_phys_range(end, crashk_res.end);

	if ((start == end) && (crashk_res.parent != NULL))
		release_resource(&crashk_res);

	ram_res->start = end;
	ram_res->end = crashk_res.end;
	ram_res->flags = IORESOURCE_BUSY | IORESOURCE_MEM;
	ram_res->name = "System RAM";

	crashk_res.end = end - 1;

	insert_resource(&iomem_resource, ram_res);
	crash_unmap_reserved_pages();

unlock:
	mutex_unlock(&kexec_mutex);
	return ret;
}
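
/*
 * Usage sketch (illustrative): crash_shrink_memory() is reachable from
 * user space via the kexec_crash_size sysfs attribute, so shrinking the
 * reserved region to 64 MiB can look like:
 *
 *	echo 67108864 > /sys/kernel/kexec_crash_size
 *
 * It only succeeds while no crash kernel is loaded and the new size is
 * no larger than the current reservation.
 */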

static u32 *append_elf_note(u32 *buf, char *name, unsigned type, void *data,
			    size_t data_len)
{
	struct elf_note note;

	note.n_namesz = strlen(name) + 1;
	note.n_descsz = data_len;
	note.n_type = type;
	memcpy(buf, &note, sizeof(note));
	buf += (sizeof(note) + 3)/4;
	memcpy(buf, name, note.n_namesz);
	buf += (note.n_namesz + 3)/4;
	memcpy(buf, data, note.n_descsz);
	buf += (note.n_descsz + 3)/4;

	return buf;
}
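
/*
 * Layout sketch (illustrative): append_elf_note() emits the standard
 * ELF note wire format into buf, with the name and descriptor each
 * padded out to a 4-byte boundary:
 *
 *	[ n_namesz | n_descsz | n_type ][ name, padded ][ desc, padded ]
 *
 * For name KEXEC_CORE_NOTE_NAME and a struct elf_prstatus descriptor
 * this yields the per-cpu NT_PRSTATUS note that crash dump readers
 * consume.
 */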

static void final_note(u32 *buf)
{
	struct elf_note note;

	note.n_namesz = 0;
	note.n_descsz = 0;
	note.n_type = 0;
	memcpy(buf, &note, sizeof(note));
}

void crash_save_cpu(struct pt_regs *regs, int cpu)
{
	struct elf_prstatus prstatus;
	u32 *buf;

	if ((cpu < 0) || (cpu >= nr_cpu_ids))
		return;

	/* Using ELF notes here is opportunistic.
	 * I need a well defined structure format
	 * for the data I pass, and I need tags
	 * on the data to indicate what information I have
	 * squirrelled away. ELF notes happen to provide
	 * all of that, so there is no need to invent something new.
	 */
	buf = (u32 *)per_cpu_ptr(crash_notes, cpu);
	if (!buf)
		return;
	memset(&prstatus, 0, sizeof(prstatus));
	prstatus.pr_pid = current->pid;
	elf_core_copy_kernel_regs(&prstatus.pr_reg, regs);
	buf = append_elf_note(buf, KEXEC_CORE_NOTE_NAME, NT_PRSTATUS,
			      &prstatus, sizeof(prstatus));
	final_note(buf);
}

static int __init crash_notes_memory_init(void)
{
	/* Allocate memory for saving cpu registers. */
	crash_notes = alloc_percpu(note_buf_t);
	if (!crash_notes) {
		pr_warn("Kexec: Memory allocation for saving cpu register states failed\n");
		return -ENOMEM;
	}
	return 0;
}
subsys_initcall(crash_notes_memory_init);

/*
 * parsing the "crashkernel" commandline
 *
 * this code is intended to be called from architecture specific code
 */

/*
 * This function parses command lines in the format
 *
 *	crashkernel=ramsize-range:size[,...][@offset]
 *
 * The function returns 0 on success and -EINVAL on failure.
 */
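/*
 * Example (illustrative): with
 *
 *	crashkernel=512M-2G:64M,2G-:128M@16M
 *
 * a machine with 1G of RAM reserves 64M, a machine with 4G reserves
 * 128M at physical offset 16M, and a machine with 256M reserves
 * nothing, because no range matches its RAM size.
 */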
static int __init parse_crashkernel_mem(char *cmdline,
					unsigned long long system_ram,
					unsigned long long *crash_size,
					unsigned long long *crash_base)
{
	char *cur = cmdline, *tmp;

	/* for each entry of the comma-separated list */
	do {
		unsigned long long start, end = ULLONG_MAX, size;

		/* get the start of the range */
		start = memparse(cur, &tmp);
		if (cur == tmp) {
			pr_warn("crashkernel: Memory value expected\n");
			return -EINVAL;
		}
		cur = tmp;
		if (*cur != '-') {
			pr_warn("crashkernel: '-' expected\n");
			return -EINVAL;
		}
		cur++;

		/* if no ':' is here, then we read the end of the range */
		if (*cur != ':') {
			end = memparse(cur, &tmp);
			if (cur == tmp) {
				pr_warn("crashkernel: Memory value expected\n");
				return -EINVAL;
			}
			cur = tmp;
			if (end <= start) {
				pr_warn("crashkernel: end <= start\n");
				return -EINVAL;
			}
		}

		if (*cur != ':') {
			pr_warn("crashkernel: ':' expected\n");
			return -EINVAL;
		}
		cur++;

		size = memparse(cur, &tmp);
		if (cur == tmp) {
			pr_warn("Memory value expected\n");
			return -EINVAL;
		}
		cur = tmp;
		if (size >= system_ram) {
			pr_warn("crashkernel: invalid size\n");
			return -EINVAL;
		}

		/* match? */
		if (system_ram >= start && system_ram < end) {
			*crash_size = size;
			break;
		}
	} while (*cur++ == ',');

	if (*crash_size > 0) {
		while (*cur && *cur != ' ' && *cur != '@')
			cur++;
		if (*cur == '@') {
			cur++;
			*crash_base = memparse(cur, &tmp);
			if (cur == tmp) {
				pr_warn("Memory value expected after '@'\n");
				return -EINVAL;
			}
		}
	}

	return 0;
}

/*
 * This function parses "simple" (old) crashkernel command lines like
 *
 *	crashkernel=size[@offset]
 *
 * It returns 0 on success and -EINVAL on failure.
 */
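/*
 * Example (illustrative): crashkernel=128M@16M reserves 128M of RAM
 * starting at physical address 16M; crashkernel=256M leaves crash_base
 * at 0, letting the architecture code pick the base address.
 */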
static int __init parse_crashkernel_simple(char *cmdline,
					   unsigned long long *crash_size,
					   unsigned long long *crash_base)
{
	char *cur = cmdline;

	*crash_size = memparse(cmdline, &cur);
	if (cmdline == cur) {
		pr_warn("crashkernel: memory value expected\n");
		return -EINVAL;
	}

	if (*cur == '@')
		*crash_base = memparse(cur+1, &cur);
	else if (*cur != ' ' && *cur != '\0') {
		pr_warn("crashkernel: unrecognized char\n");
		return -EINVAL;
	}

	return 0;
}

#define SUFFIX_HIGH 0
#define SUFFIX_LOW  1
#define SUFFIX_NULL 2
static __initdata char *suffix_tbl[] = {
	[SUFFIX_HIGH] = ",high",
	[SUFFIX_LOW]  = ",low",
	[SUFFIX_NULL] = NULL,
};

/*
 * This function parses suffixed crashkernel command lines like
 *
 *	crashkernel=size,[high|low]
 *
 * It returns 0 on success and -EINVAL on failure.
 */
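/*
 * Example (illustrative): crashkernel=256M,high asks for 256M that the
 * architecture may place above 4G, while crashkernel=72M,low sizes the
 * companion low-memory region. This parser only extracts the size; the
 * placement policy lives in the architecture code.
 */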
static int __init parse_crashkernel_suffix(char *cmdline,
					   unsigned long long *crash_size,
					   const char *suffix)
{
	char *cur = cmdline;

	*crash_size = memparse(cmdline, &cur);
	if (cmdline == cur) {
		pr_warn("crashkernel: memory value expected\n");
		return -EINVAL;
	}

	/* check with suffix */
	if (strncmp(cur, suffix, strlen(suffix))) {
		pr_warn("crashkernel: unrecognized char\n");
		return -EINVAL;
	}
	cur += strlen(suffix);
	if (*cur != ' ' && *cur != '\0') {
		pr_warn("crashkernel: unrecognized char\n");
		return -EINVAL;
	}

	return 0;
}

static __init char *get_last_crashkernel(char *cmdline,
					 const char *name,
					 const char *suffix)
{
	char *p = cmdline, *ck_cmdline = NULL;

	/* find crashkernel and use the last one if there are more */
	p = strstr(p, name);
	while (p) {
		char *end_p = strchr(p, ' ');
		char *q;

		if (!end_p)
			end_p = p + strlen(p);

		if (!suffix) {
			int i;

			/* skip the one with any known suffix */
			for (i = 0; suffix_tbl[i]; i++) {
				q = end_p - strlen(suffix_tbl[i]);
				if (!strncmp(q, suffix_tbl[i],
					     strlen(suffix_tbl[i])))
					goto next;
			}
			ck_cmdline = p;
		} else {
			q = end_p - strlen(suffix);
			if (!strncmp(q, suffix, strlen(suffix)))
				ck_cmdline = p;
		}
next:
		p = strstr(p+1, name);
	}

	if (!ck_cmdline)
		return NULL;

	return ck_cmdline;
}
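
/*
 * Example (illustrative): given the command line
 *
 *	crashkernel=64M crashkernel=128M,high crashkernel=96M
 *
 * a plain (suffix == NULL) lookup returns the "crashkernel=96M"
 * occurrence -- the last one without a known suffix -- while a lookup
 * with suffix ",high" returns "crashkernel=128M,high".
 */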

static int __init __parse_crashkernel(char *cmdline,
				      unsigned long long system_ram,
				      unsigned long long *crash_size,
				      unsigned long long *crash_base,
				      const char *name,
				      const char *suffix)
{
	char *first_colon, *first_space;
	char *ck_cmdline;

	BUG_ON(!crash_size || !crash_base);
	*crash_size = 0;
	*crash_base = 0;

	ck_cmdline = get_last_crashkernel(cmdline, name, suffix);

	if (!ck_cmdline)
		return -EINVAL;

	ck_cmdline += strlen(name);

	if (suffix)
		return parse_crashkernel_suffix(ck_cmdline, crash_size,
				suffix);
	/*
	 * if the commandline contains a ':', then that's the extended
	 * syntax -- if not, it must be the classic syntax
	 */
	first_colon = strchr(ck_cmdline, ':');
	first_space = strchr(ck_cmdline, ' ');
	if (first_colon && (!first_space || first_colon < first_space))
		return parse_crashkernel_mem(ck_cmdline, system_ram,
				crash_size, crash_base);

	return parse_crashkernel_simple(ck_cmdline, crash_size, crash_base);
}

/*
 * This function is the entry point for command line parsing and should be
 * called from the arch-specific code.
 */
int __init parse_crashkernel(char *cmdline,
			     unsigned long long system_ram,
			     unsigned long long *crash_size,
			     unsigned long long *crash_base)
{
	return __parse_crashkernel(cmdline, system_ram, crash_size, crash_base,
					"crashkernel=", NULL);
}

int __init parse_crashkernel_high(char *cmdline,
				  unsigned long long system_ram,
				  unsigned long long *crash_size,
				  unsigned long long *crash_base)
{
	return __parse_crashkernel(cmdline, system_ram, crash_size, crash_base,
				"crashkernel=", suffix_tbl[SUFFIX_HIGH]);
}

int __init parse_crashkernel_low(char *cmdline,
				 unsigned long long system_ram,
				 unsigned long long *crash_size,
				 unsigned long long *crash_base)
{
	return __parse_crashkernel(cmdline, system_ram, crash_size, crash_base,
				"crashkernel=", suffix_tbl[SUFFIX_LOW]);
}

static void update_vmcoreinfo_note(void)
{
	u32 *buf = vmcoreinfo_note;

	if (!vmcoreinfo_size)
		return;
	buf = append_elf_note(buf, VMCOREINFO_NOTE_NAME, 0, vmcoreinfo_data,
			      vmcoreinfo_size);
	final_note(buf);
}

void crash_save_vmcoreinfo(void)
{
	vmcoreinfo_append_str("CRASHTIME=%ld\n", get_seconds());
	update_vmcoreinfo_note();
}

void vmcoreinfo_append_str(const char *fmt, ...)
{
	va_list args;
	char buf[0x50];
	size_t r;

	va_start(args, fmt);
	r = vscnprintf(buf, sizeof(buf), fmt, args);
	va_end(args);

	r = min(r, vmcoreinfo_max_size - vmcoreinfo_size);

	memcpy(&vmcoreinfo_data[vmcoreinfo_size], buf, r);

	vmcoreinfo_size += r;
}
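
/*
 * Example (illustrative): the vmcoreinfo data built up through
 * vmcoreinfo_append_str() is a newline-separated list of KEY=value
 * strings, e.g.
 *
 *	OSRELEASE=4.15.0
 *	PAGESIZE=4096
 *	SYMBOL(init_uts_ns)=ffffffff81e10480
 *	CRASHTIME=1518000000
 *
 * which dump tools parse to interpret the memory image without the
 * matching vmlinux at hand. The values shown are made up.
 */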

/*
 * provide an empty default implementation here -- architecture
 * code may override this
 */
void __weak arch_crash_save_vmcoreinfo(void)
{}

unsigned long __weak paddr_vmcoreinfo_note(void)
{
	return __pa((unsigned long)(char *)&vmcoreinfo_note);
}

static int __init crash_save_vmcoreinfo_init(void)
{
	VMCOREINFO_OSRELEASE(init_uts_ns.name.release);
	VMCOREINFO_PAGESIZE(PAGE_SIZE);

	VMCOREINFO_SYMBOL(init_uts_ns);
	VMCOREINFO_SYMBOL(node_online_map);
#ifdef CONFIG_MMU
	VMCOREINFO_SYMBOL(swapper_pg_dir);
#endif
	VMCOREINFO_SYMBOL(_stext);
	VMCOREINFO_SYMBOL(vmap_area_list);

#ifndef CONFIG_NEED_MULTIPLE_NODES
	VMCOREINFO_SYMBOL(mem_map);
	VMCOREINFO_SYMBOL(contig_page_data);
#endif
#ifdef CONFIG_SPARSEMEM
	VMCOREINFO_SYMBOL(mem_section);
	VMCOREINFO_LENGTH(mem_section, NR_SECTION_ROOTS);
	VMCOREINFO_STRUCT_SIZE(mem_section);
	VMCOREINFO_OFFSET(mem_section, section_mem_map);
#endif
	VMCOREINFO_STRUCT_SIZE(page);
	VMCOREINFO_STRUCT_SIZE(pglist_data);
	VMCOREINFO_STRUCT_SIZE(zone);
	VMCOREINFO_STRUCT_SIZE(free_area);
	VMCOREINFO_STRUCT_SIZE(list_head);
	VMCOREINFO_SIZE(nodemask_t);
	VMCOREINFO_OFFSET(page, flags);
	VMCOREINFO_OFFSET(page, _count);
	VMCOREINFO_OFFSET(page, mapping);
	VMCOREINFO_OFFSET(page, lru);
	VMCOREINFO_OFFSET(page, _mapcount);
	VMCOREINFO_OFFSET(page, private);
	VMCOREINFO_OFFSET(pglist_data, node_zones);
	VMCOREINFO_OFFSET(pglist_data, nr_zones);
#ifdef CONFIG_FLAT_NODE_MEM_MAP
	VMCOREINFO_OFFSET(pglist_data, node_mem_map);
#endif
	VMCOREINFO_OFFSET(pglist_data, node_start_pfn);
	VMCOREINFO_OFFSET(pglist_data, node_spanned_pages);
	VMCOREINFO_OFFSET(pglist_data, node_id);
	VMCOREINFO_OFFSET(zone, free_area);
	VMCOREINFO_OFFSET(zone, vm_stat);
	VMCOREINFO_OFFSET(zone, spanned_pages);
	VMCOREINFO_OFFSET(free_area, free_list);
	VMCOREINFO_OFFSET(list_head, next);
	VMCOREINFO_OFFSET(list_head, prev);
	VMCOREINFO_OFFSET(vmap_area, va_start);
	VMCOREINFO_OFFSET(vmap_area, list);
	VMCOREINFO_LENGTH(zone.free_area, MAX_ORDER);
	log_buf_kexec_setup();
	VMCOREINFO_LENGTH(free_area.free_list, MIGRATE_TYPES);
	VMCOREINFO_NUMBER(NR_FREE_PAGES);
	VMCOREINFO_NUMBER(PG_lru);
	VMCOREINFO_NUMBER(PG_private);
	VMCOREINFO_NUMBER(PG_swapcache);
	VMCOREINFO_NUMBER(PG_slab);
#ifdef CONFIG_MEMORY_FAILURE
	VMCOREINFO_NUMBER(PG_hwpoison);
#endif
	VMCOREINFO_NUMBER(PG_head_mask);
	VMCOREINFO_NUMBER(PAGE_BUDDY_MAPCOUNT_VALUE);
#ifdef CONFIG_HUGETLBFS
	VMCOREINFO_SYMBOL(free_huge_page);
#endif

	arch_crash_save_vmcoreinfo();
	update_vmcoreinfo_note();

	return 0;
}

subsys_initcall(crash_save_vmcoreinfo_init);

/*
 * Move into place and start executing a preloaded standalone
 * executable. If nothing was preloaded return an error.
 */
int kernel_kexec(void)
{
	int error = 0;

	if (!mutex_trylock(&kexec_mutex))
		return -EBUSY;
	if (!kexec_image) {
		error = -EINVAL;
		goto Unlock;
	}

#ifdef CONFIG_KEXEC_JUMP
	if (kexec_image->preserve_context) {
		lock_system_sleep();
		pm_prepare_console();
		error = freeze_processes();
		if (error) {
			error = -EBUSY;
			goto Restore_console;
		}
		suspend_console();
		error = dpm_suspend_start(PMSG_FREEZE);
		if (error)
			goto Resume_console;
		/* At this point, dpm_suspend_start() has been called,
		 * but *not* dpm_suspend_end(). We *must* call
		 * dpm_suspend_end() now. Otherwise, drivers for
		 * some devices (e.g. interrupt controllers) become
		 * desynchronized with the actual state of the
		 * hardware at resume time, and evil weirdness ensues.
		 */
		error = dpm_suspend_end(PMSG_FREEZE);
		if (error)
			goto Resume_devices;
		error = disable_nonboot_cpus();
		if (error)
			goto Enable_cpus;
		local_irq_disable();
		error = syscore_suspend();
		if (error)
			goto Enable_irqs;
	} else
#endif
	{
		kexec_in_progress = true;
		kernel_restart_prepare(NULL);
		migrate_to_reboot_cpu();

		/*
		 * migrate_to_reboot_cpu() disables CPU hotplug assuming that
		 * no further code needs to use CPU hotplug (which is true in
		 * the reboot case). However, the kexec path depends on using
		 * CPU hotplug again; so re-enable it here.
		 */
		cpu_hotplug_enable();
		pr_emerg("Starting new kernel\n");
		machine_shutdown();
	}

	machine_kexec(kexec_image);

#ifdef CONFIG_KEXEC_JUMP
	if (kexec_image->preserve_context) {
		syscore_resume();
 Enable_irqs:
		local_irq_enable();
 Enable_cpus:
		enable_nonboot_cpus();
		dpm_resume_start(PMSG_RESTORE);
 Resume_devices:
		dpm_resume_end(PMSG_RESTORE);
 Resume_console:
		resume_console();
		thaw_processes();
 Restore_console:
		pm_restore_console();
		unlock_system_sleep();
	}
#endif

 Unlock:
	mutex_unlock(&kexec_mutex);
	return error;
}

/*
 * Add and remove page tables for crashkernel memory
 *
 * Provide an empty default implementation here -- architecture
 * code may override this
 */
void __weak crash_map_reserved_pages(void)
{}

void __weak crash_unmap_reserved_pages(void)
{}