/*
 * handle transition of Linux booting another kernel
 * Copyright (C) 2002-2005 Eric Biederman <ebiederm@xmission.com>
 *
 * This source code is licensed under the GNU General Public License,
 * Version 2. See the file COPYING for more details.
 */

#define pr_fmt(fmt) "kexec: " fmt

#include <linux/mm.h>
#include <linux/kexec.h>
#include <linux/string.h>
#include <linux/gfp.h>
#include <linux/reboot.h>
#include <linux/numa.h>
#include <linux/ftrace.h>
#include <linux/io.h>
#include <linux/suspend.h>
#include <linux/vmalloc.h>

#include <asm/init.h>
#include <asm/pgtable.h>
#include <asm/tlbflush.h>
#include <asm/mmu_context.h>
#include <asm/io_apic.h>
#include <asm/debugreg.h>
#include <asm/kexec-bzimage64.h>
#include <asm/setup.h>
#include <asm/set_memory.h>

#ifdef CONFIG_KEXEC_FILE
static struct kexec_file_ops *kexec_file_loaders[] = {
        &kexec_bzImage64_ops,
};
#endif

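/*
 * Free the page-table pages that init_transition_pgtable() allocated for
 * mapping the control page at the virtual address of relocate_kernel.
 */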
static void free_transition_pgtable(struct kimage *image)
{
        free_page((unsigned long)image->arch.p4d);
        image->arch.p4d = NULL;
        free_page((unsigned long)image->arch.pud);
        image->arch.pud = NULL;
        free_page((unsigned long)image->arch.pmd);
        image->arch.pmd = NULL;
        free_page((unsigned long)image->arch.pte);
        image->arch.pte = NULL;
}

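/*
 * Map the control code page at the kernel virtual address of
 * relocate_kernel, allocating any missing intermediate page-table
 * levels, so execution can continue at the same virtual address
 * after the identity-mapped page tables are loaded.
 */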
static int init_transition_pgtable(struct kimage *image, pgd_t *pgd)
{
        p4d_t *p4d;
        pud_t *pud;
        pmd_t *pmd;
        pte_t *pte;
        unsigned long vaddr, paddr;
        int result = -ENOMEM;

        vaddr = (unsigned long)relocate_kernel;
        paddr = __pa(page_address(image->control_code_page)+PAGE_SIZE);
        pgd += pgd_index(vaddr);
        if (!pgd_present(*pgd)) {
                p4d = (p4d_t *)get_zeroed_page(GFP_KERNEL);
                if (!p4d)
                        goto err;
                image->arch.p4d = p4d;
                set_pgd(pgd, __pgd(__pa(p4d) | _KERNPG_TABLE));
        }
        p4d = p4d_offset(pgd, vaddr);
        if (!p4d_present(*p4d)) {
                pud = (pud_t *)get_zeroed_page(GFP_KERNEL);
                if (!pud)
                        goto err;
                image->arch.pud = pud;
                set_p4d(p4d, __p4d(__pa(pud) | _KERNPG_TABLE));
        }
        pud = pud_offset(p4d, vaddr);
        if (!pud_present(*pud)) {
                pmd = (pmd_t *)get_zeroed_page(GFP_KERNEL);
                if (!pmd)
                        goto err;
                image->arch.pmd = pmd;
                set_pud(pud, __pud(__pa(pmd) | _KERNPG_TABLE));
        }
        pmd = pmd_offset(pud, vaddr);
        if (!pmd_present(*pmd)) {
                pte = (pte_t *)get_zeroed_page(GFP_KERNEL);
                if (!pte)
                        goto err;
                image->arch.pte = pte;
                set_pmd(pmd, __pmd(__pa(pte) | _KERNPG_TABLE));
        }
        pte = pte_offset_kernel(pmd, vaddr);
        set_pte(pte, pfn_pte(paddr >> PAGE_SHIFT, PAGE_KERNEL_EXEC_NOENC));
        return 0;
err:
        return result;
}

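/*
 * Page-table allocation callback for kernel_ident_mapping_init(): hand
 * out zeroed control pages, which are tracked and freed with the image.
 */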
static void *alloc_pgt_page(void *data)
{
        struct kimage *image = (struct kimage *)data;
        struct page *page;
        void *p = NULL;

        page = kimage_alloc_control_pages(image, 0);
        if (page) {
                p = page_address(page);
                clear_page(p);
        }

        return p;
}

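/*
 * Build identity-mapped page tables covering all directly mapped RAM
 * and every segment of the image, then add the transition mapping.
 */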
static int init_pgtable(struct kimage *image, unsigned long start_pgtable)
{
        struct x86_mapping_info info = {
                .alloc_pgt_page = alloc_pgt_page,
                .context        = image,
                .page_flag      = __PAGE_KERNEL_LARGE_EXEC,
                .kernpg_flag    = _KERNPG_TABLE_NOENC,
        };
        unsigned long mstart, mend;
        pgd_t *level4p;
        int result;
        int i;

        level4p = (pgd_t *)__va(start_pgtable);
        clear_page(level4p);

        if (direct_gbpages)
                info.direct_gbpages = true;

        for (i = 0; i < nr_pfn_mapped; i++) {
                mstart = pfn_mapped[i].start << PAGE_SHIFT;
                mend   = pfn_mapped[i].end << PAGE_SHIFT;

                result = kernel_ident_mapping_init(&info,
                                                   level4p, mstart, mend);
                if (result)
                        return result;
        }

        /*
         * The segments' memory ranges could lie outside 0 ~ max_pfn,
         * for example when jumping back to the original kernel from a
         * kexeced kernel, or when the first kernel was booted with a
         * user-supplied mem map and the second kernel is loaded outside
         * that range.
         */
        for (i = 0; i < image->nr_segments; i++) {
                mstart = image->segment[i].mem;
                mend   = mstart + image->segment[i].memsz;

                result = kernel_ident_mapping_init(&info,
                                                   level4p, mstart, mend);

                if (result)
                        return result;
        }

        return init_transition_pgtable(image, level4p);
}

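/* Point IDTR at the given descriptor table. */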
static void set_idt(void *newidt, u16 limit)
{
        struct desc_ptr curidt;

        /* x86-64 supports unaligned loads & stores */
        curidt.size    = limit;
        curidt.address = (unsigned long)newidt;

        __asm__ __volatile__ (
                "lidtq %0\n"
                : : "m" (curidt)
                );
}

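/* Point GDTR at the given descriptor table. */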
static void set_gdt(void *newgdt, u16 limit)
{
        struct desc_ptr curgdt;

        /* x86-64 supports unaligned loads & stores */
        curgdt.size    = limit;
        curgdt.address = (unsigned long)newgdt;

        __asm__ __volatile__ (
                "lgdtq %0\n"
                : : "m" (curgdt)
                );
}

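/*
 * Reload the data segment registers so that their hidden descriptor
 * caches no longer refer to the old GDT.
 */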
static void load_segments(void)
{
        __asm__ __volatile__ (
                "\tmovl %0,%%ds\n"
                "\tmovl %0,%%es\n"
                "\tmovl %0,%%ss\n"
                "\tmovl %0,%%fs\n"
                "\tmovl %0,%%gs\n"
                : : "a" (__KERNEL_DS) : "memory"
                );
}

#ifdef CONFIG_KEXEC_FILE
/* Update purgatory as needed after various image segments have been prepared */
static int arch_update_purgatory(struct kimage *image)
{
        int ret = 0;

        if (!image->file_mode)
                return 0;

        /* Setup copying of backup region */
        if (image->type == KEXEC_TYPE_CRASH) {
                ret = kexec_purgatory_get_set_symbol(image,
                                "purgatory_backup_dest",
                                &image->arch.backup_load_addr,
                                sizeof(image->arch.backup_load_addr), 0);
                if (ret)
                        return ret;

                ret = kexec_purgatory_get_set_symbol(image,
                                "purgatory_backup_src",
                                &image->arch.backup_src_start,
                                sizeof(image->arch.backup_src_start), 0);
                if (ret)
                        return ret;

                ret = kexec_purgatory_get_set_symbol(image,
                                "purgatory_backup_sz",
                                &image->arch.backup_src_sz,
                                sizeof(image->arch.backup_src_sz), 0);
                if (ret)
                        return ret;
        }

        return ret;
}
#else /* !CONFIG_KEXEC_FILE */
static inline int arch_update_purgatory(struct kimage *image)
{
        return 0;
}
#endif /* CONFIG_KEXEC_FILE */

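/*
 * Build the identity-mapped page tables in the control pages and update
 * purgatory as needed, before the image is allowed to be executed.
 */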
int machine_kexec_prepare(struct kimage *image)
{
        unsigned long start_pgtable;
        int result;

        /* Calculate the offsets */
        start_pgtable = page_to_pfn(image->control_code_page) << PAGE_SHIFT;

        /* Setup the identity mapped 64bit page table */
        result = init_pgtable(image, start_pgtable);
        if (result)
                return result;

        /* update purgatory as needed */
        result = arch_update_purgatory(image);
        if (result)
                return result;

        return 0;
}

void machine_kexec_cleanup(struct kimage *image)
{
        free_transition_pgtable(image);
}

/*
 * Do not allocate memory (or fail in any way) in machine_kexec().
 * We are past the point of no return, committed to rebooting now.
 */
void machine_kexec(struct kimage *image)
{
        unsigned long page_list[PAGES_NR];
        void *control_page;
        int save_ftrace_enabled;

#ifdef CONFIG_KEXEC_JUMP
        if (image->preserve_context)
                save_processor_state();
#endif

        save_ftrace_enabled = __ftrace_enabled_save();

        /* Interrupts aren't acceptable while we reboot */
        local_irq_disable();
        hw_breakpoint_disable();

        if (image->preserve_context) {
#ifdef CONFIG_X86_IO_APIC
                /*
                 * We need to put the APICs in legacy mode so that we can
                 * get timer interrupts in the second kernel. The
                 * kexec/kdump paths already call disable_IO_APIC() in one
                 * form or another; the kexec jump path needs one as well.
                 */
                disable_IO_APIC();
#endif
        }

        control_page = page_address(image->control_code_page) + PAGE_SIZE;
        memcpy(control_page, relocate_kernel, KEXEC_CONTROL_CODE_MAX_SIZE);

        page_list[PA_CONTROL_PAGE] = virt_to_phys(control_page);
        page_list[VA_CONTROL_PAGE] = (unsigned long)control_page;
        page_list[PA_TABLE_PAGE] =
          (unsigned long)__pa(page_address(image->control_code_page));

        if (image->type == KEXEC_TYPE_DEFAULT)
                page_list[PA_SWAP_PAGE] = (page_to_pfn(image->swap_page)
                                                << PAGE_SHIFT);

        /*
         * The segment registers are funny things: they have both a
         * visible and an invisible part. Whenever the visible part is
         * set to a specific selector, the invisible part is loaded
         * from a table in memory. At no other time is the descriptor
         * table in memory accessed.
         *
         * I take advantage of this here by force loading the
         * segments, before I zap the gdt with an invalid value.
         */
        load_segments();
        /*
         * The gdt & idt are now invalid.
         * If you want to load them you must set up your own idt & gdt.
         */
        set_gdt(phys_to_virt(0), 0);
        set_idt(phys_to_virt(0), 0);

        /* Now jump to the copied relocation code. */
        image->start = relocate_kernel((unsigned long)image->head,
                                       (unsigned long)page_list,
                                       image->start,
                                       image->preserve_context,
                                       sme_active());

#ifdef CONFIG_KEXEC_JUMP
        if (image->preserve_context)
                restore_processor_state();
#endif

        __ftrace_enabled_restore(save_ftrace_enabled);
}

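/* Export arch-specific symbols and values for vmcore analysis tools. */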
void arch_crash_save_vmcoreinfo(void)
{
        VMCOREINFO_NUMBER(phys_base);
        VMCOREINFO_SYMBOL(init_top_pgt);

#ifdef CONFIG_NUMA
        VMCOREINFO_SYMBOL(node_data);
        VMCOREINFO_LENGTH(node_data, MAX_NUMNODES);
#endif
        vmcoreinfo_append_str("KERNELOFFSET=%lx\n",
                              kaslr_offset());
        VMCOREINFO_NUMBER(KERNEL_IMAGE_SIZE);
}

/* arch-dependent functionality related to kexec file-based syscall */

#ifdef CONFIG_KEXEC_FILE
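/*
 * Walk the registered loaders and pick the first one whose probe
 * recognizes the supplied kernel buffer.
 */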
int arch_kexec_kernel_image_probe(struct kimage *image, void *buf,
                                  unsigned long buf_len)
{
        int i, ret = -ENOEXEC;
        struct kexec_file_ops *fops;

        for (i = 0; i < ARRAY_SIZE(kexec_file_loaders); i++) {
                fops = kexec_file_loaders[i];
                if (!fops || !fops->probe)
                        continue;

                ret = fops->probe(buf, buf_len);
                if (!ret) {
                        image->fops = fops;
                        return ret;
                }
        }

        return ret;
}

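/*
 * Hand the kernel, initrd and command-line buffers to the loader that
 * was selected at probe time.
 */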
void *arch_kexec_kernel_image_load(struct kimage *image)
{
        vfree(image->arch.elf_headers);
        image->arch.elf_headers = NULL;

        if (!image->fops || !image->fops->load)
                return ERR_PTR(-ENOEXEC);

        return image->fops->load(image, image->kernel_buf,
                                 image->kernel_buf_len, image->initrd_buf,
                                 image->initrd_buf_len, image->cmdline_buf,
                                 image->cmdline_buf_len);
}

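/* Give the image loader a chance to free its private data. */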
int arch_kimage_file_post_load_cleanup(struct kimage *image)
{
        if (!image->fops || !image->fops->cleanup)
                return 0;

        return image->fops->cleanup(image->image_loader_data);
}

#ifdef CONFIG_KEXEC_VERIFY_SIG
int arch_kexec_kernel_verify_sig(struct kimage *image, void *kernel,
                                 unsigned long kernel_len)
{
        if (!image->fops || !image->fops->verify_sig) {
                pr_debug("kernel loader does not support signature verification\n");
                return -EKEYREJECTED;
        }

        return image->fops->verify_sig(kernel, kernel_len);
}
#endif

/*
 * Apply purgatory relocations.
 *
 * ehdr: Pointer to ELF headers.
 * sechdrs: Pointer to section headers.
 * relsec: Section index of a SHT_RELA section.
 *
 * TODO: Some of this code belongs in generic code. Move that to kexec.c.
 */
int arch_kexec_apply_relocations_add(const Elf64_Ehdr *ehdr,
                                     Elf64_Shdr *sechdrs, unsigned int relsec)
{
        unsigned int i;
        Elf64_Rela *rel;
        Elf64_Sym *sym;
        void *location;
        Elf64_Shdr *section, *symtabsec;
        unsigned long address, sec_base, value;
        const char *strtab, *name, *shstrtab;

        /*
         * ->sh_offset has been modified to keep the pointer to section
         * contents in memory
         */
        rel = (void *)sechdrs[relsec].sh_offset;

        /* Section to which relocations apply */
        section = &sechdrs[sechdrs[relsec].sh_info];

        pr_debug("Applying relocate section %u to %u\n", relsec,
                 sechdrs[relsec].sh_info);

        /* Associated symbol table */
        symtabsec = &sechdrs[sechdrs[relsec].sh_link];

        /* String table */
        if (symtabsec->sh_link >= ehdr->e_shnum) {
                /* Invalid strtab section number */
                pr_err("Invalid string table section index %d\n",
                       symtabsec->sh_link);
                return -ENOEXEC;
        }

        strtab = (char *)sechdrs[symtabsec->sh_link].sh_offset;

        /* section header string table */
        shstrtab = (char *)sechdrs[ehdr->e_shstrndx].sh_offset;

        for (i = 0; i < sechdrs[relsec].sh_size / sizeof(*rel); i++) {

                /*
                 * rel[i].r_offset contains byte offset from beginning
                 * of section to the storage unit affected.
                 *
                 * This is location to update (->sh_offset). This is temporary
                 * buffer where section is currently loaded. This will finally
                 * be loaded to a different address later, pointed to by
                 * ->sh_addr. kexec takes care of moving it
                 * (kexec_load_segment()).
                 */
                location = (void *)(section->sh_offset + rel[i].r_offset);

                /* Final address of the location */
                address = section->sh_addr + rel[i].r_offset;

                /*
                 * rel[i].r_info contains information about symbol table index
                 * w.r.t which relocation must be made and type of relocation
                 * to apply. ELF64_R_SYM() and ELF64_R_TYPE() macros get
                 * these respectively.
                 */
                sym = (Elf64_Sym *)symtabsec->sh_offset +
                      ELF64_R_SYM(rel[i].r_info);

                if (sym->st_name)
                        name = strtab + sym->st_name;
                else
                        name = shstrtab + sechdrs[sym->st_shndx].sh_name;

                pr_debug("Symbol: %s info: %02x shndx: %02x value=%llx size: %llx\n",
                         name, sym->st_info, sym->st_shndx, sym->st_value,
                         sym->st_size);

                if (sym->st_shndx == SHN_UNDEF) {
                        pr_err("Undefined symbol: %s\n", name);
                        return -ENOEXEC;
                }

                if (sym->st_shndx == SHN_COMMON) {
                        pr_err("symbol '%s' in common section\n", name);
                        return -ENOEXEC;
                }

                if (sym->st_shndx == SHN_ABS)
                        sec_base = 0;
                else if (sym->st_shndx >= ehdr->e_shnum) {
                        pr_err("Invalid section %d for symbol %s\n",
                               sym->st_shndx, name);
                        return -ENOEXEC;
                } else
                        sec_base = sechdrs[sym->st_shndx].sh_addr;

                value = sym->st_value;
                value += sec_base;
                value += rel[i].r_addend;

                switch (ELF64_R_TYPE(rel[i].r_info)) {
                case R_X86_64_NONE:
                        break;
                case R_X86_64_64:
                        *(u64 *)location = value;
                        break;
                case R_X86_64_32:
                        *(u32 *)location = value;
                        if (value != *(u32 *)location)
                                goto overflow;
                        break;
                case R_X86_64_32S:
                        *(s32 *)location = value;
                        if ((s64)value != *(s32 *)location)
                                goto overflow;
                        break;
                case R_X86_64_PC32:
                case R_X86_64_PLT32:
                        value -= (u64)address;
                        *(u32 *)location = value;
                        break;
                default:
                        pr_err("Unknown rela relocation: %llu\n",
                               ELF64_R_TYPE(rel[i].r_info));
                        return -ENOEXEC;
                }
        }
        return 0;

overflow:
        pr_err("Overflow in relocation type %d value 0x%lx\n",
               (int)ELF64_R_TYPE(rel[i].r_info), value);
        return -ENOEXEC;
}
#endif /* CONFIG_KEXEC_FILE */

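/*
 * Set an inclusive physical range read-only or read-write. Used below to
 * protect the crash kernel region from stray writes.
 */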
static int
kexec_mark_range(unsigned long start, unsigned long end, bool protect)
{
        struct page *page;
        unsigned int nr_pages;

        /*
         * The physical range [start, end] is inclusive. We must skip an
         * unassigned crashk resource, whose "end" member is zero-valued.
         */
        if (!end || start > end)
                return 0;

        page = pfn_to_page(start >> PAGE_SHIFT);
        nr_pages = (end >> PAGE_SHIFT) - (start >> PAGE_SHIFT) + 1;
        if (protect)
                return set_pages_ro(page, nr_pages);
        else
                return set_pages_rw(page, nr_pages);
}

static void kexec_mark_crashkres(bool protect)
{
        unsigned long control;

        kexec_mark_range(crashk_low_res.start, crashk_low_res.end, protect);

        /* Don't touch the control code page used in crash_kexec(). */
        control = PFN_PHYS(page_to_pfn(kexec_crash_image->control_code_page));
        /* Control code page is located in the 2nd page. */
        kexec_mark_range(crashk_res.start, control + PAGE_SIZE - 1, protect);
        control += KEXEC_CONTROL_PAGE_SIZE;
        kexec_mark_range(control, crashk_res.end, protect);
}

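/* Make the crash kernel region read-only once a crash image is loaded. */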
void arch_kexec_protect_crashkres(void)
{
        kexec_mark_crashkres(true);
}

void arch_kexec_unprotect_crashkres(void)
{
        kexec_mark_crashkres(false);
}

int arch_kexec_post_alloc_pages(void *vaddr, unsigned int pages, gfp_t gfp)
{
        /*
         * If SME is active we need to be sure that kexec pages are
         * not encrypted because when we boot to the new kernel the
         * pages won't be accessed encrypted (initially).
         */
        return set_memory_decrypted((unsigned long)vaddr, pages);
}

void arch_kexec_pre_free_pages(void *vaddr, unsigned int pages)
{
        /*
         * If SME is active we need to reset the pages back to being
         * an encrypted mapping before freeing them.
         */
        set_memory_encrypted((unsigned long)vaddr, pages);
}