// SPDX-License-Identifier: GPL-2.0-only
/*
 * AMD Memory Encryption Support
 *
 * Copyright (C) 2019 SUSE
 *
 * Author: Joerg Roedel <jroedel@suse.de>
 */

#define pr_fmt(fmt)	"SEV-ES: " fmt

#include <linux/sched/debug.h>	/* For show_regs() */
#include <linux/percpu-defs.h>
#include <linux/mem_encrypt.h>
#include <linux/lockdep.h>
#include <linux/printk.h>
#include <linux/mm_types.h>
#include <linux/set_memory.h>
#include <linux/memblock.h>
#include <linux/kernel.h>
#include <linux/mm.h>

#include <asm/cpu_entry_area.h>
#include <asm/sev-es.h>
#include <asm/insn-eval.h>
#include <asm/fpu/internal.h>
#include <asm/processor.h>
#include <asm/realmode.h>
#include <asm/traps.h>
#include <asm/svm.h>

/* For early boot hypervisor communication in SEV-ES enabled guests */
static struct ghcb boot_ghcb_page __bss_decrypted __aligned(PAGE_SIZE);

/*
 * Needs to be in the .data section because we need it NULL before bss is
 * cleared
 */
static struct ghcb __initdata *boot_ghcb;

/* #VC handler runtime per-CPU data */
struct sev_es_runtime_data {
	struct ghcb ghcb_page;

	/* Physical storage for the per-CPU IST stack of the #VC handler */
	char ist_stack[EXCEPTION_STKSZ] __aligned(PAGE_SIZE);

	/*
	 * Physical storage for the per-CPU fall-back stack of the #VC handler.
	 * The fall-back stack is used when it is not safe to switch back to the
	 * interrupted stack in the #VC entry code.
	 */
	char fallback_stack[EXCEPTION_STKSZ] __aligned(PAGE_SIZE);

	/*
	 * Reserve one page per CPU as backup storage for the unencrypted GHCB.
	 * It is needed when an NMI happens while the #VC handler uses the real
	 * GHCB, and the NMI handler itself causes another #VC exception. In
	 * that case the GHCB content of the first handler needs to be backed up
	 * and restored.
	 */
	struct ghcb backup_ghcb;

	/*
	 * Mark the per-CPU GHCB as in-use to detect nested #VC exceptions.
	 * There is no need for it to be atomic, because nothing is written to
	 * the GHCB between the read and the write of ghcb_active. So it is safe
	 * to use it when a nested #VC exception happens before the write.
	 *
	 * This is necessary for example in the #VC->NMI->#VC case when the NMI
	 * happens while the first #VC handler uses the GHCB. When the NMI code
	 * raises a second #VC exception it might overwrite the contents of the
	 * GHCB written by the first handler. To avoid this the content of the
	 * GHCB is saved and restored when the GHCB is detected to be in use
	 * already.
	 */
	bool ghcb_active;
	bool backup_ghcb_active;
};

struct ghcb_state {
	struct ghcb *ghcb;
};

static DEFINE_PER_CPU(struct sev_es_runtime_data*, runtime_data);
DEFINE_STATIC_KEY_FALSE(sev_es_enable_key);

/* Needed in vc_early_forward_exception() */
void do_early_exception(struct pt_regs *regs, int trapnr);

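/* Map the per-CPU #VC IST and fall-back stacks into this CPU's cpu_entry_area */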
static void __init setup_vc_stacks(int cpu)
{
	struct sev_es_runtime_data *data;
	struct cpu_entry_area *cea;
	unsigned long vaddr;
	phys_addr_t pa;

	data = per_cpu(runtime_data, cpu);
	cea  = get_cpu_entry_area(cpu);

	/* Map #VC IST stack */
	vaddr = CEA_ESTACK_BOT(&cea->estacks, VC);
	pa    = __pa(data->ist_stack);
	cea_set_pte((void *)vaddr, pa, PAGE_KERNEL);

	/* Map VC fall-back stack */
	vaddr = CEA_ESTACK_BOT(&cea->estacks, VC2);
	pa    = __pa(data->fallback_stack);
	cea_set_pte((void *)vaddr, pa, PAGE_KERNEL);
}

static __always_inline bool on_vc_stack(unsigned long sp)
{
	return ((sp >= __this_cpu_ist_bottom_va(VC)) && (sp < __this_cpu_ist_top_va(VC)));
}

/*
 * This function handles the case when an NMI is raised in the #VC exception
 * handler entry code. In this case, the IST entry for #VC must be adjusted, so
 * that any subsequent #VC exception will not overwrite the stack contents of
 * the interrupted #VC handler.
 *
 * The IST entry is adjusted unconditionally so that it can also be
 * unconditionally adjusted back in __sev_es_ist_exit(). Otherwise a nested
 * __sev_es_ist_exit() call may adjust back the IST entry too early.
 */
void noinstr __sev_es_ist_enter(struct pt_regs *regs)
{
	unsigned long old_ist, new_ist;

	/* Read old IST entry */
	old_ist = __this_cpu_read(cpu_tss_rw.x86_tss.ist[IST_INDEX_VC]);

	/* Make room on the IST stack */
	if (on_vc_stack(regs->sp))
		new_ist = ALIGN_DOWN(regs->sp, 8) - sizeof(old_ist);
	else
		new_ist = old_ist - sizeof(old_ist);

	/* Store old IST entry */
	*(unsigned long *)new_ist = old_ist;

	/* Set new IST entry */
	this_cpu_write(cpu_tss_rw.x86_tss.ist[IST_INDEX_VC], new_ist);
}

void noinstr __sev_es_ist_exit(void)
{
	unsigned long ist;

	/* Read IST entry */
	ist = __this_cpu_read(cpu_tss_rw.x86_tss.ist[IST_INDEX_VC]);

	if (WARN_ON(ist == __this_cpu_ist_top_va(VC)))
		return;

	/* Read back old IST entry and write it to the TSS */
	this_cpu_write(cpu_tss_rw.x86_tss.ist[IST_INDEX_VC], *(unsigned long *)ist);
}

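/*
 * Return the per-CPU GHCB to use in the #VC handler. If the primary GHCB is
 * already active, its contents are saved to the backup GHCB so a nested
 * handler can reuse the page. Must be paired with sev_es_put_ghcb(); returns
 * NULL when both the GHCB and the backup GHCB are already in use.
 */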
static __always_inline struct ghcb *sev_es_get_ghcb(struct ghcb_state *state)
{
	struct sev_es_runtime_data *data;
	struct ghcb *ghcb;

	data = this_cpu_read(runtime_data);
	ghcb = &data->ghcb_page;

	if (unlikely(data->ghcb_active)) {
		/* GHCB is already in use - save its contents */

		if (unlikely(data->backup_ghcb_active))
			return NULL;

		/* Mark backup_ghcb active before writing to it */
		data->backup_ghcb_active = true;

		state->ghcb = &data->backup_ghcb;

		/* Backup GHCB content */
		*state->ghcb = *ghcb;
	} else {
		state->ghcb = NULL;
		data->ghcb_active = true;
	}

	return ghcb;
}

static __always_inline void sev_es_put_ghcb(struct ghcb_state *state)
{
	struct sev_es_runtime_data *data;
	struct ghcb *ghcb;

	data = this_cpu_read(runtime_data);
	ghcb = &data->ghcb_page;

	if (state->ghcb) {
		/* Restore GHCB from Backup */
		*ghcb = *state->ghcb;
		data->backup_ghcb_active = false;
		state->ghcb = NULL;
	} else {
		data->ghcb_active = false;
	}
}

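/* Raw accessors for the GHCB MSR (MSR_AMD64_SEV_ES_GHCB) */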
static inline u64 sev_es_rd_ghcb_msr(void)
{
	return __rdmsr(MSR_AMD64_SEV_ES_GHCB);
}

static inline void sev_es_wr_ghcb_msr(u64 val)
{
	u32 low, high;

	low  = (u32)(val);
	high = (u32)(val >> 32);

	native_wrmsr(MSR_AMD64_SEV_ES_GHCB, low, high);
}

static int vc_fetch_insn_kernel(struct es_em_ctxt *ctxt,
				unsigned char *buffer)
{
	return copy_from_kernel_nofault(buffer, (unsigned char *)ctxt->regs->ip, MAX_INSN_SIZE);
}

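/*
 * Fetch and decode the instruction at ctxt->regs->ip so it can be emulated.
 * If the instruction bytes cannot be read, a #PF is recorded in ctxt->fi for
 * the caller to forward.
 */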
static enum es_result vc_decode_insn(struct es_em_ctxt *ctxt)
{
	char buffer[MAX_INSN_SIZE];
	enum es_result ret;
	int res;

	res = vc_fetch_insn_kernel(ctxt, buffer);
	if (unlikely(res == -EFAULT)) {
		ctxt->fi.vector = X86_TRAP_PF;
		ctxt->fi.error_code = 0;
		ctxt->fi.cr2 = ctxt->regs->ip;
		return ES_EXCEPTION;
	}

	insn_init(&ctxt->insn, buffer, MAX_INSN_SIZE - res, 1);
	insn_get_length(&ctxt->insn);

	ret = ctxt->insn.immediate.got ? ES_OK : ES_DECODE_FAILED;

	return ret;
}

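/*
 * Write an emulated memory operand of 1, 2, 4 or 8 bytes to the target
 * address. On a faulting access the #PF information is recorded in ctxt->fi
 * so the caller can forward the exception.
 */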
static enum es_result vc_write_mem(struct es_em_ctxt *ctxt,
				   char *dst, char *buf, size_t size)
{
	unsigned long error_code = X86_PF_PROT | X86_PF_WRITE;
	u64 d8;
	u32 d4;
	u16 d2;
	u8  d1;

	/* Size-typed user pointers so put_user() accesses the full operand */
	switch (size) {
	case 1:
		memcpy(&d1, buf, 1);
		if (put_user(d1, (u8 __user *)dst))
			goto fault;
		break;
	case 2:
		memcpy(&d2, buf, 2);
		if (put_user(d2, (u16 __user *)dst))
			goto fault;
		break;
	case 4:
		memcpy(&d4, buf, 4);
		if (put_user(d4, (u32 __user *)dst))
			goto fault;
		break;
	case 8:
		memcpy(&d8, buf, 8);
		if (put_user(d8, (u64 __user *)dst))
			goto fault;
		break;
	default:
		WARN_ONCE(1, "%s: Invalid size: %zu\n", __func__, size);
		return ES_UNSUPPORTED;
	}

	return ES_OK;

fault:
	if (user_mode(ctxt->regs))
		error_code |= X86_PF_USER;

	ctxt->fi.vector = X86_TRAP_PF;
	ctxt->fi.error_code = error_code;
	ctxt->fi.cr2 = (unsigned long)dst;

	return ES_EXCEPTION;
}

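/*
 * Read an emulated memory operand of 1, 2, 4 or 8 bytes from the source
 * address into buf, recording #PF information in ctxt->fi on failure.
 */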
static enum es_result vc_read_mem(struct es_em_ctxt *ctxt,
				  char *src, char *buf, size_t size)
{
	unsigned long error_code = X86_PF_PROT;
	u64 d8;
	u32 d4;
	u16 d2;
	u8  d1;

	/* Size-typed user pointers so get_user() accesses the full operand */
	switch (size) {
	case 1:
		if (get_user(d1, (u8 __user *)src))
			goto fault;
		memcpy(buf, &d1, 1);
		break;
	case 2:
		if (get_user(d2, (u16 __user *)src))
			goto fault;
		memcpy(buf, &d2, 2);
		break;
	case 4:
		if (get_user(d4, (u32 __user *)src))
			goto fault;
		memcpy(buf, &d4, 4);
		break;
	case 8:
		if (get_user(d8, (u64 __user *)src))
			goto fault;
		memcpy(buf, &d8, 8);
		break;
	default:
		WARN_ONCE(1, "%s: Invalid size: %zu\n", __func__, size);
		return ES_UNSUPPORTED;
	}

	return ES_OK;

fault:
	if (user_mode(ctxt->regs))
		error_code |= X86_PF_USER;

	ctxt->fi.vector = X86_TRAP_PF;
	ctxt->fi.error_code = error_code;
	ctxt->fi.cr2 = (unsigned long)src;

	return ES_EXCEPTION;
}

/* Include code shared with pre-decompression boot stage */
#include "sev-es-shared.c"

/*
 * This function runs on the first #VC exception after the kernel
 * switched to virtual addresses.
 */
static bool __init sev_es_setup_ghcb(void)
{
	/* First make sure the hypervisor talks a supported protocol. */
	if (!sev_es_negotiate_protocol())
		return false;

	/*
	 * Clear the boot_ghcb. The first exception comes in before the bss
	 * section is cleared.
	 */
	memset(&boot_ghcb_page, 0, PAGE_SIZE);

	/* Alright - Make the boot-ghcb public */
	boot_ghcb = &boot_ghcb_page;

	return true;
}

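/* Allocate the page-aligned per-CPU runtime data from memblock during early boot */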
static void __init alloc_runtime_data(int cpu)
{
	struct sev_es_runtime_data *data;

	data = memblock_alloc(sizeof(*data), PAGE_SIZE);
	if (!data)
		panic("Can't allocate SEV-ES runtime data");

	per_cpu(runtime_data, cpu) = data;
}

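/* Map the per-CPU GHCB page unencrypted and clear it so the hypervisor can access it */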
static void __init init_ghcb(int cpu)
{
	struct sev_es_runtime_data *data;
	int err;

	data = per_cpu(runtime_data, cpu);

	err = early_set_memory_decrypted((unsigned long)&data->ghcb_page,
					 sizeof(data->ghcb_page));
	if (err)
		panic("Can't map GHCBs unencrypted");

	memset(&data->ghcb_page, 0, sizeof(data->ghcb_page));

	data->ghcb_active = false;
	data->backup_ghcb_active = false;
}

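/*
 * Set up runtime #VC handling: enable the SEV-ES static key, allocate and
 * initialize the per-CPU GHCBs and #VC stacks, and switch secondary CPUs to
 * the runtime #VC handler.
 */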
void __init sev_es_init_vc_handling(void)
{
	int cpu;

	BUILD_BUG_ON(offsetof(struct sev_es_runtime_data, ghcb_page) % PAGE_SIZE);

	if (!sev_es_active())
		return;

	/* Enable SEV-ES special handling */
	static_branch_enable(&sev_es_enable_key);

	/* Initialize per-cpu GHCB pages */
	for_each_possible_cpu(cpu) {
		alloc_runtime_data(cpu);
		init_ghcb(cpu);
		setup_vc_stacks(cpu);
	}

	/* Secondary CPUs use the runtime #VC handler */
	initial_vc_handler = (unsigned long)safe_stack_exc_vmm_communication;
}

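/* Forward an exception hit during early #VC emulation to the early exception handler */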
static void __init vc_early_forward_exception(struct es_em_ctxt *ctxt)
{
	int trapnr = ctxt->fi.vector;

	if (trapnr == X86_TRAP_PF)
		native_write_cr2(ctxt->fi.cr2);

	ctxt->regs->orig_ax = ctxt->fi.error_code;
	do_early_exception(ctxt->regs, trapnr);
}

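/* Dispatch on the #VC exit-code; exit-codes without a handler are reported as ES_UNSUPPORTED */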
static enum es_result vc_handle_exitcode(struct es_em_ctxt *ctxt,
					 struct ghcb *ghcb,
					 unsigned long exit_code)
{
	enum es_result result;

	switch (exit_code) {
	default:
		/*
		 * Unexpected #VC exception
		 */
		result = ES_UNSUPPORTED;
	}

	return result;
}

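/* Re-inject an exception raised during instruction emulation into the regular handlers */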
static __always_inline void vc_forward_exception(struct es_em_ctxt *ctxt)
{
	long error_code = ctxt->fi.error_code;
	int trapnr = ctxt->fi.vector;

	ctxt->regs->orig_ax = ctxt->fi.error_code;

	switch (trapnr) {
	case X86_TRAP_GP:
		exc_general_protection(ctxt->regs, error_code);
		break;
	case X86_TRAP_UD:
		exc_invalid_op(ctxt->regs);
		break;
	default:
		pr_emerg("Unsupported exception in #VC instruction emulation - can't continue\n");
		BUG();
	}
}

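/* Check whether the exception frame sits on the #VC fall-back (VC2) stack */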
static __always_inline bool on_vc_fallback_stack(struct pt_regs *regs)
{
	unsigned long sp = (unsigned long)regs;

	return (sp >= __this_cpu_ist_bottom_va(VC2) && sp < __this_cpu_ist_top_va(VC2));
}

/*
 * Main #VC exception handler. It is called when the entry code was able to
 * switch off the IST to a safe kernel stack.
 *
 * With the current implementation it is always possible to switch to a safe
 * stack because #VC exceptions only happen at known places, like intercepted
 * instructions or accesses to MMIO areas/IO ports. They can also happen with
 * code instrumentation when the hypervisor intercepts #DB, but the critical
 * paths are forbidden to be instrumented, so #DB exceptions currently also
 * only happen in safe places.
 */
DEFINE_IDTENTRY_VC_SAFE_STACK(exc_vmm_communication)
{
	struct sev_es_runtime_data *data = this_cpu_read(runtime_data);
	struct ghcb_state state;
	struct es_em_ctxt ctxt;
	enum es_result result;
	struct ghcb *ghcb;

	lockdep_assert_irqs_disabled();
	instrumentation_begin();

	/*
	 * This is invoked through an interrupt gate, so IRQs are disabled. The
	 * code below might walk page-tables for user or kernel addresses, so
	 * keep the IRQs disabled to protect us against concurrent TLB flushes.
	 */

	ghcb = sev_es_get_ghcb(&state);
	if (!ghcb) {
		/*
		 * Mark GHCBs inactive so that panic() is able to print the
		 * message.
		 */
		data->ghcb_active = false;
		data->backup_ghcb_active = false;

		panic("Unable to handle #VC exception! GHCB and Backup GHCB are already in use");
	}

	vc_ghcb_invalidate(ghcb);
	result = vc_init_em_ctxt(&ctxt, regs, error_code);

	if (result == ES_OK)
		result = vc_handle_exitcode(&ctxt, ghcb, error_code);

	sev_es_put_ghcb(&state);

	/* Done - now check the result */
	switch (result) {
	case ES_OK:
		vc_finish_insn(&ctxt);
		break;
	case ES_UNSUPPORTED:
		pr_err_ratelimited("Unsupported exit-code 0x%02lx in #VC exception (IP: 0x%lx)\n",
				   error_code, regs->ip);
		goto fail;
	case ES_VMM_ERROR:
		pr_err_ratelimited("Failure in communication with VMM (exit-code 0x%02lx IP: 0x%lx)\n",
				   error_code, regs->ip);
		goto fail;
	case ES_DECODE_FAILED:
		pr_err_ratelimited("Failed to decode instruction (exit-code 0x%02lx IP: 0x%lx)\n",
				   error_code, regs->ip);
		goto fail;
	case ES_EXCEPTION:
		vc_forward_exception(&ctxt);
		break;
	case ES_RETRY:
		/* Nothing to do */
		break;
	default:
		pr_emerg("Unknown result in %s(): %d\n", __func__, result);
		/*
		 * Emulating the instruction which caused the #VC exception
		 * failed - can't continue so print debug information
		 */
		BUG();
	}

out:
	instrumentation_end();

	return;

fail:
	if (user_mode(regs)) {
		/*
		 * Do not kill the machine if user-space triggered the
		 * exception. Send SIGBUS instead and let user-space deal with
		 * it.
		 */
		force_sig_fault(SIGBUS, BUS_OBJERR, (void __user *)0);
	} else {
		pr_emerg("PANIC: Unhandled #VC exception in kernel space (result=%d)\n",
			 result);

		/* Show some debug info */
		show_regs(regs);

		/* Ask the hypervisor to terminate the guest */
		sev_es_terminate(GHCB_SEV_ES_REASON_GENERAL_REQUEST);

		/* If that fails and we get here - just panic */
		panic("Returned from Terminate-Request to Hypervisor\n");
	}

	goto out;
}

/* This handler runs on the #VC fall-back stack. It can cause further #VC exceptions */
DEFINE_IDTENTRY_VC_IST(exc_vmm_communication)
{
	instrumentation_begin();
	panic("Can't handle #VC exception from unsupported context\n");
	instrumentation_end();
}

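/*
 * #VC entry point: use the safe-stack handler unless the exception already
 * arrived on the fall-back stack, in which case it cannot be handled.
 */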
DEFINE_IDTENTRY_VC(exc_vmm_communication)
{
	if (likely(!on_vc_fallback_stack(regs)))
		safe_stack_exc_vmm_communication(regs, error_code);
	else
		ist_exc_vmm_communication(regs, error_code);
}

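/* Early boot #VC handler: uses the boot GHCB before the per-CPU runtime GHCBs exist */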
bool __init handle_vc_boot_ghcb(struct pt_regs *regs)
{
	unsigned long exit_code = regs->orig_ax;
	struct es_em_ctxt ctxt;
	enum es_result result;

	/* Do initial setup or terminate the guest */
	if (unlikely(boot_ghcb == NULL && !sev_es_setup_ghcb()))
		sev_es_terminate(GHCB_SEV_ES_REASON_GENERAL_REQUEST);

	vc_ghcb_invalidate(boot_ghcb);

	result = vc_init_em_ctxt(&ctxt, regs, exit_code);
	if (result == ES_OK)
		result = vc_handle_exitcode(&ctxt, boot_ghcb, exit_code);

	/* Done - now check the result */
	switch (result) {
	case ES_OK:
		vc_finish_insn(&ctxt);
		break;
	case ES_UNSUPPORTED:
		early_printk("PANIC: Unsupported exit-code 0x%02lx in early #VC exception (IP: 0x%lx)\n",
				exit_code, regs->ip);
		goto fail;
	case ES_VMM_ERROR:
		early_printk("PANIC: Failure in communication with VMM (exit-code 0x%02lx IP: 0x%lx)\n",
				exit_code, regs->ip);
		goto fail;
	case ES_DECODE_FAILED:
		early_printk("PANIC: Failed to decode instruction (exit-code 0x%02lx IP: 0x%lx)\n",
				exit_code, regs->ip);
		goto fail;
	case ES_EXCEPTION:
		vc_early_forward_exception(&ctxt);
		break;
	case ES_RETRY:
		/* Nothing to do */
		break;
	default:
		BUG();
	}

	return true;

fail:
	show_regs(regs);

	while (true)
		halt();
}