// SPDX-License-Identifier: GPL-2.0-only
/*
 * AMD Memory Encryption Support
 *
 * Copyright (C) 2019 SUSE
 *
 * Author: Joerg Roedel <jroedel@suse.de>
 */

#define pr_fmt(fmt)	"SEV-ES: " fmt

#include <linux/sched/debug.h>	/* For show_regs() */
#include <linux/percpu-defs.h>
#include <linux/mem_encrypt.h>
#include <linux/lockdep.h>
#include <linux/printk.h>
#include <linux/mm_types.h>
#include <linux/set_memory.h>
#include <linux/memblock.h>
#include <linux/kernel.h>
#include <linux/mm.h>

#include <asm/cpu_entry_area.h>
#include <asm/sev-es.h>
#include <asm/insn-eval.h>
#include <asm/fpu/internal.h>
#include <asm/processor.h>
#include <asm/realmode.h>
#include <asm/traps.h>
#include <asm/svm.h>

/* For early boot hypervisor communication in SEV-ES enabled guests */
static struct ghcb boot_ghcb_page __bss_decrypted __aligned(PAGE_SIZE);

/*
 * Needs to be in the .data section because we need it NULL before bss is
 * cleared
 */
static struct ghcb __initdata *boot_ghcb;

/* #VC handler runtime per-CPU data */
struct sev_es_runtime_data {
	struct ghcb ghcb_page;

	/* Physical storage for the per-CPU IST stack of the #VC handler */
	char ist_stack[EXCEPTION_STKSZ] __aligned(PAGE_SIZE);

	/*
	 * Physical storage for the per-CPU fall-back stack of the #VC handler.
	 * The fall-back stack is used when it is not safe to switch back to the
	 * interrupted stack in the #VC entry code.
	 */
	char fallback_stack[EXCEPTION_STKSZ] __aligned(PAGE_SIZE);

	/*
	 * Reserve one page per CPU as backup storage for the unencrypted GHCB.
	 * It is needed when an NMI happens while the #VC handler uses the real
	 * GHCB, and the NMI handler itself causes another #VC exception. In
	 * that case the GHCB content of the first handler needs to be backed up
	 * and restored.
	 */
	struct ghcb backup_ghcb;

	/*
	 * Mark the per-CPU GHCB as in-use to detect nested #VC exceptions.
	 * There is no need for it to be atomic, because nothing is written to
	 * the GHCB between the read and the write of ghcb_active. So it is safe
	 * to use it when a nested #VC exception happens before the write.
	 *
	 * This is necessary for example in the #VC->NMI->#VC case when the NMI
	 * happens while the first #VC handler uses the GHCB. When the NMI code
	 * raises a second #VC exception it might overwrite the contents of the
	 * GHCB written by the first handler. To avoid this the content of the
	 * GHCB is saved and restored when the GHCB is detected to be in use
	 * already.
	 */
	bool ghcb_active;
	bool backup_ghcb_active;
};

struct ghcb_state {
	struct ghcb *ghcb;
};

static DEFINE_PER_CPU(struct sev_es_runtime_data*, runtime_data);
DEFINE_STATIC_KEY_FALSE(sev_es_enable_key);

/* Needed in vc_early_forward_exception() */
void do_early_exception(struct pt_regs *regs, int trapnr);

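/* Map the per-CPU #VC IST and fall-back stacks into this CPU's cpu_entry_area */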
static void __init setup_vc_stacks(int cpu)
{
	struct sev_es_runtime_data *data;
	struct cpu_entry_area *cea;
	unsigned long vaddr;
	phys_addr_t pa;

	data = per_cpu(runtime_data, cpu);
	cea  = get_cpu_entry_area(cpu);

	/* Map #VC IST stack */
	vaddr = CEA_ESTACK_BOT(&cea->estacks, VC);
	pa    = __pa(data->ist_stack);
	cea_set_pte((void *)vaddr, pa, PAGE_KERNEL);

	/* Map VC fall-back stack */
	vaddr = CEA_ESTACK_BOT(&cea->estacks, VC2);
	pa    = __pa(data->fallback_stack);
	cea_set_pte((void *)vaddr, pa, PAGE_KERNEL);
}

static __always_inline bool on_vc_stack(unsigned long sp)
{
	return ((sp >= __this_cpu_ist_bottom_va(VC)) && (sp < __this_cpu_ist_top_va(VC)));
}

/*
 * This function handles the case when an NMI is raised in the #VC exception
 * handler entry code. In this case, the IST entry for #VC must be adjusted, so
 * that any subsequent #VC exception will not overwrite the stack contents of
 * the interrupted #VC handler.
 *
 * The IST entry is adjusted unconditionally so that it can also be
 * unconditionally adjusted back in __sev_es_ist_exit(). Otherwise a nested
 * __sev_es_ist_exit() call may adjust back the IST entry too early.
 */
void noinstr __sev_es_ist_enter(struct pt_regs *regs)
{
	unsigned long old_ist, new_ist;

	/* Read old IST entry */
	old_ist = __this_cpu_read(cpu_tss_rw.x86_tss.ist[IST_INDEX_VC]);

	/* Make room on the IST stack */
	if (on_vc_stack(regs->sp))
		new_ist = ALIGN_DOWN(regs->sp, 8) - sizeof(old_ist);
	else
		new_ist = old_ist - sizeof(old_ist);

	/* Store old IST entry */
	*(unsigned long *)new_ist = old_ist;

	/* Set new IST entry */
	this_cpu_write(cpu_tss_rw.x86_tss.ist[IST_INDEX_VC], new_ist);
}

void noinstr __sev_es_ist_exit(void)
{
	unsigned long ist;

	/* Read IST entry */
	ist = __this_cpu_read(cpu_tss_rw.x86_tss.ist[IST_INDEX_VC]);

	if (WARN_ON(ist == __this_cpu_ist_top_va(VC)))
		return;

	/* Read back old IST entry and write it to the TSS */
	this_cpu_write(cpu_tss_rw.x86_tss.ist[IST_INDEX_VC], *(unsigned long *)ist);
}

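/*
 * Return the per-CPU GHCB to use in the #VC handler. If the primary GHCB is
 * already active, its contents are saved to the backup GHCB so a nested
 * handler can reuse the page. Must be paired with sev_es_put_ghcb(); returns
 * NULL when both the GHCB and the backup GHCB are already in use.
 */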
static __always_inline struct ghcb *sev_es_get_ghcb(struct ghcb_state *state)
{
	struct sev_es_runtime_data *data;
	struct ghcb *ghcb;

	data = this_cpu_read(runtime_data);
	ghcb = &data->ghcb_page;

	if (unlikely(data->ghcb_active)) {
		/* GHCB is already in use - save its contents */

		if (unlikely(data->backup_ghcb_active))
			return NULL;

		/* Mark backup_ghcb active before writing to it */
		data->backup_ghcb_active = true;

		state->ghcb = &data->backup_ghcb;

		/* Backup GHCB content */
		*state->ghcb = *ghcb;
	} else {
		state->ghcb = NULL;
		data->ghcb_active = true;
	}

	return ghcb;
}

static __always_inline void sev_es_put_ghcb(struct ghcb_state *state)
{
	struct sev_es_runtime_data *data;
	struct ghcb *ghcb;

	data = this_cpu_read(runtime_data);
	ghcb = &data->ghcb_page;

	if (state->ghcb) {
		/* Restore GHCB from Backup */
		*ghcb = *state->ghcb;
		data->backup_ghcb_active = false;
		state->ghcb = NULL;
	} else {
		data->ghcb_active = false;
	}
}

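/* Raw accessors for the GHCB MSR (MSR_AMD64_SEV_ES_GHCB) */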
static inline u64 sev_es_rd_ghcb_msr(void)
{
	return __rdmsr(MSR_AMD64_SEV_ES_GHCB);
}

static inline void sev_es_wr_ghcb_msr(u64 val)
{
	u32 low, high;

	low  = (u32)(val);
	high = (u32)(val >> 32);

	native_wrmsr(MSR_AMD64_SEV_ES_GHCB, low, high);
}

static int vc_fetch_insn_kernel(struct es_em_ctxt *ctxt,
				unsigned char *buffer)
{
	return copy_from_kernel_nofault(buffer, (unsigned char *)ctxt->regs->ip, MAX_INSN_SIZE);
}

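/*
 * Fetch and decode the instruction at ctxt->regs->ip so it can be emulated.
 * If the instruction bytes cannot be read, a #PF is recorded in ctxt->fi for
 * the caller to forward.
 */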
static enum es_result vc_decode_insn(struct es_em_ctxt *ctxt)
{
	char buffer[MAX_INSN_SIZE];
	enum es_result ret;
	int res;

	res = vc_fetch_insn_kernel(ctxt, buffer);
	if (unlikely(res == -EFAULT)) {
		ctxt->fi.vector = X86_TRAP_PF;
		ctxt->fi.error_code = 0;
		ctxt->fi.cr2 = ctxt->regs->ip;
		return ES_EXCEPTION;
	}

	insn_init(&ctxt->insn, buffer, MAX_INSN_SIZE - res, 1);
	insn_get_length(&ctxt->insn);

	ret = ctxt->insn.immediate.got ? ES_OK : ES_DECODE_FAILED;

	return ret;
}

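/*
 * Write an emulated memory operand of 1, 2, 4 or 8 bytes to the target
 * address. On a faulting access the #PF information is recorded in ctxt->fi
 * so the caller can forward the exception.
 */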
static enum es_result vc_write_mem(struct es_em_ctxt *ctxt,
				   char *dst, char *buf, size_t size)
{
	unsigned long error_code = X86_PF_PROT | X86_PF_WRITE;
	u64 d8;
	u32 d4;
	u16 d2;
	u8  d1;

	/* Size-typed user pointers so put_user() accesses the full operand */
	switch (size) {
	case 1:
		memcpy(&d1, buf, 1);
		if (put_user(d1, (u8 __user *)dst))
			goto fault;
		break;
	case 2:
		memcpy(&d2, buf, 2);
		if (put_user(d2, (u16 __user *)dst))
			goto fault;
		break;
	case 4:
		memcpy(&d4, buf, 4);
		if (put_user(d4, (u32 __user *)dst))
			goto fault;
		break;
	case 8:
		memcpy(&d8, buf, 8);
		if (put_user(d8, (u64 __user *)dst))
			goto fault;
		break;
	default:
		WARN_ONCE(1, "%s: Invalid size: %zu\n", __func__, size);
		return ES_UNSUPPORTED;
	}

	return ES_OK;

fault:
	if (user_mode(ctxt->regs))
		error_code |= X86_PF_USER;

	ctxt->fi.vector = X86_TRAP_PF;
	ctxt->fi.error_code = error_code;
	ctxt->fi.cr2 = (unsigned long)dst;

	return ES_EXCEPTION;
}

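/*
 * Read an emulated memory operand of 1, 2, 4 or 8 bytes from the source
 * address into buf, recording #PF information in ctxt->fi on failure.
 */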
static enum es_result vc_read_mem(struct es_em_ctxt *ctxt,
				  char *src, char *buf, size_t size)
{
	unsigned long error_code = X86_PF_PROT;
	u64 d8;
	u32 d4;
	u16 d2;
	u8  d1;

	/* Size-typed user pointers so get_user() accesses the full operand */
	switch (size) {
	case 1:
		if (get_user(d1, (u8 __user *)src))
			goto fault;
		memcpy(buf, &d1, 1);
		break;
	case 2:
		if (get_user(d2, (u16 __user *)src))
			goto fault;
		memcpy(buf, &d2, 2);
		break;
	case 4:
		if (get_user(d4, (u32 __user *)src))
			goto fault;
		memcpy(buf, &d4, 4);
		break;
	case 8:
		if (get_user(d8, (u64 __user *)src))
			goto fault;
		memcpy(buf, &d8, 8);
		break;
	default:
		WARN_ONCE(1, "%s: Invalid size: %zu\n", __func__, size);
		return ES_UNSUPPORTED;
	}

	return ES_OK;

fault:
	if (user_mode(ctxt->regs))
		error_code |= X86_PF_USER;

	ctxt->fi.vector = X86_TRAP_PF;
	ctxt->fi.error_code = error_code;
	ctxt->fi.cr2 = (unsigned long)src;

	return ES_EXCEPTION;
}

/* Include code shared with pre-decompression boot stage */
#include "sev-es-shared.c"

/*
 * This function runs on the first #VC exception after the kernel
 * switched to virtual addresses.
 */
static bool __init sev_es_setup_ghcb(void)
{
	/* First make sure the hypervisor talks a supported protocol. */
	if (!sev_es_negotiate_protocol())
		return false;

	/*
	 * Clear the boot_ghcb. The first exception comes in before the bss
	 * section is cleared.
	 */
	memset(&boot_ghcb_page, 0, PAGE_SIZE);

	/* Alright - Make the boot-ghcb public */
	boot_ghcb = &boot_ghcb_page;

	return true;
}

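/* Allocate the page-aligned per-CPU runtime data from memblock during early boot */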
static void __init alloc_runtime_data(int cpu)
{
	struct sev_es_runtime_data *data;

	data = memblock_alloc(sizeof(*data), PAGE_SIZE);
	if (!data)
		panic("Can't allocate SEV-ES runtime data");

	per_cpu(runtime_data, cpu) = data;
}

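/* Map the per-CPU GHCB page unencrypted and clear it so the hypervisor can access it */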
static void __init init_ghcb(int cpu)
{
	struct sev_es_runtime_data *data;
	int err;

	data = per_cpu(runtime_data, cpu);

	err = early_set_memory_decrypted((unsigned long)&data->ghcb_page,
					 sizeof(data->ghcb_page));
	if (err)
		panic("Can't map GHCBs unencrypted");

	memset(&data->ghcb_page, 0, sizeof(data->ghcb_page));

	data->ghcb_active = false;
	data->backup_ghcb_active = false;
}

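/*
 * Set up runtime #VC handling: enable the SEV-ES static key, allocate and
 * initialize the per-CPU GHCBs and #VC stacks, and switch secondary CPUs to
 * the runtime #VC handler.
 */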
void __init sev_es_init_vc_handling(void)
{
	int cpu;

	BUILD_BUG_ON(offsetof(struct sev_es_runtime_data, ghcb_page) % PAGE_SIZE);

	if (!sev_es_active())
		return;

	/* Enable SEV-ES special handling */
	static_branch_enable(&sev_es_enable_key);

	/* Initialize per-cpu GHCB pages */
	for_each_possible_cpu(cpu) {
		alloc_runtime_data(cpu);
		init_ghcb(cpu);
		setup_vc_stacks(cpu);
	}

	/* Secondary CPUs use the runtime #VC handler */
	initial_vc_handler = (unsigned long)safe_stack_exc_vmm_communication;
}

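/* Forward an exception hit during early #VC emulation to the early exception handler */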
static void __init vc_early_forward_exception(struct es_em_ctxt *ctxt)
{
	int trapnr = ctxt->fi.vector;

	if (trapnr == X86_TRAP_PF)
		native_write_cr2(ctxt->fi.cr2);

	ctxt->regs->orig_ax = ctxt->fi.error_code;
	do_early_exception(ctxt->regs, trapnr);
}

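/* Dispatch on the #VC exit-code; exit-codes without a handler are reported as ES_UNSUPPORTED */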
static enum es_result vc_handle_exitcode(struct es_em_ctxt *ctxt,
					 struct ghcb *ghcb,
					 unsigned long exit_code)
{
	enum es_result result;

	switch (exit_code) {
	default:
		/*
		 * Unexpected #VC exception
		 */
		result = ES_UNSUPPORTED;
	}

	return result;
}

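/* Re-inject an exception raised during instruction emulation into the regular handlers */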
static __always_inline void vc_forward_exception(struct es_em_ctxt *ctxt)
{
	long error_code = ctxt->fi.error_code;
	int trapnr = ctxt->fi.vector;

	ctxt->regs->orig_ax = ctxt->fi.error_code;

	switch (trapnr) {
	case X86_TRAP_GP:
		exc_general_protection(ctxt->regs, error_code);
		break;
	case X86_TRAP_UD:
		exc_invalid_op(ctxt->regs);
		break;
	default:
		pr_emerg("Unsupported exception in #VC instruction emulation - can't continue\n");
		BUG();
	}
}

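/* Check whether the exception frame sits on the #VC fall-back (VC2) stack */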
static __always_inline bool on_vc_fallback_stack(struct pt_regs *regs)
{
	unsigned long sp = (unsigned long)regs;

	return (sp >= __this_cpu_ist_bottom_va(VC2) && sp < __this_cpu_ist_top_va(VC2));
}

/*
 * Main #VC exception handler. It is called when the entry code was able to
 * switch off the IST to a safe kernel stack.
 *
 * With the current implementation it is always possible to switch to a safe
 * stack because #VC exceptions only happen at known places, like intercepted
 * instructions or accesses to MMIO areas/IO ports. They can also happen with
 * code instrumentation when the hypervisor intercepts #DB, but the critical
 * paths are forbidden to be instrumented, so #DB exceptions currently also
 * only happen in safe places.
 */
DEFINE_IDTENTRY_VC_SAFE_STACK(exc_vmm_communication)
{
	struct sev_es_runtime_data *data = this_cpu_read(runtime_data);
	struct ghcb_state state;
	struct es_em_ctxt ctxt;
	enum es_result result;
	struct ghcb *ghcb;

	lockdep_assert_irqs_disabled();
	instrumentation_begin();

	/*
	 * This is invoked through an interrupt gate, so IRQs are disabled. The
	 * code below might walk page-tables for user or kernel addresses, so
	 * keep the IRQs disabled to protect us against concurrent TLB flushes.
	 */

	ghcb = sev_es_get_ghcb(&state);
	if (!ghcb) {
		/*
		 * Mark GHCBs inactive so that panic() is able to print the
		 * message.
		 */
		data->ghcb_active = false;
		data->backup_ghcb_active = false;

		panic("Unable to handle #VC exception! GHCB and Backup GHCB are already in use");
	}

	vc_ghcb_invalidate(ghcb);
	result = vc_init_em_ctxt(&ctxt, regs, error_code);

	if (result == ES_OK)
		result = vc_handle_exitcode(&ctxt, ghcb, error_code);

	sev_es_put_ghcb(&state);

	/* Done - now check the result */
	switch (result) {
	case ES_OK:
		vc_finish_insn(&ctxt);
		break;
	case ES_UNSUPPORTED:
		pr_err_ratelimited("Unsupported exit-code 0x%02lx in #VC exception (IP: 0x%lx)\n",
				   error_code, regs->ip);
		goto fail;
	case ES_VMM_ERROR:
		pr_err_ratelimited("Failure in communication with VMM (exit-code 0x%02lx IP: 0x%lx)\n",
				   error_code, regs->ip);
		goto fail;
	case ES_DECODE_FAILED:
		pr_err_ratelimited("Failed to decode instruction (exit-code 0x%02lx IP: 0x%lx)\n",
				   error_code, regs->ip);
		goto fail;
	case ES_EXCEPTION:
		vc_forward_exception(&ctxt);
		break;
	case ES_RETRY:
		/* Nothing to do */
		break;
	default:
		pr_emerg("Unknown result in %s(): %d\n", __func__, result);
		/*
		 * Emulating the instruction which caused the #VC exception
		 * failed - can't continue so print debug information
		 */
		BUG();
	}

out:
	instrumentation_end();

	return;

fail:
	if (user_mode(regs)) {
		/*
		 * Do not kill the machine if user-space triggered the
		 * exception. Send SIGBUS instead and let user-space deal with
		 * it.
		 */
		force_sig_fault(SIGBUS, BUS_OBJERR, (void __user *)0);
	} else {
		pr_emerg("PANIC: Unhandled #VC exception in kernel space (result=%d)\n",
			 result);

		/* Show some debug info */
		show_regs(regs);

		/* Ask the hypervisor to terminate the guest */
		sev_es_terminate(GHCB_SEV_ES_REASON_GENERAL_REQUEST);

		/* If that fails and we get here - just panic */
		panic("Returned from Terminate-Request to Hypervisor\n");
	}

	goto out;
}

/* This handler runs on the #VC fall-back stack. It can cause further #VC exceptions */
DEFINE_IDTENTRY_VC_IST(exc_vmm_communication)
{
	instrumentation_begin();
	panic("Can't handle #VC exception from unsupported context\n");
	instrumentation_end();
}

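/*
 * #VC entry point: use the safe-stack handler unless the exception already
 * arrived on the fall-back stack, in which case it cannot be handled.
 */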
DEFINE_IDTENTRY_VC(exc_vmm_communication)
{
	if (likely(!on_vc_fallback_stack(regs)))
		safe_stack_exc_vmm_communication(regs, error_code);
	else
		ist_exc_vmm_communication(regs, error_code);
}

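/* Early boot #VC handler: uses the boot GHCB before the per-CPU runtime GHCBs exist */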
bool __init handle_vc_boot_ghcb(struct pt_regs *regs)
{
	unsigned long exit_code = regs->orig_ax;
	struct es_em_ctxt ctxt;
	enum es_result result;

	/* Do initial setup or terminate the guest */
	if (unlikely(boot_ghcb == NULL && !sev_es_setup_ghcb()))
		sev_es_terminate(GHCB_SEV_ES_REASON_GENERAL_REQUEST);

	vc_ghcb_invalidate(boot_ghcb);

	result = vc_init_em_ctxt(&ctxt, regs, exit_code);
	if (result == ES_OK)
		result = vc_handle_exitcode(&ctxt, boot_ghcb, exit_code);

	/* Done - now check the result */
	switch (result) {
	case ES_OK:
		vc_finish_insn(&ctxt);
		break;
	case ES_UNSUPPORTED:
		early_printk("PANIC: Unsupported exit-code 0x%02lx in early #VC exception (IP: 0x%lx)\n",
				exit_code, regs->ip);
		goto fail;
	case ES_VMM_ERROR:
		early_printk("PANIC: Failure in communication with VMM (exit-code 0x%02lx IP: 0x%lx)\n",
				exit_code, regs->ip);
		goto fail;
	case ES_DECODE_FAILED:
		early_printk("PANIC: Failed to decode instruction (exit-code 0x%02lx IP: 0x%lx)\n",
				exit_code, regs->ip);
		goto fail;
	case ES_EXCEPTION:
		vc_early_forward_exception(&ctxt);
		break;
	case ES_RETRY:
		/* Nothing to do */
		break;
	default:
		BUG();
	}

	return true;

fail:
	show_regs(regs);

	while (true)
		halt();
}