From: "Kirill A. Shutemov" Today there are two separate locations where TDX error codes are defined: arch/x86/include/asm/tdx.h arch/x86/kvm/vmx/tdx_errno.h They have some overlap that is already defined similarly. Reduce the duplication and prepare to introduce some helpers for these error codes in the central place by unifying them. Join them at: asm/shared/tdx_errno.h ...and update the headers that contained the duplicated definitions to include the new unified header. Place the new header in "asm/shared". While the compressed code for the guest doesn't use these error code header definitions today, it does make the types of calls that return the values they define. Place the defines in "shared" location so that compressed code has the definitions accessible, but leave cleanups to use proper error codes for future changes. Opportunistically massage some comments. Also, adjust _BITUL()->_BITULL() to address 32 bit build errors after the move. Signed-off-by: Kirill A. Shutemov [enhance log] Tested-by: Sagi Shahar Signed-off-by: Sean Christopherson Acked-by: Vishal Annapurve Signed-off-by: Vishal Verma Signed-off-by: Rick Edgecombe --- arch/x86/include/asm/shared/tdx.h | 1 + .../vmx => include/asm/shared}/tdx_errno.h | 27 +++++++++++++++---- arch/x86/include/asm/tdx.h | 20 -------------- arch/x86/kvm/vmx/tdx.h | 1 - 4 files changed, 23 insertions(+), 26 deletions(-) rename arch/x86/{kvm/vmx => include/asm/shared}/tdx_errno.h (65%) diff --git a/arch/x86/include/asm/shared/tdx.h b/arch/x86/include/asm/shared/tdx.h index 8bc074c8d7c6..6a1646fc2b2f 100644 --- a/arch/x86/include/asm/shared/tdx.h +++ b/arch/x86/include/asm/shared/tdx.h @@ -4,6 +4,7 @@ #include #include +#include #define TDX_HYPERCALL_STANDARD 0 diff --git a/arch/x86/kvm/vmx/tdx_errno.h b/arch/x86/include/asm/shared/tdx_errno.h similarity index 65% rename from arch/x86/kvm/vmx/tdx_errno.h rename to arch/x86/include/asm/shared/tdx_errno.h index 6ff4672c4181..3aa74f6a6119 100644 --- 
a/arch/x86/kvm/vmx/tdx_errno.h +++ b/arch/x86/include/asm/shared/tdx_errno.h @@ -1,14 +1,16 @@ /* SPDX-License-Identifier: GPL-2.0 */ -/* architectural status code for SEAMCALL */ +#ifndef _X86_SHARED_TDX_ERRNO_H +#define _X86_SHARED_TDX_ERRNO_H -#ifndef __KVM_X86_TDX_ERRNO_H -#define __KVM_X86_TDX_ERRNO_H +#include +/* Upper 32 bit of the TDX error code encodes the status */ #define TDX_SEAMCALL_STATUS_MASK 0xFFFFFFFF00000000ULL /* - * TDX SEAMCALL Status Codes (returned in RAX) + * TDX SEAMCALL Status Codes */ +#define TDX_SUCCESS 0ULL #define TDX_NON_RECOVERABLE_VCPU 0x4000000100000000ULL #define TDX_NON_RECOVERABLE_TD 0x4000000200000000ULL #define TDX_NON_RECOVERABLE_TD_NON_ACCESSIBLE 0x6000000500000000ULL @@ -17,6 +19,7 @@ #define TDX_OPERAND_INVALID 0xC000010000000000ULL #define TDX_OPERAND_BUSY 0x8000020000000000ULL #define TDX_PREVIOUS_TLB_EPOCH_BUSY 0x8000020100000000ULL +#define TDX_RND_NO_ENTROPY 0x8000020300000000ULL #define TDX_PAGE_METADATA_INCORRECT 0xC000030000000000ULL #define TDX_VCPU_NOT_ASSOCIATED 0x8000070200000000ULL #define TDX_KEY_GENERATION_FAILED 0x8000080000000000ULL @@ -28,6 +31,20 @@ #define TDX_EPT_ENTRY_STATE_INCORRECT 0xC0000B0D00000000ULL #define TDX_METADATA_FIELD_NOT_READABLE 0xC0000C0200000000ULL +/* + * SW-defined error codes. + * + * Bits 47:40 == 0xFF indicate Reserved status code class that never used by + * TDX module. 
+ */ +#define TDX_ERROR _BITULL(63) +#define TDX_NON_RECOVERABLE _BITULL(62) +#define TDX_SW_ERROR (TDX_ERROR | GENMASK_ULL(47, 40)) +#define TDX_SEAMCALL_VMFAILINVALID (TDX_SW_ERROR | _ULL(0xFFFF0000)) + +#define TDX_SEAMCALL_GP (TDX_SW_ERROR | X86_TRAP_GP) +#define TDX_SEAMCALL_UD (TDX_SW_ERROR | X86_TRAP_UD) + /* * TDX module operand ID, appears in 31:0 part of error code as * detail information @@ -37,4 +54,4 @@ #define TDX_OPERAND_ID_SEPT 0x92 #define TDX_OPERAND_ID_TD_EPOCH 0xa9 -#endif /* __KVM_X86_TDX_ERRNO_H */ +#endif /* _X86_SHARED_TDX_ERRNO_H */ diff --git a/arch/x86/include/asm/tdx.h b/arch/x86/include/asm/tdx.h index a149740b24e8..0c1ae4954f17 100644 --- a/arch/x86/include/asm/tdx.h +++ b/arch/x86/include/asm/tdx.h @@ -12,26 +12,6 @@ #include #include -/* - * SW-defined error codes. - * - * Bits 47:40 == 0xFF indicate Reserved status code class that never used by - * TDX module. - */ -#define TDX_ERROR _BITUL(63) -#define TDX_NON_RECOVERABLE _BITUL(62) -#define TDX_SW_ERROR (TDX_ERROR | GENMASK_ULL(47, 40)) -#define TDX_SEAMCALL_VMFAILINVALID (TDX_SW_ERROR | _UL(0xFFFF0000)) - -#define TDX_SEAMCALL_GP (TDX_SW_ERROR | X86_TRAP_GP) -#define TDX_SEAMCALL_UD (TDX_SW_ERROR | X86_TRAP_UD) - -/* - * TDX module SEAMCALL leaf function error codes - */ -#define TDX_SUCCESS 0ULL -#define TDX_RND_NO_ENTROPY 0x8000020300000000ULL - #ifndef __ASSEMBLER__ #include diff --git a/arch/x86/kvm/vmx/tdx.h b/arch/x86/kvm/vmx/tdx.h index b5cd2ffb303e..ac8323a68b16 100644 --- a/arch/x86/kvm/vmx/tdx.h +++ b/arch/x86/kvm/vmx/tdx.h @@ -3,7 +3,6 @@ #define __KVM_X86_VMX_TDX_H #include "tdx_arch.h" -#include "tdx_errno.h" #ifdef CONFIG_KVM_INTEL_TDX #include "common.h" -- 2.53.0 KVM tries to take care of some required cache flushing earlier in the kexec path in order to be kind to some long standing races that can occur later in the operation. Until recently, VMXOFF was handled within KVM. 
Since VMX being enabled is required to make a SEAMCALL, it had the best per-cpu scoped operation to plug the flushing into. This early kexec cache flushing in KVM happens via a syscore shutdown callback. Now that VMX enablement control has moved to arch/x86, which has grown its own syscore shutdown callback, it no longer makes sense for it to live in KVM. It fits better with the TDX enablement managing code. In addition, future changes will add a SEAMCALL that happens immediately before VMXOFF, which means the cache flush in KVM will be too late to be helpful. So move it to the newly added TDX arch/x86 syscore shutdown handler. Since tdx_cpu_flush_cache_for_kexec() is no longer needed by KVM, make it static and remove the export. Since it is also not part of an operation spread across disparate components, remove the redundant comments and verbose naming. Signed-off-by: Rick Edgecombe --- arch/x86/include/asm/tdx.h | 6 ------ arch/x86/kvm/vmx/tdx.c | 10 ---------- arch/x86/virt/vmx/tdx/tdx.c | 39 ++++++++++++++++++++------------------- 3 files changed, 20 insertions(+), 35 deletions(-) diff --git a/arch/x86/include/asm/tdx.h b/arch/x86/include/asm/tdx.h index 0c1ae4954f17..f0826b0a512a 100644 --- a/arch/x86/include/asm/tdx.h +++ b/arch/x86/include/asm/tdx.h @@ -206,11 +206,5 @@ static inline const char *tdx_dump_mce_info(struct mce *m) { return NULL; } static inline const struct tdx_sys_info *tdx_get_sysinfo(void) { return NULL; } #endif /* CONFIG_INTEL_TDX_HOST */ -#ifdef CONFIG_KEXEC_CORE -void tdx_cpu_flush_cache_for_kexec(void); -#else -static inline void tdx_cpu_flush_cache_for_kexec(void) { } -#endif - #endif /* !__ASSEMBLER__ */ #endif /* _ASM_X86_TDX_H */ diff --git a/arch/x86/kvm/vmx/tdx.c b/arch/x86/kvm/vmx/tdx.c index b7264b533feb..50a5cfdbd33e 100644 --- a/arch/x86/kvm/vmx/tdx.c +++ b/arch/x86/kvm/vmx/tdx.c @@ -440,16 +440,6 @@ void tdx_disable_virtualization_cpu(void) tdx_flush_vp(&arg); } local_irq_restore(flags); - - /* - * Flush cache now if kexec is 
possible: this is necessary to avoid - * having dirty private memory cachelines when the new kernel boots, - * but WBINVD is a relatively expensive operation and doing it during - * kexec can exacerbate races in native_stop_other_cpus(). Do it - * now, since this is a safe moment and there is going to be no more - * TDX activity on this CPU from this point on. - */ - tdx_cpu_flush_cache_for_kexec(); } #define TDX_SEAMCALL_RETRIES 10000 diff --git a/arch/x86/virt/vmx/tdx/tdx.c b/arch/x86/virt/vmx/tdx/tdx.c index cb9b3210ab71..0802d0fd18a4 100644 --- a/arch/x86/virt/vmx/tdx/tdx.c +++ b/arch/x86/virt/vmx/tdx/tdx.c @@ -224,8 +224,28 @@ static int tdx_offline_cpu(unsigned int cpu) return 0; } +static void tdx_cpu_flush_cache(void) +{ + lockdep_assert_preemption_disabled(); + + if (!this_cpu_read(cache_state_incoherent)) + return; + + wbinvd(); + this_cpu_write(cache_state_incoherent, false); +} + static void tdx_shutdown_cpu(void *ign) { + /* + * Flush cache now if kexec is possible: this is necessary to avoid + * having dirty private memory cachelines when the new kernel boots, + * but WBINVD is a relatively expensive operation and doing it during + * kexec can exacerbate races in native_stop_other_cpus(). Do it + * now, since this is a safe moment and there is going to be no more + * TDX activity on this CPU from this point on. + */ + tdx_cpu_flush_cache(); x86_virt_put_ref(X86_FEATURE_VMX); } @@ -1920,22 +1940,3 @@ u64 tdh_phymem_page_wbinvd_hkid(u64 hkid, struct page *page) return seamcall(TDH_PHYMEM_PAGE_WBINVD, &args); } EXPORT_SYMBOL_FOR_KVM(tdh_phymem_page_wbinvd_hkid); - -#ifdef CONFIG_KEXEC_CORE -void tdx_cpu_flush_cache_for_kexec(void) -{ - lockdep_assert_preemption_disabled(); - - if (!this_cpu_read(cache_state_incoherent)) - return; - - /* - * Private memory cachelines need to be clean at the time of - * kexec. Write them back now, as the caller promises that - * there should be no more SEAMCALLs on this CPU. 
- wbinvd(); - this_cpu_write(cache_state_incoherent, false); -} -EXPORT_SYMBOL_FOR_KVM(tdx_cpu_flush_cache_for_kexec); -#endif -- 2.53.0 From: Vishal Verma Some early TDX-capable platforms have an erratum where a partial write to TDX private memory can cause a machine check on a subsequent read. On these platforms, kexec and kdump have been disabled, because the old kernel cannot safely hand off TDX state to the new kernel. Later TDX modules support the TDH.SYS.DISABLE SEAMCALL, which provides a way to cleanly disable TDX and allow kexec to proceed. This can be a long running operation, and the time needed largely depends on the amount of memory that has been allocated to TDs. If all TDs have been destroyed prior to the sys_disable call, then it is fast, with only needing to overwrite the TDX module memory. After the SEAMCALL completes, the TDX module is disabled and all memory resources allocated to TDX are freed and reset. The next kernel can then re-initialize the TDX module from scratch via the normal TDX bring-up sequence. The SEAMCALL may be interrupted by an interrupt. In this case, it returns TDX_INTERRUPTED_RESUMABLE, and it must be retried in a loop until the operation completes successfully. Add a tdx_sys_disable() helper, which implements the retry loop around the SEAMCALL to provide this functionality. 
Signed-off-by: Vishal Verma Signed-off-by: Rick Edgecombe --- arch/x86/include/asm/tdx.h | 3 +++ arch/x86/virt/vmx/tdx/tdx.c | 18 ++++++++++++++++++ arch/x86/virt/vmx/tdx/tdx.h | 1 + 3 files changed, 22 insertions(+) diff --git a/arch/x86/include/asm/tdx.h b/arch/x86/include/asm/tdx.h index f0826b0a512a..baaf43a09e99 100644 --- a/arch/x86/include/asm/tdx.h +++ b/arch/x86/include/asm/tdx.h @@ -173,6 +173,8 @@ static inline int pg_level_to_tdx_sept_level(enum pg_level level) return level - 1; } +void tdx_sys_disable(void); + u64 tdh_vp_enter(struct tdx_vp *vp, struct tdx_module_args *args); u64 tdh_mng_addcx(struct tdx_td *td, struct page *tdcs_page); u64 tdh_mem_page_add(struct tdx_td *td, u64 gpa, struct page *page, struct page *source, u64 *ext_err1, u64 *ext_err2); @@ -204,6 +206,7 @@ static inline void tdx_init(void) { } static inline u32 tdx_get_nr_guest_keyids(void) { return 0; } static inline const char *tdx_dump_mce_info(struct mce *m) { return NULL; } static inline const struct tdx_sys_info *tdx_get_sysinfo(void) { return NULL; } +static inline void tdx_sys_disable(void) { } #endif /* CONFIG_INTEL_TDX_HOST */ #endif /* !__ASSEMBLER__ */ diff --git a/arch/x86/virt/vmx/tdx/tdx.c b/arch/x86/virt/vmx/tdx/tdx.c index 0802d0fd18a4..68bd2618dde4 100644 --- a/arch/x86/virt/vmx/tdx/tdx.c +++ b/arch/x86/virt/vmx/tdx/tdx.c @@ -37,6 +37,7 @@ #include #include #include +#include #include #include #include @@ -1940,3 +1941,20 @@ u64 tdh_phymem_page_wbinvd_hkid(u64 hkid, struct page *page) return seamcall(TDH_PHYMEM_PAGE_WBINVD, &args); } EXPORT_SYMBOL_FOR_KVM(tdh_phymem_page_wbinvd_hkid); + +void tdx_sys_disable(void) +{ + struct tdx_module_args args = {}; + + /* + * SEAMCALLs that can return TDX_INTERRUPTED_RESUMABLE are guaranteed + * to make forward progress between interrupts, so it is safe to loop + * unconditionally here. + * + * This is a 'destructive' SEAMCALL, in that no other SEAMCALL can be + * run after this until a full reinitialization is done. 
+ */ + while (seamcall(TDH_SYS_DISABLE, &args) == TDX_INTERRUPTED_RESUMABLE) + ; +} + diff --git a/arch/x86/virt/vmx/tdx/tdx.h b/arch/x86/virt/vmx/tdx/tdx.h index dde219c823b4..e2cf2dd48755 100644 --- a/arch/x86/virt/vmx/tdx/tdx.h +++ b/arch/x86/virt/vmx/tdx/tdx.h @@ -46,6 +46,7 @@ #define TDH_PHYMEM_PAGE_WBINVD 41 #define TDH_VP_WR 43 #define TDH_SYS_CONFIG 45 +#define TDH_SYS_DISABLE 69 /* * SEAMCALL leaf: -- 2.53.0 From: Vishal Verma Use the TDH.SYS.DISABLE SEAMCALL, which disables the TDX module, reclaims all memory resources assigned to TDX, and clears any partial-write induced poison, to allow kexec and kdump on platforms with the partial write errata. On TDX-capable platforms with the partial write erratum, kexec has been disabled because the new kernel could hit a machine check reading a previously poisoned memory location. Later TDX modules support TDH.SYS.DISABLE, which disables the module and reclaims all TDX memory resources, allowing the new kernel to re-initialize TDX from scratch. This operation also clears the old memory, cleaning up any poison. Add tdx_sys_disable() to tdx_shutdown(), which is called in the syscore_shutdown path for kexec. This is done just before tdx_shutdown() disables VMX on all CPUs. For kdump, call tdx_sys_disable() in the crash path before x86_virt_emergency_disable_virtualization_cpu() does VMXOFF. Since this clears any poison on TDX-managed memory, the X86_BUG_TDX_PW_MCE check in machine_kexec() that blocked kexec on partial write errata platforms can be removed. 
Signed-off-by: Vishal Verma Signed-off-by: Rick Edgecombe --- arch/x86/kernel/crash.c | 2 ++ arch/x86/kernel/machine_kexec_64.c | 16 ---------------- arch/x86/virt/vmx/tdx/tdx.c | 1 + 3 files changed, 3 insertions(+), 16 deletions(-) diff --git a/arch/x86/kernel/crash.c b/arch/x86/kernel/crash.c index cd796818d94d..623d4474631a 100644 --- a/arch/x86/kernel/crash.c +++ b/arch/x86/kernel/crash.c @@ -38,6 +38,7 @@ #include #include #include +#include #include #include #include @@ -112,6 +113,7 @@ void native_machine_crash_shutdown(struct pt_regs *regs) crash_smp_send_stop(); + tdx_sys_disable(); x86_virt_emergency_disable_virtualization_cpu(); /* diff --git a/arch/x86/kernel/machine_kexec_64.c b/arch/x86/kernel/machine_kexec_64.c index 0590d399d4f1..c3f4a389992d 100644 --- a/arch/x86/kernel/machine_kexec_64.c +++ b/arch/x86/kernel/machine_kexec_64.c @@ -347,22 +347,6 @@ int machine_kexec_prepare(struct kimage *image) unsigned long reloc_end = (unsigned long)__relocate_kernel_end; int result; - /* - * Some early TDX-capable platforms have an erratum. A kernel - * partial write (a write transaction of less than cacheline - * lands at memory controller) to TDX private memory poisons that - * memory, and a subsequent read triggers a machine check. - * - * On those platforms the old kernel must reset TDX private - * memory before jumping to the new kernel otherwise the new - * kernel may see unexpected machine check. For simplicity - * just fail kexec/kdump on those platforms. 
- */ - if (boot_cpu_has_bug(X86_BUG_TDX_PW_MCE)) { - pr_info_once("Not allowed on platform with tdx_pw_mce bug\n"); - return -EOPNOTSUPP; - } - /* Setup the identity mapped 64bit page table */ result = init_pgtable(image, __pa(control_page)); if (result) diff --git a/arch/x86/virt/vmx/tdx/tdx.c b/arch/x86/virt/vmx/tdx/tdx.c index 68bd2618dde4..b388fbce5d76 100644 --- a/arch/x86/virt/vmx/tdx/tdx.c +++ b/arch/x86/virt/vmx/tdx/tdx.c @@ -252,6 +252,7 @@ static void tdx_shutdown_cpu(void *ign) static void tdx_shutdown(void *ign) { + tdx_sys_disable(); on_each_cpu(tdx_shutdown_cpu, NULL, 1); } -- 2.53.0