From: "Kirill A. Shutemov" Today there are two separate locations where TDX error codes are defined: arch/x86/include/asm/tdx.h arch/x86/kvm/vmx/tdx_errno.h They have some overlap that is already defined similarly. Reduce the duplication and prepare to introduce some helpers for these error codes in the central place by unifying them. Join them at: asm/shared/tdx_errno.h ...and update the headers that contained the duplicated definitions to include the new unified header. "asm/shared" is used for sharing TDX code between the early compressed code and the normal kernel code. While the compressed code for the guest doesn't use these error code header definitions today, it does make the types of calls that return the values they define. So place the defines in "shared" location so that it can, but leave such cleanups for future changes. Also, adjust BITUL() -> _BITULL() to address 32 bit build errors after the move. Signed-off-by: Kirill A. Shutemov [enhance log] Signed-off-by: Rick Edgecombe Signed-off-by: Vishal Verma --- arch/x86/include/asm/shared/tdx.h | 1 + .../{kvm/vmx => include/asm/shared}/tdx_errno.h | 28 +++++++++++++++++----- arch/x86/include/asm/tdx.h | 21 ---------------- arch/x86/kvm/vmx/tdx.h | 1 - 4 files changed, 23 insertions(+), 28 deletions(-) diff --git a/arch/x86/include/asm/shared/tdx.h b/arch/x86/include/asm/shared/tdx.h index 8bc074c8d7c6..6a1646fc2b2f 100644 --- a/arch/x86/include/asm/shared/tdx.h +++ b/arch/x86/include/asm/shared/tdx.h @@ -4,6 +4,7 @@ #include #include +#include #define TDX_HYPERCALL_STANDARD 0 diff --git a/arch/x86/kvm/vmx/tdx_errno.h b/arch/x86/include/asm/shared/tdx_errno.h similarity index 64% rename from arch/x86/kvm/vmx/tdx_errno.h rename to arch/x86/include/asm/shared/tdx_errno.h index 6ff4672c4181..8bf6765cf082 100644 --- a/arch/x86/kvm/vmx/tdx_errno.h +++ b/arch/x86/include/asm/shared/tdx_errno.h @@ -1,14 +1,15 @@ /* SPDX-License-Identifier: GPL-2.0 */ -/* architectural status code for SEAMCALL */ - -#ifndef 
__KVM_X86_TDX_ERRNO_H -#define __KVM_X86_TDX_ERRNO_H +#ifndef _ASM_X86_SHARED_TDX_ERRNO_H +#define _ASM_X86_SHARED_TDX_ERRNO_H +#include +/* Upper 32 bit of the TDX error code encodes the status */ #define TDX_SEAMCALL_STATUS_MASK 0xFFFFFFFF00000000ULL /* - * TDX SEAMCALL Status Codes (returned in RAX) + * TDX Status Codes (returned in RAX) */ +#define TDX_SUCCESS 0ULL #define TDX_NON_RECOVERABLE_VCPU 0x4000000100000000ULL #define TDX_NON_RECOVERABLE_TD 0x4000000200000000ULL #define TDX_NON_RECOVERABLE_TD_NON_ACCESSIBLE 0x6000000500000000ULL @@ -17,6 +18,7 @@ #define TDX_OPERAND_INVALID 0xC000010000000000ULL #define TDX_OPERAND_BUSY 0x8000020000000000ULL #define TDX_PREVIOUS_TLB_EPOCH_BUSY 0x8000020100000000ULL +#define TDX_RND_NO_ENTROPY 0x8000020300000000ULL #define TDX_PAGE_METADATA_INCORRECT 0xC000030000000000ULL #define TDX_VCPU_NOT_ASSOCIATED 0x8000070200000000ULL #define TDX_KEY_GENERATION_FAILED 0x8000080000000000ULL @@ -28,6 +30,20 @@ #define TDX_EPT_ENTRY_STATE_INCORRECT 0xC0000B0D00000000ULL #define TDX_METADATA_FIELD_NOT_READABLE 0xC0000C0200000000ULL +/* + * SW-defined error codes. + * + * Bits 47:40 == 0xFF indicate Reserved status code class that never used by + * TDX module. 
+ */ +#define TDX_ERROR _BITULL(63) +#define TDX_NON_RECOVERABLE _BITULL(62) +#define TDX_SW_ERROR (TDX_ERROR | GENMASK_ULL(47, 40)) +#define TDX_SEAMCALL_VMFAILINVALID (TDX_SW_ERROR | _ULL(0xFFFF0000)) + +#define TDX_SEAMCALL_GP (TDX_SW_ERROR | X86_TRAP_GP) +#define TDX_SEAMCALL_UD (TDX_SW_ERROR | X86_TRAP_UD) + /* * TDX module operand ID, appears in 31:0 part of error code as * detail information @@ -37,4 +53,4 @@ #define TDX_OPERAND_ID_SEPT 0x92 #define TDX_OPERAND_ID_TD_EPOCH 0xa9 -#endif /* __KVM_X86_TDX_ERRNO_H */ +#endif /* _ASM_X86_SHARED_TDX_ERRNO_H */ diff --git a/arch/x86/include/asm/tdx.h b/arch/x86/include/asm/tdx.h index a149740b24e8..2917b3451491 100644 --- a/arch/x86/include/asm/tdx.h +++ b/arch/x86/include/asm/tdx.h @@ -9,29 +9,8 @@ #include #include -#include #include -/* - * SW-defined error codes. - * - * Bits 47:40 == 0xFF indicate Reserved status code class that never used by - * TDX module. - */ -#define TDX_ERROR _BITUL(63) -#define TDX_NON_RECOVERABLE _BITUL(62) -#define TDX_SW_ERROR (TDX_ERROR | GENMASK_ULL(47, 40)) -#define TDX_SEAMCALL_VMFAILINVALID (TDX_SW_ERROR | _UL(0xFFFF0000)) - -#define TDX_SEAMCALL_GP (TDX_SW_ERROR | X86_TRAP_GP) -#define TDX_SEAMCALL_UD (TDX_SW_ERROR | X86_TRAP_UD) - -/* - * TDX module SEAMCALL leaf function error codes - */ -#define TDX_SUCCESS 0ULL -#define TDX_RND_NO_ENTROPY 0x8000020300000000ULL - #ifndef __ASSEMBLER__ #include diff --git a/arch/x86/kvm/vmx/tdx.h b/arch/x86/kvm/vmx/tdx.h index b5cd2ffb303e..ac8323a68b16 100644 --- a/arch/x86/kvm/vmx/tdx.h +++ b/arch/x86/kvm/vmx/tdx.h @@ -3,7 +3,6 @@ #define __KVM_X86_VMX_TDX_H #include "tdx_arch.h" -#include "tdx_errno.h" #ifdef CONFIG_KVM_INTEL_TDX #include "common.h" -- 2.53.0 From: Rick Edgecombe KVM tries to take care of some required cache flushing earlier in the kexec path in order to be kind to some long standing races that can occur later in the operation. Until recently, VMXOFF was handled within KVM. 
Since VMX being enabled is required to make a SEAMCALL, it had the best per-cpu scoped operation to plug the flushing into. So it is kicked off from there. This early kexec cache flushing in KVM happens via a syscore shutdown callback. Now that VMX enablement control has moved to arch/x86, which has grown its own syscore shutdown callback, it no longer makes sense for it to live in KVM. It fits better with the TDX enablement managing code. In addition, future changes will add a SEAMCALL that happens immediately before VMXOFF, which means the cache flush in KVM will be too late to flush the cache before the last SEAMCALL. So move it to the newly added TDX arch/x86 syscore shutdown handler. Since tdx_cpu_flush_cache_for_kexec() is no longer needed by KVM, make it static and remove the export. Since it is also not part of an operation spread across disparate components, remove the redundant comments and verbose naming. In the existing KVM based code, CPU offline also funnels through tdx_cpu_flush_cache_for_kexec(). So the centralization to the arch/x86 syscore shutdown callback elides this CPU offline time behavior. However, WBINVD is already generally done at CPU offline as a matter of course. So don't bother adding TDX specific logic for this, and rely on the normal WBINVD to handle it. 
Acked-by: Kai Huang Signed-off-by: Rick Edgecombe Signed-off-by: Vishal Verma --- arch/x86/include/asm/tdx.h | 6 ------ arch/x86/kvm/vmx/tdx.c | 10 ---------- arch/x86/virt/vmx/tdx/tdx.c | 39 ++++++++++++++++++++------------------- 3 files changed, 20 insertions(+), 35 deletions(-) diff --git a/arch/x86/include/asm/tdx.h b/arch/x86/include/asm/tdx.h index 2917b3451491..7674fc530090 100644 --- a/arch/x86/include/asm/tdx.h +++ b/arch/x86/include/asm/tdx.h @@ -205,11 +205,5 @@ static inline const char *tdx_dump_mce_info(struct mce *m) { return NULL; } static inline const struct tdx_sys_info *tdx_get_sysinfo(void) { return NULL; } #endif /* CONFIG_INTEL_TDX_HOST */ -#ifdef CONFIG_KEXEC_CORE -void tdx_cpu_flush_cache_for_kexec(void); -#else -static inline void tdx_cpu_flush_cache_for_kexec(void) { } -#endif - #endif /* !__ASSEMBLER__ */ #endif /* _ASM_X86_TDX_H */ diff --git a/arch/x86/kvm/vmx/tdx.c b/arch/x86/kvm/vmx/tdx.c index b7264b533feb..50a5cfdbd33e 100644 --- a/arch/x86/kvm/vmx/tdx.c +++ b/arch/x86/kvm/vmx/tdx.c @@ -440,16 +440,6 @@ void tdx_disable_virtualization_cpu(void) tdx_flush_vp(&arg); } local_irq_restore(flags); - - /* - * Flush cache now if kexec is possible: this is necessary to avoid - * having dirty private memory cachelines when the new kernel boots, - * but WBINVD is a relatively expensive operation and doing it during - * kexec can exacerbate races in native_stop_other_cpus(). Do it - * now, since this is a safe moment and there is going to be no more - * TDX activity on this CPU from this point on. 
- */ - tdx_cpu_flush_cache_for_kexec(); } #define TDX_SEAMCALL_RETRIES 10000 diff --git a/arch/x86/virt/vmx/tdx/tdx.c b/arch/x86/virt/vmx/tdx/tdx.c index cb9b3210ab71..0802d0fd18a4 100644 --- a/arch/x86/virt/vmx/tdx/tdx.c +++ b/arch/x86/virt/vmx/tdx/tdx.c @@ -224,8 +224,28 @@ static int tdx_offline_cpu(unsigned int cpu) return 0; } +static void tdx_cpu_flush_cache(void) +{ + lockdep_assert_preemption_disabled(); + + if (!this_cpu_read(cache_state_incoherent)) + return; + + wbinvd(); + this_cpu_write(cache_state_incoherent, false); +} + static void tdx_shutdown_cpu(void *ign) { + /* + * Flush cache now if kexec is possible: this is necessary to avoid + * having dirty private memory cachelines when the new kernel boots, + * but WBINVD is a relatively expensive operation and doing it during + * kexec can exacerbate races in native_stop_other_cpus(). Do it + * now, since this is a safe moment and there is going to be no more + * TDX activity on this CPU from this point on. + */ + tdx_cpu_flush_cache(); x86_virt_put_ref(X86_FEATURE_VMX); } @@ -1920,22 +1940,3 @@ u64 tdh_phymem_page_wbinvd_hkid(u64 hkid, struct page *page) return seamcall(TDH_PHYMEM_PAGE_WBINVD, &args); } EXPORT_SYMBOL_FOR_KVM(tdh_phymem_page_wbinvd_hkid); - -#ifdef CONFIG_KEXEC_CORE -void tdx_cpu_flush_cache_for_kexec(void) -{ - lockdep_assert_preemption_disabled(); - - if (!this_cpu_read(cache_state_incoherent)) - return; - - /* - * Private memory cachelines need to be clean at the time of - * kexec. Write them back now, as the caller promises that - * there should be no more SEAMCALLs on this CPU. - */ - wbinvd(); - this_cpu_write(cache_state_incoherent, false); -} -EXPORT_SYMBOL_FOR_KVM(tdx_cpu_flush_cache_for_kexec); -#endif -- 2.53.0 Some early TDX-capable platforms have an erratum where a partial write to TDX private memory can cause a machine check on a subsequent read. 
On these platforms, kexec and kdump have been disabled, because the old kernel cannot safely hand off TDX state to the new kernel. Later TDX modules support the TDH.SYS.DISABLE SEAMCALL, which provides a way to cleanly disable TDX and allow kexec to proceed. The new SEAMCALL has an enumeration bit, but that is ignored. It is expected that users will be using the latest TDX module, and the failure mode for running the missing SEAMCALL on an older module is not fatal. This can be a long running operation, and the time needed largely depends on the amount of memory that has been allocated to TDs. If all TDs have been destroyed prior to the sys_disable call, then it is fast, with only needing to overwrite the TDX module memory. After the SEAMCALL completes, the TDX module is disabled and all memory resources allocated to TDX are freed and reset. The next kernel can then re-initialize the TDX module from scratch via the normal TDX bring-up sequence. The SEAMCALL can return two different error codes that expect a retry. - TDX_INTERRUPTED_RESUMABLE can be returned in the case of a host interrupt. However, it will not return until it makes some forward progress, so we can expect to complete even in the case of interrupt storms. - TDX_SYS_BUSY will be returned on contention with other TDH.SYS.* SEAMCALLs, however a side effect of TDH.SYS.DISABLE is that it will block other SEAMCALLs once it gets going. So this contention will be short lived. So loop infinitely on either of these error codes, until success or other error. 
Co-developed-by: Rick Edgecombe Signed-off-by: Rick Edgecombe Signed-off-by: Vishal Verma --- arch/x86/include/asm/shared/tdx_errno.h | 1 + arch/x86/include/asm/tdx.h | 3 +++ arch/x86/virt/vmx/tdx/tdx.h | 1 + arch/x86/virt/vmx/tdx/tdx.c | 28 ++++++++++++++++++++++++++++ 4 files changed, 33 insertions(+) diff --git a/arch/x86/include/asm/shared/tdx_errno.h b/arch/x86/include/asm/shared/tdx_errno.h index 8bf6765cf082..246b4fd54a48 100644 --- a/arch/x86/include/asm/shared/tdx_errno.h +++ b/arch/x86/include/asm/shared/tdx_errno.h @@ -15,6 +15,7 @@ #define TDX_NON_RECOVERABLE_TD_NON_ACCESSIBLE 0x6000000500000000ULL #define TDX_NON_RECOVERABLE_TD_WRONG_APIC_MODE 0x6000000700000000ULL #define TDX_INTERRUPTED_RESUMABLE 0x8000000300000000ULL +#define TDX_SYS_BUSY 0x8000020200000000ULL #define TDX_OPERAND_INVALID 0xC000010000000000ULL #define TDX_OPERAND_BUSY 0x8000020000000000ULL #define TDX_PREVIOUS_TLB_EPOCH_BUSY 0x8000020100000000ULL diff --git a/arch/x86/include/asm/tdx.h b/arch/x86/include/asm/tdx.h index 7674fc530090..a0a4a15142fc 100644 --- a/arch/x86/include/asm/tdx.h +++ b/arch/x86/include/asm/tdx.h @@ -172,6 +172,8 @@ static inline int pg_level_to_tdx_sept_level(enum pg_level level) return level - 1; } +void tdx_sys_disable(void); + u64 tdh_vp_enter(struct tdx_vp *vp, struct tdx_module_args *args); u64 tdh_mng_addcx(struct tdx_td *td, struct page *tdcs_page); u64 tdh_mem_page_add(struct tdx_td *td, u64 gpa, struct page *page, struct page *source, u64 *ext_err1, u64 *ext_err2); @@ -203,6 +205,7 @@ static inline void tdx_init(void) { } static inline u32 tdx_get_nr_guest_keyids(void) { return 0; } static inline const char *tdx_dump_mce_info(struct mce *m) { return NULL; } static inline const struct tdx_sys_info *tdx_get_sysinfo(void) { return NULL; } +static inline void tdx_sys_disable(void) { } #endif /* CONFIG_INTEL_TDX_HOST */ #endif /* !__ASSEMBLER__ */ diff --git a/arch/x86/virt/vmx/tdx/tdx.h b/arch/x86/virt/vmx/tdx/tdx.h index dde219c823b4..e2cf2dd48755 100644 
--- a/arch/x86/virt/vmx/tdx/tdx.h +++ b/arch/x86/virt/vmx/tdx/tdx.h @@ -46,6 +46,7 @@ #define TDH_PHYMEM_PAGE_WBINVD 41 #define TDH_VP_WR 43 #define TDH_SYS_CONFIG 45 +#define TDH_SYS_DISABLE 69 /* * SEAMCALL leaf: diff --git a/arch/x86/virt/vmx/tdx/tdx.c b/arch/x86/virt/vmx/tdx/tdx.c index 0802d0fd18a4..3a76000dec7a 100644 --- a/arch/x86/virt/vmx/tdx/tdx.c +++ b/arch/x86/virt/vmx/tdx/tdx.c @@ -37,6 +37,7 @@ #include #include #include +#include #include #include #include @@ -1940,3 +1941,30 @@ u64 tdh_phymem_page_wbinvd_hkid(u64 hkid, struct page *page) return seamcall(TDH_PHYMEM_PAGE_WBINVD, &args); } EXPORT_SYMBOL_FOR_KVM(tdh_phymem_page_wbinvd_hkid); + +void tdx_sys_disable(void) +{ + struct tdx_module_args args = {}; + u64 ret; + + /* + * Don't loop forever. + * - TDX_INTERRUPTED_RESUMABLE guarantees forward progress between + * calls. + * - TDX_SYS_BUSY could transiently contend with TDH.SYS.* SEAMCALLs, + * but will lock out future ones. + * + * This is a 'destructive' SEAMCALL, in that no other SEAMCALL can be + * run after this until a full reinitialization is done. + */ + do { + ret = seamcall(TDH_SYS_DISABLE, &args); + } while (ret == TDX_INTERRUPTED_RESUMABLE || ret == TDX_SYS_BUSY); + + /* + * Print SEAMCALL failures, but not SW-defined error codes + * (SEAMCALL faulted with #GP/#UD, TDX not supported). + */ + if (ret && (ret & TDX_SW_ERROR) != TDX_SW_ERROR) + pr_err("TDH.SYS.DISABLE failed: 0x%016llx\n", ret); +} -- 2.53.0 Use the TDH.SYS.DISABLE SEAMCALL, which disables the TDX module, reclaims all memory resources assigned to TDX, and clears any partial-write induced poison, to allow kexec and kdump on platforms with the partial write errata. On TDX-capable platforms with the partial write erratum, kexec has been disabled because the new kernel could hit a machine check reading a previously poisoned memory location. 
Later TDX modules support TDH.SYS.DISABLE, which disables the module and reclaims all TDX memory resources, allowing the new kernel to re-initialize TDX from scratch. This operation also clears the old memory, cleaning up any poison. Add tdx_sys_disable() to tdx_shutdown(), which is called in the syscore_shutdown path for kexec. This is done just before tdx_shutdown() disables VMX on all CPUs. For kdump, call tdx_sys_disable() in the crash path before x86_virt_emergency_disable_virtualization_cpu() does VMXOFF. Since this clears any poison on TDX-managed memory, remove the X86_BUG_TDX_PW_MCE check in machine_kexec() that blocked kexec on partial write errata platforms. Co-developed-by: Rick Edgecombe Signed-off-by: Rick Edgecombe Signed-off-by: Vishal Verma --- arch/x86/kernel/crash.c | 2 ++ arch/x86/kernel/machine_kexec_64.c | 16 ---------------- arch/x86/virt/vmx/tdx/tdx.c | 1 + 3 files changed, 3 insertions(+), 16 deletions(-) diff --git a/arch/x86/kernel/crash.c b/arch/x86/kernel/crash.c index cd796818d94d..623d4474631a 100644 --- a/arch/x86/kernel/crash.c +++ b/arch/x86/kernel/crash.c @@ -38,6 +38,7 @@ #include #include #include +#include #include #include #include @@ -112,6 +113,7 @@ void native_machine_crash_shutdown(struct pt_regs *regs) crash_smp_send_stop(); + tdx_sys_disable(); x86_virt_emergency_disable_virtualization_cpu(); /* diff --git a/arch/x86/kernel/machine_kexec_64.c b/arch/x86/kernel/machine_kexec_64.c index 0590d399d4f1..c3f4a389992d 100644 --- a/arch/x86/kernel/machine_kexec_64.c +++ b/arch/x86/kernel/machine_kexec_64.c @@ -347,22 +347,6 @@ int machine_kexec_prepare(struct kimage *image) unsigned long reloc_end = (unsigned long)__relocate_kernel_end; int result; - /* - * Some early TDX-capable platforms have an erratum. A kernel - * partial write (a write transaction of less than cacheline - * lands at memory controller) to TDX private memory poisons that - * memory, and a subsequent read triggers a machine check. 
- * - * On those platforms the old kernel must reset TDX private - * memory before jumping to the new kernel otherwise the new - * kernel may see unexpected machine check. For simplicity - * just fail kexec/kdump on those platforms. - */ - if (boot_cpu_has_bug(X86_BUG_TDX_PW_MCE)) { - pr_info_once("Not allowed on platform with tdx_pw_mce bug\n"); - return -EOPNOTSUPP; - } - /* Setup the identity mapped 64bit page table */ result = init_pgtable(image, __pa(control_page)); if (result) diff --git a/arch/x86/virt/vmx/tdx/tdx.c b/arch/x86/virt/vmx/tdx/tdx.c index 3a76000dec7a..aaf22a87717a 100644 --- a/arch/x86/virt/vmx/tdx/tdx.c +++ b/arch/x86/virt/vmx/tdx/tdx.c @@ -252,6 +252,7 @@ static void tdx_shutdown_cpu(void *ign) static void tdx_shutdown(void *ign) { + tdx_sys_disable(); on_each_cpu(tdx_shutdown_cpu, NULL, 1); } -- 2.53.0 From: Rick Edgecombe Recent changes have removed the hard limitations for using kexec and TDX together. So remove the section in the TDX docs. Users on partial write erratums will need an updated TDX module to handle the rare edge cases. The docs do not currently provide any guidance on recommended TDX module versions, so don't keep a whole section around to document this interaction. Signed-off-by: Rick Edgecombe Signed-off-by: Vishal Verma --- Documentation/arch/x86/tdx.rst | 7 ------- 1 file changed, 7 deletions(-) diff --git a/Documentation/arch/x86/tdx.rst b/Documentation/arch/x86/tdx.rst index ff6b110291bc..1a3b5bac1021 100644 --- a/Documentation/arch/x86/tdx.rst +++ b/Documentation/arch/x86/tdx.rst @@ -138,13 +138,6 @@ If the platform has such erratum, the kernel prints additional message in machine check handler to tell user the machine check may be caused by kernel bug on TDX private memory. -Kexec -~~~~~~~ - -Currently kexec doesn't work on the TDX platforms with the aforementioned -erratum. It fails when loading the kexec kernel image. Otherwise it -works normally. 
- Interaction vs S3 and deeper states ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -- 2.53.0