Refactor the VCPU register state accessors to make them explicitly GPR-only. The existing register accessors operate on the cached VCPU register state. That cache holds GPRs and RIP. RIP has its own interface already. This renaming clarifies GPR access only. No functional changes intended. Signed-off-by: Chang S. Bae --- arch/x86/kvm/svm/svm.c | 8 ++++---- arch/x86/kvm/vmx/nested.c | 20 ++++++++++---------- arch/x86/kvm/vmx/vmx.c | 12 ++++++------ arch/x86/kvm/x86.c | 10 +++++----- arch/x86/kvm/x86.h | 5 ++--- arch/x86/kvm/xen.c | 2 +- 6 files changed, 28 insertions(+), 29 deletions(-) diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index 153c12dbf3eb..3aa2c37754ef 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -2473,7 +2473,7 @@ static int cr_interception(struct kvm_vcpu *vcpu) err = 0; if (cr >= 16) { /* mov to cr */ cr -= 16; - val = kvm_register_read(vcpu, reg); + val = kvm_gpr_read(vcpu, reg); trace_kvm_cr_write(cr, val); switch (cr) { case 0: @@ -2519,7 +2519,7 @@ static int cr_interception(struct kvm_vcpu *vcpu) kvm_queue_exception(vcpu, UD_VECTOR); return 1; } - kvm_register_write(vcpu, reg, val); + kvm_gpr_write(vcpu, reg, val); trace_kvm_cr_read(cr, val); } return kvm_complete_insn_gp(vcpu, err); @@ -2591,9 +2591,9 @@ static int dr_interception(struct kvm_vcpu *vcpu) dr = svm->vmcb->control.exit_code - SVM_EXIT_READ_DR0; if (dr >= 16) { /* mov to DRn */ dr -= 16; - err = kvm_set_dr(vcpu, dr, kvm_register_read(vcpu, reg)); + err = kvm_set_dr(vcpu, dr, kvm_gpr_read(vcpu, reg)); } else { - kvm_register_write(vcpu, reg, kvm_get_dr(vcpu, dr)); + kvm_gpr_write(vcpu, reg, kvm_get_dr(vcpu, dr)); } return kvm_complete_insn_gp(vcpu, err); diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index 76271962cb70..47a941989787 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -5325,9 +5325,9 @@ int get_vmx_mem_address(struct kvm_vcpu *vcpu, unsigned long exit_qualification, else if (addr_size == 0) off = (gva_t)sign_extend64(off, 15); if (base_is_valid) - off += kvm_register_read(vcpu, base_reg); + off += kvm_gpr_read(vcpu, base_reg); if (index_is_valid) - off += kvm_register_read(vcpu, index_reg) << scaling; + off += kvm_gpr_read(vcpu, index_reg) << scaling; vmx_get_segment(vcpu, &s, seg_reg); /* @@ -5719,7 +5719,7 @@ static int handle_vmread(struct kvm_vcpu *vcpu) return 1; /* Decode instruction info and find the field to read */ - field = kvm_register_read(vcpu, (((instr_info) >> 28) & 0xf)); + field = kvm_gpr_read(vcpu, (((instr_info) >> 28) & 0xf)); if (!nested_vmx_is_evmptr12_valid(vmx)) { /* @@ -5768,7 +5768,7 @@ static int handle_vmread(struct kvm_vcpu *vcpu) * on the guest's mode (32 or 64 bit), not on the given field's length. */ if (instr_info & BIT(10)) { - kvm_register_write(vcpu, (((instr_info) >> 3) & 0xf), value); + kvm_gpr_write(vcpu, (((instr_info) >> 3) & 0xf), value); } else { len = is_64_bit_mode(vcpu) ? 8 : 4; if (get_vmx_mem_address(vcpu, exit_qualification, @@ -5842,7 +5842,7 @@ static int handle_vmwrite(struct kvm_vcpu *vcpu) return nested_vmx_failInvalid(vcpu); if (instr_info & BIT(10)) - value = kvm_register_read(vcpu, (((instr_info) >> 3) & 0xf)); + value = kvm_gpr_read(vcpu, (((instr_info) >> 3) & 0xf)); else { len = is_64_bit_mode(vcpu) ? 
8 : 4; if (get_vmx_mem_address(vcpu, exit_qualification, @@ -5853,7 +5853,7 @@ static int handle_vmwrite(struct kvm_vcpu *vcpu) return kvm_handle_memory_failure(vcpu, r, &e); } - field = kvm_register_read(vcpu, (((instr_info) >> 28) & 0xf)); + field = kvm_gpr_read(vcpu, (((instr_info) >> 28) & 0xf)); offset = get_vmcs12_field_offset(field); if (offset < 0) @@ -6051,7 +6051,7 @@ static int handle_invept(struct kvm_vcpu *vcpu) vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO); gpr_index = vmx_get_instr_info_reg2(vmx_instruction_info); - type = kvm_register_read(vcpu, gpr_index); + type = kvm_gpr_read(vcpu, gpr_index); types = (vmx->nested.msrs.ept_caps >> VMX_EPT_EXTENT_SHIFT) & 6; @@ -6132,7 +6132,7 @@ static int handle_invvpid(struct kvm_vcpu *vcpu) vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO); gpr_index = vmx_get_instr_info_reg2(vmx_instruction_info); - type = kvm_register_read(vcpu, gpr_index); + type = kvm_gpr_read(vcpu, gpr_index); types = (vmx->nested.msrs.vpid_caps & VMX_VPID_EXTENT_SUPPORTED_MASK) >> 8; @@ -6406,7 +6406,7 @@ static bool nested_vmx_exit_handled_cr(struct kvm_vcpu *vcpu, switch ((exit_qualification >> 4) & 3) { case 0: /* mov to cr */ reg = (exit_qualification >> 8) & 15; - val = kvm_register_read(vcpu, reg); + val = kvm_gpr_read(vcpu, reg); switch (cr) { case 0: if (vmcs12->cr0_guest_host_mask & @@ -6492,7 +6492,7 @@ static bool nested_vmx_exit_handled_vmcs_access(struct kvm_vcpu *vcpu, /* Decode instruction info and find the field to access */ vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO); - field = kvm_register_read(vcpu, (((vmx_instruction_info) >> 28) & 0xf)); + field = kvm_gpr_read(vcpu, (((vmx_instruction_info) >> 28) & 0xf)); /* Out-of-range fields always cause a VM exit from L2 to L1 */ if (field >> 15) diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index f87c216d976d..c7d38f7692cf 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -5462,7 +5462,7 @@ static int handle_cr(struct kvm_vcpu *vcpu) reg = (exit_qualification >> 8) & 15; switch ((exit_qualification >> 4) & 3) { case 0: /* mov to cr */ - val = kvm_register_read(vcpu, reg); + val = kvm_gpr_read(vcpu, reg); trace_kvm_cr_write(cr, val); switch (cr) { case 0: @@ -5504,12 +5504,12 @@ static int handle_cr(struct kvm_vcpu *vcpu) WARN_ON_ONCE(enable_unrestricted_guest); val = kvm_read_cr3(vcpu); - kvm_register_write(vcpu, reg, val); + kvm_gpr_write(vcpu, reg, val); trace_kvm_cr_read(cr, val); return kvm_skip_emulated_instruction(vcpu); case 8: val = kvm_get_cr8(vcpu); - kvm_register_write(vcpu, reg, val); + kvm_gpr_write(vcpu, reg, val); trace_kvm_cr_read(cr, val); return kvm_skip_emulated_instruction(vcpu); } @@ -5579,10 +5579,10 @@ static int handle_dr(struct kvm_vcpu *vcpu) reg = DEBUG_REG_ACCESS_REG(exit_qualification); if (exit_qualification & TYPE_MOV_FROM_DR) { - kvm_register_write(vcpu, reg, kvm_get_dr(vcpu, dr)); + kvm_gpr_write(vcpu, reg, kvm_get_dr(vcpu, dr)); err = 0; } else { - err = kvm_set_dr(vcpu, dr, kvm_register_read(vcpu, reg)); + err = kvm_set_dr(vcpu, dr, kvm_gpr_read(vcpu, reg)); } out: @@ -5941,7 +5941,7 @@ static int handle_invpcid(struct kvm_vcpu *vcpu) vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO); gpr_index = vmx_get_instr_info_reg2(vmx_instruction_info); - type = kvm_register_read(vcpu, gpr_index); + type = kvm_gpr_read(vcpu, gpr_index); /* According to the Intel instruction reference, the memory operand * is read even if it isn't needed (e.g., for type==all) diff --git a/arch/x86/kvm/x86.c 
b/arch/x86/kvm/x86.c index b4b5d2d09634..603057ea7421 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -2084,8 +2084,8 @@ static int complete_fast_rdmsr(struct kvm_vcpu *vcpu) static int complete_fast_rdmsr_imm(struct kvm_vcpu *vcpu) { if (!vcpu->run->msr.error) - kvm_register_write(vcpu, vcpu->arch.cui_rdmsr_imm_reg, - vcpu->run->msr.data); + kvm_gpr_write(vcpu, vcpu->arch.cui_rdmsr_imm_reg, + vcpu->run->msr.data); return complete_fast_msr_access(vcpu); } @@ -2139,7 +2139,7 @@ static int __kvm_emulate_rdmsr(struct kvm_vcpu *vcpu, u32 msr, int reg, kvm_rax_write(vcpu, data & -1u); kvm_rdx_write(vcpu, (data >> 32) & -1u); } else { - kvm_register_write(vcpu, reg, data); + kvm_gpr_write(vcpu, reg, data); } } else { /* MSR read failed? See if we should ask user space */ @@ -2197,7 +2197,7 @@ EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_emulate_wrmsr); int kvm_emulate_wrmsr_imm(struct kvm_vcpu *vcpu, u32 msr, int reg) { - return __kvm_emulate_wrmsr(vcpu, msr, kvm_register_read(vcpu, reg)); + return __kvm_emulate_wrmsr(vcpu, msr, kvm_gpr_read(vcpu, reg)); } EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_emulate_wrmsr_imm); @@ -2301,7 +2301,7 @@ EXPORT_SYMBOL_FOR_KVM_INTERNAL(handle_fastpath_wrmsr); fastpath_t handle_fastpath_wrmsr_imm(struct kvm_vcpu *vcpu, u32 msr, int reg) { - return __handle_fastpath_wrmsr(vcpu, msr, kvm_register_read(vcpu, reg)); + return __handle_fastpath_wrmsr(vcpu, msr, kvm_gpr_read(vcpu, reg)); } EXPORT_SYMBOL_FOR_KVM_INTERNAL(handle_fastpath_wrmsr_imm); diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h index f3dc77f006f9..4edadd64d3d5 100644 --- a/arch/x86/kvm/x86.h +++ b/arch/x86/kvm/x86.h @@ -400,15 +400,14 @@ static inline bool vcpu_match_mmio_gpa(struct kvm_vcpu *vcpu, gpa_t gpa) return false; } -static inline unsigned long kvm_register_read(struct kvm_vcpu *vcpu, int reg) +static inline unsigned long kvm_gpr_read(struct kvm_vcpu *vcpu, int reg) { unsigned long val = kvm_register_read_raw(vcpu, reg); return is_64_bit_mode(vcpu) ? val : (u32)val; } -static inline void kvm_register_write(struct kvm_vcpu *vcpu, - int reg, unsigned long val) +static inline void kvm_gpr_write(struct kvm_vcpu *vcpu, int reg, unsigned long val) { if (!is_64_bit_mode(vcpu)) val = (u32)val; diff --git a/arch/x86/kvm/xen.c b/arch/x86/kvm/xen.c index d6b2a665b499..c9700dc88bb1 100644 --- a/arch/x86/kvm/xen.c +++ b/arch/x86/kvm/xen.c @@ -1679,7 +1679,7 @@ int kvm_xen_hypercall(struct kvm_vcpu *vcpu) bool handled = false; u8 cpl; - input = (u64)kvm_register_read(vcpu, VCPU_REGS_RAX); + input = (u64)kvm_gpr_read(vcpu, VCPU_REGS_RAX); /* Hyper-V hypercalls get bit 31 set in EAX */ if ((input & 0x80000000) && -- 2.51.0 Refactor the GPR accessors to introduce internal helpers to distinguish between legacy and extended registers. x86 CPUs introduce additional GPRs, but those registers will initially remain unused in the kernel and will not be saved in KVM register cache on every VM exit. Guest states are expected to remain live in hardware registers. This abstraction layer centralizes the selection of access methods, providing a unified interface. For now, the EGPR accessors are placeholders to be implemented later. Signed-off-by: Chang S. 
Bae --- arch/x86/include/asm/kvm_host.h | 18 ++++++++++++ arch/x86/include/asm/kvm_vcpu_regs.h | 16 ++++++++++ arch/x86/kvm/fpu.h | 6 ++++ arch/x86/kvm/x86.h | 44 ++++++++++++++++++++++++++-- 4 files changed, 82 insertions(+), 2 deletions(-) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 48598d017d6f..940f83c121cf 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -212,6 +212,24 @@ enum { VCPU_SREG_GS, VCPU_SREG_TR, VCPU_SREG_LDTR, +#ifdef CONFIG_X86_64 + VCPU_XREG_R16 = __VCPU_XREG_R16, + VCPU_XREG_R17 = __VCPU_XREG_R17, + VCPU_XREG_R18 = __VCPU_XREG_R18, + VCPU_XREG_R19 = __VCPU_XREG_R19, + VCPU_XREG_R20 = __VCPU_XREG_R20, + VCPU_XREG_R21 = __VCPU_XREG_R21, + VCPU_XREG_R22 = __VCPU_XREG_R22, + VCPU_XREG_R23 = __VCPU_XREG_R23, + VCPU_XREG_R24 = __VCPU_XREG_R24, + VCPU_XREG_R25 = __VCPU_XREG_R25, + VCPU_XREG_R26 = __VCPU_XREG_R26, + VCPU_XREG_R27 = __VCPU_XREG_R27, + VCPU_XREG_R28 = __VCPU_XREG_R28, + VCPU_XREG_R29 = __VCPU_XREG_R29, + VCPU_XREG_R30 = __VCPU_XREG_R30, + VCPU_XREG_R31 = __VCPU_XREG_R31, +#endif }; enum exit_fastpath_completion { diff --git a/arch/x86/include/asm/kvm_vcpu_regs.h b/arch/x86/include/asm/kvm_vcpu_regs.h index 1af2cb59233b..dd0cc171f405 100644 --- a/arch/x86/include/asm/kvm_vcpu_regs.h +++ b/arch/x86/include/asm/kvm_vcpu_regs.h @@ -20,6 +20,22 @@ #define __VCPU_REGS_R13 13 #define __VCPU_REGS_R14 14 #define __VCPU_REGS_R15 15 +#define __VCPU_XREG_R16 16 +#define __VCPU_XREG_R17 17 +#define __VCPU_XREG_R18 18 +#define __VCPU_XREG_R19 19 +#define __VCPU_XREG_R20 20 +#define __VCPU_XREG_R21 21 +#define __VCPU_XREG_R22 22 +#define __VCPU_XREG_R23 23 +#define __VCPU_XREG_R24 24 +#define __VCPU_XREG_R25 25 +#define __VCPU_XREG_R26 26 +#define __VCPU_XREG_R27 27 +#define __VCPU_XREG_R28 28 +#define __VCPU_XREG_R29 29 +#define __VCPU_XREG_R30 30 +#define __VCPU_XREG_R31 31 #endif #endif /* _ASM_X86_KVM_VCPU_REGS_H */ diff --git a/arch/x86/kvm/fpu.h b/arch/x86/kvm/fpu.h index 3ba12888bf66..159239b3a651 100644 --- a/arch/x86/kvm/fpu.h +++ b/arch/x86/kvm/fpu.h @@ -4,6 +4,7 @@ #define __KVM_FPU_H_ #include +#include typedef u32 __attribute__((vector_size(16))) sse128_t; #define __sse128_u union { sse128_t vec; u64 as_u64[2]; u32 as_u32[4]; } @@ -137,4 +138,9 @@ static inline void kvm_write_mmx_reg(int reg, const u64 *data) kvm_fpu_put(); } +#ifdef CONFIG_X86_64 +static inline unsigned long kvm_read_egpr(int reg) { return 0; } +static inline void kvm_write_egpr(int reg, unsigned long data) { } +#endif + #endif diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h index 4edadd64d3d5..74ae8f12b5a1 100644 --- a/arch/x86/kvm/x86.h +++ b/arch/x86/kvm/x86.h @@ -400,9 +400,49 @@ static inline bool vcpu_match_mmio_gpa(struct kvm_vcpu *vcpu, gpa_t gpa) return false; } +#ifdef CONFIG_X86_64 +static inline unsigned long _kvm_gpr_read(struct kvm_vcpu *vcpu, int reg) +{ + switch (reg) { + case VCPU_REGS_RAX ... VCPU_REGS_R15: + return kvm_register_read_raw(vcpu, reg); + case VCPU_XREG_R16 ... VCPU_XREG_R31: + return kvm_read_egpr(reg); + default: + WARN_ON_ONCE(1); + } + + return 0; +} + +static inline void _kvm_gpr_write(struct kvm_vcpu *vcpu, int reg, unsigned long val) +{ + switch (reg) { + case VCPU_REGS_RAX ... VCPU_REGS_R15: + kvm_register_write_raw(vcpu, reg, val); + break; + case VCPU_XREG_R16 ... 
VCPU_XREG_R31: + kvm_write_egpr(reg, val); + break; + default: + WARN_ON_ONCE(1); + } +} +#else +static inline unsigned long _kvm_gpr_read(struct kvm_vcpu *vcpu, int reg) +{ + return kvm_register_read_raw(vcpu, reg); +} + +static inline void _kvm_gpr_write(struct kvm_vcpu *vcpu, int reg, unsigned long val) +{ + kvm_register_write_raw(vcpu, reg, val); +} +#endif + static inline unsigned long kvm_gpr_read(struct kvm_vcpu *vcpu, int reg) { - unsigned long val = kvm_register_read_raw(vcpu, reg); + unsigned long val = _kvm_gpr_read(vcpu, reg); return is_64_bit_mode(vcpu) ? val : (u32)val; } @@ -411,7 +451,7 @@ static inline void kvm_gpr_write(struct kvm_vcpu *vcpu, int reg, unsigned long v { if (!is_64_bit_mode(vcpu)) val = (u32)val; - return kvm_register_write_raw(vcpu, reg, val); + _kvm_gpr_write(vcpu, reg, val); } static inline bool kvm_check_has_quirk(struct kvm *kvm, u64 quirk) -- 2.51.0 Add helpers to directly read and write EGPRs (R16–R31). Unlike legacy GPRs, EGPRs are not cached in vcpu->arch.regs[]. Their contents remain live in hardware. If preempted, the EGPR state is preserved in the guest XSAVE buffer. The Advanced Performance Extensions (APX) feature introduces EGPRs as an XSAVE-managed state component. The new helpers access the registers directly between kvm_fpu_get() and kvm_fpu_put(). Callers should ensure that EGPRs are enabled before using these helpers. Signed-off-by: Chang S. Bae --- RFC note: There may be alternative options for EGPR access. If the EGPR state is saved in the guest fpstate, KVM could read or write it there instead. However, since EGPR-related VM exits are expected to be rare, adding extra complexity and overhead at this stage doesn’t seem worthwhile. --- arch/x86/kvm/fpu.h | 80 ++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 78 insertions(+), 2 deletions(-) diff --git a/arch/x86/kvm/fpu.h b/arch/x86/kvm/fpu.h index 159239b3a651..aa35bdf1a073 100644 --- a/arch/x86/kvm/fpu.h +++ b/arch/x86/kvm/fpu.h @@ -96,6 +96,61 @@ static inline void _kvm_write_mmx_reg(int reg, const u64 *data) } } +#ifdef CONFIG_X86_64 +/* + * Accessors for extended general-purpose registers. binutils >= 2.43 can + * recognize those register symbols.
+ */ + +static inline void _kvm_read_egpr(int reg, unsigned long *data) +{ + /* mov %r16..%r31, %rax */ + switch (reg) { + case __VCPU_XREG_R16: asm(".byte 0xd5, 0x48, 0x89, 0xc0" : "=a"(*data)); break; + case __VCPU_XREG_R17: asm(".byte 0xd5, 0x48, 0x89, 0xc8" : "=a"(*data)); break; + case __VCPU_XREG_R18: asm(".byte 0xd5, 0x48, 0x89, 0xd0" : "=a"(*data)); break; + case __VCPU_XREG_R19: asm(".byte 0xd5, 0x48, 0x89, 0xd8" : "=a"(*data)); break; + case __VCPU_XREG_R20: asm(".byte 0xd5, 0x48, 0x89, 0xe0" : "=a"(*data)); break; + case __VCPU_XREG_R21: asm(".byte 0xd5, 0x48, 0x89, 0xe8" : "=a"(*data)); break; + case __VCPU_XREG_R22: asm(".byte 0xd5, 0x48, 0x89, 0xf0" : "=a"(*data)); break; + case __VCPU_XREG_R23: asm(".byte 0xd5, 0x48, 0x89, 0xf8" : "=a"(*data)); break; + case __VCPU_XREG_R24: asm(".byte 0xd5, 0x4c, 0x89, 0xc0" : "=a"(*data)); break; + case __VCPU_XREG_R25: asm(".byte 0xd5, 0x4c, 0x89, 0xc8" : "=a"(*data)); break; + case __VCPU_XREG_R26: asm(".byte 0xd5, 0x4c, 0x89, 0xd0" : "=a"(*data)); break; + case __VCPU_XREG_R27: asm(".byte 0xd5, 0x4c, 0x89, 0xd8" : "=a"(*data)); break; + case __VCPU_XREG_R28: asm(".byte 0xd5, 0x4c, 0x89, 0xe0" : "=a"(*data)); break; + case __VCPU_XREG_R29: asm(".byte 0xd5, 0x4c, 0x89, 0xe8" : "=a"(*data)); break; + case __VCPU_XREG_R30: asm(".byte 0xd5, 0x4c, 0x89, 0xf0" : "=a"(*data)); break; + case __VCPU_XREG_R31: asm(".byte 0xd5, 0x4c, 0x89, 0xf8" : "=a"(*data)); break; + default: BUG(); + } +} + +static inline void _kvm_write_egpr(int reg, unsigned long *data) +{ + /* mov %rax, %r16...%r31*/ + switch (reg) { + case __VCPU_XREG_R16: asm(".byte 0xd5, 0x18, 0x89, 0xc0" : : "a"(*data)); break; + case __VCPU_XREG_R17: asm(".byte 0xd5, 0x18, 0x89, 0xc1" : : "a"(*data)); break; + case __VCPU_XREG_R18: asm(".byte 0xd5, 0x18, 0x89, 0xc2" : : "a"(*data)); break; + case __VCPU_XREG_R19: asm(".byte 0xd5, 0x18, 0x89, 0xc3" : : "a"(*data)); break; + case __VCPU_XREG_R20: asm(".byte 0xd5, 0x18, 0x89, 0xc4" : : "a"(*data)); break; + case __VCPU_XREG_R21: asm(".byte 0xd5, 0x18, 0x89, 0xc5" : : "a"(*data)); break; + case __VCPU_XREG_R22: asm(".byte 0xd5, 0x18, 0x89, 0xc6" : : "a"(*data)); break; + case __VCPU_XREG_R23: asm(".byte 0xd5, 0x18, 0x89, 0xc7" : : "a"(*data)); break; + case __VCPU_XREG_R24: asm(".byte 0xd5, 0x19, 0x89, 0xc0" : : "a"(*data)); break; + case __VCPU_XREG_R25: asm(".byte 0xd5, 0x19, 0x89, 0xc1" : : "a"(*data)); break; + case __VCPU_XREG_R26: asm(".byte 0xd5, 0x19, 0x89, 0xc2" : : "a"(*data)); break; + case __VCPU_XREG_R27: asm(".byte 0xd5, 0x19, 0x89, 0xc3" : : "a"(*data)); break; + case __VCPU_XREG_R28: asm(".byte 0xd5, 0x19, 0x89, 0xc4" : : "a"(*data)); break; + case __VCPU_XREG_R29: asm(".byte 0xd5, 0x19, 0x89, 0xc5" : : "a"(*data)); break; + case __VCPU_XREG_R30: asm(".byte 0xd5, 0x19, 0x89, 0xc6" : : "a"(*data)); break; + case __VCPU_XREG_R31: asm(".byte 0xd5, 0x19, 0x89, 0xc7" : : "a"(*data)); break; + default: BUG(); + } +} +#endif + static inline void kvm_fpu_get(void) { fpregs_lock(); @@ -139,8 +194,29 @@ static inline void kvm_write_mmx_reg(int reg, const u64 *data) } #ifdef CONFIG_X86_64 -static inline unsigned long kvm_read_egpr(int reg) { return 0; } -static inline void kvm_write_egpr(int reg, unsigned long data) { } +static inline unsigned long kvm_read_egpr(int reg) +{ + unsigned long data; + + if (WARN_ON_ONCE(!cpu_has_xfeatures(XFEATURE_MASK_APX, NULL))) + return 0; + + kvm_fpu_get(); + _kvm_read_egpr(reg, &data); + kvm_fpu_put(); + + return data; +} + +static inline void kvm_write_egpr(int reg, unsigned long data) +{ + if 
(WARN_ON_ONCE(!cpu_has_xfeatures(XFEATURE_MASK_APX, NULL))) + return; + + kvm_fpu_get(); + _kvm_write_egpr(reg, &data); + kvm_fpu_put(); +} #endif #endif -- 2.51.0 Define a unified data structure that can represent both the legacy and extended VMX instruction information formats. VMX provides per-instruction metadata for VM exits to help decode the attributes of the instruction that triggered the exit. The legacy format, however, only supports up to 16 GPRs and thus cannot represent EGPRs. To support these new registers, VMX introduces an extended 64-bit layout. Instead of maintaining separate storage for each format, a single union structure makes the overall handling simple. The field names are consistent across both layouts. While the presence of certain fields depends on the instruction type, the offsets remain fixed within each format. Signed-off-by: Chang S. Bae --- arch/x86/kvm/vmx/vmx.h | 61 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 61 insertions(+) diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h index ea93121029f9..c358aca7253c 100644 --- a/arch/x86/kvm/vmx/vmx.h +++ b/arch/x86/kvm/vmx/vmx.h @@ -311,6 +311,67 @@ struct kvm_vmx { u64 *pid_table; }; +/* + * 32-bit layout of the legacy instruction information field. This format + * supports the 16 legacy GPRs. + */ +struct base_insn_info { + u32 scale : 2; /* Scaling factor */ + u32 reserved1 : 1; + u32 reg1 : 4; /* First register index */ + u32 asize : 3; /* Address size */ + u32 is_reg : 1; /* 0: memory, 1: register */ + u32 osize : 2; /* Operand size */ + u32 reserved2 : 2; + u32 seg : 3; /* Segment register index */ + u32 index : 4; /* Index register index */ + u32 index_invalid : 1; /* 0: valid, 1: invalid */ + u32 base : 4; /* Base register index */ + u32 base_invalid : 1; /* 0: valid, 1: invalid */ + u32 reg2 : 4; /* Second register index */ +}; + +/* + * 64-bit layout of the extended instruction information field, which + * supports EGPRs. + */ +struct ext_insn_info { + u64 scale : 2; /* Scaling factor */ + u64 asize : 2; /* Address size */ + u64 is_reg : 1; /* 0: memory, 1: register */ + u64 osize : 2; /* Operand size */ + u64 seg : 3; /* Segment register index */ + u64 index_invalid : 1; /* 0: valid, 1: invalid */ + u64 base_invalid : 1; /* 0: valid, 1: invalid */ + u64 reserved1 : 4; + u64 reg1 : 5; /* First register index */ + u64 reserved2 : 3; + u64 index : 5; /* Index register index */ + u64 reserved3 : 3; + u64 base : 5; /* Base register index */ + u64 reserved4 : 3; + u64 reg2 : 5; /* Second register index */ + u64 reserved5 : 19; +}; + +/* Union for accessing either the legacy or extended format. */ +union insn_info { + struct base_insn_info base; + struct ext_insn_info ext; + u32 word; + u64 dword; +}; + +/* + * Wrapper structure combining the instruction info and a flag indicating + * whether the extended layout is in use. + */ +struct vmx_insn_info { + /* true if using the extended layout */ + bool extended; + union insn_info info; +}; + static __always_inline struct vcpu_vt *to_vt(struct kvm_vcpu *vcpu) { return &(container_of(vcpu, struct vcpu_vmx, vcpu)->vt); -- 2.51.0 Introduce helpers to convert and extract exited instruction attributes, preparing for EGPR support and deprecating some existing helpers. Previously, VMX exit handlers directly decoded the raw VMCS field, resulting in duplicated logic and assumption tied to the legacy layout. With the unified structure, handlers can convert raw data into a structure form and access each instruction attribute by field name. 
The helper will later determine the format based on the VCPU configuration. For now, there is no functional change since only the legacy layout is used. Signed-off-by: Chang S. Bae --- RFC note: Macro and variable naming may still evolve depending on maintainer/reviewer preferences. --- arch/x86/kvm/vmx/nested.c | 73 +++++++++++++++++++-------------------- arch/x86/kvm/vmx/nested.h | 2 +- arch/x86/kvm/vmx/vmx.c | 14 ++++---- arch/x86/kvm/vmx/vmx.h | 23 ++++++------ 4 files changed, 57 insertions(+), 55 deletions(-) diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index 47a941989787..4b883ded6c4b 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -5289,7 +5289,7 @@ static void nested_vmx_triple_fault(struct kvm_vcpu *vcpu) * #UD, #GP, or #SS. */ int get_vmx_mem_address(struct kvm_vcpu *vcpu, unsigned long exit_qualification, - u32 vmx_instruction_info, bool wr, int len, gva_t *ret) + struct vmx_insn_info info, bool wr, int len, gva_t *ret) { gva_t off; bool exn; @@ -5303,14 +5303,14 @@ int get_vmx_mem_address(struct kvm_vcpu *vcpu, unsigned long exit_qualification, * For how an actual address is calculated from all these components, * refer to Vol. 1, "Operand Addressing". */ - int scaling = vmx_instruction_info & 3; - int addr_size = (vmx_instruction_info >> 7) & 7; - bool is_reg = vmx_instruction_info & (1u << 10); - int seg_reg = (vmx_instruction_info >> 15) & 7; - int index_reg = (vmx_instruction_info >> 18) & 0xf; - bool index_is_valid = !(vmx_instruction_info & (1u << 22)); - int base_reg = (vmx_instruction_info >> 23) & 0xf; - bool base_is_valid = !(vmx_instruction_info & (1u << 27)); + int scaling = insn_attr(info, scale); + int addr_size = insn_attr(info, asize); + bool is_reg = insn_attr(info, is_reg); + int seg_reg = insn_attr(info, seg); + int index_reg = insn_attr(info, index); + bool index_is_valid = !insn_attr(info, index_invalid); + int base_reg = insn_attr(info, base); + bool base_is_valid = !insn_attr(info, base_invalid); if (is_reg) { kvm_queue_exception(vcpu, UD_VECTOR); @@ -5421,7 +5421,7 @@ static int nested_vmx_get_vmptr(struct kvm_vcpu *vcpu, gpa_t *vmpointer, int r; if (get_vmx_mem_address(vcpu, vmx_get_exit_qual(vcpu), - vmcs_read32(VMX_INSTRUCTION_INFO), false, + vmx_get_insn_info(vcpu), false, sizeof(*vmpointer), &gva)) { *ret = 1; return -EINVAL; @@ -5706,7 +5706,7 @@ static int handle_vmread(struct kvm_vcpu *vcpu) struct vmcs12 *vmcs12 = is_guest_mode(vcpu) ? get_shadow_vmcs12(vcpu) : get_vmcs12(vcpu); unsigned long exit_qualification = vmx_get_exit_qual(vcpu); - u32 instr_info = vmcs_read32(VMX_INSTRUCTION_INFO); + struct vmx_insn_info info = vmx_get_insn_info(vcpu); struct vcpu_vmx *vmx = to_vmx(vcpu); struct x86_exception e; unsigned long field; @@ -5719,7 +5719,7 @@ static int handle_vmread(struct kvm_vcpu *vcpu) return 1; /* Decode instruction info and find the field to read */ - field = kvm_gpr_read(vcpu, (((instr_info) >> 28) & 0xf)); + field = kvm_gpr_read(vcpu, insn_attr(info, reg2)); if (!nested_vmx_is_evmptr12_valid(vmx)) { /* @@ -5767,12 +5767,12 @@ static int handle_vmread(struct kvm_vcpu *vcpu) * Note that the number of bits actually copied is 32 or 64 depending * on the guest's mode (32 or 64 bit), not on the given field's length. */ - if (instr_info & BIT(10)) { - kvm_gpr_write(vcpu, (((instr_info) >> 3) & 0xf), value); + if (insn_attr(info, is_reg)) { + kvm_gpr_write(vcpu, insn_attr(info, reg1), value); } else { len = is_64_bit_mode(vcpu) ? 
8 : 4; if (get_vmx_mem_address(vcpu, exit_qualification, - instr_info, true, len, &gva)) + info, true, len, &gva)) return 1; /* _system ok, nested_vmx_check_permission has verified cpl=0 */ r = kvm_write_guest_virt_system(vcpu, gva, &value, len, &e); @@ -5812,7 +5812,7 @@ static int handle_vmwrite(struct kvm_vcpu *vcpu) struct vmcs12 *vmcs12 = is_guest_mode(vcpu) ? get_shadow_vmcs12(vcpu) : get_vmcs12(vcpu); unsigned long exit_qualification = vmx_get_exit_qual(vcpu); - u32 instr_info = vmcs_read32(VMX_INSTRUCTION_INFO); + struct vmx_insn_info info = vmx_get_insn_info(vcpu); struct vcpu_vmx *vmx = to_vmx(vcpu); struct x86_exception e; unsigned long field; @@ -5841,19 +5841,19 @@ static int handle_vmwrite(struct kvm_vcpu *vcpu) get_vmcs12(vcpu)->vmcs_link_pointer == INVALID_GPA)) return nested_vmx_failInvalid(vcpu); - if (instr_info & BIT(10)) - value = kvm_gpr_read(vcpu, (((instr_info) >> 3) & 0xf)); + if (insn_attr(info, is_reg)) + value = kvm_gpr_read(vcpu, insn_attr(info, reg1)); else { len = is_64_bit_mode(vcpu) ? 8 : 4; if (get_vmx_mem_address(vcpu, exit_qualification, - instr_info, false, len, &gva)) + info, false, len, &gva)) return 1; r = kvm_read_guest_virt(vcpu, gva, &value, len, &e); if (r != X86EMUL_CONTINUE) return kvm_handle_memory_failure(vcpu, r, &e); } - field = kvm_gpr_read(vcpu, (((instr_info) >> 28) & 0xf)); + field = kvm_gpr_read(vcpu, insn_attr(info, reg2)); offset = get_vmcs12_field_offset(field); if (offset < 0) @@ -6001,7 +6001,7 @@ static int handle_vmptrld(struct kvm_vcpu *vcpu) static int handle_vmptrst(struct kvm_vcpu *vcpu) { unsigned long exit_qual = vmx_get_exit_qual(vcpu); - u32 instr_info = vmcs_read32(VMX_INSTRUCTION_INFO); + struct vmx_insn_info info = vmx_get_insn_info(vcpu); gpa_t current_vmptr = to_vmx(vcpu)->nested.current_vmptr; struct x86_exception e; gva_t gva; @@ -6013,7 +6013,7 @@ static int handle_vmptrst(struct kvm_vcpu *vcpu) if (unlikely(nested_vmx_is_evmptr12_valid(to_vmx(vcpu)))) return 1; - if (get_vmx_mem_address(vcpu, exit_qual, instr_info, + if (get_vmx_mem_address(vcpu, exit_qual, info, true, sizeof(gpa_t), &gva)) return 1; /* *_system ok, nested_vmx_check_permission has verified cpl=0 */ @@ -6029,15 +6029,16 @@ static int handle_vmptrst(struct kvm_vcpu *vcpu) static int handle_invept(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); - u32 vmx_instruction_info, types; unsigned long type, roots_to_free; + struct vmx_insn_info info; struct kvm_mmu *mmu; gva_t gva; struct x86_exception e; struct { u64 eptp, gpa; } operand; - int i, r, gpr_index; + u32 types; + int i, r; if (!(vmx->nested.msrs.secondary_ctls_high & SECONDARY_EXEC_ENABLE_EPT) || @@ -6049,9 +6050,8 @@ static int handle_invept(struct kvm_vcpu *vcpu) if (!nested_vmx_check_permission(vcpu)) return 1; - vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO); - gpr_index = vmx_get_instr_info_reg2(vmx_instruction_info); - type = kvm_gpr_read(vcpu, gpr_index); + info = vmx_get_insn_info(vcpu); + type = kvm_gpr_read(vcpu, insn_attr(info, reg2)); types = (vmx->nested.msrs.ept_caps >> VMX_EPT_EXTENT_SHIFT) & 6; @@ -6062,7 +6062,7 @@ static int handle_invept(struct kvm_vcpu *vcpu) * operand is read even if it isn't needed (e.g., for type==global) */ if (get_vmx_mem_address(vcpu, vmx_get_exit_qual(vcpu), - vmx_instruction_info, false, sizeof(operand), &gva)) + info, false, sizeof(operand), &gva)) return 1; r = kvm_read_guest_virt(vcpu, gva, &operand, sizeof(operand), &e); if (r != X86EMUL_CONTINUE) @@ -6109,7 +6109,7 @@ static int handle_invept(struct kvm_vcpu *vcpu) 
static int handle_invvpid(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); - u32 vmx_instruction_info; + struct vmx_insn_info info; unsigned long type, types; gva_t gva; struct x86_exception e; @@ -6118,7 +6118,7 @@ static int handle_invvpid(struct kvm_vcpu *vcpu) u64 gla; } operand; u16 vpid02; - int r, gpr_index; + int r; if (!(vmx->nested.msrs.secondary_ctls_high & SECONDARY_EXEC_ENABLE_VPID) || @@ -6130,9 +6130,8 @@ static int handle_invvpid(struct kvm_vcpu *vcpu) if (!nested_vmx_check_permission(vcpu)) return 1; - vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO); - gpr_index = vmx_get_instr_info_reg2(vmx_instruction_info); - type = kvm_gpr_read(vcpu, gpr_index); + info = vmx_get_insn_info(vcpu); + type = kvm_gpr_read(vcpu, insn_attr(info, reg2)); types = (vmx->nested.msrs.vpid_caps & VMX_VPID_EXTENT_SUPPORTED_MASK) >> 8; @@ -6145,7 +6144,7 @@ static int handle_invvpid(struct kvm_vcpu *vcpu) * operand is read even if it isn't needed (e.g., for type==global) */ if (get_vmx_mem_address(vcpu, vmx_get_exit_qual(vcpu), - vmx_instruction_info, false, sizeof(operand), &gva)) + info, false, sizeof(operand), &gva)) return 1; r = kvm_read_guest_virt(vcpu, gva, &operand, sizeof(operand), &e); if (r != X86EMUL_CONTINUE) @@ -6483,7 +6482,7 @@ static bool nested_vmx_exit_handled_encls(struct kvm_vcpu *vcpu, static bool nested_vmx_exit_handled_vmcs_access(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, gpa_t bitmap) { - u32 vmx_instruction_info; + struct vmx_insn_info info; unsigned long field; u8 b; @@ -6491,8 +6490,8 @@ static bool nested_vmx_exit_handled_vmcs_access(struct kvm_vcpu *vcpu, return true; /* Decode instruction info and find the field to access */ - vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO); - field = kvm_gpr_read(vcpu, (((vmx_instruction_info) >> 28) & 0xf)); + info = vmx_get_insn_info(vcpu); + field = kvm_gpr_read(vcpu, insn_attr(info, reg2)); /* Out-of-range fields always cause a VM exit from L2 to L1 */ if (field >> 15) diff --git a/arch/x86/kvm/vmx/nested.h b/arch/x86/kvm/vmx/nested.h index 983484d42ebf..e54f4e7b3664 100644 --- a/arch/x86/kvm/vmx/nested.h +++ b/arch/x86/kvm/vmx/nested.h @@ -50,7 +50,7 @@ void nested_sync_vmcs12_to_shadow(struct kvm_vcpu *vcpu); int vmx_set_vmx_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data); int vmx_get_vmx_msr(struct nested_vmx_msrs *msrs, u32 msr_index, u64 *pdata); int get_vmx_mem_address(struct kvm_vcpu *vcpu, unsigned long exit_qualification, - u32 vmx_instruction_info, bool wr, int len, gva_t *ret); + struct vmx_insn_info info, bool wr, int len, gva_t *ret); void nested_mark_vmcs12_pages_dirty(struct kvm_vcpu *vcpu); bool nested_vmx_check_io_bitmaps(struct kvm_vcpu *vcpu, unsigned int port, int size); diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index c7d38f7692cf..dd8c9517c38c 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -5925,29 +5925,27 @@ static int handle_monitor_trap(struct kvm_vcpu *vcpu) static int handle_invpcid(struct kvm_vcpu *vcpu) { - u32 vmx_instruction_info; + struct vmx_insn_info info; unsigned long type; gva_t gva; struct { u64 pcid; u64 gla; } operand; - int gpr_index; if (!guest_cpu_cap_has(vcpu, X86_FEATURE_INVPCID)) { kvm_queue_exception(vcpu, UD_VECTOR); return 1; } - vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO); - gpr_index = vmx_get_instr_info_reg2(vmx_instruction_info); - type = kvm_gpr_read(vcpu, gpr_index); + info = vmx_get_insn_info(vcpu); + type = kvm_gpr_read(vcpu, insn_attr(info, reg2)); /* According to the Intel 
instruction reference, the memory operand * is read even if it isn't needed (e.g., for type==all) */ if (get_vmx_mem_address(vcpu, vmx_get_exit_qual(vcpu), - vmx_instruction_info, false, + info, false, sizeof(operand), &gva)) return 1; @@ -6084,7 +6082,9 @@ static int handle_notify(struct kvm_vcpu *vcpu) static int vmx_get_msr_imm_reg(struct kvm_vcpu *vcpu) { - return vmx_get_instr_info_reg(vmcs_read32(VMX_INSTRUCTION_INFO)); + struct vmx_insn_info info = vmx_get_insn_info(vcpu); + + return insn_attr(info, reg1); } static int handle_rdmsr_imm(struct kvm_vcpu *vcpu) diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h index c358aca7253c..a58d9187ed1d 100644 --- a/arch/x86/kvm/vmx/vmx.h +++ b/arch/x86/kvm/vmx/vmx.h @@ -372,6 +372,19 @@ struct vmx_insn_info { union insn_info info; }; +static inline struct vmx_insn_info vmx_get_insn_info(struct kvm_vcpu *vcpu __maybe_unused) +{ + struct vmx_insn_info insn; + + insn.extended = false; + insn.info.word = vmcs_read32(VMX_INSTRUCTION_INFO); + + return insn; +} + +#define insn_attr(insn, attr) \ + ((insn).extended ? (insn).info.ext.attr : (insn).info.base.attr) + static __always_inline struct vcpu_vt *to_vt(struct kvm_vcpu *vcpu) { return &(container_of(vcpu, struct vcpu_vmx, vcpu)->vt); @@ -780,16 +793,6 @@ static inline bool vmx_guest_state_valid(struct kvm_vcpu *vcpu) void dump_vmcs(struct kvm_vcpu *vcpu); -static inline int vmx_get_instr_info_reg(u32 vmx_instr_info) -{ - return (vmx_instr_info >> 3) & 0xf; -} - -static inline int vmx_get_instr_info_reg2(u32 vmx_instr_info) -{ - return (vmx_instr_info >> 28) & 0xf; -} - static inline bool vmx_can_use_ipiv(struct kvm_vcpu *vcpu) { return lapic_in_kernel(vcpu) && enable_ipiv; -- 2.51.0 Introduce a helper to extract the GPR index from the exit qualification field. Some VMX exit qualification, in addition to the VMX instruction info field, encode a GPR index. With the introduction of EGPRs, this field is extended by a previously reserved bit position. This refactoring centralizes the logic so that future updates can handle the extended GPR index without code duplication. Since the VMCS exit qualification is cached in VCPU state, it is safe for the helper to access it directly via the VCPU pointer. This argument will also be used later to determine EGPR availability. No functional change intended. Signed-off-by: Chang S. 
Bae --- arch/x86/kvm/vmx/nested.c | 2 +- arch/x86/kvm/vmx/vmx.c | 2 +- arch/x86/kvm/vmx/vmx.h | 5 +++++ 3 files changed, 7 insertions(+), 2 deletions(-) diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index 4b883ded6c4b..97ec8e594155 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -6404,7 +6404,7 @@ static bool nested_vmx_exit_handled_cr(struct kvm_vcpu *vcpu, switch ((exit_qualification >> 4) & 3) { case 0: /* mov to cr */ - reg = (exit_qualification >> 8) & 15; + reg = vmx_get_exit_qual_gpr(vcpu); val = kvm_gpr_read(vcpu, reg); switch (cr) { case 0: diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index dd8c9517c38c..4405724cb874 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -5459,7 +5459,7 @@ static int handle_cr(struct kvm_vcpu *vcpu) exit_qualification = vmx_get_exit_qual(vcpu); cr = exit_qualification & 15; - reg = (exit_qualification >> 8) & 15; + reg = vmx_get_exit_qual_gpr(vcpu); switch ((exit_qualification >> 4) & 3) { case 0: /* mov to cr */ val = kvm_gpr_read(vcpu, reg); diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h index a58d9187ed1d..64a0772c883c 100644 --- a/arch/x86/kvm/vmx/vmx.h +++ b/arch/x86/kvm/vmx/vmx.h @@ -411,6 +411,11 @@ static __always_inline unsigned long vmx_get_exit_qual(struct kvm_vcpu *vcpu) return vt->exit_qualification; } +static inline int vmx_get_exit_qual_gpr(struct kvm_vcpu *vcpu) +{ + return (vmx_get_exit_qual(vcpu) >> 8) & 0xf; +} + static __always_inline u32 vmx_get_intr_info(struct kvm_vcpu *vcpu) { struct vcpu_vt *vt = to_vt(vcpu); -- 2.51.0 Define the VMCS field offset for the extended instruction information and handle it for nested VMX. When EGPRs are available, VMX provides a new 64-bit field to extend the legacy instruction information, allowing access to the higher register indices. Then, nested VMX needs to propagate this field between L1 and L2. The EGPR checker will be implemented later. Signed-off-by: Chang S. Bae --- RFC note: During the draft, I brought up the offset definition initially for non-nested VMX primarily. Then, I realized the switching helper affects nVMX code anyway. Due to this dependency, this change is placed first together with the offset definition. 
--- arch/x86/include/asm/vmx.h | 2 ++ arch/x86/kvm/vmx/nested.c | 2 ++ arch/x86/kvm/vmx/vmcs12.c | 1 + arch/x86/kvm/vmx/vmcs12.h | 3 ++- arch/x86/kvm/vmx/vmx.h | 2 ++ 5 files changed, 9 insertions(+), 1 deletion(-) diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h index c85c50019523..ab0684948c56 100644 --- a/arch/x86/include/asm/vmx.h +++ b/arch/x86/include/asm/vmx.h @@ -264,6 +264,8 @@ enum vmcs_field { PID_POINTER_TABLE_HIGH = 0x00002043, GUEST_PHYSICAL_ADDRESS = 0x00002400, GUEST_PHYSICAL_ADDRESS_HIGH = 0x00002401, + EXTENDED_INSTRUCTION_INFO = 0x00002406, + EXTENDED_INSTRUCTION_INFO_HIGH = 0x00002407, VMCS_LINK_POINTER = 0x00002800, VMCS_LINK_POINTER_HIGH = 0x00002801, GUEST_IA32_DEBUGCTL = 0x00002802, diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index 97ec8e594155..3442610a6b70 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -4798,6 +4798,8 @@ static void prepare_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, vmcs12->vm_exit_intr_info = exit_intr_info; vmcs12->vm_exit_instruction_len = exit_insn_len; vmcs12->vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO); + if (vmx_egpr_enabled(vcpu)) + vmcs12->extended_instruction_info = vmcs_read64(EXTENDED_INSTRUCTION_INFO); /* * According to spec, there's no need to store the guest's diff --git a/arch/x86/kvm/vmx/vmcs12.c b/arch/x86/kvm/vmx/vmcs12.c index 4233b5ca9461..ea2b690a419e 100644 --- a/arch/x86/kvm/vmx/vmcs12.c +++ b/arch/x86/kvm/vmx/vmcs12.c @@ -53,6 +53,7 @@ const unsigned short vmcs12_field_offsets[] = { FIELD64(XSS_EXIT_BITMAP, xss_exit_bitmap), FIELD64(ENCLS_EXITING_BITMAP, encls_exiting_bitmap), FIELD64(GUEST_PHYSICAL_ADDRESS, guest_physical_address), + FIELD64(EXTENDED_INSTRUCTION_INFO, extended_instruction_info), FIELD64(VMCS_LINK_POINTER, vmcs_link_pointer), FIELD64(GUEST_IA32_DEBUGCTL, guest_ia32_debugctl), FIELD64(GUEST_IA32_PAT, guest_ia32_pat), diff --git a/arch/x86/kvm/vmx/vmcs12.h b/arch/x86/kvm/vmx/vmcs12.h index 4ad6b16525b9..2146e45aaade 100644 --- a/arch/x86/kvm/vmx/vmcs12.h +++ b/arch/x86/kvm/vmx/vmcs12.h @@ -71,7 +71,7 @@ struct __packed vmcs12 { u64 pml_address; u64 encls_exiting_bitmap; u64 tsc_multiplier; - u64 padding64[1]; /* room for future expansion */ + u64 extended_instruction_info; /* * To allow migration of L1 (complete with its L2 guests) between * machines of different natural widths (32 or 64 bit), we cannot have @@ -261,6 +261,7 @@ static inline void vmx_check_vmcs12_offsets(void) CHECK_OFFSET(pml_address, 312); CHECK_OFFSET(encls_exiting_bitmap, 320); CHECK_OFFSET(tsc_multiplier, 328); + CHECK_OFFSET(extended_instruction_info, 336); CHECK_OFFSET(cr0_guest_host_mask, 344); CHECK_OFFSET(cr4_guest_host_mask, 352); CHECK_OFFSET(cr0_read_shadow, 360); diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h index 64a0772c883c..b8da6ebc35dc 100644 --- a/arch/x86/kvm/vmx/vmx.h +++ b/arch/x86/kvm/vmx/vmx.h @@ -372,6 +372,8 @@ struct vmx_insn_info { union insn_info info; }; +static inline bool vmx_egpr_enabled(struct kvm_vcpu *vcpu __maybe_unused) { return false; } + static inline struct vmx_insn_info vmx_get_insn_info(struct kvm_vcpu *vcpu __maybe_unused) { struct vmx_insn_info insn; -- 2.51.0 Support 5-bit register indices in VMCS fields when EGPRs are enabled. Signed-off-by: Chang S. Bae --- RFC note: The "chicken bit" (XCR0.APX) checker is intentionally deferred, as the emulator in the next series will do a similar check. Consolidating the XCR0 handling at the end keeps the logic clearer during the feature exposition.
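For illustration of why the wider indices matter: a MOV-to-CR exit from %r20 encodes GPR index 20 (10100b) starting at bit 8 of the exit qualification. The legacy 4-bit decode, (qual >> 8) & 0xf, would fold that onto index 4 (%rsp), while the 5-bit decode, (qual >> 8) & 0x1f, preserves the full index. The register fields in the extended instruction information layout widen from 4 to 5 bits for the same reason.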
--- arch/x86/kvm/vmx/vmx.h | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h index b8da6ebc35dc..6cf1eb739caf 100644 --- a/arch/x86/kvm/vmx/vmx.h +++ b/arch/x86/kvm/vmx/vmx.h @@ -374,12 +374,17 @@ struct vmx_insn_info { static inline bool vmx_egpr_enabled(struct kvm_vcpu *vcpu __maybe_unused) { return false; } -static inline struct vmx_insn_info vmx_get_insn_info(struct kvm_vcpu *vcpu __maybe_unused) +static inline struct vmx_insn_info vmx_get_insn_info(struct kvm_vcpu *vcpu) { struct vmx_insn_info insn; - insn.extended = false; - insn.info.word = vmcs_read32(VMX_INSTRUCTION_INFO); + if (vmx_egpr_enabled(vcpu)) { + insn.extended = true; + insn.info.dword = vmcs_read64(EXTENDED_INSTRUCTION_INFO); + } else { + insn.extended = false; + insn.info.word = vmcs_read32(VMX_INSTRUCTION_INFO); + } return insn; } @@ -415,7 +420,10 @@ static __always_inline unsigned long vmx_get_exit_qual(struct kvm_vcpu *vcpu) static inline int vmx_get_exit_qual_gpr(struct kvm_vcpu *vcpu) { - return (vmx_get_exit_qual(vcpu) >> 8) & 0xf; + if (vmx_egpr_enabled(vcpu)) + return (vmx_get_exit_qual(vcpu) >> 8) & 0x1f; + else + return (vmx_get_exit_qual(vcpu) >> 8) & 0xf; } static __always_inline u32 vmx_get_intr_info(struct kvm_vcpu *vcpu) -- 2.51.0 Extend the emulator context and GPR accessors to handle EGPRs before adding support for REX2-prefixed instructions. Now the KVM GPR accessors can handle EGPRs. Then, the emulator can uniformly cache and track all GPRs without requiring separate handling. Signed-off-by: Chang S. Bae --- arch/x86/kvm/kvm_emulate.h | 10 +++++----- arch/x86/kvm/x86.c | 4 ++-- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/arch/x86/kvm/kvm_emulate.h b/arch/x86/kvm/kvm_emulate.h index 7b5ddb787a25..153c70ea5561 100644 --- a/arch/x86/kvm/kvm_emulate.h +++ b/arch/x86/kvm/kvm_emulate.h @@ -105,13 +105,13 @@ struct x86_instruction_info { struct x86_emulate_ops { void (*vm_bugged)(struct x86_emulate_ctxt *ctxt); /* - * read_gpr: read a general purpose register (rax - r15) + * read_gpr: read a general purpose register (rax - r31) * * @reg: gpr number. */ ulong (*read_gpr)(struct x86_emulate_ctxt *ctxt, unsigned reg); /* - * write_gpr: write a general purpose register (rax - r15) + * write_gpr: write a general purpose register (rax - r31) * * @reg: gpr number. * @val: value to write. @@ -312,7 +312,7 @@ typedef void (*fastop_t)(struct fastop *); * a ModRM or SIB byte. 
*/ #ifdef CONFIG_X86_64 -#define NR_EMULATOR_GPRS 16 +#define NR_EMULATOR_GPRS 32 #else #define NR_EMULATOR_GPRS 8 #endif @@ -361,9 +361,9 @@ struct x86_emulate_ctxt { u8 lock_prefix; u8 rep_prefix; /* bitmaps of registers in _regs[] that can be read */ - u16 regs_valid; + u32 regs_valid; /* bitmaps of registers in _regs[] that have been written */ - u16 regs_dirty; + u32 regs_dirty; /* modrm */ u8 modrm; u8 modrm_mod; diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 603057ea7421..338986a5a3ae 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -8812,12 +8812,12 @@ static bool emulator_guest_cpuid_is_intel_compatible(struct x86_emulate_ctxt *ct static ulong emulator_read_gpr(struct x86_emulate_ctxt *ctxt, unsigned reg) { - return kvm_register_read_raw(emul_to_vcpu(ctxt), reg); + return _kvm_gpr_read(emul_to_vcpu(ctxt), reg); } static void emulator_write_gpr(struct x86_emulate_ctxt *ctxt, unsigned reg, ulong val) { - kvm_register_write_raw(emul_to_vcpu(ctxt), reg, val); + _kvm_gpr_write(emul_to_vcpu(ctxt), reg, val); } static void emulator_set_nmi_mask(struct x86_emulate_ctxt *ctxt, bool masked) -- 2.51.0 Restructure how to represent and interpret REX fields. Specifically, * Repurpose the existing rex_prefix field to identify the prefix type * Introduce a new union to hold both REX and REX2 bitfields * Update decoder logic to interpret the unified data type Historically, REX used the upper four bits of a single byte as a fixed identifier, with the lower four bits carrying the encoding. REX2 extends this to two bytes. The first byte identifies the prefix, and the second encodes additional bits, preserving compatibility with legacy REX encoding. Previously, the emulator stored the REX byte as-is, which cannot capture REX2 semantics. This refactor prepares for REX2 decoding while preserving current behavior. No functional changes intended. Signed-off-by: Chang S.
Bae --- arch/x86/kvm/emulate.c | 33 ++++++++++++++++++--------------- arch/x86/kvm/kvm_emulate.h | 31 ++++++++++++++++++++++++++++++- 2 files changed, 48 insertions(+), 16 deletions(-) diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 4e3da5b497b8..763fbd139242 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -924,7 +924,7 @@ static void *decode_register(struct x86_emulate_ctxt *ctxt, u8 modrm_reg, int byteop) { void *p; - int highbyte_regs = (ctxt->rex_prefix == 0) && byteop; + int highbyte_regs = (ctxt->rex_prefix == REX_NONE) && byteop; if (highbyte_regs && modrm_reg >= 4 && modrm_reg < 8) p = (unsigned char *)reg_rmw(ctxt, modrm_reg & 3) + 1; @@ -1080,10 +1080,12 @@ static void decode_register_operand(struct x86_emulate_ctxt *ctxt, { unsigned int reg; - if (ctxt->d & ModRM) + if (ctxt->d & ModRM) { reg = ctxt->modrm_reg; - else - reg = (ctxt->b & 7) | ((ctxt->rex_prefix & 1) << 3); + } else { + reg = (ctxt->b & 7) | + (ctxt->rex.bits.b3 * BIT(3)); + } if (ctxt->d & Sse) { op->type = OP_XMM; @@ -1122,9 +1124,9 @@ static int decode_modrm(struct x86_emulate_ctxt *ctxt, int rc = X86EMUL_CONTINUE; ulong modrm_ea = 0; - ctxt->modrm_reg = ((ctxt->rex_prefix << 1) & 8); /* REX.R */ - index_reg = (ctxt->rex_prefix << 2) & 8; /* REX.X */ - base_reg = (ctxt->rex_prefix << 3) & 8; /* REX.B */ + ctxt->modrm_reg = ctxt->rex.bits.r3 * BIT(3); + index_reg = ctxt->rex.bits.x3 * BIT(3); + base_reg = ctxt->rex.bits.b3 * BIT(3); ctxt->modrm_mod = (ctxt->modrm & 0xc0) >> 6; ctxt->modrm_reg |= (ctxt->modrm & 0x38) >> 3; @@ -2466,7 +2468,7 @@ static int em_sysexit(struct x86_emulate_ctxt *ctxt) setup_syscalls_segments(&cs, &ss); - if ((ctxt->rex_prefix & 0x8) != 0x0) + if (ctxt->rex.bits.w) usermode = X86EMUL_MODE_PROT64; else usermode = X86EMUL_MODE_PROT32; @@ -4851,7 +4853,8 @@ int x86_decode_insn(struct x86_emulate_ctxt *ctxt, void *insn, int insn_len, int case 0x40 ... 0x4f: /* REX */ if (mode != X86EMUL_MODE_PROT64) goto done_prefixes; - ctxt->rex_prefix = ctxt->b; + ctxt->rex_prefix = REX_PREFIX; + ctxt->rex.raw = 0x0f & ctxt->b; continue; case 0xf0: /* LOCK */ ctxt->lock_prefix = 1; @@ -4865,15 +4868,14 @@ int x86_decode_insn(struct x86_emulate_ctxt *ctxt, void *insn, int insn_len, int } /* Any legacy prefix after a REX prefix nullifies its effect. */ - - ctxt->rex_prefix = 0; + ctxt->rex_prefix = REX_NONE; + ctxt->rex.raw = 0; } done_prefixes: - /* REX prefix. */ - if (ctxt->rex_prefix & 8) - ctxt->op_bytes = 8; /* REX.W */ + if (ctxt->rex.bits.w) + ctxt->op_bytes = 8; /* Opcode byte(s). */ opcode = opcode_table[ctxt->b]; @@ -5137,7 +5139,8 @@ void init_decode_cache(struct x86_emulate_ctxt *ctxt) { /* Clear fields that are set conditionally but read without a guard. */ ctxt->rip_relative = false; - ctxt->rex_prefix = 0; + ctxt->rex_prefix = REX_NONE; + ctxt->rex.raw = 0; ctxt->lock_prefix = 0; ctxt->rep_prefix = 0; ctxt->regs_valid = 0; diff --git a/arch/x86/kvm/kvm_emulate.h b/arch/x86/kvm/kvm_emulate.h index 153c70ea5561..b285299ebfa4 100644 --- a/arch/x86/kvm/kvm_emulate.h +++ b/arch/x86/kvm/kvm_emulate.h @@ -317,6 +317,32 @@ typedef void (*fastop_t)(struct fastop *); #define NR_EMULATOR_GPRS 8 #endif +/* + * REX prefix type to distinguish between no prefix, legacy REX, REX2, + * or an invalid REX2 sequence. 
+ */ +enum rex_type { + REX_NONE, + REX_PREFIX, + REX2_PREFIX, + REX2_INVALID +}; + +/* Unified representation for REX/REX2 prefix bits */ +union rex_field { + struct { + u8 b3 :1, /* REX2.B3 or REX.B */ + x3 :1, /* REX2.X3 or REX.X */ + r3 :1, /* REX2.R3 or REX.R */ + w :1, /* REX2.W or REX.W */ + b4 :1, /* REX2.B4 */ + x4 :1, /* REX2.X4 */ + r4 :1, /* REX2.R4 */ + m0 :1; /* REX2.M0 */ + } bits; + u8 raw; +}; + struct x86_emulate_ctxt { void *vcpu; const struct x86_emulate_ops *ops; @@ -357,7 +383,10 @@ struct x86_emulate_ctxt { int (*check_perm)(struct x86_emulate_ctxt *ctxt); bool rip_relative; - u8 rex_prefix; + /* Type of REX prefix (none, REX, REX2) */ + enum rex_type rex_prefix; + /* Rex bits */ + union rex_field rex; u8 lock_prefix; u8 rep_prefix; /* bitmaps of registers in _regs[] that can be read */ -- 2.51.0 Refactor opcode lookup to clearly separate handling of different byte sequences and prefix types, in preparation for REX2 support. The decoder begins with a one-byte opcode table by default and falls through to other tables on escape bytes, but the logic is intertwined and hard to extend. REX2 introduces a dedicated bit in its payload byte to indicate which opcode table to use. To accommodate this mapping bit, the existing lookup path needs to be restructured. No functional changes intended. Signed-off-by: Chang S. Bae --- arch/x86/kvm/emulate.c | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-) diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 763fbd139242..9c98843094a1 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -4773,7 +4773,6 @@ int x86_decode_insn(struct x86_emulate_ctxt *ctxt, void *insn, int insn_len, int ctxt->_eip = ctxt->eip; ctxt->fetch.ptr = ctxt->fetch.data; ctxt->fetch.end = ctxt->fetch.data + insn_len; - ctxt->opcode_len = 1; ctxt->intercept = x86_intercept_none; if (insn_len > 0) memcpy(ctxt->fetch.data, insn, insn_len); @@ -4877,20 +4876,24 @@ int x86_decode_insn(struct x86_emulate_ctxt *ctxt, void *insn, int insn_len, int if (ctxt->rex.bits.w) ctxt->op_bytes = 8; - /* Opcode byte(s). */ - opcode = opcode_table[ctxt->b]; - /* Two-byte opcode? */ + /* Determine opcode byte(s): */ if (ctxt->b == 0x0f) { - ctxt->opcode_len = 2; + /* Escape byte: start two-byte opcode sequence */ ctxt->b = insn_fetch(u8, ctxt); - opcode = twobyte_table[ctxt->b]; - - /* 0F_38 opcode map */ if (ctxt->b == 0x38) { + /* Three-byte opcode */ ctxt->opcode_len = 3; ctxt->b = insn_fetch(u8, ctxt); opcode = opcode_map_0f_38[ctxt->b]; + } else { + /* Two-byte opcode */ + ctxt->opcode_len = 2; + opcode = twobyte_table[ctxt->b]; } + } else { + /* Single-byte opcode */ + ctxt->opcode_len = 1; + opcode = opcode_table[ctxt->b]; } ctxt->d = opcode.flags; -- 2.51.0 Update register index decoding to account for the additional bit fields introduced by the REX2 prefix. Both ModR/M and opcode register decoding paths now consider the extended index bits (R4, X4, B4) in addition to the legacy REX bits (R3, X3, B3). Signed-off-by: Chang S. 
Bae --- arch/x86/kvm/emulate.c | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 9c98843094a1..ed3a8c0bca20 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -1084,7 +1084,8 @@ static void decode_register_operand(struct x86_emulate_ctxt *ctxt, reg = ctxt->modrm_reg; } else { reg = (ctxt->b & 7) | - (ctxt->rex.bits.b3 * BIT(3)); + (ctxt->rex.bits.b3 * BIT(3)) | + (ctxt->rex.bits.b4 * BIT(4)); } if (ctxt->d & Sse) { @@ -1124,9 +1125,12 @@ static int decode_modrm(struct x86_emulate_ctxt *ctxt, int rc = X86EMUL_CONTINUE; ulong modrm_ea = 0; - ctxt->modrm_reg = ctxt->rex.bits.r3 * BIT(3); - index_reg = ctxt->rex.bits.x3 * BIT(3); - base_reg = ctxt->rex.bits.b3 * BIT(3); + ctxt->modrm_reg = (ctxt->rex.bits.r3 * BIT(3)) | + (ctxt->rex.bits.r4 * BIT(4)); + index_reg = (ctxt->rex.bits.x3 * BIT(3)) | + (ctxt->rex.bits.x4 * BIT(4)); + base_reg = (ctxt->rex.bits.b3 * BIT(3)) | + (ctxt->rex.bits.b4 * BIT(4)); ctxt->modrm_mod = (ctxt->modrm & 0xc0) >> 6; ctxt->modrm_reg |= (ctxt->modrm & 0x38) >> 3; -- 2.51.0 Extend the decoder to find REX2-prefixed opcodes by introducing dedicated REX2 opcode tables. During initialization, clone the legacy opcode tables and patch entries that differ under REX2. Although most REX2-prefixed opcodes follow the legacy tables, some differ for instructions that do not reference extended register bits or are newly introduced under REX2. Using separate tables simplifies the lookup logic and allows efficient patching of exceptions. The EGPR checker will be implemented later. Signed-off-by: Chang S. Bae --- RFC note: The lookup logic could be separated from the table population, but keeping the user of the tables close to their initialization helps clarify the purpose of the new table. If this becomes hard to follow, splitting the lookup separately can be an option. --- arch/x86/kvm/emulate.c | 73 +++++++++++++++++++++++++++++++++++++- arch/x86/kvm/kvm_emulate.h | 2 ++ arch/x86/kvm/x86.c | 1 + 3 files changed, 75 insertions(+), 1 deletion(-) diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index ed3a8c0bca20..58879a31abcd 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -4475,6 +4475,19 @@ static const struct opcode opcode_map_0f_38[256] = { N, N, X4(N), X8(N) }; +/* + * REX2 opcode tables. + * + * REX2-prefixed opcodes mostly follow the legacy tables but differ slightly + * for instructions that do not use R/X/B register bits. Initialize the REX2 + * tables by copying the legacy ones, then mark mismatched rows as undefined. + */ +static struct opcode rex2_opcode_table[256] __ro_after_init; +static struct opcode rex2_twobyte_table[256] __ro_after_init; + +static const struct opcode undefined = D(Undefined); +static const struct opcode notimpl = N; + #undef D #undef N #undef G @@ -4761,6 +4774,11 @@ static int decode_operand(struct x86_emulate_ctxt *ctxt, struct operand *op, return rc; } +static inline bool emul_egpr_enabled(struct x86_emulate_ctxt *ctxt __maybe_unused) +{ + return false; +} + int x86_decode_insn(struct x86_emulate_ctxt *ctxt, void *insn, int insn_len, int emulation_type) { int rc = X86EMUL_CONTINUE; @@ -4881,7 +4899,24 @@ int x86_decode_insn(struct x86_emulate_ctxt *ctxt, void *insn, int insn_len, int ctxt->op_bytes = 8; /* Determine opcode byte(s): */ - if (ctxt->b == 0x0f) { + if (ctxt->rex_prefix == REX2_INVALID) { + /* + * A REX2 prefix was detected, but the prefix decoder + * found invalid byte sequence. 
+ */ + opcode = undefined; + } else if (ctxt->rex_prefix == REX2_PREFIX) { + /* REX2 prefix is only valid when EGPRs are enabled. */ + if (!emul_egpr_enabled(ctxt)) { + opcode = undefined; + } else if (ctxt->rex.bits.m0) { + ctxt->opcode_len = 2; + opcode = rex2_twobyte_table[ctxt->b]; + } else { + ctxt->opcode_len = 1; + opcode = rex2_opcode_table[ctxt->b]; + } + } else if (ctxt->b == 0x0f) { /* Escape byte: start two-byte opcode sequence */ ctxt->b = insn_fetch(u8, ctxt); if (ctxt->b == 0x38) { @@ -5526,3 +5561,39 @@ bool emulator_can_use_gpa(struct x86_emulate_ctxt *ctxt) return true; } + +static void undefine_row(struct opcode *row) +{ + struct opcode *ptr = row; + int i; + + /* Clear 16 entries per row */ + for (i = 0; i < 0x10; i++, ptr++) + *ptr = undefined; +} + +/* + * Populate REX2 opcode table: + * + * REX2-prefixed opcodes mostly reuse the legacy layout, except for those that + * neither reference extended register bits nor are newly introduced under the + * REX2 prefix. Initialize both single- and two-byte tables by cloning the + * legacy versions, then patch the table for some exceptions. + */ +void __init kvm_init_rex2_opcode_table(void) +{ + /* Copy legacy tables: */ + memcpy(rex2_opcode_table, opcode_table, sizeof(opcode_table)); + memcpy(rex2_twobyte_table, twobyte_table, sizeof(twobyte_table)); + + /* Undefine reserved opcode ranges: */ + undefine_row(&rex2_opcode_table[0x40]); + undefine_row(&rex2_opcode_table[0x70]); + undefine_row(&rex2_opcode_table[0xa0]); + undefine_row(&rex2_opcode_table[0xe0]); + undefine_row(&rex2_twobyte_table[0x30]); + undefine_row(&rex2_twobyte_table[0x80]); + + /* Mark opcode not yet implemented: */ + rex2_opcode_table[0xa1] = notimpl; +} diff --git a/arch/x86/kvm/kvm_emulate.h b/arch/x86/kvm/kvm_emulate.h index b285299ebfa4..cc16211d61f6 100644 --- a/arch/x86/kvm/kvm_emulate.h +++ b/arch/x86/kvm/kvm_emulate.h @@ -589,4 +589,6 @@ static inline ulong *reg_rmw(struct x86_emulate_ctxt *ctxt, unsigned nr) return reg_write(ctxt, nr); } +void __init kvm_init_rex2_opcode_table(void); + #endif /* _ASM_X86_KVM_X86_EMULATE_H */ diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 338986a5a3ae..4c8c2fc3bda6 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -14354,6 +14354,7 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_rmp_fault); static int __init kvm_x86_init(void) { kvm_init_xstate_sizes(); + kvm_init_rex2_opcode_table(); kvm_mmu_x86_module_init(); mitigate_smt_rsb &= boot_cpu_has_bug(X86_BUG_SMT_RSB) && cpu_smt_possible(); -- 2.51.0 Add support for the new absolute jump, previously unimplemented. This instruction has an unusual quirk: the REX2.W bit uses inverted polarity. Unlike normal REX or REX2 semantics (where W=1 indicates a 64-bit operand size), this instruction uses W=0 to select an 8-byte operand size. The new InvertedWidthPolarity flag and its helper interpret the W bit correctly, avoiding special-case hacks in the emulator logic. Since ctxt->op_bytes depends on the instruction flags, the size should be determined after the instruction lookup. Signed-off-by: Chang S. Bae --- arch/x86/kvm/emulate.c | 27 ++++++++++++++++++++------- 1 file changed, 20 insertions(+), 7 deletions(-) diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 58879a31abcd..03f8e007b14e 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -179,6 +179,7 @@ #define TwoMemOp ((u64)1 << 55) /* Instruction has two memory operand */ #define IsBranch ((u64)1 << 56) /* Instruction is considered a branch.
*/ #define ShadowStack ((u64)1 << 57) /* Instruction affects Shadow Stacks. */ +#define InvertedWidthPolarity ((u64)1 << 58) /* Instruction uses inverted REX2.W polarity */ #define DstXacc (DstAccLo | SrcAccHi | SrcWrite) @@ -993,6 +994,16 @@ EM_ASM_2W(btc); EM_ASM_2R(cmp, cmp_r); +static inline bool is_64bit_operand_size(struct x86_emulate_ctxt *ctxt) +{ + /* + * Most instructions interpret REX.W=1 as 64-bit operand size. + * Some REX2 opcodes invert this logic. + */ + return ctxt->d & InvertedWidthPolarity ? + ctxt->rex.bits.w == 0 : ctxt->rex.bits.w == 1; +} + static int em_bsf_c(struct x86_emulate_ctxt *ctxt) { /* If src is zero, do not writeback, but update flags */ @@ -2472,7 +2483,7 @@ static int em_sysexit(struct x86_emulate_ctxt *ctxt) setup_syscalls_segments(&cs, &ss); - if (ctxt->rex.bits.w) + if (is_64bit_operand_size(ctxt)) usermode = X86EMUL_MODE_PROT64; else usermode = X86EMUL_MODE_PROT32; @@ -4486,7 +4497,8 @@ static struct opcode rex2_opcode_table[256] __ro_after_init; static struct opcode rex2_twobyte_table[256] __ro_after_init; static const struct opcode undefined = D(Undefined); -static const struct opcode notimpl = N; +static const struct opcode pfx_d5_a1 = I(SrcImm64 | NearBranch | IsBranch | InvertedWidthPolarity, \ + em_jmp_abs); #undef D #undef N @@ -4543,6 +4555,7 @@ static bool is_ibt_instruction(struct x86_emulate_ctxt *ctxt) return true; case SrcNone: case SrcImm: + case SrcImm64: case SrcImmByte: /* * Note, ImmU16 is used only for the stack adjustment operand on ENTER @@ -4895,9 +4908,6 @@ int x86_decode_insn(struct x86_emulate_ctxt *ctxt, void *insn, int insn_len, int done_prefixes: - if (ctxt->rex.bits.w) - ctxt->op_bytes = 8; - /* Determine opcode byte(s): */ if (ctxt->rex_prefix == REX2_INVALID) { /* @@ -4936,6 +4946,9 @@ int x86_decode_insn(struct x86_emulate_ctxt *ctxt, void *insn, int insn_len, int } ctxt->d = opcode.flags; + if (is_64bit_operand_size(ctxt)) + ctxt->op_bytes = 8; + if (ctxt->d & ModRM) ctxt->modrm = insn_fetch(u8, ctxt); @@ -5594,6 +5607,6 @@ void __init kvm_init_rex2_opcode_table(void) undefine_row(&rex2_twobyte_table[0x30]); undefine_row(&rex2_twobyte_table[0x80]); - /* Mark opcode not yet implemented: */ - rex2_opcode_table[0xa1] = notimpl; + /* Define the REX2-specific absolute jump (0xA1) opcode */ + rex2_opcode_table[0xa1] = pfx_d5_a1; } -- 2.51.0 Explicitly mark EVEX-prefixed opcodes (0x62) as unsupported, clarifying current decoding behavior. While new prefixes like REX2 extend GPR handling, EVEX emulation should be addressed separately once VEX support is implemented. Signed-off-by: Chang S. Bae --- arch/x86/kvm/emulate.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 03f8e007b14e..9bd61ea496e5 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -4952,8 +4952,8 @@ int x86_decode_insn(struct x86_emulate_ctxt *ctxt, void *insn, int insn_len, int if (ctxt->d & ModRM) ctxt->modrm = insn_fetch(u8, ctxt); - /* vex-prefix instructions are not implemented */ - if (ctxt->opcode_len == 1 && (ctxt->b == 0xc5 || ctxt->b == 0xc4) && + /* VEX- and EVEX-prefixed instructions are not implemented */ + if (ctxt->opcode_len == 1 && (ctxt->b == 0xc5 || ctxt->b == 0xc4 || ctxt->b == 0x62) && (mode == X86EMUL_MODE_PROT64 || (ctxt->modrm & 0xc0) == 0xc0)) { ctxt->d = NotImpl; } -- 2.51.0 Extend the instruction emulator to recognize and interpret the REX2 prefix byte. Also, detect and flag invalid prefix sequences after a REX2 prefix.
In the existing prefix-decoding loop: * The loop exits when the first non-prefix byte is encountered. * Any non-REX prefix clears previously recorded REX information. For REX2, however, once a REX2 prefix is encountered, most subsequent prefixes are invalid. So, each subsequent prefix needs to be validated before continuing the loop. Signed-off-by: Chang S. Bae --- RFC note: The REX2 decoding itself is straightforward. The additional logic is mainly to detect and handle invalid prefix sequences. If this seems excessive, the check could be dropped, since VMX would raise '#UD' in such cases anyway. --- arch/x86/kvm/emulate.c | 24 +++++++++++++++++++++++- 1 file changed, 23 insertions(+), 1 deletion(-) diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 9bd61ea496e5..f9381a4055d6 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -4844,7 +4844,7 @@ int x86_decode_insn(struct x86_emulate_ctxt *ctxt, void *insn, int insn_len, int ctxt->op_bytes = def_op_bytes; ctxt->ad_bytes = def_ad_bytes; - /* Legacy prefixes. */ + /* Legacy and REX/REX2 prefixes. */ for (;;) { switch (ctxt->b = insn_fetch(u8, ctxt)) { case 0x66: /* operand-size override */ @@ -4887,9 +4887,20 @@ int x86_decode_insn(struct x86_emulate_ctxt *ctxt, void *insn, int insn_len, int case 0x40 ... 0x4f: /* REX */ if (mode != X86EMUL_MODE_PROT64) goto done_prefixes; + if (ctxt->rex_prefix == REX2_PREFIX) + break; ctxt->rex_prefix = REX_PREFIX; ctxt->rex.raw = 0x0f & ctxt->b; continue; + case 0xd5: /* REX2 */ + if (mode != X86EMUL_MODE_PROT64) + goto done_prefixes; + if (ctxt->rex_prefix == REX2_PREFIX && + ctxt->rex.bits.m0 == 0) + break; + ctxt->rex_prefix = REX2_PREFIX; + ctxt->rex.raw = insn_fetch(u8, ctxt); + continue; case 0xf0: /* LOCK */ ctxt->lock_prefix = 1; break; @@ -4901,6 +4912,17 @@ int x86_decode_insn(struct x86_emulate_ctxt *ctxt, void *insn, int insn_len, int goto done_prefixes; } + if (ctxt->rex_prefix == REX2_PREFIX) { + /* + * A legacy or REX prefix following a REX2 prefix + * forms an invalid byte sequence. Likewise, + * a second REX2 prefix following a REX2 prefix + * with M0=0 is invalid. + */ + ctxt->rex_prefix = REX2_INVALID; + goto done_prefixes; + } + /* Any legacy prefix after a REX prefix nullifies its effect. */ ctxt->rex_prefix = REX_NONE; ctxt->rex.raw = 0; -- 2.51.0 Prepare for enabling the APX state in XCR0 by implementing the previous placeholders and making them functional. APX introduces EGPRs, tracked as XSTATE component 19. Like other XSAVE-managed states, EGPR availability is controlled through XCR0, and the registers are accessible only in 64-bit mode. At this point, only VMX supports EGPRs. SVM will require corresponding extensions to handle EGPR indices. The addition to the supported XCR0 mask should accompany guest CPUID exposure, which will be done separately. Signed-off-by: Chang S. Bae --- RFC note: Not all callers may strictly need to validate the XCR0 bit -- a capability bit might suffice. However, every exit associated with EGPRs should already have that control bit set in the first place. Checking it explicitly adds no cost, so I keep it for consistency.
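For reference, a minimal sketch of the guest-side sequence that these checks key off (illustrative only, not part of this patch; it assumes XFEATURE_MASK_APX is bit 19 as in the kernel's xstate definitions, and that CPUID has already confirmed XSAVE and APX support):

#include <stdint.h>

static inline uint64_t xgetbv(uint32_t index)
{
	uint32_t eax, edx;

	asm volatile("xgetbv" : "=a" (eax), "=d" (edx) : "c" (index));
	return eax | ((uint64_t)edx << 32);
}

static inline void xsetbv(uint32_t index, uint64_t value)
{
	asm volatile("xsetbv" :: "a" ((uint32_t)value),
		     "d" ((uint32_t)(value >> 32)), "c" (index));
}

static void guest_enable_apx(void)
{
	/* CR4.OSXSAVE must already be set before XCR0 can be written. */
	uint64_t xcr0 = xgetbv(0);		/* XCR_XFEATURE_ENABLED_MASK */

	xsetbv(0, xcr0 | (1ULL << 19));		/* XFEATURE_MASK_APX */
}

Once XCR0[19] is set this way (and the vCPU is in 64-bit mode for the VMX-side helper), both emul_egpr_enabled() and vmx_egpr_enabled() report EGPRs as usable.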
--- arch/x86/kvm/emulate.c | 9 +++++++-- arch/x86/kvm/kvm_cache_regs.h | 1 + arch/x86/kvm/kvm_emulate.h | 1 + arch/x86/kvm/svm/svm.c | 7 ++++++- arch/x86/kvm/vmx/vmx.h | 9 ++++++++- arch/x86/kvm/x86.c | 11 +++++++++++ 6 files changed, 34 insertions(+), 4 deletions(-) diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index f9381a4055d6..ba3020e6f469 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -4787,9 +4787,14 @@ static int decode_operand(struct x86_emulate_ctxt *ctxt, struct operand *op, return rc; } -static inline bool emul_egpr_enabled(struct x86_emulate_ctxt *ctxt __maybe_unused) +/* EGPR availability is controlled by the APX feature bit in XCR0. */ +static inline bool emul_egpr_enabled(struct x86_emulate_ctxt *ctxt) { - return false; + u64 xcr0; + + ctxt->ops->get_xcr(ctxt, XCR_XFEATURE_ENABLED_MASK, &xcr0); + + return xcr0 & XFEATURE_MASK_APX; } int x86_decode_insn(struct x86_emulate_ctxt *ctxt, void *insn, int insn_len, int emulation_type) diff --git a/arch/x86/kvm/kvm_cache_regs.h b/arch/x86/kvm/kvm_cache_regs.h index 8ddb01191d6f..acdb3751317c 100644 --- a/arch/x86/kvm/kvm_cache_regs.h +++ b/arch/x86/kvm/kvm_cache_regs.h @@ -3,6 +3,7 @@ #define ASM_KVM_CACHE_REGS_H #include +#include #define KVM_POSSIBLE_CR0_GUEST_BITS (X86_CR0_TS | X86_CR0_WP) #define KVM_POSSIBLE_CR4_GUEST_BITS \ diff --git a/arch/x86/kvm/kvm_emulate.h b/arch/x86/kvm/kvm_emulate.h index cc16211d61f6..673a82532c78 100644 --- a/arch/x86/kvm/kvm_emulate.h +++ b/arch/x86/kvm/kvm_emulate.h @@ -237,6 +237,7 @@ struct x86_emulate_ops { bool (*is_smm)(struct x86_emulate_ctxt *ctxt); int (*leave_smm)(struct x86_emulate_ctxt *ctxt); void (*triple_fault)(struct x86_emulate_ctxt *ctxt); + int (*get_xcr)(struct x86_emulate_ctxt *ctxt, u32 index, u64 *xcr); int (*set_xcr)(struct x86_emulate_ctxt *ctxt, u32 index, u64 xcr); gva_t (*get_untagged_addr)(struct x86_emulate_ctxt *ctxt, gva_t addr, diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index 3aa2c37754ef..e6a082686000 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -5288,8 +5288,13 @@ static __init int svm_hardware_setup(void) } kvm_enable_efer_bits(EFER_NX); + /* + * APX introduces EGPRs, which require additional VMCB support. + * Disable APX until the necessary extensions are handled. + */ kvm_caps.supported_xcr0 &= ~(XFEATURE_MASK_BNDREGS | - XFEATURE_MASK_BNDCSR); + XFEATURE_MASK_BNDCSR | + XFEATURE_MASK_APX); if (boot_cpu_has(X86_FEATURE_FXSR_OPT)) kvm_enable_efer_bits(EFER_FFXSR); diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h index 6cf1eb739caf..784aa0504dce 100644 --- a/arch/x86/kvm/vmx/vmx.h +++ b/arch/x86/kvm/vmx/vmx.h @@ -372,7 +372,14 @@ struct vmx_insn_info { union insn_info info; }; -static inline bool vmx_egpr_enabled(struct kvm_vcpu *vcpu __maybe_unused) { return false; } +/* + * EGPR availability is controlled by the APX xfeature bit in XCR0 and is + * only accessible in 64-bit mode. 
+ */ +static inline bool vmx_egpr_enabled(struct kvm_vcpu *vcpu) +{ + return vcpu->arch.xcr0 & XFEATURE_MASK_APX && is_64_bit_mode(vcpu); +} static inline struct vmx_insn_info vmx_get_insn_info(struct kvm_vcpu *vcpu) { diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 4c8c2fc3bda6..e087db0f4153 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -8843,6 +8843,16 @@ static void emulator_triple_fault(struct x86_emulate_ctxt *ctxt) kvm_make_request(KVM_REQ_TRIPLE_FAULT, emul_to_vcpu(ctxt)); } +static int emulator_get_xcr(struct x86_emulate_ctxt *ctxt, u32 index, u64 *xcr) +{ + /* Only support XCR_XFEATURE_ENABLED_MASK now */ + if (index != XCR_XFEATURE_ENABLED_MASK) + return 1; + + *xcr = emul_to_vcpu(ctxt)->arch.xcr0; + return 0; +} + static int emulator_set_xcr(struct x86_emulate_ctxt *ctxt, u32 index, u64 xcr) { return __kvm_set_xcr(emul_to_vcpu(ctxt), index, xcr); @@ -8915,6 +8925,7 @@ static const struct x86_emulate_ops emulate_ops = { .is_smm = emulator_is_smm, .leave_smm = emulator_leave_smm, .triple_fault = emulator_triple_fault, + .get_xcr = emulator_get_xcr, .set_xcr = emulator_set_xcr, .get_untagged_addr = emulator_get_untagged_addr, .is_canonical_addr = emulator_is_canonical_addr, -- 2.51.0 From: Peter Fang Add the APX xfeature bit to the list of supported XCR0 components and expose the APX feature to guests. Define the APX CPUID feature bits and update the maximum supported CPUID leaf to 0x29 to include the APX leaf. On SVM systems, ensure that the feature is not advertised, since EGPR support is not yet available there. No APX sub-features are enumerated yet. Those will be exposed in a separate patch. Signed-off-by: Peter Fang Signed-off-by: Chang S. Bae --- Peter figured out this change by spotting the need to update the CPUID maximum leaf. --- arch/x86/kvm/cpuid.c | 8 +++++++- arch/x86/kvm/reverse_cpuid.h | 2 ++ arch/x86/kvm/svm/svm.c | 8 ++++++++ arch/x86/kvm/x86.c | 3 ++- 4 files changed, 19 insertions(+), 2 deletions(-) diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index 52524e0ca97f..b90e58f2a42f 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -1031,6 +1031,7 @@ void kvm_set_cpu_caps(void) F(AVX_VNNI_INT16), F(PREFETCHITI), F(AVX10), + SCATTERED_F(APX), ); kvm_cpu_cap_init(CPUID_7_2_EDX, @@ -1393,7 +1394,7 @@ static inline int __do_cpuid_func(struct kvm_cpuid_array *array, u32 function) switch (function) { case 0: /* Limited to the highest leaf implemented in KVM.
*/ - entry->eax = min(entry->eax, 0x24U); + entry->eax = min(entry->eax, 0x29U); break; case 1: cpuid_entry_override(entry, CPUID_1_EDX); @@ -1638,6 +1639,11 @@ static inline int __do_cpuid_func(struct kvm_cpuid_array *array, u32 function) entry->edx = 0; break; } + case 0x29: { + /* No APX sub-features are supported yet */ + entry->eax = entry->ebx = entry->ecx = entry->edx = 0; + break; + } case KVM_CPUID_SIGNATURE: { const u32 *sigptr = (const u32 *)KVM_SIGNATURE; entry->eax = KVM_CPUID_FEATURES; diff --git a/arch/x86/kvm/reverse_cpuid.h b/arch/x86/kvm/reverse_cpuid.h index 743ab25ba787..e9d9fb4070ca 100644 --- a/arch/x86/kvm/reverse_cpuid.h +++ b/arch/x86/kvm/reverse_cpuid.h @@ -35,6 +35,7 @@ #define X86_FEATURE_AVX_VNNI_INT16 KVM_X86_FEATURE(CPUID_7_1_EDX, 10) #define X86_FEATURE_PREFETCHITI KVM_X86_FEATURE(CPUID_7_1_EDX, 14) #define X86_FEATURE_AVX10 KVM_X86_FEATURE(CPUID_7_1_EDX, 19) +#define KVM_X86_FEATURE_APX KVM_X86_FEATURE(CPUID_7_1_EDX, 21) /* Intel-defined sub-features, CPUID level 0x00000007:2 (EDX) */ #define X86_FEATURE_INTEL_PSFD KVM_X86_FEATURE(CPUID_7_2_EDX, 0) @@ -126,6 +127,7 @@ static __always_inline u32 __feature_translate(int x86_feature) KVM_X86_TRANSLATE_FEATURE(SGX1); KVM_X86_TRANSLATE_FEATURE(SGX2); KVM_X86_TRANSLATE_FEATURE(SGX_EDECCSSA); + KVM_X86_TRANSLATE_FEATURE(APX); KVM_X86_TRANSLATE_FEATURE(CONSTANT_TSC); KVM_X86_TRANSLATE_FEATURE(PERFMON_V2); KVM_X86_TRANSLATE_FEATURE(RRSBA_CTRL); diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index e6a082686000..da57f7506f88 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -5271,6 +5271,14 @@ static __init void svm_set_cpu_caps(void) */ kvm_cpu_cap_clear(X86_FEATURE_BUS_LOCK_DETECT); kvm_cpu_cap_clear(X86_FEATURE_MSR_IMM); + + /* + * If the APX xfeature bit is not supported, meaning that VMCB + * support for EGPRs is unavailable, then the APX feature should + * not be exposed to the guest. + */ + if (!(kvm_caps.supported_xcr0 & XFEATURE_MASK_APX)) + kvm_cpu_cap_clear(X86_FEATURE_APX); } static __init int svm_hardware_setup(void) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index e087db0f4153..bcf8e95d88dc 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -217,7 +217,8 @@ static struct kvm_user_return_msrs __percpu *user_return_msrs; #define KVM_SUPPORTED_XCR0 (XFEATURE_MASK_FP | XFEATURE_MASK_SSE \ | XFEATURE_MASK_YMM | XFEATURE_MASK_BNDREGS \ | XFEATURE_MASK_BNDCSR | XFEATURE_MASK_AVX512 \ - | XFEATURE_MASK_PKRU | XFEATURE_MASK_XTILE) + | XFEATURE_MASK_PKRU | XFEATURE_MASK_XTILE \ + | XFEATURE_MASK_APX) #define XFEATURE_MASK_CET_ALL (XFEATURE_MASK_CET_USER | XFEATURE_MASK_CET_KERNEL) /* -- 2.51.0 Add CPUID leaf 0x29 sub-leaf 0 to enumerate APX sub-features to guests. This leaf currently defines the following sub-features: * New Conditional Instructions (NCI) * New Data Destination (NDD) * Flags Suppression (NF) The CPUID leaf is only exposed if the APX feature is enabled. Signed-off-by: Chang S. 
Bae --- arch/x86/include/asm/kvm_host.h | 1 + arch/x86/kvm/cpuid.c | 10 ++++++++-- arch/x86/kvm/reverse_cpuid.h | 4 ++++ 3 files changed, 13 insertions(+), 2 deletions(-) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 940f83c121cf..763872080c64 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -794,6 +794,7 @@ enum kvm_only_cpuid_leafs { CPUID_24_0_EBX, CPUID_8000_0021_ECX, CPUID_7_1_ECX, + CPUID_29_0_EBX, NR_KVM_CPU_CAPS, NKVMCAPINTS = NR_KVM_CPU_CAPS - NCAPINTS, diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index b90e58f2a42f..95c25de641ca 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -1063,6 +1063,10 @@ void kvm_set_cpu_caps(void) F(AVX10_512), ); + kvm_cpu_cap_init(CPUID_29_0_EBX, + F(APX_NCI_NDD_NF), + ); + kvm_cpu_cap_init(CPUID_8000_0001_ECX, F(LAHF_LM), F(CMP_LEGACY), @@ -1640,8 +1644,10 @@ static inline int __do_cpuid_func(struct kvm_cpuid_array *array, u32 function) break; } case 0x29: { - /* No APX sub-features are supported yet */ - entry->eax = entry->ebx = entry->ecx = entry->edx = 0; + if (!(kvm_caps.supported_xcr0 & XFEATURE_MASK_APX)) { + entry->eax = entry->ebx = entry->ecx = entry->edx = 0; + break; + } break; } case KVM_CPUID_SIGNATURE: { diff --git a/arch/x86/kvm/reverse_cpuid.h b/arch/x86/kvm/reverse_cpuid.h index e9d9fb4070ca..a8eca23ee2d4 100644 --- a/arch/x86/kvm/reverse_cpuid.h +++ b/arch/x86/kvm/reverse_cpuid.h @@ -50,6 +50,9 @@ #define X86_FEATURE_AVX10_256 KVM_X86_FEATURE(CPUID_24_0_EBX, 17) #define X86_FEATURE_AVX10_512 KVM_X86_FEATURE(CPUID_24_0_EBX, 18) +/* Intel-defined sub-features, CPUID level 0x00000029:0 (EBX) */ +#define X86_FEATURE_APX_NCI_NDD_NF KVM_X86_FEATURE(CPUID_29_0_EBX, 0) + /* CPUID level 0x80000007 (EDX). */ #define KVM_X86_FEATURE_CONSTANT_TSC KVM_X86_FEATURE(CPUID_8000_0007_EDX, 8) @@ -92,6 +95,7 @@ static const struct cpuid_reg reverse_cpuid[] = { [CPUID_24_0_EBX] = { 0x24, 0, CPUID_EBX}, [CPUID_8000_0021_ECX] = {0x80000021, 0, CPUID_ECX}, [CPUID_7_1_ECX] = { 7, 1, CPUID_ECX}, + [CPUID_29_0_EBX] = { 0x29, 0, CPUID_EBX}, }; /* -- 2.51.0 Now that KVM exposes the APX feature to guests on APX-capable systems, extend the selftests to validate XCR0 configuration and state management. Since APX repurposes the XSAVE area previously used by MPX in the non-compacted format, add a check to ensure that MPX states are not set when APX is enabled. Also, load non-init APX state data in the guest so that XSTATE_BV[APX] is set, allowing validation of APX state testing. Signed-off-by: Chang S. 
Bae --- .../selftests/kvm/include/x86/processor.h | 1 + tools/testing/selftests/kvm/x86/state_test.c | 6 ++++++ .../selftests/kvm/x86/xcr0_cpuid_test.c | 20 +++++++++++++++++++ 3 files changed, 27 insertions(+) diff --git a/tools/testing/selftests/kvm/include/x86/processor.h b/tools/testing/selftests/kvm/include/x86/processor.h index 51cd84b9ca66..dde7af40584e 100644 --- a/tools/testing/selftests/kvm/include/x86/processor.h +++ b/tools/testing/selftests/kvm/include/x86/processor.h @@ -88,6 +88,7 @@ struct xstate { #define XFEATURE_MASK_LBR BIT_ULL(15) #define XFEATURE_MASK_XTILE_CFG BIT_ULL(17) #define XFEATURE_MASK_XTILE_DATA BIT_ULL(18) +#define XFEATURE_MASK_APX BIT_ULL(19) #define XFEATURE_MASK_AVX512 (XFEATURE_MASK_OPMASK | \ XFEATURE_MASK_ZMM_Hi256 | \ diff --git a/tools/testing/selftests/kvm/x86/state_test.c b/tools/testing/selftests/kvm/x86/state_test.c index 141b7fc0c965..6d1dc575b22b 100644 --- a/tools/testing/selftests/kvm/x86/state_test.c +++ b/tools/testing/selftests/kvm/x86/state_test.c @@ -167,6 +167,12 @@ static void __attribute__((__flatten__)) guest_code(void *arg) asm volatile ("vmovupd %0, %%zmm16" :: "m" (buffer)); } + if (supported_xcr0 & XFEATURE_MASK_APX) { + /* mov $0xcccccccc, %r16 */ + asm volatile (".byte 0xd5, 0x18, 0xb8, 0xcc, 0xcc," + "0xcc, 0xcc, 0x00, 0x00, 0x00, 0x00"); + } + if (this_cpu_has(X86_FEATURE_MPX)) { uint64_t bounds[2] = { 10, 0xffffffffull }; uint64_t output[2] = { }; diff --git a/tools/testing/selftests/kvm/x86/xcr0_cpuid_test.c b/tools/testing/selftests/kvm/x86/xcr0_cpuid_test.c index d038c1571729..6e4f2f83c831 100644 --- a/tools/testing/selftests/kvm/x86/xcr0_cpuid_test.c +++ b/tools/testing/selftests/kvm/x86/xcr0_cpuid_test.c @@ -46,6 +46,21 @@ do { \ __supported, (xfeatures)); \ } while (0) +/* + * Verify that mutually exclusive architectural features do not overlap. + * For example, APX and MPX must never be reported as supported together. + */ +#define ASSERT_XFEATURE_CONFLICT(supported_xcr0, xfeatures, conflicts) \ +do { \ + uint64_t __supported = (supported_xcr0) & ((xfeatures) | (conflicts)); \ + \ + __GUEST_ASSERT((__supported & (xfeatures)) != (xfeatures) || \ + !(__supported & (conflicts)), \ + "supported = 0x%lx, xfeatures = 0x%llx, conflicts = 0x%llx", \ + __supported, (xfeatures), (conflicts)); \ +} while (0) + + static void guest_code(void) { uint64_t initial_xcr0; @@ -79,6 +94,11 @@ static void guest_code(void) ASSERT_ALL_OR_NONE_XFEATURE(supported_xcr0, XFEATURE_MASK_XTILE); + /* Check APX by ensuring MPX is not exposed concurrently */ + ASSERT_XFEATURE_CONFLICT(supported_xcr0, + XFEATURE_MASK_APX, + XFEATURE_MASK_BNDREGS | XFEATURE_MASK_BNDCSR); + vector = xsetbv_safe(0, XFEATURE_MASK_FP); __GUEST_ASSERT(!vector, "Expected success on XSETBV(FP), got %s", -- 2.51.0
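A closing note on the hand-encoded instruction added to state_test.c above. The bytes follow the REX2 payload layout assumed throughout the emulator patches (M0 in bit 7, R4/X4/B4 in bits 6:4, W in bit 3, R3/X3/B3 in bits 2:0); the commented copy below is illustrative only and is not part of any patch:

/* mov $0xcccccccc, %r16 -- byte-by-byte breakdown */
static const unsigned char mov_imm64_to_r16[] = {
	0xd5,			/* REX2 prefix */
	0x18,			/* payload: M0=0 (legacy map), W=1, B4=1, B3=0 */
	0xb8,			/* MOV r64, imm64; reg = opcode[2:0] | B3 << 3 | B4 << 4 = 16 -> %r16 */
	0xcc, 0xcc, 0xcc, 0xcc,	/* imm64 = 0x00000000cccccccc */
	0x00, 0x00, 0x00, 0x00,
};

With W=1 the immediate is eight bytes, and loading a non-init value into %r16 is what leaves XSTATE_BV[APX] set so the APX state save/restore path is actually exercised.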