EPT exit qualification bit 6 is used when mode-based execute control is
enabled, and reflects whether the guest-physical address was executable
for user-mode linear addresses. Rename the bit to reflect that intent and
add it to EPT_VIOLATION_PROT_MASK, which simplifies the return evaluation
in tdx_is_sept_violation_unexpected_pending() a bit.

Rework handling in __vmx_handle_ept_violation() to unconditionally clear
EPT_VIOLATION_PROT_USER_EXEC until MBEC is implemented, as suggested by
Sean [1].

Note: Intel SDM Table 29-7 defines bit 6 as:
  If the "mode-based execute control" VM-execution control is 0, the
  value of this bit is undefined. If that control is 1, this bit is the
  logical-AND of bit 10 in the EPT paging-structure entries used to
  translate the guest-physical address of the access causing the EPT
  violation. In this case, it indicates whether the guest-physical
  address was executable for user-mode linear addresses.

[1] https://lore.kernel.org/all/aCJDzU1p_SFNRIJd@google.com/

Suggested-by: Sean Christopherson
Signed-off-by: Jon Kohler
---
 arch/x86/include/asm/vmx.h | 5 +++--
 arch/x86/kvm/vmx/common.h  | 9 +++++++--
 arch/x86/kvm/vmx/tdx.c     | 2 +-
 3 files changed, 11 insertions(+), 5 deletions(-)

diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h
index c85c50019523..de3abec84fe5 100644
--- a/arch/x86/include/asm/vmx.h
+++ b/arch/x86/include/asm/vmx.h
@@ -596,10 +596,11 @@ enum vm_entry_failure_code {
 #define EPT_VIOLATION_PROT_READ		BIT(3)
 #define EPT_VIOLATION_PROT_WRITE	BIT(4)
 #define EPT_VIOLATION_PROT_EXEC		BIT(5)
-#define EPT_VIOLATION_EXEC_FOR_RING3_LIN BIT(6)
+#define EPT_VIOLATION_PROT_USER_EXEC	BIT(6)
 #define EPT_VIOLATION_PROT_MASK		(EPT_VIOLATION_PROT_READ  | \
					 EPT_VIOLATION_PROT_WRITE | \
-					 EPT_VIOLATION_PROT_EXEC)
+					 EPT_VIOLATION_PROT_EXEC  | \
+					 EPT_VIOLATION_PROT_USER_EXEC)
 #define EPT_VIOLATION_GVA_IS_VALID	BIT(7)
 #define EPT_VIOLATION_GVA_TRANSLATED	BIT(8)
diff --git a/arch/x86/kvm/vmx/common.h b/arch/x86/kvm/vmx/common.h
index 412d0829d7a2..adf925500b9e 100644
--- a/arch/x86/kvm/vmx/common.h
+++ b/arch/x86/kvm/vmx/common.h
@@ -94,8 +94,13 @@ static inline int __vmx_handle_ept_violation(struct kvm_vcpu *vcpu, gpa_t gpa,
	/* Is it a fetch fault? */
	error_code |= (exit_qualification & EPT_VIOLATION_ACC_INSTR)
		      ? PFERR_FETCH_MASK : 0;
-	/* ept page table entry is present? */
-	error_code |= (exit_qualification & EPT_VIOLATION_PROT_MASK)
+	/*
+	 * ept page table entry is present?
+	 * note: unconditionally clear USER_EXEC until mode-based
+	 * execute control is implemented
+	 */
+	error_code |= (exit_qualification &
+		       (EPT_VIOLATION_PROT_MASK & ~EPT_VIOLATION_PROT_USER_EXEC))
		      ? PFERR_PRESENT_MASK : 0;

	if (exit_qualification & EPT_VIOLATION_GVA_IS_VALID)
diff --git a/arch/x86/kvm/vmx/tdx.c b/arch/x86/kvm/vmx/tdx.c
index 0a49c863c811..61185c30a40e 100644
--- a/arch/x86/kvm/vmx/tdx.c
+++ b/arch/x86/kvm/vmx/tdx.c
@@ -1922,7 +1922,7 @@ static inline bool tdx_is_sept_violation_unexpected_pending(struct kvm_vcpu *vcp
	if (eeq_type != TDX_EXT_EXIT_QUAL_TYPE_PENDING_EPT_VIOLATION)
		return false;

-	return !(eq & EPT_VIOLATION_PROT_MASK) && !(eq & EPT_VIOLATION_EXEC_FOR_RING3_LIN);
+	return !(eq & EPT_VIOLATION_PROT_MASK);
 }

 static int tdx_handle_ept_violation(struct kvm_vcpu *vcpu)
-- 
2.43.0

SPTE_PERM_MASK is no longer referenced by anything in the kernel.
Signed-off-by: Jon Kohler
---
 arch/x86/kvm/mmu/spte.h | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/arch/x86/kvm/mmu/spte.h b/arch/x86/kvm/mmu/spte.h
index 3133f066927e..0fc83c9064c5 100644
--- a/arch/x86/kvm/mmu/spte.h
+++ b/arch/x86/kvm/mmu/spte.h
@@ -42,9 +42,6 @@ static_assert(SPTE_TDP_AD_ENABLED == 0);
 #define SPTE_BASE_ADDR_MASK (((1ULL << 52) - 1) & ~(u64)(PAGE_SIZE-1))
 #endif

-#define SPTE_PERM_MASK (PT_PRESENT_MASK | PT_WRITABLE_MASK | shadow_user_mask \
-			| shadow_x_mask | shadow_nx_mask | shadow_me_mask)
-
 #define ACC_EXEC_MASK	1
 #define ACC_WRITE_MASK	PT_WRITABLE_MASK
 #define ACC_USER_MASK	PT_USER_MASK
-- 
2.43.0

Update SPTE_MMIO_ALLOWED_MASK to allow EPT user executable (bit 10) to be
treated like EPT RWX bits 2:0, as when mode-based execute control is
enabled, bit 10 can act like a "present" bit.

No functional changes intended.

Cc: Kai Huang
Signed-off-by: Jon Kohler
---
 arch/x86/kvm/mmu/spte.h | 13 +++++++------
 1 file changed, 7 insertions(+), 6 deletions(-)

diff --git a/arch/x86/kvm/mmu/spte.h b/arch/x86/kvm/mmu/spte.h
index 0fc83c9064c5..b60666778f61 100644
--- a/arch/x86/kvm/mmu/spte.h
+++ b/arch/x86/kvm/mmu/spte.h
@@ -96,11 +96,11 @@ static_assert(!(EPT_SPTE_MMU_WRITABLE & SHADOW_ACC_TRACK_SAVED_MASK));
 #undef SHADOW_ACC_TRACK_SAVED_MASK

 /*
- * Due to limited space in PTEs, the MMIO generation is a 19 bit subset of
+ * Due to limited space in PTEs, the MMIO generation is an 18 bit subset of
 * the memslots generation and is derived as follows:
 *
- * Bits 0-7 of the MMIO generation are propagated to spte bits 3-10
- * Bits 8-18 of the MMIO generation are propagated to spte bits 52-62
+ * Bits 0-6 of the MMIO generation are propagated to spte bits 3-9
+ * Bits 7-17 of the MMIO generation are propagated to spte bits 52-62
 *
 * The KVM_MEMSLOT_GEN_UPDATE_IN_PROGRESS flag is intentionally not included in
 * the MMIO generation number, as doing so would require stealing a bit from
@@ -111,7 +111,7 @@ static_assert(!(EPT_SPTE_MMU_WRITABLE & SHADOW_ACC_TRACK_SAVED_MASK));
 */

 #define MMIO_SPTE_GEN_LOW_START		3
-#define MMIO_SPTE_GEN_LOW_END		10
+#define MMIO_SPTE_GEN_LOW_END		9

 #define MMIO_SPTE_GEN_HIGH_START	52
 #define MMIO_SPTE_GEN_HIGH_END		62
@@ -133,7 +133,8 @@ static_assert(!(SPTE_MMU_PRESENT_MASK &
 * and so they're off-limits for generation; additional checks ensure the mask
 * doesn't overlap legal PA bits), and bit 63 (carved out for future usage).
 */
-#define SPTE_MMIO_ALLOWED_MASK (BIT_ULL(63) | GENMASK_ULL(51, 12) | GENMASK_ULL(2, 0))
+#define SPTE_MMIO_ALLOWED_MASK (BIT_ULL(63) | GENMASK_ULL(51, 12) | \
+				BIT_ULL(10) | GENMASK_ULL(2, 0))
 static_assert(!(SPTE_MMIO_ALLOWED_MASK &
		(SPTE_MMU_PRESENT_MASK | MMIO_SPTE_GEN_LOW_MASK | MMIO_SPTE_GEN_HIGH_MASK)));

@@ -141,7 +142,7 @@ static_assert(!(SPTE_MMIO_ALLOWED_MASK &
 #define MMIO_SPTE_GEN_HIGH_BITS	(MMIO_SPTE_GEN_HIGH_END - MMIO_SPTE_GEN_HIGH_START + 1)

 /* remember to adjust the comment above as well if you change these */
-static_assert(MMIO_SPTE_GEN_LOW_BITS == 8 && MMIO_SPTE_GEN_HIGH_BITS == 11);
+static_assert(MMIO_SPTE_GEN_LOW_BITS == 7 && MMIO_SPTE_GEN_HIGH_BITS == 11);

 #define MMIO_SPTE_GEN_LOW_SHIFT		(MMIO_SPTE_GEN_LOW_START - 0)
 #define MMIO_SPTE_GEN_HIGH_SHIFT	(MMIO_SPTE_GEN_HIGH_START - MMIO_SPTE_GEN_LOW_BITS)
-- 
2.43.0

Introduce ACC_RWX to capture traditional RWX access bits and modify the
various consumers of ACC_ALL to use ACC_RWX instead, to prepare for Intel
MBEC enablement, as suggested by Sean [1]. The only areas that really
need ACC_ALL are kvm_mmu_page_get_access() and trace_mark_mmio_spte().
No functional change intended.

[1] https://lore.kernel.org/all/aCI-z5vzzLwxOCfw@google.com/

Suggested-by: Sean Christopherson
Signed-off-by: Jon Kohler
---
 arch/x86/kvm/mmu/mmu.c     | 16 ++++++++--------
 arch/x86/kvm/mmu/spte.h    |  3 ++-
 arch/x86/kvm/mmu/tdp_mmu.c |  4 ++--
 3 files changed, 12 insertions(+), 11 deletions(-)

diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index 667d66cf76d5..b1a7c7cc0c44 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -3452,7 +3452,7 @@ static int direct_map(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
		if (it.level == fault->goal_level)
			break;

-		sp = kvm_mmu_get_child_sp(vcpu, it.sptep, base_gfn, true, ACC_ALL);
+		sp = kvm_mmu_get_child_sp(vcpu, it.sptep, base_gfn, true, ACC_RWX);
		if (sp == ERR_PTR(-EEXIST))
			continue;
@@ -3465,7 +3465,7 @@ static int direct_map(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
	if (WARN_ON_ONCE(it.level != fault->goal_level))
		return -EFAULT;

-	ret = mmu_set_spte(vcpu, fault->slot, it.sptep, ACC_ALL,
+	ret = mmu_set_spte(vcpu, fault->slot, it.sptep, ACC_RWX,
			   base_gfn, fault->pfn, fault);
	if (ret == RET_PF_SPURIOUS)
		return ret;
@@ -3698,7 +3698,7 @@ static int fast_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
		 * current CPU took the fault.
		 *
		 * Need not check the access of upper level table entries since
-		 * they are always ACC_ALL.
+		 * they are always ACC_RWX.
		 */
		if (is_access_allowed(fault, spte)) {
			ret = RET_PF_SPURIOUS;
@@ -4804,7 +4804,7 @@ static int direct_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault
	if (r)
		return r;

-	r = kvm_mmu_faultin_pfn(vcpu, fault, ACC_ALL);
+	r = kvm_mmu_faultin_pfn(vcpu, fault, ACC_RWX);
	if (r != RET_PF_CONTINUE)
		return r;
@@ -4895,7 +4895,7 @@ static int kvm_tdp_mmu_page_fault(struct kvm_vcpu *vcpu,
	if (r)
		return r;

-	r = kvm_mmu_faultin_pfn(vcpu, fault, ACC_ALL);
+	r = kvm_mmu_faultin_pfn(vcpu, fault, ACC_RWX);
	if (r != RET_PF_CONTINUE)
		return r;
@@ -5614,7 +5614,7 @@ static union kvm_cpu_role kvm_calc_cpu_role(struct kvm_vcpu *vcpu,
 {
	union kvm_cpu_role role = {0};

-	role.base.access = ACC_ALL;
+	role.base.access = ACC_RWX;
	role.base.smm = is_smm(vcpu);
	role.base.guest_mode = is_guest_mode(vcpu);
	role.ext.valid = 1;
@@ -5695,7 +5695,7 @@ kvm_calc_tdp_mmu_root_page_role(struct kvm_vcpu *vcpu,
 {
	union kvm_mmu_page_role role = {0};

-	role.access = ACC_ALL;
+	role.access = ACC_RWX;
	role.cr0_wp = true;
	role.efer_nx = true;
	role.smm = cpu_role.base.smm;
@@ -5826,7 +5826,7 @@ kvm_calc_shadow_ept_root_page_role(struct kvm_vcpu *vcpu, bool accessed_dirty,
	role.base.direct = false;
	role.base.ad_disabled = !accessed_dirty;
	role.base.guest_mode = true;
-	role.base.access = ACC_ALL;
+	role.base.access = ACC_RWX;

	role.ext.word = 0;
	role.ext.execonly = execonly;
diff --git a/arch/x86/kvm/mmu/spte.h b/arch/x86/kvm/mmu/spte.h
index b60666778f61..101a2f61ec96 100644
--- a/arch/x86/kvm/mmu/spte.h
+++ b/arch/x86/kvm/mmu/spte.h
@@ -45,7 +45,8 @@ static_assert(SPTE_TDP_AD_ENABLED == 0);
 #define ACC_EXEC_MASK	1
 #define ACC_WRITE_MASK	PT_WRITABLE_MASK
 #define ACC_USER_MASK	PT_USER_MASK
-#define ACC_ALL		(ACC_EXEC_MASK | ACC_WRITE_MASK | ACC_USER_MASK)
+#define ACC_RWX		(ACC_EXEC_MASK | ACC_WRITE_MASK | ACC_USER_MASK)
+#define ACC_ALL		ACC_RWX

 /* The mask for the R/X bits in EPT PTEs */
 #define SPTE_EPT_READABLE_MASK	0x1ull
diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c
index c5734ca5c17d..98221ed34c51 100644
--- a/arch/x86/kvm/mmu/tdp_mmu.c
+++ b/arch/x86/kvm/mmu/tdp_mmu.c
@@ -1190,9 +1190,9 @@ static int tdp_mmu_map_handle_target_level(struct kvm_vcpu *vcpu,
	}

	if (unlikely(!fault->slot))
-		new_spte = make_mmio_spte(vcpu, iter->gfn, ACC_ALL);
+		new_spte = make_mmio_spte(vcpu, iter->gfn, ACC_RWX);
	else
-		wrprot = make_spte(vcpu, sp, fault->slot, ACC_ALL, iter->gfn,
+		wrprot = make_spte(vcpu, sp, fault->slot, ACC_RWX, iter->gfn,
				   fault->pfn, iter->old_spte, fault->prefetch,
				   false, fault->map_writable, &new_spte);
-- 
2.43.0

Extend the kvm_mmu_page_role access bitfield from 3 bits to 4, where the
4th bit will be used to track user executable pages with Intel mode-based
execute control (MBEC). Extend SPTE generation and introduce a shadow_ux
value to account for user and kernel executable distinctions under MBEC.

Extend kvm_mmu_page_role to include a new has_mbec field, such that a
given MMU can be flagged as MBEC-aware, which is to say with an access
bitfield that can evaluate to 4 different options.

Modify various functions to use the new access masks, ensuring
compatibility with MBEC.

Update mmu documentation to clarify the role of execute permissions when
MBEC is enabled.

Add a capability helper for cpu_has_vmx_mode_based_ept_exec and plumb it
into kvm_mmu_set_ept_masks.

NOTE: Also roll back the change to ACC_ALL vs ACC_RWX in mmu.c, as VMs do
not boot properly otherwise; this likely needs to be reworked, open to
ideas! Past that, no functional change intended, as nothing sets has_mbec
yet.

Signed-off-by: Jon Kohler
---
 Documentation/virt/kvm/x86/mmu.rst |  9 ++++-
 arch/x86/include/asm/kvm_host.h    | 11 ++---
 arch/x86/include/asm/vmx.h         |  4 ++
 arch/x86/kvm/mmu.h                 |  8 +++-
 arch/x86/kvm/mmu/mmu.c             | 24 +++++------
 arch/x86/kvm/mmu/mmutrace.h        | 23 +++++++----
 arch/x86/kvm/mmu/paging_tmpl.h     | 24 +++++++----
 arch/x86/kvm/mmu/spte.c            | 65 +++++++++++++++++++++++-------
 arch/x86/kvm/mmu/spte.h            | 57 +++++++++++++++++++++++---
 arch/x86/kvm/mmu/tdp_mmu.c         | 12 ++++--
 arch/x86/kvm/vmx/capabilities.h    |  6 +++
 arch/x86/kvm/vmx/vmx.c             |  3 +-
 12 files changed, 188 insertions(+), 58 deletions(-)

diff --git a/Documentation/virt/kvm/x86/mmu.rst b/Documentation/virt/kvm/x86/mmu.rst
index 2b3b6d442302..f63db1a2b3df 100644
--- a/Documentation/virt/kvm/x86/mmu.rst
+++ b/Documentation/virt/kvm/x86/mmu.rst
@@ -172,7 +172,14 @@ Shadow pages contain the following information:
     quadrant maps 1GB virtual address space.
   role.access:
     Inherited guest access permissions from the parent ptes in the form uwx.
-    Note execute permission is positive, not negative.
+    Note execute permission is positive, not negative. When Intel MBEC is
+    not enabled, permissions follow the uwx form. When Intel MBEC is enabled,
+    execute is split into two permissions, kernel executable and user
+    executable, with the split controlled by role.has_mbec.
+  role.has_mbec:
+    When role.has_mbec=1, Intel mode-based execute control is active, which
+    gives the guest the ability to split execute pages into two permissions,
+    kernel executable and user executable.
   role.invalid:
     The page is invalid and should not be used.  It is a root page that is
     currently pinned (by a cpu hardware register pointing to it); once it is
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 48598d017d6f..66afcff43ec5 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -319,11 +319,11 @@ struct kvm_kernel_irq_routing_entry;
 * the number of unique SPs that can theoretically be created is 2^n, where n
 * is the number of bits that are used to compute the role.
 *
- * But, even though there are 20 bits in the mask below, not all combinations
+ * But, even though there are 22 bits in the mask below, not all combinations
 * of modes and flags are possible:
 *
 * - invalid shadow pages are not accounted, mirror pages are not shadowed,
-   * so the bits are effectively 18.
+   * so the bits are effectively 20.
 *
 * - quadrant will only be used if has_4_byte_gpte=1 (non-PAE paging);
 *   execonly and ad_disabled are only used for nested EPT which has
@@ -338,7 +338,7 @@ struct kvm_kernel_irq_routing_entry;
 * cr0_wp=0, therefore these three bits only give rise to 5 possibilities.
 *
 * Therefore, the maximum number of possible upper-level shadow pages for a
- * single gfn is a bit less than 2^13.
+ * single gfn is a bit less than 2^15.
 */
 union kvm_mmu_page_role {
	u32 word;
@@ -347,7 +347,7 @@ union kvm_mmu_page_role {
		unsigned has_4_byte_gpte:1;
		unsigned quadrant:2;
		unsigned direct:1;
-		unsigned access:3;
+		unsigned access:4;
		unsigned invalid:1;
		unsigned efer_nx:1;
		unsigned cr0_wp:1;
@@ -357,7 +357,8 @@ union kvm_mmu_page_role {
		unsigned guest_mode:1;
		unsigned passthrough:1;
		unsigned is_mirror:1;
-		unsigned :4;
+		unsigned has_mbec:1;
+		unsigned:2;

		/*
		 * This is left at the top of the word so that
diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h
index de3abec84fe5..9a98c063148c 100644
--- a/arch/x86/include/asm/vmx.h
+++ b/arch/x86/include/asm/vmx.h
@@ -547,6 +547,7 @@ enum vmcs_field {
 #define VMX_EPT_IPAT_BIT			(1ull << 6)
 #define VMX_EPT_ACCESS_BIT			(1ull << 8)
 #define VMX_EPT_DIRTY_BIT			(1ull << 9)
+#define VMX_EPT_USER_EXECUTABLE_MASK		(1ull << 10)
 #define VMX_EPT_SUPPRESS_VE_BIT			(1ull << 63)
 #define VMX_EPT_RWX_MASK			(VMX_EPT_READABLE_MASK |	\
						 VMX_EPT_WRITABLE_MASK |	\
@@ -605,9 +606,12 @@ enum vm_entry_failure_code {
 #define EPT_VIOLATION_GVA_TRANSLATED	BIT(8)

 #define EPT_VIOLATION_RWX_TO_PROT(__epte)	(((__epte) & VMX_EPT_RWX_MASK) << 3)
+#define EPT_VIOLATION_USER_EXEC_TO_PROT(__epte)	(((__epte) & VMX_EPT_USER_EXECUTABLE_MASK) >> 4)

 static_assert(EPT_VIOLATION_RWX_TO_PROT(VMX_EPT_RWX_MASK) ==
	      (EPT_VIOLATION_PROT_READ | EPT_VIOLATION_PROT_WRITE | EPT_VIOLATION_PROT_EXEC));
+static_assert(EPT_VIOLATION_USER_EXEC_TO_PROT(VMX_EPT_USER_EXECUTABLE_MASK) ==
+	      (EPT_VIOLATION_PROT_USER_EXEC));

 /*
 * Exit Qualifications for NOTIFY VM EXIT
diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h
index f63074048ec6..558a15ff82e6 100644
--- a/arch/x86/kvm/mmu.h
+++ b/arch/x86/kvm/mmu.h
@@ -81,7 +81,8 @@ u8 kvm_mmu_get_max_tdp_level(void);
 void kvm_mmu_set_mmio_spte_mask(u64 mmio_value, u64 mmio_mask, u64 access_mask);
 void kvm_mmu_set_mmio_spte_value(struct kvm *kvm, u64 mmio_value);
 void kvm_mmu_set_me_spte_mask(u64 me_value, u64 me_mask);
-void kvm_mmu_set_ept_masks(bool has_ad_bits, bool has_exec_only);
+void kvm_mmu_set_ept_masks(bool has_ad_bits, bool has_exec_only,
+			   bool has_mbec);

 void kvm_init_mmu(struct kvm_vcpu *vcpu);
 void kvm_init_shadow_npt_mmu(struct kvm_vcpu *vcpu, unsigned long cr0,
@@ -174,6 +175,11 @@ static inline void kvm_mmu_refresh_passthrough_bits(struct kvm_vcpu *vcpu,
	__kvm_mmu_refresh_passthrough_bits(vcpu, mmu);
 }

+static inline bool mmu_has_mbec(struct kvm_vcpu *vcpu)
+{
+	return vcpu->arch.mmu->root_role.has_mbec;
+}
+
 /*
 * Check if a given access (described through the I/D, W/R and U/S bits of a
 * page fault error code pfec) causes a permission fault with the given PTE
diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index b1a7c7cc0c44..b0eb8d4c5ef2 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -2033,7 +2033,7 @@ static bool kvm_sync_page_check(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
	 */
	const union kvm_mmu_page_role sync_role_ign = {
		.level = 0xf,
-		.access = 0x7,
+		.access = 0xf,
		.quadrant = 0x3,
		.passthrough = 0x1,
	};
@@ -3080,7 +3080,7 @@ static int mmu_set_spte(struct kvm_vcpu *vcpu, struct kvm_memory_slot *slot,
		ret = RET_PF_SPURIOUS;
	} else {
		flush |= mmu_spte_update(sptep, spte);
-		trace_kvm_mmu_set_spte(level, gfn, sptep);
+		trace_kvm_mmu_set_spte(vcpu, level, gfn, sptep);
	}

	if (wrprot && write_fault)
@@ -3452,7 +3452,7 @@ static int direct_map(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
		if (it.level == fault->goal_level)
			break;

-		sp = kvm_mmu_get_child_sp(vcpu, it.sptep, base_gfn, true, ACC_RWX);
+		sp = kvm_mmu_get_child_sp(vcpu, it.sptep, base_gfn, true, ACC_ALL);
		if (sp == ERR_PTR(-EEXIST))
			continue;
@@ -3465,7 +3465,7 @@ static int direct_map(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
	if (WARN_ON_ONCE(it.level != fault->goal_level))
		return -EFAULT;

-	ret = mmu_set_spte(vcpu, fault->slot, it.sptep, ACC_RWX,
+	ret = mmu_set_spte(vcpu, fault->slot, it.sptep, ACC_ALL,
			   base_gfn, fault->pfn, fault);
	if (ret == RET_PF_SPURIOUS)
		return ret;
@@ -3698,9 +3698,9 @@ static int fast_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
		 * current CPU took the fault.
		 *
		 * Need not check the access of upper level table entries since
-		 * they are always ACC_RWX.
+		 * they are always ACC_ALL.
		 */
-		if (is_access_allowed(fault, spte)) {
+		if (is_access_allowed(fault, spte, vcpu)) {
			ret = RET_PF_SPURIOUS;
			break;
		}
@@ -3748,7 +3748,7 @@ static int fast_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
		/* Verify that the fault can be handled in the fast path */
		if (new_spte == spte ||
-		    !is_access_allowed(fault, new_spte))
+		    !is_access_allowed(fault, new_spte, vcpu))
			break;

		/*
@@ -4804,7 +4804,7 @@ static int direct_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault
	if (r)
		return r;

-	r = kvm_mmu_faultin_pfn(vcpu, fault, ACC_RWX);
+	r = kvm_mmu_faultin_pfn(vcpu, fault, ACC_ALL);
	if (r != RET_PF_CONTINUE)
		return r;
@@ -4895,7 +4895,7 @@ static int kvm_tdp_mmu_page_fault(struct kvm_vcpu *vcpu,
	if (r)
		return r;

-	r = kvm_mmu_faultin_pfn(vcpu, fault, ACC_RWX);
+	r = kvm_mmu_faultin_pfn(vcpu, fault, ACC_ALL);
	if (r != RET_PF_CONTINUE)
		return r;
@@ -5614,7 +5614,7 @@ static union kvm_cpu_role kvm_calc_cpu_role(struct kvm_vcpu *vcpu,
 {
	union kvm_cpu_role role = {0};

-	role.base.access = ACC_RWX;
+	role.base.access = ACC_ALL;
	role.base.smm = is_smm(vcpu);
	role.base.guest_mode = is_guest_mode(vcpu);
	role.ext.valid = 1;
@@ -5695,7 +5695,7 @@ kvm_calc_tdp_mmu_root_page_role(struct kvm_vcpu *vcpu,
 {
	union kvm_mmu_page_role role = {0};

-	role.access = ACC_RWX;
+	role.access = ACC_ALL;
	role.cr0_wp = true;
	role.efer_nx = true;
	role.smm = cpu_role.base.smm;
@@ -5826,7 +5826,7 @@ kvm_calc_shadow_ept_root_page_role(struct kvm_vcpu *vcpu, bool accessed_dirty,
	role.base.direct = false;
	role.base.ad_disabled = !accessed_dirty;
	role.base.guest_mode = true;
-	role.base.access = ACC_RWX;
+	role.base.access = ACC_ALL;

	role.ext.word = 0;
	role.ext.execonly = execonly;
diff --git a/arch/x86/kvm/mmu/mmutrace.h b/arch/x86/kvm/mmu/mmutrace.h
index 764e3015d021..74d51370422a 100644
--- a/arch/x86/kvm/mmu/mmutrace.h
+++ b/arch/x86/kvm/mmu/mmutrace.h
@@ -22,10 +22,16 @@
	__entry->root_count = sp->root_count;			\
	__entry->unsync = sp->unsync;

+/*
+ * X == ACC_EXEC_MASK: executable without guest_exec_control and only
+ *      supervisor execute with guest exec control
+ * x == ACC_USER_EXEC_MASK: user execute with guest exec control
+ */
 #define KVM_MMU_PAGE_PRINTK() ({				        \
	const char *saved_ptr = trace_seq_buffer_ptr(p);		\
	static const char *access_str[] = {			        \
-		"---", "--x", "w--", "w-x", "-u-", "-ux", "wu-", "wux"  \
+		"----", "---X", "-w--", "-w-X", "--u-", "--uX", "-wu-", "-wuX", \
+		"x---", "x--X", "xw--", "xw-X", "x-u-", "x-uX", "xwu-", "xwuX" \
	};							        \
	union kvm_mmu_page_role role;				        \
								        \
@@ -336,8 +342,8 @@ TRACE_EVENT(

 TRACE_EVENT(
	kvm_mmu_set_spte,
-	TP_PROTO(int level, gfn_t gfn, u64 *sptep),
-	TP_ARGS(level, gfn, sptep),
+	TP_PROTO(struct kvm_vcpu *vcpu, int level, gfn_t gfn, u64 *sptep),
+	TP_ARGS(vcpu, level, gfn, sptep),

	TP_STRUCT__entry(
		__field(u64, gfn)
@@ -346,7 +352,8 @@ TRACE_EVENT(
		__field(u8, level)
		/* These depend on page entry type, so compute them now.  */
		__field(bool, r)
-		__field(bool, x)
+		__field(bool, kx)
+		__field(bool, ux)
		__field(signed char, u)
	),

@@ -356,15 +363,17 @@ TRACE_EVENT(
		__entry->sptep = virt_to_phys(sptep);
		__entry->level = level;
		__entry->r = shadow_present_mask || (__entry->spte & PT_PRESENT_MASK);
-		__entry->x = is_executable_pte(__entry->spte);
+		__entry->kx = is_executable_pte(__entry->spte, false, vcpu);
+		__entry->ux = is_executable_pte(__entry->spte, true, vcpu);
		__entry->u = shadow_user_mask ? !!(__entry->spte & shadow_user_mask) : -1;
	),

-	TP_printk("gfn %llx spte %llx (%s%s%s%s) level %d at %llx",
+	TP_printk("gfn %llx spte %llx (%s%s%s%s%s) level %d at %llx",
		  __entry->gfn, __entry->spte,
		  __entry->r ? "r" : "-",
		  __entry->spte & PT_WRITABLE_MASK ? "w" : "-",
-		  __entry->x ? "x" : "-",
+		  __entry->kx ? "X" : "-",
+		  __entry->ux ? "x" : "-",
		  __entry->u == -1 ? "" : (__entry->u ? "u" : "-"),
		  __entry->level, __entry->sptep
	)
diff --git a/arch/x86/kvm/mmu/paging_tmpl.h b/arch/x86/kvm/mmu/paging_tmpl.h
index ed762bb4b007..664b424108ed 100644
--- a/arch/x86/kvm/mmu/paging_tmpl.h
+++ b/arch/x86/kvm/mmu/paging_tmpl.h
@@ -124,12 +124,17 @@ static inline void FNAME(protect_clean_gpte)(struct kvm_mmu *mmu, unsigned *acce
	*access &= mask;
 }

-static inline int FNAME(is_present_gpte)(unsigned long pte)
+static inline int FNAME(is_present_gpte)(struct kvm_vcpu *vcpu,
+					 unsigned long pte)
 {
 #if PTTYPE != PTTYPE_EPT
	return pte & PT_PRESENT_MASK;
 #else
-	return pte & 7;
+	/*
+	 * For EPT, an entry is present if any of bits 2:0 are set.
+	 * With mode-based execute control, bit 10 also indicates presence.
+	 */
+	return (pte & 7) || (mmu_has_mbec(vcpu) && (pte & (1ULL << 10)));
 #endif
 }

@@ -152,7 +157,7 @@ static bool FNAME(prefetch_invalid_gpte)(struct kvm_vcpu *vcpu,
				  struct kvm_mmu_page *sp, u64 *spte,
				  u64 gpte)
 {
-	if (!FNAME(is_present_gpte)(gpte))
+	if (!FNAME(is_present_gpte)(vcpu, gpte))
		goto no_present;

	/* Prefetch only accessed entries (unless A/D bits are disabled). */
@@ -181,8 +186,9 @@ static inline unsigned FNAME(gpte_access)(u64 gpte)
	unsigned access;
 #if PTTYPE == PTTYPE_EPT
	access = ((gpte & VMX_EPT_WRITABLE_MASK) ? ACC_WRITE_MASK : 0) |
-		((gpte & VMX_EPT_EXECUTABLE_MASK) ? ACC_EXEC_MASK : 0) |
-		((gpte & VMX_EPT_READABLE_MASK) ? ACC_USER_MASK : 0);
+		((gpte & VMX_EPT_EXECUTABLE_MASK) ? ACC_EXEC_MASK : 0) |
+		((gpte & VMX_EPT_USER_EXECUTABLE_MASK) ? ACC_USER_EXEC_MASK : 0) |
+		((gpte & VMX_EPT_READABLE_MASK) ? ACC_USER_MASK : 0);
 #else
	BUILD_BUG_ON(ACC_EXEC_MASK != PT_PRESENT_MASK);
	BUILD_BUG_ON(ACC_EXEC_MASK != 1);
@@ -332,7 +338,7 @@ static int FNAME(walk_addr_generic)(struct guest_walker *walker,
	if (walker->level == PT32E_ROOT_LEVEL) {
		pte = mmu->get_pdptr(vcpu, (addr >> 30) & 3);
		trace_kvm_mmu_paging_element(pte, walker->level);
-		if (!FNAME(is_present_gpte)(pte))
+		if (!FNAME(is_present_gpte)(vcpu, pte))
			goto error;
		--walker->level;
	}
@@ -414,7 +420,7 @@ static int FNAME(walk_addr_generic)(struct guest_walker *walker,
		 */
		pte_access = pt_access & (pte ^ walk_nx_mask);

-		if (unlikely(!FNAME(is_present_gpte)(pte)))
+		if (unlikely(!FNAME(is_present_gpte)(vcpu, pte)))
			goto error;

		if (unlikely(FNAME(is_rsvd_bits_set)(mmu, pte, walker->level))) {
@@ -493,6 +499,7 @@ static int FNAME(walk_addr_generic)(struct guest_walker *walker,
	 *   out of date if it is serving an EPT misconfiguration.
	 * [5:3] - Calculated by the page walk of the guest EPT page tables
	 * [7:8] - Derived from [7:8] of real exit_qualification
+	 * [10] - Derived from real exit_qualification, useful only with MBEC.
	 *
	 * The other bits are set to 0.
	 */
@@ -511,6 +518,9 @@ static int FNAME(walk_addr_generic)(struct guest_walker *walker,
		 * ACC_*_MASK flags!
		 */
		walker->fault.exit_qualification |= EPT_VIOLATION_RWX_TO_PROT(pte_access);
+		if (mmu_has_mbec(vcpu))
+			walker->fault.exit_qualification |=
+				EPT_VIOLATION_USER_EXEC_TO_PROT(pte_access);
	}
 #endif
	walker->fault.address = addr;
diff --git a/arch/x86/kvm/mmu/spte.c b/arch/x86/kvm/mmu/spte.c
index 37647afde7d3..a4720eedcacb 100644
--- a/arch/x86/kvm/mmu/spte.c
+++ b/arch/x86/kvm/mmu/spte.c
@@ -30,6 +30,7 @@ u64 __read_mostly shadow_host_writable_mask;
 u64 __read_mostly shadow_mmu_writable_mask;
 u64 __read_mostly shadow_nx_mask;
 u64 __read_mostly shadow_x_mask; /* mutual exclusive with nx_mask */
+u64 __read_mostly shadow_ux_mask;
 u64 __read_mostly shadow_user_mask;
 u64 __read_mostly shadow_accessed_mask;
 u64 __read_mostly shadow_dirty_mask;
@@ -223,19 +224,38 @@ bool make_spte(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
	 * would tie make_spte() further to vCPU/MMU state, and add complexity
	 * just to optimize a mode that is anything but performance critical.
	 */
-	if (level > PG_LEVEL_4K && (pte_access & ACC_EXEC_MASK) &&
+	if (level > PG_LEVEL_4K &&
+	    (pte_access & (ACC_EXEC_MASK | ACC_USER_EXEC_MASK)) &&
	    is_nx_huge_page_enabled(vcpu->kvm)) {
		pte_access &= ~ACC_EXEC_MASK;
+		if (mmu_has_mbec(vcpu))
+			pte_access &= ~ACC_USER_EXEC_MASK;
	}

-	if (pte_access & ACC_EXEC_MASK)
-		spte |= shadow_x_mask;
-	else
+	if (pte_access & (ACC_EXEC_MASK | ACC_USER_EXEC_MASK)) {
+		if (pte_access & ACC_EXEC_MASK)
+			spte |= shadow_x_mask;
+
+		if (pte_access & ACC_USER_EXEC_MASK)
+			spte |= shadow_ux_mask;
+	} else {
		spte |= shadow_nx_mask;
+	}

	if (pte_access & ACC_USER_MASK)
		spte |= shadow_user_mask;

+	/*
+	 * With MBEC enabled, EPT misconfigurations occur if bit 0 is clear
+	 * (read disabled) and bit 10 is set (user-executable). Prevent the
+	 * creation of such invalid SPTEs by clearing the user-executable bit
+	 * when read access is not permitted.
+	 */
+	if (mmu_has_mbec(vcpu) &&
+	    !(spte & VMX_EPT_READABLE_MASK) &&
+	    (spte & VMX_EPT_USER_EXECUTABLE_MASK))
+		spte &= ~VMX_EPT_USER_EXECUTABLE_MASK;
+
	if (level > PG_LEVEL_4K)
		spte |= PT_PAGE_SIZE_MASK;
@@ -311,17 +331,21 @@ static u64 modify_spte_protections(u64 spte, u64 set, u64 clear)
	KVM_MMU_WARN_ON(set & clear);

	spte = (spte | set) & ~clear;

+	/*
+	 * With MBEC enabled, ensure we don't create invalid SPTEs where
+	 * read access is disabled but user-executable access is enabled.
+	 */
+	if (shadow_ux_mask &&
+	    !(spte & VMX_EPT_READABLE_MASK) &&
+	    (spte & VMX_EPT_USER_EXECUTABLE_MASK))
+		spte &= ~VMX_EPT_USER_EXECUTABLE_MASK;
+
	if (is_access_track)
		spte = mark_spte_for_access_track(spte);

	return spte;
 }

-static u64 make_spte_executable(u64 spte)
-{
-	return modify_spte_protections(spte, shadow_x_mask, shadow_nx_mask);
-}
-
 static u64 make_spte_nonexecutable(u64 spte)
 {
	return modify_spte_protections(spte, shadow_nx_mask, shadow_x_mask);
@@ -356,8 +380,14 @@ u64 make_small_spte(struct kvm *kvm, u64 huge_spte,
		 * the page executable as the NX hugepage mitigation no longer
		 * applies.
		 */
-		if ((role.access & ACC_EXEC_MASK) && is_nx_huge_page_enabled(kvm))
-			child_spte = make_spte_executable(child_spte);
+		if ((role.access & (ACC_EXEC_MASK | ACC_USER_EXEC_MASK)) &&
+		    is_nx_huge_page_enabled(kvm)) {
+			if (role.access & ACC_EXEC_MASK)
+				child_spte |= shadow_x_mask;
+
+			if (role.access & ACC_USER_EXEC_MASK)
+				child_spte |= shadow_ux_mask;
+		}
	}

	return child_spte;
@@ -389,7 +419,8 @@ u64 make_nonleaf_spte(u64 *child_pt, bool ad_disabled)
	u64 spte = SPTE_MMU_PRESENT_MASK;

	spte |= __pa(child_pt) | shadow_present_mask | PT_WRITABLE_MASK |
-		shadow_user_mask | shadow_x_mask | shadow_me_value;
+		shadow_user_mask | shadow_ux_mask | shadow_x_mask |
+		shadow_me_value;

	if (ad_disabled)
		spte |= SPTE_TDP_AD_DISABLED;
@@ -489,7 +520,8 @@ void kvm_mmu_set_me_spte_mask(u64 me_value, u64 me_mask)
 }
 EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_mmu_set_me_spte_mask);

-void kvm_mmu_set_ept_masks(bool has_ad_bits, bool has_exec_only)
+void kvm_mmu_set_ept_masks(bool has_ad_bits, bool has_exec_only,
+			   bool has_mbec)
 {
	kvm_ad_enabled = has_ad_bits;
@@ -501,7 +533,7 @@ void kvm_mmu_set_ept_masks(bool has_ad_bits, bool has_exec_only)
	/* VMX_EPT_SUPPRESS_VE_BIT is needed for W or X violation. */
	shadow_present_mask =
		(has_exec_only ? 0ull : VMX_EPT_READABLE_MASK) | VMX_EPT_SUPPRESS_VE_BIT;
-
+	shadow_ux_mask	= has_mbec ? VMX_EPT_USER_EXECUTABLE_MASK : 0ull;
	shadow_acc_track_mask	= VMX_EPT_RWX_MASK;
	shadow_host_writable_mask = EPT_SPTE_HOST_WRITABLE;
	shadow_mmu_writable_mask  = EPT_SPTE_MMU_WRITABLE;

	/*
	 * EPT Misconfigurations are generated if the value of bits 2:0
	 * of an EPT paging-structure entry is 110b (write/execute).
+	 * With MBEC, the additional case of bit 0 clear and bit 10 set
+	 * (read disabled but user-executable) is prevented in make_spte().
	 */
	kvm_mmu_set_mmio_spte_mask(VMX_EPT_MISCONFIG_WX_VALUE,
-				   VMX_EPT_RWX_MASK | VMX_EPT_SUPPRESS_VE_BIT, 0);
+				   (VMX_EPT_RWX_MASK | VMX_EPT_SUPPRESS_VE_BIT), 0);
 }
 EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_mmu_set_ept_masks);
@@ -551,6 +585,7 @@ void kvm_mmu_reset_all_pte_masks(void)
	shadow_dirty_mask	= PT_DIRTY_MASK;
	shadow_nx_mask		= PT64_NX_MASK;
	shadow_x_mask		= 0;
+	shadow_ux_mask		= 0;
	shadow_present_mask	= PT_PRESENT_MASK;

	shadow_acc_track_mask	= 0;
diff --git a/arch/x86/kvm/mmu/spte.h b/arch/x86/kvm/mmu/spte.h
index 101a2f61ec96..74fb1fe60d89 100644
--- a/arch/x86/kvm/mmu/spte.h
+++ b/arch/x86/kvm/mmu/spte.h
@@ -45,8 +45,9 @@ static_assert(SPTE_TDP_AD_ENABLED == 0);
 #define ACC_EXEC_MASK		1
 #define ACC_WRITE_MASK		PT_WRITABLE_MASK
 #define ACC_USER_MASK		PT_USER_MASK
+#define ACC_USER_EXEC_MASK	(1ULL << 3)
 #define ACC_RWX			(ACC_EXEC_MASK | ACC_WRITE_MASK | ACC_USER_MASK)
-#define ACC_ALL			ACC_RWX
+#define ACC_ALL			(ACC_RWX | ACC_USER_EXEC_MASK)

 /* The mask for the R/X bits in EPT PTEs */
 #define SPTE_EPT_READABLE_MASK	0x1ull
@@ -180,6 +181,7 @@ extern u64 __read_mostly shadow_mmu_writable_mask;
 extern u64 __read_mostly shadow_nx_mask;
 extern u64 __read_mostly shadow_x_mask; /* mutual exclusive with nx_mask */
 extern u64 __read_mostly shadow_user_mask;
+extern u64 __read_mostly shadow_ux_mask;
 extern u64 __read_mostly shadow_accessed_mask;
 extern u64 __read_mostly shadow_dirty_mask;
 extern u64 __read_mostly shadow_mmio_value;
@@ -344,9 +346,53 @@ static inline bool is_last_spte(u64 pte, int level)
	return (level == PG_LEVEL_4K) || is_large_pte(pte);
 }

-static inline bool is_executable_pte(u64 spte)
+static inline bool is_executable_pte(u64 spte, bool is_user_access,
+				     struct kvm_vcpu *vcpu)
 {
-	return (spte & (shadow_x_mask | shadow_nx_mask)) == shadow_x_mask;
+	if (spte & shadow_nx_mask)
+		return false;
+
+	if (!mmu_has_mbec(vcpu))
+		return (spte & shadow_x_mask) == shadow_x_mask;
+
+	/*
+	 * Warn against AMD systems (where shadow_x_mask == 0) reaching
+	 * this point, so this will always evaluate to true for user-mode
+	 * pages, but until GMET is implemented, this should be a no-op.
+	 */
+	if (WARN_ON_ONCE(!shadow_x_mask))
+		return is_user_access || !(spte & shadow_user_mask);
+
+	return spte & (is_user_access ? shadow_ux_mask : shadow_x_mask);
+}
+
+static inline bool is_executable_pte_fault(u64 spte,
+					   struct kvm_page_fault *fault,
+					   struct kvm_vcpu *vcpu)
+{
+	if (spte & shadow_nx_mask)
+		return false;
+
+	if (!mmu_has_mbec(vcpu))
+		return (spte & shadow_x_mask) == shadow_x_mask;
+
+	/*
+	 * Warn against AMD systems (where shadow_x_mask == 0) reaching
+	 * this point, so this will always evaluate to true for user-mode
+	 * pages, but until GMET is implemented, this should be a no-op.
+	 */
+	if (WARN_ON_ONCE(!shadow_x_mask))
+		return fault->user || !(spte & shadow_user_mask);
+
+	/*
+	 * For TDP MMU, the fault->user bit indicates a read access,
+	 * not the guest's CPL. For execute faults, check both execute
+	 * permissions since we don't know the actual CPL.
+	 */
+	if (fault->is_tdp)
+		return spte & (shadow_x_mask | shadow_ux_mask);
+
+	return spte & (fault->user ? shadow_ux_mask : shadow_x_mask);
 }

 static inline kvm_pfn_t spte_to_pfn(u64 pte)
@@ -479,10 +525,11 @@ static inline bool is_mmu_writable_spte(u64 spte)
 * SPTE protections.  Note, the caller is responsible for checking that the
 * SPTE is a shadow-present, leaf SPTE (either before or after).
 */
-static inline bool is_access_allowed(struct kvm_page_fault *fault, u64 spte)
+static inline bool is_access_allowed(struct kvm_page_fault *fault, u64 spte,
+				     struct kvm_vcpu *vcpu)
 {
	if (fault->exec)
-		return is_executable_pte(spte);
+		return is_executable_pte_fault(spte, fault, vcpu);

	if (fault->write)
		return is_writable_pte(spte);
diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c
index 98221ed34c51..46988a11dc51 100644
--- a/arch/x86/kvm/mmu/tdp_mmu.c
+++ b/arch/x86/kvm/mmu/tdp_mmu.c
@@ -1183,16 +1183,20 @@ static int tdp_mmu_map_handle_target_level(struct kvm_vcpu *vcpu,
		return RET_PF_RETRY;

	if (is_shadow_present_pte(iter->old_spte) &&
-	    (fault->prefetch || is_access_allowed(fault, iter->old_spte)) &&
+	    (fault->prefetch ||
+	     is_access_allowed(fault, iter->old_spte, vcpu)) &&
	    is_last_spte(iter->old_spte, iter->level)) {
		WARN_ON_ONCE(fault->pfn != spte_to_pfn(iter->old_spte));
		return RET_PF_SPURIOUS;
	}

	if (unlikely(!fault->slot))
-		new_spte = make_mmio_spte(vcpu, iter->gfn, ACC_RWX);
+		new_spte = make_mmio_spte(vcpu, iter->gfn,
+					  ACC_ALL);
	else
-		wrprot = make_spte(vcpu, sp, fault->slot, ACC_RWX, iter->gfn,
+		wrprot = make_spte(vcpu, sp, fault->slot,
+				   ACC_ALL,
+				   iter->gfn,
				   fault->pfn, iter->old_spte, fault->prefetch,
				   false, fault->map_writable, &new_spte);
@@ -1220,7 +1224,7 @@ static int tdp_mmu_map_handle_target_level(struct kvm_vcpu *vcpu,
					    new_spte);
		ret = RET_PF_EMULATE;
	} else {
-		trace_kvm_mmu_set_spte(iter->level, iter->gfn,
+		trace_kvm_mmu_set_spte(vcpu, iter->level, iter->gfn,
				       rcu_dereference(iter->sptep));
	}
diff --git a/arch/x86/kvm/vmx/capabilities.h b/arch/x86/kvm/vmx/capabilities.h
index 02aadb9d730e..8107c8cb1d4b 100644
--- a/arch/x86/kvm/vmx/capabilities.h
+++ b/arch/x86/kvm/vmx/capabilities.h
@@ -270,6 +270,12 @@ static inline bool cpu_has_vmx_tsc_scaling(void)
	       SECONDARY_EXEC_TSC_SCALING;
 }

+static inline bool cpu_has_vmx_mode_based_ept_exec(void)
+{
+	return vmcs_config.cpu_based_2nd_exec_ctrl &
+	       SECONDARY_EXEC_MODE_BASED_EPT_EXEC;
+}
+
 static inline bool cpu_has_vmx_bus_lock_detection(void)
 {
	return vmcs_config.cpu_based_2nd_exec_ctrl &
diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index b6e370213769..520ccca27502 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -8607,7 +8607,8 @@ __init int vmx_hardware_setup(void)

	if (enable_ept)
		kvm_mmu_set_ept_masks(enable_ept_ad_bits,
-				      cpu_has_vmx_ept_execute_only());
+				      cpu_has_vmx_ept_execute_only(),
+				      cpu_has_vmx_mode_based_ept_exec());
	else
		vt_x86_ops.get_mt_mask = NULL;
-- 
2.43.0

Extend __vmx_handle_ept_violation() to understand mmu_has_mbec and the
differences between user mode and kernel mode fetches.

Add a synthetic PF bit, PFERR_USER_FETCH_MASK, set in the EPT violation
handler's error_code to signal to permission_fault() that the EPT
violation was a user mode instruction fetch.

Extend permission_fault() and route MBEC-enabled MMUs to a special
handler, mbec_permission_fault(), since permission_fault() can no longer
trivially shift to figure out whether there was a permission fault.
Signed-off-by: Jon Kohler
---
 arch/x86/include/asm/kvm_host.h |  8 +++-
 arch/x86/kvm/mmu.h              |  7 +++-
 arch/x86/kvm/mmu/mmu.c          | 66 +++++++++++++++++++++++++++++++++
 arch/x86/kvm/mmu/spte.h         | 14 ++++---
 arch/x86/kvm/vmx/common.h       | 22 ++++++-----
 5 files changed, 100 insertions(+), 17 deletions(-)

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 66afcff43ec5..99381c55fceb 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -286,7 +286,13 @@ enum x86_intercept_stage;
 * when the guest was accessing private memory.
 */
 #define PFERR_PRIVATE_ACCESS	BIT_ULL(49)
-#define PFERR_SYNTHETIC_MASK	(PFERR_IMPLICIT_ACCESS | PFERR_PRIVATE_ACCESS)
+/*
+ * USER_FETCH_MASK is a KVM-defined flag used to indicate user fetches when
+ * translating EPT violations for Intel MBEC.
+ */
+#define PFERR_USER_FETCH_MASK	BIT_ULL(50)
+#define PFERR_SYNTHETIC_MASK	(PFERR_IMPLICIT_ACCESS | PFERR_PRIVATE_ACCESS | \
+				 PFERR_USER_FETCH_MASK)

 /* apic attention bits */
 #define KVM_APIC_CHECK_VAPIC	0
diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h
index 558a15ff82e6..d7bf679183f7 100644
--- a/arch/x86/kvm/mmu.h
+++ b/arch/x86/kvm/mmu.h
@@ -95,6 +95,8 @@ int kvm_handle_page_fault(struct kvm_vcpu *vcpu, u64 error_code,
			  u64 fault_address, char *insn, int insn_len);
 void __kvm_mmu_refresh_passthrough_bits(struct kvm_vcpu *vcpu,
					struct kvm_mmu *mmu);
+bool mbec_permission_fault(struct kvm_vcpu *vcpu, unsigned int pte_access,
+			   unsigned int pfec);

 int kvm_mmu_load(struct kvm_vcpu *vcpu);
 void kvm_mmu_unload(struct kvm_vcpu *vcpu);
@@ -216,7 +218,10 @@ static inline u8 permission_fault(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
		kvm_mmu_refresh_passthrough_bits(vcpu, mmu);

-	fault = (mmu->permissions[index] >> pte_access) & 1;
+	if (mmu_has_mbec(vcpu))
+		fault = mbec_permission_fault(vcpu, pte_access, pfec);
+	else
+		fault = (mmu->permissions[index] >> pte_access) & 1;

	WARN_ON_ONCE(pfec & (PFERR_PK_MASK | PFERR_SS_MASK | PFERR_RSVD_MASK));
	if (unlikely(mmu->pkru_mask)) {
diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index b0eb8d4c5ef2..673f2cebc36c 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -5664,6 +5664,72 @@ void __kvm_mmu_refresh_passthrough_bits(struct kvm_vcpu *vcpu,
	reset_guest_paging_metadata(vcpu, mmu);
 }

+/*
+ * Check permissions for MBEC-enabled EPT accesses.
+ * Handles all permission checks with MBEC awareness (UX/KX distinction).
+ *
+ * Returns true if access should fault, false otherwise.
+ */
+bool mbec_permission_fault(struct kvm_vcpu *vcpu, unsigned int pte_access,
+			   unsigned int pfec)
+{
+	bool has_ux = pte_access & ACC_USER_EXEC_MASK;
+	bool has_kx = pte_access & ACC_EXEC_MASK;
+	bool write_fault = false;
+	bool fetch_fault = false;
+	bool read_fault = false;
+
+	/*
+	 * Fault conditions:
+	 * - Write fault: pfec has WRITE_MASK set but pte_access lacks
+	 *   WRITE permission
+	 * - Fetch fault: pfec has FETCH_MASK set but pte_access lacks
+	 *   matching execute permission. For MBEC, checks both guest PTE
+	 *   U/S bits and CPL, both are additive:
+	 *   * If neither UX nor KX is set:
+	 *     always fault (no execute permission at all)
+	 *   * User fetch (guest PTE user OR CPL > 0):
+	 *     requires UX permission (has_ux)
+	 *   * Kernel fetch (guest PTE supervisor AND CPL = 0):
+	 *     requires KX permission (has_kx)
+	 * - Read fault: pfec has USER_MASK set (read access in EPT
+	 *   context) but pte_access lacks read permission
+	 *
+	 * Note: In EPT context, PFERR_USER_MASK indicates read access,
+	 * not user-mode access. This is different from regular paging
+	 * where PFERR_USER_MASK means user-mode (CPL=3).
+	 * ACC_USER_MASK in EPT context maps to VMX_EPT_READABLE_MASK
+	 * (bit 0), the readable permission.
+	 */
+
+	/* Check write permission independently */
+	if (pfec & PFERR_WRITE_MASK)
+		write_fault = !(pte_access & ACC_WRITE_MASK);
+
+	/* Check fetch permission independently */
+	if (pfec & PFERR_FETCH_MASK) {
+		/*
+		 * For MBEC, check execute permissions. A fetch faults if:
+		 * - User fetch (guest PTE user OR CPL > 0) lacks UX permission
+		 * - Kernel fetch (guest PTE supervisor AND CPL = 0) lacks KX permission
+		 */
+		bool is_user_fetch = (pfec & PFERR_USER_FETCH_MASK) ||
+				     (kvm_x86_call(get_cpl)(vcpu) > 0);
+
+		/*
+		 * A user-mode fetch requires user-execute permission (UX).
+		 * A kernel-mode fetch requires kernel-execute permission (KX).
+		 */
+		fetch_fault = is_user_fetch ? !has_ux : !has_kx;
+	}
+
+	/* Check read permission: PFERR_USER_MASK indicates read in EPT */
+	if (pfec & PFERR_USER_MASK)
+		read_fault = !(pte_access & ACC_USER_MASK);
+
+	return write_fault || fetch_fault || read_fault;
+}
+
 static inline int kvm_mmu_get_tdp_level(struct kvm_vcpu *vcpu)
 {
	int maxpa;
diff --git a/arch/x86/kvm/mmu/spte.h b/arch/x86/kvm/mmu/spte.h
index 74fb1fe60d89..cb94f039898d 100644
--- a/arch/x86/kvm/mmu/spte.h
+++ b/arch/x86/kvm/mmu/spte.h
@@ -383,14 +383,18 @@ static inline bool is_executable_pte_fault(u64 spte,
	 */
	if (WARN_ON_ONCE(!shadow_x_mask))
		return fault->user || !(spte & shadow_user_mask);
-
	/*
-	 * For TDP MMU, the fault->user bit indicates a read access,
-	 * not the guest's CPL. For execute faults, check both execute
-	 * permissions since we don't know the actual CPL.
+	 * For TDP MMU, fault->user indicates a read access, not CPL.
+	 * For execute faults, we don't know the CPL here, so we can't
+	 * definitively check permissions. Being optimistic and checking
+	 * for any execute permission can lead to infinite fault loops
+	 * if the wrong type of execute permission is present (e.g. UX
+	 * only for a kernel fetch). The safe approach is to be
+	 * pessimistic and return false, forcing the fault to the slow
+	 * path which can do a full permission check.
	 */
	if (fault->is_tdp)
-		return spte & (shadow_x_mask | shadow_ux_mask);
+		return false;

	return spte & (fault->user ? shadow_ux_mask : shadow_x_mask);
 }
diff --git a/arch/x86/kvm/vmx/common.h b/arch/x86/kvm/vmx/common.h
index adf925500b9e..96bdca78696d 100644
--- a/arch/x86/kvm/vmx/common.h
+++ b/arch/x86/kvm/vmx/common.h
@@ -83,6 +83,7 @@ static inline bool vt_is_tdx_private_gpa(struct kvm *kvm, gpa_t gpa)
 static inline int __vmx_handle_ept_violation(struct kvm_vcpu *vcpu, gpa_t gpa,
					     unsigned long exit_qualification)
 {
+	unsigned long rwx_mask;
	u64 error_code;

	/* Is it a read fault? */
@@ -92,16 +93,17 @@ static inline int __vmx_handle_ept_violation(struct kvm_vcpu *vcpu, gpa_t gpa,
	error_code |= (exit_qualification & EPT_VIOLATION_ACC_WRITE)
		      ? PFERR_WRITE_MASK : 0;
	/* Is it a fetch fault? */
-	error_code |= (exit_qualification & EPT_VIOLATION_ACC_INSTR)
-		      ? PFERR_FETCH_MASK : 0;
-	/*
-	 * ept page table entry is present?
-	 * note: unconditionally clear USER_EXEC until mode-based
-	 * execute control is implemented
-	 */
-	error_code |= (exit_qualification &
-		       (EPT_VIOLATION_PROT_MASK & ~EPT_VIOLATION_PROT_USER_EXEC))
-		      ? PFERR_PRESENT_MASK : 0;
+	if (exit_qualification & EPT_VIOLATION_ACC_INSTR) {
+		error_code |= PFERR_FETCH_MASK;
+		if (mmu_has_mbec(vcpu) &&
+		    exit_qualification & EPT_VIOLATION_PROT_USER_EXEC)
+			error_code |= PFERR_USER_FETCH_MASK;
+	}
+	/* ept page table entry is present? */
+	rwx_mask = EPT_VIOLATION_PROT_MASK;
+	if (mmu_has_mbec(vcpu))
+		rwx_mask |= EPT_VIOLATION_PROT_USER_EXEC;
+	error_code |= (exit_qualification & rwx_mask) ? PFERR_PRESENT_MASK : 0;

	if (exit_qualification & EPT_VIOLATION_GVA_IS_VALID)
		error_code |= (exit_qualification & EPT_VIOLATION_GVA_TRANSLATED) ?
-- 
2.43.0

Extend EVMCS1_SUPPORTED_2NDEXEC to allow MBEC and EVMCS to coexist.
Presenting both EVMCS and MBEC simultaneously currently causes KVM to
filter out MBEC and not present it as a supported control to the guest,
preventing performance gains from MBEC when Windows HVCI is enabled.

The guest may choose not to use MBEC (e.g., if the admin does not enable
Windows HVCI / Memory Integrity), but if they use traditional nested virt
(Hyper-V, WSL2, etc.), having EVMCS exposed is important for improving
nested guest performance. IOW, allowing MBEC and EVMCS to coexist
provides maximum optionality to Windows users without overcomplicating
VM administration.

Signed-off-by: Jon Kohler
---
 arch/x86/kvm/vmx/hyperv_evmcs.h | 1 +
 1 file changed, 1 insertion(+)

diff --git a/arch/x86/kvm/vmx/hyperv_evmcs.h b/arch/x86/kvm/vmx/hyperv_evmcs.h
index 6536290f4274..0568f76aafc1 100644
--- a/arch/x86/kvm/vmx/hyperv_evmcs.h
+++ b/arch/x86/kvm/vmx/hyperv_evmcs.h
@@ -87,6 +87,7 @@
	 SECONDARY_EXEC_PT_CONCEAL_VMX |			\
	 SECONDARY_EXEC_BUS_LOCK_DETECTION |			\
	 SECONDARY_EXEC_NOTIFY_VM_EXITING |			\
+	 SECONDARY_EXEC_MODE_BASED_EPT_EXEC |			\
	 SECONDARY_EXEC_ENCLS_EXITING)

 #define EVMCS1_SUPPORTED_3RDEXEC (0ULL)
-- 
2.43.0

Add SECONDARY_EXEC_MODE_BASED_EPT_EXEC as an optional secondary execution
control bit; however, this is not used by L1 VMs, so filter it out,
similar to how VMFUNC is treated.

Advertise SECONDARY_EXEC_MODE_BASED_EPT_EXEC (MBEC) to userspace, which
allows userspace to expose and advertise the feature to the guest. When
MBEC is enabled by userspace, set has_mbec in the MMU root_role.

Signed-off-by: Jon Kohler
---
 arch/x86/kvm/vmx/nested.c | 6 ++++++
 arch/x86/kvm/vmx/vmx.c    | 7 +++++++
 arch/x86/kvm/vmx/vmx.h    | 1 +
 3 files changed, 14 insertions(+)

diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c
index bcea087b642f..ca1f548e0703 100644
--- a/arch/x86/kvm/vmx/nested.c
+++ b/arch/x86/kvm/vmx/nested.c
@@ -474,6 +474,7 @@ static void nested_ept_new_eptp(struct kvm_vcpu *vcpu)

 static void nested_ept_init_mmu_context(struct kvm_vcpu *vcpu)
 {
+	struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
	WARN_ON(mmu_is_nested(vcpu));

	vcpu->arch.mmu = &vcpu->arch.guest_mmu;
@@ -483,6 +484,8 @@ static void nested_ept_init_mmu_context(struct kvm_vcpu *vcpu)
	vcpu->arch.mmu->get_pdptr = kvm_pdptr_read;

	vcpu->arch.walk_mmu = &vcpu->arch.nested_mmu;
+	if (nested_cpu_has2(vmcs12, SECONDARY_EXEC_MODE_BASED_EPT_EXEC))
+		vcpu->arch.mmu->root_role.has_mbec = true;
 }

 static void nested_ept_uninit_mmu_context(struct kvm_vcpu *vcpu)
@@ -7313,6 +7316,9 @@ static void nested_vmx_setup_secondary_ctls(u32 ept_caps,
			msrs->ept_caps |= VMX_EPT_AD_BIT;
	}

+	if (cpu_has_vmx_mode_based_ept_exec())
+		msrs->secondary_ctls_high |=
+			SECONDARY_EXEC_MODE_BASED_EPT_EXEC;
	/*
	 * Advertise EPTP switching irrespective of hardware support,
	 * KVM emulates it in software so long as VMFUNC is supported.
diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index 520ccca27502..e23e4ffdc1b8 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -2682,6 +2682,7 @@ static int setup_vmcs_config(struct vmcs_config *vmcs_conf,
			return -EIO;

		vmx_cap->ept = 0;
+		_cpu_based_2nd_exec_control &= ~SECONDARY_EXEC_MODE_BASED_EPT_EXEC;
		_cpu_based_2nd_exec_control &= ~SECONDARY_EXEC_EPT_VIOLATION_VE;
	}
	if (!(_cpu_based_2nd_exec_control & SECONDARY_EXEC_ENABLE_VPID) &&
@@ -4610,6 +4611,12 @@ static u32 vmx_secondary_exec_control(struct vcpu_vmx *vmx)
	 */
	exec_control &= ~SECONDARY_EXEC_ENABLE_VMFUNC;

+	/*
+	 * KVM doesn't support mode-based EPT execute control for L1, but the
+	 * capability is advertised to L1 guests so they can use it for L2.
+	 */
+	exec_control &= ~SECONDARY_EXEC_MODE_BASED_EPT_EXEC;
+
	/* SECONDARY_EXEC_DESC is enabled/disabled on writes to CR4.UMIP,
	 * in vmx_set_cr4.  */
	exec_control &= ~SECONDARY_EXEC_DESC;
diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h
index bb3d96b620b1..ef45e0ca0bb8 100644
--- a/arch/x86/kvm/vmx/vmx.h
+++ b/arch/x86/kvm/vmx/vmx.h
@@ -584,6 +584,7 @@ static inline u8 vmx_get_rvi(void)
	 SECONDARY_EXEC_ENABLE_VMFUNC |				\
	 SECONDARY_EXEC_BUS_LOCK_DETECTION |			\
	 SECONDARY_EXEC_NOTIFY_VM_EXITING |			\
+	 SECONDARY_EXEC_MODE_BASED_EPT_EXEC |			\
	 SECONDARY_EXEC_ENCLS_EXITING |				\
	 SECONDARY_EXEC_EPT_VIOLATION_VE)
-- 
2.43.0