When EPT is in use, replace ACC_USER_MASK with ACC_USER_EXEC_MASK so
that supervisor- and user-mode execution can be controlled
independently; ACC_USER_MASK alone could not express a setting such as
XU=0 XS=1 W=1 R=1.  Replace shadow_x_mask with
shadow_xs_mask/shadow_xu_mask so that the XS and XU bits can be set
separately in EPT entries.

Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/x86/include/asm/vmx.h     |  1 +
 arch/x86/kvm/mmu/mmu.c         | 15 ++++++++---
 arch/x86/kvm/mmu/mmutrace.h    |  6 ++---
 arch/x86/kvm/mmu/paging_tmpl.h |  4 +++
 arch/x86/kvm/mmu/spte.c        | 47 ++++++++++++++++++++++------------
 arch/x86/kvm/mmu/spte.h        |  8 +++---
 6 files changed, 55 insertions(+), 26 deletions(-)
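As an illustration for reviewers (not intended for the tree): the
stand-alone C sketch below models the execute-permission selection that
make_spte() performs after this patch.  Two assumptions are baked in:
exec_bits() is a hypothetical helper, not a kernel function, and it
uses the architectural MBEC encoding (XS in bit 2, XU in bit 10, per
the new VMX_EPT_USER_EXECUTABLE_MASK define), whereas this patch still
initializes both shadow_xs_mask and shadow_xu_mask to
VMX_EPT_EXECUTABLE_MASK, so the two masks only diverge once MBEC is
actually wired up.

/* Illustration only -- not kernel code. */
#include <stdint.h>
#include <stdio.h>

#define ACC_EXEC_MASK		8ull
#define ACC_USER_MASK		4ull		/* PT_USER_MASK */
#define ACC_USER_EXEC_MASK	ACC_USER_MASK	/* same bit, reused for EPT */

#define PT64_NX_MASK			(1ull << 63)
#define PT_USER_MASK			(1ull << 2)
#define VMX_EPT_EXECUTABLE_MASK		(1ull << 2)	/* XS */
#define VMX_EPT_USER_EXECUTABLE_MASK	(1ull << 10)	/* XU, MBEC only */

/*
 * Hypothetical helper mirroring the new make_spte() logic: pick the
 * NX/U bits for legacy paging, or the XS/XU bits for EPT.
 */
static uint64_t exec_bits(uint64_t pte_access, int ept)
{
	uint64_t nx = ept ? 0 : PT64_NX_MASK;
	uint64_t xs = ept ? VMX_EPT_EXECUTABLE_MASK : 0;
	uint64_t xu = ept ? VMX_EPT_USER_EXECUTABLE_MASK : 0;
	uint64_t spte = 0;

	if (nx) {
		/* Legacy paging: one (inverted) execute bit, plus U. */
		if (!(pte_access & ACC_EXEC_MASK))
			spte |= nx;
		if (pte_access & ACC_USER_MASK)
			spte |= PT_USER_MASK;
	} else {
		/* EPT: supervisor and user execute set independently. */
		if (pte_access & ACC_EXEC_MASK)
			spte |= xs;
		if (pte_access & ACC_USER_EXEC_MASK)
			spte |= xu;
	}
	return spte;
}

int main(void)
{
	/* XU=0 XS=1: expressible now, impossible with ACC_USER_MASK alone. */
	printf("EPT, supervisor-only exec: %#llx\n",
	       (unsigned long long)exec_bits(ACC_EXEC_MASK, 1));
	printf("EPT, supervisor+user exec: %#llx\n",
	       (unsigned long long)exec_bits(ACC_EXEC_MASK | ACC_USER_EXEC_MASK, 1));
	printf("legacy, non-executable:    %#llx\n",
	       (unsigned long long)exec_bits(0, 0));
	return 0;
}

The first printf demonstrates the XU=0 XS=1 encoding called out above,
which the single execute permission selected by ACC_EXEC_MASK could not
express before this change.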
"u" : "-", __entry->level, __entry->sptep ) ); diff --git a/arch/x86/kvm/mmu/paging_tmpl.h b/arch/x86/kvm/mmu/paging_tmpl.h index bbdbf4ae2d65..c657ea90bb33 100644 --- a/arch/x86/kvm/mmu/paging_tmpl.h +++ b/arch/x86/kvm/mmu/paging_tmpl.h @@ -174,6 +174,10 @@ static inline unsigned FNAME(gpte_access)(u64 gpte) { unsigned access; #if PTTYPE == PTTYPE_EPT + /* + * For now nested MBEC is not supported and permission_fault() ignores + * ACC_USER_EXEC_MASK. + */ access = ((gpte & VMX_EPT_WRITABLE_MASK) ? ACC_WRITE_MASK : 0) | ((gpte & VMX_EPT_EXECUTABLE_MASK) ? ACC_EXEC_MASK : 0) | ((gpte & VMX_EPT_READABLE_MASK) ? ACC_READ_MASK : 0); diff --git a/arch/x86/kvm/mmu/spte.c b/arch/x86/kvm/mmu/spte.c index 0b09124b0d54..0b3e2b97afbf 100644 --- a/arch/x86/kvm/mmu/spte.c +++ b/arch/x86/kvm/mmu/spte.c @@ -29,8 +29,9 @@ bool __read_mostly kvm_ad_enabled; u64 __read_mostly shadow_host_writable_mask; u64 __read_mostly shadow_mmu_writable_mask; u64 __read_mostly shadow_nx_mask; -u64 __read_mostly shadow_x_mask; /* mutual exclusive with nx_mask */ u64 __read_mostly shadow_user_mask; +u64 __read_mostly shadow_xs_mask; /* mutual exclusive with nx_mask and user_mask */ +u64 __read_mostly shadow_xu_mask; /* mutual exclusive with nx_mask and user_mask */ u64 __read_mostly shadow_accessed_mask; u64 __read_mostly shadow_dirty_mask; u64 __read_mostly shadow_mmio_value; @@ -216,22 +217,30 @@ bool make_spte(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, * when CR0.PG is toggled, but leveraging that to ignore the mitigation * would tie make_spte() further to vCPU/MMU state, and add complexity * just to optimize a mode that is anything but performance critical. + * + * Use ACC_USER_EXEC_MASK here assuming only Intel processors (EPT) + * are affected by the NX huge page erratum. */ - if (level > PG_LEVEL_4K && (pte_access & ACC_EXEC_MASK) && + if (level > PG_LEVEL_4K && + (pte_access & (ACC_EXEC_MASK | ACC_USER_EXEC_MASK)) && is_nx_huge_page_enabled(vcpu->kvm)) { - pte_access &= ~ACC_EXEC_MASK; + pte_access &= ~(ACC_EXEC_MASK | ACC_USER_EXEC_MASK); } if (pte_access & ACC_READ_MASK) spte |= PT_PRESENT_MASK; /* or VMX_EPT_READABLE_MASK */ - if (pte_access & ACC_EXEC_MASK) - spte |= shadow_x_mask; - else - spte |= shadow_nx_mask; - - if (pte_access & ACC_USER_MASK) - spte |= shadow_user_mask; + if (shadow_nx_mask) { + if (!(pte_access & ACC_EXEC_MASK)) + spte |= shadow_nx_mask; + if (pte_access & ACC_USER_MASK) + spte |= shadow_user_mask; + } else { + if (pte_access & ACC_EXEC_MASK) + spte |= shadow_xs_mask; + if (pte_access & ACC_USER_EXEC_MASK) + spte |= shadow_xu_mask; + } if (level > PG_LEVEL_4K) spte |= PT_PAGE_SIZE_MASK; @@ -317,11 +326,13 @@ static u64 modify_spte_protections(u64 spte, u64 set, u64 clear) static u64 make_spte_executable(u64 spte, u8 access) { u64 set, clear; - if (access & ACC_EXEC_MASK) - set = shadow_x_mask; + if (shadow_nx_mask) + set = (access & ACC_EXEC_MASK) ? 0 : shadow_nx_mask; else - set = shadow_nx_mask; - clear = set ^ (shadow_nx_mask | shadow_x_mask); + set = + (access & ACC_EXEC_MASK ? shadow_xs_mask : 0) | + (access & ACC_USER_EXEC_MASK ? 
+	clear = set ^ (shadow_nx_mask | shadow_xs_mask | shadow_xu_mask);
 
 	return modify_spte_protections(spte, set, clear);
 }
@@ -388,7 +399,7 @@ u64 make_nonleaf_spte(u64 *child_pt, bool ad_disabled)
 
 	spte |= __pa(child_pt) | shadow_present_mask | PT_WRITABLE_MASK |
 		PT_PRESENT_MASK /* or VMX_EPT_READABLE_MASK */ |
-		shadow_user_mask | shadow_x_mask | shadow_me_value;
+		shadow_user_mask | shadow_xs_mask | shadow_xu_mask | shadow_me_value;
 
 	if (ad_disabled)
 		spte |= SPTE_TDP_AD_DISABLED;
@@ -496,7 +507,8 @@ void kvm_mmu_set_ept_masks(bool has_ad_bits)
 	shadow_accessed_mask = VMX_EPT_ACCESS_BIT;
 	shadow_dirty_mask = VMX_EPT_DIRTY_BIT;
 	shadow_nx_mask = 0ull;
-	shadow_x_mask = VMX_EPT_EXECUTABLE_MASK;
+	shadow_xs_mask = VMX_EPT_EXECUTABLE_MASK;
+	shadow_xu_mask = VMX_EPT_EXECUTABLE_MASK;
 	shadow_present_mask = VMX_EPT_SUPPRESS_VE_BIT;
 
 	shadow_acc_track_mask = VMX_EPT_RWX_MASK;
@@ -547,7 +559,8 @@ void kvm_mmu_reset_all_pte_masks(void)
 	shadow_accessed_mask = PT_ACCESSED_MASK;
 	shadow_dirty_mask = PT_DIRTY_MASK;
 	shadow_nx_mask = PT64_NX_MASK;
-	shadow_x_mask = 0;
+	shadow_xs_mask = 0;
+	shadow_xu_mask = 0;
 	shadow_present_mask = PT_PRESENT_MASK;
 
 	shadow_acc_track_mask = 0;
diff --git a/arch/x86/kvm/mmu/spte.h b/arch/x86/kvm/mmu/spte.h
index 0c305f2f4ba0..7323ff19056b 100644
--- a/arch/x86/kvm/mmu/spte.h
+++ b/arch/x86/kvm/mmu/spte.h
@@ -54,7 +54,8 @@ static_assert(SPTE_TDP_AD_ENABLED == 0);
 
 #define ACC_READ_MASK PT_PRESENT_MASK
 #define ACC_WRITE_MASK PT_WRITABLE_MASK
-#define ACC_USER_MASK PT_USER_MASK
+#define ACC_USER_MASK PT_USER_MASK /* non-EPT */
+#define ACC_USER_EXEC_MASK ACC_USER_MASK /* EPT only */
 #define ACC_EXEC_MASK 8
 #define ACC_ALL (ACC_EXEC_MASK | ACC_WRITE_MASK | ACC_USER_MASK | ACC_READ_MASK)
 
@@ -184,8 +185,9 @@ extern bool __read_mostly kvm_ad_enabled;
 extern u64 __read_mostly shadow_host_writable_mask;
 extern u64 __read_mostly shadow_mmu_writable_mask;
 extern u64 __read_mostly shadow_nx_mask;
-extern u64 __read_mostly shadow_x_mask; /* mutual exclusive with nx_mask */
 extern u64 __read_mostly shadow_user_mask;
+extern u64 __read_mostly shadow_xs_mask; /* mutually exclusive with nx_mask and user_mask */
+extern u64 __read_mostly shadow_xu_mask; /* mutually exclusive with nx_mask and user_mask */
 extern u64 __read_mostly shadow_accessed_mask;
 extern u64 __read_mostly shadow_dirty_mask;
 extern u64 __read_mostly shadow_mmio_value;
@@ -352,7 +354,7 @@ static inline bool is_last_spte(u64 pte, int level)
 
 static inline bool is_executable_pte(u64 spte)
 {
-	return (spte & (shadow_x_mask | shadow_nx_mask)) == shadow_x_mask;
+	return (spte & (shadow_xs_mask | shadow_xu_mask | shadow_nx_mask)) != shadow_nx_mask;
 }
 
 static inline kvm_pfn_t spte_to_pfn(u64 pte)
-- 
2.52.0