Now that kvm_mmu_memory_cache supports custom page allocators, wire up the S-EPT cache to use tdx_{alloc,free}_control_page() (arguably S-EPT pages aren't "control" pages, but they're not guest pages either). Using the TDX APIs will make S-EPT pages naturally play nice with Dynamic PAMT, by virtue of adding/removing PAMT entries when S-EPT pages are allocated and freed, as opposed to when they are added/removed from the S-EPT tree. Inserting PAMT entries on allocation does mean KVM will create unnecessary PAMT entries, e.g. once a vCPU stops faulting in memory, the remaining pages in the MMU cache will go unused. But in practice, odds are very good the containing 2MiB page will have other in-use S-EPT pages, i.e. will create PAMT entries anyways. And _if_ creating PAMT entries on allocation is problematic for memory consumption, that can be resolved by tweaking KVM's cache size. Suggested-by: Kai Huang Signed-off-by: Sean Christopherson --- arch/x86/include/asm/kvm-x86-ops.h | 2 ++ arch/x86/include/asm/kvm_host.h | 18 +++++++++--------- arch/x86/kvm/mmu/mmu.c | 6 ++++-- arch/x86/kvm/mmu/mmu_internal.h | 11 ----------- arch/x86/kvm/mmu/tdp_mmu.c | 5 +++-- arch/x86/kvm/vmx/tdx.c | 13 ++++++++++++- 6 files changed, 30 insertions(+), 25 deletions(-) diff --git a/arch/x86/include/asm/kvm-x86-ops.h b/arch/x86/include/asm/kvm-x86-ops.h index c17cedc485c9..17dddada69fc 100644 --- a/arch/x86/include/asm/kvm-x86-ops.h +++ b/arch/x86/include/asm/kvm-x86-ops.h @@ -94,6 +94,8 @@ KVM_X86_OP_OPTIONAL_RET0(set_tss_addr) KVM_X86_OP_OPTIONAL_RET0(set_identity_map_addr) KVM_X86_OP_OPTIONAL_RET0(get_mt_mask) KVM_X86_OP(load_mmu_pgd) +KVM_X86_OP_OPTIONAL(alloc_external_sp) +KVM_X86_OP_OPTIONAL(free_external_sp) KVM_X86_OP_OPTIONAL_RET0(set_external_spte) KVM_X86_OP_OPTIONAL(remove_external_spte) KVM_X86_OP_OPTIONAL(reclaim_external_sp) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index b35a07ed11fb..6e84dbc89e79 --- 
a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -867,10 +867,7 @@ struct kvm_vcpu_arch { struct kvm_mmu_memory_cache mmu_shadow_page_cache; struct kvm_mmu_memory_cache mmu_shadowed_info_cache; struct kvm_mmu_memory_cache mmu_page_header_cache; - /* - * This cache is to allocate external page table. E.g. private EPT used - * by the TDX module. - */ + /* Used to allocate S-EPT pages (gifted to the TDX-Module). */ struct kvm_mmu_memory_cache mmu_external_spt_cache; /* @@ -1853,18 +1850,21 @@ struct kvm_x86_ops { void (*load_mmu_pgd)(struct kvm_vcpu *vcpu, hpa_t root_hpa, int root_level); - /* Update the external page table from spte getting set. */ + /* + * Callbacks to allocate and free external page tables, a.k.a. S-EPT, + * and to propagate changes in mirror page tables to the external page + * tables. + */ + unsigned long (*alloc_external_sp)(gfp_t gfp); + void (*free_external_sp)(unsigned long addr); int (*set_external_spte)(struct kvm *kvm, gfn_t gfn, enum pg_level level, u64 mirror_spte); - - /* Update external page tables for page table about to be freed. */ void (*reclaim_external_sp)(struct kvm *kvm, gfn_t gfn, struct kvm_mmu_page *sp); - - /* Update external page table from spte getting removed, and flush TLB. 
*/ void (*remove_external_spte)(struct kvm *kvm, gfn_t gfn, enum pg_level level, u64 mirror_spte); + bool (*has_wbinvd_exit)(void); u64 (*get_l2_tsc_offset)(struct kvm_vcpu *vcpu); diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 3911ac9bddfd..9b5a6861e2a4 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -6690,11 +6690,13 @@ int kvm_mmu_create(struct kvm_vcpu *vcpu) vcpu->arch.mmu_page_header_cache.kmem_cache = mmu_page_header_cache; vcpu->arch.mmu_page_header_cache.gfp_zero = __GFP_ZERO; - vcpu->arch.mmu_shadow_page_cache.init_value = - SHADOW_NONPRESENT_VALUE; + vcpu->arch.mmu_shadow_page_cache.init_value = SHADOW_NONPRESENT_VALUE; if (!vcpu->arch.mmu_shadow_page_cache.init_value) vcpu->arch.mmu_shadow_page_cache.gfp_zero = __GFP_ZERO; + vcpu->arch.mmu_external_spt_cache.page_get = kvm_x86_ops.alloc_external_sp; + vcpu->arch.mmu_external_spt_cache.page_free = kvm_x86_ops.free_external_sp; + vcpu->arch.mmu = &vcpu->arch.root_mmu; vcpu->arch.walk_mmu = &vcpu->arch.root_mmu; diff --git a/arch/x86/kvm/mmu/mmu_internal.h b/arch/x86/kvm/mmu/mmu_internal.h index 73cdcbccc89e..6bb97f660793 100644 --- a/arch/x86/kvm/mmu/mmu_internal.h +++ b/arch/x86/kvm/mmu/mmu_internal.h @@ -157,17 +157,6 @@ static inline bool is_mirror_sp(const struct kvm_mmu_page *sp) return sp->role.is_mirror; } -static inline void kvm_mmu_alloc_external_spt(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) -{ - /* - * external_spt is allocated for TDX module to hold private EPT mappings, - * TDX module will initialize the page by itself. - * Therefore, KVM does not need to initialize or access external_spt. - * KVM only interacts with sp->spt for private EPT operations. 
- */ - sp->external_spt = kvm_mmu_memory_cache_alloc(&vcpu->arch.mmu_external_spt_cache); -} - static inline gfn_t kvm_gfn_root_bits(const struct kvm *kvm, const struct kvm_mmu_page *root) { /* diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c index 18764dbc97ea..01e3e4f4baa5 100644 --- a/arch/x86/kvm/mmu/tdp_mmu.c +++ b/arch/x86/kvm/mmu/tdp_mmu.c @@ -55,7 +55,8 @@ void kvm_mmu_uninit_tdp_mmu(struct kvm *kvm) static void tdp_mmu_free_sp(struct kvm_mmu_page *sp) { - free_page((unsigned long)sp->external_spt); + if (sp->external_spt) + kvm_x86_call(free_external_sp)((unsigned long)sp->external_spt); free_page((unsigned long)sp->spt); kmem_cache_free(mmu_page_header_cache, sp); } @@ -1246,7 +1247,7 @@ int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault) sp = tdp_mmu_alloc_sp(vcpu); tdp_mmu_init_child_sp(sp, &iter); if (is_mirror_sp(sp)) - kvm_mmu_alloc_external_spt(vcpu, sp); + sp->external_spt = kvm_mmu_memory_cache_alloc(&vcpu->arch.mmu_external_spt_cache); sp->nx_huge_page_disallowed = fault->huge_page_disallowed; diff --git a/arch/x86/kvm/vmx/tdx.c b/arch/x86/kvm/vmx/tdx.c index 323aae4300a1..0946eba2de23 100644 --- a/arch/x86/kvm/vmx/tdx.c +++ b/arch/x86/kvm/vmx/tdx.c @@ -1790,7 +1790,9 @@ static void tdx_sept_reclaim_private_sp(struct kvm *kvm, gfn_t gfn, * TD's hkid is freed, when the TD is being torn down. * * If the S-EPT PTE can't be removed for any reason, intentionally leak - * the page to prevent the kernel from accessing the encrypted page. + * the page to prevent the kernel from accessing the encrypted page, + * and if Dynamic PAMT is enabled, to avoid inducing a failure on + * removal of the still-used PAMT entry. 
*/ if (KVM_BUG_ON(is_hkid_assigned(to_kvm_tdx(kvm)), kvm) || tdx_reclaim_page(virt_to_page(sp->external_spt))) @@ -3600,6 +3602,15 @@ void __init tdx_hardware_setup(void) */ vt_x86_ops.vm_size = max_t(unsigned int, vt_x86_ops.vm_size, sizeof(struct kvm_tdx)); + /* + * TDX uses the external_spt cache to allocate S-EPT page table pages, + * which (a) don't need to be initialized by KVM as the TDX-Module will + * initialize the page (using the guest's encryption key), and (b) need + * to use a custom allocator to be compatible with Dynamic PAMT. + */ + vt_x86_ops.alloc_external_sp = tdx_alloc_control_page; + vt_x86_ops.free_external_sp = tdx_free_control_page; + vt_x86_ops.set_external_spte = tdx_sept_set_private_spte; vt_x86_ops.reclaim_external_sp = tdx_sept_reclaim_private_sp; vt_x86_ops.remove_external_spte = tdx_sept_remove_private_spte; -- 2.53.0.rc1.217.geba53bf80e-goog