Add support for splitting S-EPT hugepages in preparation for converting
a subset of a hugepage to be shared, as KVM must precisely zap/remove
S-EPT entries to avoid clobbering guest memory (the lifetime of guest
private memory is tied to the S-EPT). I.e. KVM needs to first split a
hugepage so that only the to-be-converted small pages can be zapped.

To avoid unnecessary work, e.g. if only the tail/end page of a massive
region isn't aligned to the conversion, explicitly detect unaligned head
and tail pages relative to the max page size supported by KVM, i.e.
head/tail pages that will undergo partial conversion.

To support splitting an S-EPT hugepage without a vCPU, add a per-VM PAMT
cache, along with a mutex to guard the cache. Using a mutex, e.g. versus
a spinlock, is important as it allows KVM to allocate memory *without*
dropping the lock, i.e. so that the PAMT cache can be topped up as
needed without needing to juggle arch.tdp_mmu_external_cache_lock.

Signed-off-by: Sean Christopherson
---
Note: standalone sketches of the head/tail detection and of the
mutex-guarded cache top-up follow the patch, for illustration only.

 arch/x86/include/asm/kvm_host.h |  8 +++-
 arch/x86/kvm/mmu/mmu.c          |  2 +-
 arch/x86/kvm/mmu/tdp_mmu.c      | 72 +++++++++++++++++++++++++++++++--
 arch/x86/kvm/vmx/tdx.c          | 34 +++++++++++++---
 arch/x86/kvm/vmx/tdx.h          |  2 +
 5 files changed, 107 insertions(+), 11 deletions(-)

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 385f1cf32d70..54dea90a53dc 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -1563,6 +1563,12 @@ struct kvm_arch {
 	 * the code to do so.
 	 */
 	spinlock_t tdp_mmu_pages_lock;
+
+	/*
+	 * Protect the per-VM cache of pre-allocated pages used to populate the
+	 * Dynamic PAMT when splitting S-EPT huge pages without a vCPU.
+	 */
+	struct mutex tdp_mmu_external_cache_lock;
 #endif /* CONFIG_X86_64 */
 
 	/*
@@ -1861,7 +1867,7 @@ struct kvm_x86_ops {
 			       u64 new_spte, enum pg_level level);
 	void (*reclaim_external_sp)(struct kvm *kvm, gfn_t gfn,
 				    struct kvm_mmu_page *sp);
-	int (*topup_external_cache)(struct kvm_vcpu *vcpu, int min);
+	int (*topup_external_cache)(struct kvm *kvm, struct kvm_vcpu *vcpu, int min);
 
 	bool (*has_wbinvd_exit)(void);
 
diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index c2765bfc8492..62bf6bec2df2 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -606,7 +606,7 @@ static int mmu_topup_memory_caches(struct kvm_vcpu *vcpu, bool maybe_indirect)
 		if (r)
 			return r;
 
-		r = kvm_x86_call(topup_external_cache)(vcpu, PT64_ROOT_MAX_LEVEL);
+		r = kvm_x86_call(topup_external_cache)(vcpu->kvm, vcpu, PT64_ROOT_MAX_LEVEL);
 		if (r)
 			return r;
 	}
diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c
index c46ebdacdb50..3181406c5e0b 100644
--- a/arch/x86/kvm/mmu/tdp_mmu.c
+++ b/arch/x86/kvm/mmu/tdp_mmu.c
@@ -1447,7 +1447,8 @@ bool kvm_tdp_mmu_wrprot_slot(struct kvm *kvm,
 	return spte_set;
 }
 
-static struct kvm_mmu_page *tdp_mmu_alloc_sp_for_split(struct tdp_iter *iter)
+static struct kvm_mmu_page *tdp_mmu_alloc_sp_for_split(struct kvm *kvm,
+						       struct tdp_iter *iter)
 {
 	struct kvm_mmu_page *sp;
 
@@ -1464,7 +1465,7 @@ static struct kvm_mmu_page *tdp_mmu_alloc_sp_for_split(struct tdp_iter *iter)
 		if (!sp->external_spt)
 			goto err_external_spt;
 
-		if (kvm_x86_call(topup_external_cache)(kvm_get_running_vcpu(), 1))
+		if (kvm_x86_call(topup_external_cache)(kvm, kvm_get_running_vcpu(), 1))
 			goto err_external_split;
 	}
 
@@ -1556,7 +1557,7 @@ static int tdp_mmu_split_huge_pages_root(struct kvm *kvm,
 		else
 			write_unlock(&kvm->mmu_lock);
 
-		sp = tdp_mmu_alloc_sp_for_split(&iter);
+		sp = tdp_mmu_alloc_sp_for_split(kvm, &iter);
 
 		if (shared)
 			read_lock(&kvm->mmu_lock);
@@ -1631,9 +1632,74 @@ int kvm_tdp_mmu_split_huge_pages(struct kvm_vcpu *vcpu, gfn_t start, gfn_t end,
 EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_tdp_mmu_split_huge_pages);
 
 #ifdef CONFIG_HAVE_KVM_ARCH_GMEM_CONVERT
+static int __tdp_mmu_split_mirror_huge_pages(struct kvm *kvm,
+					     struct kvm_mmu_page *root,
+					     gfn_t gfn, int target_level)
+{
+	gfn_t end = gfn + KVM_PAGES_PER_HPAGE(target_level + 1);
+
+	return tdp_mmu_split_huge_pages_root(kvm, root, gfn, end, target_level, false);
+}
+
+static int tdp_mmu_split_mirror_huge_pages(struct kvm *kvm,
+					   struct kvm_mmu_page *root,
+					   gfn_t start, gfn_t end, int level)
+{
+
+	gfn_t head = gfn_round_for_level(start, level + 1);
+	gfn_t tail = gfn_round_for_level(end, level + 1);
+	int r;
+
+	if (head != start) {
+		r = __tdp_mmu_split_mirror_huge_pages(kvm, root, head, level);
+		if (r)
+			return r;
+	}
+
+	if (tail != end && (head != tail || head == start)) {
+		r = __tdp_mmu_split_mirror_huge_pages(kvm, root, tail, level);
+		if (r)
+			return r;
+	}
+
+	return 0;
+}
+
 int kvm_arch_gmem_convert(struct kvm *kvm, gfn_t start, gfn_t end, bool to_private)
 {
+	struct kvm_mmu_page *root;
+	int r;
+
+	/*
+	 * When converting from private=>shared, KVM must first split potential
+	 * hugepages, as KVM mustn't overzap private mappings for TDX guests,
+	 * i.e. must zap _exactly_ [start, end). Split potential hugepages at
+	 * the head and tail of the to-be-converted (and thus zapped) range so
+	 * that KVM doesn't overzap due to dropping a hugepage that doesn't
+	 * fall wholly inside the range.
+	 */
+	if (to_private || !kvm_has_mirrored_tdp(kvm))
+		return 0;
+
+	/*
+	 * Acquire the external cache lock, a.k.a. the Dynamic PAMT lock, to
+	 * protect the per-VM cache of pre-allocated pages used to populate the
+	 * Dynamic PAMT when splitting S-EPT huge pages.
+	 */
+	guard(mutex)(&kvm->arch.tdp_mmu_external_cache_lock);
+
+	guard(write_lock)(&kvm->mmu_lock);
+
+	/*
+	 * TODO: Also split from PG_LEVEL_1G => PG_LEVEL_2M when KVM supports
+	 * 1GiB S-EPT pages.
+	 */
+	__for_each_tdp_mmu_root_yield_safe(kvm, root, 0, KVM_MIRROR_ROOTS) {
+		r = tdp_mmu_split_mirror_huge_pages(kvm, root, start, end, PG_LEVEL_4K);
+		if (r)
+			return r;
+	}
 	return 0;
 }
 #endif /* CONFIG_HAVE_KVM_ARCH_GMEM_CONVERT */
diff --git a/arch/x86/kvm/vmx/tdx.c b/arch/x86/kvm/vmx/tdx.c
index 098954f5e07c..774d395e5c73 100644
--- a/arch/x86/kvm/vmx/tdx.c
+++ b/arch/x86/kvm/vmx/tdx.c
@@ -607,6 +607,8 @@ void tdx_vm_destroy(struct kvm *kvm)
 {
 	struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
 
+	tdx_free_pamt_cache(&kvm_tdx->pamt_cache);
+
 	tdx_reclaim_td_control_pages(kvm);
 
 	kvm_tdx->state = TD_STATE_UNINITIALIZED;
@@ -629,6 +631,8 @@ int tdx_vm_init(struct kvm *kvm)
 {
 	struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
 
+	tdx_init_pamt_cache(&kvm_tdx->pamt_cache);
+
 	kvm->arch.has_protected_state = true;
 	/*
 	 * TDX Module doesn't allow the hypervisor to modify the EOI-bitmap,
@@ -1621,15 +1625,32 @@ void tdx_load_mmu_pgd(struct kvm_vcpu *vcpu, hpa_t root_hpa, int pgd_level)
 	td_vmcs_write64(to_tdx(vcpu), SHARED_EPT_POINTER, root_hpa);
 }
 
-static int tdx_topup_external_pamt_cache(struct kvm_vcpu *vcpu, int min)
+static struct tdx_pamt_cache *tdx_get_pamt_cache(struct kvm *kvm,
+						 struct kvm_vcpu *vcpu)
 {
+	if (KVM_BUG_ON(vcpu && vcpu->kvm != kvm, kvm))
+		return NULL;
+
+	if (vcpu)
+		return &to_tdx(vcpu)->pamt_cache;
+
+	lockdep_assert_held(&kvm->arch.tdp_mmu_external_cache_lock);
+	return &to_kvm_tdx(kvm)->pamt_cache;
+}
+
+static int tdx_topup_external_pamt_cache(struct kvm *kvm,
+					 struct kvm_vcpu *vcpu, int min)
+{
+	struct tdx_pamt_cache *pamt_cache;
+
 	if (!tdx_supports_dynamic_pamt(tdx_sysinfo))
 		return 0;
 
-	if (WARN_ON_ONCE(!vcpu))
+	pamt_cache = tdx_get_pamt_cache(kvm, vcpu);
+	if (!pamt_cache)
 		return -EIO;
 
-	return tdx_topup_pamt_cache(&to_tdx(vcpu)->pamt_cache, min);
+	return tdx_topup_pamt_cache(pamt_cache, min);
 }
 
 static int tdx_mem_page_add(struct kvm *kvm, gfn_t gfn, enum pg_level level,
@@ -1792,8 +1813,8 @@ static struct page *tdx_spte_to_external_spt(struct kvm *kvm, gfn_t gfn,
 static int tdx_sept_split_private_spte(struct kvm *kvm, gfn_t gfn, u64 old_spte,
 				       u64 new_spte, enum pg_level level)
 {
-	struct kvm_vcpu *vcpu = kvm_get_running_vcpu();
 	struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
+	struct tdx_pamt_cache *pamt_cache;
 	gpa_t gpa = gfn_to_gpa(gfn);
 	u64 err, entry, level_state;
 	struct page *external_spt;
@@ -1804,7 +1825,8 @@ static int tdx_sept_split_private_spte(struct kvm *kvm, gfn_t gfn, u64 old_spte,
 	if (!external_spt)
 		return -EIO;
 
-	if (KVM_BUG_ON(!vcpu || vcpu->kvm != kvm, kvm))
+	pamt_cache = tdx_get_pamt_cache(kvm, kvm_get_running_vcpu());
+	if (!pamt_cache)
 		return -EIO;
 
 	err = tdh_do_no_vcpus(tdh_mem_range_block, kvm, &kvm_tdx->td, gpa,
@@ -1816,7 +1838,7 @@ static int tdx_sept_split_private_spte(struct kvm *kvm, gfn_t gfn, u64 old_spte,
 
 	err = tdh_do_no_vcpus(tdh_mem_page_demote, kvm, &kvm_tdx->td, gpa, level,
 			      spte_to_pfn(old_spte), external_spt,
-			      &to_tdx(vcpu)->pamt_cache, &entry, &level_state);
+			      pamt_cache, &entry, &level_state);
 	if (TDX_BUG_ON_2(err, TDH_MEM_PAGE_DEMOTE, entry, level_state, kvm))
 		return -EIO;
 
diff --git a/arch/x86/kvm/vmx/tdx.h b/arch/x86/kvm/vmx/tdx.h
index f444fc84d93b..57d7e70ffe7d 100644
--- a/arch/x86/kvm/vmx/tdx.h
+++ b/arch/x86/kvm/vmx/tdx.h
@@ -48,6 +48,8 @@ struct kvm_tdx {
 	 * Set/unset is protected with kvm->mmu_lock.
 	 */
 	bool wait_for_sept_zap;
+
+	struct tdx_pamt_cache pamt_cache;
 };
 
 /* TDX module vCPU states */
-- 
2.53.0.rc1.217.geba53bf80e-goog
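
For illustration only, a standalone userspace sketch of the head/tail
detection that tdp_mmu_split_mirror_huge_pages() performs. The
512-pages-per-2MiB-hugepage constant, the helper names, and the gfn
values in main() are assumptions invented for the demo; only the
round-down-and-compare logic mirrors the patch.

/* Demo (not kernel code): report which ends of a to-be-converted
 * [start, end) gfn range are not hugepage aligned and would therefore
 * require splitting the straddling hugepage before zapping exactly
 * [start, end). Assumes 4KiB base pages and 2MiB hugepages.
 */
#include <stdint.h>
#include <stdio.h>

#define PAGES_PER_HUGEPAGE 512ULL	/* assumption: 2MiB / 4KiB */

static uint64_t round_down_to_hugepage(uint64_t gfn)
{
	return gfn & ~(PAGES_PER_HUGEPAGE - 1);
}

static void check_range(uint64_t start, uint64_t end)
{
	uint64_t head = round_down_to_hugepage(start);
	uint64_t tail = round_down_to_hugepage(end);

	printf("range [0x%llx, 0x%llx):\n",
	       (unsigned long long)start, (unsigned long long)end);

	/* Unaligned start => the hugepage containing it straddles the range. */
	if (head != start)
		printf("  split hugepage at gfn 0x%llx (unaligned head)\n",
		       (unsigned long long)head);

	/*
	 * Unaligned end => split the straddling hugepage, unless it is the
	 * same hugepage that was already split for the head.
	 */
	if (tail != end && (head != tail || head == start))
		printf("  split hugepage at gfn 0x%llx (unaligned tail)\n",
		       (unsigned long long)tail);
}

int main(void)
{
	check_range(0x280, 0x800);	/* unaligned head only */
	check_range(0x400, 0x900);	/* unaligned tail only */
	check_range(0x480, 0x500);	/* head and tail in the same hugepage */
	return 0;
}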
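
Also for illustration only, a userspace sketch of the locking rationale:
with a mutex (a sleepable lock) the cache can be topped up with the lock
held, because the allocation is allowed to sleep under it, whereas a
spinlock would force the allocation to happen with the lock dropped and
the cache re-checked afterwards. The structure and function names below
are invented for the demo and are not the kernel or TDX API.

/* Demo (not kernel code): top up a small cache of pre-allocated buffers
 * while holding a sleepable lock, mimicking the per-VM PAMT cache usage.
 * Build with: cc -pthread demo.c
 */
#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

#define CACHE_CAPACITY 8

struct demo_cache {
	pthread_mutex_t lock;		/* sleepable, like the per-VM mutex */
	void *pages[CACHE_CAPACITY];
	int nr;
};

/* Ensure at least @min entries are cached, allocating under the lock. */
static int topup_cache(struct demo_cache *c, int min)
{
	int r = 0;

	if (min > CACHE_CAPACITY)
		return -1;

	pthread_mutex_lock(&c->lock);
	while (c->nr < min) {
		void *page = malloc(4096);	/* stand-in for a page allocation */

		if (!page) {
			r = -1;
			break;
		}
		c->pages[c->nr++] = page;
	}
	pthread_mutex_unlock(&c->lock);
	return r;
}

int main(void)
{
	struct demo_cache cache = { .lock = PTHREAD_MUTEX_INITIALIZER };

	if (!topup_cache(&cache, 4))
		printf("cache topped up to %d entries\n", cache.nr);

	while (cache.nr)
		free(cache.pages[--cache.nr]);
	return 0;
}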