Drop TDX's sanity check that an S-EPT mapping isn't zapped between creating
said mapping and doing TDH.MEM.PAGE.ADD, as the check is simultaneously
superfluous and incomplete.  Per commit 2608f1057601 ("KVM: x86/tdp_mmu: Add
a helper function to walk down the TDP MMU"), the justification for
introducing kvm_tdp_mmu_gpa_is_mapped() was to check that the target gfn was
pre-populated, with a link that points to this snippet:

 : > One small question:
 : >
 : > What if the memory region passed to KVM_TDX_INIT_MEM_REGION hasn't been pre-
 : > populated?  If we want to make KVM_TDX_INIT_MEM_REGION work with these regions,
 : > then we still need to do the real map.  Or we can make KVM_TDX_INIT_MEM_REGION
 : > return error when it finds the region hasn't been pre-populated?
 :
 : Return an error.  I don't love the idea of bleeding so many TDX details into
 : userspace, but I'm pretty sure that ship sailed a long, long time ago.

But that justification makes little sense for the final code, as simply doing
TDH.MEM.PAGE.ADD without a paranoid sanity check will return an error if the
S-EPT mapping is invalid (as evidenced by the check being guarded with
CONFIG_KVM_PROVE_MMU=y).  The sanity check is also incomplete in the sense
that mmu_lock is dropped between the check and TDH.MEM.PAGE.ADD, i.e. it will
only detect KVM bugs that zap SPTEs in a very specific window.

Removing the sanity check will allow removing kvm_tdp_mmu_gpa_is_mapped(),
which has no business being exposed to vendor code.

Signed-off-by: Sean Christopherson
---
 arch/x86/kvm/vmx/tdx.c | 14 --------------
 1 file changed, 14 deletions(-)

diff --git a/arch/x86/kvm/vmx/tdx.c b/arch/x86/kvm/vmx/tdx.c
index 66744f5768c8..a6155f76cc6a 100644
--- a/arch/x86/kvm/vmx/tdx.c
+++ b/arch/x86/kvm/vmx/tdx.c
@@ -3175,20 +3175,6 @@ static int tdx_gmem_post_populate(struct kvm *kvm, gfn_t gfn, kvm_pfn_t pfn,
 	if (ret < 0)
 		goto out;
 
-	/*
-	 * The private mem cannot be zapped after kvm_tdp_map_page()
-	 * because all paths are covered by slots_lock and the
-	 * filemap invalidate lock.  Check that they are indeed enough.
-	 */
-	if (IS_ENABLED(CONFIG_KVM_PROVE_MMU)) {
-		scoped_guard(read_lock, &kvm->mmu_lock) {
-			if (KVM_BUG_ON(!kvm_tdp_mmu_gpa_is_mapped(vcpu, gpa), kvm)) {
-				ret = -EIO;
-				goto out;
-			}
-		}
-	}
-
 	ret = 0;
 	err = tdh_mem_page_add(&kvm_tdx->td, gpa, pfn_to_page(pfn), src_page,
 			       &entry, &level_state);
-- 
2.51.0.268.g9569e192d0-goog

Add and use a new API for mapping a private pfn from guest_memfd into the
TDP MMU from TDX's post-populate hook instead of partially open-coding the
functionality into the TDX code.  Sharing code with the pre-fault path
sounded good on paper, but it's fatally flawed as simulating a fault loses
the pfn, and calling back into gmem to re-retrieve the pfn creates locking
problems, e.g. kvm_gmem_populate() already holds the gmem invalidation lock.

Providing a dedicated API will also allow removing several MMU exports that
ideally would not be exposed outside of the MMU, let alone to vendor code.
On that topic, opportunistically drop the kvm_mmu_load() export.

Leave kvm_tdp_mmu_gpa_is_mapped() alone for now; the entire commit that
added kvm_tdp_mmu_gpa_is_mapped() will be removed in the near future.
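To make the delta concrete, the mapping step in tdx_gmem_post_populate()
changes as follows (condensed from the tdx.c hunk below, error handling
elided; an illustrative sketch, not the full context):

    /* Before: fake a page fault at @gpa.  The pfn that kvm_gmem_populate()
     * already handed to the hook is discarded, and must be re-looked-up
     * through gmem, whose invalidation lock is held at this point. */
    ret = kvm_tdp_map_page(vcpu, gpa, error_code, &level);

    /* After: install the already-known pfn for @gfn directly. */
    ret = kvm_tdp_mmu_map_private_pfn(arg->vcpu, gfn, pfn);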
Cc: Michael Roth
Cc: Yan Zhao
Cc: Ira Weiny
Cc: Vishal Annapurve
Cc: Rick Edgecombe
Link: https://lore.kernel.org/all/20250709232103.zwmufocd3l7sqk7y@amd.com
Signed-off-by: Sean Christopherson
---
 arch/x86/kvm/mmu.h     |  1 +
 arch/x86/kvm/mmu/mmu.c | 60 +++++++++++++++++++++++++++++++++++++++++-
 arch/x86/kvm/vmx/tdx.c | 10 +++----
 3 files changed, 63 insertions(+), 8 deletions(-)

diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h
index b4b6860ab971..697b90a97f43 100644
--- a/arch/x86/kvm/mmu.h
+++ b/arch/x86/kvm/mmu.h
@@ -259,6 +259,7 @@ extern bool tdp_mmu_enabled;
 
 bool kvm_tdp_mmu_gpa_is_mapped(struct kvm_vcpu *vcpu, u64 gpa);
 int kvm_tdp_map_page(struct kvm_vcpu *vcpu, gpa_t gpa, u64 error_code, u8 *level);
+int kvm_tdp_mmu_map_private_pfn(struct kvm_vcpu *vcpu, gfn_t gfn, kvm_pfn_t pfn);
 
 static inline bool kvm_memslots_have_rmaps(struct kvm *kvm)
 {
diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index 6e838cb6c9e1..d3625e00baf9 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -4990,6 +4990,65 @@ long kvm_arch_vcpu_pre_fault_memory(struct kvm_vcpu *vcpu,
 	return min(range->size, end - range->gpa);
 }
 
+int kvm_tdp_mmu_map_private_pfn(struct kvm_vcpu *vcpu, gfn_t gfn, kvm_pfn_t pfn)
+{
+	struct kvm_page_fault fault = {
+		.addr = gfn_to_gpa(gfn),
+		.error_code = PFERR_GUEST_FINAL_MASK | PFERR_PRIVATE_ACCESS,
+		.prefetch = true,
+		.is_tdp = true,
+		.nx_huge_page_workaround_enabled = is_nx_huge_page_enabled(vcpu->kvm),
+
+		.max_level = KVM_MAX_HUGEPAGE_LEVEL,
+		.req_level = PG_LEVEL_4K,
+		.goal_level = PG_LEVEL_4K,
+		.is_private = true,
+
+		.gfn = gfn,
+		.slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn),
+		.pfn = pfn,
+		.map_writable = true,
+	};
+	struct kvm *kvm = vcpu->kvm;
+	int r;
+
+	lockdep_assert_held(&kvm->slots_lock);
+
+	if (KVM_BUG_ON(!tdp_mmu_enabled, kvm))
+		return -EIO;
+
+	if (kvm_gfn_is_write_tracked(kvm, fault.slot, fault.gfn))
+		return -EPERM;
+
+	r = kvm_mmu_reload(vcpu);
+	if (r)
+		return r;
+
+	r = mmu_topup_memory_caches(vcpu, false);
+	if (r)
+		return r;
+
+	do {
+		if (signal_pending(current))
+			return -EINTR;
+
+		if (kvm_test_request(KVM_REQ_VM_DEAD, vcpu))
+			return -EIO;
+
+		cond_resched();
+
+		guard(read_lock)(&kvm->mmu_lock);
+
+		r = kvm_tdp_mmu_map(vcpu, &fault);
+	} while (r == RET_PF_RETRY);
+
+	if (r != RET_PF_FIXED)
+		return -EIO;
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(kvm_tdp_mmu_map_private_pfn);
+
 static void nonpaging_init_context(struct kvm_mmu *context)
 {
 	context->page_fault = nonpaging_page_fault;
@@ -5973,7 +6032,6 @@ int kvm_mmu_load(struct kvm_vcpu *vcpu)
 out:
 	return r;
 }
-EXPORT_SYMBOL_GPL(kvm_mmu_load);
 
 void kvm_mmu_unload(struct kvm_vcpu *vcpu)
 {
diff --git a/arch/x86/kvm/vmx/tdx.c b/arch/x86/kvm/vmx/tdx.c
index a6155f76cc6a..1724d82c8512 100644
--- a/arch/x86/kvm/vmx/tdx.c
+++ b/arch/x86/kvm/vmx/tdx.c
@@ -3151,15 +3151,12 @@ struct tdx_gmem_post_populate_arg {
 static int tdx_gmem_post_populate(struct kvm *kvm, gfn_t gfn, kvm_pfn_t pfn,
 				  void __user *src, int order, void *_arg)
 {
-	u64 error_code = PFERR_GUEST_FINAL_MASK | PFERR_PRIVATE_ACCESS;
-	struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
 	struct tdx_gmem_post_populate_arg *arg = _arg;
-	struct kvm_vcpu *vcpu = arg->vcpu;
+	struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
+	u64 err, entry, level_state;
 	gpa_t gpa = gfn_to_gpa(gfn);
-	u8 level = PG_LEVEL_4K;
 	struct page *src_page;
 	int ret, i;
-	u64 err, entry, level_state;
 
 	/*
 	 * Get the source page if it has been faulted in.  Return failure if the
@@ -3171,7 +3168,7 @@ static int tdx_gmem_post_populate(struct kvm *kvm, gfn_t gfn, kvm_pfn_t pfn,
 	if (ret != 1)
 		return -ENOMEM;
 
-	ret = kvm_tdp_map_page(vcpu, gpa, error_code, &level);
+	ret = kvm_tdp_mmu_map_private_pfn(arg->vcpu, gfn, pfn);
 	if (ret < 0)
 		goto out;
 
@@ -3234,7 +3231,6 @@ static int tdx_vcpu_init_mem_region(struct kvm_vcpu *vcpu, struct kvm_tdx_cmd *c
 	    !vt_is_tdx_private_gpa(kvm, region.gpa + (region.nr_pages << PAGE_SHIFT) - 1))
 		return -EINVAL;
 
-	kvm_mmu_reload(vcpu);
 	ret = 0;
 	while (region.nr_pages) {
 		if (signal_pending(current)) {
-- 
2.51.0.268.g9569e192d0-goog

Remove the helper and exports that were added to allow TDX code to reuse
kvm_tdp_map_page() for its gmem post-populate flow now that a dedicated TDP
MMU API is provided to install a mapping given a gfn+pfn pair.

This reverts commit 2608f105760115e94a03efd9f12f8fbfd1f9af4b.

Signed-off-by: Sean Christopherson
---
 arch/x86/kvm/mmu.h         |  2 --
 arch/x86/kvm/mmu/mmu.c     |  4 ++--
 arch/x86/kvm/mmu/tdp_mmu.c | 37 +++++--------------------------------
 3 files changed, 7 insertions(+), 36 deletions(-)

diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h
index 697b90a97f43..dc6b965cea4f 100644
--- a/arch/x86/kvm/mmu.h
+++ b/arch/x86/kvm/mmu.h
@@ -257,8 +257,6 @@ extern bool tdp_mmu_enabled;
 #define tdp_mmu_enabled false
 #endif
 
-bool kvm_tdp_mmu_gpa_is_mapped(struct kvm_vcpu *vcpu, u64 gpa);
-int kvm_tdp_map_page(struct kvm_vcpu *vcpu, gpa_t gpa, u64 error_code, u8 *level);
 int kvm_tdp_mmu_map_private_pfn(struct kvm_vcpu *vcpu, gfn_t gfn, kvm_pfn_t pfn);
 
 static inline bool kvm_memslots_have_rmaps(struct kvm *kvm)
diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index d3625e00baf9..f532beed9029 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -4900,7 +4900,8 @@ int kvm_tdp_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
 	return direct_page_fault(vcpu, fault);
 }
 
-int kvm_tdp_map_page(struct kvm_vcpu *vcpu, gpa_t gpa, u64 error_code, u8 *level)
+static int kvm_tdp_map_page(struct kvm_vcpu *vcpu, gpa_t gpa, u64 error_code,
+			    u8 *level)
 {
 	int r;
 
@@ -4942,7 +4943,6 @@ int kvm_tdp_map_page(struct kvm_vcpu *vcpu, gpa_t gpa, u64 error_code, u8 *level
 		return -EIO;
 	}
 }
-EXPORT_SYMBOL_GPL(kvm_tdp_map_page);
 
 long kvm_arch_vcpu_pre_fault_memory(struct kvm_vcpu *vcpu,
 				    struct kvm_pre_fault_memory *range)
diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c
index 7f3d7229b2c1..1b559a50db51 100644
--- a/arch/x86/kvm/mmu/tdp_mmu.c
+++ b/arch/x86/kvm/mmu/tdp_mmu.c
@@ -1910,13 +1910,16 @@ bool kvm_tdp_mmu_write_protect_gfn(struct kvm *kvm,
  *
  * Must be called between kvm_tdp_mmu_walk_lockless_{begin,end}.
  */
-static int __kvm_tdp_mmu_get_walk(struct kvm_vcpu *vcpu, u64 addr, u64 *sptes,
-				  struct kvm_mmu_page *root)
+int kvm_tdp_mmu_get_walk(struct kvm_vcpu *vcpu, u64 addr, u64 *sptes,
+			 int *root_level)
 {
+	struct kvm_mmu_page *root = root_to_sp(vcpu->arch.mmu->root.hpa);
 	struct tdp_iter iter;
 	gfn_t gfn = addr >> PAGE_SHIFT;
 	int leaf = -1;
 
+	*root_level = vcpu->arch.mmu->root_role.level;
+
 	for_each_tdp_pte(iter, vcpu->kvm, root, gfn, gfn + 1) {
 		leaf = iter.level;
 		sptes[leaf] = iter.old_spte;
@@ -1925,36 +1928,6 @@ static int __kvm_tdp_mmu_get_walk(struct kvm_vcpu *vcpu, u64 addr, u64 *sptes,
 	return leaf;
 }
 
-int kvm_tdp_mmu_get_walk(struct kvm_vcpu *vcpu, u64 addr, u64 *sptes,
-			 int *root_level)
-{
-	struct kvm_mmu_page *root = root_to_sp(vcpu->arch.mmu->root.hpa);
-
-	*root_level = vcpu->arch.mmu->root_role.level;
-
-	return __kvm_tdp_mmu_get_walk(vcpu, addr, sptes, root);
-}
-
-bool kvm_tdp_mmu_gpa_is_mapped(struct kvm_vcpu *vcpu, u64 gpa)
-{
-	struct kvm *kvm = vcpu->kvm;
-	bool is_direct = kvm_is_addr_direct(kvm, gpa);
-	hpa_t root = is_direct ? vcpu->arch.mmu->root.hpa :
-				 vcpu->arch.mmu->mirror_root_hpa;
-	u64 sptes[PT64_ROOT_MAX_LEVEL + 1], spte;
-	int leaf;
-
-	lockdep_assert_held(&kvm->mmu_lock);
-	rcu_read_lock();
-	leaf = __kvm_tdp_mmu_get_walk(vcpu, gpa, sptes, root_to_sp(root));
-	rcu_read_unlock();
-	if (leaf < 0)
-		return false;
-
-	spte = sptes[leaf];
-	return is_shadow_present_pte(spte) && is_last_spte(spte, leaf);
-}
-EXPORT_SYMBOL_GPL(kvm_tdp_mmu_gpa_is_mapped);
-
 /*
  * Returns the last level spte pointer of the shadow page walk for the given
  * gpa, and sets *spte to the spte value. This spte may be non-present. If no
-- 
2.51.0.268.g9569e192d0-goog

Rename kvm_tdp_map_page() to kvm_tdp_prefault_page() now that it's used only
by kvm_arch_vcpu_pre_fault_memory().

No functional change intended.

Signed-off-by: Sean Christopherson
---
 arch/x86/kvm/mmu/mmu.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index f532beed9029..cb08785ce29b 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -4900,8 +4900,8 @@ int kvm_tdp_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
 	return direct_page_fault(vcpu, fault);
 }
 
-static int kvm_tdp_map_page(struct kvm_vcpu *vcpu, gpa_t gpa, u64 error_code,
-			    u8 *level)
+static int kvm_tdp_prefault_page(struct kvm_vcpu *vcpu, gpa_t gpa,
+				 u64 error_code, u8 *level)
 {
 	int r;
 
@@ -4978,7 +4978,7 @@ long kvm_arch_vcpu_pre_fault_memory(struct kvm_vcpu *vcpu,
 	 * Shadow paging uses GVA for kvm page fault, so restrict to
 	 * two-dimensional paging.
 	 */
-	r = kvm_tdp_map_page(vcpu, range->gpa | direct_bits, error_code, &level);
+	r = kvm_tdp_prefault_page(vcpu, range->gpa | direct_bits, error_code, &level);
 	if (r < 0)
 		return r;
-- 
2.51.0.268.g9569e192d0-goog

Don't explicitly pin pages when mapping pages into the S-EPT, as guest_memfd
doesn't support page migration in any capacity, i.e. there are no migrate
callbacks because guest_memfd pages *can't* be migrated.  See the WARN in
kvm_gmem_migrate_folio().
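For reference, guest_memfd's migration callback is, roughly, a hard
rejection (paraphrased from virt/kvm/guest_memfd.c; the exact shape may
differ by kernel version):

    static int kvm_gmem_migrate_folio(struct address_space *mapping,
                                      struct folio *dst, struct folio *src,
                                      enum migrate_mode mode)
    {
        /* guest_memfd folios must never be migrated. */
        WARN_ON_ONCE(1);
        return -EINVAL;
    }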
Signed-off-by: Sean Christopherson
---
 arch/x86/kvm/vmx/tdx.c | 28 ++++------------------------
 1 file changed, 4 insertions(+), 24 deletions(-)

diff --git a/arch/x86/kvm/vmx/tdx.c b/arch/x86/kvm/vmx/tdx.c
index 1724d82c8512..9fb6e5f02cc9 100644
--- a/arch/x86/kvm/vmx/tdx.c
+++ b/arch/x86/kvm/vmx/tdx.c
@@ -1586,29 +1586,22 @@ void tdx_load_mmu_pgd(struct kvm_vcpu *vcpu, hpa_t root_hpa, int pgd_level)
 	td_vmcs_write64(to_tdx(vcpu), SHARED_EPT_POINTER, root_hpa);
 }
 
-static void tdx_unpin(struct kvm *kvm, struct page *page)
-{
-	put_page(page);
-}
-
 static int tdx_mem_page_aug(struct kvm *kvm, gfn_t gfn,
-			    enum pg_level level, struct page *page)
+			    enum pg_level level, kvm_pfn_t pfn)
 {
 	int tdx_level = pg_level_to_tdx_sept_level(level);
 	struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
+	struct page *page = pfn_to_page(pfn);
 	gpa_t gpa = gfn_to_gpa(gfn);
 	u64 entry, level_state;
 	u64 err;
 
 	err = tdh_mem_page_aug(&kvm_tdx->td, gpa, tdx_level, page, &entry, &level_state);
-	if (unlikely(tdx_operand_busy(err))) {
-		tdx_unpin(kvm, page);
+	if (unlikely(tdx_operand_busy(err)))
 		return -EBUSY;
-	}
 
 	if (KVM_BUG_ON(err, kvm)) {
 		pr_tdx_error_2(TDH_MEM_PAGE_AUG, err, entry, level_state);
-		tdx_unpin(kvm, page);
 		return -EIO;
 	}
 
@@ -1642,29 +1635,18 @@ static int tdx_sept_set_private_spte(struct kvm *kvm, gfn_t gfn,
 				     enum pg_level level, kvm_pfn_t pfn)
 {
 	struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
-	struct page *page = pfn_to_page(pfn);
 
 	/* TODO: handle large pages. */
 	if (KVM_BUG_ON(level != PG_LEVEL_4K, kvm))
 		return -EINVAL;
 
-	/*
-	 * Because guest_memfd doesn't support page migration with
-	 * a_ops->migrate_folio (yet), no callback is triggered for KVM on page
-	 * migration.  Until guest_memfd supports page migration, prevent page
-	 * migration.
-	 * TODO: Once guest_memfd introduces callback on page migration,
-	 * implement it and remove get_page/put_page().
-	 */
-	get_page(page);
-
 	/*
 	 * Read 'pre_fault_allowed' before 'kvm_tdx->state'; see matching
 	 * barrier in tdx_td_finalize().
 	 */
 	smp_rmb();
 	if (likely(kvm_tdx->state == TD_STATE_RUNNABLE))
-		return tdx_mem_page_aug(kvm, gfn, level, page);
+		return tdx_mem_page_aug(kvm, gfn, level, pfn);
 
 	return tdx_mem_page_record_premap_cnt(kvm, gfn, level, pfn);
 }
 
@@ -1715,7 +1697,6 @@ static int tdx_sept_drop_private_spte(struct kvm *kvm, gfn_t gfn,
 		return -EIO;
 	}
 	tdx_clear_page(page);
-	tdx_unpin(kvm, page);
 	return 0;
 }
 
@@ -1795,7 +1776,6 @@ static int tdx_sept_zap_private_spte(struct kvm *kvm, gfn_t gfn,
 	if (tdx_is_sept_zap_err_due_to_premap(kvm_tdx, err, entry, level) &&
 	    !KVM_BUG_ON(!atomic64_read(&kvm_tdx->nr_premapped), kvm)) {
 		atomic64_dec(&kvm_tdx->nr_premapped);
-		tdx_unpin(kvm, page);
 		return 0;
 	}
-- 
2.51.0.268.g9569e192d0-goog

Return -EIO when a KVM_BUG_ON() is tripped, as KVM's ABI is to return -EIO
when a VM has been killed due to a KVM bug, not -EINVAL.

Signed-off-by: Sean Christopherson
---
 arch/x86/kvm/vmx/tdx.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/arch/x86/kvm/vmx/tdx.c b/arch/x86/kvm/vmx/tdx.c
index 9fb6e5f02cc9..ef4ffcad131f 100644
--- a/arch/x86/kvm/vmx/tdx.c
+++ b/arch/x86/kvm/vmx/tdx.c
@@ -1624,7 +1624,7 @@ static int tdx_mem_page_record_premap_cnt(struct kvm *kvm, gfn_t gfn,
 	struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
 
 	if (KVM_BUG_ON(kvm->arch.pre_fault_allowed, kvm))
-		return -EINVAL;
+		return -EIO;
 
 	/* nr_premapped will be decreased when tdh_mem_page_add() is called. */
 	atomic64_inc(&kvm_tdx->nr_premapped);
@@ -1638,7 +1638,7 @@ static int tdx_sept_set_private_spte(struct kvm *kvm, gfn_t gfn,
 
 	/* TODO: handle large pages. */
 	if (KVM_BUG_ON(level != PG_LEVEL_4K, kvm))
-		return -EINVAL;
+		return -EIO;
 
 	/*
 	 * Read 'pre_fault_allowed' before 'kvm_tdx->state'; see matching
@@ -1849,7 +1849,7 @@ static int tdx_sept_free_private_spt(struct kvm *kvm, gfn_t gfn,
 	 * and slot move/deletion.
 	 */
 	if (KVM_BUG_ON(is_hkid_assigned(kvm_tdx), kvm))
-		return -EINVAL;
+		return -EIO;
 
 	/*
 	 * The HKID assigned to this TD was already freed and cache was
@@ -1870,7 +1870,7 @@ static int tdx_sept_remove_private_spte(struct kvm *kvm, gfn_t gfn,
 	 * there can't be anything populated in the private EPT.
 	 */
 	if (KVM_BUG_ON(!is_hkid_assigned(to_kvm_tdx(kvm)), kvm))
-		return -EINVAL;
+		return -EIO;
 
 	ret = tdx_sept_zap_private_spte(kvm, gfn, level, page);
 	if (ret <= 0)
-- 
2.51.0.268.g9569e192d0-goog

Return -EIO immediately from tdx_sept_zap_private_spte() if the number of
to-be-added pages underflows, so that the following "KVM_BUG_ON(err, kvm)"
isn't also triggered.  Isolating the check from the "is premap error"
if-statement will also allow adding a lockdep assertion that premap errors
are encountered if and only if slots_lock is held.

Signed-off-by: Sean Christopherson
---
 arch/x86/kvm/vmx/tdx.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/arch/x86/kvm/vmx/tdx.c b/arch/x86/kvm/vmx/tdx.c
index ef4ffcad131f..88079e2d45fb 100644
--- a/arch/x86/kvm/vmx/tdx.c
+++ b/arch/x86/kvm/vmx/tdx.c
@@ -1773,8 +1773,10 @@ static int tdx_sept_zap_private_spte(struct kvm *kvm, gfn_t gfn,
 		err = tdh_mem_range_block(&kvm_tdx->td, gpa, tdx_level, &entry, &level_state);
 		tdx_no_vcpus_enter_stop(kvm);
 	}
-	if (tdx_is_sept_zap_err_due_to_premap(kvm_tdx, err, entry, level) &&
-	    !KVM_BUG_ON(!atomic64_read(&kvm_tdx->nr_premapped), kvm)) {
+	if (tdx_is_sept_zap_err_due_to_premap(kvm_tdx, err, entry, level)) {
+		if (KVM_BUG_ON(!atomic64_read(&kvm_tdx->nr_premapped), kvm))
+			return -EIO;
+
 		atomic64_dec(&kvm_tdx->nr_premapped);
 		return 0;
 	}
-- 
2.51.0.268.g9569e192d0-goog

Use atomic64_dec_return() when decrementing the number of "pre-mapped"
S-EPT pages to ensure that the count can't go negative without KVM noticing.

In theory, checking for '0' and then decrementing in a separate operation
could miss a 0=>-1 transition.  In practice, such a condition is impossible
because nr_premapped is protected by slots_lock, i.e. doesn't actually need
to be an atomic (that wart will be addressed shortly).

Don't bother trying to keep the count non-negative, as the KVM_BUG_ON()
ensures the VM is dead, i.e. there's no point in trying to limp along.
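To illustrate the theoretical race, condensed from the hunks below into a
sketch:

    /* Check-then-decrement: two racing callers can both observe a count of
     * 1, and the second decrement silently takes the count to -1. */
    if (!KVM_BUG_ON(!atomic64_read(&kvm_tdx->nr_premapped), kvm))
        atomic64_dec(&kvm_tdx->nr_premapped);

    /* Decrement-and-check: any underflow is detected by the same atomic
     * operation that performs the decrement. */
    KVM_BUG_ON(atomic64_dec_return(&kvm_tdx->nr_premapped) < 0, kvm);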
Signed-off-by: Sean Christopherson
---
 arch/x86/kvm/vmx/tdx.c | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/arch/x86/kvm/vmx/tdx.c b/arch/x86/kvm/vmx/tdx.c
index 88079e2d45fb..b7559ea1e353 100644
--- a/arch/x86/kvm/vmx/tdx.c
+++ b/arch/x86/kvm/vmx/tdx.c
@@ -1774,10 +1774,9 @@ static int tdx_sept_zap_private_spte(struct kvm *kvm, gfn_t gfn,
 		tdx_no_vcpus_enter_stop(kvm);
 	}
 	if (tdx_is_sept_zap_err_due_to_premap(kvm_tdx, err, entry, level)) {
-		if (KVM_BUG_ON(!atomic64_read(&kvm_tdx->nr_premapped), kvm))
+		if (KVM_BUG_ON(atomic64_dec_return(&kvm_tdx->nr_premapped) < 0, kvm))
 			return -EIO;
 
-		atomic64_dec(&kvm_tdx->nr_premapped);
 		return 0;
 	}
 
@@ -3162,8 +3161,7 @@ static int tdx_gmem_post_populate(struct kvm *kvm, gfn_t gfn, kvm_pfn_t pfn,
 		goto out;
 	}
 
-	if (!KVM_BUG_ON(!atomic64_read(&kvm_tdx->nr_premapped), kvm))
-		atomic64_dec(&kvm_tdx->nr_premapped);
+	KVM_BUG_ON(atomic64_dec_return(&kvm_tdx->nr_premapped) < 0, kvm);
 
 	if (arg->flags & KVM_TDX_MEASURE_MEMORY_REGION) {
 		for (i = 0; i < PAGE_SIZE; i += TDX_EXTENDMR_CHUNKSIZE) {
-- 
2.51.0.268.g9569e192d0-goog

Fold tdx_mem_page_record_premap_cnt() into tdx_sept_set_private_spte() as
providing a one-off helper for effectively three lines of code is at best a
wash, and splitting the code makes the comment for smp_rmb() _extremely_
confusing as the comment talks about reading kvm->arch.pre_fault_allowed
before kvm_tdx->state, but the immediately visible code does the exact
opposite.

Opportunistically rewrite the comments to more explicitly explain who is
checking what, as well as _why_ the ordering matters.

No functional change intended.

Signed-off-by: Sean Christopherson
---
 arch/x86/kvm/vmx/tdx.c | 49 ++++++++++++++++++------------------------
 1 file changed, 21 insertions(+), 28 deletions(-)

diff --git a/arch/x86/kvm/vmx/tdx.c b/arch/x86/kvm/vmx/tdx.c
index b7559ea1e353..e4b70c0dbda3 100644
--- a/arch/x86/kvm/vmx/tdx.c
+++ b/arch/x86/kvm/vmx/tdx.c
@@ -1608,29 +1608,6 @@ static int tdx_mem_page_aug(struct kvm *kvm, gfn_t gfn,
 	return 0;
 }
 
-/*
- * KVM_TDX_INIT_MEM_REGION calls kvm_gmem_populate() to map guest pages; the
- * callback tdx_gmem_post_populate() then maps pages into private memory.
- * through the a seamcall TDH.MEM.PAGE.ADD().  The SEAMCALL also requires the
- * private EPT structures for the page to have been built before, which is
- * done via kvm_tdp_map_page(). nr_premapped counts the number of pages that
- * were added to the EPT structures but not added with TDH.MEM.PAGE.ADD().
- * The counter has to be zero on KVM_TDX_FINALIZE_VM, to ensure that there
- * are no half-initialized shared EPT pages.
- */
-static int tdx_mem_page_record_premap_cnt(struct kvm *kvm, gfn_t gfn,
-					  enum pg_level level, kvm_pfn_t pfn)
-{
-	struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
-
-	if (KVM_BUG_ON(kvm->arch.pre_fault_allowed, kvm))
-		return -EIO;
-
-	/* nr_premapped will be decreased when tdh_mem_page_add() is called. */
-	atomic64_inc(&kvm_tdx->nr_premapped);
-	return 0;
-}
-
 static int tdx_sept_set_private_spte(struct kvm *kvm, gfn_t gfn,
 				     enum pg_level level, kvm_pfn_t pfn)
 {
@@ -1641,14 +1618,30 @@ static int tdx_sept_set_private_spte(struct kvm *kvm, gfn_t gfn,
 		return -EIO;
 
 	/*
-	 * Read 'pre_fault_allowed' before 'kvm_tdx->state'; see matching
-	 * barrier in tdx_td_finalize().
+	 * Ensure pre_fault_allowed is read by kvm_arch_vcpu_pre_fault_memory()
+	 * before kvm_tdx->state.  Userspace must not be allowed to pre-fault
+	 * arbitrary memory until the initial memory image is finalized.  Pairs
+	 * with the smp_wmb() in tdx_td_finalize().
 	 */
 	smp_rmb();
-	if (likely(kvm_tdx->state == TD_STATE_RUNNABLE))
-		return tdx_mem_page_aug(kvm, gfn, level, pfn);
-
-	return tdx_mem_page_record_premap_cnt(kvm, gfn, level, pfn);
+	/*
+	 * If the TD isn't finalized/runnable, then userspace is initializing
+	 * the VM image via KVM_TDX_INIT_MEM_REGION.  Increment the number of
+	 * pages that need to be initialized via TDH.MEM.PAGE.ADD (PAGE.ADD
+	 * requires a pre-existing S-EPT mapping).  KVM_TDX_FINALIZE_VM checks
+	 * the counter to ensure all mapped pages have been added to the image,
+	 * to prevent running the TD with uninitialized memory.
+	 */
+	if (unlikely(kvm_tdx->state != TD_STATE_RUNNABLE)) {
+		if (KVM_BUG_ON(kvm->arch.pre_fault_allowed, kvm))
+			return -EIO;
+
+		atomic64_inc(&kvm_tdx->nr_premapped);
+		return 0;
+	}
+
+	return tdx_mem_page_aug(kvm, gfn, level, pfn);
 }
 
 static int tdx_sept_drop_private_spte(struct kvm *kvm, gfn_t gfn,
-- 
2.51.0.268.g9569e192d0-goog

Assert that slots_lock is held when the TDX code accesses the number of
premapped pfns, as KVM relies on calls to tdx_vcpu_init_mem_region() being
serialized to prevent double-population of gmem and false negatives on the
consumption of a "premapped" pfn.

In addition to helping document how the TDX code works, this will allow
converting "nr_premapped" to a non-atomic variable, as all usage asserts
that slots_lock is held.

Signed-off-by: Sean Christopherson
---
 arch/x86/kvm/vmx/tdx.c | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/arch/x86/kvm/vmx/tdx.c b/arch/x86/kvm/vmx/tdx.c
index e4b70c0dbda3..27941defb62e 100644
--- a/arch/x86/kvm/vmx/tdx.c
+++ b/arch/x86/kvm/vmx/tdx.c
@@ -1634,6 +1634,8 @@ static int tdx_sept_set_private_spte(struct kvm *kvm, gfn_t gfn,
 	 * to prevent running the TD with uninitialized memory.
 	 */
 	if (unlikely(kvm_tdx->state != TD_STATE_RUNNABLE)) {
+		lockdep_assert_held(&kvm->slots_lock);
+
 		if (KVM_BUG_ON(kvm->arch.pre_fault_allowed, kvm))
 			return -EIO;
 
@@ -1767,6 +1769,8 @@ static int tdx_sept_zap_private_spte(struct kvm *kvm, gfn_t gfn,
 		tdx_no_vcpus_enter_stop(kvm);
 	}
 	if (tdx_is_sept_zap_err_due_to_premap(kvm_tdx, err, entry, level)) {
+		lockdep_assert_held(&kvm->slots_lock);
+
 		if (KVM_BUG_ON(atomic64_dec_return(&kvm_tdx->nr_premapped) < 0, kvm))
 			return -EIO;
 
@@ -3132,6 +3136,8 @@ static int tdx_gmem_post_populate(struct kvm *kvm, gfn_t gfn, kvm_pfn_t pfn,
 	struct page *src_page;
 	int ret, i;
 
+	lockdep_assert_held(&kvm->slots_lock);
+
 	/*
 	 * Get the source page if it has been faulted in.  Return failure if the
 	 * source page has been swapped out or unmapped in primary memory.
-- 
2.51.0.268.g9569e192d0-goog

Track the number of premapped pfns as a non-atomic variable as all usage is
guarded by slots_lock, and KVM now asserts as much.

Note, slots_lock has always effectively guarded nr_premapped since TDX
support landed; the use of an atomic64_t was likely a leftover from
development that was never cleaned up.
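The conversion follows the standard pattern for a counter that is only ever
touched under a single lock; an illustrative sketch (field names as in the
diff below):

    /* Every reader and writer runs under slots_lock, so plain increments
     * and decrements suffice; lockdep keeps the invariant honest. */
    lockdep_assert_held(&kvm->slots_lock);
    kvm_tdx->nr_premapped++;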
Signed-off-by: Sean Christopherson
---
 arch/x86/kvm/vmx/tdx.c | 8 ++++----
 arch/x86/kvm/vmx/tdx.h | 2 +-
 2 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/arch/x86/kvm/vmx/tdx.c b/arch/x86/kvm/vmx/tdx.c
index 27941defb62e..5d2bb27f22da 100644
--- a/arch/x86/kvm/vmx/tdx.c
+++ b/arch/x86/kvm/vmx/tdx.c
@@ -1639,7 +1639,7 @@ static int tdx_sept_set_private_spte(struct kvm *kvm, gfn_t gfn,
 		if (KVM_BUG_ON(kvm->arch.pre_fault_allowed, kvm))
 			return -EIO;
 
-		atomic64_inc(&kvm_tdx->nr_premapped);
+		kvm_tdx->nr_premapped++;
 		return 0;
 	}
 
@@ -1771,7 +1771,7 @@ static int tdx_sept_zap_private_spte(struct kvm *kvm, gfn_t gfn,
 	if (tdx_is_sept_zap_err_due_to_premap(kvm_tdx, err, entry, level)) {
 		lockdep_assert_held(&kvm->slots_lock);
 
-		if (KVM_BUG_ON(atomic64_dec_return(&kvm_tdx->nr_premapped) < 0, kvm))
+		if (KVM_BUG_ON(--kvm_tdx->nr_premapped < 0, kvm))
 			return -EIO;
 
 		return 0;
@@ -2846,7 +2846,7 @@ static int tdx_td_finalize(struct kvm *kvm, struct kvm_tdx_cmd *cmd)
 	 * Pages are pending for KVM_TDX_INIT_MEM_REGION to issue
 	 * TDH.MEM.PAGE.ADD().
 	 */
-	if (atomic64_read(&kvm_tdx->nr_premapped))
+	if (kvm_tdx->nr_premapped)
 		return -EINVAL;
 
 	cmd->hw_error = tdh_mr_finalize(&kvm_tdx->td);
@@ -3160,7 +3160,7 @@ static int tdx_gmem_post_populate(struct kvm *kvm, gfn_t gfn, kvm_pfn_t pfn,
 		goto out;
 	}
 
-	KVM_BUG_ON(atomic64_dec_return(&kvm_tdx->nr_premapped) < 0, kvm);
+	KVM_BUG_ON(--kvm_tdx->nr_premapped < 0, kvm);
 
 	if (arg->flags & KVM_TDX_MEASURE_MEMORY_REGION) {
 		for (i = 0; i < PAGE_SIZE; i += TDX_EXTENDMR_CHUNKSIZE) {
diff --git a/arch/x86/kvm/vmx/tdx.h b/arch/x86/kvm/vmx/tdx.h
index ca39a9391db1..04ba9ea3e0ba 100644
--- a/arch/x86/kvm/vmx/tdx.h
+++ b/arch/x86/kvm/vmx/tdx.h
@@ -37,7 +37,7 @@ struct kvm_tdx {
 	struct tdx_td td;
 
 	/* For KVM_TDX_INIT_MEM_REGION. */
-	atomic64_t nr_premapped;
+	unsigned long nr_premapped;
 
 	/*
 	 * Prevent vCPUs from TD entry to ensure SEPT zap related SEAMCALLs do
-- 
2.51.0.268.g9569e192d0-goog

Rename "nr_premapped" to an absurdly verbose "nr_pending_tdh_mem_page_adds"
to make it explicitly clear what the counter tracks.  "pre-map" is far too
similar to "pre-fault", especially since tdx_sept_set_private_spte() deals
with both "pre_fault_allowed" and the counter.

No functional change intended.

Signed-off-by: Sean Christopherson
---
 arch/x86/kvm/vmx/tdx.c | 8 ++++----
 arch/x86/kvm/vmx/tdx.h | 9 +++++++--
 2 files changed, 11 insertions(+), 6 deletions(-)

diff --git a/arch/x86/kvm/vmx/tdx.c b/arch/x86/kvm/vmx/tdx.c
index 5d2bb27f22da..f9ac590e8ff0 100644
--- a/arch/x86/kvm/vmx/tdx.c
+++ b/arch/x86/kvm/vmx/tdx.c
@@ -1639,7 +1639,7 @@ static int tdx_sept_set_private_spte(struct kvm *kvm, gfn_t gfn,
 		if (KVM_BUG_ON(kvm->arch.pre_fault_allowed, kvm))
 			return -EIO;
 
-		kvm_tdx->nr_premapped++;
+		kvm_tdx->nr_pending_tdh_mem_page_adds++;
 		return 0;
 	}
 
@@ -1771,7 +1771,7 @@ static int tdx_sept_zap_private_spte(struct kvm *kvm, gfn_t gfn,
 	if (tdx_is_sept_zap_err_due_to_premap(kvm_tdx, err, entry, level)) {
 		lockdep_assert_held(&kvm->slots_lock);
 
-		if (KVM_BUG_ON(--kvm_tdx->nr_premapped < 0, kvm))
+		if (KVM_BUG_ON(--kvm_tdx->nr_pending_tdh_mem_page_adds < 0, kvm))
 			return -EIO;
 
 		return 0;
@@ -2846,7 +2846,7 @@ static int tdx_td_finalize(struct kvm *kvm, struct kvm_tdx_cmd *cmd)
 	 * Pages are pending for KVM_TDX_INIT_MEM_REGION to issue
 	 * TDH.MEM.PAGE.ADD().
 	 */
-	if (kvm_tdx->nr_premapped)
+	if (kvm_tdx->nr_pending_tdh_mem_page_adds)
 		return -EINVAL;
 
 	cmd->hw_error = tdh_mr_finalize(&kvm_tdx->td);
@@ -3160,7 +3160,7 @@ static int tdx_gmem_post_populate(struct kvm *kvm, gfn_t gfn, kvm_pfn_t pfn,
 		goto out;
 	}
 
-	KVM_BUG_ON(--kvm_tdx->nr_premapped < 0, kvm);
+	KVM_BUG_ON(--kvm_tdx->nr_pending_tdh_mem_page_adds < 0, kvm);
 
 	if (arg->flags & KVM_TDX_MEASURE_MEMORY_REGION) {
 		for (i = 0; i < PAGE_SIZE; i += TDX_EXTENDMR_CHUNKSIZE) {
diff --git a/arch/x86/kvm/vmx/tdx.h b/arch/x86/kvm/vmx/tdx.h
index 04ba9ea3e0ba..45d86f9fa41c 100644
--- a/arch/x86/kvm/vmx/tdx.h
+++ b/arch/x86/kvm/vmx/tdx.h
@@ -36,8 +36,13 @@ struct kvm_tdx {
 
 	struct tdx_td td;
 
-	/* For KVM_TDX_INIT_MEM_REGION. */
-	unsigned long nr_premapped;
+	/*
+	 * The number of pages that KVM_TDX_INIT_MEM_REGION has mapped into the
+	 * S-EPT, but not yet initialized via TDH.MEM.PAGE_ADD.  Used to sanity
+	 * check adding pages to the image, and to ensure that all pages have
+	 * been initialized before finalizing the TD.
+	 */
+	unsigned long nr_pending_tdh_mem_page_adds;
 
 	/*
 	 * Prevent vCPUs from TD entry to ensure SEPT zap related SEAMCALLs do
-- 
2.51.0.268.g9569e192d0-goog
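To recap where the series lands, the counter's lifecycle, condensed from the
hunks above into an illustrative sketch (every path runs under slots_lock):

    /*
     * tdx_sept_set_private_spte(): S-EPT mapping created while the TD is
     *                              not yet RUNNABLE            -> counter++
     * tdx_gmem_post_populate():    page initialized via
     *                              TDH.MEM.PAGE.ADD            -> counter--
     * tdx_sept_zap_private_spte(): premapped page zapped before
     *                              being added                 -> counter--
     * tdx_td_finalize():           counter must be zero, else -EINVAL
     */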