While removing the KVM_BUG_ON() for the mirror root before invoking
tdp_mmu_split_huge_page() in the fault path, update the hook
split_external_spt to pass in whether the mmu_lock is held for read, and
invoke the hook from set_external_spte_present() when splitting is
detected. Reject the splitting in TDX if it is requested under a shared
mmu_lock.

TDX requires different handling for splitting under a shared versus an
exclusive mmu_lock. Under a shared mmu_lock, TDX cannot kick off all
vCPUs to avoid a BUSY error from tdh_mem_page_demote(). Since the
current TDX module requires tdh_mem_range_block() to be invoked before
each tdh_mem_page_demote(), if a BUSY error occurs, TDX must call
tdh_mem_range_unblock() before returning the error to the KVM MMU core,
which then rolls back to the old SPTE and retries. However,
tdh_mem_range_unblock() may also fail due to contention.

Reject splitting huge pages under a shared mmu_lock for the mirror root
in TDX, rather than KVM_BUG_ON() in the KVM MMU core, to allow for a
future real implementation of demote under a shared mmu_lock once
non-blocking demote is available.

Signed-off-by: Yan Zhao
---
RFC v2:
- WARN_ON_ONCE() and return an error in tdx_sept_split_private_spt() if
  it's invoked under a shared mmu_lock (rather than increasing the next
  fault's max_level in the current vCPU via tdx->violation_gfn_start/end
  and tdx->violation_request_level).
- TODO: Perform the real implementation of demote under a shared
  mmu_lock when a version of the TDX module supporting non-blocking
  demote is available.

RFC v1:
- New patch.
---
 arch/x86/include/asm/kvm_host.h |  2 +-
 arch/x86/kvm/mmu/tdp_mmu.c      | 45 ++++++++++++++++++++-------------
 arch/x86/kvm/vmx/tdx.c          |  8 +++++-
 3 files changed, 36 insertions(+), 19 deletions(-)

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index e431ce0e3180..6cb5b422dd1d 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -1841,7 +1841,7 @@ struct kvm_x86_ops {
 
 	/* Split the external page table into smaller page tables */
 	int (*split_external_spt)(struct kvm *kvm, gfn_t gfn, enum pg_level level,
-				  void *external_spt);
+				  void *external_spt, bool mmu_lock_shared);
 
 	bool (*has_wbinvd_exit)(void);
 
diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c
index a2c6e6e4773f..ce49cc850ed5 100644
--- a/arch/x86/kvm/mmu/tdp_mmu.c
+++ b/arch/x86/kvm/mmu/tdp_mmu.c
@@ -386,15 +386,14 @@ static void remove_external_spte(struct kvm *kvm, gfn_t gfn, u64 old_spte,
 }
 
 static int split_external_spt(struct kvm *kvm, gfn_t gfn, u64 old_spte,
-			      u64 new_spte, int level)
+			      u64 new_spte, int level, bool shared)
 {
 	void *external_spt = get_external_spt(gfn, new_spte, level);
 	int ret;
 
 	KVM_BUG_ON(!external_spt, kvm);
-	ret = kvm_x86_call(split_external_spt)(kvm, gfn, level, external_spt);
-
+	ret = kvm_x86_call(split_external_spt)(kvm, gfn, level, external_spt, shared);
 	return ret;
 }
 
 /**
@@ -533,11 +532,19 @@ static int __must_check set_external_spte_present(struct kvm *kvm, tdp_ptep_t sp
 {
 	bool was_present = is_shadow_present_pte(old_spte);
 	bool is_present = is_shadow_present_pte(new_spte);
+	bool was_leaf = was_present && is_last_spte(old_spte, level);
 	bool is_leaf = is_present && is_last_spte(new_spte, level);
 	kvm_pfn_t new_pfn = spte_to_pfn(new_spte);
 	int ret = 0;
 
-	KVM_BUG_ON(was_present, kvm);
+	/*
+	 * Caller ensures new_spte must be present.
+	 * Current valid transitions:
+	 * - leaf to non-leaf (demote)
+	 * - !present to present leaf
+	 * - !present to present non-leaf
+	 */
+	KVM_BUG_ON(!(!was_present || (was_leaf && !is_leaf)), kvm);
 	lockdep_assert_held(&kvm->mmu_lock);
 
 	/*
@@ -548,18 +555,24 @@ static int __must_check set_external_spte_present(struct kvm *kvm, tdp_ptep_t sp
 	if (!try_cmpxchg64(rcu_dereference(sptep), &old_spte, FROZEN_SPTE))
 		return -EBUSY;
 
-	/*
-	 * Use different call to either set up middle level
-	 * external page table, or leaf.
-	 */
-	if (is_leaf) {
-		ret = kvm_x86_call(set_external_spte)(kvm, gfn, level, new_pfn);
-	} else {
-		void *external_spt = get_external_spt(gfn, new_spte, level);
+	if (!was_present) {
+		/*
+		 * Use different call to either set up middle level
+		 * external page table, or leaf.
+		 */
+		if (is_leaf) {
+			ret = kvm_x86_call(set_external_spte)(kvm, gfn, level, new_pfn);
+		} else {
+			void *external_spt = get_external_spt(gfn, new_spte, level);
 
-		KVM_BUG_ON(!external_spt, kvm);
-		ret = kvm_x86_call(link_external_spt)(kvm, gfn, level, external_spt);
+			KVM_BUG_ON(!external_spt, kvm);
+			ret = kvm_x86_call(link_external_spt)(kvm, gfn, level, external_spt);
+		}
+	} else if (was_leaf && !is_leaf) {
+		/* demote */
+		ret = split_external_spt(kvm, gfn, old_spte, new_spte, level, true);
 	}
+
 	if (ret)
 		__kvm_tdp_mmu_write_spte(sptep, old_spte);
 	else
@@ -789,7 +802,7 @@ static u64 tdp_mmu_set_spte(struct kvm *kvm, int as_id, tdp_ptep_t sptep,
 		if (!is_shadow_present_pte(new_spte))
 			remove_external_spte(kvm, gfn, old_spte, level);
 		else if (is_last_spte(old_spte, level) && !is_last_spte(new_spte, level))
-			split_external_spt(kvm, gfn, old_spte, new_spte, level);
+			split_external_spt(kvm, gfn, old_spte, new_spte, level, false);
 		else
 			KVM_BUG_ON(1, kvm);
 	}
@@ -1308,8 +1321,6 @@ int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
 		sp->nx_huge_page_disallowed = fault->huge_page_disallowed;
 
 		if (is_shadow_present_pte(iter.old_spte)) {
-			/* Don't support large page for mirrored roots (TDX) */
-			KVM_BUG_ON(is_mirror_sptep(iter.sptep), vcpu->kvm);
 			r = tdp_mmu_split_huge_page(kvm, &iter, sp, true);
 		} else {
 			r = tdp_mmu_link_sp(kvm, &iter, sp, true);
diff --git a/arch/x86/kvm/vmx/tdx.c b/arch/x86/kvm/vmx/tdx.c
index 8a60ba5b6595..035d81275be4 100644
--- a/arch/x86/kvm/vmx/tdx.c
+++ b/arch/x86/kvm/vmx/tdx.c
@@ -1941,7 +1941,7 @@ static int tdx_spte_demote_private_spte(struct kvm *kvm, gfn_t gfn,
 }
 
 static int tdx_sept_split_private_spt(struct kvm *kvm, gfn_t gfn, enum pg_level level,
-				      void *private_spt)
+				      void *private_spt, bool mmu_lock_shared)
 {
 	struct page *page = virt_to_page(private_spt);
 	int ret;
@@ -1950,6 +1950,12 @@ static int tdx_sept_split_private_spt(struct kvm *kvm, gfn_t gfn, enum pg_level
 			 level != PG_LEVEL_2M, kvm))
 		return -EINVAL;
 
+	if (WARN_ON_ONCE(mmu_lock_shared)) {
+		pr_warn_once("Splitting of GFN %llx level %d under shared lock occurs when KVM does not support it yet\n",
+			     gfn, level);
+		return -EOPNOTSUPP;
+	}
+
 	ret = tdx_sept_zap_private_spte(kvm, gfn, level, page);
 	if (ret <= 0)
 		return ret;
--
2.43.2
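
For reference, below is a minimal sketch of the sequencing problem the
commit message describes. It is illustrative only: the function name
demote_under_shared_lock_sketch() and the simplified signatures of the
tdh_*() calls are hypothetical stand-ins, not the actual SEAMCALL
wrapper API.

/*
 * Sketch, not part of the patch: why demote cannot be rolled back
 * safely under a shared mmu_lock with the current TDX module.
 */
static int demote_under_shared_lock_sketch(struct kvm *kvm, gfn_t gfn,
					   enum pg_level level)
{
	/* The current TDX module requires blocking the range first. */
	if (tdh_mem_range_block(kvm, gfn, level))
		return -EBUSY;

	if (tdh_mem_page_demote(kvm, gfn, level)) {
		/*
		 * BUSY from demote: under an exclusive mmu_lock, KVM
		 * could kick off all vCPUs to guarantee forward
		 * progress; under a shared mmu_lock it cannot. The
		 * range must be unblocked before returning so the KVM
		 * MMU core can roll back the old SPTE and retry ...
		 */
		if (tdh_mem_range_unblock(kvm, gfn, level))
			return -EIO;	/* ... but unblock may contend too. */
		return -EBUSY;
	}

	return 0;
}

Hence the WARN_ON_ONCE() + -EOPNOTSUPP in tdx_sept_split_private_spt()
rather than attempting this sequence under a shared lock.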