Use kvm_test_request() instead of kvm_check_request() when querying
KVM_REQ_VM_DEAD, i.e. don't clear KVM_REQ_VM_DEAD, as the entire purpose
of KVM_REQ_VM_DEAD is to prevent the vCPU from entering the guest ever
again, even if userspace insists on redoing KVM_RUN.  Ensuring
KVM_REQ_VM_DEAD is never cleared will allow relaxing KVM's rule that
ioctls can't be invoked on dead VMs, to only disallow ioctls if the VM
is bugged, i.e. if KVM hit a KVM_BUG_ON().

Opportunistically add compile-time assertions to guard against clearing
KVM_REQ_VM_DEAD through the standard APIs.

Signed-off-by: Sean Christopherson
---
 arch/arm64/kvm/arm.c     | 2 +-
 arch/x86/kvm/mmu/mmu.c   | 2 +-
 arch/x86/kvm/vmx/tdx.c   | 2 +-
 arch/x86/kvm/x86.c       | 2 +-
 include/linux/kvm_host.h | 9 +++++++--
 5 files changed, 11 insertions(+), 6 deletions(-)

diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c
index f946926716b0..2fdc48c0fc4d 100644
--- a/arch/arm64/kvm/arm.c
+++ b/arch/arm64/kvm/arm.c
@@ -1013,7 +1013,7 @@ static int kvm_vcpu_suspend(struct kvm_vcpu *vcpu)
 static int check_vcpu_requests(struct kvm_vcpu *vcpu)
 {
 	if (kvm_request_pending(vcpu)) {
-		if (kvm_check_request(KVM_REQ_VM_DEAD, vcpu))
+		if (kvm_test_request(KVM_REQ_VM_DEAD, vcpu))
 			return -EIO;
 
 		if (kvm_check_request(KVM_REQ_SLEEP, vcpu))
diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index 6e838cb6c9e1..d09bd236a92d 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -4915,7 +4915,7 @@ int kvm_tdp_map_page(struct kvm_vcpu *vcpu, gpa_t gpa, u64 error_code, u8 *level
 		if (signal_pending(current))
 			return -EINTR;
 
-		if (kvm_check_request(KVM_REQ_VM_DEAD, vcpu))
+		if (kvm_test_request(KVM_REQ_VM_DEAD, vcpu))
 			return -EIO;
 
 		cond_resched();
diff --git a/arch/x86/kvm/vmx/tdx.c b/arch/x86/kvm/vmx/tdx.c
index 66744f5768c8..3e0d4edee849 100644
--- a/arch/x86/kvm/vmx/tdx.c
+++ b/arch/x86/kvm/vmx/tdx.c
@@ -2010,7 +2010,7 @@ static int tdx_handle_ept_violation(struct kvm_vcpu *vcpu)
 		if (kvm_vcpu_has_events(vcpu) || signal_pending(current))
 			break;
 
-		if (kvm_check_request(KVM_REQ_VM_DEAD, vcpu)) {
+		if (kvm_test_request(KVM_REQ_VM_DEAD, vcpu)) {
 			ret = -EIO;
 			break;
 		}
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index a1c49bc681c4..1700df68f12a 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -10649,7 +10649,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
 	bool req_immediate_exit = false;
 
 	if (kvm_request_pending(vcpu)) {
-		if (kvm_check_request(KVM_REQ_VM_DEAD, vcpu)) {
+		if (kvm_test_request(KVM_REQ_VM_DEAD, vcpu)) {
 			r = -EIO;
 			goto out;
 		}
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 15656b7fba6c..627054d27222 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -2261,13 +2261,18 @@ static inline bool kvm_test_request(int req, struct kvm_vcpu *vcpu)
 	return test_bit(req & KVM_REQUEST_MASK, (void *)&vcpu->requests);
 }
 
-static inline void kvm_clear_request(int req, struct kvm_vcpu *vcpu)
+static __always_inline void kvm_clear_request(int req, struct kvm_vcpu *vcpu)
 {
+	BUILD_BUG_ON(req == KVM_REQ_VM_DEAD);
+
 	clear_bit(req & KVM_REQUEST_MASK, (void *)&vcpu->requests);
 }
 
-static inline bool kvm_check_request(int req, struct kvm_vcpu *vcpu)
+static __always_inline bool kvm_check_request(int req, struct kvm_vcpu *vcpu)
 {
+	/* Once a VM is dead, it needs to stay dead. */
+	BUILD_BUG_ON(req == KVM_REQ_VM_DEAD);
+
 	if (kvm_test_request(req, vcpu)) {
 		kvm_clear_request(req, vcpu);
--
2.50.1.552.g942d659e1b-goog
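To make the semantic difference concrete, a minimal sketch (illustrative
only, not part of the patch; arm64's KVM_REQ_SLEEP stands in for any
clearable request):

	/* KVM_REQ_VM_DEAD is sticky: testing it never consumes it. */
	kvm_make_request(KVM_REQ_VM_DEAD, vcpu);
	WARN_ON(!kvm_test_request(KVM_REQ_VM_DEAD, vcpu));	/* pending */
	WARN_ON(!kvm_test_request(KVM_REQ_VM_DEAD, vcpu));	/* still pending */

	/* An ordinary request is consumed by kvm_check_request(). */
	kvm_make_request(KVM_REQ_SLEEP, vcpu);
	WARN_ON(!kvm_check_request(KVM_REQ_SLEEP, vcpu));	/* true, and clears */
	WARN_ON(kvm_check_request(KVM_REQ_SLEEP, vcpu));	/* now false */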
Exit to userspace with -EFAULT and a valid MEMORY_FAULT exit if a vCPU
hits an unexpected pending S-EPT Violation instead of marking the VM
dead.  While it's unlikely the VM can continue on, whether or not to
terminate the VM is not KVM's decision to make.

Set memory_fault.size to zero to communicate to userspace that the
reported fault is "bad", and to effectively terminate the VM if
userspace blindly treats the exit as a conversion attempt
(KVM_SET_MEMORY_ATTRIBUTES will fail with -EINVAL if the size is zero).

Opportunistically delete the pr_warn(), which could be abused to spam
the kernel log, and is largely useless outside of interactive debugging
as it doesn't specify which VM encountered a failure.

Signed-off-by: Sean Christopherson
---
 arch/x86/kvm/vmx/tdx.c | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/arch/x86/kvm/vmx/tdx.c b/arch/x86/kvm/vmx/tdx.c
index 3e0d4edee849..c2ef03f39c32 100644
--- a/arch/x86/kvm/vmx/tdx.c
+++ b/arch/x86/kvm/vmx/tdx.c
@@ -1937,10 +1937,8 @@ static int tdx_handle_ept_violation(struct kvm_vcpu *vcpu)
 
 	if (vt_is_tdx_private_gpa(vcpu->kvm, gpa)) {
 		if (tdx_is_sept_violation_unexpected_pending(vcpu)) {
-			pr_warn("Guest access before accepting 0x%llx on vCPU %d\n",
-				gpa, vcpu->vcpu_id);
-			kvm_vm_dead(vcpu->kvm);
-			return -EIO;
+			kvm_prepare_memory_fault_exit(vcpu, gpa, 0, true, false, true);
+			return -EFAULT;
 		}
 		/*
 		 * Always treat SEPT violations as write faults. Ignore the
--
2.50.1.552.g942d659e1b-goog
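For context, a VMM could consume the new exit along these lines
(hypothetical userspace sketch; handle_convert() is a made-up helper,
and <err.h> plus <linux/kvm.h> are assumed to be included):

	static void handle_memory_fault_exit(struct kvm_run *run)
	{
		/* A zero size can never be a valid conversion request. */
		if (!run->memory_fault.size)
			errx(1, "Unexpected fault at GPA 0x%llx",
			     (unsigned long long)run->memory_fault.gpa);

		handle_convert(run->memory_fault.gpa, run->memory_fault.size,
			       run->memory_fault.flags & KVM_MEMORY_EXIT_FLAG_PRIVATE);
	}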
Relax the protection against interacting with a buggy KVM to only reject
ioctls if the VM is bugged, i.e. allow userspace to invoke ioctls if KVM
deliberately terminated the VM.  Drop kvm.vm_dead as there are no longer
any readers, and KVM shouldn't rely on vm_dead for functional
correctness.  The only functional guarantees provided by kvm_vm_dead()
come by way of KVM_REQ_VM_DEAD, which ensures that vCPUs won't re-enter
the guest.

Practically speaking, this only affects x86, which uses kvm_vm_dead() to
prevent running a VM whose resources have been partially freed, or that
has run one or more of its vCPUs into an architecturally defined state.
In these cases, there is no (known) danger to KVM; the goal is purely to
prevent entering the guest.

As evidenced by commit ecf371f8b02d ("KVM: SVM: Reject SEV{-ES} intra
host migration if vCPU creation is in-flight"), the restriction on
invoking ioctls only blocks _new_ ioctls.  I.e. KVM mustn't rely on
blocking ioctls for functional safety (whereas KVM_REQ_VM_DEAD is
guaranteed to prevent vCPUs from entering the guest).

Signed-off-by: Sean Christopherson
---
 arch/arm64/kvm/vgic/vgic-init.c                     |  2 +-
 include/linux/kvm_host.h                            |  2 --
 tools/testing/selftests/kvm/x86/sev_migrate_tests.c |  5 +----
 virt/kvm/kvm_main.c                                 | 10 +++++-----
 4 files changed, 7 insertions(+), 12 deletions(-)

diff --git a/arch/arm64/kvm/vgic/vgic-init.c b/arch/arm64/kvm/vgic/vgic-init.c
index eb1205654ac8..c2033bae73b2 100644
--- a/arch/arm64/kvm/vgic/vgic-init.c
+++ b/arch/arm64/kvm/vgic/vgic-init.c
@@ -612,7 +612,7 @@ int kvm_vgic_map_resources(struct kvm *kvm)
 	mutex_unlock(&kvm->arch.config_lock);
 out_slots:
 	if (ret)
-		kvm_vm_dead(kvm);
+		kvm_vm_bugged(kvm);
 
 	mutex_unlock(&kvm->slots_lock);
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 627054d27222..fa97d71577b5 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -854,7 +854,6 @@ struct kvm {
 	u32 dirty_ring_size;
 	bool dirty_ring_with_bitmap;
 	bool vm_bugged;
-	bool vm_dead;
 
 #ifdef CONFIG_HAVE_KVM_PM_NOTIFIER
 	struct notifier_block pm_notifier;
@@ -894,7 +893,6 @@ struct kvm {
 
 static inline void kvm_vm_dead(struct kvm *kvm)
 {
-	kvm->vm_dead = true;
 	kvm_make_all_cpus_request(kvm, KVM_REQ_VM_DEAD);
 }
 
diff --git a/tools/testing/selftests/kvm/x86/sev_migrate_tests.c b/tools/testing/selftests/kvm/x86/sev_migrate_tests.c
index 0a6dfba3905b..0580bee5888e 100644
--- a/tools/testing/selftests/kvm/x86/sev_migrate_tests.c
+++ b/tools/testing/selftests/kvm/x86/sev_migrate_tests.c
@@ -87,10 +87,7 @@ static void test_sev_migrate_from(bool es)
 		sev_migrate_from(dst_vms[i], dst_vms[i - 1]);
 
 	/* Migrate the guest back to the original VM. */
-	ret = __sev_migrate_from(src_vm, dst_vms[NR_MIGRATE_TEST_VMS - 1]);
-	TEST_ASSERT(ret == -1 && errno == EIO,
-		    "VM that was migrated from should be dead. ret %d, errno: %d", ret,
-		    errno);
+	sev_migrate_from(src_vm, dst_vms[NR_MIGRATE_TEST_VMS - 1]);
 
 	kvm_vm_free(src_vm);
 	for (i = 0; i < NR_MIGRATE_TEST_VMS; ++i)
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 6c07dd423458..f1f69e10a371 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -4408,7 +4408,7 @@ static long kvm_vcpu_ioctl(struct file *filp,
 	struct kvm_fpu *fpu = NULL;
 	struct kvm_sregs *kvm_sregs = NULL;
 
-	if (vcpu->kvm->mm != current->mm || vcpu->kvm->vm_dead)
+	if (vcpu->kvm->mm != current->mm || vcpu->kvm->vm_bugged)
 		return -EIO;
 
 	if (unlikely(_IOC_TYPE(ioctl) != KVMIO))
@@ -4651,7 +4651,7 @@ static long kvm_vcpu_compat_ioctl(struct file *filp,
 	void __user *argp = compat_ptr(arg);
 	int r;
 
-	if (vcpu->kvm->mm != current->mm || vcpu->kvm->vm_dead)
+	if (vcpu->kvm->mm != current->mm || vcpu->kvm->vm_bugged)
 		return -EIO;
 
 	switch (ioctl) {
@@ -4717,7 +4717,7 @@ static long kvm_device_ioctl(struct file *filp, unsigned int ioctl,
 {
 	struct kvm_device *dev = filp->private_data;
 
-	if (dev->kvm->mm != current->mm || dev->kvm->vm_dead)
+	if (dev->kvm->mm != current->mm || dev->kvm->vm_bugged)
 		return -EIO;
 
 	switch (ioctl) {
@@ -5139,7 +5139,7 @@ static long kvm_vm_ioctl(struct file *filp,
 	void __user *argp = (void __user *)arg;
 	int r;
 
-	if (kvm->mm != current->mm || kvm->vm_dead)
+	if (kvm->mm != current->mm || kvm->vm_bugged)
 		return -EIO;
 	switch (ioctl) {
 	case KVM_CREATE_VCPU:
@@ -5403,7 +5403,7 @@ static long kvm_vm_compat_ioctl(struct file *filp,
 	struct kvm *kvm = filp->private_data;
 	int r;
 
-	if (kvm->mm != current->mm || kvm->vm_dead)
+	if (kvm->mm != current->mm || kvm->vm_bugged)
 		return -EIO;
 
 	r = kvm_arch_vm_compat_ioctl(filp, ioctl, arg);
--
2.50.1.552.g942d659e1b-goog
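After this change, the two teardown paths differ only in whether new
ioctls are rejected; roughly (simplified from include/linux/kvm_host.h):

	static inline void kvm_vm_dead(struct kvm *kvm)
	{
		/* vCPUs observe the request and bail from KVM_RUN with -EIO. */
		kvm_make_all_cpus_request(kvm, KVM_REQ_VM_DEAD);
	}

	static inline void kvm_vm_bugged(struct kvm *kvm)
	{
		kvm->vm_bugged = true;	/* additionally rejects all new ioctls */
		kvm_vm_dead(kvm);
	}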
Use the main for-loop in the "SEV migrate from" testcase to handle all
successful migrations, as there is nothing inherently unique about the
original source VM beyond it needing to be created as an SEV VM.

No functional change intended.

Signed-off-by: Sean Christopherson
---
 .../selftests/kvm/x86/sev_migrate_tests.c | 31 +++++++++----------
 1 file changed, 14 insertions(+), 17 deletions(-)

diff --git a/tools/testing/selftests/kvm/x86/sev_migrate_tests.c b/tools/testing/selftests/kvm/x86/sev_migrate_tests.c
index 0580bee5888e..b501c916edf5 100644
--- a/tools/testing/selftests/kvm/x86/sev_migrate_tests.c
+++ b/tools/testing/selftests/kvm/x86/sev_migrate_tests.c
@@ -14,7 +14,7 @@
 #include "kselftest.h"
 
 #define NR_MIGRATE_TEST_VCPUS 4
-#define NR_MIGRATE_TEST_VMS 3
+#define NR_MIGRATE_TEST_VMS 4
 #define NR_LOCK_TESTING_THREADS 3
 #define NR_LOCK_TESTING_ITERATIONS 10000
 
@@ -72,26 +72,23 @@ static void sev_migrate_from(struct kvm_vm *dst, struct kvm_vm *src)
 
 static void test_sev_migrate_from(bool es)
 {
-	struct kvm_vm *src_vm;
-	struct kvm_vm *dst_vms[NR_MIGRATE_TEST_VMS];
-	int i, ret;
+	struct kvm_vm *vms[NR_MIGRATE_TEST_VMS];
+	int i;
 
-	src_vm = sev_vm_create(es);
-	for (i = 0; i < NR_MIGRATE_TEST_VMS; ++i)
-		dst_vms[i] = aux_vm_create(true);
-
-	/* Initial migration from the src to the first dst. */
-	sev_migrate_from(dst_vms[0], src_vm);
-
-	for (i = 1; i < NR_MIGRATE_TEST_VMS; i++)
-		sev_migrate_from(dst_vms[i], dst_vms[i - 1]);
+	vms[0] = sev_vm_create(es);
+	for (i = 1; i < NR_MIGRATE_TEST_VMS; ++i)
+		vms[i] = aux_vm_create(true);
 
-	/* Migrate the guest back to the original VM. */
-	sev_migrate_from(src_vm, dst_vms[NR_MIGRATE_TEST_VMS - 1]);
+	/*
+	 * Migrate N times, in a chain from the initial SEV VM to each "aux"
+	 * VM, and finally back to the original SEV VM.  KVM disallows KVM_RUN
+	 * on the source after migration, but all other ioctls should succeed.
+	 */
+	for (i = 0; i < NR_MIGRATE_TEST_VMS; i++)
+		sev_migrate_from(vms[(i + 1) % NR_MIGRATE_TEST_VMS], vms[i]);
 
-	kvm_vm_free(src_vm);
 	for (i = 0; i < NR_MIGRATE_TEST_VMS; ++i)
-		kvm_vm_free(dst_vms[i]);
+		kvm_vm_free(vms[i]);
 }
 
 struct locking_thread_input {
--
2.50.1.552.g942d659e1b-goog

Add sub-ioctl KVM_TDX_TERMINATE_VM to release the HKID prior to shutdown,
which enables more efficient reclaim of private memory.

Private memory is removed from MMU/TDP when guest_memfds are closed.  If
the HKID has not been released, the TDX VM is still in the RUNNABLE
state, and so pages must be removed using the "Dynamic Page Removal"
procedure (refer to the TDX Module Base spec), which involves a number of
steps:

	Block further address translation
	Exit each VCPU
	Clear Secure EPT entry
	Flush/write-back/invalidate relevant caches

However, when the HKID is released, the TDX VM moves to TD_TEARDOWN
state, where all TDX VM pages are effectively unmapped, so pages can be
reclaimed directly.

Reclaiming TD Pages in TD_TEARDOWN State was seen to decrease the total
reclaim time.  For example:

	VCPUs	Size (GB)	Before (secs)	After (secs)
	 4	 18		  72		  24
	32	107		 517		 134
	64	400		5539		 467

Add kvm_tdx_capabilities.supported_caps along with KVM_TDX_CAP_TERMINATE_VM
to advertise support to userspace.  Use a new field in kvm_tdx_capabilities
instead of adding yet another generic KVM_CAP to avoid bleeding TDX details
into common code (and #ifdefs), and so that userspace can query TDX
capabilities in one shot.  Enumerating capabilities as a mask of bits does
limit supported_caps to 64 capabilities, but in the unlikely event KVM
needs to enumerate more than 64 TDX capabilities, there are another 249
u64 entries reserved for future expansion.
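For illustration, userspace could probe the new capability roughly as
follows (hypothetical snippet; assumes "caps" was allocated with
cpuid.nent sized for the configurable CPUID entries, and that vm_fd is a
TDX VM):

	struct kvm_tdx_cmd cmd = {
		.id = KVM_TDX_CAPABILITIES,
		.data = (__u64)(unsigned long)caps,
	};

	if (!ioctl(vm_fd, KVM_MEMORY_ENCRYPT_OP, &cmd) &&
	    (caps->supported_caps & KVM_TDX_CAP_TERMINATE_VM))
		terminate_vm_supported = true;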
To preserve the KVM_BUG_ON() sanity check that deals with HKID
assignment, track if a TD is terminated and assert that, when an S-EPT
entry is removed, either the TD has an assigned HKID or the TD was
explicitly terminated.

Link: https://lore.kernel.org/r/Z-V0qyTn2bXdrPF7@google.com
Link: https://lore.kernel.org/r/aAL4dT1pWG5dDDeo@google.com
Co-developed-by: Adrian Hunter
Signed-off-by: Adrian Hunter
Acked-by: Vishal Annapurve
Tested-by: Vishal Annapurve
Tested-by: Xiaoyao Li
Cc: Rick Edgecombe
Cc: Nikolay Borisov
Signed-off-by: Sean Christopherson
---
 Documentation/virt/kvm/x86/intel-tdx.rst | 22 +++++++++++++-
 arch/x86/include/uapi/asm/kvm.h          |  7 ++++-
 arch/x86/kvm/vmx/tdx.c                   | 37 +++++++++++++++++++-----
 arch/x86/kvm/vmx/tdx.h                   |  1 +
 4 files changed, 57 insertions(+), 10 deletions(-)

diff --git a/Documentation/virt/kvm/x86/intel-tdx.rst b/Documentation/virt/kvm/x86/intel-tdx.rst
index 5efac62c92c7..bcfa97e0c9e7 100644
--- a/Documentation/virt/kvm/x86/intel-tdx.rst
+++ b/Documentation/virt/kvm/x86/intel-tdx.rst
@@ -38,6 +38,7 @@ ioctl with TDX specific sub-ioctl() commands.
 	KVM_TDX_INIT_MEM_REGION,
 	KVM_TDX_FINALIZE_VM,
 	KVM_TDX_GET_CPUID,
+	KVM_TDX_TERMINATE_VM,
 
 	KVM_TDX_CMD_NR_MAX,
 };
@@ -92,7 +93,10 @@ to be configured to the TDX guest.
 		__u64 kernel_tdvmcallinfo_1_r12;
 		__u64 user_tdvmcallinfo_1_r12;
 
-		__u64 reserved[250];
+		/* Misc capabilities enumerated via the KVM_TDX_CAP_* namespace. */
+		__u64 supported_caps;
+
+		__u64 reserved[249];
 
 		/* Configurable CPUID bits for userspace */
 		struct kvm_cpuid2 cpuid;
@@ -227,6 +231,22 @@ struct kvm_cpuid2.
 	__u32 padding[3];
 };
 
+KVM_TDX_TERMINATE_VM
+--------------------
+:Capability: KVM_TDX_CAP_TERMINATE_VM
+:Type: vm ioctl
+:Returns: 0 on success, <0 on error
+
+Release Host Key ID (HKID) to allow more efficient reclaim of private memory.
+After this, the TD is no longer in a runnable state.
+
+Using KVM_TDX_TERMINATE_VM is optional.
+
+- id: KVM_TDX_TERMINATE_VM
+- flags: must be 0
+- data: must be 0
+- hw_error: must be 0
+
 KVM TDX creation flow
 =====================
 In addition to the standard KVM flow, new TDX ioctls need to be called.
The diff --git a/arch/x86/include/uapi/asm/kvm.h b/arch/x86/include/uapi/asm/kvm.h index 0f15d683817d..e019111e2150 100644 --- a/arch/x86/include/uapi/asm/kvm.h +++ b/arch/x86/include/uapi/asm/kvm.h @@ -940,6 +940,7 @@ enum kvm_tdx_cmd_id { KVM_TDX_INIT_MEM_REGION, KVM_TDX_FINALIZE_VM, KVM_TDX_GET_CPUID, + KVM_TDX_TERMINATE_VM, KVM_TDX_CMD_NR_MAX, }; @@ -962,6 +963,8 @@ struct kvm_tdx_cmd { __u64 hw_error; }; +#define KVM_TDX_CAP_TERMINATE_VM _BITULL(0) + struct kvm_tdx_capabilities { __u64 supported_attrs; __u64 supported_xfam; @@ -971,7 +974,9 @@ struct kvm_tdx_capabilities { __u64 kernel_tdvmcallinfo_1_r12; __u64 user_tdvmcallinfo_1_r12; - __u64 reserved[250]; + __u64 supported_caps; + + __u64 reserved[249]; /* Configurable CPUID bits for userspace */ struct kvm_cpuid2 cpuid; diff --git a/arch/x86/kvm/vmx/tdx.c b/arch/x86/kvm/vmx/tdx.c index c2ef03f39c32..ae059daf1a20 100644 --- a/arch/x86/kvm/vmx/tdx.c +++ b/arch/x86/kvm/vmx/tdx.c @@ -188,6 +188,8 @@ static int init_kvm_tdx_caps(const struct tdx_sys_info_td_conf *td_conf, if (!caps->supported_xfam) return -EIO; + caps->supported_caps = KVM_TDX_CAP_TERMINATE_VM; + caps->cpuid.nent = td_conf->num_cpuid_config; caps->user_tdvmcallinfo_1_r11 = @@ -520,6 +522,7 @@ void tdx_mmu_release_hkid(struct kvm *kvm) goto out; } + write_lock(&kvm->mmu_lock); for_each_online_cpu(i) { if (packages_allocated && cpumask_test_and_set_cpu(topology_physical_package_id(i), @@ -544,7 +547,7 @@ void tdx_mmu_release_hkid(struct kvm *kvm) } else { tdx_hkid_free(kvm_tdx); } - + write_unlock(&kvm->mmu_lock); out: mutex_unlock(&tdx_lock); cpus_read_unlock(); @@ -1884,13 +1887,13 @@ static int tdx_sept_remove_private_spte(struct kvm *kvm, gfn_t gfn, struct page *page = pfn_to_page(pfn); int ret; - /* - * HKID is released after all private pages have been removed, and set - * before any might be populated. Warn if zapping is attempted when - * there can't be anything populated in the private EPT. - */ - if (KVM_BUG_ON(!is_hkid_assigned(to_kvm_tdx(kvm)), kvm)) - return -EINVAL; + if (!is_hkid_assigned(to_kvm_tdx(kvm))) { + KVM_BUG_ON(!to_kvm_tdx(kvm)->vm_terminated, kvm); + ret = tdx_reclaim_page(page); + if (!ret) + tdx_unpin(kvm, page); + return ret; + } ret = tdx_sept_zap_private_spte(kvm, gfn, level, page); if (ret <= 0) @@ -2884,6 +2887,21 @@ static int tdx_td_finalize(struct kvm *kvm, struct kvm_tdx_cmd *cmd) return 0; } +static int tdx_terminate_vm(struct kvm *kvm) +{ + if (kvm_trylock_all_vcpus(kvm)) + return -EBUSY; + + kvm_vm_dead(kvm); + to_kvm_tdx(kvm)->vm_terminated = true; + + kvm_unlock_all_vcpus(kvm); + + tdx_mmu_release_hkid(kvm); + + return 0; +} + int tdx_vm_ioctl(struct kvm *kvm, void __user *argp) { struct kvm_tdx_cmd tdx_cmd; @@ -2911,6 +2929,9 @@ int tdx_vm_ioctl(struct kvm *kvm, void __user *argp) case KVM_TDX_FINALIZE_VM: r = tdx_td_finalize(kvm, &tdx_cmd); break; + case KVM_TDX_TERMINATE_VM: + r = tdx_terminate_vm(kvm); + break; default: r = -EINVAL; goto out; diff --git a/arch/x86/kvm/vmx/tdx.h b/arch/x86/kvm/vmx/tdx.h index ca39a9391db1..0abe70aa1644 100644 --- a/arch/x86/kvm/vmx/tdx.h +++ b/arch/x86/kvm/vmx/tdx.h @@ -45,6 +45,7 @@ struct kvm_tdx { * Set/unset is protected with kvm->mmu_lock. */ bool wait_for_sept_zap; + bool vm_terminated; }; /* TDX module vCPU states */ -- 2.50.1.552.g942d659e1b-goog
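As a usage sketch, a VMM shutdown path might look like this (hypothetical
code; guest_memfd bookkeeping is reduced to an array of fds the VMM
created):

	struct kvm_tdx_cmd cmd = { .id = KVM_TDX_TERMINATE_VM };

	if (ioctl(vm_fd, KVM_MEMORY_ENCRYPT_OP, &cmd))
		err(1, "KVM_TDX_TERMINATE_VM");

	/*
	 * With the HKID released, closing guest_memfds reclaims pages from
	 * TD_TEARDOWN state, avoiding Dynamic Page Removal.
	 */
	for (i = 0; i < nr_guest_memfds; i++)
		close(guest_memfd[i]);
	close(vm_fd);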