From: David Woodhouse Commit 3617c0ee7decb ("KVM: x86/xen: Only write Xen hypercall page for guest writes to MSR") blocked host-initiated writes from triggering the Xen hypercall page setup, to fix an SRCU usage violation when the hypercall MSR index collides with a real MSR written during vCPU reset. However, some VMMs legitimately need to trigger hypercall page setup from host context. For example, a VMM may intercept the guest's MSR write to track an epoch (for kexec/crash recovery), and then replay the write as a host-initiated KVM_SET_MSRS to populate the hypercall page. The host_initiated check breaks this use case. Add KVM_XEN_VCPU_ATTR_TYPE_WRITE_HYPERCALL_PAGE as a new vcpu attribute that explicitly invokes kvm_xen_write_hypercall_page() under proper locking. This gives userspace a safe interface to trigger hypercall page setup without going through the MSR write path, preserving the host_initiated defence in depth while restoring the lost functionality. Fixes: 3617c0ee7dec ("KVM: x86/xen: Only write Xen hypercall page for guest writes to MSR") Signed-off-by: David Woodhouse --- Documentation/virt/kvm/api.rst | 11 +++ arch/x86/include/uapi/asm/kvm.h | 3 + arch/x86/kvm/x86.c | 3 +- arch/x86/kvm/xen.c | 7 ++ .../selftests/kvm/x86/xen_vmcall_test.c | 96 +++++++++++++++++++ 5 files changed, 119 insertions(+), 1 deletion(-) diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst index 52bbbb553ce1..63423c375a78 100644 --- a/Documentation/virt/kvm/api.rst +++ b/Documentation/virt/kvm/api.rst @@ -5800,6 +5800,17 @@ KVM_XEN_VCPU_ATTR_TYPE_UPCALL_VECTOR vector configured with HVM_PARAM_CALLBACK_IRQ. It is disabled by setting the vector to zero. +KVM_XEN_VCPU_ATTR_TYPE_WRITE_HYPERCALL_PAGE + This attribute is available when the KVM_CAP_XEN_HVM ioctl indicates + support for KVM_XEN_HVM_CONFIG_WRITE_HYPERCALL_PAGE. 
It triggers + population of the Xen hypercall page at the guest physical address + specified in ``gpa``, just as if the guest had written to the + hypercall MSR. This is intended for VMMs that intercept the guest's + MSR write (e.g. to track an epoch for kexec/crash recovery) and need + to replay the write from host context. Host-initiated writes to the + hypercall MSR via KVM_SET_MSRS deliberately do not populate the page; + this attribute provides the correct alternative. + 4.129 KVM_XEN_VCPU_GET_ATTR --------------------------- diff --git a/arch/x86/include/uapi/asm/kvm.h b/arch/x86/include/uapi/asm/kvm.h index 5f2b30d0405c..977f3aa66c18 100644 --- a/arch/x86/include/uapi/asm/kvm.h +++ b/arch/x86/include/uapi/asm/kvm.h @@ -596,6 +596,7 @@ struct kvm_x86_mce { #define KVM_XEN_HVM_CONFIG_RUNSTATE_UPDATE_FLAG (1 << 6) #define KVM_XEN_HVM_CONFIG_PVCLOCK_TSC_UNSTABLE (1 << 7) #define KVM_XEN_HVM_CONFIG_SHARED_INFO_HVA (1 << 8) +#define KVM_XEN_HVM_CONFIG_WRITE_HYPERCALL_PAGE (1 << 9) #define KVM_XEN_MSR_MIN_INDEX 0x40000000u #define KVM_XEN_MSR_MAX_INDEX 0x4fffffffu @@ -704,6 +705,8 @@ struct kvm_xen_vcpu_attr { #define KVM_XEN_VCPU_ATTR_TYPE_UPCALL_VECTOR 0x8 /* Available with KVM_CAP_XEN_HVM / KVM_XEN_HVM_CONFIG_SHARED_INFO_HVA */ #define KVM_XEN_VCPU_ATTR_TYPE_VCPU_INFO_HVA 0x9 +/* Available with KVM_CAP_XEN_HVM / KVM_XEN_HVM_CONFIG_WRITE_HYPERCALL_PAGE */ +#define KVM_XEN_VCPU_ATTR_TYPE_WRITE_HYPERCALL_PAGE 0xa /* Secure Encrypted Virtualization command */ enum sev_cmd_id { diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 0a1b63c63d1a..3facf0429c0a 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -4891,7 +4891,8 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) KVM_XEN_HVM_CONFIG_EVTCHN_2LEVEL | KVM_XEN_HVM_CONFIG_EVTCHN_SEND | KVM_XEN_HVM_CONFIG_PVCLOCK_TSC_UNSTABLE | - KVM_XEN_HVM_CONFIG_SHARED_INFO_HVA; + KVM_XEN_HVM_CONFIG_SHARED_INFO_HVA | + KVM_XEN_HVM_CONFIG_WRITE_HYPERCALL_PAGE; if (sched_info_on()) r |= KVM_XEN_HVM_CONFIG_RUNSTATE |
KVM_XEN_HVM_CONFIG_RUNSTATE_UPDATE_FLAG; diff --git a/arch/x86/kvm/xen.c b/arch/x86/kvm/xen.c index 91fd3673c09a..c16b4560c9e7 100644 --- a/arch/x86/kvm/xen.c +++ b/arch/x86/kvm/xen.c @@ -907,6 +907,13 @@ int kvm_xen_vcpu_set_attr(struct kvm_vcpu *vcpu, struct kvm_xen_vcpu_attr *data) { int idx, r = -ENOENT; + /* + * kvm_xen_write_hypercall_page() manages its own locking. + * Handle it before taking xen_lock to avoid a deadlock. + */ + if (data->type == KVM_XEN_VCPU_ATTR_TYPE_WRITE_HYPERCALL_PAGE) + return kvm_xen_write_hypercall_page(vcpu, data->u.gpa) ? -EIO : 0; + mutex_lock(&vcpu->kvm->arch.xen.xen_lock); idx = srcu_read_lock(&vcpu->kvm->srcu); diff --git a/tools/testing/selftests/kvm/x86/xen_vmcall_test.c b/tools/testing/selftests/kvm/x86/xen_vmcall_test.c index 2585087cdf5c..1536d510ab30 100644 --- a/tools/testing/selftests/kvm/x86/xen_vmcall_test.c +++ b/tools/testing/selftests/kvm/x86/xen_vmcall_test.c @@ -12,6 +12,8 @@ #include "processor.h" #include "hyperv.h" +#include + #define HCALL_REGION_GPA 0xc0000000ULL #define HCALL_REGION_SLOT 10 @@ -26,6 +28,10 @@ #define HVCALL_SIGNAL_EVENT 0x005d #define HV_STATUS_INVALID_ALIGNMENT 4 +enum { + TEST_WRITE_HYPERCALL_PAGE = 1, +}; + static void guest_code(void) { unsigned long rax = INPUTVALUE; @@ -76,17 +82,65 @@ static void guest_code(void) "r"(r8)); GUEST_ASSERT(rax == HV_STATUS_INVALID_ALIGNMENT); + /* + * Test KVM_XEN_VCPU_ATTR_TYPE_WRITE_HYPERCALL_PAGE: ask userspace + * to set up MSR filtering, then write the MSR. The WRMSR will exit + * to userspace (not populate the page). Userspace verifies the page + * is empty, uses the attr to populate it, then resumes us. 
+ */ + GUEST_SYNC(TEST_WRITE_HYPERCALL_PAGE); + + __asm__ __volatile__("wrmsr" : : "c" (XEN_HYPERCALL_MSR), + "a" (HCALL_REGION_GPA & 0xffffffff), + "d" (HCALL_REGION_GPA >> 32)); + + /* Userspace populated the page via the attr — verify it works */ + rax = INPUTVALUE; + rdi = ARGVALUE(1); + rsi = ARGVALUE(2); + rdx = ARGVALUE(3); + r10 = ARGVALUE(4); + r8 = ARGVALUE(5); + r9 = ARGVALUE(6); + __asm__ __volatile__("call *%1" : "=a"(rax) : + "r"(HCALL_REGION_GPA + INPUTVALUE * 32), + "a"(rax), "D"(rdi), "S"(rsi), "d"(rdx), + "r"(r10), "r"(r8), "r"(r9)); + GUEST_ASSERT(rax == RETVALUE); + GUEST_DONE(); } +static void setup_msr_filter(struct kvm_vm *vm) +{ + uint64_t deny_bits = 0; + struct kvm_msr_filter filter = { + .flags = KVM_MSR_FILTER_DEFAULT_ALLOW, + .ranges = { + { + .flags = KVM_MSR_FILTER_WRITE, + .nmsrs = 1, + .base = XEN_HYPERCALL_MSR, + .bitmap = (uint8_t *)&deny_bits, + }, + }, + }; + + vm_ioctl(vm, KVM_X86_SET_MSR_FILTER, &filter); +} + int main(int argc, char *argv[]) { unsigned int xen_caps; struct kvm_vcpu *vcpu; struct kvm_vm *vm; + bool msr_filter_ready = false; xen_caps = kvm_check_cap(KVM_CAP_XEN_HVM); TEST_REQUIRE(xen_caps & KVM_XEN_HVM_CONFIG_INTERCEPT_HCALL); + TEST_REQUIRE(xen_caps & KVM_XEN_HVM_CONFIG_WRITE_HYPERCALL_PAGE); + TEST_REQUIRE(kvm_check_cap(KVM_CAP_X86_USER_SPACE_MSR)); + TEST_REQUIRE(kvm_check_cap(KVM_CAP_X86_MSR_FILTER)); vm = vm_create_with_one_vcpu(&vcpu, guest_code); vcpu_set_hv_cpuid(vcpu); @@ -123,6 +177,36 @@ int main(int argc, char *argv[]) continue; } + if (run->exit_reason == KVM_EXIT_X86_WRMSR) { + /* MSR filter caught the Xen hypercall MSR write */ + TEST_ASSERT(msr_filter_ready, + "Unexpected WRMSR exit before filter setup"); + TEST_ASSERT_EQ(run->msr.index, XEN_HYPERCALL_MSR); + + /* + * The MSR filter diverted the guest's WRMSR to userspace + * before KVM could populate the page. Verify it's empty.
+ */ + uint8_t *hcall_page = addr_gpa2hva(vm, HCALL_REGION_GPA); + TEST_ASSERT_EQ(hcall_page[0], 0); + + /* + * Now use the attr to populate the page, as a + * VMM would after intercepting the MSR write. + */ + struct kvm_xen_vcpu_attr attr = { + .type = KVM_XEN_VCPU_ATTR_TYPE_WRITE_HYPERCALL_PAGE, + .u.gpa = HCALL_REGION_GPA, + }; + vcpu_ioctl(vcpu, KVM_XEN_VCPU_SET_ATTR, &attr); + + /* Verify the page is now populated */ + TEST_ASSERT_EQ(hcall_page[0], 0xb8); + + run->msr.error = 0; + continue; + } + TEST_ASSERT_KVM_EXIT_REASON(vcpu, KVM_EXIT_IO); switch (get_ucall(vcpu, &uc)) { @@ -130,6 +214,18 @@ int main(int argc, char *argv[]) REPORT_GUEST_ASSERT(uc); /* NOT REACHED */ case UCALL_SYNC: + TEST_ASSERT_EQ(uc.args[1], TEST_WRITE_HYPERCALL_PAGE); + + /* + * Guest is about to write the Xen MSR. Clear the + * hypercall page, install MSR filter to intercept + * the write, and enable userspace MSR exits. + */ + memset(addr_gpa2hva(vm, HCALL_REGION_GPA), 0, PAGE_SIZE); + vm_enable_cap(vm, KVM_CAP_X86_USER_SPACE_MSR, + KVM_MSR_EXIT_REASON_FILTER); + setup_msr_filter(vm); + msr_filter_ready = true; break; case UCALL_DONE: goto done; -- 2.43.0