From: David Woodhouse <dwmw@amazon.co.uk>

Commit 3617c0ee7decb ("KVM: x86/xen: Only write Xen hypercall page for
guest writes to MSR") blocked host-initiated writes from triggering the
Xen hypercall page setup, to fix an SRCU usage violation when the
hypercall MSR index collides with a real MSR written during vCPU reset.

However, some VMMs legitimately need to intercept the hypercall MSR
write from host context, for example to track an epoch for kexec/crash
recovery, and allow the crash kernel to take over PV net/disk devices.

In such cases, the VMM then needs to trigger the effects that the guest
write to the MSR is intended to have: populating the hypercall page, and
latching the VM's "long mode". The host_initiated check breaks this use
case.

Add KVM_XEN_VCPU_ATTR_TYPE_WRITE_HYPERCALL_PAGE as a new vcpu attribute
that triggers the same effects from host context. Move the code into a
common __kvm_xen_write_hypercall_page() function with a normal 0/-errno
return convention, and turn the existing kvm_xen_write_hypercall_page()
into a wrapper which returns the 0/1 return value for MSR write
functions.

The __kvm_xen_write_hypercall_page() function now runs with the xen_lock
held for its entire duration, which isn't strictly necessary, but this
is hardly a fast path as a guest generally does this only once while
it's starting up.

The handling of the Xen hypercall page is a bit of a historical anomaly.
It was added in commit ffde22ac53b ("KVM: Xen PV-on-HVM guest support")
in 2009 with a comment saying, "A generic mechanism to delegate MSR
writes to userspace seems overkill and risks encouraging similar MSR
abuse in the future. Thus this patch adds special support for the Xen
HVM MSR."

The original mode where the contents of the page were provided by
userspace is no longer used. In 2018 when the current Xen support was
added, commit 23200b7a30d ("KVM: x86/xen: intercept xen hypercalls if
enabled") said, "Since this means KVM owns the ABI, dispense with the
facility for the VMM to provide its own copy of the hypercall pages;
just fill them in directly using VMCALL/VMMCALL as we do for the Hyper-V
hypercall page."

Strictly, it would be *possible* for userspace to do all this for
itself; there is already KVM_XEN_ATTR_TYPE_LONG_MODE, and userspace
could write the hypercall page for itself. But the 2018 logic about KVM
owning the ABI, and avoiding duplication of the instruction choice made
by patch_hypercall(), still stands. So provide userspace with a simple
method which triggers all the (side) effects of the MSR write, now that
it can't just write the MSR as before.

Fixes: 3617c0ee7dec ("KVM: x86/xen: Only write Xen hypercall page for guest writes to MSR")
Signed-off-by: David Woodhouse <dwmw@amazon.co.uk>
---
 Documentation/virt/kvm/api.rst                | 13 +++
 arch/x86/include/uapi/asm/kvm.h               |  3 +
 arch/x86/kvm/x86.c                            |  3 +-
 arch/x86/kvm/xen.c                            | 45 ++++++---
 .../selftests/kvm/x86/xen_vmcall_test.c       | 96 +++++++++++++++++++
 5 files changed, 144 insertions(+), 16 deletions(-)

diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst
index a5f9ee92f43e..3c6aee2a35bf 100644
--- a/Documentation/virt/kvm/api.rst
+++ b/Documentation/virt/kvm/api.rst
@@ -5812,6 +5812,19 @@ KVM_XEN_VCPU_ATTR_TYPE_UPCALL_VECTOR
   vector configured with HVM_PARAM_CALLBACK_IRQ. It is disabled by
   setting the vector to zero.
 
+KVM_XEN_VCPU_ATTR_TYPE_WRITE_HYPERCALL_PAGE
+  This attribute is available when the KVM_CAP_XEN_HVM ioctl indicates
+  support for KVM_XEN_HVM_CONFIG_WRITE_HYPERCALL_PAGE. It triggers
+  population of the Xen hypercall page at the guest physical address
+  specified in ``gpa``, just as if the guest had written to the
+  hypercall MSR. This is intended for VMMs that intercept the guest's
+  MSR write (e.g. to track an epoch for kexec/crash recovery) and need
+  to replay the write from host context. Direct host-initiated writes
+  via KVM_SET_MSRS are blocked for safety; this attribute provides the
+  correct alternative. As with a non-intercepted guest write to the
+  hypercall MSR, it also latches the 'long mode' for the VM, which
+  determines the layout of the shared data structures.
+
 
 4.129 KVM_XEN_VCPU_GET_ATTR
 ---------------------------
diff --git a/arch/x86/include/uapi/asm/kvm.h b/arch/x86/include/uapi/asm/kvm.h
index 1585ec804066..7732b92a4db0 100644
--- a/arch/x86/include/uapi/asm/kvm.h
+++ b/arch/x86/include/uapi/asm/kvm.h
@@ -598,6 +598,7 @@ struct kvm_x86_mce {
 #define KVM_XEN_HVM_CONFIG_RUNSTATE_UPDATE_FLAG	(1 << 6)
 #define KVM_XEN_HVM_CONFIG_PVCLOCK_TSC_UNSTABLE	(1 << 7)
 #define KVM_XEN_HVM_CONFIG_SHARED_INFO_HVA	(1 << 8)
+#define KVM_XEN_HVM_CONFIG_WRITE_HYPERCALL_PAGE	(1 << 9)
 
 #define KVM_XEN_MSR_MIN_INDEX			0x40000000u
 #define KVM_XEN_MSR_MAX_INDEX			0x4fffffffu
@@ -706,6 +707,8 @@ struct kvm_xen_vcpu_attr {
 #define KVM_XEN_VCPU_ATTR_TYPE_UPCALL_VECTOR	0x8
 /* Available with KVM_CAP_XEN_HVM / KVM_XEN_HVM_CONFIG_SHARED_INFO_HVA */
 #define KVM_XEN_VCPU_ATTR_TYPE_VCPU_INFO_HVA	0x9
+/* Available with KVM_CAP_XEN_HVM / KVM_XEN_HVM_CONFIG_WRITE_HYPERCALL_PAGE */
+#define KVM_XEN_VCPU_ATTR_TYPE_WRITE_HYPERCALL_PAGE 0xa
 
 /* Secure Encrypted Virtualization command */
 enum sev_cmd_id {
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index afcac1042947..d7f23c0bff74 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -4869,7 +4869,8 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
 		    KVM_XEN_HVM_CONFIG_EVTCHN_2LEVEL |
 		    KVM_XEN_HVM_CONFIG_EVTCHN_SEND |
 		    KVM_XEN_HVM_CONFIG_PVCLOCK_TSC_UNSTABLE |
-		    KVM_XEN_HVM_CONFIG_SHARED_INFO_HVA;
+		    KVM_XEN_HVM_CONFIG_SHARED_INFO_HVA |
+		    KVM_XEN_HVM_CONFIG_WRITE_HYPERCALL_PAGE;
 		if (sched_info_on())
 			r |= KVM_XEN_HVM_CONFIG_RUNSTATE |
 			     KVM_XEN_HVM_CONFIG_RUNSTATE_UPDATE_FLAG;
diff --git a/arch/x86/kvm/xen.c b/arch/x86/kvm/xen.c
index 694b31c1fcc9..50423df6e22d 100644
--- a/arch/x86/kvm/xen.c
+++ b/arch/x86/kvm/xen.c
@@ -32,6 +32,7 @@
 static int kvm_xen_set_evtchn(struct kvm_xen_evtchn *xe, struct kvm *kvm);
 static int kvm_xen_setattr_evtchn(struct kvm *kvm, struct kvm_xen_hvm_attr *data);
 static bool kvm_xen_hcall_evtchn_send(struct kvm_vcpu *vcpu, u64 param, u64 *r);
+static int __kvm_xen_write_hypercall_page(struct kvm_vcpu *vcpu, u64 data);
 
 DEFINE_STATIC_KEY_DEFERRED_FALSE(kvm_xen_enabled, HZ);
 
@@ -1138,6 +1139,10 @@ int kvm_xen_vcpu_set_attr(struct kvm_vcpu *vcpu, struct kvm_xen_vcpu_attr *data)
 		}
 		break;
 
+	case KVM_XEN_VCPU_ATTR_TYPE_WRITE_HYPERCALL_PAGE:
+		r = __kvm_xen_write_hypercall_page(vcpu, data->u.gpa);
+		break;
+
 	default:
 		break;
 	}
@@ -1273,7 +1278,7 @@ int kvm_xen_vcpu_get_attr(struct kvm_vcpu *vcpu, struct kvm_xen_vcpu_attr *data)
 	return r;
 }
 
-int kvm_xen_write_hypercall_page(struct kvm_vcpu *vcpu, u64 data)
+static int __kvm_xen_write_hypercall_page(struct kvm_vcpu *vcpu, u64 data)
 {
 	struct kvm *kvm = vcpu->kvm;
 	u32 page_num = data & ~PAGE_MASK;
@@ -1281,7 +1286,6 @@ int kvm_xen_write_hypercall_page(struct kvm_vcpu *vcpu, u64 data)
 	bool lm = is_long_mode(vcpu);
 	int r = 0;
 
-	mutex_lock(&kvm->arch.xen.xen_lock);
 	if (kvm->arch.xen.long_mode != lm) {
 		kvm->arch.xen.long_mode = lm;
 
@@ -1289,11 +1293,9 @@ int kvm_xen_write_hypercall_page(struct kvm_vcpu *vcpu, u64 data)
 		 * Re-initialize shared_info to put the wallclock in the
 		 * correct place.
 		 */
-		if (kvm->arch.xen.shinfo_cache.active &&
-		    kvm_xen_shared_info_init(kvm))
-			r = 1;
+		if (kvm->arch.xen.shinfo_cache.active)
+			r = kvm_xen_shared_info_init(kvm);
 	}
-	mutex_unlock(&kvm->arch.xen.xen_lock);
 
 	if (r)
 		return r;
@@ -1309,7 +1311,7 @@ int kvm_xen_write_hypercall_page(struct kvm_vcpu *vcpu, u64 data)
 		int i;
 
 		if (page_num)
-			return 1;
+			return -EINVAL;
 
 		/* mov imm32, %eax */
 		instructions[0] = 0xb8;
@@ -1325,10 +1327,11 @@ int kvm_xen_write_hypercall_page(struct kvm_vcpu *vcpu, u64 data)
 
 		for (i = 0; i < PAGE_SIZE / sizeof(instructions); i++) {
 			*(u32 *)&instructions[1] = i;
-			if (kvm_vcpu_write_guest(vcpu,
+			r = kvm_vcpu_write_guest(vcpu,
 						 page_addr + (i * sizeof(instructions)),
-						 instructions, sizeof(instructions)))
-				return 1;
+						 instructions, sizeof(instructions));
+			if (r)
+				return r;
 		}
 	} else {
 		/*
@@ -1340,10 +1343,9 @@ int kvm_xen_write_hypercall_page(struct kvm_vcpu *vcpu, u64 data)
 		u8 blob_size = lm ? kvm->arch.xen.hvm_config.blob_size_64
 				  : kvm->arch.xen.hvm_config.blob_size_32;
 		u8 *page;
-		int ret;
 
 		if (page_num >= blob_size)
-			return 1;
+			return -EINVAL;
 
 		blob_addr += page_num * PAGE_SIZE;
 
@@ -1351,14 +1353,27 @@ int kvm_xen_write_hypercall_page(struct kvm_vcpu *vcpu, u64 data)
 		if (IS_ERR(page))
 			return PTR_ERR(page);
 
-		ret = kvm_vcpu_write_guest(vcpu, page_addr, page, PAGE_SIZE);
+		r = kvm_vcpu_write_guest(vcpu, page_addr, page, PAGE_SIZE);
 		kfree(page);
-		if (ret)
-			return 1;
+		if (r)
+			return r;
 	}
 	return 0;
 }
 
+int kvm_xen_write_hypercall_page(struct kvm_vcpu *vcpu, u64 data)
+{
+	guard(mutex)(&vcpu->kvm->arch.xen.xen_lock);
+
+	/*
+	 * The MSR write path expects a 0/1 return; convert the errno from
+	 * the shared helper accordingly. Callers that want the real error
+	 * (e.g. the KVM_XEN_VCPU_ATTR_TYPE_WRITE_HYPERCALL_PAGE attribute)
+	 * invoke __kvm_xen_write_hypercall_page() directly.
+	 */
+	return __kvm_xen_write_hypercall_page(vcpu, data) ? 1 : 0;
+}
+
 int kvm_xen_hvm_config(struct kvm *kvm, struct kvm_xen_hvm_config *xhc)
 {
 	/* Only some feature flags need to be *enabled* by userspace */
diff --git a/tools/testing/selftests/kvm/x86/xen_vmcall_test.c b/tools/testing/selftests/kvm/x86/xen_vmcall_test.c
index 2585087cdf5c..1536d510ab30 100644
--- a/tools/testing/selftests/kvm/x86/xen_vmcall_test.c
+++ b/tools/testing/selftests/kvm/x86/xen_vmcall_test.c
@@ -12,6 +12,8 @@
 #include "processor.h"
 #include "hyperv.h"
 
+#include <string.h>
+
 #define HCALL_REGION_GPA	0xc0000000ULL
 #define HCALL_REGION_SLOT	10
 
@@ -26,6 +28,10 @@
 #define HVCALL_SIGNAL_EVENT		0x005d
 #define HV_STATUS_INVALID_ALIGNMENT	4
 
+enum {
+	TEST_WRITE_HYPERCALL_PAGE = 1,
+};
+
 static void guest_code(void)
 {
 	unsigned long rax = INPUTVALUE;
@@ -76,17 +82,65 @@ static void guest_code(void)
 			     "r"(r8));
 	GUEST_ASSERT(rax == HV_STATUS_INVALID_ALIGNMENT);
 
+	/*
+	 * Test KVM_XEN_VCPU_ATTR_TYPE_WRITE_HYPERCALL_PAGE: ask userspace
+	 * to set up MSR filtering, then write the MSR. The WRMSR will exit
+	 * to userspace (not populate the page). Userspace verifies the page
+	 * is empty, uses the attr to populate it, then resumes us.
+	 */
+	GUEST_SYNC(TEST_WRITE_HYPERCALL_PAGE);
+
+	__asm__ __volatile__("wrmsr" : : "c" (XEN_HYPERCALL_MSR),
+			     "a" (HCALL_REGION_GPA & 0xffffffff),
+			     "d" (HCALL_REGION_GPA >> 32));
+
+	/* Userspace populated the page via the attr — verify it works */
+	rax = INPUTVALUE;
+	rdi = ARGVALUE(1);
+	rsi = ARGVALUE(2);
+	rdx = ARGVALUE(3);
+	r10 = ARGVALUE(4);
+	r8 = ARGVALUE(5);
+	r9 = ARGVALUE(6);
+	__asm__ __volatile__("call *%1" : "=a"(rax) :
+			     "r"(HCALL_REGION_GPA + INPUTVALUE * 32),
+			     "a"(rax), "D"(rdi), "S"(rsi), "d"(rdx),
+			     "r"(r10), "r"(r8), "r"(r9));
+	GUEST_ASSERT(rax == RETVALUE);
+
 	GUEST_DONE();
 }
 
+static void setup_msr_filter(struct kvm_vm *vm)
+{
+	uint64_t deny_bits = 0;
+	struct kvm_msr_filter filter = {
+		.flags = KVM_MSR_FILTER_DEFAULT_ALLOW,
+		.ranges = {
+			{
+				.flags = KVM_MSR_FILTER_WRITE,
+				.nmsrs = 1,
+				.base = XEN_HYPERCALL_MSR,
+				.bitmap = (uint8_t *)&deny_bits,
+			},
+		},
+	};
+
+	vm_ioctl(vm, KVM_X86_SET_MSR_FILTER, &filter);
+}
+
 int main(int argc, char *argv[])
 {
 	unsigned int xen_caps;
 	struct kvm_vcpu *vcpu;
 	struct kvm_vm *vm;
+	bool msr_filter_ready = false;
 
 	xen_caps = kvm_check_cap(KVM_CAP_XEN_HVM);
 	TEST_REQUIRE(xen_caps & KVM_XEN_HVM_CONFIG_INTERCEPT_HCALL);
+	TEST_REQUIRE(xen_caps & KVM_XEN_HVM_CONFIG_WRITE_HYPERCALL_PAGE);
+	TEST_REQUIRE(kvm_check_cap(KVM_CAP_X86_USER_SPACE_MSR));
+	TEST_REQUIRE(kvm_check_cap(KVM_CAP_X86_MSR_FILTER));
 
 	vm = vm_create_with_one_vcpu(&vcpu, guest_code);
 	vcpu_set_hv_cpuid(vcpu);
@@ -123,6 +177,36 @@ int main(int argc, char *argv[])
 			continue;
 		}
 
+		if (run->exit_reason == KVM_EXIT_X86_WRMSR) {
+			/* MSR filter caught the Xen hypercall MSR write */
+			TEST_ASSERT(msr_filter_ready,
+				    "Unexpected WRMSR exit before filter setup");
+			TEST_ASSERT_EQ(run->msr.index, XEN_HYPERCALL_MSR);
+
+			/*
+			 * The host_initiated check should have prevented
+			 * KVM from populating the page. Verify it's empty.
+			 */
+			uint8_t *hcall_page = addr_gpa2hva(vm, HCALL_REGION_GPA);
+			TEST_ASSERT_EQ(hcall_page[0], 0);
+
+			/*
+			 * Now use the attr to populate the page, as a
+			 * VMM would after intercepting the MSR write.
+			 */
+			struct kvm_xen_vcpu_attr attr = {
+				.type = KVM_XEN_VCPU_ATTR_TYPE_WRITE_HYPERCALL_PAGE,
+				.u.gpa = HCALL_REGION_GPA,
+			};
+			vcpu_ioctl(vcpu, KVM_XEN_VCPU_SET_ATTR, &attr);
+
+			/* Verify the page is now populated */
+			TEST_ASSERT_EQ(hcall_page[0], 0xb8);
+
+			run->msr.error = 0;
+			continue;
+		}
+
 		TEST_ASSERT_KVM_EXIT_REASON(vcpu, KVM_EXIT_IO);
 
 		switch (get_ucall(vcpu, &uc)) {
@@ -130,6 +214,18 @@ int main(int argc, char *argv[])
 			REPORT_GUEST_ASSERT(uc);
 			/* NOT REACHED */
 		case UCALL_SYNC:
+			TEST_ASSERT_EQ(uc.args[1], TEST_WRITE_HYPERCALL_PAGE);
+
+			/*
+			 * Guest is about to write the Xen MSR. Clear the
+			 * hypercall page, install MSR filter to intercept
+			 * the write, and enable userspace MSR exits.
+			 */
+			memset(addr_gpa2hva(vm, HCALL_REGION_GPA), 0, PAGE_SIZE);
+			vm_enable_cap(vm, KVM_CAP_X86_USER_SPACE_MSR,
+				      KVM_MSR_EXIT_REASON_FILTER);
+			setup_msr_filter(vm);
+			msr_filter_ready = true;
 			break;
 		case UCALL_DONE:
 			goto done;
-- 
2.43.0