From: Wanpeng Li <wanpengli@tencent.com>

On overcommitted hosts, a vCPU spinning on an IPI response is difficult
to distinguish from a vCPU spinning on a lock. kvm_vcpu_on_spin() can
therefore yield to an unrelated vCPU based only on coarse preemption
state.

Add per-vCPU IPI tracking for directed yield. struct kvm_vcpu_arch now
records the last sender and receiver vCPU indexes, the vector, a pending
flag, and a monotonic timestamp. Add helpers to record a send, query
whether a vCPU is the recent IPI receiver of another vCPU, and clear or
reset the context. Accesses are lockless and use READ_ONCE() and
WRITE_ONCE(); that is sufficient because the state is only a
best-effort scheduling hint.

Add module parameters ipi_tracking_enabled (default on) and
ipi_window_ns (default 50 ms) to enable tracking and to control the
recency window. Provide a weak generic kvm_vcpu_is_ipi_receiver() stub
so non-x86 builds keep the existing behavior. The state is reset on
vCPU create and destroy, and cleared in kvm_vcpu_reset(), i.e. on both
RESET and INIT.

This adds only state and helpers; directed-yield candidate selection is
unchanged.

Signed-off-by: Wanpeng Li <wanpengli@tencent.com>
---
 arch/x86/include/asm/kvm_host.h |  19 ++++++
 arch/x86/kvm/lapic.c            | 102 ++++++++++++++++++++++++++++++++
 arch/x86/kvm/x86.c              |   3 +
 arch/x86/kvm/x86.h              |   8 +++
 include/linux/kvm_host.h        |   8 +++
 virt/kvm/kvm_main.c             |   6 ++
 6 files changed, 146 insertions(+)

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index f14009f25a3b..a26623716a53 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -1065,6 +1065,25 @@ struct kvm_vcpu_arch {
 	int pending_external_vector;
 	int highest_stale_pending_ioapic_eoi;
 
+	/*
+	 * IPI tracking for directed-yield optimization.
+	 *
+	 * Written by kvm_track_ipi_communication() on both the sending and
+	 * the receiving vCPU, and queried by kvm_vcpu_is_ipi_receiver(),
+	 * which is meant to let kvm_vcpu_on_spin() prefer the IPI target
+	 * over generic preempted-lock-holder heuristics.
+	 *
+	 * All accesses are lockless READ_ONCE/WRITE_ONCE and best-effort
+	 * by design: fields are updated one at a time, not as a snapshot.
+	 */
+	struct {
+		int	last_ipi_sender;	/* idx of last sender, -1 = none */
+		int	last_ipi_receiver;	/* idx of last target, -1 = none */
+		u8	vector;			/* last vector sent or received  */
+		bool	pending_ipi;		/* sent an IPI, not yet cleared  */
+		u64	ipi_time_ns;		/* mono fast ns of last IPI send */
+	} ipi_context;
+
 	/* be preempted when it's in kernel-mode(cpl=0) */
 	bool preempted_in_kernel;
 
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index 4078e624ca66..515409e0e22c 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -78,6 +78,29 @@ module_param(lapic_timer_advance, bool, 0444);
 static bool __read_mostly vector_hashing_enabled = true;
 module_param_named(vector_hashing, vector_hashing_enabled, bool, 0444);
 
+/*
+ * IPI tracking knobs for directed yield; runtime-writable (0644).
+ *
+ * ipi_tracking_enabled  - master switch (default on). When off, the
+ *                         tracking hooks become no-ops and
+ *                         kvm_vcpu_is_ipi_receiver() always returns
+ *                         false, falling back to the legacy
+ *                         preempted-in-kernel heuristic.
+ *
+ * ipi_window_ns         - recency window (default 50 ms). An IPI older
+ *                         than this is treated as stale and does not
+ *                         influence directed-yield selection. Long
+ *                         enough to cover typical spin-on-IPI-response
+ *                         periods, short enough to avoid stale state
+ *                         inflating boost priority on throughput-
+ *                         sensitive workloads.
+ */
+static bool __read_mostly ipi_tracking_enabled = true;
+module_param(ipi_tracking_enabled, bool, 0644);
+
+static unsigned long __read_mostly ipi_window_ns = 50 * NSEC_PER_MSEC;
+module_param(ipi_window_ns, ulong, 0644);
+
 static int kvm_lapic_msr_read(struct kvm_lapic *apic, u32 reg, u64 *data);
 static int kvm_lapic_msr_write(struct kvm_lapic *apic, u32 reg, u64 data);
 
@@ -1144,6 +1167,85 @@ static int kvm_apic_compare_prio(struct kvm_vcpu *vcpu1, struct kvm_vcpu *vcpu2)
 	return vcpu1->arch.apic_arb_prio - vcpu2->arch.apic_arb_prio;
 }
 
+/*
+ * Record a sender -> receiver IPI relationship for directed-yield use.
+ *
+ * Lockless and best-effort: fields are written one at a time, so a
+ * reader may see a mix of old and new values. pending_ipi is written
+ * last, after the receiver index and timestamp that qualify it.
+ *
+ * Callers are expected to filter out non-unicast and non-fixed-mode
+ * deliveries; NULL vCPUs and self-IPIs are ignored here.
+ */
+void kvm_track_ipi_communication(struct kvm_vcpu *sender,
+				 struct kvm_vcpu *receiver, u8 vector)
+{
+	if (!sender || !receiver || sender == receiver)
+		return;
+	if (unlikely(!READ_ONCE(ipi_tracking_enabled)))
+		return;
+
+	WRITE_ONCE(sender->arch.ipi_context.last_ipi_receiver,
+		   receiver->vcpu_idx);
+	WRITE_ONCE(sender->arch.ipi_context.vector, vector);
+	WRITE_ONCE(sender->arch.ipi_context.ipi_time_ns,
+		   ktime_get_mono_fast_ns());
+	WRITE_ONCE(sender->arch.ipi_context.pending_ipi, true);
+
+	WRITE_ONCE(receiver->arch.ipi_context.last_ipi_sender,
+		   sender->vcpu_idx);
+	WRITE_ONCE(receiver->arch.ipi_context.vector, vector);
+}
+
+/*
+ * Report whether @sender has a pending IPI whose recorded target is
+ * @receiver and which was sent within the last ipi_window_ns. Always
+ * false while tracking is disabled; the result is only a yield hint.
+ */
+bool kvm_vcpu_is_ipi_receiver(struct kvm_vcpu *sender,
+			      struct kvm_vcpu *receiver)
+{
+	u64 sent_ns, elapsed_ns;
+	int target;
+
+	if (unlikely(!READ_ONCE(ipi_tracking_enabled)))
+		return false;
+	if (!READ_ONCE(sender->arch.ipi_context.pending_ipi))
+		return false;
+
+	target = READ_ONCE(sender->arch.ipi_context.last_ipi_receiver);
+	if (target != receiver->vcpu_idx)
+		return false;
+
+	/* u64 wrap: a timestamp ahead of now is treated as stale. */
+	sent_ns = READ_ONCE(sender->arch.ipi_context.ipi_time_ns);
+	elapsed_ns = ktime_get_mono_fast_ns() - sent_ns;
+	return elapsed_ns <= READ_ONCE(ipi_window_ns);
+}
+
+/*
+ * Clear the IPI tracking state of a single vCPU (e.g. on vCPU reset):
+ * drop the pending flag, then set both peer indexes to -1 and the
+ * vector to 0.
+ *
+ * The timestamp is left untouched; it is only consulted while
+ * pending_ipi is set. Use kvm_vcpu_reset_ipi_context() to zero it too.
+ */
+void kvm_vcpu_clear_ipi_context(struct kvm_vcpu *vcpu)
+{
+	WRITE_ONCE(vcpu->arch.ipi_context.pending_ipi, false);
+	WRITE_ONCE(vcpu->arch.ipi_context.last_ipi_sender, -1);
+	WRITE_ONCE(vcpu->arch.ipi_context.last_ipi_receiver, -1);
+	WRITE_ONCE(vcpu->arch.ipi_context.vector, 0);
+}
+
+/* Hard reset: clear the tracking state and zero the timestamp as well. */
+void kvm_vcpu_reset_ipi_context(struct kvm_vcpu *vcpu)
+{
+	kvm_vcpu_clear_ipi_context(vcpu);
+	WRITE_ONCE(vcpu->arch.ipi_context.ipi_time_ns, 0);
+}
+
 /* Return true if the interrupt can be handled by using *bitmap as index mask
  * for valid destinations in *dst array.
  * Return false if kvm_apic_map_get_dest_lapic did nothing useful.
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 0550359ed798..dcedd09bac10 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -12907,6 +12907,7 @@ int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu)
 		goto free_guest_fpu;
 
 	kvm_xen_init_vcpu(vcpu);
+	kvm_vcpu_reset_ipi_context(vcpu);
 	vcpu_load(vcpu);
 	kvm_vcpu_after_set_cpuid(vcpu);
 	kvm_set_tsc_khz(vcpu, vcpu->kvm->arch.default_tsc_khz);
@@ -12974,6 +12975,7 @@ void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu)
 	kvm_mmu_destroy(vcpu);
 	srcu_read_unlock(&vcpu->kvm->srcu, idx);
 	free_page((unsigned long)vcpu->arch.pio_data);
+	kvm_vcpu_reset_ipi_context(vcpu);
 	kvfree(vcpu->arch.cpuid_entries);
 }
 
@@ -13050,6 +13052,7 @@ void kvm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
 		kvm_leave_nested(vcpu);
 
 	kvm_lapic_reset(vcpu, init_event);
+	kvm_vcpu_clear_ipi_context(vcpu);
 
 	WARN_ON_ONCE(is_guest_mode(vcpu) || is_smm(vcpu));
 	vcpu->arch.hflags = 0;
diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h
index 38a905fa86de..eb7f50018f78 100644
--- a/arch/x86/kvm/x86.h
+++ b/arch/x86/kvm/x86.h
@@ -475,6 +475,14 @@ int handle_ud(struct kvm_vcpu *vcpu);
 void kvm_deliver_exception_payload(struct kvm_vcpu *vcpu,
 				   struct kvm_queued_exception *ex);
 
+/* Directed-yield IPI tracking helpers, all implemented in lapic.c. */
+void kvm_track_ipi_communication(struct kvm_vcpu *sender,
+				 struct kvm_vcpu *receiver, u8 vector);
+bool kvm_vcpu_is_ipi_receiver(struct kvm_vcpu *sender,
+			      struct kvm_vcpu *receiver);
+void kvm_vcpu_clear_ipi_context(struct kvm_vcpu *vcpu);
+void kvm_vcpu_reset_ipi_context(struct kvm_vcpu *vcpu);
+
 int kvm_mtrr_set_msr(struct kvm_vcpu *vcpu, u32 msr, u64 data);
 int kvm_mtrr_get_msr(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata);
 void kvm_fixup_and_inject_pf_error(struct kvm_vcpu *vcpu, gva_t gva, u16 error_code);
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 4c14aee1fb06..e54e72ae5ebb 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -1549,6 +1549,14 @@ static inline void kvm_vcpu_kick(struct kvm_vcpu *vcpu)
 int kvm_vcpu_yield_to(struct kvm_vcpu *target);
 void kvm_vcpu_on_spin(struct kvm_vcpu *vcpu, bool yield_to_kernel_mode);
 
+/*
+ * IPI-aware directed-yield hook: true if @receiver is a recent IPI
+ * target of @sender. x86 overrides this in arch/x86/kvm/lapic.c; the
+ * generic __weak stub in virt/kvm/kvm_main.c always returns false.
+ */
+bool kvm_vcpu_is_ipi_receiver(struct kvm_vcpu *sender,
+			      struct kvm_vcpu *receiver);
+
 void kvm_flush_remote_tlbs(struct kvm *kvm);
 void kvm_flush_remote_tlbs_range(struct kvm *kvm, gfn_t gfn, u64 nr_pages);
 void kvm_flush_remote_tlbs_memslot(struct kvm *kvm,
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 881f92d7a469..2e11c6cfc167 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -3957,6 +3957,12 @@ bool __weak kvm_arch_dy_has_pending_interrupt(struct kvm_vcpu *vcpu)
 	return false;
 }
 
+bool __weak kvm_vcpu_is_ipi_receiver(struct kvm_vcpu *sender,
+				     struct kvm_vcpu *receiver)
+{
+	return false;	/* default when no arch override is linked in */
+}
+
 void kvm_vcpu_on_spin(struct kvm_vcpu *me, bool yield_to_kernel_mode)
 {
 	int nr_vcpus, start, i, idx, yielded;
-- 
2.43.0