From: Wanpeng Li On overcommitted hosts, a vCPU spinning on an IPI response is difficult to distinguish from a vCPU spinning on a lock. kvm_vcpu_on_spin() can therefore yield to an unrelated vCPU based only on coarse preemption state. Add per-vCPU IPI tracking for directed yield. struct kvm_vcpu_arch now records the last sender and receiver vCPU indexes, the vector, a pending flag, and a monotonic timestamp. Add helpers to record a send, query whether a vCPU is the recent IPI receiver of another vCPU, and clear or reset the context. Accesses use READ_ONCE() and WRITE_ONCE() because the state is only a best-effort scheduling hint. Add module parameters to enable tracking and to control the recency window. Provide a weak generic kvm_vcpu_is_ipi_receiver() stub so non-x86 builds keep the existing behavior. The state is reset on vCPU create and destroy, and cleared on every vCPU reset, including INIT. This adds only state and helpers; directed-yield candidate selection is unchanged. Signed-off-by: Wanpeng Li --- arch/x86/include/asm/kvm_host.h | 19 ++++++ arch/x86/kvm/lapic.c | 102 ++++++++++++++++++++++++++++++++ arch/x86/kvm/x86.c | 3 + arch/x86/kvm/x86.h | 8 +++ include/linux/kvm_host.h | 8 +++ virt/kvm/kvm_main.c | 6 ++ 6 files changed, 146 insertions(+) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index f14009f25a3b..a26623716a53 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -1065,6 +1065,25 @@ struct kvm_vcpu_arch { int pending_external_vector; int highest_stale_pending_ioapic_eoi; + /* + * IPI tracking for directed-yield optimization. + * + * Populated by kvm_track_ipi_communication() when a unicast fixed + * IPI is delivered, and queried by kvm_vcpu_is_ipi_receiver() from + * kvm_vcpu_on_spin() to prefer the confirmed IPI target before + * generic preempted-lock-holder heuristics. + * + * All accesses are lockless READ_ONCE/WRITE_ONCE; best-effort by + * design (see comment on kvm_vcpu_is_good_yield_candidate()). 
+ */ + struct { + int last_ipi_sender; /* vCPU idx of last IPI sender */ + int last_ipi_receiver; /* vCPU idx of last IPI target */ + u8 vector; /* vector of last tracked IPI */ + bool pending_ipi; /* awaiting IPI response */ + u64 ipi_time_ns; /* mono timestamp of IPI send */ + } ipi_context; + /* be preempted when it's in kernel-mode(cpl=0) */ bool preempted_in_kernel; diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 4078e624ca66..515409e0e22c 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -78,6 +78,29 @@ module_param(lapic_timer_advance, bool, 0444); static bool __read_mostly vector_hashing_enabled = true; module_param_named(vector_hashing, vector_hashing_enabled, bool, 0444); +/* + * IPI tracking for directed-yield optimization. + * + * ipi_tracking_enabled - master switch (default on). When off, the + * tracking hooks become no-ops and + * kvm_vcpu_is_ipi_receiver() always returns + * false, falling back to the legacy + * preempted-in-kernel heuristic. + * + * ipi_window_ns - recency window. An IPI older than this is + * treated as stale and does not influence + * directed-yield selection. Long enough to + * cover typical spin-on-IPI-response periods, + * short enough to avoid stale state inflating + * boost priority on throughput-sensitive + * workloads. + */ +static bool ipi_tracking_enabled = true; +module_param(ipi_tracking_enabled, bool, 0644); + +static unsigned long ipi_window_ns = 50 * NSEC_PER_MSEC; +module_param(ipi_window_ns, ulong, 0644); + static int kvm_lapic_msr_read(struct kvm_lapic *apic, u32 reg, u64 *data); static int kvm_lapic_msr_write(struct kvm_lapic *apic, u32 reg, u64 data); @@ -1144,6 +1167,85 @@ static int kvm_apic_compare_prio(struct kvm_vcpu *vcpu1, struct kvm_vcpu *vcpu2) return vcpu1->arch.apic_arb_prio - vcpu2->arch.apic_arb_prio; } +/* + * Record a sender -> receiver IPI relationship for directed-yield use. 
+ * + * Accessed lockless (READ_ONCE/WRITE_ONCE); this is best-effort, racy + * information consumed only as a scheduling hint by + * kvm_vcpu_on_spin(), so stale or inconsistent field reads are harmless. + * + * NULL vCPUs and self-IPIs are ignored here; callers are expected to + * have filtered out non-unicast and non-fixed-mode deliveries. + */ +void kvm_track_ipi_communication(struct kvm_vcpu *sender, + struct kvm_vcpu *receiver, u8 vector) +{ + if (!sender || !receiver || sender == receiver) + return; + if (unlikely(!READ_ONCE(ipi_tracking_enabled))) + return; + + WRITE_ONCE(sender->arch.ipi_context.last_ipi_receiver, + receiver->vcpu_idx); + WRITE_ONCE(sender->arch.ipi_context.vector, vector); + WRITE_ONCE(sender->arch.ipi_context.pending_ipi, true); + WRITE_ONCE(sender->arch.ipi_context.ipi_time_ns, + ktime_get_mono_fast_ns()); + + WRITE_ONCE(receiver->arch.ipi_context.last_ipi_sender, + sender->vcpu_idx); + WRITE_ONCE(receiver->arch.ipi_context.vector, vector); +} + +/* + * Return true if @receiver is the confirmed recent IPI target of + * @sender, within the configured recency window. Directed yield uses + * this as a high-confidence signal that selecting @receiver may + * unblock @sender's spin loop. + */ +bool kvm_vcpu_is_ipi_receiver(struct kvm_vcpu *sender, + struct kvm_vcpu *receiver) +{ + u64 then, now; + + if (unlikely(!READ_ONCE(ipi_tracking_enabled))) + return false; + + if (!READ_ONCE(sender->arch.ipi_context.pending_ipi)) + return false; + + if (READ_ONCE(sender->arch.ipi_context.last_ipi_receiver) != + receiver->vcpu_idx) + return false; + + then = READ_ONCE(sender->arch.ipi_context.ipi_time_ns); + now = ktime_get_mono_fast_ns(); + return now - then <= READ_ONCE(ipi_window_ns); +} + +/* + * Clear the IPI tracking state of a single vCPU, typically when the + * associated interrupt has been acknowledged (EOI) or the vCPU has + * been reset/destroyed. 
+ * + * Leaves the monotonic timestamp untouched to keep staleness checks + * on other vCPUs that may reference this one well-defined; use + * kvm_vcpu_reset_ipi_context() for a hard reset. + */ +void kvm_vcpu_clear_ipi_context(struct kvm_vcpu *vcpu) +{ + WRITE_ONCE(vcpu->arch.ipi_context.pending_ipi, false); + WRITE_ONCE(vcpu->arch.ipi_context.last_ipi_sender, -1); + WRITE_ONCE(vcpu->arch.ipi_context.last_ipi_receiver, -1); + WRITE_ONCE(vcpu->arch.ipi_context.vector, 0); +} + +void kvm_vcpu_reset_ipi_context(struct kvm_vcpu *vcpu) +{ + kvm_vcpu_clear_ipi_context(vcpu); + WRITE_ONCE(vcpu->arch.ipi_context.ipi_time_ns, 0); +} + /* Return true if the interrupt can be handled by using *bitmap as index mask * for valid destinations in *dst array. * Return false if kvm_apic_map_get_dest_lapic did nothing useful. diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 0550359ed798..dcedd09bac10 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -12907,6 +12907,7 @@ int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu) goto free_guest_fpu; kvm_xen_init_vcpu(vcpu); + kvm_vcpu_reset_ipi_context(vcpu); vcpu_load(vcpu); kvm_vcpu_after_set_cpuid(vcpu); kvm_set_tsc_khz(vcpu, vcpu->kvm->arch.default_tsc_khz); @@ -12974,6 +12975,7 @@ void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu) kvm_mmu_destroy(vcpu); srcu_read_unlock(&vcpu->kvm->srcu, idx); free_page((unsigned long)vcpu->arch.pio_data); + kvm_vcpu_reset_ipi_context(vcpu); kvfree(vcpu->arch.cpuid_entries); } @@ -13050,6 +13052,7 @@ void kvm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) kvm_leave_nested(vcpu); kvm_lapic_reset(vcpu, init_event); + kvm_vcpu_clear_ipi_context(vcpu); WARN_ON_ONCE(is_guest_mode(vcpu) || is_smm(vcpu)); vcpu->arch.hflags = 0; diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h index 38a905fa86de..eb7f50018f78 100644 --- a/arch/x86/kvm/x86.h +++ b/arch/x86/kvm/x86.h @@ -475,6 +475,14 @@ int handle_ud(struct kvm_vcpu *vcpu); void kvm_deliver_exception_payload(struct kvm_vcpu *vcpu, struct 
kvm_queued_exception *ex); +/* IPI tracking helpers for directed-yield optimization (see lapic.c). */ +void kvm_track_ipi_communication(struct kvm_vcpu *sender, + struct kvm_vcpu *receiver, u8 vector); +bool kvm_vcpu_is_ipi_receiver(struct kvm_vcpu *sender, + struct kvm_vcpu *receiver); +void kvm_vcpu_clear_ipi_context(struct kvm_vcpu *vcpu); +void kvm_vcpu_reset_ipi_context(struct kvm_vcpu *vcpu); + int kvm_mtrr_set_msr(struct kvm_vcpu *vcpu, u32 msr, u64 data); int kvm_mtrr_get_msr(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata); void kvm_fixup_and_inject_pf_error(struct kvm_vcpu *vcpu, gva_t gva, u16 error_code); diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 4c14aee1fb06..e54e72ae5ebb 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -1549,6 +1549,14 @@ static inline void kvm_vcpu_kick(struct kvm_vcpu *vcpu) int kvm_vcpu_yield_to(struct kvm_vcpu *target); void kvm_vcpu_on_spin(struct kvm_vcpu *vcpu, bool yield_to_kernel_mode); +/* + * IPI-aware directed-yield hook. Architectures that support IPI + * tracking (currently x86 via arch/x86/kvm/lapic.c) override this; + * the generic __weak stub in virt/kvm/kvm_main.c returns false. + */ +bool kvm_vcpu_is_ipi_receiver(struct kvm_vcpu *sender, + struct kvm_vcpu *receiver); + void kvm_flush_remote_tlbs(struct kvm *kvm); void kvm_flush_remote_tlbs_range(struct kvm *kvm, gfn_t gfn, u64 nr_pages); void kvm_flush_remote_tlbs_memslot(struct kvm *kvm, diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 881f92d7a469..2e11c6cfc167 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -3957,6 +3957,12 @@ bool __weak kvm_arch_dy_has_pending_interrupt(struct kvm_vcpu *vcpu) return false; } +bool __weak kvm_vcpu_is_ipi_receiver(struct kvm_vcpu *sender, + struct kvm_vcpu *receiver) +{ + return false; +} + void kvm_vcpu_on_spin(struct kvm_vcpu *me, bool yield_to_kernel_mode) { int nr_vcpus, start, i, idx, yielded; -- 2.43.0