From: David Woodhouse

If xen_timer_callback() can't deliver an event directly to the guest
(e.g. due to memslot changes causing the GPC to need refreshing), it
sets the timer_pending flag and kicks the vCPU. However, the pending
timer was only injected from the outer vcpu_run() loop via
kvm_inject_pending_timer_irqs(), not from the inner loop in
vcpu_enter_guest(). This means that the timer could be delayed until
something else causes vcpu_enter_guest() to return to the outer loop.
Thus, timer delivery could be delayed by a whole scheduler tick, or
hypothetically for ever in a NOHZ_FULL environment.

Subsume Xen timer handling into kvm_xen_has_pending_events() and
kvm_xen_inject_pending_events(), and use those directly from the inner
vcpu_enter_guest() loop. This ensures deferred timer delivery happens
on the next VM-entry rather than waiting for the scheduler.

Remove the Xen timer handling from kvm_inject_pending_timer_irqs() and
from kvm_cpu_has_pending_timer(), since kvm_vcpu_has_events() already
covers the wakeup case via kvm_xen_has_pending_events().

Pull the actual event injection into kvm_xen_inject_pending_events()
and remove kvm_xen_inject_timer_irqs(), to avoid a double check of
arch.xen.timer_pending in caller and callee. Its other caller can just
call kvm_xen_inject_pending_events() instead (to ensure pending timers
are flushed when the timer state is read from userspace).

Signed-off-by: David Woodhouse
---
 arch/x86/kvm/irq.c |  4 ----
 arch/x86/kvm/x86.c |  3 +++
 arch/x86/kvm/xen.c | 35 +++++++++++++++++------------------
 arch/x86/kvm/xen.h | 21 ++-------------------
 4 files changed, 22 insertions(+), 41 deletions(-)

diff --git a/arch/x86/kvm/irq.c b/arch/x86/kvm/irq.c
index 9519fec09ee6..7527c9bfe244 100644
--- a/arch/x86/kvm/irq.c
+++ b/arch/x86/kvm/irq.c
@@ -30,8 +30,6 @@ int kvm_cpu_has_pending_timer(struct kvm_vcpu *vcpu)
 
 	if (lapic_in_kernel(vcpu))
 		r = apic_has_pending_timer(vcpu);
-	if (kvm_xen_timer_enabled(vcpu))
-		r += kvm_xen_has_pending_timer(vcpu);
 
 	return r;
 }
@@ -170,8 +168,6 @@ void kvm_inject_pending_timer_irqs(struct kvm_vcpu *vcpu)
 {
 	if (lapic_in_kernel(vcpu))
 		kvm_inject_apic_timer_irqs(vcpu);
-	if (kvm_xen_timer_enabled(vcpu))
-		kvm_xen_inject_timer_irqs(vcpu);
 }
 
 void __kvm_migrate_timers(struct kvm_vcpu *vcpu)
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index e62f4a9ad334..c8e58a18a3e7 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -11254,6 +11254,9 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
 		}
 		if (kvm_check_request(KVM_REQ_STEAL_UPDATE, vcpu))
 			record_steal_time(vcpu);
+		if (kvm_check_request(KVM_REQ_UNBLOCK, vcpu) &&
+		    kvm_xen_has_pending_events(vcpu))
+			kvm_xen_inject_pending_events(vcpu);
 		if (kvm_check_request(KVM_REQ_PMU, vcpu))
 			kvm_pmu_handle_event(vcpu);
 		if (kvm_check_request(KVM_REQ_PMI, vcpu))
diff --git a/arch/x86/kvm/xen.c b/arch/x86/kvm/xen.c
index b1fae42bf295..16b8c154243c 100644
--- a/arch/x86/kvm/xen.c
+++ b/arch/x86/kvm/xen.c
@@ -105,22 +105,6 @@ static int kvm_xen_shared_info_init(struct kvm *kvm)
 	return ret;
 }
 
-void kvm_xen_inject_timer_irqs(struct kvm_vcpu *vcpu)
-{
-	if (atomic_read(&vcpu->arch.xen.timer_pending) > 0) {
-		struct kvm_xen_evtchn e;
-
-		e.vcpu_id = vcpu->vcpu_id;
-		e.vcpu_idx = vcpu->vcpu_idx;
-		e.port = vcpu->arch.xen.timer_virq;
-		e.priority = KVM_IRQ_ROUTING_XEN_EVTCHN_PRIO_2LEVEL;
-
-		kvm_xen_set_evtchn(&e, vcpu->kvm);
-
-		vcpu->arch.xen.timer_expires = 0;
-		atomic_set(&vcpu->arch.xen.timer_pending, 0);
-	}
-}
 
 static enum hrtimer_restart xen_timer_callback(struct hrtimer *timer)
 {
@@ -634,9 +618,24 @@ void kvm_xen_inject_vcpu_vector(struct kvm_vcpu *v)
  */
 void kvm_xen_inject_pending_events(struct kvm_vcpu *v)
 {
-	unsigned long evtchn_pending_sel = READ_ONCE(v->arch.xen.evtchn_pending_sel);
+	unsigned long evtchn_pending_sel;
 	struct gfn_to_pfn_cache *gpc = &v->arch.xen.vcpu_info_cache;
 
+	if (kvm_xen_timer_enabled(v) && atomic_read(&v->arch.xen.timer_pending)) {
+		struct kvm_xen_evtchn e;
+
+		e.vcpu_id = v->vcpu_id;
+		e.vcpu_idx = v->vcpu_idx;
+		e.port = v->arch.xen.timer_virq;
+		e.priority = KVM_IRQ_ROUTING_XEN_EVTCHN_PRIO_2LEVEL;
+
+		kvm_xen_set_evtchn(&e, v->kvm);
+
+		v->arch.xen.timer_expires = 0;
+		atomic_set(&v->arch.xen.timer_pending, 0);
+	}
+
+	evtchn_pending_sel = READ_ONCE(v->arch.xen.evtchn_pending_sel);
 	if (!evtchn_pending_sel)
 		return;
 
@@ -1238,7 +1237,7 @@ int kvm_xen_vcpu_get_attr(struct kvm_vcpu *vcpu, struct kvm_xen_vcpu_attr *data)
 		 */
 		if (vcpu->arch.xen.timer_expires) {
 			hrtimer_cancel(&vcpu->arch.xen.timer);
-			kvm_xen_inject_timer_irqs(vcpu);
+			kvm_xen_inject_pending_events(vcpu);
 		}
 
 		data->u.timer.port = vcpu->arch.xen.timer_virq;
diff --git a/arch/x86/kvm/xen.h b/arch/x86/kvm/xen.h
index 59e6128a7bd3..029026853af5 100644
--- a/arch/x86/kvm/xen.h
+++ b/arch/x86/kvm/xen.h
@@ -92,7 +92,8 @@ static inline int kvm_xen_has_interrupt(struct kvm_vcpu *vcpu)
 static inline bool kvm_xen_has_pending_events(struct kvm_vcpu *vcpu)
 {
 	return static_branch_unlikely(&kvm_xen_enabled.key) &&
-	       vcpu->arch.xen.evtchn_pending_sel;
+	       (vcpu->arch.xen.evtchn_pending_sel ||
+		atomic_read(&vcpu->arch.xen.timer_pending));
 }
 
 static inline bool kvm_xen_timer_enabled(struct kvm_vcpu *vcpu)
@@ -100,15 +101,6 @@ static inline bool kvm_xen_timer_enabled(struct kvm_vcpu *vcpu)
 	return !!vcpu->arch.xen.timer_virq;
 }
 
-static inline int kvm_xen_has_pending_timer(struct kvm_vcpu *vcpu)
-{
-	if (kvm_xen_hypercall_enabled(vcpu->kvm) && kvm_xen_timer_enabled(vcpu))
-		return atomic_read(&vcpu->arch.xen.timer_pending);
-
-	return 0;
-}
-
-void kvm_xen_inject_timer_irqs(struct kvm_vcpu *vcpu);
 #else
 static inline int kvm_xen_write_hypercall_page(struct kvm_vcpu *vcpu, u64 data)
 {
@@ -164,15 +156,6 @@ static inline bool kvm_xen_has_pending_events(struct kvm_vcpu *vcpu)
 	return false;
 }
 
-static inline int kvm_xen_has_pending_timer(struct kvm_vcpu *vcpu)
-{
-	return 0;
-}
-
-static inline void kvm_xen_inject_timer_irqs(struct kvm_vcpu *vcpu)
-{
-}
-
 static inline bool kvm_xen_timer_enabled(struct kvm_vcpu *vcpu)
 {
 	return false;
-- 
2.51.0