On PREEMPT_RT, kvm_xen_set_evtchn_fast() acquires a sleeping lock (gpc->lock) from hard IRQ context (xen_timer_callback), triggering: BUG: sleeping function called from invalid context at kernel/locking/spinlock_rt.c:48 in_atomic(): 0, irqs_disabled(): 0, non_block: 0, pid: 0, name: swapper/5 preempt_count: 10100, expected: 0 RCU nest depth: 0, expected: 0 4 locks held by swapper/5/0: INFO: lockdep is turned off. irq event stamp: 1766 hardirqs last enabled at (1765): [] tick_nohz_idle_got_tick+0x84/0x90 hardirqs last disabled at (1766): [] sysvec_apic_timer_interrupt+0x11/0xd0 softirqs last enabled at (0): [] copy_process+0x1586/0x58b0 softirqs last disabled at (0): [<0000000000000000>] 0x0 Preempt disabled at: [] sysvec_apic_timer_interrupt+0x7c/0xd0 CPU: 5 UID: 0 PID: 0 Comm: swapper/5 Not tainted 6.13.0-rc1-syzkaller-00026-g2d5404caa8c7 #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 09/13/2024 Call Trace: __dump_stack lib/dump_stack.c:94 [inline] dump_stack_lvl+0x116/0x1f0 lib/dump_stack.c:120 __might_resched+0x30d/0x8f0 kernel/sched/core.c:10318 rt_spin_lock+0x70/0x130 kernel/locking/spinlock_rt.c:48 kvm_xen_set_evtchn_fast+0x20b/0xa40 arch/x86/kvm/xen.c:1820 xen_timer_callback+0x91/0x1a0 arch/x86/kvm/xen.c:142 __run_hrtimer kernel/time/hrtimer.c:1739 [inline] __hrtimer_run_queues+0x20b/0xa00 kernel/time/hrtimer.c:1803 The Xen timer uses HRTIMER_MODE_ABS_HARD for latency-sensitive event delivery (see commit 77c9b9dea4fb ("KVM: x86/xen: Use fast path for Xen timer delivery")). On PREEMPT_RT, hard IRQ hrtimers execute in hard IRQ context where sleeping locks cannot be acquired. Use irq_work to defer event injection out of the hrtimer callback to a context where sleeping locks are permitted on PREEMPT_RT. Note that because HRTIMER_MODE_ABS_HARD callbacks run in hard IRQ context on non-RT kernels as well, injection is deferred there too; on x86 the irq_work is raised via a self-IPI and executes almost immediately, so timer delivery latency stays close to the original fast path while the lock context violation on RT is avoided. The approach follows the existing pvclock_irq_work pattern in arch/x86/kvm/x86.c.
Tested on PREEMPT_RT kernel (CONFIG_PREEMPT_RT=y) with the syzbot C reproducer - no crash observed after 30+ minutes of continuous execution. Also tested on non-RT kernel (CONFIG_PREEMPT_RT=n) to verify no regression in the fast path. Reported-by: syzbot+919877893c9d28162dc2@syzkaller.appspotmail.com Closes: https://syzkaller.appspot.com/bug?extid=919877893c9d28162dc2 Fixes: 77c9b9dea4fb ("KVM: x86/xen: Use fast path for Xen timer delivery") Signed-off-by: shaikh.kamal --- arch/x86/include/asm/kvm_host.h | 1 + arch/x86/kvm/xen.c | 31 +++++++++++++++++++++++++++++++ 2 files changed, 32 insertions(+) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 5a3bfa293e8b..533b45289d53 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -746,6 +746,7 @@ struct kvm_vcpu_xen { u64 timer_expires; /* In guest epoch */ atomic_t timer_pending; struct hrtimer timer; + struct irq_work timer_inject_irqwork; int poll_evtchn; struct timer_list poll_timer; struct kvm_hypervisor_cpuid cpuid; diff --git a/arch/x86/kvm/xen.c b/arch/x86/kvm/xen.c index d6b2a665b499..01fa7b165355 100644 --- a/arch/x86/kvm/xen.c +++ b/arch/x86/kvm/xen.c @@ -122,6 +122,24 @@ void kvm_xen_inject_timer_irqs(struct kvm_vcpu *vcpu) } } +static void xen_timer_inject_irqwork(struct irq_work *work) +{ + struct kvm_vcpu_xen *xen = container_of(work, struct kvm_vcpu_xen, + timer_inject_irqwork); + struct kvm_vcpu *vcpu = container_of(xen, struct kvm_vcpu, arch.xen); + struct kvm_xen_evtchn e; + int rc; + + e.vcpu_id = vcpu->vcpu_id; + e.vcpu_idx = vcpu->vcpu_idx; + e.port = vcpu->arch.xen.timer_virq; + e.priority = KVM_IRQ_ROUTING_XEN_EVTCHN_PRIO_2LEVEL; + + rc = kvm_xen_set_evtchn_fast(&e, vcpu->kvm); + if (rc != -EWOULDBLOCK) + vcpu->arch.xen.timer_expires = 0; +} + static enum hrtimer_restart xen_timer_callback(struct hrtimer *timer) { struct kvm_vcpu *vcpu = container_of(timer, struct kvm_vcpu, @@ -132,6 +150,17 @@ static enum hrtimer_restart 
xen_timer_callback(struct hrtimer *timer) if (atomic_read(&vcpu->arch.xen.timer_pending)) return HRTIMER_NORESTART; + /* + * HRTIMER_MODE_ABS_HARD callbacks run in hard IRQ context on RT + * and non-RT alike. On PREEMPT_RT, kvm_xen_set_evtchn_fast() cannot + * acquire sleeping locks (gpc->lock) here, so defer to irq_work, + * which runs preemptibly on RT (and via self-IPI on non-RT). + */ + if (in_hardirq()) { + irq_work_queue(&vcpu->arch.xen.timer_inject_irqwork); + return HRTIMER_NORESTART; + } + e.vcpu_id = vcpu->vcpu_id; e.vcpu_idx = vcpu->vcpu_idx; e.port = vcpu->arch.xen.timer_virq; @@ -2303,6 +2332,8 @@ void kvm_xen_init_vcpu(struct kvm_vcpu *vcpu) timer_setup(&vcpu->arch.xen.poll_timer, cancel_evtchn_poll, 0); hrtimer_setup(&vcpu->arch.xen.timer, xen_timer_callback, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_HARD); + init_irq_work(&vcpu->arch.xen.timer_inject_irqwork, + xen_timer_inject_irqwork); kvm_gpc_init(&vcpu->arch.xen.runstate_cache, vcpu->kvm); kvm_gpc_init(&vcpu->arch.xen.runstate2_cache, vcpu->kvm); -- 2.43.0