Make the "default" API for waking a vhost task safe against the underlying
task exiting due to a fatal signal.  This fixes a bug in KVM x86 where KVM
attempts to wake an NX hugepage recovery task that exited before being
explicitly stopped, resulting in a use-after-free and thus crashes, hangs,
and other badness.

  Oops: general protection fault, probably for non-canonical address 0xff0e899fa1566052: 0000 [#1] SMP
  CPU: 51 UID: 0 PID: 53807 Comm: tee Tainted: G S O 6.17.0-smp--38183c31756a-next #826 NONE
  Tainted: [S]=CPU_OUT_OF_SPEC, [O]=OOT_MODULE
  Hardware name: Google LLC Indus/Indus_QC_03, BIOS 30.110.0 09/13/2024
  RIP: 0010:queued_spin_lock_slowpath+0x123/0x250
  Code: ... <48> 89 8c 02 c0 da 47 a2 83 79 08 00 75 08 f3 90 83 79 08 00 74 f8
  RSP: 0018:ffffbf55cffe7cf8 EFLAGS: 00010006
  RAX: ff0e899fff0e8562 RBX: 0000000000d00000 RCX: ffffa39b40aefac0
  RDX: 0000000000000030 RSI: fffffffffffffff8 RDI: ffffa39d0592e68c
  RBP: 0000000000d00000 R08: 00000000ffffff80 R09: 0000000400000000
  R10: ffffa36cce4fe401 R11: 0000000000000800 R12: 0000000000000003
  R13: 0000000000000000 R14: ffffa39d0592e68c R15: ffffa39b9e672000
  FS: 00007f233b2e9740(0000) GS:ffffa39b9e672000(0000) knlGS:0000000000000000
  CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
  CR2: 00007f233b39fda0 CR3: 00000004d031f002 CR4: 00000000007726f0
  PKRU: 55555554
  Call Trace:
   _raw_spin_lock_irqsave+0x50/0x60
   try_to_wake_up+0x4f/0x5d0
   set_nx_huge_pages+0xe4/0x1c0 [kvm]
   param_attr_store+0x89/0xf0
   module_attr_store+0x1e/0x30
   kernfs_fop_write_iter+0xe4/0x160
   vfs_write+0x2cb/0x420
   ksys_write+0x7f/0xf0
   do_syscall_64+0x6f/0x1f0
   entry_SYSCALL_64_after_hwframe+0x4b/0x53
  RIP: 0033:0x7f233b4178b3
  R13: 0000000000000002 R14: 00000000226ff3d0 R15: 0000000000000002

Handle VHOST_TASK_FLAGS_KILLED in vhost_task_wake() instead of forcing KVM
to solve the problem, as KVM would literally just add an equivalent flag,
along with a new lock to protect said flag.  In general, forcing simple
usage of vhost task to care about signals _and_ take non-trivial action to
do the right thing isn't developer friendly, and is likely to lead to
similar bugs in the future.

Keep the existing behavior for vhost (by calling __vhost_task_wake()
instead of vhost_task_wake()), as vhost_worker_killed() takes extra care
to stop and flush all workers, i.e. doesn't need the extra protection, and
because vhost_vq_work_queue() calls vhost_worker_queue()

  -> worker->ops->wakeup(worker)
   -> vhost_task_wakeup()
    -> vhost_task_wake()

while holding RCU and so can't sleep, i.e. can't take exit_mutex.

	rcu_read_lock();
	worker = rcu_dereference(vq->worker);
	if (worker) {
		queued = true;
		vhost_worker_queue(worker, work);
	}
	rcu_read_unlock();
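To make the race concrete, a minimal sketch of the pre-fix interleaving
(the exact timing is assumed for illustration; only the wake and exit
paths are shown):

	/*
	 * Pre-fix race, assumed interleaving (illustration only):
	 *
	 *   CPU0 (writes nx_huge_pages)     CPU1 (NX recovery vhost task)
	 *   ---------------------------     -----------------------------
	 *                                   fatal signal (SIGKILL)
	 *                                   vhost_task_fn():
	 *                                     set_bit(KILLED, &vtsk->flags)
	 *                                     do_exit(0)
	 *                                     <task_struct can now be freed>
	 *   set_nx_huge_pages():
	 *     vhost_task_wake(vtsk)
	 *       wake_up_process(vtsk->task)  <- use-after-free
	 */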
Debugged-by: Sebastian Andrzej Siewior
Link: https://lore.kernel.org/all/aKkLEtoDXKxAAWju@google.com
Link: https://lore.kernel.org/all/aJ_vEP2EHj6l0xRT@google.com
Suggested-by: Sebastian Andrzej Siewior
Fixes: d96c77bd4eeb ("KVM: x86: switch hugepage recovery thread to vhost_task")
Cc: stable@vger.kernel.org
Signed-off-by: Sean Christopherson
---
 drivers/vhost/vhost.c            |  2 +-
 include/linux/sched/vhost_task.h |  1 +
 kernel/vhost_task.c              | 52 +++++++++++++++++++++++++++-----
 3 files changed, 46 insertions(+), 9 deletions(-)

diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
index 8570fdf2e14a..dafce01a9c0d 100644
--- a/drivers/vhost/vhost.c
+++ b/drivers/vhost/vhost.c
@@ -744,7 +744,7 @@ static void vhost_workers_free(struct vhost_dev *dev)
 
 static void vhost_task_wakeup(struct vhost_worker *worker)
 {
-	return vhost_task_wake(worker->vtsk);
+	return __vhost_task_wake(worker->vtsk);
 }
 
 static void vhost_kthread_wakeup(struct vhost_worker *worker)
diff --git a/include/linux/sched/vhost_task.h b/include/linux/sched/vhost_task.h
index 25446c5d3508..1a9c2ac65c9a 100644
--- a/include/linux/sched/vhost_task.h
+++ b/include/linux/sched/vhost_task.h
@@ -10,5 +10,6 @@ struct vhost_task *vhost_task_create(bool (*fn)(void *),
 void vhost_task_start(struct vhost_task *vtsk);
 void vhost_task_stop(struct vhost_task *vtsk);
 void vhost_task_wake(struct vhost_task *vtsk);
+void __vhost_task_wake(struct vhost_task *vtsk);
 
 #endif /* _LINUX_SCHED_VHOST_TASK_H */
diff --git a/kernel/vhost_task.c b/kernel/vhost_task.c
index bc738fa90c1d..bd213d0b6da3 100644
--- a/kernel/vhost_task.c
+++ b/kernel/vhost_task.c
@@ -67,16 +67,52 @@ static int vhost_task_fn(void *data)
 	do_exit(0);
 }
 
-/**
- * vhost_task_wake - wakeup the vhost_task
- * @vtsk: vhost_task to wake
- *
- * wake up the vhost_task worker thread
- */
-void vhost_task_wake(struct vhost_task *vtsk)
+static void vhost_task_wake_up_process(struct vhost_task *vtsk)
 {
 	wake_up_process(vtsk->task);
 }
+
+/**
+ * __vhost_task_wake - wakeup the vhost_task
+ * @vtsk: vhost_task to wake
+ *
+ * Wake up the vhost_task worker thread. The caller is responsible for ensuring
+ * that the task hasn't exited.
+ */
+void __vhost_task_wake(struct vhost_task *vtsk)
+{
+	/*
+	 * Checking VHOST_TASK_FLAGS_KILLED can race with signal delivery, but
+	 * a race can only result in false negatives and this is just a sanity
+	 * check, i.e. if KILLED is set, the caller is buggy no matter what.
+	 */
+	if (WARN_ON_ONCE(test_bit(VHOST_TASK_FLAGS_KILLED, &vtsk->flags)))
+		return;
+
+	vhost_task_wake_up_process(vtsk);
+}
+EXPORT_SYMBOL_GPL(__vhost_task_wake);
+
+/**
+ * vhost_task_wake - wakeup the vhost_task if it hasn't been killed
+ * @vtsk: vhost_task to wake
+ *
+ * Wake up the vhost_task worker thread if the task hasn't exited, e.g. due to
+ * a signal.
+ */
+void vhost_task_wake(struct vhost_task *vtsk)
+{
+	guard(mutex)(&vtsk->exit_mutex);
+
+	/* Attempting to wake a task that has been explicitly stopped is a bug. */
+	if (WARN_ON_ONCE(test_bit(VHOST_TASK_FLAGS_STOP, &vtsk->flags)))
+		return;
+
+	if (test_bit(VHOST_TASK_FLAGS_KILLED, &vtsk->flags))
+		return;
+
+	vhost_task_wake_up_process(vtsk);
+}
 EXPORT_SYMBOL_GPL(vhost_task_wake);
 
 /**
@@ -91,7 +127,7 @@ void vhost_task_stop(struct vhost_task *vtsk)
 	mutex_lock(&vtsk->exit_mutex);
 	if (!test_bit(VHOST_TASK_FLAGS_KILLED, &vtsk->flags)) {
 		set_bit(VHOST_TASK_FLAGS_STOP, &vtsk->flags);
-		vhost_task_wake(vtsk);
+		vhost_task_wake_up_process(vtsk);
 	}
 	mutex_unlock(&vtsk->exit_mutex);
 
-- 
2.51.0.268.g9569e192d0-goog
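To illustrate the resulting split between the two wake APIs, two
hypothetical callers (the example_* names below are made up, not from the
series):

	/* Simple user: may sleep, wants protection against task exit. */
	static void example_kick_worker(struct vhost_task *vtsk)
	{
		vhost_task_wake(vtsk);	/* nop if the task was killed */
	}

	/*
	 * vhost-style user: runs under RCU, so it can't take exit_mutex,
	 * and instead relies on its own sigkill handling to guarantee the
	 * task hasn't exited.
	 */
	static void example_kick_worker_fast(struct vhost_task *vtsk)
	{
		__vhost_task_wake(vtsk);
	}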
Now that vhost_task provides an API to safely wake a task without relying
on the caller to react to signals, make handle_sigkill() optional and WARN
if the "unsafe" __vhost_task_wake() is used without hooking sigkill.

Requiring the user to react to sigkill adds no meaningful value, e.g. it
didn't help KVM do the right thing with respect to signals, and adding a
sanity check in __vhost_task_wake() gives developers a hint as to what
needs to be done in response to sigkill.

Signed-off-by: Sean Christopherson
---
 kernel/vhost_task.c | 10 +++++++++-
 1 file changed, 9 insertions(+), 1 deletion(-)

diff --git a/kernel/vhost_task.c b/kernel/vhost_task.c
index bd213d0b6da3..01bf7b0e2c5b 100644
--- a/kernel/vhost_task.c
+++ b/kernel/vhost_task.c
@@ -59,7 +59,8 @@ static int vhost_task_fn(void *data)
 	 */
 	if (!test_bit(VHOST_TASK_FLAGS_STOP, &vtsk->flags)) {
 		set_bit(VHOST_TASK_FLAGS_KILLED, &vtsk->flags);
-		vtsk->handle_sigkill(vtsk->data);
+		if (vtsk->handle_sigkill)
+			vtsk->handle_sigkill(vtsk->data);
 	}
 	mutex_unlock(&vtsk->exit_mutex);
 	complete(&vtsk->exited);
@@ -81,6 +82,13 @@ static void vhost_task_wake_up_process(struct vhost_task *vtsk)
  */
 void __vhost_task_wake(struct vhost_task *vtsk)
 {
+	/*
+	 * Waking the task without taking exit_mutex is safe if and only if the
+	 * implementation hooks sigkill, as that's the only way the caller can
+	 * know if the task has exited prematurely due to a signal.
+	 */
+	WARN_ON_ONCE(!vtsk->handle_sigkill);
+
 	/*
 	 * Checking VHOST_TASK_FLAGS_KILLED can race with signal delivery, but
 	 * a race can only result in false negatives and this is just a sanity
-- 
2.51.0.268.g9569e192d0-goog

Don't register a sigkill callback with vhost_task when creating NX hugepage
recovery threads now that said callback is optional.  In addition to
removing what is effectively dead code, not registering a sigkill "handler"
also guards against improper use of __vhost_task_wake().

Signed-off-by: Sean Christopherson
---
 arch/x86/kvm/mmu/mmu.c | 7 +------
 1 file changed, 1 insertion(+), 6 deletions(-)

diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index 6e838cb6c9e1..ace302137533 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -7677,10 +7677,6 @@ static void kvm_recover_nx_huge_pages(struct kvm *kvm)
 	srcu_read_unlock(&kvm->srcu, rcu_idx);
 }
 
-static void kvm_nx_huge_page_recovery_worker_kill(void *data)
-{
-}
-
 static bool kvm_nx_huge_page_recovery_worker(void *data)
 {
 	struct kvm *kvm = data;
@@ -7713,8 +7709,7 @@ static int kvm_mmu_start_lpage_recovery(struct once *once)
 	struct vhost_task *nx_thread;
 
 	kvm->arch.nx_huge_page_last = get_jiffies_64();
-	nx_thread = vhost_task_create(kvm_nx_huge_page_recovery_worker,
-				      kvm_nx_huge_page_recovery_worker_kill,
+	nx_thread = vhost_task_create(kvm_nx_huge_page_recovery_worker, NULL,
 				      kvm, "kvm-nx-lpage-recovery");
 
 	if (IS_ERR(nx_thread))
-- 
2.51.0.268.g9569e192d0-goog
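Taken together, a minimal sketch of a new "simple" vhost_task user after
this series (hypothetical; the example_* names and the fn() return-value
convention are assumptions, not from the series):

	/* Hypothetical worker; assumes returning false means "no more work". */
	static bool example_worker(void *data)
	{
		/* ... do one chunk of work ... */
		return false;
	}

	static int example_start(void *data)
	{
		struct vhost_task *vtsk;

		/*
		 * Pass NULL for handle_sigkill and rely on vhost_task_wake()
		 * silently doing nothing once the task has been killed.
		 */
		vtsk = vhost_task_create(example_worker, NULL, data,
					 "example-worker");
		if (IS_ERR(vtsk))
			return PTR_ERR(vtsk);

		vhost_task_start(vtsk);
		vhost_task_wake(vtsk);	/* safe even vs. SIGKILL */
		vhost_task_stop(vtsk);	/* explicit teardown */
		return 0;
	}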