From: David Woodhouse __rwbase_read_unlock() uses raw_spin_lock_irq()/raw_spin_unlock_irq() which unconditionally disables and re-enables interrupts. When read_unlock() is called from hardirq context (e.g. after a successful read_trylock() in a timer callback), the raw_spin_unlock_irq() incorrectly re-enables interrupts within the hardirq handler. This causes lockdep warnings ('hardirqs_on_prepare' from hardirq context) and can lead to IRQ state corruption. Using read_trylock() in hardirq context on PREEMPT_RT is safe because it does not record the lock owner. The read_unlock() acquires the wait_lock which is hardirq safe. This change additionally allows rwlock_t during early boot. Switch to raw_spin_lock_irqsave()/raw_spin_unlock_irqrestore() to preserve the caller's IRQ state. Signed-off-by: David Woodhouse Reviewed-by: Sebastian Andrzej Siewior Signed-off-by: Sean Christopherson --- kernel/locking/rwbase_rt.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/kernel/locking/rwbase_rt.c b/kernel/locking/rwbase_rt.c index 82e078c0665a..25744862d627 100644 --- a/kernel/locking/rwbase_rt.c +++ b/kernel/locking/rwbase_rt.c @@ -153,8 +153,9 @@ static void __sched __rwbase_read_unlock(struct rwbase_rt *rwb, struct rt_mutex_base *rtm = &rwb->rtmutex; struct task_struct *owner; DEFINE_RT_WAKE_Q(wqh); + unsigned long flags; - raw_spin_lock_irq(&rtm->wait_lock); + raw_spin_lock_irqsave(&rtm->wait_lock, flags); /* * Wake the writer, i.e. the rtmutex owner. It might release the * rtmutex concurrently in the fast path (due to a signal), but to @@ -167,7 +168,7 @@ static void __sched __rwbase_read_unlock(struct rwbase_rt *rwb, /* Pairs with the preempt_enable in rt_mutex_wake_up_q() */ preempt_disable(); - raw_spin_unlock_irq(&rtm->wait_lock); + raw_spin_unlock_irqrestore(&rtm->wait_lock, flags); rt_mutex_wake_up_q(&wqh); } -- 2.54.0.823.g6e5bcc1fc9-goog From: David Woodhouse kvm_xen_set_evtchn_fast() is called from hardirq context (timer callback, kvm_arch_set_irq_inatomic()). On PREEMPT_RT, rwlock_t is a sleeping lock, so read_lock_irqsave() cannot be used in this context. Switch to read_trylock() and return -EWOULDBLOCK on contention, which is the designed fallback — there is always a slow path for the case where the GPC is invalid and needs to be refreshed. Reported-by: syzbot+208f7f3e5f59c11aeb90@syzkaller.appspotmail.com Closes: https://syzkaller.appspot.com/bug?extid=208f7f3e5f59c11aeb90 Fixes: 14243b387137 ("KVM: x86/xen: Add KVM_IRQ_ROUTING_XEN_EVTCHN and event channel delivery") Signed-off-by: David Woodhouse Signed-off-by: Sean Christopherson --- arch/x86/kvm/xen.c | 32 +++++++++++++++++++++++--------- 1 file changed, 23 insertions(+), 9 deletions(-) diff --git a/arch/x86/kvm/xen.c b/arch/x86/kvm/xen.c index 91fd3673c09a..9bdb8e3cad58 100644 --- a/arch/x86/kvm/xen.c +++ b/arch/x86/kvm/xen.c @@ -697,6 +697,7 @@ void kvm_xen_inject_pending_events(struct kvm_vcpu *v) int __kvm_xen_has_interrupt(struct kvm_vcpu *v) { struct gfn_to_pfn_cache *gpc = &v->arch.xen.vcpu_info_cache; + bool atomic = in_atomic() || !task_is_running(current); unsigned long flags; u8 rc = 0; @@ -713,7 +714,15 @@ int __kvm_xen_has_interrupt(struct kvm_vcpu *v) BUILD_BUG_ON(sizeof(rc) != sizeof_field(struct compat_vcpu_info, evtchn_upcall_pending)); - read_lock_irqsave(&gpc->lock, flags); + if (atomic) { + local_irq_save(flags); + if (!read_trylock(&gpc->lock)) { + local_irq_restore(flags); + return 1; + } + } else { + read_lock_irqsave(&gpc->lock, flags); + } while (!kvm_gpc_check(gpc, sizeof(struct vcpu_info))) { read_unlock_irqrestore(&gpc->lock, flags); @@ -725,7 +734,7 @@ int __kvm_xen_has_interrupt(struct kvm_vcpu *v) * and we'll end up getting called again from a context where we *can* * fault in the page and wait for it. */ - if (in_atomic() || !task_is_running(current)) + if (atomic) return 1; if (kvm_gpc_refresh(gpc, sizeof(struct vcpu_info))) { @@ -1794,7 +1803,6 @@ int kvm_xen_set_evtchn_fast(struct kvm_xen_evtchn *xe, struct kvm *kvm) struct gfn_to_pfn_cache *gpc = &kvm->arch.xen.shinfo_cache; struct kvm_vcpu *vcpu; unsigned long *pending_bits, *mask_bits; - unsigned long flags; int port_word_bit; bool kick_vcpu = false; int vcpu_idx, idx, rc; @@ -1816,9 +1824,10 @@ int kvm_xen_set_evtchn_fast(struct kvm_xen_evtchn *xe, struct kvm *kvm) idx = srcu_read_lock(&kvm->srcu); - read_lock_irqsave(&gpc->lock, flags); - if (!kvm_gpc_check(gpc, PAGE_SIZE)) + if (!read_trylock(&gpc->lock)) goto out_rcu; + if (!kvm_gpc_check(gpc, PAGE_SIZE)) + goto out_unlock; if (IS_ENABLED(CONFIG_64BIT) && kvm->arch.xen.long_mode) { struct shared_info *shinfo = gpc->khva; @@ -1847,11 +1856,10 @@ int kvm_xen_set_evtchn_fast(struct kvm_xen_evtchn *xe, struct kvm *kvm) } else { rc = 1; /* Delivered to the bitmap in shared_info. */ /* Now switch to the vCPU's vcpu_info to set the index and pending_sel */ - read_unlock_irqrestore(&gpc->lock, flags); + read_unlock(&gpc->lock); gpc = &vcpu->arch.xen.vcpu_info_cache; - read_lock_irqsave(&gpc->lock, flags); - if (!kvm_gpc_check(gpc, sizeof(struct vcpu_info))) { + if (!read_trylock(&gpc->lock)) { /* * Could not access the vcpu_info. Set the bit in-kernel * and prod the vCPU to deliver it for itself. @@ -1860,6 +1868,11 @@ int kvm_xen_set_evtchn_fast(struct kvm_xen_evtchn *xe, struct kvm *kvm) kick_vcpu = true; goto out_rcu; } + if (!kvm_gpc_check(gpc, sizeof(struct vcpu_info))) { + if (!test_and_set_bit(port_word_bit, &vcpu->arch.xen.evtchn_pending_sel)) + kick_vcpu = true; + goto out_unlock; + } if (IS_ENABLED(CONFIG_64BIT) && kvm->arch.xen.long_mode) { struct vcpu_info *vcpu_info = gpc->khva; @@ -1883,8 +1896,9 @@ int kvm_xen_set_evtchn_fast(struct kvm_xen_evtchn *xe, struct kvm *kvm) } } + out_unlock: + read_unlock(&gpc->lock); out_rcu: - read_unlock_irqrestore(&gpc->lock, flags); srcu_read_unlock(&kvm->srcu, idx); if (kick_vcpu) { -- 2.54.0.823.g6e5bcc1fc9-goog From: David Woodhouse Now that the hardirq path (xen_timer_callback and set_evtchn_fast) uses read_trylock() instead of read_lock_irqsave(), the remaining GPC lock users in xen.c are only called from process context (vcpu_run, ioctls). There is no need to disable interrupts to prevent concurrent access from a hardirq user, since the hardirq path no longer takes the lock. Convert read_lock_irqsave()/read_unlock_irqrestore() to plain read_lock()/read_unlock() in: - kvm_xen_update_runstate_guest() - kvm_xen_shared_info_init() - xen_get_guest_pvclock() - kvm_xen_inject_pending_events() - __kvm_xen_has_interrupt() - wait_pending_event() Signed-off-by: David Woodhouse Signed-off-by: Sean Christopherson --- arch/x86/kvm/xen.c | 60 +++++++++++++++++++--------------------------- 1 file changed, 25 insertions(+), 35 deletions(-) diff --git a/arch/x86/kvm/xen.c b/arch/x86/kvm/xen.c index 9bdb8e3cad58..b1fae42bf295 100644 --- a/arch/x86/kvm/xen.c +++ b/arch/x86/kvm/xen.c @@ -45,15 +45,15 @@ static int kvm_xen_shared_info_init(struct kvm *kvm) int ret = 0; int idx = srcu_read_lock(&kvm->srcu); - read_lock_irq(&gpc->lock); + read_lock(&gpc->lock); while (!kvm_gpc_check(gpc, PAGE_SIZE)) { - read_unlock_irq(&gpc->lock); + read_unlock(&gpc->lock); ret = kvm_gpc_refresh(gpc, PAGE_SIZE); if (ret) goto out; - read_lock_irq(&gpc->lock); + read_lock(&gpc->lock); } /* @@ -96,7 +96,7 @@ static int kvm_xen_shared_info_init(struct kvm *kvm) smp_wmb(); wc->version = wc_version + 1; - read_unlock_irq(&gpc->lock); + read_unlock(&gpc->lock); kvm_make_all_cpus_request(kvm, KVM_REQ_MASTERCLOCK_UPDATE); @@ -155,22 +155,21 @@ static int xen_get_guest_pvclock(struct kvm_vcpu *vcpu, struct gfn_to_pfn_cache *gpc, unsigned int offset) { - unsigned long flags; int r; - read_lock_irqsave(&gpc->lock, flags); + read_lock(&gpc->lock); while (!kvm_gpc_check(gpc, offset + sizeof(*hv_clock))) { - read_unlock_irqrestore(&gpc->lock, flags); + read_unlock(&gpc->lock); r = kvm_gpc_refresh(gpc, offset + sizeof(*hv_clock)); if (r) return r; - read_lock_irqsave(&gpc->lock, flags); + read_lock(&gpc->lock); } memcpy(hv_clock, gpc->khva + offset, sizeof(*hv_clock)); - read_unlock_irqrestore(&gpc->lock, flags); + read_unlock(&gpc->lock); /* * Sanity check TSC shift+multiplier to verify the guest's view of time @@ -325,7 +324,6 @@ static void kvm_xen_update_runstate_guest(struct kvm_vcpu *v, bool atomic) struct gfn_to_pfn_cache *gpc2 = &vx->runstate2_cache; size_t user_len, user_len1, user_len2; struct vcpu_runstate_info rs; - unsigned long flags; size_t times_ofs; uint8_t *update_bit = NULL; uint64_t entry_time; @@ -421,16 +419,14 @@ static void kvm_xen_update_runstate_guest(struct kvm_vcpu *v, bool atomic) * gfn_to_pfn caches that cover the region. */ if (atomic) { - local_irq_save(flags); if (!read_trylock(&gpc1->lock)) { - local_irq_restore(flags); return; } } else { - read_lock_irqsave(&gpc1->lock, flags); + read_lock(&gpc1->lock); } while (!kvm_gpc_check(gpc1, user_len1)) { - read_unlock_irqrestore(&gpc1->lock, flags); + read_unlock(&gpc1->lock); /* When invoked from kvm_sched_out() we cannot sleep */ if (atomic) @@ -439,7 +435,7 @@ static void kvm_xen_update_runstate_guest(struct kvm_vcpu *v, bool atomic) if (kvm_gpc_refresh(gpc1, user_len1)) return; - read_lock_irqsave(&gpc1->lock, flags); + read_lock(&gpc1->lock); } if (likely(!user_len2)) { @@ -467,7 +463,7 @@ static void kvm_xen_update_runstate_guest(struct kvm_vcpu *v, bool atomic) lock_set_subclass(&gpc1->lock.dep_map, 1, _THIS_IP_); if (atomic) { if (!read_trylock(&gpc2->lock)) { - read_unlock_irqrestore(&gpc1->lock, flags); + read_unlock(&gpc1->lock); return; } } else { @@ -476,7 +472,7 @@ static void kvm_xen_update_runstate_guest(struct kvm_vcpu *v, bool atomic) if (!kvm_gpc_check(gpc2, user_len2)) { read_unlock(&gpc2->lock); - read_unlock_irqrestore(&gpc1->lock, flags); + read_unlock(&gpc1->lock); /* When invoked from kvm_sched_out() we cannot sleep */ if (atomic) @@ -581,7 +577,7 @@ static void kvm_xen_update_runstate_guest(struct kvm_vcpu *v, bool atomic) } kvm_gpc_mark_dirty_in_slot(gpc1); - read_unlock_irqrestore(&gpc1->lock, flags); + read_unlock(&gpc1->lock); } void kvm_xen_update_runstate(struct kvm_vcpu *v, int state) @@ -640,7 +636,6 @@ void kvm_xen_inject_pending_events(struct kvm_vcpu *v) { unsigned long evtchn_pending_sel = READ_ONCE(v->arch.xen.evtchn_pending_sel); struct gfn_to_pfn_cache *gpc = &v->arch.xen.vcpu_info_cache; - unsigned long flags; if (!evtchn_pending_sel) return; @@ -650,14 +645,14 @@ void kvm_xen_inject_pending_events(struct kvm_vcpu *v) * does anyway. Page it in and retry the instruction. We're just a * little more honest about it. */ - read_lock_irqsave(&gpc->lock, flags); + read_lock(&gpc->lock); while (!kvm_gpc_check(gpc, sizeof(struct vcpu_info))) { - read_unlock_irqrestore(&gpc->lock, flags); + read_unlock(&gpc->lock); if (kvm_gpc_refresh(gpc, sizeof(struct vcpu_info))) return; - read_lock_irqsave(&gpc->lock, flags); + read_lock(&gpc->lock); } /* Now gpc->khva is a valid kernel address for the vcpu_info */ @@ -687,7 +682,7 @@ void kvm_xen_inject_pending_events(struct kvm_vcpu *v) } kvm_gpc_mark_dirty_in_slot(gpc); - read_unlock_irqrestore(&gpc->lock, flags); + read_unlock(&gpc->lock); /* For the per-vCPU lapic vector, deliver it as MSI. */ if (v->arch.xen.upcall_vector) @@ -698,7 +693,6 @@ int __kvm_xen_has_interrupt(struct kvm_vcpu *v) { struct gfn_to_pfn_cache *gpc = &v->arch.xen.vcpu_info_cache; bool atomic = in_atomic() || !task_is_running(current); - unsigned long flags; u8 rc = 0; /* @@ -715,16 +709,13 @@ int __kvm_xen_has_interrupt(struct kvm_vcpu *v) sizeof_field(struct compat_vcpu_info, evtchn_upcall_pending)); if (atomic) { - local_irq_save(flags); - if (!read_trylock(&gpc->lock)) { - local_irq_restore(flags); + if (!read_trylock(&gpc->lock)) return 1; - } } else { - read_lock_irqsave(&gpc->lock, flags); + read_lock(&gpc->lock); } while (!kvm_gpc_check(gpc, sizeof(struct vcpu_info))) { - read_unlock_irqrestore(&gpc->lock, flags); + read_unlock(&gpc->lock); /* * This function gets called from kvm_vcpu_block() after setting the @@ -744,11 +735,11 @@ int __kvm_xen_has_interrupt(struct kvm_vcpu *v) */ return 0; } - read_lock_irqsave(&gpc->lock, flags); + read_lock(&gpc->lock); } rc = ((struct vcpu_info *)gpc->khva)->evtchn_upcall_pending; - read_unlock_irqrestore(&gpc->lock, flags); + read_unlock(&gpc->lock); return rc; } @@ -1445,12 +1436,11 @@ static bool wait_pending_event(struct kvm_vcpu *vcpu, int nr_ports, struct kvm *kvm = vcpu->kvm; struct gfn_to_pfn_cache *gpc = &kvm->arch.xen.shinfo_cache; unsigned long *pending_bits; - unsigned long flags; bool ret = true; int idx, i; idx = srcu_read_lock(&kvm->srcu); - read_lock_irqsave(&gpc->lock, flags); + read_lock(&gpc->lock); if (!kvm_gpc_check(gpc, PAGE_SIZE)) goto out_rcu; @@ -1471,7 +1461,7 @@ static bool wait_pending_event(struct kvm_vcpu *vcpu, int nr_ports, } out_rcu: - read_unlock_irqrestore(&gpc->lock, flags); + read_unlock(&gpc->lock); srcu_read_unlock(&kvm->srcu, idx); return ret; -- 2.54.0.823.g6e5bcc1fc9-goog From: David Woodhouse kvm_setup_guest_pvclock() is only called from kvm_guest_time_update() which runs in process context (vcpu_enter_guest or ioctl). There is no hardirq path that takes the GPC read lock for pvclock, so irqsave is unnecessary. Convert to plain read_lock()/read_unlock(). Signed-off-by: David Woodhouse Signed-off-by: Sean Christopherson --- arch/x86/kvm/x86.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index e6f1dd84f22d..87e99756de0a 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -3265,18 +3265,17 @@ static void kvm_setup_guest_pvclock(struct pvclock_vcpu_time_info *ref_hv_clock, { struct pvclock_vcpu_time_info *guest_hv_clock; struct pvclock_vcpu_time_info hv_clock; - unsigned long flags; memcpy(&hv_clock, ref_hv_clock, sizeof(hv_clock)); - read_lock_irqsave(&gpc->lock, flags); + read_lock(&gpc->lock); while (!kvm_gpc_check(gpc, offset + sizeof(*guest_hv_clock))) { - read_unlock_irqrestore(&gpc->lock, flags); + read_unlock(&gpc->lock); if (kvm_gpc_refresh(gpc, offset + sizeof(*guest_hv_clock))) return; - read_lock_irqsave(&gpc->lock, flags); + read_lock(&gpc->lock); } guest_hv_clock = (void *)(gpc->khva + offset); @@ -3301,7 +3300,7 @@ static void kvm_setup_guest_pvclock(struct pvclock_vcpu_time_info *ref_hv_clock, guest_hv_clock->version = ++hv_clock.version; kvm_gpc_mark_dirty_in_slot(gpc); - read_unlock_irqrestore(&gpc->lock, flags); + read_unlock(&gpc->lock); trace_kvm_pvclock_update(vcpu->vcpu_id, &hv_clock); } -- 2.54.0.823.g6e5bcc1fc9-goog From: David Woodhouse Now that all hardirq/atomic GPC users (xen_timer_callback, kvm_xen_set_evtchn_fast) use read_trylock() instead of read_lock(), no hardirq path ever holds the GPC rwlock. There is therefore no risk of deadlock between the write side and a hardirq reader, and no need to disable interrupts when taking the lock. Convert all read_lock_irq()/write_lock_irq() and their unlock counterparts to plain read_lock()/write_lock() in pfncache.c. Signed-off-by: David Woodhouse Signed-off-by: Sean Christopherson --- virt/kvm/pfncache.c | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/virt/kvm/pfncache.c b/virt/kvm/pfncache.c index 728d2c1b488a..70b102095173 100644 --- a/virt/kvm/pfncache.c +++ b/virt/kvm/pfncache.c @@ -29,12 +29,12 @@ void gfn_to_pfn_cache_invalidate_start(struct kvm *kvm, unsigned long start, spin_lock(&kvm->gpc_lock); list_for_each_entry(gpc, &kvm->gpc_list, list) { - read_lock_irq(&gpc->lock); + read_lock(&gpc->lock); /* Only a single page so no need to care about length */ if (gpc->valid && !is_error_noslot_pfn(gpc->pfn) && gpc->uhva >= start && gpc->uhva < end) { - read_unlock_irq(&gpc->lock); + read_unlock(&gpc->lock); /* * There is a small window here where the cache could @@ -44,15 +44,15 @@ void gfn_to_pfn_cache_invalidate_start(struct kvm *kvm, unsigned long start, * acquired. */ - write_lock_irq(&gpc->lock); + write_lock(&gpc->lock); if (gpc->valid && !is_error_noslot_pfn(gpc->pfn) && gpc->uhva >= start && gpc->uhva < end) gpc->valid = false; - write_unlock_irq(&gpc->lock); + write_unlock(&gpc->lock); continue; } - read_unlock_irq(&gpc->lock); + read_unlock(&gpc->lock); } spin_unlock(&kvm->gpc_lock); } @@ -184,7 +184,7 @@ static kvm_pfn_t hva_to_pfn_retry(struct gfn_to_pfn_cache *gpc) mmu_seq = gpc->kvm->mmu_invalidate_seq; smp_rmb(); - write_unlock_irq(&gpc->lock); + write_unlock(&gpc->lock); /* * If the previous iteration "failed" due to an mmu_notifier @@ -225,7 +225,7 @@ static kvm_pfn_t hva_to_pfn_retry(struct gfn_to_pfn_cache *gpc) goto out_error; } - write_lock_irq(&gpc->lock); + write_lock(&gpc->lock); /* * Other tasks must wait for _this_ refresh to complete before @@ -248,7 +248,7 @@ static kvm_pfn_t hva_to_pfn_retry(struct gfn_to_pfn_cache *gpc) return 0; out_error: - write_lock_irq(&gpc->lock); + write_lock(&gpc->lock); return -EFAULT; } @@ -269,7 +269,7 @@ static int __kvm_gpc_refresh(struct gfn_to_pfn_cache *gpc, gpa_t gpa, unsigned l lockdep_assert_held(&gpc->refresh_lock); - write_lock_irq(&gpc->lock); + write_lock(&gpc->lock); if (!gpc->active) { ret = -EINVAL; @@ -355,7 +355,7 @@ static int __kvm_gpc_refresh(struct gfn_to_pfn_cache *gpc, gpa_t gpa, unsigned l unmap_old = (old_pfn != gpc->pfn); out_unlock: - write_unlock_irq(&gpc->lock); + write_unlock(&gpc->lock); if (unmap_old) gpc_unmap(old_pfn, old_khva); @@ -417,9 +417,9 @@ static int __kvm_gpc_activate(struct gfn_to_pfn_cache *gpc, gpa_t gpa, unsigned * refresh must not establish a mapping until the cache is * reachable by mmu_notifier events. */ - write_lock_irq(&gpc->lock); + write_lock(&gpc->lock); gpc->active = true; - write_unlock_irq(&gpc->lock); + write_unlock(&gpc->lock); } return __kvm_gpc_refresh(gpc, gpa, uhva); } @@ -458,7 +458,7 @@ void kvm_gpc_deactivate(struct gfn_to_pfn_cache *gpc) * must stall mmu_notifier events until all users go away, i.e. * until gpc->lock is dropped and refresh is guaranteed to fail. */ - write_lock_irq(&gpc->lock); + write_lock(&gpc->lock); gpc->active = false; gpc->valid = false; @@ -473,7 +473,7 @@ void kvm_gpc_deactivate(struct gfn_to_pfn_cache *gpc) old_pfn = gpc->pfn; gpc->pfn = KVM_PFN_ERR_FAULT; - write_unlock_irq(&gpc->lock); + write_unlock(&gpc->lock); spin_lock(&kvm->gpc_lock); list_del(&gpc->list); -- 2.54.0.823.g6e5bcc1fc9-goog Use guard() to acquire and release kvm->srcu protection around gpc critical sections, so that said critical sections can also use the fancy __cleanup() functionality. No functional change intended. Signed-off-by: Sean Christopherson --- arch/x86/kvm/xen.c | 32 ++++++++++++++------------------ 1 file changed, 14 insertions(+), 18 deletions(-) diff --git a/arch/x86/kvm/xen.c b/arch/x86/kvm/xen.c index b1fae42bf295..0c6b74b97408 100644 --- a/arch/x86/kvm/xen.c +++ b/arch/x86/kvm/xen.c @@ -42,8 +42,9 @@ static int kvm_xen_shared_info_init(struct kvm *kvm) u32 *wc_sec_hi; u32 wc_version; u64 wall_nsec; - int ret = 0; - int idx = srcu_read_lock(&kvm->srcu); + int ret; + + guard(srcu)(&kvm->srcu); read_lock(&gpc->lock); while (!kvm_gpc_check(gpc, PAGE_SIZE)) { @@ -51,7 +52,7 @@ static int kvm_xen_shared_info_init(struct kvm *kvm) ret = kvm_gpc_refresh(gpc, PAGE_SIZE); if (ret) - goto out; + return ret; read_lock(&gpc->lock); } @@ -99,10 +100,7 @@ static int kvm_xen_shared_info_init(struct kvm *kvm) read_unlock(&gpc->lock); kvm_make_all_cpus_request(kvm, KVM_REQ_MASTERCLOCK_UPDATE); - -out: - srcu_read_unlock(&kvm->srcu, idx); - return ret; + return 0; } void kvm_xen_inject_timer_irqs(struct kvm_vcpu *vcpu) @@ -1437,9 +1435,10 @@ static bool wait_pending_event(struct kvm_vcpu *vcpu, int nr_ports, struct gfn_to_pfn_cache *gpc = &kvm->arch.xen.shinfo_cache; unsigned long *pending_bits; bool ret = true; - int idx, i; + int i; + + guard(srcu)(&kvm->srcu); - idx = srcu_read_lock(&kvm->srcu); read_lock(&gpc->lock); if (!kvm_gpc_check(gpc, PAGE_SIZE)) goto out_rcu; @@ -1462,8 +1461,6 @@ static bool wait_pending_event(struct kvm_vcpu *vcpu, int nr_ports, out_rcu: read_unlock(&gpc->lock); - srcu_read_unlock(&kvm->srcu, idx); - return ret; } @@ -1795,7 +1792,7 @@ int kvm_xen_set_evtchn_fast(struct kvm_xen_evtchn *xe, struct kvm *kvm) unsigned long *pending_bits, *mask_bits; int port_word_bit; bool kick_vcpu = false; - int vcpu_idx, idx, rc; + int vcpu_idx, rc; vcpu_idx = READ_ONCE(xe->vcpu_idx); if (vcpu_idx >= 0) @@ -1812,10 +1809,11 @@ int kvm_xen_set_evtchn_fast(struct kvm_xen_evtchn *xe, struct kvm *kvm) rc = -EWOULDBLOCK; - idx = srcu_read_lock(&kvm->srcu); + guard(srcu)(&kvm->srcu); if (!read_trylock(&gpc->lock)) - goto out_rcu; + return rc; + if (!kvm_gpc_check(gpc, PAGE_SIZE)) goto out_unlock; @@ -1856,7 +1854,7 @@ int kvm_xen_set_evtchn_fast(struct kvm_xen_evtchn *xe, struct kvm *kvm) */ if (!test_and_set_bit(port_word_bit, &vcpu->arch.xen.evtchn_pending_sel)) kick_vcpu = true; - goto out_rcu; + goto out_kick; } if (!kvm_gpc_check(gpc, sizeof(struct vcpu_info))) { if (!test_and_set_bit(port_word_bit, &vcpu->arch.xen.evtchn_pending_sel)) @@ -1888,9 +1886,7 @@ int kvm_xen_set_evtchn_fast(struct kvm_xen_evtchn *xe, struct kvm *kvm) out_unlock: read_unlock(&gpc->lock); - out_rcu: - srcu_read_unlock(&kvm->srcu, idx); - + out_kick: if (kick_vcpu) { kvm_make_request(KVM_REQ_UNBLOCK, vcpu); kvm_vcpu_kick(vcpu); -- 2.54.0.823.g6e5bcc1fc9-goog Hoist the actual fastpath delivery of an event to a vCPU into a separate helper so that CLASS()-based gpc locking+checking can be used without needing to implement scoped versions. No functional change intended. Signed-off-by: Sean Christopherson --- arch/x86/kvm/xen.c | 100 ++++++++++++++++++++++++--------------------- 1 file changed, 54 insertions(+), 46 deletions(-) diff --git a/arch/x86/kvm/xen.c b/arch/x86/kvm/xen.c index 0c6b74b97408..020ef0ddab01 100644 --- a/arch/x86/kvm/xen.c +++ b/arch/x86/kvm/xen.c @@ -1775,6 +1775,57 @@ static void kvm_xen_check_poller(struct kvm_vcpu *vcpu, int port) } } +static void __kvm_xen_set_evtchn_fast(struct kvm_vcpu *vcpu, int port_word_bit) +{ + struct gfn_to_pfn_cache *gpc = &vcpu->arch.xen.vcpu_info_cache; + bool kick_vcpu = false; + + /* Now switch to the vCPU's vcpu_info to set the index and pending_sel */ + if (!read_trylock(&gpc->lock)) { + /* + * Could not access the vcpu_info. Set the bit in-kernel and + * prod the vCPU to deliver it for itself. + */ + if (!test_and_set_bit(port_word_bit, &vcpu->arch.xen.evtchn_pending_sel)) + kick_vcpu = true; + goto out_kick; + } + if (!kvm_gpc_check(gpc, sizeof(struct vcpu_info))) { + if (!test_and_set_bit(port_word_bit, &vcpu->arch.xen.evtchn_pending_sel)) + kick_vcpu = true; + goto out_unlock; + } + + if (IS_ENABLED(CONFIG_64BIT) && vcpu->kvm->arch.xen.long_mode) { + struct vcpu_info *vcpu_info = gpc->khva; + if (!test_and_set_bit(port_word_bit, &vcpu_info->evtchn_pending_sel)) { + WRITE_ONCE(vcpu_info->evtchn_upcall_pending, 1); + kick_vcpu = true; + } + } else { + struct compat_vcpu_info *vcpu_info = gpc->khva; + if (!test_and_set_bit(port_word_bit, + (unsigned long *)&vcpu_info->evtchn_pending_sel)) { + WRITE_ONCE(vcpu_info->evtchn_upcall_pending, 1); + kick_vcpu = true; + } + } + + /* For the per-vCPU lapic vector, deliver it as MSI. */ + if (kick_vcpu && vcpu->arch.xen.upcall_vector) { + kvm_xen_inject_vcpu_vector(vcpu); + kick_vcpu = false; + } + +out_unlock: + read_unlock(&gpc->lock); +out_kick: + if (kick_vcpu) { + kvm_make_request(KVM_REQ_UNBLOCK, vcpu); + kvm_vcpu_kick(vcpu); + } +} + /* * The return value from this function is propagated to kvm_set_irq() API, * so it returns: @@ -1791,7 +1842,6 @@ int kvm_xen_set_evtchn_fast(struct kvm_xen_evtchn *xe, struct kvm *kvm) struct kvm_vcpu *vcpu; unsigned long *pending_bits, *mask_bits; int port_word_bit; - bool kick_vcpu = false; int vcpu_idx, rc; vcpu_idx = READ_ONCE(xe->vcpu_idx); @@ -1843,55 +1893,13 @@ int kvm_xen_set_evtchn_fast(struct kvm_xen_evtchn *xe, struct kvm *kvm) kvm_xen_check_poller(vcpu, xe->port); } else { rc = 1; /* Delivered to the bitmap in shared_info. */ - /* Now switch to the vCPU's vcpu_info to set the index and pending_sel */ - read_unlock(&gpc->lock); - gpc = &vcpu->arch.xen.vcpu_info_cache; - - if (!read_trylock(&gpc->lock)) { - /* - * Could not access the vcpu_info. Set the bit in-kernel - * and prod the vCPU to deliver it for itself. - */ - if (!test_and_set_bit(port_word_bit, &vcpu->arch.xen.evtchn_pending_sel)) - kick_vcpu = true; - goto out_kick; - } - if (!kvm_gpc_check(gpc, sizeof(struct vcpu_info))) { - if (!test_and_set_bit(port_word_bit, &vcpu->arch.xen.evtchn_pending_sel)) - kick_vcpu = true; - goto out_unlock; - } - - if (IS_ENABLED(CONFIG_64BIT) && kvm->arch.xen.long_mode) { - struct vcpu_info *vcpu_info = gpc->khva; - if (!test_and_set_bit(port_word_bit, &vcpu_info->evtchn_pending_sel)) { - WRITE_ONCE(vcpu_info->evtchn_upcall_pending, 1); - kick_vcpu = true; - } - } else { - struct compat_vcpu_info *vcpu_info = gpc->khva; - if (!test_and_set_bit(port_word_bit, - (unsigned long *)&vcpu_info->evtchn_pending_sel)) { - WRITE_ONCE(vcpu_info->evtchn_upcall_pending, 1); - kick_vcpu = true; - } - } - - /* For the per-vCPU lapic vector, deliver it as MSI. */ - if (kick_vcpu && vcpu->arch.xen.upcall_vector) { - kvm_xen_inject_vcpu_vector(vcpu); - kick_vcpu = false; - } } - out_unlock: +out_unlock: read_unlock(&gpc->lock); - out_kick: - if (kick_vcpu) { - kvm_make_request(KVM_REQ_UNBLOCK, vcpu); - kvm_vcpu_kick(vcpu); - } + if (rc == 1) + __kvm_xen_set_evtchn_fast(vcpu, port_word_bit); return rc; } -- 2.54.0.823.g6e5bcc1fc9-goog Explicitly mark the Xen shared info page as never being dirty tracked so that higher-level gpc APIs can be added to automatically take care of things like dirty tracking, without reintroducing the bug fixed by commit 55749769fe60 ("KVM: x86: Fix wall clock writes in Xen shared_info not to mark page dirty"). And because the code _looks_ buggy. No functional change intended. Signed-off-by: Sean Christopherson --- arch/x86/kvm/xen.c | 2 +- include/linux/kvm_host.h | 13 ++++++++++--- include/linux/kvm_types.h | 1 + virt/kvm/pfncache.c | 4 +++- 4 files changed, 15 insertions(+), 5 deletions(-) diff --git a/arch/x86/kvm/xen.c b/arch/x86/kvm/xen.c index 020ef0ddab01..ab8e95647406 100644 --- a/arch/x86/kvm/xen.c +++ b/arch/x86/kvm/xen.c @@ -2334,7 +2334,7 @@ void kvm_xen_init_vm(struct kvm *kvm) { mutex_init(&kvm->arch.xen.xen_lock); idr_init(&kvm->arch.xen.evtchn_ports); - kvm_gpc_init(&kvm->arch.xen.shinfo_cache, kvm); + __kvm_gpc_init(&kvm->arch.xen.shinfo_cache, kvm, true); } void kvm_xen_destroy_vm(struct kvm *kvm) diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 27498e990dff..0dc4eb78b6d9 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -1427,16 +1427,23 @@ int kvm_vcpu_write_guest(struct kvm_vcpu *vcpu, gpa_t gpa, const void *data, unsigned long len); /** - * kvm_gpc_init - initialize gfn_to_pfn_cache. + * __kvm_gpc_init - initialize gfn_to_pfn_cache. * * @gpc: struct gfn_to_pfn_cache object. * @kvm: pointer to kvm instance. + * @never_dirty: %true if the associated gfn should never be marked dirty * * This sets up a gfn_to_pfn_cache by initializing locks and assigning the * immutable attributes. Note, the cache must be zero-allocated (or zeroed by * the caller before init). */ -void kvm_gpc_init(struct gfn_to_pfn_cache *gpc, struct kvm *kvm); +void __kvm_gpc_init(struct gfn_to_pfn_cache *gpc, struct kvm *kvm, + bool never_dirty); + +static inline void kvm_gpc_init(struct gfn_to_pfn_cache *gpc, struct kvm *kvm) +{ + __kvm_gpc_init(gpc, kvm, false); +} /** * kvm_gpc_activate - prepare a cached kernel mapping and HPA for a given guest @@ -1942,7 +1949,7 @@ static inline void kvm_gpc_mark_dirty_in_slot(struct gfn_to_pfn_cache *gpc) { lockdep_assert_held(&gpc->lock); - if (!gpc->memslot) + if (!gpc->memslot || gpc->never_dirty) return; mark_page_dirty_in_slot(gpc->kvm, gpc->memslot, gpa_to_gfn(gpc->gpa)); diff --git a/include/linux/kvm_types.h b/include/linux/kvm_types.h index a568d8e6f4e8..e850adc3f47e 100644 --- a/include/linux/kvm_types.h +++ b/include/linux/kvm_types.h @@ -94,6 +94,7 @@ struct gfn_to_pfn_cache { kvm_pfn_t pfn; bool active; bool valid; + bool never_dirty; }; #ifdef KVM_ARCH_NR_OBJS_PER_MEMORY_CACHE diff --git a/virt/kvm/pfncache.c b/virt/kvm/pfncache.c index 70b102095173..9209f06c46b4 100644 --- a/virt/kvm/pfncache.c +++ b/virt/kvm/pfncache.c @@ -382,7 +382,8 @@ int kvm_gpc_refresh(struct gfn_to_pfn_cache *gpc, unsigned long len) return __kvm_gpc_refresh(gpc, gpc->gpa, uhva); } -void kvm_gpc_init(struct gfn_to_pfn_cache *gpc, struct kvm *kvm) +void __kvm_gpc_init(struct gfn_to_pfn_cache *gpc, struct kvm *kvm, + bool never_dirty) { rwlock_init(&gpc->lock); mutex_init(&gpc->refresh_lock); @@ -392,6 +393,7 @@ void kvm_gpc_init(struct gfn_to_pfn_cache *gpc, struct kvm *kvm) gpc->gpa = INVALID_GPA; gpc->uhva = KVM_HVA_ERR_BAD; gpc->active = gpc->valid = false; + gpc->never_dirty = never_dirty; } static int __kvm_gpc_activate(struct gfn_to_pfn_cache *gpc, gpa_t gpa, unsigned long uhva, -- 2.54.0.823.g6e5bcc1fc9-goog Give the Xen per-vCPU info page the same treatment as the per-VM shared info page, and never mark it dirty, as KVM clearly relies on userspace to assume the page is always dirty. While the page is marked dirty on writes via kvm_xen_inject_pending_events(), it's not marked dirty when written by __kvm_xen_set_evtchn_fast(). Furthermore, as was the case with the shared info page, writes in the event channel fastpath may be done without an active vCPU, e.g. when called via timer callback or irqfd injection. I.e. attempting to fix the faspath would run afoul of same issue that was fixed by commit ("KVM: x86: Fix wall clock writes in Xen shared_info not to mark page dirty"). Signed-off-by: Sean Christopherson --- arch/x86/kvm/xen.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/arch/x86/kvm/xen.c b/arch/x86/kvm/xen.c index ab8e95647406..7b527a983cfc 100644 --- a/arch/x86/kvm/xen.c +++ b/arch/x86/kvm/xen.c @@ -679,7 +679,6 @@ void kvm_xen_inject_pending_events(struct kvm_vcpu *v) WRITE_ONCE(vi->evtchn_upcall_pending, 1); } - kvm_gpc_mark_dirty_in_slot(gpc); read_unlock(&gpc->lock); /* For the per-vCPU lapic vector, deliver it as MSI. */ @@ -2313,7 +2312,7 @@ void kvm_xen_init_vcpu(struct kvm_vcpu *vcpu) kvm_gpc_init(&vcpu->arch.xen.runstate_cache, vcpu->kvm); kvm_gpc_init(&vcpu->arch.xen.runstate2_cache, vcpu->kvm); - kvm_gpc_init(&vcpu->arch.xen.vcpu_info_cache, vcpu->kvm); + __kvm_gpc_init(&vcpu->arch.xen.vcpu_info_cache, vcpu->kvm, true); kvm_gpc_init(&vcpu->arch.xen.vcpu_time_info_cache, vcpu->kvm); } -- 2.54.0.823.g6e5bcc1fc9-goog Move the helpers to convert between pfns and physical addresses (for host and guest) into kvm_types.h so that they can be used throughout kvm_host.h (and any other KVM-related header) without having to worry about ordering. No functional change intended. Signed-off-by: Sean Christopherson --- include/linux/kvm_host.h | 15 --------------- include/linux/kvm_types.h | 16 ++++++++++++++++ 2 files changed, 16 insertions(+), 15 deletions(-) diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 0dc4eb78b6d9..ffbae1e6e84e 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -1923,21 +1923,6 @@ hva_to_gfn_memslot(unsigned long hva, struct kvm_memory_slot *slot) return slot->base_gfn + gfn_offset; } -static inline gpa_t gfn_to_gpa(gfn_t gfn) -{ - return (gpa_t)gfn << PAGE_SHIFT; -} - -static inline gfn_t gpa_to_gfn(gpa_t gpa) -{ - return (gfn_t)(gpa >> PAGE_SHIFT); -} - -static inline hpa_t pfn_to_hpa(kvm_pfn_t pfn) -{ - return (hpa_t)pfn << PAGE_SHIFT; -} - static inline bool kvm_is_gpa_in_memslot(struct kvm *kvm, gpa_t gpa) { unsigned long hva = gfn_to_hva(kvm, gpa_to_gfn(gpa)); diff --git a/include/linux/kvm_types.h b/include/linux/kvm_types.h index e850adc3f47e..961572e102f0 100644 --- a/include/linux/kvm_types.h +++ b/include/linux/kvm_types.h @@ -6,6 +6,7 @@ #include #include #include +#include #include #ifdef KVM_SUB_MODULES @@ -73,6 +74,21 @@ typedef u64 hfn_t; typedef hfn_t kvm_pfn_t; +static inline gpa_t gfn_to_gpa(gfn_t gfn) +{ + return (gpa_t)gfn << PAGE_SHIFT; +} + +static inline gfn_t gpa_to_gfn(gpa_t gpa) +{ + return (gfn_t)(gpa >> PAGE_SHIFT); +} + +static inline hpa_t pfn_to_hpa(kvm_pfn_t pfn) +{ + return (hpa_t)pfn << PAGE_SHIFT; +} + struct gfn_to_hva_cache { u64 generation; gpa_t gpa; -- 2.54.0.823.g6e5bcc1fc9-goog Add CLASS() definitions for locally mapping a PFN given a gpc (gfn_to_pfn cache), to eventually deduplicate a number of users that do: lock(); while (!check()) { unlock(); if (refresh()) return err; lock() } ... mark_dirty(); unlock(); Implement read-only (for cases where KVM is only reading) and "try" (for use in atomic code where rwlock might sleep due to PREEMPT_RT) variations. Use "map local" as the primary terminology as the basic concept is more or less the same as kmap_local(): ensure the current CPU has a kernel mapping to the underlying memory. Convert the pvclock code as the first user, as it is straightforward and thus easier to audit for correctness. For all intents and purposes, no functional change intended. Signed-off-by: Sean Christopherson --- arch/x86/kvm/x86.c | 17 ++++---------- include/linux/kvm_host.h | 48 +++++++++++++++++++++++++++++++--------- virt/kvm/pfncache.c | 36 ++++++++++++++++++++++++++++++ 3 files changed, 78 insertions(+), 23 deletions(-) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 87e99756de0a..ea10ed4ab06f 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -3268,17 +3268,11 @@ static void kvm_setup_guest_pvclock(struct pvclock_vcpu_time_info *ref_hv_clock, memcpy(&hv_clock, ref_hv_clock, sizeof(hv_clock)); - read_lock(&gpc->lock); - while (!kvm_gpc_check(gpc, offset + sizeof(*guest_hv_clock))) { - read_unlock(&gpc->lock); + CLASS(gpc_map_local, clock_map)(gpc, offset + sizeof(*guest_hv_clock)); + if (IS_ERR(clock_map)) + return; - if (kvm_gpc_refresh(gpc, offset + sizeof(*guest_hv_clock))) - return; - - read_lock(&gpc->lock); - } - - guest_hv_clock = (void *)(gpc->khva + offset); + guest_hv_clock = *clock_map + offset; /* * This VCPU is paused, but it's legal for a guest to read another @@ -3299,9 +3293,6 @@ static void kvm_setup_guest_pvclock(struct pvclock_vcpu_time_info *ref_hv_clock, guest_hv_clock->version = ++hv_clock.version; - kvm_gpc_mark_dirty_in_slot(gpc); - read_unlock(&gpc->lock); - trace_kvm_pvclock_update(vcpu->vcpu_id, &hv_clock); } diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index ffbae1e6e84e..d70fa91cda0c 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -1535,6 +1535,44 @@ static inline bool kvm_gpc_is_hva_active(struct gfn_to_pfn_cache *gpc) return gpc->active && kvm_is_error_gpa(gpc->gpa); } +static inline void kvm_gpc_mark_dirty_in_slot(struct gfn_to_pfn_cache *gpc) +{ + lockdep_assert_held(&gpc->lock); + + if (!gpc->memslot || gpc->never_dirty) + return; + + mark_page_dirty_in_slot(gpc->kvm, gpc->memslot, gpa_to_gfn(gpc->gpa)); +} + +void **gpc_map_local_lock(struct gfn_to_pfn_cache *gpc, unsigned long len); +void **gpc_try_map_local_lock(struct gfn_to_pfn_cache *gpc, unsigned long len); + +static inline void gpc_map_local_unlock(void **khva) +{ + struct gfn_to_pfn_cache *gpc = container_of(khva, struct gfn_to_pfn_cache, khva); + + kvm_gpc_mark_dirty_in_slot(gpc); + + read_unlock(&gpc->lock); +} + +static inline void gpc_map_local_unlock_ro(void **khva) +{ + read_unlock(&container_of(khva, struct gfn_to_pfn_cache, khva)->lock); +} + +#define DEFINE_GPC_CLASS(try, ro) \ +DEFINE_CLASS(gpc##try##_map_local##ro, void **, \ + if (!IS_ERR(_T)) gpc_map_local_unlock##ro(_T), \ + gpc##try##_map_local_lock(gpc, len), \ + struct gfn_to_pfn_cache *gpc, unsigned long len) \ + +DEFINE_GPC_CLASS(,); +DEFINE_GPC_CLASS(_try,); +DEFINE_GPC_CLASS(, _ro); +DEFINE_GPC_CLASS(_try, _ro); + void kvm_sigset_activate(struct kvm_vcpu *vcpu); void kvm_sigset_deactivate(struct kvm_vcpu *vcpu); @@ -1930,16 +1968,6 @@ static inline bool kvm_is_gpa_in_memslot(struct kvm *kvm, gpa_t gpa) return !kvm_is_error_hva(hva); } -static inline void kvm_gpc_mark_dirty_in_slot(struct gfn_to_pfn_cache *gpc) -{ - lockdep_assert_held(&gpc->lock); - - if (!gpc->memslot || gpc->never_dirty) - return; - - mark_page_dirty_in_slot(gpc->kvm, gpc->memslot, gpa_to_gfn(gpc->gpa)); -} - enum kvm_stat_kind { KVM_STAT_VM, KVM_STAT_VCPU, diff --git a/virt/kvm/pfncache.c b/virt/kvm/pfncache.c index 9209f06c46b4..d3e02a2bac38 100644 --- a/virt/kvm/pfncache.c +++ b/virt/kvm/pfncache.c @@ -484,3 +484,39 @@ void kvm_gpc_deactivate(struct gfn_to_pfn_cache *gpc) gpc_unmap(old_pfn, old_khva); } } + +void **gpc_try_map_local_lock(struct gfn_to_pfn_cache *gpc, unsigned long len) +{ + if (!read_trylock(&gpc->lock)) + return ERR_PTR(-EWOULDBLOCK); + + if (!kvm_gpc_check(gpc, len)) { + read_unlock(&gpc->lock); + return ERR_PTR(-EWOULDBLOCK); + } + + return &gpc->khva; +} + +void **gpc_map_local_lock(struct gfn_to_pfn_cache *gpc, unsigned long len) +{ + /* + * Yes, this is an open-coded loop. But that's just what put_user() + * does anyway. Page it in and retry the instruction. We're just a + * little more honest about it. + */ + for (;;) { + int r; + + read_lock(&gpc->lock); + + if (kvm_gpc_check(gpc, len)) + return &gpc->khva; + + read_unlock(&gpc->lock); + + r = kvm_gpc_refresh(gpc, len); + if (r) + return ERR_PTR(r); + } +} -- 2.54.0.823.g6e5bcc1fc9-goog Use the newfangled "map local" CLASS() APIs to access the Xen shared info page via its gpc. No functional change intended. Signed-off-by: Sean Christopherson --- arch/x86/kvm/xen.c | 34 +++++++++++++++++----------------- 1 file changed, 17 insertions(+), 17 deletions(-) diff --git a/arch/x86/kvm/xen.c b/arch/x86/kvm/xen.c index 7b527a983cfc..065b4c92f7ed 100644 --- a/arch/x86/kvm/xen.c +++ b/arch/x86/kvm/xen.c @@ -35,27 +35,19 @@ static bool kvm_xen_hcall_evtchn_send(struct kvm_vcpu *vcpu, u64 param, u64 *r); DEFINE_STATIC_KEY_DEFERRED_FALSE(kvm_xen_enabled, HZ); -static int kvm_xen_shared_info_init(struct kvm *kvm) +static int __kvm_xen_shared_info_init(struct kvm *kvm) { struct gfn_to_pfn_cache *gpc = &kvm->arch.xen.shinfo_cache; struct pvclock_wall_clock *wc; u32 *wc_sec_hi; u32 wc_version; u64 wall_nsec; - int ret; guard(srcu)(&kvm->srcu); - read_lock(&gpc->lock); - while (!kvm_gpc_check(gpc, PAGE_SIZE)) { - read_unlock(&gpc->lock); - - ret = kvm_gpc_refresh(gpc, PAGE_SIZE); - if (ret) - return ret; - - read_lock(&gpc->lock); - } + CLASS(gpc_map_local, shinfo_map)(gpc, PAGE_SIZE); + if (IS_ERR(shinfo_map)) + return PTR_ERR(shinfo_map); /* * This code mirrors kvm_write_wall_clock() except that it writes @@ -74,14 +66,14 @@ static int kvm_xen_shared_info_init(struct kvm *kvm) BUILD_BUG_ON(offsetof(struct shared_info, wc_sec_hi) != 0xc0c); if (IS_ENABLED(CONFIG_64BIT) && kvm->arch.xen.long_mode) { - struct shared_info *shinfo = gpc->khva; + struct shared_info *shinfo = *shinfo_map; wc_sec_hi = &shinfo->wc_sec_hi; wc = &shinfo->wc; } else #endif { - struct compat_shared_info *shinfo = gpc->khva; + struct compat_shared_info *shinfo = *shinfo_map; wc_sec_hi = &shinfo->arch.wc_sec_hi; wc = &shinfo->wc; @@ -97,12 +89,20 @@ static int kvm_xen_shared_info_init(struct kvm *kvm) smp_wmb(); wc->version = wc_version + 1; - read_unlock(&gpc->lock); - - kvm_make_all_cpus_request(kvm, KVM_REQ_MASTERCLOCK_UPDATE); return 0; } +static int kvm_xen_shared_info_init(struct kvm *kvm) +{ + int r; + + r = __kvm_xen_shared_info_init(kvm); + if (!r) + kvm_make_all_cpus_request(kvm, KVM_REQ_MASTERCLOCK_UPDATE); + + return r; +} + void kvm_xen_inject_timer_irqs(struct kvm_vcpu *vcpu) { if (atomic_read(&vcpu->arch.xen.timer_pending) > 0) { -- 2.54.0.823.g6e5bcc1fc9-goog Use trylock instead of waiting on the gpc->lock when locking and checking the shared info page for SCHEDOP_poll, as odds are very good that if the lock is contended, then the check will fail. This will allow using the new CLASS() APIs for local gpc mappings, without having to add a version that waits on the lock, but doesn't refresh on failure. Signed-off-by: Sean Christopherson --- arch/x86/kvm/xen.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/arch/x86/kvm/xen.c b/arch/x86/kvm/xen.c index 065b4c92f7ed..d9b09809e243 100644 --- a/arch/x86/kvm/xen.c +++ b/arch/x86/kvm/xen.c @@ -1438,7 +1438,9 @@ static bool wait_pending_event(struct kvm_vcpu *vcpu, int nr_ports, guard(srcu)(&kvm->srcu); - read_lock(&gpc->lock); + if (!read_trylock(&gpc->lock)) + return ret; + if (!kvm_gpc_check(gpc, PAGE_SIZE)) goto out_rcu; -- 2.54.0.823.g6e5bcc1fc9-goog Use gpc's CLASS() interface to lock and check the shared info page when processing a SCHEDOP_poll hypercall. No functional change intended. Signed-off-by: Sean Christopherson --- arch/x86/kvm/xen.c | 24 ++++++++---------------- 1 file changed, 8 insertions(+), 16 deletions(-) diff --git a/arch/x86/kvm/xen.c b/arch/x86/kvm/xen.c index d9b09809e243..8f822acb11a4 100644 --- a/arch/x86/kvm/xen.c +++ b/arch/x86/kvm/xen.c @@ -1433,36 +1433,28 @@ static bool wait_pending_event(struct kvm_vcpu *vcpu, int nr_ports, struct kvm *kvm = vcpu->kvm; struct gfn_to_pfn_cache *gpc = &kvm->arch.xen.shinfo_cache; unsigned long *pending_bits; - bool ret = true; int i; guard(srcu)(&kvm->srcu); - if (!read_trylock(&gpc->lock)) - return ret; + CLASS(gpc_try_map_local_ro, shinfo_map)(gpc, PAGE_SIZE); + if (IS_ERR(shinfo_map)) + return true; - if (!kvm_gpc_check(gpc, PAGE_SIZE)) - goto out_rcu; - - ret = false; if (IS_ENABLED(CONFIG_64BIT) && kvm->arch.xen.long_mode) { - struct shared_info *shinfo = gpc->khva; + struct shared_info *shinfo = *shinfo_map; pending_bits = (unsigned long *)&shinfo->evtchn_pending; } else { - struct compat_shared_info *shinfo = gpc->khva; + struct compat_shared_info *shinfo = *shinfo_map; pending_bits = (unsigned long *)&shinfo->evtchn_pending; } for (i = 0; i < nr_ports; i++) { - if (test_bit(ports[i], pending_bits)) { - ret = true; - break; - } + if (test_bit(ports[i], pending_bits)) + return true; } - out_rcu: - read_unlock(&gpc->lock); - return ret; + return false; } static bool kvm_xen_schedop_poll(struct kvm_vcpu *vcpu, bool longmode, -- 2.54.0.823.g6e5bcc1fc9-goog Convert the event channel fastpath to the "map local" CLASS() APIs, using the "try" variants as the faspath can't block. Note! The vcpu_info mapping is read/write, even though there is no existing call to mark the page dirty. Like Xen's shared info page, the vCPU info page is assumed to be dirty at all times, and so isn't marked dirty after every write. No functional change intended. Signed-off-by: Sean Christopherson --- arch/x86/kvm/xen.c | 42 ++++++++++++++++-------------------------- 1 file changed, 16 insertions(+), 26 deletions(-) diff --git a/arch/x86/kvm/xen.c b/arch/x86/kvm/xen.c index 8f822acb11a4..47750316f132 100644 --- a/arch/x86/kvm/xen.c +++ b/arch/x86/kvm/xen.c @@ -1774,29 +1774,28 @@ static void __kvm_xen_set_evtchn_fast(struct kvm_vcpu *vcpu, int port_word_bit) bool kick_vcpu = false; /* Now switch to the vCPU's vcpu_info to set the index and pending_sel */ - if (!read_trylock(&gpc->lock)) { - /* - * Could not access the vcpu_info. Set the bit in-kernel and - * prod the vCPU to deliver it for itself. - */ + CLASS(gpc_try_map_local, vcpu_info_map)(gpc, sizeof(struct vcpu_info)); + + /* + * If the vcpu_info is inaccessible, set the bit in-kernel and prod the + * vCPU to deliver it for itself. + */ + if (IS_ERR(vcpu_info_map)) { if (!test_and_set_bit(port_word_bit, &vcpu->arch.xen.evtchn_pending_sel)) kick_vcpu = true; goto out_kick; } - if (!kvm_gpc_check(gpc, sizeof(struct vcpu_info))) { - if (!test_and_set_bit(port_word_bit, &vcpu->arch.xen.evtchn_pending_sel)) - kick_vcpu = true; - goto out_unlock; - } if (IS_ENABLED(CONFIG_64BIT) && vcpu->kvm->arch.xen.long_mode) { - struct vcpu_info *vcpu_info = gpc->khva; + struct vcpu_info *vcpu_info = *vcpu_info_map; + if (!test_and_set_bit(port_word_bit, &vcpu_info->evtchn_pending_sel)) { WRITE_ONCE(vcpu_info->evtchn_upcall_pending, 1); kick_vcpu = true; } } else { - struct compat_vcpu_info *vcpu_info = gpc->khva; + struct compat_vcpu_info *vcpu_info = *vcpu_info_map; + if (!test_and_set_bit(port_word_bit, (unsigned long *)&vcpu_info->evtchn_pending_sel)) { WRITE_ONCE(vcpu_info->evtchn_upcall_pending, 1); @@ -1810,8 +1809,6 @@ static void __kvm_xen_set_evtchn_fast(struct kvm_vcpu *vcpu, int port_word_bit) kick_vcpu = false; } -out_unlock: - read_unlock(&gpc->lock); out_kick: if (kick_vcpu) { kvm_make_request(KVM_REQ_UNBLOCK, vcpu); @@ -1850,23 +1847,19 @@ int kvm_xen_set_evtchn_fast(struct kvm_xen_evtchn *xe, struct kvm *kvm) if (xe->port >= max_evtchn_port(kvm)) return -EINVAL; - rc = -EWOULDBLOCK; - guard(srcu)(&kvm->srcu); - if (!read_trylock(&gpc->lock)) - return rc; - - if (!kvm_gpc_check(gpc, PAGE_SIZE)) - goto out_unlock; + CLASS(gpc_try_map_local, shinfo_map)(gpc, PAGE_SIZE); + if (IS_ERR(shinfo_map)) + return -EWOULDBLOCK; if (IS_ENABLED(CONFIG_64BIT) && kvm->arch.xen.long_mode) { - struct shared_info *shinfo = gpc->khva; + struct shared_info *shinfo = *shinfo_map; pending_bits = (unsigned long *)&shinfo->evtchn_pending; mask_bits = (unsigned long *)&shinfo->evtchn_mask; port_word_bit = xe->port / 64; } else { - struct compat_shared_info *shinfo = gpc->khva; + struct compat_shared_info *shinfo = *shinfo_map; pending_bits = (unsigned long *)&shinfo->evtchn_pending; mask_bits = (unsigned long *)&shinfo->evtchn_mask; port_word_bit = xe->port / 32; @@ -1888,9 +1881,6 @@ int kvm_xen_set_evtchn_fast(struct kvm_xen_evtchn *xe, struct kvm *kvm) rc = 1; /* Delivered to the bitmap in shared_info. */ } -out_unlock: - read_unlock(&gpc->lock); - if (rc == 1) __kvm_xen_set_evtchn_fast(vcpu, port_word_bit); return rc; -- 2.54.0.823.g6e5bcc1fc9-goog Convert the Xen pvclock reads to the "map local" CLASS() APIs, using a read-only variants as KVM simply copying data from the pvclock. No functional change intended. Signed-off-by: Sean Christopherson --- arch/x86/kvm/xen.c | 18 ++++-------------- 1 file changed, 4 insertions(+), 14 deletions(-) diff --git a/arch/x86/kvm/xen.c b/arch/x86/kvm/xen.c index 47750316f132..89daad3fe712 100644 --- a/arch/x86/kvm/xen.c +++ b/arch/x86/kvm/xen.c @@ -153,21 +153,11 @@ static int xen_get_guest_pvclock(struct kvm_vcpu *vcpu, struct gfn_to_pfn_cache *gpc, unsigned int offset) { - int r; + CLASS(gpc_map_local_ro, pvclock_map)(gpc, offset + sizeof(*hv_clock)); + if (IS_ERR(pvclock_map)) + return PTR_ERR(pvclock_map); - read_lock(&gpc->lock); - while (!kvm_gpc_check(gpc, offset + sizeof(*hv_clock))) { - read_unlock(&gpc->lock); - - r = kvm_gpc_refresh(gpc, offset + sizeof(*hv_clock)); - if (r) - return r; - - read_lock(&gpc->lock); - } - - memcpy(hv_clock, gpc->khva + offset, sizeof(*hv_clock)); - read_unlock(&gpc->lock); + memcpy(hv_clock, *pvclock_map + offset, sizeof(*hv_clock)); /* * Sanity check TSC shift+multiplier to verify the guest's view of time -- 2.54.0.823.g6e5bcc1fc9-goog Now that the CLASS()-based gpc mappings allow for early returns, drop the local "kick_vcpu" from the event channel fastpath, and simply return early if the vCPU doesn't need to be kicked. No functional change intended. Signed-off-by: Sean Christopherson --- arch/x86/kvm/xen.c | 33 +++++++++++++++------------------ 1 file changed, 15 insertions(+), 18 deletions(-) diff --git a/arch/x86/kvm/xen.c b/arch/x86/kvm/xen.c index 89daad3fe712..2c776e475a4f 100644 --- a/arch/x86/kvm/xen.c +++ b/arch/x86/kvm/xen.c @@ -1761,7 +1761,6 @@ static void kvm_xen_check_poller(struct kvm_vcpu *vcpu, int port) static void __kvm_xen_set_evtchn_fast(struct kvm_vcpu *vcpu, int port_word_bit) { struct gfn_to_pfn_cache *gpc = &vcpu->arch.xen.vcpu_info_cache; - bool kick_vcpu = false; /* Now switch to the vCPU's vcpu_info to set the index and pending_sel */ CLASS(gpc_try_map_local, vcpu_info_map)(gpc, sizeof(struct vcpu_info)); @@ -1771,39 +1770,37 @@ static void __kvm_xen_set_evtchn_fast(struct kvm_vcpu *vcpu, int port_word_bit) * vCPU to deliver it for itself. */ if (IS_ERR(vcpu_info_map)) { - if (!test_and_set_bit(port_word_bit, &vcpu->arch.xen.evtchn_pending_sel)) - kick_vcpu = true; + if (test_and_set_bit(port_word_bit, &vcpu->arch.xen.evtchn_pending_sel)) + return; goto out_kick; } if (IS_ENABLED(CONFIG_64BIT) && vcpu->kvm->arch.xen.long_mode) { struct vcpu_info *vcpu_info = *vcpu_info_map; - if (!test_and_set_bit(port_word_bit, &vcpu_info->evtchn_pending_sel)) { - WRITE_ONCE(vcpu_info->evtchn_upcall_pending, 1); - kick_vcpu = true; - } + if (test_and_set_bit(port_word_bit, &vcpu_info->evtchn_pending_sel)) + return; + + WRITE_ONCE(vcpu_info->evtchn_upcall_pending, 1); } else { struct compat_vcpu_info *vcpu_info = *vcpu_info_map; - if (!test_and_set_bit(port_word_bit, - (unsigned long *)&vcpu_info->evtchn_pending_sel)) { - WRITE_ONCE(vcpu_info->evtchn_upcall_pending, 1); - kick_vcpu = true; - } + if (test_and_set_bit(port_word_bit, + (unsigned long *)&vcpu_info->evtchn_pending_sel)) + return; + + WRITE_ONCE(vcpu_info->evtchn_upcall_pending, 1); } /* For the per-vCPU lapic vector, deliver it as MSI. */ - if (kick_vcpu && vcpu->arch.xen.upcall_vector) { + if (vcpu->arch.xen.upcall_vector) { kvm_xen_inject_vcpu_vector(vcpu); - kick_vcpu = false; + return; } out_kick: - if (kick_vcpu) { - kvm_make_request(KVM_REQ_UNBLOCK, vcpu); - kvm_vcpu_kick(vcpu); - } + kvm_make_request(KVM_REQ_UNBLOCK, vcpu); + kvm_vcpu_kick(vcpu); } /* -- 2.54.0.823.g6e5bcc1fc9-goog Convert Xen event injection to the "map local" CLASS() APIs. No functional change intended. Signed-off-by: Sean Christopherson --- arch/x86/kvm/xen.c | 24 +++++------------------- 1 file changed, 5 insertions(+), 19 deletions(-) diff --git a/arch/x86/kvm/xen.c b/arch/x86/kvm/xen.c index 2c776e475a4f..3ebde7ba5558 100644 --- a/arch/x86/kvm/xen.c +++ b/arch/x86/kvm/xen.c @@ -628,24 +628,12 @@ void kvm_xen_inject_pending_events(struct kvm_vcpu *v) if (!evtchn_pending_sel) return; - /* - * Yes, this is an open-coded loop. But that's just what put_user() - * does anyway. Page it in and retry the instruction. We're just a - * little more honest about it. - */ - read_lock(&gpc->lock); - while (!kvm_gpc_check(gpc, sizeof(struct vcpu_info))) { - read_unlock(&gpc->lock); + CLASS(gpc_map_local, vcpu_info_map)(gpc, sizeof(struct vcpu_info)); + if (IS_ERR(vcpu_info_map)) + return; - if (kvm_gpc_refresh(gpc, sizeof(struct vcpu_info))) - return; - - read_lock(&gpc->lock); - } - - /* Now gpc->khva is a valid kernel address for the vcpu_info */ if (IS_ENABLED(CONFIG_64BIT) && v->kvm->arch.xen.long_mode) { - struct vcpu_info *vi = gpc->khva; + struct vcpu_info *vi = *vcpu_info_map; asm volatile(LOCK_PREFIX "orq %0, %1\n" "notq %0\n" @@ -657,7 +645,7 @@ void kvm_xen_inject_pending_events(struct kvm_vcpu *v) WRITE_ONCE(vi->evtchn_upcall_pending, 1); } else { u32 evtchn_pending_sel32 = evtchn_pending_sel; - struct compat_vcpu_info *vi = gpc->khva; + struct compat_vcpu_info *vi = *vcpu_info_map; asm volatile(LOCK_PREFIX "orl %0, %1\n" "notl %0\n" @@ -669,8 +657,6 @@ void kvm_xen_inject_pending_events(struct kvm_vcpu *v) WRITE_ONCE(vi->evtchn_upcall_pending, 1); } - read_unlock(&gpc->lock); - /* For the per-vCPU lapic vector, deliver it as MSI. */ if (v->arch.xen.upcall_vector) kvm_xen_inject_vcpu_vector(v); -- 2.54.0.823.g6e5bcc1fc9-goog Extend the CLASS() APIs for gpcs to allow choosing between the "normal" and the "try" versions at runtime, depending on whether or not the caller is running in atomic context. Convert the "has interrupt" helper as the first user, as it is called from IRQ context, but also needs to wait when called from non-atomic context, i.e. can't tolerate false negatives in that case. No functional change intended. Signed-off-by: Sean Christopherson --- arch/x86/kvm/xen.c | 52 +++++++++++++--------------------------- include/linux/kvm_host.h | 10 ++++++++ 2 files changed, 26 insertions(+), 36 deletions(-) diff --git a/arch/x86/kvm/xen.c b/arch/x86/kvm/xen.c index 3ebde7ba5558..a2e88a76e8d9 100644 --- a/arch/x86/kvm/xen.c +++ b/arch/x86/kvm/xen.c @@ -666,7 +666,6 @@ int __kvm_xen_has_interrupt(struct kvm_vcpu *v) { struct gfn_to_pfn_cache *gpc = &v->arch.xen.vcpu_info_cache; bool atomic = in_atomic() || !task_is_running(current); - u8 rc = 0; /* * If the global upcall vector (HVMIRQ_callback_vector) is set and @@ -676,44 +675,25 @@ int __kvm_xen_has_interrupt(struct kvm_vcpu *v) /* No need for compat handling here */ BUILD_BUG_ON(offsetof(struct vcpu_info, evtchn_upcall_pending) != offsetof(struct compat_vcpu_info, evtchn_upcall_pending)); - BUILD_BUG_ON(sizeof(rc) != - sizeof_field(struct vcpu_info, evtchn_upcall_pending)); - BUILD_BUG_ON(sizeof(rc) != + BUILD_BUG_ON(sizeof_field(struct vcpu_info, evtchn_upcall_pending) != sizeof_field(struct compat_vcpu_info, evtchn_upcall_pending)); - if (atomic) { - if (!read_trylock(&gpc->lock)) - return 1; - } else { - read_lock(&gpc->lock); - } - while (!kvm_gpc_check(gpc, sizeof(struct vcpu_info))) { - read_unlock(&gpc->lock); + /* + * This function gets called from kvm_vcpu_block() after setting the + * task to TASK_INTERRUPTIBLE, to see if it needs to wake immediately + * from a HLT. So we really mustn't sleep. If the page ended up absent + * at that point, just return 1 in order to trigger an immediate wake, + * and we'll end up getting called again from a context where we *can* + * fault in the page and wait for it. + * + * For normal, non-atomic usage, nothing can be done if userspace has + * screwed up the vcpu_info mapping. No interrupts for you. + */ + CLASS(gpc_map_local_ro_ex, info_map)(gpc, sizeof(struct vcpu_info), atomic); + if (IS_ERR(info_map)) + return atomic ? 1 : 0; - /* - * This function gets called from kvm_vcpu_block() after setting the - * task to TASK_INTERRUPTIBLE, to see if it needs to wake immediately - * from a HLT. So we really mustn't sleep. If the page ended up absent - * at that point, just return 1 in order to trigger an immediate wake, - * and we'll end up getting called again from a context where we *can* - * fault in the page and wait for it. - */ - if (atomic) - return 1; - - if (kvm_gpc_refresh(gpc, sizeof(struct vcpu_info))) { - /* - * If this failed, userspace has screwed up the - * vcpu_info mapping. No interrupts for you. - */ - return 0; - } - read_lock(&gpc->lock); - } - - rc = ((struct vcpu_info *)gpc->khva)->evtchn_upcall_pending; - read_unlock(&gpc->lock); - return rc; + return ((struct vcpu_info *)*info_map)->evtchn_upcall_pending; } int kvm_xen_hvm_set_attr(struct kvm *kvm, struct kvm_xen_hvm_attr *data) diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index d70fa91cda0c..0602d0ca731c 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -1573,6 +1573,16 @@ DEFINE_GPC_CLASS(_try,); DEFINE_GPC_CLASS(, _ro); DEFINE_GPC_CLASS(_try, _ro); +#define DEFINE_GPC_CLASS_EX(ro) \ +DEFINE_CLASS(gpc_map_local##ro##_ex, void **, \ + if (!IS_ERR(_T)) gpc_map_local_unlock##ro(_T), \ + atomic ? gpc_try_map_local_lock(gpc, len) : \ + gpc_map_local_lock(gpc, len), \ + struct gfn_to_pfn_cache *gpc, unsigned long len, bool atomic) + +DEFINE_GPC_CLASS_EX(); +DEFINE_GPC_CLASS_EX(_ro); + void kvm_sigset_activate(struct kvm_vcpu *vcpu); void kvm_sigset_deactivate(struct kvm_vcpu *vcpu); -- 2.54.0.823.g6e5bcc1fc9-goog From: Carsten Stollmaier This largely reverts commit 7e2175ebd695 ("KVM: x86: Fix recording of guest steal time / preempted status"), which dropped the use of the gfn_to_pfn_cache because it was not integrated with the MMU notifiers at the time. That shortcoming has long since been addressed, making the GPC work correctly for this use case. Aside from cleaning up the last open-coded assembler access to user addresses and associated explicit asm exception fixups, moving back to the now-functional GPC also resolves an issue with contention on the mmap_lock with userfaultfd. The contention issue is as follows: On vcpu_run, before entering the guest, the update of the steal time information causes a page-fault if the page is not present. In our scenario, this gets handled by do_user_addr_fault() and successively handle_userfault() because the region is registered to that. Since handle_userfault() uses TASK_INTERRUPTIBLE, it is interruptible by signals. But do_user_addr_fault() then busy-retries if the pending signal is non-fatal, which leads to heavy contention of the mmap_lock. By restoring the use of GPC for accessing the guest steal time, the contention is avoided and refreshing the GPC happens when the vCPU is next scheduled. Since the gfn_to_pfn_cache gives a kernel mapping rather than a userspace HVA, accesses are now plain C instead of unsafe_put_user() et al. Use READ_ONCE()/WRITE_ONCE() to prevent the compiler from reordering or tearing the accesses, and add an smp_wmb() before the final version increment to ensure the data writes are ordered before the seqcount update — the old unsafe_put_user() inline assembly acted as an implicit compiler barrier. In kvm_steal_time_set_preempted(), use read_trylock() instead of read_lock_irqsave() since this is called from the scheduler path where rwlock_t is not safe on PREEMPT_RT (it becomes sleepable). Since we only trylock and bail on failure, there is no risk of deadlock with an interrupt handler, so no need to disable interrupts at all. Setting the preempted flag is best-effort anyway. Signed-off-by: Carsten Stollmaier Co-developed-by: David Woodhouse Signed-off-by: David Woodhouse Signed-off-by: Sean Christopherson --- arch/x86/include/asm/kvm_host.h | 2 +- arch/x86/kvm/x86.c | 122 ++++++++++++++------------------ 2 files changed, 54 insertions(+), 70 deletions(-) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 6ae7d539af90..9f652dcdda93 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -983,7 +983,7 @@ struct kvm_vcpu_arch { u8 preempted; u64 msr_val; u64 last_steal; - struct gfn_to_hva_cache cache; + struct gfn_to_pfn_cache cache; } st; u64 l1_tsc_offset; diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index ea10ed4ab06f..1b27dd9ba0aa 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -3728,10 +3728,8 @@ EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_service_local_tlb_flush_requests); static void record_steal_time(struct kvm_vcpu *vcpu) { - struct gfn_to_hva_cache *ghc = &vcpu->arch.st.cache; - struct kvm_steal_time __user *st; - struct kvm_memslots *slots; - gpa_t gpa = vcpu->arch.st.msr_val & KVM_STEAL_VALID_BITS; + struct gfn_to_pfn_cache *gpc = &vcpu->arch.st.cache; + struct kvm_steal_time *st; u64 steal; u32 version; @@ -3746,42 +3744,20 @@ static void record_steal_time(struct kvm_vcpu *vcpu) if (WARN_ON_ONCE(current->mm != vcpu->kvm->mm)) return; - slots = kvm_memslots(vcpu->kvm); + /* We rely on the fact that it fits in a single page. */ + BUILD_BUG_ON((sizeof(*st) - 1) & KVM_STEAL_VALID_BITS); - if (unlikely(slots->generation != ghc->generation || - gpa != ghc->gpa || - kvm_is_error_hva(ghc->hva) || !ghc->memslot)) { - /* We rely on the fact that it fits in a single page. */ - BUILD_BUG_ON((sizeof(*st) - 1) & KVM_STEAL_VALID_BITS); + CLASS(gpc_map_local, st_map)(gpc, sizeof(*st)); + if (IS_ERR(st_map)) + return; - if (kvm_gfn_to_hva_cache_init(vcpu->kvm, ghc, gpa, sizeof(*st)) || - kvm_is_error_hva(ghc->hva) || !ghc->memslot) - return; - } - - st = (struct kvm_steal_time __user *)ghc->hva; + st = *st_map; /* * Doing a TLB flush here, on the guest's behalf, can avoid * expensive IPIs. */ if (guest_pv_has(vcpu, KVM_FEATURE_PV_TLB_FLUSH)) { - u8 st_preempted = 0; - int err = -EFAULT; - - if (!user_access_begin(st, sizeof(*st))) - return; - - asm volatile("1: xchgb %0, %2\n" - "xor %1, %1\n" - "2:\n" - _ASM_EXTABLE_UA(1b, 2b) - : "+q" (st_preempted), - "+&r" (err), - "+m" (st->preempted)); - if (err) - goto out; - - user_access_end(); + u8 st_preempted = xchg(&st->preempted, 0); vcpu->arch.st.preempted = 0; @@ -3789,39 +3765,30 @@ static void record_steal_time(struct kvm_vcpu *vcpu) st_preempted & KVM_VCPU_FLUSH_TLB); if (st_preempted & KVM_VCPU_FLUSH_TLB) kvm_vcpu_flush_tlb_guest(vcpu); - - if (!user_access_begin(st, sizeof(*st))) - goto dirty; } else { - if (!user_access_begin(st, sizeof(*st))) - return; - - unsafe_put_user(0, &st->preempted, out); + WRITE_ONCE(st->preempted, 0); vcpu->arch.st.preempted = 0; } - unsafe_get_user(version, &st->version, out); + version = READ_ONCE(st->version); if (version & 1) version += 1; /* first time write, random junk */ version += 1; - unsafe_put_user(version, &st->version, out); + WRITE_ONCE(st->version, version); smp_wmb(); - unsafe_get_user(steal, &st->steal, out); + steal = READ_ONCE(st->steal); steal += current->sched_info.run_delay - vcpu->arch.st.last_steal; vcpu->arch.st.last_steal = current->sched_info.run_delay; - unsafe_put_user(steal, &st->steal, out); + WRITE_ONCE(st->steal, steal); + + smp_wmb(); version += 1; - unsafe_put_user(version, &st->version, out); - - out: - user_access_end(); - dirty: - mark_page_dirty_in_slot(vcpu->kvm, ghc->memslot, gpa_to_gfn(ghc->gpa)); + WRITE_ONCE(st->version, version); } /* @@ -4162,8 +4129,11 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) vcpu->arch.st.msr_val = data; - if (!(data & KVM_MSR_ENABLED)) - break; + if (data & KVM_MSR_ENABLED) + kvm_gpc_activate(&vcpu->arch.st.cache, data & ~KVM_MSR_ENABLED, + sizeof(struct kvm_steal_time)); + else + kvm_gpc_deactivate(&vcpu->arch.st.cache); kvm_make_request(KVM_REQ_STEAL_UPDATE, vcpu); @@ -5231,11 +5201,8 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu) static void kvm_steal_time_set_preempted(struct kvm_vcpu *vcpu) { - struct gfn_to_hva_cache *ghc = &vcpu->arch.st.cache; - struct kvm_steal_time __user *st; - struct kvm_memslots *slots; - static const u8 preempted = KVM_VCPU_PREEMPTED; - gpa_t gpa = vcpu->arch.st.msr_val & KVM_STEAL_VALID_BITS; + struct gfn_to_pfn_cache *gpc = &vcpu->arch.st.cache; + struct kvm_steal_time *st; /* * The vCPU can be marked preempted if and only if the VM-Exit was on @@ -5260,20 +5227,32 @@ static void kvm_steal_time_set_preempted(struct kvm_vcpu *vcpu) if (unlikely(current->mm != vcpu->kvm->mm)) return; - slots = kvm_memslots(vcpu->kvm); - - if (unlikely(slots->generation != ghc->generation || - gpa != ghc->gpa || - kvm_is_error_hva(ghc->hva) || !ghc->memslot)) + /* + * Use a trylock as this is called from the scheduler path (via + * kvm_sched_out), where rwlock_t is not safe on PREEMPT_RT (it + * becomes sleepable). Setting preempted is best-effort anyway; + * the old HVA-based code used copy_to_user_nofault() which could + * also silently fail. + * + * Since we only trylock and bail on failure, there is no risk of + * deadlock with an interrupt handler, so no need to disable + * interrupts. + */ + CLASS(gpc_try_map_local, st_map)(gpc, sizeof(st->preempted)); + if (IS_ERR(st_map)) return; - st = (struct kvm_steal_time __user *)ghc->hva; - BUILD_BUG_ON(sizeof(st->preempted) != sizeof(preempted)); + st = *st_map; + WRITE_ONCE(st->preempted, KVM_VCPU_PREEMPTED); + vcpu->arch.st.preempted = KVM_VCPU_PREEMPTED; +} - if (!copy_to_user_nofault(&st->preempted, &preempted, sizeof(preempted))) - vcpu->arch.st.preempted = KVM_VCPU_PREEMPTED; - - mark_page_dirty_in_slot(vcpu->kvm, ghc->memslot, gpa_to_gfn(ghc->gpa)); +static void kvm_steal_time_reset(struct kvm_vcpu *vcpu) +{ + kvm_gpc_deactivate(&vcpu->arch.st.cache); + vcpu->arch.st.preempted = 0; + vcpu->arch.st.msr_val = 0; + vcpu->arch.st.last_steal = 0; } void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu) @@ -12819,6 +12798,8 @@ int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu) kvm_gpc_init(&vcpu->arch.pv_time, vcpu->kvm); + kvm_gpc_init(&vcpu->arch.st.cache, vcpu->kvm); + if (!irqchip_in_kernel(vcpu->kvm) || kvm_vcpu_is_reset_bsp(vcpu)) kvm_set_mp_state(vcpu, KVM_MP_STATE_RUNNABLE); else @@ -12926,6 +12907,8 @@ void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu) kvm_clear_async_pf_completion_queue(vcpu); kvm_mmu_unload(vcpu); + kvm_steal_time_reset(vcpu); + kvmclock_reset(vcpu); for_each_possible_cpu(cpu) @@ -13046,7 +13029,8 @@ void kvm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) kvm_make_request(KVM_REQ_EVENT, vcpu); vcpu->arch.apf.msr_en_val = 0; vcpu->arch.apf.msr_int_val = 0; - vcpu->arch.st.msr_val = 0; + + kvm_steal_time_reset(vcpu); kvmclock_reset(vcpu); -- 2.54.0.823.g6e5bcc1fc9-goog