Skip the WRMSR and HLT fastpaths in SVM's VM-Exit handler if the next RIP isn't valid, e.g. because KVM is running with nrips=false. SVM must decode and emulate to skip the instruction if the CPU doesn't provide the next RIP, and getting the instruction bytes to decode requires reading guest memory. Reading guest memory through the emulator can fault, i.e. can sleep, which is disallowed since the fastpath handlers run with IRQs disabled. BUG: sleeping function called from invalid context at ./include/linux/uaccess.h:106 in_atomic(): 1, irqs_disabled(): 1, non_block: 0, pid: 32611, name: qemu preempt_count: 1, expected: 0 INFO: lockdep is turned off. irq event stamp: 30580 hardirqs last enabled at (30579): [] vcpu_run+0x1787/0x1db0 [kvm] hardirqs last disabled at (30580): [] __schedule+0x1e2/0xed0 softirqs last enabled at (30570): [] fpu_swap_kvm_fpstate+0x44/0x210 softirqs last disabled at (30568): [] fpu_swap_kvm_fpstate+0x44/0x210 CPU: 298 UID: 0 PID: 32611 Comm: qemu Tainted: G U 6.16.0-smp--e6c618b51cfe-sleep #782 NONE Tainted: [U]=USER Hardware name: Google Astoria-Turin/astoria, BIOS 0.20241223.2-0 01/17/2025 Call Trace: dump_stack_lvl+0x7d/0xb0 __might_resched+0x271/0x290 __might_fault+0x28/0x80 kvm_vcpu_read_guest_page+0x8d/0xc0 [kvm] kvm_fetch_guest_virt+0x92/0xc0 [kvm] __do_insn_fetch_bytes+0xf3/0x1e0 [kvm] x86_decode_insn+0xd1/0x1010 [kvm] x86_emulate_instruction+0x105/0x810 [kvm] __svm_skip_emulated_instruction+0xc4/0x140 [kvm_amd] handle_fastpath_invd+0xc4/0x1a0 [kvm] vcpu_run+0x11a1/0x1db0 [kvm] kvm_arch_vcpu_ioctl_run+0x5cc/0x730 [kvm] kvm_vcpu_ioctl+0x578/0x6a0 [kvm] __se_sys_ioctl+0x6d/0xb0 do_syscall_64+0x8a/0x2c0 entry_SYSCALL_64_after_hwframe+0x4b/0x53 RIP: 0033:0x7f479d57a94b Note, this is essentially a reapply of commit 5c30e8101e8d ("KVM: SVM: Skip WRMSR fastpath on VM-Exit if next RIP isn't valid"), but with different justification (KVM now grabs SRCU when skipping the instruction for other reasons). Fixes: b439eb8ab578 ("Revert "KVM: SVM: Skip WRMSR fastpath on VM-Exit if next RIP isn't valid"") Cc: stable@vger.kernel.org Signed-off-by: Sean Christopherson --- arch/x86/kvm/svm/svm.c | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index d9931c6c4bc6..829d9d46718d 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -4181,13 +4181,21 @@ static int svm_vcpu_pre_run(struct kvm_vcpu *vcpu) static fastpath_t svm_exit_handlers_fastpath(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm = to_svm(vcpu); + struct vmcb_control_area *control = &svm->vmcb->control; + + /* + * Next RIP must be provided as IRQs are disabled, and accessing guest + * memory to decode the instruction might fault, i.e. might sleep. + */ + if (!nrips || !control->next_rip) + return EXIT_FASTPATH_NONE; if (is_guest_mode(vcpu)) return EXIT_FASTPATH_NONE; - switch (svm->vmcb->control.exit_code) { + switch (control->exit_code) { case SVM_EXIT_MSR: - if (!svm->vmcb->control.exit_info_1) + if (!control->exit_info_1) break; return handle_fastpath_set_msr_irqoff(vcpu); case SVM_EXIT_HLT: -- 2.50.1.565.gc32cd1483b-goog Extract the code for converting an ICR message into a kvm_lapic_irq structure into a local helper so that a fast-only IPI path can share the conversion logic. No functional change intended. 
Signed-off-by: Sean Christopherson
---
 arch/x86/kvm/lapic.c | 30 ++++++++++++++++++------------
 1 file changed, 18 insertions(+), 12 deletions(-)

diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index 8172c2042dd6..9f9846980625 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -1481,24 +1481,30 @@ void kvm_apic_set_eoi_accelerated(struct kvm_vcpu *vcpu, int vector)
 }
 EXPORT_SYMBOL_GPL(kvm_apic_set_eoi_accelerated);
 
-void kvm_apic_send_ipi(struct kvm_lapic *apic, u32 icr_low, u32 icr_high)
+static void kvm_icr_to_lapic_irq(struct kvm_lapic *apic, u32 icr_low,
+				 u32 icr_high, struct kvm_lapic_irq *irq)
 {
-	struct kvm_lapic_irq irq;
-
 	/* KVM has no delay and should always clear the BUSY/PENDING flag. */
 	WARN_ON_ONCE(icr_low & APIC_ICR_BUSY);
 
-	irq.vector = icr_low & APIC_VECTOR_MASK;
-	irq.delivery_mode = icr_low & APIC_MODE_MASK;
-	irq.dest_mode = icr_low & APIC_DEST_MASK;
-	irq.level = (icr_low & APIC_INT_ASSERT) != 0;
-	irq.trig_mode = icr_low & APIC_INT_LEVELTRIG;
-	irq.shorthand = icr_low & APIC_SHORT_MASK;
-	irq.msi_redir_hint = false;
+	irq->vector = icr_low & APIC_VECTOR_MASK;
+	irq->delivery_mode = icr_low & APIC_MODE_MASK;
+	irq->dest_mode = icr_low & APIC_DEST_MASK;
+	irq->level = (icr_low & APIC_INT_ASSERT) != 0;
+	irq->trig_mode = icr_low & APIC_INT_LEVELTRIG;
+	irq->shorthand = icr_low & APIC_SHORT_MASK;
+	irq->msi_redir_hint = false;
 	if (apic_x2apic_mode(apic))
-		irq.dest_id = icr_high;
+		irq->dest_id = icr_high;
 	else
-		irq.dest_id = GET_XAPIC_DEST_FIELD(icr_high);
+		irq->dest_id = GET_XAPIC_DEST_FIELD(icr_high);
+}
+
+void kvm_apic_send_ipi(struct kvm_lapic *apic, u32 icr_low, u32 icr_high)
+{
+	struct kvm_lapic_irq irq;
+
+	kvm_icr_to_lapic_irq(apic, icr_low, icr_high, &irq);
 
 	trace_kvm_apic_ipi(icr_low, irq.dest_id);
 
-- 
2.50.1.565.gc32cd1483b-goog

Explicitly restrict fastpath ICR writes to IPIs that are "fast", i.e. can
be delivered without having to walk all vCPUs, and that target at most 16
vCPUs.  Artificially restricting ICR writes to physical mode guarantees at
most one vCPU will receive an IPI (because x2APIC IDs are read-only), but
that delivery might not be "fast".  E.g. even if the vCPU exists, KVM might
have to iterate over 4096 vCPUs to find the right one.

Limiting delivery to fast IPIs aligns the WRMSR fastpath with
kvm_arch_set_irq_inatomic() (which also runs with IRQs disabled), and will
allow dropping the semi-arbitrary restrictions on delivery mode and type.
Signed-off-by: Sean Christopherson
---
 arch/x86/kvm/lapic.c | 27 +++++++++++++++++++++++++--
 arch/x86/kvm/lapic.h |  2 +-
 arch/x86/kvm/x86.c   |  2 +-
 3 files changed, 27 insertions(+), 4 deletions(-)

diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index 9f9846980625..bd3232dd7a63 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -2439,7 +2439,7 @@ EXPORT_SYMBOL_GPL(kvm_lapic_set_eoi);
 
 #define X2APIC_ICR_RESERVED_BITS (GENMASK_ULL(31, 20) | GENMASK_ULL(17, 16) | BIT(13))
 
-int kvm_x2apic_icr_write(struct kvm_lapic *apic, u64 data)
+static int __kvm_x2apic_icr_write(struct kvm_lapic *apic, u64 data, bool fast)
 {
 	if (data & X2APIC_ICR_RESERVED_BITS)
 		return 1;
@@ -2454,7 +2454,20 @@ int kvm_x2apic_icr_write(struct kvm_lapic *apic, u64 data)
 	 */
 	data &= ~APIC_ICR_BUSY;
 
-	kvm_apic_send_ipi(apic, (u32)data, (u32)(data >> 32));
+	if (fast) {
+		struct kvm_lapic_irq irq;
+		int ignored;
+
+		kvm_icr_to_lapic_irq(apic, (u32)data, (u32)(data >> 32), &irq);
+
+		if (!kvm_irq_delivery_to_apic_fast(apic->vcpu->kvm, apic, &irq,
+						   &ignored, NULL))
+			return -EWOULDBLOCK;
+
+		trace_kvm_apic_ipi((u32)data, irq.dest_id);
+	} else {
+		kvm_apic_send_ipi(apic, (u32)data, (u32)(data >> 32));
+	}
 	if (kvm_x86_ops.x2apic_icr_is_split) {
 		kvm_lapic_set_reg(apic, APIC_ICR, data);
 		kvm_lapic_set_reg(apic, APIC_ICR2, data >> 32);
@@ -2465,6 +2478,16 @@ int kvm_x2apic_icr_write(struct kvm_lapic *apic, u64 data)
 	return 0;
 }
 
+static int kvm_x2apic_icr_write(struct kvm_lapic *apic, u64 data)
+{
+	return __kvm_x2apic_icr_write(apic, data, false);
+}
+
+int kvm_x2apic_icr_write_fast(struct kvm_lapic *apic, u64 data)
+{
+	return __kvm_x2apic_icr_write(apic, data, true);
+}
+
 static u64 kvm_x2apic_icr_read(struct kvm_lapic *apic)
 {
 	if (kvm_x86_ops.x2apic_icr_is_split)
diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h
index 72de14527698..1b2d408816aa 100644
--- a/arch/x86/kvm/lapic.h
+++ b/arch/x86/kvm/lapic.h
@@ -137,7 +137,7 @@ int kvm_lapic_set_vapic_addr(struct kvm_vcpu *vcpu, gpa_t vapic_addr);
 void kvm_lapic_sync_from_vapic(struct kvm_vcpu *vcpu);
 void kvm_lapic_sync_to_vapic(struct kvm_vcpu *vcpu);
 
-int kvm_x2apic_icr_write(struct kvm_lapic *apic, u64 data);
+int kvm_x2apic_icr_write_fast(struct kvm_lapic *apic, u64 data);
 int kvm_x2apic_msr_write(struct kvm_vcpu *vcpu, u32 msr, u64 data);
 int kvm_x2apic_msr_read(struct kvm_vcpu *vcpu, u32 msr, u64 *data);
 
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index a1c49bc681c4..8c8b7d7902a0 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -2149,7 +2149,7 @@ static int handle_fastpath_set_x2apic_icr_irqoff(struct kvm_vcpu *vcpu, u64 data
 	    ((data & APIC_DEST_MASK) == APIC_DEST_PHYSICAL) &&
 	    ((data & APIC_MODE_MASK) == APIC_DM_FIXED) &&
 	    ((u32)(data >> 32) != X2APIC_BROADCAST))
-		return kvm_x2apic_icr_write(vcpu->arch.apic, data);
+		return kvm_x2apic_icr_write_fast(vcpu->arch.apic, data);
 
 	return 1;
 }
-- 
2.50.1.565.gc32cd1483b-goog

Drop the restrictions on fastpath IPIs only working for fixed IRQs with a
physical destination now that the fastpath is explicitly limited to "fast"
delivery.

Limiting delivery to a single physical APIC ID guarantees only one vCPU
will receive the event, but that isn't necessarily "fast", e.g. if the
targeted vCPU is the last of 4096 vCPUs.  And logical destination mode or
shorthand (to self) can also be fast, e.g. if only a few vCPUs are being
targeted.

Lastly, there's nothing inherently slow about delivering an NMI, INIT,
SIPI, SMI, etc., i.e. there's no reason to artificially limit fastpath
delivery to fixed vector IRQs.
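For illustration only (these examples are not part of the patch), a couple
of x2APIC ICR values that become fastpath-eligible once the physical/fixed
restriction is gone, using the existing apicdef.h encodings:

	/* Fixed vector 0x20 to self via shorthand: one target, trivially "fast". */
	u64 self_ipi = APIC_DEST_SELF | APIC_DM_FIXED | 0x20;

	/* NMI to a single vCPU addressed by x2APIC logical ID (one bit, cluster 0). */
	u64 logical_nmi = ((u64)0x1 << 32) | APIC_DEST_LOGICAL | APIC_DM_NMI;

Whether such a write actually stays in the fastpath is still decided at
runtime by kvm_irq_delivery_to_apic_fast().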
Signed-off-by: Sean Christopherson --- arch/x86/kvm/x86.c | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 8c8b7d7902a0..ea117c4b20c8 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -2145,13 +2145,7 @@ static int handle_fastpath_set_x2apic_icr_irqoff(struct kvm_vcpu *vcpu, u64 data if (!lapic_in_kernel(vcpu) || !apic_x2apic_mode(vcpu->arch.apic)) return 1; - if (((data & APIC_SHORT_MASK) == APIC_DEST_NOSHORT) && - ((data & APIC_DEST_MASK) == APIC_DEST_PHYSICAL) && - ((data & APIC_MODE_MASK) == APIC_DM_FIXED) && - ((u32)(data >> 32) != X2APIC_BROADCAST)) - return kvm_x2apic_icr_write_fast(vcpu->arch.apic, data); - - return 1; + return kvm_x2apic_icr_write_fast(vcpu->arch.apic, data); } static int handle_fastpath_set_tscdeadline(struct kvm_vcpu *vcpu, u64 data) -- 2.50.1.565.gc32cd1483b-goog Drop the fastpath VM-Exit requirement that KVM can use the hypervisor timer to emulate the APIC timer in TSC deadline mode. I.e. unconditionally handle MSR_IA32_TSC_DEADLINE WRMSRs in the fastpath. Restricting the fastpath to *maybe* using the VMX preemption timer is ineffective and unnecessary. If the requested deadline can't be programmed into the VMX preemption timer, KVM will fall back to hrtimers, i.e. the restriction is ineffective as far as preventing any kind of worst case scenario. But guarding against a worst case scenario is completely unnecessary as the "slow" path, start_sw_tscdeadline() => hrtimer_start(), explicitly disables IRQs. In fact, the worst case scenario is when KVM thinks it can use the VMX preemption timer, as KVM will eat the overhead of calling into vmx_set_hv_timer() and falling back to hrtimers. Opportunistically limit kvm_can_use_hv_timer() to lapic.c as the fastpath code was the only external user. Stating the obvious, this allows handling MSR_IA32_TSC_DEADLINE writes in the fastpath on AMD CPUs. 
Signed-off-by: Sean Christopherson
---
 arch/x86/kvm/lapic.c | 2 +-
 arch/x86/kvm/lapic.h | 1 -
 arch/x86/kvm/x86.c   | 3 ---
 3 files changed, 1 insertion(+), 5 deletions(-)

diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index bd3232dd7a63..e19545b8cc98 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -130,7 +130,7 @@ static bool kvm_can_post_timer_interrupt(struct kvm_vcpu *vcpu)
 		(kvm_mwait_in_guest(vcpu->kvm) || kvm_hlt_in_guest(vcpu->kvm));
 }
 
-bool kvm_can_use_hv_timer(struct kvm_vcpu *vcpu)
+static bool kvm_can_use_hv_timer(struct kvm_vcpu *vcpu)
 {
 	return kvm_x86_ops.set_hv_timer
 	       && !(kvm_mwait_in_guest(vcpu->kvm) ||
diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h
index 1b2d408816aa..8b00e29741de 100644
--- a/arch/x86/kvm/lapic.h
+++ b/arch/x86/kvm/lapic.h
@@ -249,7 +249,6 @@ void kvm_lapic_switch_to_hv_timer(struct kvm_vcpu *vcpu);
 void kvm_lapic_expired_hv_timer(struct kvm_vcpu *vcpu);
 bool kvm_lapic_hv_timer_in_use(struct kvm_vcpu *vcpu);
 void kvm_lapic_restart_hv_timer(struct kvm_vcpu *vcpu);
-bool kvm_can_use_hv_timer(struct kvm_vcpu *vcpu);
 
 static inline enum lapic_mode kvm_apic_mode(u64 apic_base)
 {
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index ea117c4b20c8..63ca9185d133 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -2150,9 +2150,6 @@ static int handle_fastpath_set_x2apic_icr_irqoff(struct kvm_vcpu *vcpu, u64 data
 
 static int handle_fastpath_set_tscdeadline(struct kvm_vcpu *vcpu, u64 data)
 {
-	if (!kvm_can_use_hv_timer(vcpu))
-		return 1;
-
 	kvm_set_lapic_tscdeadline_msr(vcpu, data);
 	return 0;
 }
-- 
2.50.1.565.gc32cd1483b-goog

Acquire SRCU in the WRMSR fastpath if and only if an instruction needs to
be skipped, i.e. only if the fastpath succeeds.  The reasoning in commit
3f2739bd1e0b ("KVM: x86: Acquire SRCU read lock when handling fastpath MSR
writes") about "avoid having to play whack-a-mole" seems sound, but in
hindsight unconditionally acquiring SRCU does more harm than good.

While acquiring/releasing SRCU isn't slow per se, the things that are
_protected_ by kvm->srcu are generally safe to access only in the "slow"
VM-Exit path.  E.g. accessing memslots in generic helpers is never safe,
because accessing guest memory with IRQs disabled is unsafe (except when
kvm_vcpu_read_guest_atomic() is used, but that API should never be used in
emulation helpers).

In other words, playing whack-a-mole is actually desirable in this case,
because every access to an asset protected by kvm->srcu warrants further
scrutiny.
Signed-off-by: Sean Christopherson --- arch/x86/kvm/x86.c | 21 ++++++++------------- 1 file changed, 8 insertions(+), 13 deletions(-) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 63ca9185d133..69c668f4d2b6 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -2158,10 +2158,8 @@ fastpath_t handle_fastpath_set_msr_irqoff(struct kvm_vcpu *vcpu) { u32 msr = kvm_rcx_read(vcpu); u64 data; - fastpath_t ret; bool handled; - - kvm_vcpu_srcu_read_lock(vcpu); + int r; switch (msr) { case APIC_BASE_MSR + (APIC_ICR >> 4): @@ -2177,19 +2175,16 @@ fastpath_t handle_fastpath_set_msr_irqoff(struct kvm_vcpu *vcpu) break; } - if (handled) { - if (!kvm_skip_emulated_instruction(vcpu)) - ret = EXIT_FASTPATH_EXIT_USERSPACE; - else - ret = EXIT_FASTPATH_REENTER_GUEST; - trace_kvm_msr_write(msr, data); - } else { - ret = EXIT_FASTPATH_NONE; - } + if (!handled) + return EXIT_FASTPATH_NONE; + kvm_vcpu_srcu_read_lock(vcpu); + r = kvm_skip_emulated_instruction(vcpu); kvm_vcpu_srcu_read_unlock(vcpu); - return ret; + trace_kvm_msr_write(msr, data); + + return r ? EXIT_FASTPATH_REENTER_GUEST : EXIT_FASTPATH_EXIT_USERSPACE; } EXPORT_SYMBOL_GPL(handle_fastpath_set_msr_irqoff); -- 2.50.1.565.gc32cd1483b-goog Always grab EDX:EAX in the WRMSR fastpath to deduplicate and simplify the case statements, and to prepare for handling immediate variants of WRMSRNS in the fastpath (the data register is explicitly provided in that case). There's no harm in reading the registers, as their values are always available, i.e. don't require VMREADs (or similarly slow operations). No real functional change intended. Cc: Xin Li Signed-off-by: Sean Christopherson --- arch/x86/kvm/x86.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 69c668f4d2b6..e6c221f9b92e 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -2156,18 +2156,16 @@ static int handle_fastpath_set_tscdeadline(struct kvm_vcpu *vcpu, u64 data) fastpath_t handle_fastpath_set_msr_irqoff(struct kvm_vcpu *vcpu) { + u64 data = kvm_read_edx_eax(vcpu); u32 msr = kvm_rcx_read(vcpu); - u64 data; bool handled; int r; switch (msr) { case APIC_BASE_MSR + (APIC_ICR >> 4): - data = kvm_read_edx_eax(vcpu); handled = !handle_fastpath_set_x2apic_icr_irqoff(vcpu, data); break; case MSR_IA32_TSC_DEADLINE: - data = kvm_read_edx_eax(vcpu); handled = !handle_fastpath_set_tscdeadline(vcpu, data); break; default: -- 2.50.1.565.gc32cd1483b-goog Fold the per-MSR WRMSR fastpath helpers into the main handler now that the IPI path in particular is relatively tiny. In addition to eliminating a decent amount of boilerplate, this removes the ugly -errno/1/0 => bool conversion (which is "necessitated" by kvm_x2apic_icr_write_fast()). Opportunistically drop the comment about IPIs, as the purpose of the fastpath is hopefully self-evident, and _if_ it needs more documentation, the documentation (and rules!) should be placed in a more central location. No functional change intended. Signed-off-by: Sean Christopherson --- arch/x86/kvm/x86.c | 34 +++++----------------------------- 1 file changed, 5 insertions(+), 29 deletions(-) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index e6c221f9b92e..a4441f036929 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -2133,48 +2133,24 @@ static inline bool kvm_vcpu_exit_request(struct kvm_vcpu *vcpu) kvm_request_pending(vcpu) || xfer_to_guest_mode_work_pending(); } -/* - * The fast path for frequent and performance sensitive wrmsr emulation, - * i.e. 
the sending of IPI, sending IPI early in the VM-Exit flow reduces
- * the latency of virtual IPI by avoiding the expensive bits of transitioning
- * from guest to host, e.g. reacquiring KVM's SRCU lock. In contrast to the
- * other cases which must be called after interrupts are enabled on the host.
- */
-static int handle_fastpath_set_x2apic_icr_irqoff(struct kvm_vcpu *vcpu, u64 data)
-{
-	if (!lapic_in_kernel(vcpu) || !apic_x2apic_mode(vcpu->arch.apic))
-		return 1;
-
-	return kvm_x2apic_icr_write_fast(vcpu->arch.apic, data);
-}
-
-static int handle_fastpath_set_tscdeadline(struct kvm_vcpu *vcpu, u64 data)
-{
-	kvm_set_lapic_tscdeadline_msr(vcpu, data);
-	return 0;
-}
-
 fastpath_t handle_fastpath_set_msr_irqoff(struct kvm_vcpu *vcpu)
 {
 	u64 data = kvm_read_edx_eax(vcpu);
 	u32 msr = kvm_rcx_read(vcpu);
-	bool handled;
 	int r;
 
 	switch (msr) {
 	case APIC_BASE_MSR + (APIC_ICR >> 4):
-		handled = !handle_fastpath_set_x2apic_icr_irqoff(vcpu, data);
+		if (!lapic_in_kernel(vcpu) || !apic_x2apic_mode(vcpu->arch.apic) ||
+		    kvm_x2apic_icr_write_fast(vcpu->arch.apic, data))
+			return EXIT_FASTPATH_NONE;
 		break;
 	case MSR_IA32_TSC_DEADLINE:
-		handled = !handle_fastpath_set_tscdeadline(vcpu, data);
+		kvm_set_lapic_tscdeadline_msr(vcpu, data);
 		break;
 	default:
-		handled = false;
-		break;
-	}
-
-	if (!handled)
 		return EXIT_FASTPATH_NONE;
+	}
 
 	kvm_vcpu_srcu_read_lock(vcpu);
 	r = kvm_skip_emulated_instruction(vcpu);
-- 
2.50.1.565.gc32cd1483b-goog

Move kvm_init_pmu_capability() to pmu.c so that future changes can access
variables that have no business being visible outside of pmu.c.
kvm_init_pmu_capability() is called once per module load, there's zero
reason it needs to be inlined.

No functional change intended.

Cc: Dapeng Mi
Cc: Sandipan Das
Signed-off-by: Sean Christopherson
---
 arch/x86/kvm/pmu.c | 47 ++++++++++++++++++++++++++++++++++++++++++++++
 arch/x86/kvm/pmu.h | 47 +---------------------------------------------
 2 files changed, 48 insertions(+), 46 deletions(-)

diff --git a/arch/x86/kvm/pmu.c b/arch/x86/kvm/pmu.c
index 75e9cfc689f8..eb17d90916ea 100644
--- a/arch/x86/kvm/pmu.c
+++ b/arch/x86/kvm/pmu.c
@@ -96,6 +96,53 @@ void kvm_pmu_ops_update(const struct kvm_pmu_ops *pmu_ops)
 #undef __KVM_X86_PMU_OP
 }
 
+void kvm_init_pmu_capability(const struct kvm_pmu_ops *pmu_ops)
+{
+	bool is_intel = boot_cpu_data.x86_vendor == X86_VENDOR_INTEL;
+	int min_nr_gp_ctrs = pmu_ops->MIN_NR_GP_COUNTERS;
+
+	/*
+	 * Hybrid PMUs don't play nice with virtualization without careful
+	 * configuration by userspace, and KVM's APIs for reporting supported
+	 * vPMU features do not account for hybrid PMUs. Disable vPMU support
+	 * for hybrid PMUs until KVM gains a way to let userspace opt-in.
+	 */
+	if (cpu_feature_enabled(X86_FEATURE_HYBRID_CPU))
+		enable_pmu = false;
+
+	if (enable_pmu) {
+		perf_get_x86_pmu_capability(&kvm_pmu_cap);
+
+		/*
+		 * WARN if perf did NOT disable hardware PMU if the number of
+		 * architecturally required GP counters aren't present, i.e. if
+		 * there are a non-zero number of counters, but fewer than what
+		 * is architecturally required.
+ */ + if (!kvm_pmu_cap.num_counters_gp || + WARN_ON_ONCE(kvm_pmu_cap.num_counters_gp < min_nr_gp_ctrs)) + enable_pmu = false; + else if (is_intel && !kvm_pmu_cap.version) + enable_pmu = false; + } + + if (!enable_pmu) { + memset(&kvm_pmu_cap, 0, sizeof(kvm_pmu_cap)); + return; + } + + kvm_pmu_cap.version = min(kvm_pmu_cap.version, 2); + kvm_pmu_cap.num_counters_gp = min(kvm_pmu_cap.num_counters_gp, + pmu_ops->MAX_NR_GP_COUNTERS); + kvm_pmu_cap.num_counters_fixed = min(kvm_pmu_cap.num_counters_fixed, + KVM_MAX_NR_FIXED_COUNTERS); + + kvm_pmu_eventsel.INSTRUCTIONS_RETIRED = + perf_get_hw_event_config(PERF_COUNT_HW_INSTRUCTIONS); + kvm_pmu_eventsel.BRANCH_INSTRUCTIONS_RETIRED = + perf_get_hw_event_config(PERF_COUNT_HW_BRANCH_INSTRUCTIONS); +} + static inline void __kvm_perf_overflow(struct kvm_pmc *pmc, bool in_pmi) { struct kvm_pmu *pmu = pmc_to_pmu(pmc); diff --git a/arch/x86/kvm/pmu.h b/arch/x86/kvm/pmu.h index ad89d0bd6005..13477066eb40 100644 --- a/arch/x86/kvm/pmu.h +++ b/arch/x86/kvm/pmu.h @@ -180,52 +180,7 @@ static inline bool pmc_speculative_in_use(struct kvm_pmc *pmc) extern struct x86_pmu_capability kvm_pmu_cap; extern struct kvm_pmu_emulated_event_selectors kvm_pmu_eventsel; -static inline void kvm_init_pmu_capability(const struct kvm_pmu_ops *pmu_ops) -{ - bool is_intel = boot_cpu_data.x86_vendor == X86_VENDOR_INTEL; - int min_nr_gp_ctrs = pmu_ops->MIN_NR_GP_COUNTERS; - - /* - * Hybrid PMUs don't play nice with virtualization without careful - * configuration by userspace, and KVM's APIs for reporting supported - * vPMU features do not account for hybrid PMUs. Disable vPMU support - * for hybrid PMUs until KVM gains a way to let userspace opt-in. - */ - if (cpu_feature_enabled(X86_FEATURE_HYBRID_CPU)) - enable_pmu = false; - - if (enable_pmu) { - perf_get_x86_pmu_capability(&kvm_pmu_cap); - - /* - * WARN if perf did NOT disable hardware PMU if the number of - * architecturally required GP counters aren't present, i.e. if - * there are a non-zero number of counters, but fewer than what - * is architecturally required. - */ - if (!kvm_pmu_cap.num_counters_gp || - WARN_ON_ONCE(kvm_pmu_cap.num_counters_gp < min_nr_gp_ctrs)) - enable_pmu = false; - else if (is_intel && !kvm_pmu_cap.version) - enable_pmu = false; - } - - if (!enable_pmu) { - memset(&kvm_pmu_cap, 0, sizeof(kvm_pmu_cap)); - return; - } - - kvm_pmu_cap.version = min(kvm_pmu_cap.version, 2); - kvm_pmu_cap.num_counters_gp = min(kvm_pmu_cap.num_counters_gp, - pmu_ops->MAX_NR_GP_COUNTERS); - kvm_pmu_cap.num_counters_fixed = min(kvm_pmu_cap.num_counters_fixed, - KVM_MAX_NR_FIXED_COUNTERS); - - kvm_pmu_eventsel.INSTRUCTIONS_RETIRED = - perf_get_hw_event_config(PERF_COUNT_HW_INSTRUCTIONS); - kvm_pmu_eventsel.BRANCH_INSTRUCTIONS_RETIRED = - perf_get_hw_event_config(PERF_COUNT_HW_BRANCH_INSTRUCTIONS); -} +void kvm_init_pmu_capability(const struct kvm_pmu_ops *pmu_ops); static inline void kvm_pmu_request_counter_reprogram(struct kvm_pmc *pmc) { -- 2.50.1.565.gc32cd1483b-goog Add wrappers for triggering instruction retired and branch retired PMU events in anticipation of reworking the internal mechanisms to track which PMCs need to be evaluated, e.g. to avoid having to walk and check every PMC. Opportunistically bury "struct kvm_pmu_emulated_event_selectors" in pmu.c. No functional change intended. 
Signed-off-by: Sean Christopherson --- arch/x86/kvm/pmu.c | 22 ++++++++++++++++++---- arch/x86/kvm/pmu.h | 9 ++------- arch/x86/kvm/vmx/nested.c | 2 +- arch/x86/kvm/x86.c | 6 +++--- 4 files changed, 24 insertions(+), 15 deletions(-) diff --git a/arch/x86/kvm/pmu.c b/arch/x86/kvm/pmu.c index eb17d90916ea..e1911b366c43 100644 --- a/arch/x86/kvm/pmu.c +++ b/arch/x86/kvm/pmu.c @@ -29,8 +29,11 @@ struct x86_pmu_capability __read_mostly kvm_pmu_cap; EXPORT_SYMBOL_GPL(kvm_pmu_cap); -struct kvm_pmu_emulated_event_selectors __read_mostly kvm_pmu_eventsel; -EXPORT_SYMBOL_GPL(kvm_pmu_eventsel); +struct kvm_pmu_emulated_event_selectors { + u64 INSTRUCTIONS_RETIRED; + u64 BRANCH_INSTRUCTIONS_RETIRED; +}; +static struct kvm_pmu_emulated_event_selectors __read_mostly kvm_pmu_eventsel; /* Precise Distribution of Instructions Retired (PDIR) */ static const struct x86_cpu_id vmx_pebs_pdir_cpu[] = { @@ -907,7 +910,7 @@ static inline bool cpl_is_matched(struct kvm_pmc *pmc) select_user; } -void kvm_pmu_trigger_event(struct kvm_vcpu *vcpu, u64 eventsel) +static void kvm_pmu_trigger_event(struct kvm_vcpu *vcpu, u64 eventsel) { DECLARE_BITMAP(bitmap, X86_PMC_IDX_MAX); struct kvm_pmu *pmu = vcpu_to_pmu(vcpu); @@ -944,7 +947,18 @@ void kvm_pmu_trigger_event(struct kvm_vcpu *vcpu, u64 eventsel) kvm_pmu_incr_counter(pmc); } } -EXPORT_SYMBOL_GPL(kvm_pmu_trigger_event); + +void kvm_pmu_instruction_retired(struct kvm_vcpu *vcpu) +{ + kvm_pmu_trigger_event(vcpu, kvm_pmu_eventsel.INSTRUCTIONS_RETIRED); +} +EXPORT_SYMBOL_GPL(kvm_pmu_instruction_retired); + +void kvm_pmu_branch_retired(struct kvm_vcpu *vcpu) +{ + kvm_pmu_trigger_event(vcpu, kvm_pmu_eventsel.BRANCH_INSTRUCTIONS_RETIRED); +} +EXPORT_SYMBOL_GPL(kvm_pmu_branch_retired); static bool is_masked_filter_valid(const struct kvm_x86_pmu_event_filter *filter) { diff --git a/arch/x86/kvm/pmu.h b/arch/x86/kvm/pmu.h index 13477066eb40..740af816af37 100644 --- a/arch/x86/kvm/pmu.h +++ b/arch/x86/kvm/pmu.h @@ -23,11 +23,6 @@ #define KVM_FIXED_PMC_BASE_IDX INTEL_PMC_IDX_FIXED -struct kvm_pmu_emulated_event_selectors { - u64 INSTRUCTIONS_RETIRED; - u64 BRANCH_INSTRUCTIONS_RETIRED; -}; - struct kvm_pmu_ops { struct kvm_pmc *(*rdpmc_ecx_to_pmc)(struct kvm_vcpu *vcpu, unsigned int idx, u64 *mask); @@ -178,7 +173,6 @@ static inline bool pmc_speculative_in_use(struct kvm_pmc *pmc) } extern struct x86_pmu_capability kvm_pmu_cap; -extern struct kvm_pmu_emulated_event_selectors kvm_pmu_eventsel; void kvm_init_pmu_capability(const struct kvm_pmu_ops *pmu_ops); @@ -227,7 +221,8 @@ void kvm_pmu_init(struct kvm_vcpu *vcpu); void kvm_pmu_cleanup(struct kvm_vcpu *vcpu); void kvm_pmu_destroy(struct kvm_vcpu *vcpu); int kvm_vm_ioctl_set_pmu_event_filter(struct kvm *kvm, void __user *argp); -void kvm_pmu_trigger_event(struct kvm_vcpu *vcpu, u64 eventsel); +void kvm_pmu_instruction_retired(struct kvm_vcpu *vcpu); +void kvm_pmu_branch_retired(struct kvm_vcpu *vcpu); bool is_vmware_backdoor_pmc(u32 pmc_idx); diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index b8ea1969113d..db2fd4eedc90 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -3690,7 +3690,7 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch) return 1; } - kvm_pmu_trigger_event(vcpu, kvm_pmu_eventsel.BRANCH_INSTRUCTIONS_RETIRED); + kvm_pmu_branch_retired(vcpu); if (CC(evmptrld_status == EVMPTRLD_VMFAIL)) return nested_vmx_failInvalid(vcpu); diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index a4441f036929..f2b2eaaec6f8 100644 --- a/arch/x86/kvm/x86.c +++ 
b/arch/x86/kvm/x86.c
@@ -8824,7 +8824,7 @@ int kvm_skip_emulated_instruction(struct kvm_vcpu *vcpu)
 	if (unlikely(!r))
 		return 0;
 
-	kvm_pmu_trigger_event(vcpu, kvm_pmu_eventsel.INSTRUCTIONS_RETIRED);
+	kvm_pmu_instruction_retired(vcpu);
 
 	/*
 	 * rflags is the old, "raw" value of the flags. The new value has
@@ -9158,9 +9158,9 @@ int x86_emulate_instruction(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
 		 */
 		if (!ctxt->have_exception ||
 		    exception_type(ctxt->exception.vector) == EXCPT_TRAP) {
-			kvm_pmu_trigger_event(vcpu, kvm_pmu_eventsel.INSTRUCTIONS_RETIRED);
+			kvm_pmu_instruction_retired(vcpu);
 			if (ctxt->is_branch)
-				kvm_pmu_trigger_event(vcpu, kvm_pmu_eventsel.BRANCH_INSTRUCTIONS_RETIRED);
+				kvm_pmu_branch_retired(vcpu);
 			kvm_rip_write(vcpu, ctxt->eip);
 			if (r && (ctxt->tf || (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP)))
 				r = kvm_vcpu_do_singlestep(vcpu);
-- 
2.50.1.565.gc32cd1483b-goog

Calculate and track PMCs that are counting instructions/branches retired
when the PMC's event selector (or fixed counter control) is modified
instead of evaluating the event selector on-demand.  Immediately recalc a
PMC's configuration on writes to avoid false negatives/positives when KVM
skips an emulated WRMSR, which is guaranteed to occur before the main run
loop processes KVM_REQ_PMU.

Out of an abundance of caution, and because it's relatively cheap, recalc
reprogrammed PMCs in kvm_pmu_handle_event() as well.  Recalculating in
response to KVM_REQ_PMU _should_ be unnecessary, but for now be paranoid
to avoid introducing easily-avoidable bugs in edge cases.  The code can be
removed in the future if necessary, e.g. in the unlikely event that the
overhead of recalculating to-be-emulated PMCs is noticeable.

Note!  Deliberately don't check the PMU event filters, as doing so could
result in KVM consuming stale information.  Tracking which PMCs are
counting branches/instructions will allow grabbing SRCU in the fastpath
VM-Exit handlers if and only if a PMC event might be triggered (to consult
the event filters), and will also allow the upcoming mediated PMU to do
the right thing with respect to counting instructions (the mediated PMU
won't be able to update PMCs in the VM-Exit fastpath).

Signed-off-by: Sean Christopherson
---
 arch/x86/include/asm/kvm_host.h |  3 ++
 arch/x86/kvm/pmu.c              | 75 ++++++++++++++++++++++++---------
 arch/x86/kvm/pmu.h              |  4 ++
 3 files changed, 61 insertions(+), 21 deletions(-)

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index f19a76d3ca0e..d7680612ba1e 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -579,6 +579,9 @@ struct kvm_pmu {
 	DECLARE_BITMAP(all_valid_pmc_idx, X86_PMC_IDX_MAX);
 	DECLARE_BITMAP(pmc_in_use, X86_PMC_IDX_MAX);
 
+	DECLARE_BITMAP(pmc_counting_instructions, X86_PMC_IDX_MAX);
+	DECLARE_BITMAP(pmc_counting_branches, X86_PMC_IDX_MAX);
+
 	u64 ds_area;
 	u64 pebs_enable;
 	u64 pebs_enable_rsvd;
diff --git a/arch/x86/kvm/pmu.c b/arch/x86/kvm/pmu.c
index e1911b366c43..b0f0275a2c2e 100644
--- a/arch/x86/kvm/pmu.c
+++ b/arch/x86/kvm/pmu.c
@@ -542,6 +542,47 @@ static int reprogram_counter(struct kvm_pmc *pmc)
 					  eventsel & ARCH_PERFMON_EVENTSEL_INT);
 }
 
+static bool pmc_is_event_match(struct kvm_pmc *pmc, u64 eventsel)
+{
+	/*
+	 * Ignore checks for edge detect (all events currently emulated by KVM
+	 * are always rising edges), pin control (unsupported by modern CPUs),
+	 * and counter mask and its invert flag (KVM doesn't emulate multiple
+	 * events in a single clock cycle).
+ * + * Note, the uppermost nibble of AMD's mask overlaps Intel's IN_TX (bit + * 32) and IN_TXCP (bit 33), as well as two reserved bits (bits 35:34). + * Checking the "in HLE/RTM transaction" flags is correct as the vCPU + * can't be in a transaction if KVM is emulating an instruction. + * + * Checking the reserved bits might be wrong if they are defined in the + * future, but so could ignoring them, so do the simple thing for now. + */ + return !((pmc->eventsel ^ eventsel) & AMD64_RAW_EVENT_MASK_NB); +} + +void kvm_pmu_recalc_pmc_emulation(struct kvm_pmu *pmu, struct kvm_pmc *pmc) +{ + bitmap_clear(pmu->pmc_counting_instructions, pmc->idx, 1); + bitmap_clear(pmu->pmc_counting_branches, pmc->idx, 1); + + /* + * Do NOT consult the PMU event filters, as the filters must be checked + * at the time of emulation to ensure KVM uses fresh information, e.g. + * omitting a PMC from a bitmap could result in a missed event if the + * filter is changed to allow counting the event. + */ + if (!pmc_speculative_in_use(pmc)) + return; + + if (pmc_is_event_match(pmc, kvm_pmu_eventsel.INSTRUCTIONS_RETIRED)) + bitmap_set(pmu->pmc_counting_instructions, pmc->idx, 1); + + if (pmc_is_event_match(pmc, kvm_pmu_eventsel.BRANCH_INSTRUCTIONS_RETIRED)) + bitmap_set(pmu->pmc_counting_branches, pmc->idx, 1); +} +EXPORT_SYMBOL_GPL(kvm_pmu_recalc_pmc_emulation); + void kvm_pmu_handle_event(struct kvm_vcpu *vcpu) { DECLARE_BITMAP(bitmap, X86_PMC_IDX_MAX); @@ -577,6 +618,9 @@ void kvm_pmu_handle_event(struct kvm_vcpu *vcpu) */ if (unlikely(pmu->need_cleanup)) kvm_pmu_cleanup(vcpu); + + kvm_for_each_pmc(pmu, pmc, bit, bitmap) + kvm_pmu_recalc_pmc_emulation(pmu, pmc); } int kvm_pmu_check_rdpmc_early(struct kvm_vcpu *vcpu, unsigned int idx) @@ -910,7 +954,8 @@ static inline bool cpl_is_matched(struct kvm_pmc *pmc) select_user; } -static void kvm_pmu_trigger_event(struct kvm_vcpu *vcpu, u64 eventsel) +static void kvm_pmu_trigger_event(struct kvm_vcpu *vcpu, + const unsigned long *event_pmcs) { DECLARE_BITMAP(bitmap, X86_PMC_IDX_MAX); struct kvm_pmu *pmu = vcpu_to_pmu(vcpu); @@ -919,29 +964,17 @@ static void kvm_pmu_trigger_event(struct kvm_vcpu *vcpu, u64 eventsel) BUILD_BUG_ON(sizeof(pmu->global_ctrl) * BITS_PER_BYTE != X86_PMC_IDX_MAX); + if (bitmap_empty(event_pmcs, X86_PMC_IDX_MAX)) + return; + if (!kvm_pmu_has_perf_global_ctrl(pmu)) - bitmap_copy(bitmap, pmu->all_valid_pmc_idx, X86_PMC_IDX_MAX); - else if (!bitmap_and(bitmap, pmu->all_valid_pmc_idx, + bitmap_copy(bitmap, event_pmcs, X86_PMC_IDX_MAX); + else if (!bitmap_and(bitmap, event_pmcs, (unsigned long *)&pmu->global_ctrl, X86_PMC_IDX_MAX)) return; kvm_for_each_pmc(pmu, pmc, i, bitmap) { - /* - * Ignore checks for edge detect (all events currently emulated - * but KVM are always rising edges), pin control (unsupported - * by modern CPUs), and counter mask and its invert flag (KVM - * doesn't emulate multiple events in a single clock cycle). - * - * Note, the uppermost nibble of AMD's mask overlaps Intel's - * IN_TX (bit 32) and IN_TXCP (bit 33), as well as two reserved - * bits (bits 35:34). Checking the "in HLE/RTM transaction" - * flags is correct as the vCPU can't be in a transaction if - * KVM is emulating an instruction. Checking the reserved bits - * might be wrong if they are defined in the future, but so - * could ignoring them, so do the simple thing for now. 
- */ - if (((pmc->eventsel ^ eventsel) & AMD64_RAW_EVENT_MASK_NB) || - !pmc_event_is_allowed(pmc) || !cpl_is_matched(pmc)) + if (!pmc_event_is_allowed(pmc) || !cpl_is_matched(pmc)) continue; kvm_pmu_incr_counter(pmc); @@ -950,13 +983,13 @@ static void kvm_pmu_trigger_event(struct kvm_vcpu *vcpu, u64 eventsel) void kvm_pmu_instruction_retired(struct kvm_vcpu *vcpu) { - kvm_pmu_trigger_event(vcpu, kvm_pmu_eventsel.INSTRUCTIONS_RETIRED); + kvm_pmu_trigger_event(vcpu, vcpu_to_pmu(vcpu)->pmc_counting_instructions); } EXPORT_SYMBOL_GPL(kvm_pmu_instruction_retired); void kvm_pmu_branch_retired(struct kvm_vcpu *vcpu) { - kvm_pmu_trigger_event(vcpu, kvm_pmu_eventsel.BRANCH_INSTRUCTIONS_RETIRED); + kvm_pmu_trigger_event(vcpu, vcpu_to_pmu(vcpu)->pmc_counting_branches); } EXPORT_SYMBOL_GPL(kvm_pmu_branch_retired); diff --git a/arch/x86/kvm/pmu.h b/arch/x86/kvm/pmu.h index 740af816af37..cb93a936a177 100644 --- a/arch/x86/kvm/pmu.h +++ b/arch/x86/kvm/pmu.h @@ -176,8 +176,12 @@ extern struct x86_pmu_capability kvm_pmu_cap; void kvm_init_pmu_capability(const struct kvm_pmu_ops *pmu_ops); +void kvm_pmu_recalc_pmc_emulation(struct kvm_pmu *pmu, struct kvm_pmc *pmc); + static inline void kvm_pmu_request_counter_reprogram(struct kvm_pmc *pmc) { + kvm_pmu_recalc_pmc_emulation(pmc_to_pmu(pmc), pmc); + set_bit(pmc->idx, pmc_to_pmu(pmc)->reprogram_pmi); kvm_make_request(KVM_REQ_PMU, pmc->vcpu); } -- 2.50.1.565.gc32cd1483b-goog Rename pmc_speculative_in_use() to pmc_is_locally_enabled() to better capture what it actually tracks, and to show its relationship to pmc_is_globally_enabled(). While neither AMD nor Intel refer to event selectors or the fixed counter control MSR as "local", it's the obvious name to pair with "global". As for "speculative", there's absolutely nothing speculative about the checks. E.g. for PMUs without PERF_GLOBAL_CTRL, from the guest's perspective, the counters are "in use" without any qualifications. No functional change intended. Signed-off-by: Sean Christopherson --- arch/x86/kvm/pmu.c | 6 +++--- arch/x86/kvm/pmu.h | 2 +- arch/x86/kvm/vmx/pmu_intel.c | 2 +- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/arch/x86/kvm/pmu.c b/arch/x86/kvm/pmu.c index b0f0275a2c2e..e73c2a44028b 100644 --- a/arch/x86/kvm/pmu.c +++ b/arch/x86/kvm/pmu.c @@ -493,7 +493,7 @@ static bool check_pmu_event_filter(struct kvm_pmc *pmc) static bool pmc_event_is_allowed(struct kvm_pmc *pmc) { - return pmc_is_globally_enabled(pmc) && pmc_speculative_in_use(pmc) && + return pmc_is_globally_enabled(pmc) && pmc_is_locally_enabled(pmc) && check_pmu_event_filter(pmc); } @@ -572,7 +572,7 @@ void kvm_pmu_recalc_pmc_emulation(struct kvm_pmu *pmu, struct kvm_pmc *pmc) * omitting a PMC from a bitmap could result in a missed event if the * filter is changed to allow counting the event. 
*/ - if (!pmc_speculative_in_use(pmc)) + if (!pmc_is_locally_enabled(pmc)) return; if (pmc_is_event_match(pmc, kvm_pmu_eventsel.INSTRUCTIONS_RETIRED)) @@ -907,7 +907,7 @@ void kvm_pmu_cleanup(struct kvm_vcpu *vcpu) pmu->pmc_in_use, X86_PMC_IDX_MAX); kvm_for_each_pmc(pmu, pmc, i, bitmask) { - if (pmc->perf_event && !pmc_speculative_in_use(pmc)) + if (pmc->perf_event && !pmc_is_locally_enabled(pmc)) pmc_stop_counter(pmc); } diff --git a/arch/x86/kvm/pmu.h b/arch/x86/kvm/pmu.h index cb93a936a177..08ae644db00e 100644 --- a/arch/x86/kvm/pmu.h +++ b/arch/x86/kvm/pmu.h @@ -160,7 +160,7 @@ static inline struct kvm_pmc *get_fixed_pmc(struct kvm_pmu *pmu, u32 msr) return NULL; } -static inline bool pmc_speculative_in_use(struct kvm_pmc *pmc) +static inline bool pmc_is_locally_enabled(struct kvm_pmc *pmc) { struct kvm_pmu *pmu = pmc_to_pmu(pmc); diff --git a/arch/x86/kvm/vmx/pmu_intel.c b/arch/x86/kvm/vmx/pmu_intel.c index 0b173602821b..07baff96300f 100644 --- a/arch/x86/kvm/vmx/pmu_intel.c +++ b/arch/x86/kvm/vmx/pmu_intel.c @@ -762,7 +762,7 @@ void intel_pmu_cross_mapped_check(struct kvm_pmu *pmu) int bit, hw_idx; kvm_for_each_pmc(pmu, pmc, bit, (unsigned long *)&pmu->global_ctrl) { - if (!pmc_speculative_in_use(pmc) || + if (!pmc_is_locally_enabled(pmc) || !pmc_is_globally_enabled(pmc) || !pmc->perf_event) continue; -- 2.50.1.565.gc32cd1483b-goog Open code pmc_event_is_allowed() in its callers, as kvm_pmu_trigger_event() only needs to check the event filter (both global and local enables are consulted outside of the loop). No functional change intended. Signed-off-by: Sean Christopherson --- arch/x86/kvm/pmu.c | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/arch/x86/kvm/pmu.c b/arch/x86/kvm/pmu.c index e73c2a44028b..a495ab5d0556 100644 --- a/arch/x86/kvm/pmu.c +++ b/arch/x86/kvm/pmu.c @@ -491,12 +491,6 @@ static bool check_pmu_event_filter(struct kvm_pmc *pmc) return is_fixed_event_allowed(filter, pmc->idx); } -static bool pmc_event_is_allowed(struct kvm_pmc *pmc) -{ - return pmc_is_globally_enabled(pmc) && pmc_is_locally_enabled(pmc) && - check_pmu_event_filter(pmc); -} - static int reprogram_counter(struct kvm_pmc *pmc) { struct kvm_pmu *pmu = pmc_to_pmu(pmc); @@ -507,7 +501,8 @@ static int reprogram_counter(struct kvm_pmc *pmc) emulate_overflow = pmc_pause_counter(pmc); - if (!pmc_event_is_allowed(pmc)) + if (!pmc_is_globally_enabled(pmc) || !pmc_is_locally_enabled(pmc) || + !check_pmu_event_filter(pmc)) return 0; if (emulate_overflow) @@ -974,7 +969,8 @@ static void kvm_pmu_trigger_event(struct kvm_vcpu *vcpu, return; kvm_for_each_pmc(pmu, pmc, i, bitmap) { - if (!pmc_event_is_allowed(pmc) || !cpl_is_matched(pmc)) + if (!pmc_is_globally_enabled(pmc) || !pmc_is_locally_enabled(pmc) || + !check_pmu_event_filter(pmc) || !cpl_is_matched(pmc)) continue; kvm_pmu_incr_counter(pmc); -- 2.50.1.565.gc32cd1483b-goog When triggering PMC events in response to emulation, drop the redundant checks on a PMC being globally and locally enabled, as the passed in bitmap contains only PMCs that are locally enabled (and counting the right event), and the local copy of the bitmap has already been masked with global_ctrl. No true functional change intended. 
Signed-off-by: Sean Christopherson --- arch/x86/kvm/pmu.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kvm/pmu.c b/arch/x86/kvm/pmu.c index a495ab5d0556..bdcd9c6f0ec0 100644 --- a/arch/x86/kvm/pmu.c +++ b/arch/x86/kvm/pmu.c @@ -969,7 +969,7 @@ static void kvm_pmu_trigger_event(struct kvm_vcpu *vcpu, return; kvm_for_each_pmc(pmu, pmc, i, bitmap) { - if (!pmc_is_globally_enabled(pmc) || !pmc_is_locally_enabled(pmc) || + if (!pmc_is_locally_enabled(pmc) || !check_pmu_event_filter(pmc) || !cpl_is_matched(pmc)) continue; -- 2.50.1.565.gc32cd1483b-goog Drop the check on a PMC being locally enabled when triggering emulated events, as the bitmap of passed-in PMCs only contains locally enabled PMCs. Signed-off-by: Sean Christopherson --- arch/x86/kvm/pmu.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/arch/x86/kvm/pmu.c b/arch/x86/kvm/pmu.c index bdcd9c6f0ec0..422af7734846 100644 --- a/arch/x86/kvm/pmu.c +++ b/arch/x86/kvm/pmu.c @@ -969,8 +969,7 @@ static void kvm_pmu_trigger_event(struct kvm_vcpu *vcpu, return; kvm_for_each_pmc(pmu, pmc, i, bitmap) { - if (!pmc_is_locally_enabled(pmc) || - !check_pmu_event_filter(pmc) || !cpl_is_matched(pmc)) + if (!check_pmu_event_filter(pmc) || !cpl_is_matched(pmc)) continue; kvm_pmu_incr_counter(pmc); -- 2.50.1.565.gc32cd1483b-goog Rename check_pmu_event_filter() to make its polarity more obvious, and to connect the dots to is_gp_event_allowed() and is_fixed_event_allowed(). No functional change intended. Signed-off-by: Sean Christopherson --- arch/x86/kvm/pmu.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/arch/x86/kvm/pmu.c b/arch/x86/kvm/pmu.c index 422af7734846..e75671b6e88c 100644 --- a/arch/x86/kvm/pmu.c +++ b/arch/x86/kvm/pmu.c @@ -476,7 +476,7 @@ static bool is_fixed_event_allowed(struct kvm_x86_pmu_event_filter *filter, return true; } -static bool check_pmu_event_filter(struct kvm_pmc *pmc) +static bool pmc_is_event_allowed(struct kvm_pmc *pmc) { struct kvm_x86_pmu_event_filter *filter; struct kvm *kvm = pmc->vcpu->kvm; @@ -502,7 +502,7 @@ static int reprogram_counter(struct kvm_pmc *pmc) emulate_overflow = pmc_pause_counter(pmc); if (!pmc_is_globally_enabled(pmc) || !pmc_is_locally_enabled(pmc) || - !check_pmu_event_filter(pmc)) + !pmc_is_event_allowed(pmc)) return 0; if (emulate_overflow) @@ -969,7 +969,7 @@ static void kvm_pmu_trigger_event(struct kvm_vcpu *vcpu, return; kvm_for_each_pmc(pmu, pmc, i, bitmap) { - if (!check_pmu_event_filter(pmc) || !cpl_is_matched(pmc)) + if (!pmc_is_event_allowed(pmc) || !cpl_is_matched(pmc)) continue; kvm_pmu_incr_counter(pmc); -- 2.50.1.565.gc32cd1483b-goog Acquire SRCU in the VM-Exit fastpath if and only if KVM needs to check the PMU event filter, to further trim the amount of code that is executed with SRCU protection in the fastpath. Counter-intuitively, holding SRCU can do more harm than good due to masking potential bugs, and introducing a new SRCU-protected asset to code reachable via kvm_skip_emulated_instruction() would be quite notable, i.e. definitely worth auditing. E.g. the primary user of kvm->srcu is KVM's memslots, accessing memslots all but guarantees guest memory may be accessed, accessing guest memory can fault, and page faults might sleep, which isn't allowed while IRQs are disabled. Not acquiring SRCU means the (hypothetical) illegal sleep would be flagged when running with PROVE_RCU=y, even if DEBUG_ATOMIC_SLEEP=n. 
Note, performance is NOT a motivating factor, as SRCU lock/unlock only adds ~15 cycles of latency to fastpath VM-Exits. I.e. overhead isn't a concern _if_ SRCU protection needs to be extended beyond PMU events, e.g. to honor userspace MSR filters. Signed-off-by: Sean Christopherson --- arch/x86/kvm/pmu.c | 4 +++- arch/x86/kvm/x86.c | 18 +++++------------- 2 files changed, 8 insertions(+), 14 deletions(-) diff --git a/arch/x86/kvm/pmu.c b/arch/x86/kvm/pmu.c index e75671b6e88c..3206412a35a1 100644 --- a/arch/x86/kvm/pmu.c +++ b/arch/x86/kvm/pmu.c @@ -955,7 +955,7 @@ static void kvm_pmu_trigger_event(struct kvm_vcpu *vcpu, DECLARE_BITMAP(bitmap, X86_PMC_IDX_MAX); struct kvm_pmu *pmu = vcpu_to_pmu(vcpu); struct kvm_pmc *pmc; - int i; + int i, idx; BUILD_BUG_ON(sizeof(pmu->global_ctrl) * BITS_PER_BYTE != X86_PMC_IDX_MAX); @@ -968,12 +968,14 @@ static void kvm_pmu_trigger_event(struct kvm_vcpu *vcpu, (unsigned long *)&pmu->global_ctrl, X86_PMC_IDX_MAX)) return; + idx = srcu_read_lock(&vcpu->kvm->srcu); kvm_for_each_pmc(pmu, pmc, i, bitmap) { if (!pmc_is_event_allowed(pmc) || !cpl_is_matched(pmc)) continue; kvm_pmu_incr_counter(pmc); } + srcu_read_unlock(&vcpu->kvm->srcu, idx); } void kvm_pmu_instruction_retired(struct kvm_vcpu *vcpu) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index f2b2eaaec6f8..a56f83b40a55 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -2137,7 +2137,6 @@ fastpath_t handle_fastpath_set_msr_irqoff(struct kvm_vcpu *vcpu) { u64 data = kvm_read_edx_eax(vcpu); u32 msr = kvm_rcx_read(vcpu); - int r; switch (msr) { case APIC_BASE_MSR + (APIC_ICR >> 4): @@ -2152,13 +2151,12 @@ fastpath_t handle_fastpath_set_msr_irqoff(struct kvm_vcpu *vcpu) return EXIT_FASTPATH_NONE; } - kvm_vcpu_srcu_read_lock(vcpu); - r = kvm_skip_emulated_instruction(vcpu); - kvm_vcpu_srcu_read_unlock(vcpu); - trace_kvm_msr_write(msr, data); - return r ? EXIT_FASTPATH_REENTER_GUEST : EXIT_FASTPATH_EXIT_USERSPACE; + if (!kvm_skip_emulated_instruction(vcpu)) + return EXIT_FASTPATH_EXIT_USERSPACE; + + return EXIT_FASTPATH_REENTER_GUEST; } EXPORT_SYMBOL_GPL(handle_fastpath_set_msr_irqoff); @@ -11251,13 +11249,7 @@ EXPORT_SYMBOL_GPL(kvm_emulate_halt); fastpath_t handle_fastpath_hlt(struct kvm_vcpu *vcpu) { - int ret; - - kvm_vcpu_srcu_read_lock(vcpu); - ret = kvm_emulate_halt(vcpu); - kvm_vcpu_srcu_read_unlock(vcpu); - - if (!ret) + if (!kvm_emulate_halt(vcpu)) return EXIT_FASTPATH_EXIT_USERSPACE; if (kvm_vcpu_running(vcpu)) -- 2.50.1.565.gc32cd1483b-goog Add a fastpath handler for INVD so that the common fastpath logic can be trivially tested on both Intel and AMD. Under KVM, INVD is always: (a) intercepted, (b) available to the guest, and (c) emulated as a nop, with no side effects. Combined with INVD not having any inputs or outputs, i.e. no register constraints, INVD is the perfect instruction for exercising KVM's fastpath as it can be inserted into practically any guest-side code stream. 
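As a purely illustrative aside (not part of the patch), a guest or KVM
selftest can force a fastpath-eligible exit from essentially anywhere with
a bare INVD, precisely because KVM intercepts it and emulates it as a nop:

	/* Hypothetical guest-side helper, e.g. for a selftest's guest code. */
	static inline void guest_force_invd_exit(void)
	{
		asm volatile("invd" ::: "memory");
	}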
Signed-off-by: Sean Christopherson --- arch/x86/kvm/svm/svm.c | 2 ++ arch/x86/kvm/vmx/vmx.c | 2 ++ arch/x86/kvm/x86.c | 9 +++++++++ arch/x86/kvm/x86.h | 1 + 4 files changed, 14 insertions(+) diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index 829d9d46718d..f7e1e665a826 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -4200,6 +4200,8 @@ static fastpath_t svm_exit_handlers_fastpath(struct kvm_vcpu *vcpu) return handle_fastpath_set_msr_irqoff(vcpu); case SVM_EXIT_HLT: return handle_fastpath_hlt(vcpu); + case SVM_EXIT_INVD: + return handle_fastpath_invd(vcpu); default: break; } diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index aa157fe5b7b3..95765db52992 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -7175,6 +7175,8 @@ static fastpath_t vmx_exit_handlers_fastpath(struct kvm_vcpu *vcpu, return handle_fastpath_preemption_timer(vcpu, force_immediate_exit); case EXIT_REASON_HLT: return handle_fastpath_hlt(vcpu); + case EXIT_REASON_INVD: + return handle_fastpath_invd(vcpu); default: return EXIT_FASTPATH_NONE; } diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index a56f83b40a55..5af2c5aed0f2 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -2086,6 +2086,15 @@ int kvm_emulate_invd(struct kvm_vcpu *vcpu) } EXPORT_SYMBOL_GPL(kvm_emulate_invd); +fastpath_t handle_fastpath_invd(struct kvm_vcpu *vcpu) +{ + if (!kvm_emulate_invd(vcpu)) + return EXIT_FASTPATH_EXIT_USERSPACE; + + return EXIT_FASTPATH_REENTER_GUEST; +} +EXPORT_SYMBOL_GPL(handle_fastpath_invd); + int kvm_handle_invalid_op(struct kvm_vcpu *vcpu) { kvm_queue_exception(vcpu, UD_VECTOR); diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h index bcfd9b719ada..46220b04cdf2 100644 --- a/arch/x86/kvm/x86.h +++ b/arch/x86/kvm/x86.h @@ -439,6 +439,7 @@ int x86_emulate_instruction(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, int emulation_type, void *insn, int insn_len); fastpath_t handle_fastpath_set_msr_irqoff(struct kvm_vcpu *vcpu); fastpath_t handle_fastpath_hlt(struct kvm_vcpu *vcpu); +fastpath_t handle_fastpath_invd(struct kvm_vcpu *vcpu); extern struct kvm_caps kvm_caps; extern struct kvm_host_values kvm_host; -- 2.50.1.565.gc32cd1483b-goog