Currently, the need to flush L1D for L1TF is tracked by two bits: one
per-CPU and one per-vCPU. The per-vCPU bit is always set when the vCPU
shows up on a core, so there is no interesting state that is truly
per-vCPU. Indeed, this is a requirement, since L1D is part of the
physical CPU. So simplify this by combining the two bits.

The per-vCPU bit was being written from preemption-enabled regions. For
those cases, use raw_cpu_write() (via a _raw variant of the setter
function) to avoid DEBUG_PREEMPT failures. If the vCPU is getting
migrated, it does not matter which CPU's bit gets set in these paths:
vcpu_load() must always set the bit on the destination CPU before the
guest is resumed.

Signed-off-by: Brendan Jackman
---
Changes in v2:
- Moved the bit back to irq_stat
- Fixed DEBUG_PREEMPT issues by adding a _raw variant
- Link to v1: https://lore.kernel.org/r/20251013-b4-l1tf-percpu-v1-1-d65c5366ea1a@google.com
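As a reviewer aid (not part of the patch), here is a minimal sketch of
where each setter variant applies; the example_* callers are
hypothetical stand-ins for the real call sites, and the check being
avoided is the one enabled by CONFIG_DEBUG_PREEMPT:

static void example_irq_exit_path(void)
{
	/*
	 * Preemption is disabled here, so the checked variant is fine:
	 * with CONFIG_DEBUG_PREEMPT, __this_cpu_write() warns if the
	 * caller is preemptible.
	 */
	kvm_set_cpu_l1tf_flush_l1d();
}

static void example_emulator_path(void)
{
	/*
	 * Preemptible context: raw_cpu_write() skips the preemption
	 * check, so the store may land on the "wrong" CPU if the task
	 * migrates right afterwards. That is harmless: if the vCPU
	 * moves, vcpu_load() sets the bit again on the destination CPU
	 * before the guest is resumed.
	 */
	kvm_set_cpu_l1tf_flush_l1d_raw();
}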
---
 arch/x86/include/asm/hardirq.h  |  6 ++++++
 arch/x86/include/asm/kvm_host.h |  3 ---
 arch/x86/kvm/mmu/mmu.c          |  2 +-
 arch/x86/kvm/vmx/nested.c       |  2 +-
 arch/x86/kvm/vmx/vmx.c          | 20 +++++---------------
 arch/x86/kvm/x86.c              |  6 +++---
 6 files changed, 16 insertions(+), 23 deletions(-)

diff --git a/arch/x86/include/asm/hardirq.h b/arch/x86/include/asm/hardirq.h
index f00c09ffe6a95f07342bb0c6cea3769d71eecfa9..8a5c5deadb5912cc9ae080740c8a7372e6ef7577 100644
--- a/arch/x86/include/asm/hardirq.h
+++ b/arch/x86/include/asm/hardirq.h
@@ -2,6 +2,7 @@
 #ifndef _ASM_X86_HARDIRQ_H
 #define _ASM_X86_HARDIRQ_H
 
+#include <linux/percpu-defs.h>
 #include <linux/threads.h>
 
 typedef struct {
@@ -78,6 +79,11 @@ static __always_inline void kvm_set_cpu_l1tf_flush_l1d(void)
 	__this_cpu_write(irq_stat.kvm_cpu_l1tf_flush_l1d, 1);
 }
 
+static __always_inline void kvm_set_cpu_l1tf_flush_l1d_raw(void)
+{
+	raw_cpu_write(irq_stat.kvm_cpu_l1tf_flush_l1d, 1);
+}
+
 static __always_inline void kvm_clear_cpu_l1tf_flush_l1d(void)
 {
 	__this_cpu_write(irq_stat.kvm_cpu_l1tf_flush_l1d, 0);
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 48598d017d6f3f07263a2ffffe670be2658eb9cb..fcdc65ab13d8383018577aacf19e832e6c4ceb0b 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -1055,9 +1055,6 @@ struct kvm_vcpu_arch {
 	/* be preempted when it's in kernel-mode(cpl=0) */
 	bool preempted_in_kernel;
 
-	/* Flush the L1 Data cache for L1TF mitigation on VMENTER */
-	bool l1tf_flush_l1d;
-
 	/* Host CPU on which VM-entry was most recently attempted */
 	int last_vmentry_cpu;
 
diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index 667d66cf76d5e52c22f9517914307244ae868eea..8c0dce401a42d977756ca82d249bb33c858b9c9f 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -4859,7 +4859,7 @@ int kvm_handle_page_fault(struct kvm_vcpu *vcpu, u64 error_code,
 	 */
 	BUILD_BUG_ON(lower_32_bits(PFERR_SYNTHETIC_MASK));
 
-	vcpu->arch.l1tf_flush_l1d = true;
+	kvm_set_cpu_l1tf_flush_l1d();
 
 	if (!flags) {
 		trace_kvm_page_fault(vcpu, fault_address, error_code);
diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c
index 76271962cb7083b475de6d7d24bf9cb918050650..1d376b4e6aa4abc475c1aac2ee937dbedb834cb1 100644
--- a/arch/x86/kvm/vmx/nested.c
+++ b/arch/x86/kvm/vmx/nested.c
@@ -3880,7 +3880,7 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch)
 		goto vmentry_failed;
 
 	/* Hide L1D cache contents from the nested guest.  */
-	vmx->vcpu.arch.l1tf_flush_l1d = true;
+	kvm_set_cpu_l1tf_flush_l1d_raw();
 
 	/*
 	 * Must happen outside of nested_vmx_enter_non_root_mode() as it will
diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index 546272a5d34da301710df1d89414f41fc9b24a1f..6515beefa1fc8da042c0b66c207250ccf79c888e 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -6673,26 +6673,16 @@ static noinstr void vmx_l1d_flush(struct kvm_vcpu *vcpu)
 	 * 'always'
 	 */
 	if (static_branch_likely(&vmx_l1d_flush_cond)) {
-		bool flush_l1d;
-
 		/*
-		 * Clear the per-vcpu flush bit, it gets set again if the vCPU
+		 * Clear the per-cpu flush bit, it gets set again if the vCPU
 		 * is reloaded, i.e. if the vCPU is scheduled out or if KVM
 		 * exits to userspace, or if KVM reaches one of the unsafe
-		 * VMEXIT handlers, e.g. if KVM calls into the emulator.
+		 * VMEXIT handlers, e.g. if KVM calls into the emulator,
+		 * or from the interrupt handlers.
 		 */
-		flush_l1d = vcpu->arch.l1tf_flush_l1d;
-		vcpu->arch.l1tf_flush_l1d = false;
-
-		/*
-		 * Clear the per-cpu flush bit, it gets set again from
-		 * the interrupt handlers.
-		 */
-		flush_l1d |= kvm_get_cpu_l1tf_flush_l1d();
-		kvm_clear_cpu_l1tf_flush_l1d();
-
-		if (!flush_l1d)
+		if (!kvm_get_cpu_l1tf_flush_l1d())
 			return;
+		kvm_clear_cpu_l1tf_flush_l1d();
 	}
 
 	vcpu->stat.l1d_flush++;
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 4b8138bd48572fd161eda73d2dbdc1dcd0bcbcac..dc886c4b9b1fe3d63a4c255ed4fc533d20fd1962 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -5190,7 +5190,7 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
 {
 	struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
 
-	vcpu->arch.l1tf_flush_l1d = true;
+	kvm_set_cpu_l1tf_flush_l1d();
 
 	if (vcpu->scheduled_out && pmu->version && pmu->event_count) {
 		pmu->need_cleanup = true;
@@ -8000,7 +8000,7 @@ int kvm_write_guest_virt_system(struct kvm_vcpu *vcpu, gva_t addr, void *val,
 			       unsigned int bytes, struct x86_exception *exception)
 {
 	/* kvm_write_guest_virt_system can pull in tons of pages. */
-	vcpu->arch.l1tf_flush_l1d = true;
+	kvm_set_cpu_l1tf_flush_l1d_raw();
 
 	return kvm_write_guest_virt_helper(addr, val, bytes, vcpu, PFERR_WRITE_MASK,
 					   exception);
@@ -9396,7 +9396,7 @@ int x86_emulate_instruction(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
 		return handle_emulation_failure(vcpu, emulation_type);
 	}
 
-	vcpu->arch.l1tf_flush_l1d = true;
+	kvm_set_cpu_l1tf_flush_l1d_raw();
 
 	if (!(emulation_type & EMULTYPE_NO_DECODE)) {
 		kvm_clear_exception_queue(vcpu);

---
base-commit: 6b36119b94d0b2bb8cea9d512017efafd461d6ac
change-id: 20251013-b4-l1tf-percpu-793181fa5884

Best regards,
-- 
Brendan Jackman