Historically, KVM always advertised x2APIC Suppress EOI Broadcast (SEOIB) support in split-irqchip mode. This is incorrect for userspace IOAPIC implementations without an EOI register (e.g. version 0x11). Furthermore, KVM did not actually honor guest suppression requests and continued to broadcast LAPIC EOIs to the userspace IOAPIC. This can cause interrupt storms in guests that rely on Directed EOI semantics (notably Windows with Credential Guard, which experiences boot hangs). KVM is adding two new x2APIC API flags to control this behavior: - KVM_X2APIC_API_DISABLE_IGNORE_SUPPRESS_EOI_BROADCAST_QUIRK - KVM_X2APIC_API_DISABLE_SUPPRESS_EOI_BROADCAST [https://patchwork.kernel.org/project/kvm/patch/20251125180557.2022311-1-khushit.shah@nutanix.com/] Wire those flags into QEMU via a new machine-level state variable (kvm_lapic_seoib_state), which models three possible policies: - SEOIB_STATE_QUIRKED: Legacy behavior. SEOIB is advertised, but LAPIC EOIs are still broadcast even when the guest enables SEOIB. This is the default for backward compatibility. - SEOIB_STATE_RESPECTED: SEOIB is advertised and suppression is honored. - SEOIB_STATE_NOT_ADVERTISED: SEOIB is not advertised (required for IOAPIC v0x11). For new VMs using split-irqchip, QEMU selects a policy based on the userspace IOAPIC version and programs KVM accordingly during x86_cpus_init(). If KVM does not support the new API, QEMU falls back to the quirked behavior with a warning. SEOIB state is migrated only when non-quirked. Legacy VMs remain in QUIRKED mode and behave exactly as before. Older VMs that migrate into a newer QEMU version will also be able to migrate back to an older QEMU version, as they always stay in the QUIRKED state. VMs powered on with a new QEMU and a new kernel that use a non-quirked SEOIB state will not be able to migrate to older QEMU versions or older kernels. The state is applied on the destination in x86_seoib_post_load() to ensure correct KVM configuration before VM execution resumes. 
Additional changes: - Add qemu_will_load_snapshot() to detect loadvm scenarios - Move IOAPIC_VER_DEF to header for use in x86-common.c - Add get_ioapic_version_from_globals() helper - Add trace events (kvm_lapic_seoib_*) for debugging Signed-off-by: Khushit Shah --- hw/i386/x86-common.c | 98 ++++++++++++++++++++++++++++++++++++ hw/i386/x86.c | 1 + hw/intc/ioapic.c | 2 - include/hw/i386/x86.h | 12 +++++ include/hw/intc/ioapic.h | 2 + include/system/system.h | 1 + system/vl.c | 5 ++ target/i386/kvm/kvm.c | 46 +++++++++++++++++ target/i386/kvm/kvm_i386.h | 12 +++++ target/i386/kvm/trace-events | 4 ++ 10 files changed, 181 insertions(+), 2 deletions(-) diff --git a/hw/i386/x86-common.c b/hw/i386/x86-common.c index c8447499..72cfd295 100644 --- a/hw/i386/x86-common.c +++ b/hw/i386/x86-common.c @@ -35,10 +35,14 @@ #include "target/i386/cpu.h" #include "hw/rtc/mc146818rtc.h" #include "target/i386/sev.h" +#include "hw/qdev-properties.h" +#include "hw/intc/ioapic.h" #include "hw/acpi/cpu_hotplug.h" #include "hw/irq.h" #include "hw/loader.h" +#include "migration/migration.h" +#include "migration/vmstate.h" #include "multiboot.h" #include "elf.h" #include "standard-headers/asm-x86/bootparam.h" @@ -67,6 +71,65 @@ out: object_unref(cpu); } +static uint32_t get_ioapic_version_from_globals(void) +{ + Object *tmp = object_new(TYPE_IOAPIC); + const GlobalProperty *gp = qdev_find_global_prop(tmp, "version"); + uint32_t version = 0; + if (gp) { + qemu_strtoui(gp->value, NULL, 0, &version); + } else { + version = IOAPIC_VER_DEF; + } + object_unref(tmp); + return version; +} + +static int x86_seoib_post_load(void *opaque, int version_id) +{ + X86MachineState *x86ms = opaque; + + if (kvm_enabled() && kvm_irqchip_is_split()) { + /* Set KVM LAPIC SEOIB flags based on x86ms->kvm_lapic_seoib_state */ + if (!kvm_try_set_lapic_seoib_state(x86ms->kvm_lapic_seoib_state)) { + /* Migration from newer to older kernel. 
*/ + error_report("Failed to set KVM LAPIC SEOIB flags"); + abort(); + } + } else { + /* + * SEOIB state is only valid for split irqchip mode. + * This should never happen. + */ + error_report("SEOIB state is only valid for split irqchip mode."); + abort(); + } + return 0; +} + +static bool x86_seoib_needed(void *opaque) +{ + /* + * Only migrate the SEOIB state if the state is not QUIRKED. This enables + * migration from new qemu version to older qemu version. + */ + return kvm_irqchip_is_split() && + ((X86MachineState *)opaque)->kvm_lapic_seoib_state != + SEOIB_STATE_QUIRKED; +} + +static const VMStateDescription vmstate_x86_seoib = { + .name = "x86-seoib-state", + .version_id = 1, + .minimum_version_id = 1, + .post_load = x86_seoib_post_load, + .needed = x86_seoib_needed, + .fields = (const VMStateField[]) { + VMSTATE_UINT32(kvm_lapic_seoib_state, X86MachineState), + VMSTATE_END_OF_LIST() + }, +}; + void x86_cpus_init(X86MachineState *x86ms, int default_cpu_version) { int i; @@ -76,6 +139,8 @@ void x86_cpus_init(X86MachineState *x86ms, int default_cpu_version) x86_cpu_set_default_version(default_cpu_version); + vmstate_register(NULL, 0, &vmstate_x86_seoib, x86ms); + /* * Calculates the limit to CPU APIC ID values * @@ -110,6 +175,39 @@ void x86_cpus_init(X86MachineState *x86ms, int default_cpu_version) apic_set_max_apic_id(x86ms->apic_id_limit); } + if (kvm_enabled() && kvm_irqchip_is_split()) { + /* + * If -incoming or -loadvm, then defer the flag setting to later after + * the migration/loadvm is complete, but this must be done before apic + * state is migrated/loaded. This is done in x86_seoib_post_load. This + * is because x2apic api does not have support to unset flags. And, at + * this point we cannot determine the incoming SEOIB state. + * e.g. for ioapic version 0x20, incoming state can be either RESPECTED + * or QUIRKED. + * + * But for new power-ons, this is right place to set the flags. 
+ */ + if (!runstate_check(RUN_STATE_INMIGRATE) && + !qemu_will_load_snapshot()) { + uint32_t ioapic_version = get_ioapic_version_from_globals(); + if (ioapic_version >= 0x20) { + x86ms->kvm_lapic_seoib_state = SEOIB_STATE_RESPECTED; + } else { + x86ms->kvm_lapic_seoib_state = SEOIB_STATE_NOT_ADVERTISED; + } + + /* + * Try setting the KVM SEOIB flags if that flags are present + * in the kernel. + */ + if (!kvm_try_set_lapic_seoib_state(x86ms->kvm_lapic_seoib_state)) { + warn_report("Kernel does not support SEOIB flags; " + "Falling back to QUIRKED lapic SEOIB behavior."); + x86ms->kvm_lapic_seoib_state = SEOIB_STATE_QUIRKED; + } + } + } + possible_cpus = mc->possible_cpu_arch_ids(ms); for (i = 0; i < ms->smp.cpus; i++) { x86_cpu_new(x86ms, possible_cpus->cpus[i].arch_id, &error_fatal); diff --git a/hw/i386/x86.c b/hw/i386/x86.c index f80533df..1a671238 100644 --- a/hw/i386/x86.c +++ b/hw/i386/x86.c @@ -370,6 +370,7 @@ static void x86_machine_initfn(Object *obj) x86ms->oem_table_id = g_strndup(ACPI_BUILD_APPNAME8, 8); x86ms->bus_lock_ratelimit = 0; x86ms->above_4g_mem_start = 4 * GiB; + x86ms->kvm_lapic_seoib_state = SEOIB_STATE_QUIRKED; } static void x86_machine_class_init(ObjectClass *oc, const void *data) diff --git a/hw/intc/ioapic.c b/hw/intc/ioapic.c index 38e43846..5c22e697 100644 --- a/hw/intc/ioapic.c +++ b/hw/intc/ioapic.c @@ -450,8 +450,6 @@ static void ioapic_machine_done_notify(Notifier *notifier, void *data) #endif } -#define IOAPIC_VER_DEF 0x20 - static void ioapic_realize(DeviceState *dev, Error **errp) { IOAPICCommonState *s = IOAPIC_COMMON(dev); diff --git a/include/hw/i386/x86.h b/include/hw/i386/x86.h index 8755cad5..38891e5b 100644 --- a/include/hw/i386/x86.h +++ b/include/hw/i386/x86.h @@ -36,6 +36,15 @@ struct X86MachineClass { bool apic_xrupt_override; }; +typedef enum KvmLapicSEOIBState { + /* Legacy behavior. SEOIB advertised but LAPIC still broadcasts EOIs. */ + SEOIB_STATE_QUIRKED = 0, + /* SEOIB advertised and suppression honored. 
*/ + SEOIB_STATE_RESPECTED = 1, + /* SEOIB not advertised (required for IOAPIC v0x11). */ + SEOIB_STATE_NOT_ADVERTISED = 2, +} KvmLapicSEOIBState; + struct X86MachineState { /*< private >*/ MachineState parent; @@ -95,6 +104,9 @@ struct X86MachineState { uint64_t bus_lock_ratelimit; IgvmCfg *igvm; + + /* KVM LAPIC SEOIB policy for the VM. */ + uint32_t kvm_lapic_seoib_state; }; #define X86_MACHINE_SMM "smm" diff --git a/include/hw/intc/ioapic.h b/include/hw/intc/ioapic.h index aa122e25..1e1317cb 100644 --- a/include/hw/intc/ioapic.h +++ b/include/hw/intc/ioapic.h @@ -28,6 +28,8 @@ #define TYPE_KVM_IOAPIC "kvm-ioapic" #define TYPE_IOAPIC "ioapic" +#define IOAPIC_VER_DEF 0x20 + void ioapic_eoi_broadcast(int vector); #endif /* HW_INTC_IOAPIC_H */ diff --git a/include/system/system.h b/include/system/system.h index 03a2d0e9..7a8e7abe 100644 --- a/include/system/system.h +++ b/include/system/system.h @@ -14,6 +14,7 @@ extern QemuUUID qemu_uuid; extern bool qemu_uuid_set; const char *qemu_get_vm_name(void); +bool qemu_will_load_snapshot(void); /* Exit notifiers will run with BQL held. */ void qemu_add_exit_notifier(Notifier *notify); diff --git a/system/vl.c b/system/vl.c index 5091fe52..eb0e6ab7 100644 --- a/system/vl.c +++ b/system/vl.c @@ -518,6 +518,11 @@ const char *qemu_get_vm_name(void) return qemu_name; } +bool qemu_will_load_snapshot(void) +{ + return loadvm != NULL; +} + static void default_driver_disable(const char *driver) { int i; diff --git a/target/i386/kvm/kvm.c b/target/i386/kvm/kvm.c index 60c79811..8abaa9b1 100644 --- a/target/i386/kvm/kvm.c +++ b/target/i386/kvm/kvm.c @@ -292,6 +292,52 @@ bool kvm_enable_x2apic(void) has_x2apic_api); } +bool kvm_try_set_lapic_seoib_state(KvmLapicSEOIBState state) +{ + KVMState *s = KVM_STATE(current_accel()); + + trace_kvm_lapic_seoib_set_state(state); + + if (state == SEOIB_STATE_QUIRKED) { + /* + * In case of SEOIB_STATE_QUIRKED, do nothing. 
+ * The support will be advertised yet EOI broadcasts will still + * happen in case the guest decides to suppress EOI broadcasts. + */ + return true; + } + + uint64_t required = + KVM_X2APIC_API_DISABLE_IGNORE_SUPPRESS_EOI_BROADCAST_QUIRK | + KVM_X2APIC_API_DISABLE_SUPPRESS_EOI_BROADCAST; + + int supported = kvm_check_extension(s, KVM_CAP_X2APIC_API); + if ((supported & required) != required) { + trace_kvm_lapic_seoib_set_state_failed(state, supported, required); + return false; + } + + if (state == SEOIB_STATE_RESPECTED) { + /* + * The support will be advertised and the guest decision will be + * respected. + */ + return kvm_x2apic_api_set_flags( + KVM_X2APIC_API_DISABLE_IGNORE_SUPPRESS_EOI_BROADCAST_QUIRK); + } else if (state == SEOIB_STATE_NOT_ADVERTISED) { + /* + * The support will not be advertised and the guest decision will + * be ignored (does not matter as the support is not advertised). + */ + return kvm_x2apic_api_set_flags( + KVM_X2APIC_API_DISABLE_IGNORE_SUPPRESS_EOI_BROADCAST_QUIRK | + KVM_X2APIC_API_DISABLE_SUPPRESS_EOI_BROADCAST); + } else { + /* Invalid state.*/ + return false; + } +} + bool kvm_hv_vpindex_settable(void) { return hv_vpindex_settable; diff --git a/target/i386/kvm/kvm_i386.h b/target/i386/kvm/kvm_i386.h index 2b653442..c31d7894 100644 --- a/target/i386/kvm/kvm_i386.h +++ b/target/i386/kvm/kvm_i386.h @@ -15,6 +15,14 @@ #define KVM_MAX_CPUID_ENTRIES 100 +#ifndef KVM_X2APIC_API_DISABLE_IGNORE_SUPPRESS_EOI_BROADCAST_QUIRK +#define KVM_X2APIC_API_DISABLE_IGNORE_SUPPRESS_EOI_BROADCAST_QUIRK (1ULL << 2) +#endif + +#ifndef KVM_X2APIC_API_DISABLE_SUPPRESS_EOI_BROADCAST +#define KVM_X2APIC_API_DISABLE_SUPPRESS_EOI_BROADCAST (1ULL << 3) +#endif + /* always false if !CONFIG_KVM */ #define kvm_pit_in_kernel() \ (kvm_irqchip_in_kernel() && !kvm_irqchip_is_split()) @@ -23,8 +31,12 @@ #define kvm_ioapic_in_kernel() \ (kvm_irqchip_in_kernel() && !kvm_irqchip_is_split()) +/* Forward declaration to avoid including x86.h here */ +typedef enum 
KvmLapicSEOIBState KvmLapicSEOIBState; + bool kvm_has_smm(void); bool kvm_enable_x2apic(void); +bool kvm_try_set_lapic_seoib_state(KvmLapicSEOIBState state); bool kvm_hv_vpindex_settable(void); bool kvm_enable_hypercall(uint64_t enable_mask); diff --git a/target/i386/kvm/trace-events b/target/i386/kvm/trace-events index 74a6234f..dfe46c3b 100644 --- a/target/i386/kvm/trace-events +++ b/target/i386/kvm/trace-events @@ -13,3 +13,7 @@ kvm_xen_soft_reset(void) "" kvm_xen_set_shared_info(uint64_t gfn) "shared info at gfn 0x%" PRIx64 kvm_xen_set_vcpu_attr(int cpu, int type, uint64_t gpa) "vcpu attr cpu %d type %d gpa 0x%" PRIx64 kvm_xen_set_vcpu_callback(int cpu, int vector) "callback vcpu %d vector %d" + +# kvm.c - x2APIC SEOIB +kvm_lapic_seoib_set_state(uint32_t state) "state=%" PRIu32 +kvm_lapic_seoib_set_state_failed(uint32_t state, int supported, uint64_t required) "state=%" PRIu32 " supported=0x%x required=0x%" PRIx64 -- 2.39.3