Add in a sanitisation function for ID_AA64PFR2_EL1, preserving the already-present behaviour for the FPMR, MTEFAR, and MTESTOREONLY fields. Add sanitisation for the GCIE field, which is set to IMP if the host supports a GICv5 guest, and to NI otherwise. Extend the sanitisation that takes place in kvm_vgic_create() to zero the ID_AA64PFR2.GCIE field when a non-GICv5 GIC is created. More importantly, move this sanitisation to a separate function, kvm_vgic_finalize_sysregs(), and call it from kvm_finalize_sys_regs(). We are required to finalize the GIC and GCIE fields a second time in kvm_finalize_sys_regs() due to how QEMU blindly reads out then verbatim restores the system register state. This avoids the issue where both the GCIE and GIC features are marked as present (an architecturally invalid combination), and hence guests fall over. See the comment in kvm_finalize_sys_regs() for more details. Overall, the following happens: * Before an irqchip is created, FEAT_GCIE is presented if the host supports GICv5-based guests. * Once an irqchip is created, all other supported irqchips are hidden from the guest; system register state reflects the guest's irqchip. * Userspace is allowed to set invalid irqchip feature combinations in the system registers, but... * ...invalid combinations are removed a second time prior to the first run of the guest, and things hopefully just work. All of this extra work is required to make sure that "legacy" GICv3 guests based on QEMU transparently work on compatible GICv5 hosts without modification. 
Signed-off-by: Sascha Bischoff --- arch/arm64/kvm/sys_regs.c | 70 +++++++++++++++++++++++++++++---- arch/arm64/kvm/vgic/vgic-init.c | 43 +++++++++++++------- include/kvm/arm_vgic.h | 1 + 3 files changed, 92 insertions(+), 22 deletions(-) diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c index 11e75f2522f95..1039150716d43 100644 --- a/arch/arm64/kvm/sys_regs.c +++ b/arch/arm64/kvm/sys_regs.c @@ -1758,6 +1758,7 @@ static u8 pmuver_to_perfmon(u8 pmuver) static u64 sanitise_id_aa64pfr0_el1(const struct kvm_vcpu *vcpu, u64 val); static u64 sanitise_id_aa64pfr1_el1(const struct kvm_vcpu *vcpu, u64 val); +static u64 sanitise_id_aa64pfr2_el1(const struct kvm_vcpu *vcpu, u64 val); static u64 sanitise_id_aa64dfr0_el1(const struct kvm_vcpu *vcpu, u64 val); /* Read a sanitised cpufeature ID register by sys_reg_desc */ @@ -1783,10 +1784,7 @@ static u64 __kvm_read_sanitised_id_reg(const struct kvm_vcpu *vcpu, val = sanitise_id_aa64pfr1_el1(vcpu, val); break; case SYS_ID_AA64PFR2_EL1: - val &= ID_AA64PFR2_EL1_FPMR | - (kvm_has_mte(vcpu->kvm) ? 
- ID_AA64PFR2_EL1_MTEFAR | ID_AA64PFR2_EL1_MTESTOREONLY : - 0); + val = sanitise_id_aa64pfr2_el1(vcpu, val); break; case SYS_ID_AA64ISAR1_EL1: if (!vcpu_has_ptrauth(vcpu)) @@ -2024,6 +2022,23 @@ static u64 sanitise_id_aa64pfr1_el1(const struct kvm_vcpu *vcpu, u64 val) return val; } +static u64 sanitise_id_aa64pfr2_el1(const struct kvm_vcpu *vcpu, u64 val) +{ + val &= ID_AA64PFR2_EL1_FPMR | + ID_AA64PFR2_EL1_MTEFAR | + ID_AA64PFR2_EL1_MTESTOREONLY; + + if (!kvm_has_mte(vcpu->kvm)) { + val &= ~ID_AA64PFR2_EL1_MTEFAR; + val &= ~ID_AA64PFR2_EL1_MTESTOREONLY; + } + + if (vgic_host_has_gicv5()) + val |= SYS_FIELD_PREP_ENUM(ID_AA64PFR2_EL1, GCIE, IMP); + + return val; +} + static u64 sanitise_id_aa64dfr0_el1(const struct kvm_vcpu *vcpu, u64 val) { val = ID_REG_LIMIT_FIELD_ENUM(val, ID_AA64DFR0_EL1, DebugVer, V8P8); @@ -2213,6 +2228,12 @@ static int set_id_aa64pfr1_el1(struct kvm_vcpu *vcpu, return set_id_reg(vcpu, rd, user_val); } +static int set_id_aa64pfr2_el1(struct kvm_vcpu *vcpu, + const struct sys_reg_desc *rd, u64 user_val) +{ + return set_id_reg(vcpu, rd, user_val); +} + /* * Allow userspace to de-feature a stage-2 translation granule but prevent it * from claiming the impossible. 
@@ -3194,10 +3215,11 @@ static const struct sys_reg_desc sys_reg_descs[] = { ID_AA64PFR1_EL1_RES0 | ID_AA64PFR1_EL1_MPAM_frac | ID_AA64PFR1_EL1_MTE)), - ID_WRITABLE(ID_AA64PFR2_EL1, - ID_AA64PFR2_EL1_FPMR | - ID_AA64PFR2_EL1_MTEFAR | - ID_AA64PFR2_EL1_MTESTOREONLY), + ID_FILTERED(ID_AA64PFR2_EL1, id_aa64pfr2_el1, + ~(ID_AA64PFR2_EL1_FPMR | + ID_AA64PFR2_EL1_MTEFAR | + ID_AA64PFR2_EL1_MTESTOREONLY | + ID_AA64PFR2_EL1_GCIE)), ID_UNALLOCATED(4,3), ID_WRITABLE(ID_AA64ZFR0_EL1, ~ID_AA64ZFR0_EL1_RES0), ID_HIDDEN(ID_AA64SMFR0_EL1), @@ -5668,8 +5690,40 @@ int kvm_finalize_sys_regs(struct kvm_vcpu *vcpu) val = kvm_read_vm_id_reg(kvm, SYS_ID_AA64PFR0_EL1) & ~ID_AA64PFR0_EL1_GIC; kvm_set_vm_id_reg(kvm, SYS_ID_AA64PFR0_EL1, val); + val = kvm_read_vm_id_reg(kvm, SYS_ID_AA64PFR2_EL1) & ~ID_AA64PFR2_EL1_GCIE; + kvm_set_vm_id_reg(kvm, SYS_ID_AA64PFR2_EL1, val); val = kvm_read_vm_id_reg(kvm, SYS_ID_PFR1_EL1) & ~ID_PFR1_EL1_GIC; kvm_set_vm_id_reg(kvm, SYS_ID_PFR1_EL1, val); + } else { + /* + * Certain userspace software - QEMU - samples the system + * register state without creating an irqchip, then blindly + * restores the state prior to running the final guest. This + * means that it restores the virtualization & emulation + * capabilities of the host system, rather than something that + * reflects the final guest state. Moreover, it checks that the + * state was "correctly" restored (i.e., verbatim), bailing if + * it isn't, so masking off invalid state isn't an option. + * + * On GICv5 hardware that supports FEAT_GCIE_LEGACY we can run + * both GICv3- and GICv5-based guests. Therefore, we initially + * present both ID_AA64PFR0.GIC and ID_AA64PFR2.GCIE as IMP to + * reflect that userspace can create EITHER a vGICv3 or a + * vGICv5. This is an architecturally invalid combination, of + * course. Once an in-kernel GIC is created, the sysreg state is + * updated to reflect the actual, valid configuration. 
+ * + * Setting both the GIC and GCIE features to IMP unsurprisingly + * results in guests falling over, and hence we need to fix up + * this mess in KVM. Before running for the first time we yet + * again ensure that the GIC and GCIE fields accurately reflect + * the actual hardware the guest should see. + * + * This hack allows legacy QEMU-based GICv3 guests to run + * unmodified on compatible GICv5 hosts, and avoids the inverse + * problem for GICv5-based guests in the future. + */ + kvm_vgic_finalize_sysregs(kvm); } if (vcpu_has_nv(vcpu)) { diff --git a/arch/arm64/kvm/vgic/vgic-init.c b/arch/arm64/kvm/vgic/vgic-init.c index 9b3091ad868cf..d1db384698238 100644 --- a/arch/arm64/kvm/vgic/vgic-init.c +++ b/arch/arm64/kvm/vgic/vgic-init.c @@ -71,7 +71,6 @@ static int vgic_allocate_private_irqs_locked(struct kvm_vcpu *vcpu, u32 type); int kvm_vgic_create(struct kvm *kvm, u32 type) { struct kvm_vcpu *vcpu; - u64 aa64pfr0, pfr1; unsigned long i; int ret; @@ -162,19 +161,11 @@ int kvm_vgic_create(struct kvm *kvm, u32 type) kvm->arch.vgic.vgic_dist_base = VGIC_ADDR_UNDEF; - aa64pfr0 = kvm_read_vm_id_reg(kvm, SYS_ID_AA64PFR0_EL1) & ~ID_AA64PFR0_EL1_GIC; - pfr1 = kvm_read_vm_id_reg(kvm, SYS_ID_PFR1_EL1) & ~ID_PFR1_EL1_GIC; - - if (type == KVM_DEV_TYPE_ARM_VGIC_V2) { - kvm->arch.vgic.vgic_cpu_base = VGIC_ADDR_UNDEF; - } else { - INIT_LIST_HEAD(&kvm->arch.vgic.rd_regions); - aa64pfr0 |= SYS_FIELD_PREP_ENUM(ID_AA64PFR0_EL1, GIC, IMP); - pfr1 |= SYS_FIELD_PREP_ENUM(ID_PFR1_EL1, GIC, GICv3); - } - - kvm_set_vm_id_reg(kvm, SYS_ID_AA64PFR0_EL1, aa64pfr0); - kvm_set_vm_id_reg(kvm, SYS_ID_PFR1_EL1, pfr1); + /* + * We've now created the GIC. Update the system register state + * to accurately reflect what we've created. 
+ */ + kvm_vgic_finalize_sysregs(kvm); if (type == KVM_DEV_TYPE_ARM_VGIC_V3) kvm->arch.vgic.nassgicap = system_supports_direct_sgis(); @@ -617,6 +608,30 @@ int kvm_vgic_map_resources(struct kvm *kvm) return ret; } +void kvm_vgic_finalize_sysregs(struct kvm *kvm) +{ + u32 type = kvm->arch.vgic.vgic_model; + u64 aa64pfr0, aa64pfr2, pfr1; + + aa64pfr0 = kvm_read_vm_id_reg(kvm, SYS_ID_AA64PFR0_EL1) & ~ID_AA64PFR0_EL1_GIC; + aa64pfr2 = kvm_read_vm_id_reg(kvm, SYS_ID_AA64PFR2_EL1) & ~ID_AA64PFR2_EL1_GCIE; + pfr1 = kvm_read_vm_id_reg(kvm, SYS_ID_PFR1_EL1) & ~ID_PFR1_EL1_GIC; + + if (type == KVM_DEV_TYPE_ARM_VGIC_V2) { + kvm->arch.vgic.vgic_cpu_base = VGIC_ADDR_UNDEF; + } else if (type == KVM_DEV_TYPE_ARM_VGIC_V3) { + INIT_LIST_HEAD(&kvm->arch.vgic.rd_regions); + aa64pfr0 |= SYS_FIELD_PREP_ENUM(ID_AA64PFR0_EL1, GIC, IMP); + pfr1 |= SYS_FIELD_PREP_ENUM(ID_PFR1_EL1, GIC, GICv3); + } else { + aa64pfr2 |= SYS_FIELD_PREP_ENUM(ID_AA64PFR2_EL1, GCIE, IMP); + } + + kvm_set_vm_id_reg(kvm, SYS_ID_AA64PFR0_EL1, aa64pfr0); + kvm_set_vm_id_reg(kvm, SYS_ID_AA64PFR2_EL1, aa64pfr2); + kvm_set_vm_id_reg(kvm, SYS_ID_PFR1_EL1, pfr1); +} + /* GENERIC PROBE */ void kvm_vgic_cpu_up(void) diff --git a/include/kvm/arm_vgic.h b/include/kvm/arm_vgic.h index 9e4798333b46c..25e36f8b97a1e 100644 --- a/include/kvm/arm_vgic.h +++ b/include/kvm/arm_vgic.h @@ -467,6 +467,7 @@ int kvm_vgic_create(struct kvm *kvm, u32 type); void kvm_vgic_destroy(struct kvm *kvm); void kvm_vgic_vcpu_destroy(struct kvm_vcpu *vcpu); int kvm_vgic_map_resources(struct kvm *kvm); +void kvm_vgic_finalize_sysregs(struct kvm *kvm); int kvm_vgic_hyp_init(void); void kvm_vgic_init_cpu_hardware(void); -- 2.34.1