Implement irq_set_vcpu_affinity() in the RISCV IOMMU driver.
irq_set_vcpu_affinity() is the channel from a hypervisor to the IOMMU
needed to ensure that assigned devices which direct MSIs to guest IMSIC
addresses will have those MSI writes redirected to their corresponding
guest interrupt files.

Signed-off-by: Andrew Jones
---
 drivers/iommu/riscv/iommu-ir.c | 165 ++++++++++++++++++++++++++++++++-
 drivers/iommu/riscv/iommu.c    |   5 +-
 drivers/iommu/riscv/iommu.h    |   4 +
 3 files changed, 171 insertions(+), 3 deletions(-)

diff --git a/drivers/iommu/riscv/iommu-ir.c b/drivers/iommu/riscv/iommu-ir.c
index 059671f18267..48f424ce1a8d 100644
--- a/drivers/iommu/riscv/iommu-ir.c
+++ b/drivers/iommu/riscv/iommu-ir.c
@@ -10,6 +10,8 @@
 #include
 #include

+#include
+
 #include "../iommu-pages.h"
 #include "iommu.h"

@@ -164,6 +166,48 @@ static void riscv_iommu_ir_msitbl_inval(struct riscv_iommu_domain *domain,
 	rcu_read_unlock();
 }

+static void riscv_iommu_ir_msitbl_clear(struct riscv_iommu_domain *domain)
+{
+	for (size_t i = 0; i < riscv_iommu_ir_nr_msiptes(domain); i++) {
+		riscv_iommu_ir_clear_pte(&domain->msi_root[i]);
+		refcount_set(&domain->msi_pte_counts[i], 0);
+	}
+}
+
+static void riscv_iommu_ir_msiptp_update(struct riscv_iommu_domain *domain)
+{
+	struct riscv_iommu_bond *bond;
+	struct riscv_iommu_device *iommu, *prev;
+	struct riscv_iommu_dc new_dc = {
+		.ta = FIELD_PREP(RISCV_IOMMU_PC_TA_PSCID, domain->pscid) |
+		      RISCV_IOMMU_PC_TA_V,
+		.fsc = FIELD_PREP(RISCV_IOMMU_PC_FSC_MODE, domain->pgd_mode) |
+		       FIELD_PREP(RISCV_IOMMU_PC_FSC_PPN, virt_to_pfn(domain->pgd_root)),
+		.msiptp = virt_to_pfn(domain->msi_root) |
+			  FIELD_PREP(RISCV_IOMMU_DC_MSIPTP_MODE,
+				     RISCV_IOMMU_DC_MSIPTP_MODE_FLAT),
+		.msi_addr_mask = domain->msi_addr_mask,
+		.msi_addr_pattern = domain->msi_addr_pattern,
+	};
+
+	/* Like riscv_iommu_ir_msitbl_inval(), synchronize with riscv_iommu_bond_link() */
+	smp_mb();
+
+	rcu_read_lock();
+
+	prev = NULL;
+	list_for_each_entry_rcu(bond, &domain->bonds, list) {
+		iommu = dev_to_iommu(bond->dev);
+		if (iommu == prev)
+			continue;
+
+		riscv_iommu_iodir_update(iommu, bond->dev, &new_dc);
+		prev = iommu;
+	}
+
+	rcu_read_unlock();
+}
+
 struct riscv_iommu_ir_chip_data {
 	size_t idx;
 	u32 config;
@@ -279,12 +323,127 @@ static int riscv_iommu_ir_irq_set_affinity(struct irq_data *data,
 	return ret;
 }

+static bool riscv_iommu_ir_vcpu_check_config(struct riscv_iommu_domain *domain,
+					     struct riscv_iommu_ir_vcpu_info *vcpu_info)
+{
+	return domain->msi_addr_mask == vcpu_info->msi_addr_mask &&
+	       domain->msi_addr_pattern == vcpu_info->msi_addr_pattern &&
+	       domain->group_index_bits == vcpu_info->group_index_bits &&
+	       domain->group_index_shift == vcpu_info->group_index_shift;
+}
+
+static int riscv_iommu_ir_vcpu_new_config(struct riscv_iommu_domain *domain,
+					  struct irq_data *data,
+					  struct riscv_iommu_ir_vcpu_info *vcpu_info)
+{
+	struct riscv_iommu_msipte *pte;
+	size_t idx;
+	int ret;
+
+	if (domain->pgd_mode)
+		riscv_iommu_ir_unmap_imsics(domain);
+
+	riscv_iommu_ir_msitbl_clear(domain);
+
+	domain->msi_addr_mask = vcpu_info->msi_addr_mask;
+	domain->msi_addr_pattern = vcpu_info->msi_addr_pattern;
+	domain->group_index_bits = vcpu_info->group_index_bits;
+	domain->group_index_shift = vcpu_info->group_index_shift;
+	domain->imsic_stride = SZ_4K;
+	domain->msitbl_config += 1;
+
+	if (domain->pgd_mode) {
+		/*
+		 * As in riscv_iommu_ir_irq_domain_create(), we do all stage1
+		 * mappings up front since the MSI table will manage the
+		 * translations.
+		 *
+		 * XXX: Since irq-set-vcpu-affinity is called in atomic context
+		 * we need GFP_ATOMIC. If the number of 4K dma pte allocations
+		 * is considered too many for GFP_ATOMIC, then we can wrap
+		 * riscv_iommu_pte_alloc()'s iommu_alloc_pages_node_sz() call
+		 * in a mempool and try to ensure the pool has enough elements
+		 * in riscv_iommu_ir_irq_domain_enable_msis().
+		 */
+		ret = riscv_iommu_ir_map_imsics(domain, GFP_ATOMIC);
+		if (ret)
+			return ret;
+	}
+
+	idx = riscv_iommu_ir_compute_msipte_idx(domain, vcpu_info->gpa);
+	pte = &domain->msi_root[idx];
+	riscv_iommu_ir_irq_set_msitbl_info(data, idx, domain->msitbl_config);
+	riscv_iommu_ir_set_pte(pte, vcpu_info->hpa);
+	riscv_iommu_ir_msitbl_inval(domain, NULL);
+	refcount_set(&domain->msi_pte_counts[idx], 1);
+
+	riscv_iommu_ir_msiptp_update(domain);
+
+	return 0;
+}
+
+static int riscv_iommu_ir_irq_set_vcpu_affinity(struct irq_data *data, void *arg)
+{
+	struct riscv_iommu_info *info = data->domain->host_data;
+	struct riscv_iommu_domain *domain = info->domain;
+	struct riscv_iommu_ir_vcpu_info *vcpu_info = arg;
+	struct riscv_iommu_msipte pteval;
+	struct riscv_iommu_msipte *pte;
+	bool inc = false, dec = false;
+	size_t old_idx, new_idx;
+	u32 old_config;
+
+	if (!domain->msi_root)
+		return -EOPNOTSUPP;
+
+	old_idx = riscv_iommu_ir_irq_msitbl_idx(data);
+	old_config = riscv_iommu_ir_irq_msitbl_config(data);
+
+	if (!vcpu_info) {
+		riscv_iommu_ir_msitbl_unmap(domain, data, old_idx);
+		return 0;
+	}
+
+	guard(raw_spinlock)(&domain->msi_lock);
+
+	if (!riscv_iommu_ir_vcpu_check_config(domain, vcpu_info))
+		return riscv_iommu_ir_vcpu_new_config(domain, data, vcpu_info);
+
+	new_idx = riscv_iommu_ir_compute_msipte_idx(domain, vcpu_info->gpa);
+	riscv_iommu_ir_irq_set_msitbl_info(data, new_idx, domain->msitbl_config);
+
+	pte = &domain->msi_root[new_idx];
+	riscv_iommu_ir_set_pte(&pteval, vcpu_info->hpa);
+
+	if (pteval.pte != pte->pte) {
+		*pte = pteval;
+		riscv_iommu_ir_msitbl_inval(domain, pte);
+	}
+
+	if (old_config != domain->msitbl_config)
+		inc = true;
+	else if (new_idx != old_idx)
+		inc = dec = true;
+
+	if (dec && refcount_dec_and_test(&domain->msi_pte_counts[old_idx])) {
+		pte = &domain->msi_root[old_idx];
+		riscv_iommu_ir_clear_pte(pte);
+		riscv_iommu_ir_msitbl_inval(domain, pte);
+	}
+
+	if (inc && !refcount_inc_not_zero(&domain->msi_pte_counts[new_idx]))
+		refcount_set(&domain->msi_pte_counts[new_idx], 1);
+
+	return 0;
+}
+
 static struct irq_chip riscv_iommu_ir_irq_chip = {
 	.name			= "IOMMU-IR",
 	.irq_ack		= irq_chip_ack_parent,
 	.irq_mask		= irq_chip_mask_parent,
 	.irq_unmask		= irq_chip_unmask_parent,
 	.irq_set_affinity	= riscv_iommu_ir_irq_set_affinity,
+	.irq_set_vcpu_affinity	= riscv_iommu_ir_irq_set_vcpu_affinity,
 };

 static int riscv_iommu_ir_irq_domain_alloc_irqs(struct irq_domain *irqdomain,
@@ -334,7 +493,11 @@ static void riscv_iommu_ir_irq_domain_free_irqs(struct irq_domain *irqdomain,
 		config = riscv_iommu_ir_irq_msitbl_config(data);
 		/*
 		 * Only irqs with matching config versions need to be unmapped here
-		 * since config changes will unmap everything.
+		 * since config changes will unmap everything and irq-set-vcpu-affinity
+		 * irq deletions unmap at deletion time. An example of stale indices that
+		 * don't need to be unmapped are those of irqs allocated by VFIO that a
+		 * guest driver never used. The config change made for the guest will have
+		 * already unmapped those, though, so there's no need to unmap them here.
 		 */
 		if (config == domain->msitbl_config) {
 			idx = riscv_iommu_ir_irq_msitbl_idx(data);
diff --git a/drivers/iommu/riscv/iommu.c b/drivers/iommu/riscv/iommu.c
index 440c3eb6f15a..02f38aa0b231 100644
--- a/drivers/iommu/riscv/iommu.c
+++ b/drivers/iommu/riscv/iommu.c
@@ -957,8 +957,9 @@ static void riscv_iommu_iotlb_inval(struct riscv_iommu_domain *domain,
  * device is not quiesced might be disruptive, potentially causing
  * interim translation faults.
  */
-static void riscv_iommu_iodir_update(struct riscv_iommu_device *iommu,
-				     struct device *dev, struct riscv_iommu_dc *new_dc)
+void riscv_iommu_iodir_update(struct riscv_iommu_device *iommu,
+			      struct device *dev,
+			      struct riscv_iommu_dc *new_dc)
 {
 	struct iommu_fwspec *fwspec = dev_iommu_fwspec_get(dev);
 	struct riscv_iommu_dc *dc;
diff --git a/drivers/iommu/riscv/iommu.h b/drivers/iommu/riscv/iommu.h
index 130f82e8392a..5ab2b4d6ee88 100644
--- a/drivers/iommu/riscv/iommu.h
+++ b/drivers/iommu/riscv/iommu.h
@@ -124,6 +124,10 @@ int riscv_iommu_init(struct riscv_iommu_device *iommu);
 void riscv_iommu_remove(struct riscv_iommu_device *iommu);
 void riscv_iommu_disable(struct riscv_iommu_device *iommu);

+void riscv_iommu_iodir_update(struct riscv_iommu_device *iommu,
+			      struct device *dev,
+			      struct riscv_iommu_dc *new_dc);
+
 void riscv_iommu_cmd_send(struct riscv_iommu_device *iommu,
 			  struct riscv_iommu_command *cmd);
 void riscv_iommu_cmd_sync(struct riscv_iommu_device *iommu, unsigned int timeout_us);
-- 
2.49.0
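
For context, the sketch below is not part of the patch; it only illustrates how a
hypervisor might drive this channel from its side. It assumes the host irq backing the
assigned device's MSI vector is already known, the struct layout is inferred solely from
the vcpu_info fields this patch reads (msi_addr_mask, msi_addr_pattern, group_index_bits,
group_index_shift, gpa, hpa), and the helper names are illustrative, not an existing API.

/*
 * Hypothetical caller-side sketch. The real struct riscv_iommu_ir_vcpu_info
 * is expected to come from the IOMMU-IR headers; the layout here is only an
 * assumption based on the fields consumed by riscv_iommu_ir_irq_set_vcpu_affinity().
 */
#include <linux/interrupt.h>
#include <linux/types.h>

struct riscv_iommu_ir_vcpu_info {
	u64 msi_addr_mask;	/* guest IMSIC MSI address mask */
	u64 msi_addr_pattern;	/* guest IMSIC MSI address pattern */
	u32 group_index_bits;	/* guest IMSIC group index width */
	u32 group_index_shift;	/* guest IMSIC group index position */
	u64 gpa;		/* guest-physical address of the guest interrupt file */
	u64 hpa;		/* host-physical address backing that interrupt file */
};

/* Route host_irq's MSI writes into the guest interrupt file described by vcpu_info. */
static int example_route_msi_to_guest(unsigned int host_irq,
				      struct riscv_iommu_ir_vcpu_info *vcpu_info)
{
	return irq_set_vcpu_affinity(host_irq, vcpu_info);
}

/* Tear the routing down again, e.g. when the guest releases the vector. */
static void example_unroute_msi(unsigned int host_irq)
{
	irq_set_vcpu_affinity(host_irq, NULL);
}

Passing NULL corresponds to the !vcpu_info path above, which unmaps the irq's MSI PTE.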