Provide a domain bus token for the upcoming support for the RISC-V IOMMU
interrupt remapping domain, which needs to be distinguished from NEXUS
domains. The new token name is generic, as the only information that
needs to be conveyed is that the IRQ domain will remap MSIs, i.e. there's
nothing RISC-V specific to convey. Since the MSI_REMAP domain implements
init_dev_msi_info() with msi_parent_init_dev_msi_info(), which makes
'domain' point to the NEXUS domain while keeping 'msi_parent_domain'
pointing to itself, there's nothing to do in msi-lib to add support
except to accept the token.

Signed-off-by: Andrew Jones
---
 drivers/irqchip/irq-msi-lib.c  | 8 ++++----
 include/linux/irqdomain_defs.h | 1 +
 2 files changed, 5 insertions(+), 4 deletions(-)

diff --git a/drivers/irqchip/irq-msi-lib.c b/drivers/irqchip/irq-msi-lib.c
index 908944009c21..90ef0af866eb 100644
--- a/drivers/irqchip/irq-msi-lib.c
+++ b/drivers/irqchip/irq-msi-lib.c
@@ -36,14 +36,14 @@ bool msi_lib_init_dev_msi_info(struct device *dev, struct irq_domain *domain,
 		return false;
 
 	/*
-	 * MSI parent domain specific settings. For now there is only the
-	 * root parent domain, e.g. NEXUS, acting as a MSI parent, but it is
-	 * possible to stack MSI parents. See x86 vector -> irq remapping
+	 * MSI parent domain specific settings. There may be only the root
+	 * parent domain, e.g. NEXUS, acting as a MSI parent, or there may
+	 * be stacked MSI parents, typically used for remapping.
 	 */
 	if (domain->bus_token == pops->bus_select_token) {
 		if (WARN_ON_ONCE(domain != real_parent))
 			return false;
-	} else {
+	} else if (real_parent->bus_token != DOMAIN_BUS_MSI_REMAP) {
 		WARN_ON_ONCE(1);
 		return false;
 	}
diff --git a/include/linux/irqdomain_defs.h b/include/linux/irqdomain_defs.h
index 36653e2ee1c9..676eca8147ae 100644
--- a/include/linux/irqdomain_defs.h
+++ b/include/linux/irqdomain_defs.h
@@ -27,6 +27,7 @@ enum irq_domain_bus_token {
 	DOMAIN_BUS_AMDVI,
 	DOMAIN_BUS_DEVICE_MSI,
 	DOMAIN_BUS_WIRED_TO_MSI,
+	DOMAIN_BUS_MSI_REMAP,
 };
 
 #endif /* _LINUX_IRQDOMAIN_DEFS_H */
-- 
2.49.0

In order to add the interrupt remapping support in a separate file,
share struct riscv_iommu_domain and struct riscv_iommu_info through
the header.

Signed-off-by: Andrew Jones
---
 drivers/iommu/riscv/iommu.c | 20 --------------------
 drivers/iommu/riscv/iommu.h | 20 ++++++++++++++++++++
 2 files changed, 20 insertions(+), 20 deletions(-)

diff --git a/drivers/iommu/riscv/iommu.c b/drivers/iommu/riscv/iommu.c
index 0eae2f4bdc5e..901d02529a26 100644
--- a/drivers/iommu/riscv/iommu.c
+++ b/drivers/iommu/riscv/iommu.c
@@ -38,9 +38,6 @@
 #define phys_to_ppn(pa)  (((pa) >> 2) & (((1ULL << 44) - 1) << 10))
 #define ppn_to_phys(pn)  (((pn) << 2) & (((1ULL << 44) - 1) << 12))
 
-#define dev_to_iommu(dev) \
-	iommu_get_iommu_dev(dev, struct riscv_iommu_device, iommu)
-
 /* IOMMU PSCID allocation namespace. */
 static DEFINE_IDA(riscv_iommu_pscids);
 #define RISCV_IOMMU_MAX_PSCID	(BIT(20) - 1)
@@ -802,26 +799,9 @@ static int riscv_iommu_iodir_set_mode(struct riscv_iommu_device *iommu,
 	return 0;
 }
 
-/* This struct contains protection domain specific IOMMU driver data. */
-struct riscv_iommu_domain {
-	struct iommu_domain domain;
-	struct list_head bonds;
-	spinlock_t lock; /* protect bonds list updates. */
-	int pscid;
-	bool amo_enabled;
-	int numa_node;
-	unsigned int pgd_mode;
-	unsigned long *pgd_root;
-};
-
 #define iommu_domain_to_riscv(iommu_domain) \
 	container_of(iommu_domain, struct riscv_iommu_domain, domain)
 
-/* Private IOMMU data for managed devices, dev_iommu_priv_* */
-struct riscv_iommu_info {
-	struct riscv_iommu_domain *domain;
-};
-
 /*
  * Linkage between an iommu_domain and attached devices.
  *
diff --git a/drivers/iommu/riscv/iommu.h b/drivers/iommu/riscv/iommu.h
index 46df79dd5495..1d163cbd9e4d 100644
--- a/drivers/iommu/riscv/iommu.h
+++ b/drivers/iommu/riscv/iommu.h
@@ -17,8 +17,28 @@
 
 #include "iommu-bits.h"
 
+/* This struct contains protection domain specific IOMMU driver data. */
+struct riscv_iommu_domain {
+	struct iommu_domain domain;
+	struct list_head bonds;
+	spinlock_t lock; /* protect bonds list updates. */
+	int pscid;
+	bool amo_enabled;
+	int numa_node;
+	unsigned int pgd_mode;
+	unsigned long *pgd_root;
+};
+
+/* Private IOMMU data for managed devices, dev_iommu_priv_* */
+struct riscv_iommu_info {
+	struct riscv_iommu_domain *domain;
+};
+
 struct riscv_iommu_device;
 
+#define dev_to_iommu(dev) \
+	iommu_get_iommu_dev(dev, struct riscv_iommu_device, iommu)
+
 struct riscv_iommu_queue {
 	atomic_t prod;	/* unbounded producer allocation index */
 	atomic_t head;	/* unbounded shadow ring buffer consumer index */
-- 
2.49.0

From: Zong Li

The number of parameters will grow as we need to set up more fields in
the device context. Use a data structure to wrap them up.

Signed-off-by: Zong Li
Signed-off-by: Andrew Jones
---
 drivers/iommu/riscv/iommu.c | 31 +++++++++++++++++++------------
 1 file changed, 19 insertions(+), 12 deletions(-)

diff --git a/drivers/iommu/riscv/iommu.c b/drivers/iommu/riscv/iommu.c
index 901d02529a26..a44c67a848fa 100644
--- a/drivers/iommu/riscv/iommu.c
+++ b/drivers/iommu/riscv/iommu.c
@@ -988,7 +988,7 @@ static void riscv_iommu_iotlb_inval(struct riscv_iommu_domain *domain,
  * interim translation faults.
  */
 static void riscv_iommu_iodir_update(struct riscv_iommu_device *iommu,
-				     struct device *dev, u64 fsc, u64 ta)
+				     struct device *dev, struct riscv_iommu_dc *new_dc)
 {
 	struct iommu_fwspec *fwspec = dev_iommu_fwspec_get(dev);
 	struct riscv_iommu_dc *dc;
@@ -1022,10 +1022,10 @@ static void riscv_iommu_iodir_update(struct riscv_iommu_device *iommu,
 	for (i = 0; i < fwspec->num_ids; i++) {
 		dc = riscv_iommu_get_dc(iommu, fwspec->ids[i]);
 		tc = READ_ONCE(dc->tc);
-		tc |= ta & RISCV_IOMMU_DC_TC_V;
+		tc |= new_dc->ta & RISCV_IOMMU_DC_TC_V;
 
-		WRITE_ONCE(dc->fsc, fsc);
-		WRITE_ONCE(dc->ta, ta & RISCV_IOMMU_PC_TA_PSCID);
+		WRITE_ONCE(dc->fsc, new_dc->fsc);
+		WRITE_ONCE(dc->ta, new_dc->ta & RISCV_IOMMU_PC_TA_PSCID);
 
 		/* Update device context, write TC.V as the last step.
*/ dma_wmb(); WRITE_ONCE(dc->tc, tc); @@ -1304,20 +1304,20 @@ static int riscv_iommu_attach_paging_domain(struct iommu_domain *iommu_domain, struct riscv_iommu_domain *domain = iommu_domain_to_riscv(iommu_domain); struct riscv_iommu_device *iommu = dev_to_iommu(dev); struct riscv_iommu_info *info = dev_iommu_priv_get(dev); - u64 fsc, ta; + struct riscv_iommu_dc dc = {0}; if (!riscv_iommu_pt_supported(iommu, domain->pgd_mode)) return -ENODEV; - fsc = FIELD_PREP(RISCV_IOMMU_PC_FSC_MODE, domain->pgd_mode) | - FIELD_PREP(RISCV_IOMMU_PC_FSC_PPN, virt_to_pfn(domain->pgd_root)); - ta = FIELD_PREP(RISCV_IOMMU_PC_TA_PSCID, domain->pscid) | - RISCV_IOMMU_PC_TA_V; + dc.fsc = FIELD_PREP(RISCV_IOMMU_PC_FSC_MODE, domain->pgd_mode) | + FIELD_PREP(RISCV_IOMMU_PC_FSC_PPN, virt_to_pfn(domain->pgd_root)); + dc.ta = FIELD_PREP(RISCV_IOMMU_PC_TA_PSCID, domain->pscid) | + RISCV_IOMMU_PC_TA_V; if (riscv_iommu_bond_link(domain, dev)) return -ENOMEM; - riscv_iommu_iodir_update(iommu, dev, fsc, ta); + riscv_iommu_iodir_update(iommu, dev, &dc); riscv_iommu_bond_unlink(info->domain, dev); info->domain = domain; @@ -1408,9 +1408,12 @@ static int riscv_iommu_attach_blocking_domain(struct iommu_domain *iommu_domain, { struct riscv_iommu_device *iommu = dev_to_iommu(dev); struct riscv_iommu_info *info = dev_iommu_priv_get(dev); + struct riscv_iommu_dc dc = {0}; + + dc.fsc = RISCV_IOMMU_FSC_BARE; /* Make device context invalid, translation requests will fault w/ #258 */ - riscv_iommu_iodir_update(iommu, dev, RISCV_IOMMU_FSC_BARE, 0); + riscv_iommu_iodir_update(iommu, dev, &dc); riscv_iommu_bond_unlink(info->domain, dev); info->domain = NULL; @@ -1429,8 +1432,12 @@ static int riscv_iommu_attach_identity_domain(struct iommu_domain *iommu_domain, { struct riscv_iommu_device *iommu = dev_to_iommu(dev); struct riscv_iommu_info *info = dev_iommu_priv_get(dev); + struct riscv_iommu_dc dc = {0}; + + dc.fsc = RISCV_IOMMU_FSC_BARE; + dc.ta = RISCV_IOMMU_PC_TA_V; - riscv_iommu_iodir_update(iommu, dev, RISCV_IOMMU_FSC_BARE, RISCV_IOMMU_PC_TA_V); + riscv_iommu_iodir_update(iommu, dev, &dc); riscv_iommu_bond_unlink(info->domain, dev); info->domain = NULL; -- 2.49.0 This is just a skeleton. Until irq-set-affinity functions are implemented the IRQ domain doesn't serve any purpose. Signed-off-by: Andrew Jones --- drivers/iommu/riscv/Makefile | 2 +- drivers/iommu/riscv/iommu-ir.c | 114 +++++++++++++++++++++++++++++++++ drivers/iommu/riscv/iommu.c | 36 +++++++++++ drivers/iommu/riscv/iommu.h | 12 ++++ 4 files changed, 163 insertions(+), 1 deletion(-) create mode 100644 drivers/iommu/riscv/iommu-ir.c diff --git a/drivers/iommu/riscv/Makefile b/drivers/iommu/riscv/Makefile index b5929f9f23e6..9c83f877d50f 100644 --- a/drivers/iommu/riscv/Makefile +++ b/drivers/iommu/riscv/Makefile @@ -1,3 +1,3 @@ # SPDX-License-Identifier: GPL-2.0-only -obj-y += iommu.o iommu-platform.o +obj-y += iommu.o iommu-ir.o iommu-platform.o obj-$(CONFIG_RISCV_IOMMU_PCI) += iommu-pci.o diff --git a/drivers/iommu/riscv/iommu-ir.c b/drivers/iommu/riscv/iommu-ir.c new file mode 100644 index 000000000000..08cf159b587d --- /dev/null +++ b/drivers/iommu/riscv/iommu-ir.c @@ -0,0 +1,114 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * IOMMU Interrupt Remapping + * + * Copyright © 2025 Ventana Micro Systems Inc. 
+ */ +#include +#include + +#include "iommu.h" + +static struct irq_chip riscv_iommu_ir_irq_chip = { + .name = "IOMMU-IR", + .irq_ack = irq_chip_ack_parent, + .irq_mask = irq_chip_mask_parent, + .irq_unmask = irq_chip_unmask_parent, + .irq_set_affinity = irq_chip_set_affinity_parent, +}; + +static int riscv_iommu_ir_irq_domain_alloc_irqs(struct irq_domain *irqdomain, + unsigned int irq_base, unsigned int nr_irqs, + void *arg) +{ + struct irq_data *data; + int i, ret; + + ret = irq_domain_alloc_irqs_parent(irqdomain, irq_base, nr_irqs, arg); + if (ret) + return ret; + + for (i = 0; i < nr_irqs; i++) { + data = irq_domain_get_irq_data(irqdomain, irq_base + i); + data->chip = &riscv_iommu_ir_irq_chip; + } + + return 0; +} + +static const struct irq_domain_ops riscv_iommu_ir_irq_domain_ops = { + .alloc = riscv_iommu_ir_irq_domain_alloc_irqs, + .free = irq_domain_free_irqs_parent, +}; + +static const struct msi_parent_ops riscv_iommu_ir_msi_parent_ops = { + .prefix = "IR-", + .supported_flags = MSI_GENERIC_FLAGS_MASK | + MSI_FLAG_PCI_MSIX, + .required_flags = MSI_FLAG_USE_DEF_DOM_OPS | + MSI_FLAG_USE_DEF_CHIP_OPS | + MSI_FLAG_PCI_MSI_MASK_PARENT, + .chip_flags = MSI_CHIP_FLAG_SET_ACK, + .init_dev_msi_info = msi_parent_init_dev_msi_info, +}; + +struct irq_domain *riscv_iommu_ir_irq_domain_create(struct riscv_iommu_device *iommu, + struct device *dev, + struct riscv_iommu_info *info) +{ + struct irq_domain *irqparent = dev_get_msi_domain(dev); + struct irq_domain *irqdomain; + struct fwnode_handle *fn; + char *fwname; + + fwname = kasprintf(GFP_KERNEL, "IOMMU-IR-%s", dev_name(dev)); + if (!fwname) + return NULL; + + fn = irq_domain_alloc_named_fwnode(fwname); + kfree(fwname); + if (!fn) { + dev_err(iommu->dev, "Couldn't allocate fwnode\n"); + return NULL; + } + + irqdomain = irq_domain_create_hierarchy(irqparent, 0, 0, fn, + &riscv_iommu_ir_irq_domain_ops, + info); + if (!irqdomain) { + dev_err(iommu->dev, "Failed to create IOMMU irq domain\n"); + irq_domain_free_fwnode(fn); + return NULL; + } + + irqdomain->flags |= IRQ_DOMAIN_FLAG_MSI_PARENT; + irqdomain->msi_parent_ops = &riscv_iommu_ir_msi_parent_ops; + irq_domain_update_bus_token(irqdomain, DOMAIN_BUS_MSI_REMAP); + + dev_set_msi_domain(dev, irqdomain); + + return irqdomain; +} + +void riscv_iommu_ir_irq_domain_remove(struct riscv_iommu_info *info) +{ + struct fwnode_handle *fn; + + if (!info->irqdomain) + return; + + fn = info->irqdomain->fwnode; + irq_domain_remove(info->irqdomain); + info->irqdomain = NULL; + irq_domain_free_fwnode(fn); +} + +int riscv_iommu_ir_attach_paging_domain(struct riscv_iommu_domain *domain, + struct device *dev) +{ + return 0; +} + +void riscv_iommu_ir_free_paging_domain(struct riscv_iommu_domain *domain) +{ +} diff --git a/drivers/iommu/riscv/iommu.c b/drivers/iommu/riscv/iommu.c index a44c67a848fa..db2acd9dc64b 100644 --- a/drivers/iommu/riscv/iommu.c +++ b/drivers/iommu/riscv/iommu.c @@ -17,6 +17,8 @@ #include #include #include +#include +#include #include #include @@ -1026,6 +1028,9 @@ static void riscv_iommu_iodir_update(struct riscv_iommu_device *iommu, WRITE_ONCE(dc->fsc, new_dc->fsc); WRITE_ONCE(dc->ta, new_dc->ta & RISCV_IOMMU_PC_TA_PSCID); + WRITE_ONCE(dc->msiptp, new_dc->msiptp); + WRITE_ONCE(dc->msi_addr_mask, new_dc->msi_addr_mask); + WRITE_ONCE(dc->msi_addr_pattern, new_dc->msi_addr_pattern); /* Update device context, write TC.V as the last step. 
*/ dma_wmb(); WRITE_ONCE(dc->tc, tc); @@ -1276,6 +1281,8 @@ static void riscv_iommu_free_paging_domain(struct iommu_domain *iommu_domain) WARN_ON(!list_empty(&domain->bonds)); + riscv_iommu_ir_free_paging_domain(domain); + if ((int)domain->pscid > 0) ida_free(&riscv_iommu_pscids, domain->pscid); @@ -1305,15 +1312,28 @@ static int riscv_iommu_attach_paging_domain(struct iommu_domain *iommu_domain, struct riscv_iommu_device *iommu = dev_to_iommu(dev); struct riscv_iommu_info *info = dev_iommu_priv_get(dev); struct riscv_iommu_dc dc = {0}; + int ret; if (!riscv_iommu_pt_supported(iommu, domain->pgd_mode)) return -ENODEV; + ret = riscv_iommu_ir_attach_paging_domain(domain, dev); + if (ret) + return ret; + dc.fsc = FIELD_PREP(RISCV_IOMMU_PC_FSC_MODE, domain->pgd_mode) | FIELD_PREP(RISCV_IOMMU_PC_FSC_PPN, virt_to_pfn(domain->pgd_root)); dc.ta = FIELD_PREP(RISCV_IOMMU_PC_TA_PSCID, domain->pscid) | RISCV_IOMMU_PC_TA_V; + if (domain->msi_root) { + dc.msiptp = virt_to_pfn(domain->msi_root) | + FIELD_PREP(RISCV_IOMMU_DC_MSIPTP_MODE, + RISCV_IOMMU_DC_MSIPTP_MODE_FLAT); + dc.msi_addr_mask = domain->msi_addr_mask; + dc.msi_addr_pattern = domain->msi_addr_pattern; + } + if (riscv_iommu_bond_link(domain, dev)) return -ENOMEM; @@ -1466,6 +1486,8 @@ static int riscv_iommu_of_xlate(struct device *dev, const struct of_phandle_args static struct iommu_device *riscv_iommu_probe_device(struct device *dev) { struct iommu_fwspec *fwspec = dev_iommu_fwspec_get(dev); + const struct imsic_global_config *imsic_global; + struct irq_domain *irqdomain = NULL; struct riscv_iommu_device *iommu; struct riscv_iommu_info *info; struct riscv_iommu_dc *dc; @@ -1489,6 +1511,18 @@ static struct iommu_device *riscv_iommu_probe_device(struct device *dev) info = kzalloc(sizeof(*info), GFP_KERNEL); if (!info) return ERR_PTR(-ENOMEM); + + imsic_global = imsic_get_global_config(); + if (imsic_global && imsic_global->nr_ids) { + irqdomain = riscv_iommu_ir_irq_domain_create(iommu, dev, info); + if (!irqdomain) { + kfree(info); + return ERR_PTR(-ENOMEM); + } + } + + info->irqdomain = irqdomain; + /* * Allocate and pre-configure device context entries in * the device directory. Do not mark the context valid yet. 
@@ -1499,6 +1533,7 @@ static struct iommu_device *riscv_iommu_probe_device(struct device *dev) for (i = 0; i < fwspec->num_ids; i++) { dc = riscv_iommu_get_dc(iommu, fwspec->ids[i]); if (!dc) { + riscv_iommu_ir_irq_domain_remove(info); kfree(info); return ERR_PTR(-ENODEV); } @@ -1516,6 +1551,7 @@ static void riscv_iommu_release_device(struct device *dev) { struct riscv_iommu_info *info = dev_iommu_priv_get(dev); + riscv_iommu_ir_irq_domain_remove(info); kfree_rcu_mightsleep(info); } diff --git a/drivers/iommu/riscv/iommu.h b/drivers/iommu/riscv/iommu.h index 1d163cbd9e4d..640d825f11b9 100644 --- a/drivers/iommu/riscv/iommu.h +++ b/drivers/iommu/riscv/iommu.h @@ -27,11 +27,15 @@ struct riscv_iommu_domain { int numa_node; unsigned int pgd_mode; unsigned long *pgd_root; + struct riscv_iommu_msipte *msi_root; + u64 msi_addr_mask; + u64 msi_addr_pattern; }; /* Private IOMMU data for managed devices, dev_iommu_priv_* */ struct riscv_iommu_info { struct riscv_iommu_domain *domain; + struct irq_domain *irqdomain; }; struct riscv_iommu_device; @@ -86,6 +90,14 @@ int riscv_iommu_init(struct riscv_iommu_device *iommu); void riscv_iommu_remove(struct riscv_iommu_device *iommu); void riscv_iommu_disable(struct riscv_iommu_device *iommu); +struct irq_domain *riscv_iommu_ir_irq_domain_create(struct riscv_iommu_device *iommu, + struct device *dev, + struct riscv_iommu_info *info); +void riscv_iommu_ir_irq_domain_remove(struct riscv_iommu_info *info); +int riscv_iommu_ir_attach_paging_domain(struct riscv_iommu_domain *domain, + struct device *dev); +void riscv_iommu_ir_free_paging_domain(struct riscv_iommu_domain *domain); + #define riscv_iommu_readl(iommu, addr) \ readl_relaxed((iommu)->reg + (addr)) -- 2.49.0 Capture the IMSIC layout from its config and reserve all the addresses. Then use the IMSIC layout info to calculate the maximum number of PTEs the MSI table needs to support and allocate the MSI table when attaching a paging domain for the first time. Finally, at the same time, map the IMSIC addresses in the stage1 DMA table when the stage1 DMA table is not BARE. This ensures it doesn't fault as it will translate the addresses before the MSI table does. Signed-off-by: Andrew Jones --- drivers/iommu/riscv/iommu-ir.c | 186 +++++++++++++++++++++++++++++++++ drivers/iommu/riscv/iommu.c | 6 ++ drivers/iommu/riscv/iommu.h | 4 + 3 files changed, 196 insertions(+) diff --git a/drivers/iommu/riscv/iommu-ir.c b/drivers/iommu/riscv/iommu-ir.c index 08cf159b587d..bed104c5333c 100644 --- a/drivers/iommu/riscv/iommu-ir.c +++ b/drivers/iommu/riscv/iommu-ir.c @@ -4,11 +4,108 @@ * * Copyright © 2025 Ventana Micro Systems Inc. 
*/ +#include #include #include +#include +#include "../iommu-pages.h" #include "iommu.h" +static size_t riscv_iommu_ir_group_size(struct riscv_iommu_domain *domain) +{ + phys_addr_t mask = domain->msi_addr_mask; + + if (domain->group_index_bits) { + phys_addr_t group_mask = BIT(domain->group_index_bits) - 1; + phys_addr_t group_shift = domain->group_index_shift - 12; + + mask &= ~(group_mask << group_shift); + } + + return (mask + 1) << 12; +} + +static int riscv_iommu_ir_map_unmap_imsics(struct riscv_iommu_domain *domain, bool map, + gfp_t gfp, size_t *unmapped) +{ + phys_addr_t base = domain->msi_addr_pattern << 12, addr; + size_t stride = domain->imsic_stride, map_size = SZ_4K, size; + size_t i, j; + + size = riscv_iommu_ir_group_size(domain); + + if (stride == SZ_4K) + stride = map_size = size; + + for (i = 0; i < BIT(domain->group_index_bits); i++) { + for (j = 0; j < size; j += stride) { + addr = (base + j) | (i << domain->group_index_shift); + if (map) { + int ret = iommu_map(&domain->domain, addr, addr, map_size, + IOMMU_WRITE | IOMMU_NOEXEC | IOMMU_MMIO, gfp); + if (ret) + return ret; + } else { + *unmapped += iommu_unmap(&domain->domain, addr, map_size); + } + } + } + + return 0; +} + +static size_t riscv_iommu_ir_unmap_imsics(struct riscv_iommu_domain *domain) +{ + size_t unmapped = 0; + + riscv_iommu_ir_map_unmap_imsics(domain, false, 0, &unmapped); + + return unmapped; +} + +static int riscv_iommu_ir_map_imsics(struct riscv_iommu_domain *domain, gfp_t gfp) +{ + int ret; + + ret = riscv_iommu_ir_map_unmap_imsics(domain, true, gfp, NULL); + if (ret) + riscv_iommu_ir_unmap_imsics(domain); + + return ret; +} + +static size_t riscv_iommu_ir_compute_msipte_idx(struct riscv_iommu_domain *domain, + phys_addr_t msi_pa) +{ + phys_addr_t mask = domain->msi_addr_mask; + phys_addr_t addr = msi_pa >> 12; + size_t idx; + + if (domain->group_index_bits) { + phys_addr_t group_mask = BIT(domain->group_index_bits) - 1; + phys_addr_t group_shift = domain->group_index_shift - 12; + phys_addr_t group = (addr >> group_shift) & group_mask; + + mask &= ~(group_mask << group_shift); + idx = addr & mask; + idx |= group << fls64(mask); + } else { + idx = addr & mask; + } + + return idx; +} + +static size_t riscv_iommu_ir_nr_msiptes(struct riscv_iommu_domain *domain) +{ + phys_addr_t base = domain->msi_addr_pattern << 12; + phys_addr_t max_addr = base | (domain->msi_addr_mask << 12); + size_t max_idx = riscv_iommu_ir_compute_msipte_idx(domain, max_addr); + + return max_idx + 1; +} + static struct irq_chip riscv_iommu_ir_irq_chip = { .name = "IOMMU-IR", .irq_ack = irq_chip_ack_parent, @@ -90,25 +187,114 @@ struct irq_domain *riscv_iommu_ir_irq_domain_create(struct riscv_iommu_device *i return irqdomain; } +static void riscv_iommu_ir_free_msi_table(struct riscv_iommu_domain *domain) +{ + iommu_free_pages(domain->msi_root); +} + void riscv_iommu_ir_irq_domain_remove(struct riscv_iommu_info *info) { + struct riscv_iommu_domain *domain = info->domain; struct fwnode_handle *fn; if (!info->irqdomain) return; + riscv_iommu_ir_free_msi_table(domain); + fn = info->irqdomain->fwnode; irq_domain_remove(info->irqdomain); info->irqdomain = NULL; irq_domain_free_fwnode(fn); } +static int riscv_ir_set_imsic_global_config(struct riscv_iommu_device *iommu, + struct riscv_iommu_domain *domain) +{ + const struct imsic_global_config *imsic_global; + u64 mask = 0; + + imsic_global = imsic_get_global_config(); + + mask |= (BIT(imsic_global->group_index_bits) - 1) << (imsic_global->group_index_shift - 12); + mask |= 
BIT(imsic_global->hart_index_bits + imsic_global->guest_index_bits) - 1; + domain->msi_addr_mask = mask; + domain->msi_addr_pattern = imsic_global->base_addr >> 12; + domain->group_index_bits = imsic_global->group_index_bits; + domain->group_index_shift = imsic_global->group_index_shift; + domain->imsic_stride = BIT(imsic_global->guest_index_bits + 12); + + if (iommu->caps & RISCV_IOMMU_CAPABILITIES_MSI_FLAT) { + size_t nr_ptes = riscv_iommu_ir_nr_msiptes(domain); + + domain->msi_root = iommu_alloc_pages_node_sz(domain->numa_node, GFP_KERNEL_ACCOUNT, + nr_ptes * sizeof(*domain->msi_root)); + if (!domain->msi_root) + return -ENOMEM; + } + + return 0; +} + int riscv_iommu_ir_attach_paging_domain(struct riscv_iommu_domain *domain, struct device *dev) { + struct riscv_iommu_device *iommu = dev_to_iommu(dev); + struct riscv_iommu_info *info = dev_iommu_priv_get(dev); + int ret; + + if (!info->irqdomain) + return 0; + + /* + * Do the domain's one-time setup of the msi configuration the + * first time the domain is attached and the msis are enabled. + */ + if (domain->msi_addr_mask == 0) { + ret = riscv_ir_set_imsic_global_config(iommu, domain); + if (ret) + return ret; + + /* + * The RISC-V IOMMU MSI table is checked after the stage1 DMA + * page tables. If we don't create identity mappings in the + * stage1 table then we'll fault and won't even get a chance + * to check the MSI table. + */ + if (domain->pgd_mode) { + ret = riscv_iommu_ir_map_imsics(domain, GFP_KERNEL_ACCOUNT); + if (ret) { + riscv_iommu_ir_free_msi_table(domain); + return ret; + } + } + } + return 0; } void riscv_iommu_ir_free_paging_domain(struct riscv_iommu_domain *domain) { + riscv_iommu_ir_free_msi_table(domain); +} + +void riscv_iommu_ir_get_resv_regions(struct device *dev, struct list_head *head) +{ + const struct imsic_global_config *imsic_global; + struct iommu_resv_region *reg; + phys_addr_t addr; + size_t size, i; + + imsic_global = imsic_get_global_config(); + if (!imsic_global || !imsic_global->nr_ids) + return; + + size = BIT(imsic_global->hart_index_bits + imsic_global->guest_index_bits + 12); + + for (i = 0; i < BIT(imsic_global->group_index_bits); i++) { + addr = imsic_global->base_addr | (i << imsic_global->group_index_shift); + reg = iommu_alloc_resv_region(addr, size, 0, IOMMU_RESV_MSI, GFP_KERNEL); + if (reg) + list_add_tail(®->list, head); + } } diff --git a/drivers/iommu/riscv/iommu.c b/drivers/iommu/riscv/iommu.c index db2acd9dc64b..0ba6504d4f33 100644 --- a/drivers/iommu/riscv/iommu.c +++ b/drivers/iommu/riscv/iommu.c @@ -1423,6 +1423,11 @@ static struct iommu_domain *riscv_iommu_alloc_paging_domain(struct device *dev) return &domain->domain; } +static void riscv_iommu_get_resv_regions(struct device *dev, struct list_head *head) +{ + riscv_iommu_ir_get_resv_regions(dev, head); +} + static int riscv_iommu_attach_blocking_domain(struct iommu_domain *iommu_domain, struct device *dev) { @@ -1561,6 +1566,7 @@ static const struct iommu_ops riscv_iommu_ops = { .blocked_domain = &riscv_iommu_blocking_domain, .release_domain = &riscv_iommu_blocking_domain, .domain_alloc_paging = riscv_iommu_alloc_paging_domain, + .get_resv_regions = riscv_iommu_get_resv_regions, .device_group = riscv_iommu_device_group, .probe_device = riscv_iommu_probe_device, .release_device = riscv_iommu_release_device, diff --git a/drivers/iommu/riscv/iommu.h b/drivers/iommu/riscv/iommu.h index 640d825f11b9..dc2020b81bbc 100644 --- a/drivers/iommu/riscv/iommu.h +++ b/drivers/iommu/riscv/iommu.h @@ -30,6 +30,9 @@ struct riscv_iommu_domain { 
struct riscv_iommu_msipte *msi_root; u64 msi_addr_mask; u64 msi_addr_pattern; + u32 group_index_bits; + u32 group_index_shift; + size_t imsic_stride; }; /* Private IOMMU data for managed devices, dev_iommu_priv_* */ @@ -97,6 +100,7 @@ void riscv_iommu_ir_irq_domain_remove(struct riscv_iommu_info *info); int riscv_iommu_ir_attach_paging_domain(struct riscv_iommu_domain *domain, struct device *dev); void riscv_iommu_ir_free_paging_domain(struct riscv_iommu_domain *domain); +void riscv_iommu_ir_get_resv_regions(struct device *dev, struct list_head *head); #define riscv_iommu_readl(iommu, addr) \ readl_relaxed((iommu)->reg + (addr)) -- 2.49.0 Export more in iommu.h from iommu.c and implement functions needed to manage the MSI table. Signed-off-by: Andrew Jones --- drivers/iommu/riscv/iommu-bits.h | 7 ++++++ drivers/iommu/riscv/iommu-ir.c | 43 ++++++++++++++++++++++++++++++++ drivers/iommu/riscv/iommu.c | 36 +++----------------------- drivers/iommu/riscv/iommu.h | 32 ++++++++++++++++++++++++ 4 files changed, 86 insertions(+), 32 deletions(-) diff --git a/drivers/iommu/riscv/iommu-bits.h b/drivers/iommu/riscv/iommu-bits.h index 98daf0e1a306..d72b982cf9bf 100644 --- a/drivers/iommu/riscv/iommu-bits.h +++ b/drivers/iommu/riscv/iommu-bits.h @@ -715,6 +715,13 @@ static inline void riscv_iommu_cmd_inval_vma(struct riscv_iommu_command *cmd) cmd->dword1 = 0; } +static inline void riscv_iommu_cmd_inval_gvma(struct riscv_iommu_command *cmd) +{ + cmd->dword0 = FIELD_PREP(RISCV_IOMMU_CMD_OPCODE, RISCV_IOMMU_CMD_IOTINVAL_OPCODE) | + FIELD_PREP(RISCV_IOMMU_CMD_FUNC, RISCV_IOMMU_CMD_IOTINVAL_FUNC_GVMA); + cmd->dword1 = 0; +} + static inline void riscv_iommu_cmd_inval_set_addr(struct riscv_iommu_command *cmd, u64 addr) { diff --git a/drivers/iommu/riscv/iommu-ir.c b/drivers/iommu/riscv/iommu-ir.c index bed104c5333c..290d91a6c6cd 100644 --- a/drivers/iommu/riscv/iommu-ir.c +++ b/drivers/iommu/riscv/iommu-ir.c @@ -106,6 +106,49 @@ static size_t riscv_iommu_ir_nr_msiptes(struct riscv_iommu_domain *domain) return max_idx + 1; } +static void riscv_iommu_ir_msitbl_inval(struct riscv_iommu_domain *domain, + struct riscv_iommu_msipte *pte) +{ + struct riscv_iommu_bond *bond; + struct riscv_iommu_device *iommu, *prev; + struct riscv_iommu_command cmd; + + riscv_iommu_cmd_inval_gvma(&cmd); + riscv_iommu_cmd_inval_set_gscid(&cmd, 0); + + if (pte) { + u64 addr = pfn_to_phys(FIELD_GET(RISCV_IOMMU_MSIPTE_PPN, pte->pte)); + riscv_iommu_cmd_inval_set_addr(&cmd, addr); + } + + /* Like riscv_iommu_iotlb_inval(), synchronize with riscv_iommu_bond_link() */ + smp_mb(); + + rcu_read_lock(); + + prev = NULL; + list_for_each_entry_rcu(bond, &domain->bonds, list) { + iommu = dev_to_iommu(bond->dev); + if (iommu == prev) + continue; + + riscv_iommu_cmd_send(iommu, &cmd); + prev = iommu; + } + + prev = NULL; + list_for_each_entry_rcu(bond, &domain->bonds, list) { + iommu = dev_to_iommu(bond->dev); + if (iommu == prev) + continue; + + riscv_iommu_cmd_sync(iommu, RISCV_IOMMU_IOTINVAL_TIMEOUT); + prev = iommu; + } + + rcu_read_unlock(); +} + static struct irq_chip riscv_iommu_ir_irq_chip = { .name = "IOMMU-IR", .irq_ack = irq_chip_ack_parent, diff --git a/drivers/iommu/riscv/iommu.c b/drivers/iommu/riscv/iommu.c index 0ba6504d4f33..7418e91d8edd 100644 --- a/drivers/iommu/riscv/iommu.c +++ b/drivers/iommu/riscv/iommu.c @@ -26,12 +26,6 @@ #include "iommu-bits.h" #include "iommu.h" -/* Timeouts in [us] */ -#define RISCV_IOMMU_QCSR_TIMEOUT 150000 -#define RISCV_IOMMU_QUEUE_TIMEOUT 150000 -#define RISCV_IOMMU_DDTP_TIMEOUT 10000000 -#define 
RISCV_IOMMU_IOTINVAL_TIMEOUT 90000000 - /* Number of entries per CMD/FLT queue, should be <= INT_MAX */ #define RISCV_IOMMU_DEF_CQ_COUNT 8192 #define RISCV_IOMMU_DEF_FQ_COUNT 4096 @@ -480,15 +474,15 @@ static irqreturn_t riscv_iommu_cmdq_process(int irq, void *data) } /* Send command to the IOMMU command queue */ -static void riscv_iommu_cmd_send(struct riscv_iommu_device *iommu, - struct riscv_iommu_command *cmd) +void riscv_iommu_cmd_send(struct riscv_iommu_device *iommu, + struct riscv_iommu_command *cmd) { riscv_iommu_queue_send(&iommu->cmdq, cmd, sizeof(*cmd)); } /* Send IOFENCE.C command and wait for all scheduled commands to complete. */ -static void riscv_iommu_cmd_sync(struct riscv_iommu_device *iommu, - unsigned int timeout_us) +void riscv_iommu_cmd_sync(struct riscv_iommu_device *iommu, + unsigned int timeout_us) { struct riscv_iommu_command cmd; unsigned int prod; @@ -804,28 +798,6 @@ static int riscv_iommu_iodir_set_mode(struct riscv_iommu_device *iommu, #define iommu_domain_to_riscv(iommu_domain) \ container_of(iommu_domain, struct riscv_iommu_domain, domain) -/* - * Linkage between an iommu_domain and attached devices. - * - * Protection domain requiring IOATC and DevATC translation cache invalidations, - * should be linked to attached devices using a riscv_iommu_bond structure. - * Devices should be linked to the domain before first use and unlinked after - * the translations from the referenced protection domain can no longer be used. - * Blocking and identity domains are not tracked here, as the IOMMU hardware - * does not cache negative and/or identity (BARE mode) translations, and DevATC - * is disabled for those protection domains. - * - * The device pointer and IOMMU data remain stable in the bond struct after - * _probe_device() where it's attached to the managed IOMMU, up to the - * completion of the _release_device() call. The release of the bond structure - * is synchronized with the device release. - */ -struct riscv_iommu_bond { - struct list_head list; - struct rcu_head rcu; - struct device *dev; -}; - static int riscv_iommu_bond_link(struct riscv_iommu_domain *domain, struct device *dev) { diff --git a/drivers/iommu/riscv/iommu.h b/drivers/iommu/riscv/iommu.h index dc2020b81bbc..1fe35f1210fb 100644 --- a/drivers/iommu/riscv/iommu.h +++ b/drivers/iommu/riscv/iommu.h @@ -17,6 +17,12 @@ #include "iommu-bits.h" +/* Timeouts in [us] */ +#define RISCV_IOMMU_QCSR_TIMEOUT 150000 +#define RISCV_IOMMU_QUEUE_TIMEOUT 150000 +#define RISCV_IOMMU_DDTP_TIMEOUT 10000000 +#define RISCV_IOMMU_IOTINVAL_TIMEOUT 90000000 + /* This struct contains protection domain specific IOMMU driver data. */ struct riscv_iommu_domain { struct iommu_domain domain; @@ -89,10 +95,36 @@ struct riscv_iommu_device { u64 *ddt_root; }; +/* + * Linkage between an iommu_domain and attached devices. + * + * Protection domain requiring IOATC and DevATC translation cache invalidations, + * should be linked to attached devices using a riscv_iommu_bond structure. + * Devices should be linked to the domain before first use and unlinked after + * the translations from the referenced protection domain can no longer be used. + * Blocking and identity domains are not tracked here, as the IOMMU hardware + * does not cache negative and/or identity (BARE mode) translations, and DevATC + * is disabled for those protection domains. 
+ * + * The device pointer and IOMMU data remain stable in the bond struct after + * _probe_device() where it's attached to the managed IOMMU, up to the + * completion of the _release_device() call. The release of the bond structure + * is synchronized with the device release. + */ +struct riscv_iommu_bond { + struct list_head list; + struct rcu_head rcu; + struct device *dev; +}; + int riscv_iommu_init(struct riscv_iommu_device *iommu); void riscv_iommu_remove(struct riscv_iommu_device *iommu); void riscv_iommu_disable(struct riscv_iommu_device *iommu); +void riscv_iommu_cmd_send(struct riscv_iommu_device *iommu, + struct riscv_iommu_command *cmd); +void riscv_iommu_cmd_sync(struct riscv_iommu_device *iommu, unsigned int timeout_us); + struct irq_domain *riscv_iommu_ir_irq_domain_create(struct riscv_iommu_device *iommu, struct device *dev, struct riscv_iommu_info *info); -- 2.49.0 The riscv iommu uses a specific set of bits for PPNs (53:10). Export the translation functions so iommu-ir can use them as well. Signed-off-by: Andrew Jones --- drivers/iommu/riscv/iommu-bits.h | 4 ++++ drivers/iommu/riscv/iommu.c | 14 +++++--------- 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/drivers/iommu/riscv/iommu-bits.h b/drivers/iommu/riscv/iommu-bits.h index d72b982cf9bf..d3d98dbed709 100644 --- a/drivers/iommu/riscv/iommu-bits.h +++ b/drivers/iommu/riscv/iommu-bits.h @@ -36,6 +36,10 @@ #define RISCV_IOMMU_ATP_PPN_FIELD GENMASK_ULL(43, 0) #define RISCV_IOMMU_ATP_MODE_FIELD GENMASK_ULL(63, 60) +/* RISC-V IOMMU PPN <> PHYS address conversions, PHYS <=> PPN[53:10] */ +#define riscv_iommu_phys_to_ppn(pa) (((pa) >> 2) & (((1ULL << 44) - 1) << 10)) +#define riscv_iommu_ppn_to_phys(pn) (((pn) << 2) & (((1ULL << 44) - 1) << 12)) + /* 5.3 IOMMU Capabilities (64bits) */ #define RISCV_IOMMU_REG_CAPABILITIES 0x0000 #define RISCV_IOMMU_CAPABILITIES_VERSION GENMASK_ULL(7, 0) diff --git a/drivers/iommu/riscv/iommu.c b/drivers/iommu/riscv/iommu.c index 7418e91d8edd..440c3eb6f15a 100644 --- a/drivers/iommu/riscv/iommu.c +++ b/drivers/iommu/riscv/iommu.c @@ -30,10 +30,6 @@ #define RISCV_IOMMU_DEF_CQ_COUNT 8192 #define RISCV_IOMMU_DEF_FQ_COUNT 4096 -/* RISC-V IOMMU PPN <> PHYS address conversions, PHYS <=> PPN[53:10] */ -#define phys_to_ppn(pa) (((pa) >> 2) & (((1ULL << 44) - 1) << 10)) -#define ppn_to_phys(pn) (((pn) << 2) & (((1ULL << 44) - 1) << 12)) - /* IOMMU PSCID allocation namespace. 
*/ static DEFINE_IDA(riscv_iommu_pscids); #define RISCV_IOMMU_MAX_PSCID (BIT(20) - 1) @@ -165,7 +161,7 @@ static int riscv_iommu_queue_alloc(struct riscv_iommu_device *iommu, if (!queue->base) return -ENOMEM; - qb = phys_to_ppn(queue->phys) | + qb = riscv_iommu_phys_to_ppn(queue->phys) | FIELD_PREP(RISCV_IOMMU_QUEUE_LOG2SZ_FIELD, logsz); /* Update base register and read back to verify hw accepted our write */ @@ -608,7 +604,7 @@ static struct riscv_iommu_dc *riscv_iommu_get_dc(struct riscv_iommu_device *iomm do { ddt = READ_ONCE(*(unsigned long *)ddtp); if (ddt & RISCV_IOMMU_DDTE_V) { - ddtp = __va(ppn_to_phys(ddt)); + ddtp = __va(riscv_iommu_ppn_to_phys(ddt)); break; } @@ -616,7 +612,7 @@ static struct riscv_iommu_dc *riscv_iommu_get_dc(struct riscv_iommu_device *iomm if (!ptr) return NULL; - new = phys_to_ppn(__pa(ptr)) | RISCV_IOMMU_DDTE_V; + new = riscv_iommu_phys_to_ppn(__pa(ptr)) | RISCV_IOMMU_DDTE_V; old = cmpxchg_relaxed((unsigned long *)ddtp, ddt, new); if (old == ddt) { @@ -683,7 +679,7 @@ static int riscv_iommu_iodir_alloc(struct riscv_iommu_device *iommu) if (ddtp & RISCV_IOMMU_DDTP_BUSY) return -EBUSY; - iommu->ddt_phys = ppn_to_phys(ddtp); + iommu->ddt_phys = riscv_iommu_ppn_to_phys(ddtp); if (iommu->ddt_phys) iommu->ddt_root = devm_ioremap(iommu->dev, iommu->ddt_phys, PAGE_SIZE); @@ -730,7 +726,7 @@ static int riscv_iommu_iodir_set_mode(struct riscv_iommu_device *iommu, do { rq_ddtp = FIELD_PREP(RISCV_IOMMU_DDTP_IOMMU_MODE, rq_mode); if (rq_mode > RISCV_IOMMU_DDTP_IOMMU_MODE_BARE) - rq_ddtp |= phys_to_ppn(iommu->ddt_phys); + rq_ddtp |= riscv_iommu_phys_to_ppn(iommu->ddt_phys); riscv_iommu_writeq(iommu, RISCV_IOMMU_REG_DDTP, rq_ddtp); ddtp = riscv_iommu_read_ddtp(iommu); -- 2.49.0 When setting irq affinity extract the IMSIC address the device needs to access and add it to the MSI table. If the device no longer needs access to an IMSIC then remove it from the table to prohibit access. This allows isolating device MSIs to a set of harts so we can now add the IRQ_DOMAIN_FLAG_ISOLATED_MSI IRQ domain flag. Signed-off-by: Andrew Jones --- drivers/iommu/riscv/iommu-ir.c | 143 ++++++++++++++++++++++++++++++++- drivers/iommu/riscv/iommu.h | 2 + 2 files changed, 143 insertions(+), 2 deletions(-) diff --git a/drivers/iommu/riscv/iommu-ir.c b/drivers/iommu/riscv/iommu-ir.c index 290d91a6c6cd..b97768cac4be 100644 --- a/drivers/iommu/riscv/iommu-ir.c +++ b/drivers/iommu/riscv/iommu-ir.c @@ -4,6 +4,7 @@ * * Copyright © 2025 Ventana Micro Systems Inc. 
*/ +#include #include #include #include @@ -106,6 +107,20 @@ static size_t riscv_iommu_ir_nr_msiptes(struct riscv_iommu_domain *domain) return max_idx + 1; } +static void riscv_iommu_ir_set_pte(struct riscv_iommu_msipte *pte, u64 addr) +{ + pte->pte = FIELD_PREP(RISCV_IOMMU_MSIPTE_M, 3) | + riscv_iommu_phys_to_ppn(addr) | + FIELD_PREP(RISCV_IOMMU_MSIPTE_V, 1); + pte->mrif_info = 0; +} + +static void riscv_iommu_ir_clear_pte(struct riscv_iommu_msipte *pte) +{ + pte->pte = 0; + pte->mrif_info = 0; +} + static void riscv_iommu_ir_msitbl_inval(struct riscv_iommu_domain *domain, struct riscv_iommu_msipte *pte) { @@ -149,19 +164,99 @@ static void riscv_iommu_ir_msitbl_inval(struct riscv_iommu_domain *domain, rcu_read_unlock(); } +static void riscv_iommu_ir_msitbl_map(struct riscv_iommu_domain *domain, size_t idx, + phys_addr_t addr) +{ + struct riscv_iommu_msipte *pte; + + if (!domain->msi_root) + return; + + if (!refcount_inc_not_zero(&domain->msi_pte_counts[idx])) { + scoped_guard(raw_spinlock_irqsave, &domain->msi_lock) { + if (refcount_read(&domain->msi_pte_counts[idx]) == 0) { + pte = &domain->msi_root[idx]; + riscv_iommu_ir_set_pte(pte, addr); + riscv_iommu_ir_msitbl_inval(domain, pte); + refcount_set(&domain->msi_pte_counts[idx], 1); + } else { + refcount_inc(&domain->msi_pte_counts[idx]); + } + } + } +} + +static void riscv_iommu_ir_msitbl_unmap(struct riscv_iommu_domain *domain, size_t idx) +{ + struct riscv_iommu_msipte *pte; + + if (!domain->msi_root) + return; + + scoped_guard(raw_spinlock_irqsave, &domain->msi_lock) { + if (refcount_dec_and_test(&domain->msi_pte_counts[idx])) { + pte = &domain->msi_root[idx]; + riscv_iommu_ir_clear_pte(pte); + riscv_iommu_ir_msitbl_inval(domain, pte); + } + } +} + +static size_t riscv_iommu_ir_get_msipte_idx_from_target(struct riscv_iommu_domain *domain, + struct irq_data *data, phys_addr_t *addr) +{ + struct msi_msg msg; + + BUG_ON(irq_chip_compose_msi_msg(data, &msg)); + + *addr = ((phys_addr_t)msg.address_hi << 32) | msg.address_lo; + + return riscv_iommu_ir_compute_msipte_idx(domain, *addr); +} + +static int riscv_iommu_ir_irq_set_affinity(struct irq_data *data, + const struct cpumask *dest, bool force) +{ + struct riscv_iommu_info *info = data->domain->host_data; + struct riscv_iommu_domain *domain = info->domain; + phys_addr_t old_addr, new_addr; + size_t old_idx, new_idx; + int ret; + + old_idx = riscv_iommu_ir_get_msipte_idx_from_target(domain, data, &old_addr); + + ret = irq_chip_set_affinity_parent(data, dest, force); + if (ret < 0) + return ret; + + new_idx = riscv_iommu_ir_get_msipte_idx_from_target(domain, data, &new_addr); + + if (new_idx == old_idx) + return ret; + + riscv_iommu_ir_msitbl_unmap(domain, old_idx); + riscv_iommu_ir_msitbl_map(domain, new_idx, new_addr); + + return ret; +} + static struct irq_chip riscv_iommu_ir_irq_chip = { .name = "IOMMU-IR", .irq_ack = irq_chip_ack_parent, .irq_mask = irq_chip_mask_parent, .irq_unmask = irq_chip_unmask_parent, - .irq_set_affinity = irq_chip_set_affinity_parent, + .irq_set_affinity = riscv_iommu_ir_irq_set_affinity, }; static int riscv_iommu_ir_irq_domain_alloc_irqs(struct irq_domain *irqdomain, unsigned int irq_base, unsigned int nr_irqs, void *arg) { + struct riscv_iommu_info *info = irqdomain->host_data; + struct riscv_iommu_domain *domain = info->domain; struct irq_data *data; + phys_addr_t addr; + size_t idx; int i, ret; ret = irq_domain_alloc_irqs_parent(irqdomain, irq_base, nr_irqs, arg); @@ -171,14 +266,36 @@ static int riscv_iommu_ir_irq_domain_alloc_irqs(struct irq_domain 
*irqdomain, for (i = 0; i < nr_irqs; i++) { data = irq_domain_get_irq_data(irqdomain, irq_base + i); data->chip = &riscv_iommu_ir_irq_chip; + idx = riscv_iommu_ir_get_msipte_idx_from_target(domain, data, &addr); + riscv_iommu_ir_msitbl_map(domain, idx, addr); } return 0; } +static void riscv_iommu_ir_irq_domain_free_irqs(struct irq_domain *irqdomain, + unsigned int irq_base, + unsigned int nr_irqs) +{ + struct riscv_iommu_info *info = irqdomain->host_data; + struct riscv_iommu_domain *domain = info->domain; + struct irq_data *data; + phys_addr_t addr; + size_t idx; + int i; + + for (i = 0; i < nr_irqs; i++) { + data = irq_domain_get_irq_data(irqdomain, irq_base + i); + idx = riscv_iommu_ir_get_msipte_idx_from_target(domain, data, &addr); + riscv_iommu_ir_msitbl_unmap(domain, idx); + } + + irq_domain_free_irqs_parent(irqdomain, irq_base, nr_irqs); +} + static const struct irq_domain_ops riscv_iommu_ir_irq_domain_ops = { .alloc = riscv_iommu_ir_irq_domain_alloc_irqs, - .free = irq_domain_free_irqs_parent, + .free = riscv_iommu_ir_irq_domain_free_irqs, }; static const struct msi_parent_ops riscv_iommu_ir_msi_parent_ops = { @@ -221,6 +338,19 @@ struct irq_domain *riscv_iommu_ir_irq_domain_create(struct riscv_iommu_device *i return NULL; } + if (iommu->caps & RISCV_IOMMU_CAPABILITIES_MSI_FLAT) { + /* + * NOTE: The RISC-V IOMMU doesn't actually support isolated MSI because + * there is no MSI message validation (see the comment above + * msi_device_has_isolated_msi()). However, we claim isolated MSI here + * because applying the IOMMU ensures MSI messages may only be delivered + * to the mapped MSI addresses. This allows MSIs to be isolated to + * particular harts/vcpus where the unvalidated MSI messages can be + * tolerated. + */ + irqdomain->flags |= IRQ_DOMAIN_FLAG_ISOLATED_MSI; + } + irqdomain->flags |= IRQ_DOMAIN_FLAG_MSI_PARENT; irqdomain->msi_parent_ops = &riscv_iommu_ir_msi_parent_ops; irq_domain_update_bus_token(irqdomain, DOMAIN_BUS_MSI_REMAP); @@ -233,6 +363,7 @@ struct irq_domain *riscv_iommu_ir_irq_domain_create(struct riscv_iommu_device *i static void riscv_iommu_ir_free_msi_table(struct riscv_iommu_domain *domain) { iommu_free_pages(domain->msi_root); + kfree(domain->msi_pte_counts); } void riscv_iommu_ir_irq_domain_remove(struct riscv_iommu_info *info) @@ -274,6 +405,14 @@ static int riscv_ir_set_imsic_global_config(struct riscv_iommu_device *iommu, nr_ptes * sizeof(*domain->msi_root)); if (!domain->msi_root) return -ENOMEM; + + domain->msi_pte_counts = kcalloc(nr_ptes, sizeof(refcount_t), GFP_KERNEL_ACCOUNT); + if (!domain->msi_pte_counts) { + iommu_free_pages(domain->msi_root); + return -ENOMEM; + } + + raw_spin_lock_init(&domain->msi_lock); } return 0; diff --git a/drivers/iommu/riscv/iommu.h b/drivers/iommu/riscv/iommu.h index 1fe35f1210fb..aeb5642f003c 100644 --- a/drivers/iommu/riscv/iommu.h +++ b/drivers/iommu/riscv/iommu.h @@ -34,6 +34,8 @@ struct riscv_iommu_domain { unsigned int pgd_mode; unsigned long *pgd_root; struct riscv_iommu_msipte *msi_root; + refcount_t *msi_pte_counts; + raw_spinlock_t msi_lock; u64 msi_addr_mask; u64 msi_addr_pattern; u32 group_index_bits; -- 2.49.0 From: Tomasz Jeznach With iommu/riscv driver available we can enable IOMMU_DMA support for RISC-V architecture. 
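With IOMMU_DMA enabled, the streaming DMA API is transparently backed
by IOMMU mappings, so existing drivers need no changes. As a minimal
sketch (device and buffer names are illustrative), an ordinary mapping
like the one below now hands the device an IOVA translated by the
RISC-V IOMMU rather than a raw physical address:

  dma_addr_t iova = dma_map_single(dev, buf, len, DMA_TO_DEVICE);

  if (dma_mapping_error(dev, iova))
          return -ENOMEM;
  /* ... device performs DMA to "iova" ... */
  dma_unmap_single(dev, iova, len, DMA_TO_DEVICE);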
Signed-off-by: Tomasz Jeznach Signed-off-by: Andrew Jones --- drivers/iommu/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/iommu/Kconfig b/drivers/iommu/Kconfig index 70d29b14d851..9d8c90690275 100644 --- a/drivers/iommu/Kconfig +++ b/drivers/iommu/Kconfig @@ -150,7 +150,7 @@ config OF_IOMMU # IOMMU-agnostic DMA-mapping layer config IOMMU_DMA - def_bool ARM64 || X86 || S390 + def_bool ARM64 || X86 || S390 || RISCV select DMA_OPS_HELPERS select IOMMU_API select IOMMU_IOVA -- 2.49.0 The vcpu_info parameter to irq_set_vcpu_affinity() effectively defines an arch specific IOMMU <=> hypervisor protocol. Provide a definition for the RISCV IOMMU. Signed-off-by: Andrew Jones --- arch/riscv/include/asm/irq.h | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/arch/riscv/include/asm/irq.h b/arch/riscv/include/asm/irq.h index 59c975f750c9..27ff169d1b77 100644 --- a/arch/riscv/include/asm/irq.h +++ b/arch/riscv/include/asm/irq.h @@ -25,6 +25,15 @@ struct fwnode_handle *riscv_get_intc_hwnode(void); int riscv_get_hart_index(struct fwnode_handle *fwnode, u32 logical_index, u32 *hart_index); +struct riscv_iommu_ir_vcpu_info { + u64 gpa; + u64 hpa; + u64 msi_addr_mask; + u64 msi_addr_pattern; + u32 group_index_bits; + u32 group_index_shift; +}; + #ifdef CONFIG_ACPI enum riscv_irqchip_type { -- 2.49.0 Track each IRQ's MSI table index in the IRQ's chip data of the IR irqdomain along with a generation number. This will be necessary when support for irq-set-vcpu-affinity is added as the msitbl configuration will change to match the guest. When a configuration changes then it may no longer be possible to compute the index from the target address, hence the need to stash it. Also, if an allocated IRQ is not mapped with irq-set-vcpu-affinity after a configuration change (which will unmap everything), then we need to avoid attempting to unmap it at free-irqs time. 
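As a sketch of the scheme this patch implements (see the diff below for
the actual code), the per-IRQ chip data pairs the MSI table index with
the msitbl generation ("config") it belongs to, and the free path only
unmaps entries of the live generation:

  struct riscv_iommu_ir_chip_data {
          size_t idx;     /* MSI table index this IRQ last mapped */
          u32 config;     /* msitbl generation the index belongs to */
  };

  /* At free-irqs time; a config change already unmapped older entries. */
  if (chip_data->config == domain->msitbl_config)
          riscv_iommu_ir_msitbl_unmap(domain, data, chip_data->idx);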
Signed-off-by: Andrew Jones --- drivers/iommu/riscv/iommu-ir.c | 75 +++++++++++++++++++++++++++++----- drivers/iommu/riscv/iommu.h | 1 + 2 files changed, 65 insertions(+), 11 deletions(-) diff --git a/drivers/iommu/riscv/iommu-ir.c b/drivers/iommu/riscv/iommu-ir.c index b97768cac4be..059671f18267 100644 --- a/drivers/iommu/riscv/iommu-ir.c +++ b/drivers/iommu/riscv/iommu-ir.c @@ -164,11 +164,42 @@ static void riscv_iommu_ir_msitbl_inval(struct riscv_iommu_domain *domain, rcu_read_unlock(); } -static void riscv_iommu_ir_msitbl_map(struct riscv_iommu_domain *domain, size_t idx, - phys_addr_t addr) +struct riscv_iommu_ir_chip_data { + size_t idx; + u32 config; +}; + +static size_t riscv_iommu_ir_irq_msitbl_idx(struct irq_data *data) +{ + struct riscv_iommu_ir_chip_data *chip_data = irq_data_get_irq_chip_data(data); + + return chip_data->idx; +} + +static u32 riscv_iommu_ir_irq_msitbl_config(struct irq_data *data) +{ + struct riscv_iommu_ir_chip_data *chip_data = irq_data_get_irq_chip_data(data); + + return chip_data->config; +} + +static void riscv_iommu_ir_irq_set_msitbl_info(struct irq_data *data, + size_t idx, u32 config) +{ + struct riscv_iommu_ir_chip_data *chip_data = irq_data_get_irq_chip_data(data); + + chip_data->idx = idx; + chip_data->config = config; +} + +static void riscv_iommu_ir_msitbl_map(struct riscv_iommu_domain *domain, + struct irq_data *data, + size_t idx, phys_addr_t addr) { struct riscv_iommu_msipte *pte; + riscv_iommu_ir_irq_set_msitbl_info(data, idx, domain->msitbl_config); + if (!domain->msi_root) return; @@ -186,9 +217,17 @@ static void riscv_iommu_ir_msitbl_map(struct riscv_iommu_domain *domain, size_t } } -static void riscv_iommu_ir_msitbl_unmap(struct riscv_iommu_domain *domain, size_t idx) +static void riscv_iommu_ir_msitbl_unmap(struct riscv_iommu_domain *domain, + struct irq_data *data, size_t idx) { struct riscv_iommu_msipte *pte; + u32 config; + + config = riscv_iommu_ir_irq_msitbl_config(data); + riscv_iommu_ir_irq_set_msitbl_info(data, -1, -1); + + if (WARN_ON_ONCE(config != domain->msitbl_config)) + return; if (!domain->msi_root) return; @@ -219,11 +258,11 @@ static int riscv_iommu_ir_irq_set_affinity(struct irq_data *data, { struct riscv_iommu_info *info = data->domain->host_data; struct riscv_iommu_domain *domain = info->domain; - phys_addr_t old_addr, new_addr; size_t old_idx, new_idx; + phys_addr_t new_addr; int ret; - old_idx = riscv_iommu_ir_get_msipte_idx_from_target(domain, data, &old_addr); + old_idx = riscv_iommu_ir_irq_msitbl_idx(data); ret = irq_chip_set_affinity_parent(data, dest, force); if (ret < 0) @@ -234,8 +273,8 @@ static int riscv_iommu_ir_irq_set_affinity(struct irq_data *data, if (new_idx == old_idx) return ret; - riscv_iommu_ir_msitbl_unmap(domain, old_idx); - riscv_iommu_ir_msitbl_map(domain, new_idx, new_addr); + riscv_iommu_ir_msitbl_unmap(domain, data, old_idx); + riscv_iommu_ir_msitbl_map(domain, data, new_idx, new_addr); return ret; } @@ -254,11 +293,16 @@ static int riscv_iommu_ir_irq_domain_alloc_irqs(struct irq_domain *irqdomain, { struct riscv_iommu_info *info = irqdomain->host_data; struct riscv_iommu_domain *domain = info->domain; + struct riscv_iommu_ir_chip_data *chip_data; struct irq_data *data; phys_addr_t addr; size_t idx; int i, ret; + chip_data = kzalloc(sizeof(*chip_data), GFP_KERNEL_ACCOUNT); + if (!chip_data) + return -ENOMEM; + ret = irq_domain_alloc_irqs_parent(irqdomain, irq_base, nr_irqs, arg); if (ret) return ret; @@ -266,8 +310,9 @@ static int riscv_iommu_ir_irq_domain_alloc_irqs(struct irq_domain 
*irqdomain, for (i = 0; i < nr_irqs; i++) { data = irq_domain_get_irq_data(irqdomain, irq_base + i); data->chip = &riscv_iommu_ir_irq_chip; + data->chip_data = chip_data; idx = riscv_iommu_ir_get_msipte_idx_from_target(domain, data, &addr); - riscv_iommu_ir_msitbl_map(domain, idx, addr); + riscv_iommu_ir_msitbl_map(domain, data, idx, addr); } return 0; @@ -280,14 +325,22 @@ static void riscv_iommu_ir_irq_domain_free_irqs(struct irq_domain *irqdomain, struct riscv_iommu_info *info = irqdomain->host_data; struct riscv_iommu_domain *domain = info->domain; struct irq_data *data; - phys_addr_t addr; + u32 config; size_t idx; int i; for (i = 0; i < nr_irqs; i++) { data = irq_domain_get_irq_data(irqdomain, irq_base + i); - idx = riscv_iommu_ir_get_msipte_idx_from_target(domain, data, &addr); - riscv_iommu_ir_msitbl_unmap(domain, idx); + config = riscv_iommu_ir_irq_msitbl_config(data); + /* + * Only irqs with matching config versions need to be unmapped here + * since config changes will unmap everything. + */ + if (config == domain->msitbl_config) { + idx = riscv_iommu_ir_irq_msitbl_idx(data); + riscv_iommu_ir_msitbl_unmap(domain, data, idx); + } + kfree(data->chip_data); } irq_domain_free_irqs_parent(irqdomain, irq_base, nr_irqs); diff --git a/drivers/iommu/riscv/iommu.h b/drivers/iommu/riscv/iommu.h index aeb5642f003c..130f82e8392a 100644 --- a/drivers/iommu/riscv/iommu.h +++ b/drivers/iommu/riscv/iommu.h @@ -36,6 +36,7 @@ struct riscv_iommu_domain { struct riscv_iommu_msipte *msi_root; refcount_t *msi_pte_counts; raw_spinlock_t msi_lock; + u32 msitbl_config; u64 msi_addr_mask; u64 msi_addr_pattern; u32 group_index_bits; -- 2.49.0 Implement irq_set_vcpu_affinity() in the RISCV IOMMU driver. irq_set_vcpu_affinity() is the channel from a hypervisor to the IOMMU needed to ensure that assigned devices which direct MSIs to guest IMSIC addresses will have those MSI writes redirected to their corresponding guest interrupt files. 
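For reference, a hypervisor-side caller is expected to describe the
guest IMSIC layout with a struct riscv_iommu_ir_vcpu_info; the variable
names below are illustrative only:

  struct riscv_iommu_ir_vcpu_info vcpu_info = {
          .gpa = imsic_gpa,               /* guest-programmed MSI target */
          .hpa = guest_file_hpa,          /* backing guest interrupt file */
          .msi_addr_mask = msi_addr_mask,
          .msi_addr_pattern = imsic_gpa_base >> 12,
          .group_index_bits = nr_group_bits,
          .group_index_shift = nr_group_shift,
  };

  ret = irq_set_vcpu_affinity(host_irq, &vcpu_info);

Passing a NULL vcpu_info instead tears down the remapping for that IRQ.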
Signed-off-by: Andrew Jones --- drivers/iommu/riscv/iommu-ir.c | 165 ++++++++++++++++++++++++++++++++- drivers/iommu/riscv/iommu.c | 5 +- drivers/iommu/riscv/iommu.h | 4 + 3 files changed, 171 insertions(+), 3 deletions(-) diff --git a/drivers/iommu/riscv/iommu-ir.c b/drivers/iommu/riscv/iommu-ir.c index 059671f18267..48f424ce1a8d 100644 --- a/drivers/iommu/riscv/iommu-ir.c +++ b/drivers/iommu/riscv/iommu-ir.c @@ -10,6 +10,8 @@ #include #include +#include + #include "../iommu-pages.h" #include "iommu.h" @@ -164,6 +166,48 @@ static void riscv_iommu_ir_msitbl_inval(struct riscv_iommu_domain *domain, rcu_read_unlock(); } +static void riscv_iommu_ir_msitbl_clear(struct riscv_iommu_domain *domain) +{ + for (size_t i = 0; i < riscv_iommu_ir_nr_msiptes(domain); i++) { + riscv_iommu_ir_clear_pte(&domain->msi_root[i]); + refcount_set(&domain->msi_pte_counts[i], 0); + } +} + +static void riscv_iommu_ir_msiptp_update(struct riscv_iommu_domain *domain) +{ + struct riscv_iommu_bond *bond; + struct riscv_iommu_device *iommu, *prev; + struct riscv_iommu_dc new_dc = { + .ta = FIELD_PREP(RISCV_IOMMU_PC_TA_PSCID, domain->pscid) | + RISCV_IOMMU_PC_TA_V, + .fsc = FIELD_PREP(RISCV_IOMMU_PC_FSC_MODE, domain->pgd_mode) | + FIELD_PREP(RISCV_IOMMU_PC_FSC_PPN, virt_to_pfn(domain->pgd_root)), + .msiptp = virt_to_pfn(domain->msi_root) | + FIELD_PREP(RISCV_IOMMU_DC_MSIPTP_MODE, + RISCV_IOMMU_DC_MSIPTP_MODE_FLAT), + .msi_addr_mask = domain->msi_addr_mask, + .msi_addr_pattern = domain->msi_addr_pattern, + }; + + /* Like riscv_iommu_ir_msitbl_inval(), synchronize with riscv_iommu_bond_link() */ + smp_mb(); + + rcu_read_lock(); + + prev = NULL; + list_for_each_entry_rcu(bond, &domain->bonds, list) { + iommu = dev_to_iommu(bond->dev); + if (iommu == prev) + continue; + + riscv_iommu_iodir_update(iommu, bond->dev, &new_dc); + prev = iommu; + } + + rcu_read_unlock(); +} + struct riscv_iommu_ir_chip_data { size_t idx; u32 config; @@ -279,12 +323,127 @@ static int riscv_iommu_ir_irq_set_affinity(struct irq_data *data, return ret; } +static bool riscv_iommu_ir_vcpu_check_config(struct riscv_iommu_domain *domain, + struct riscv_iommu_ir_vcpu_info *vcpu_info) +{ + return domain->msi_addr_mask == vcpu_info->msi_addr_mask && + domain->msi_addr_pattern == vcpu_info->msi_addr_pattern && + domain->group_index_bits == vcpu_info->group_index_bits && + domain->group_index_shift == vcpu_info->group_index_shift; +} + +static int riscv_iommu_ir_vcpu_new_config(struct riscv_iommu_domain *domain, + struct irq_data *data, + struct riscv_iommu_ir_vcpu_info *vcpu_info) +{ + struct riscv_iommu_msipte *pte; + size_t idx; + int ret; + + if (domain->pgd_mode) + riscv_iommu_ir_unmap_imsics(domain); + + riscv_iommu_ir_msitbl_clear(domain); + + domain->msi_addr_mask = vcpu_info->msi_addr_mask; + domain->msi_addr_pattern = vcpu_info->msi_addr_pattern; + domain->group_index_bits = vcpu_info->group_index_bits; + domain->group_index_shift = vcpu_info->group_index_shift; + domain->imsic_stride = SZ_4K; + domain->msitbl_config += 1; + + if (domain->pgd_mode) { + /* + * As in riscv_iommu_ir_irq_domain_create(), we do all stage1 + * mappings up front since the MSI table will manage the + * translations. + * + * XXX: Since irq-set-vcpu-affinity is called in atomic context + * we need GFP_ATOMIC. 
If the number of 4K dma pte allocations + * is considered too many for GFP_ATOMIC, then we can wrap + * riscv_iommu_pte_alloc()'s iommu_alloc_pages_node_sz() call + * in a mempool and try to ensure the pool has enough elements + * in riscv_iommu_ir_irq_domain_enable_msis(). + */ + ret = riscv_iommu_ir_map_imsics(domain, GFP_ATOMIC); + if (ret) + return ret; + } + + idx = riscv_iommu_ir_compute_msipte_idx(domain, vcpu_info->gpa); + pte = &domain->msi_root[idx]; + riscv_iommu_ir_irq_set_msitbl_info(data, idx, domain->msitbl_config); + riscv_iommu_ir_set_pte(pte, vcpu_info->hpa); + riscv_iommu_ir_msitbl_inval(domain, NULL); + refcount_set(&domain->msi_pte_counts[idx], 1); + + riscv_iommu_ir_msiptp_update(domain); + + return 0; +} + +static int riscv_iommu_ir_irq_set_vcpu_affinity(struct irq_data *data, void *arg) +{ + struct riscv_iommu_info *info = data->domain->host_data; + struct riscv_iommu_domain *domain = info->domain; + struct riscv_iommu_ir_vcpu_info *vcpu_info = arg; + struct riscv_iommu_msipte pteval; + struct riscv_iommu_msipte *pte; + bool inc = false, dec = false; + size_t old_idx, new_idx; + u32 old_config; + + if (!domain->msi_root) + return -EOPNOTSUPP; + + old_idx = riscv_iommu_ir_irq_msitbl_idx(data); + old_config = riscv_iommu_ir_irq_msitbl_config(data); + + if (!vcpu_info) { + riscv_iommu_ir_msitbl_unmap(domain, data, old_idx); + return 0; + } + + guard(raw_spinlock)(&domain->msi_lock); + + if (!riscv_iommu_ir_vcpu_check_config(domain, vcpu_info)) + return riscv_iommu_ir_vcpu_new_config(domain, data, vcpu_info); + + new_idx = riscv_iommu_ir_compute_msipte_idx(domain, vcpu_info->gpa); + riscv_iommu_ir_irq_set_msitbl_info(data, new_idx, domain->msitbl_config); + + pte = &domain->msi_root[new_idx]; + riscv_iommu_ir_set_pte(&pteval, vcpu_info->hpa); + + if (pteval.pte != pte->pte) { + *pte = pteval; + riscv_iommu_ir_msitbl_inval(domain, pte); + } + + if (old_config != domain->msitbl_config) + inc = true; + else if (new_idx != old_idx) + inc = dec = true; + + if (dec && refcount_dec_and_test(&domain->msi_pte_counts[old_idx])) { + pte = &domain->msi_root[old_idx]; + riscv_iommu_ir_clear_pte(pte); + riscv_iommu_ir_msitbl_inval(domain, pte); + } + + if (inc && !refcount_inc_not_zero(&domain->msi_pte_counts[new_idx])) + refcount_set(&domain->msi_pte_counts[new_idx], 1); + + return 0; +} + static struct irq_chip riscv_iommu_ir_irq_chip = { .name = "IOMMU-IR", .irq_ack = irq_chip_ack_parent, .irq_mask = irq_chip_mask_parent, .irq_unmask = irq_chip_unmask_parent, .irq_set_affinity = riscv_iommu_ir_irq_set_affinity, + .irq_set_vcpu_affinity = riscv_iommu_ir_irq_set_vcpu_affinity, }; static int riscv_iommu_ir_irq_domain_alloc_irqs(struct irq_domain *irqdomain, @@ -334,7 +493,11 @@ static void riscv_iommu_ir_irq_domain_free_irqs(struct irq_domain *irqdomain, config = riscv_iommu_ir_irq_msitbl_config(data); /* * Only irqs with matching config versions need to be unmapped here - * since config changes will unmap everything. + * since config changes will unmap everything and irq-set-vcpu-affinity + * irq deletions unmap at deletion time. An example of stale indices that + * don't need to be unmapped are those of irqs allocated by VFIO that a + * guest driver never used. The config change made for the guest will have + * already unmapped those, though, so there's no need to unmap them here. 
*/ if (config == domain->msitbl_config) { idx = riscv_iommu_ir_irq_msitbl_idx(data); diff --git a/drivers/iommu/riscv/iommu.c b/drivers/iommu/riscv/iommu.c index 440c3eb6f15a..02f38aa0b231 100644 --- a/drivers/iommu/riscv/iommu.c +++ b/drivers/iommu/riscv/iommu.c @@ -957,8 +957,9 @@ static void riscv_iommu_iotlb_inval(struct riscv_iommu_domain *domain, * device is not quiesced might be disruptive, potentially causing * interim translation faults. */ -static void riscv_iommu_iodir_update(struct riscv_iommu_device *iommu, - struct device *dev, struct riscv_iommu_dc *new_dc) +void riscv_iommu_iodir_update(struct riscv_iommu_device *iommu, + struct device *dev, + struct riscv_iommu_dc *new_dc) { struct iommu_fwspec *fwspec = dev_iommu_fwspec_get(dev); struct riscv_iommu_dc *dc; diff --git a/drivers/iommu/riscv/iommu.h b/drivers/iommu/riscv/iommu.h index 130f82e8392a..5ab2b4d6ee88 100644 --- a/drivers/iommu/riscv/iommu.h +++ b/drivers/iommu/riscv/iommu.h @@ -124,6 +124,10 @@ int riscv_iommu_init(struct riscv_iommu_device *iommu); void riscv_iommu_remove(struct riscv_iommu_device *iommu); void riscv_iommu_disable(struct riscv_iommu_device *iommu); +void riscv_iommu_iodir_update(struct riscv_iommu_device *iommu, + struct device *dev, + struct riscv_iommu_dc *new_dc); + void riscv_iommu_cmd_send(struct riscv_iommu_device *iommu, struct riscv_iommu_command *cmd); void riscv_iommu_cmd_sync(struct riscv_iommu_device *iommu, unsigned int timeout_us); -- 2.49.0 From: Tomasz Jeznach Report RISC-V IOMMU capability required by the VFIO subsystem to enable PCIe device assignment. Signed-off-by: Tomasz Jeznach Signed-off-by: Andrew Jones --- drivers/iommu/riscv/iommu.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/drivers/iommu/riscv/iommu.c b/drivers/iommu/riscv/iommu.c index 02f38aa0b231..5a0dd99f07d0 100644 --- a/drivers/iommu/riscv/iommu.c +++ b/drivers/iommu/riscv/iommu.c @@ -1452,6 +1452,17 @@ static struct iommu_group *riscv_iommu_device_group(struct device *dev) return generic_device_group(dev); } +static bool riscv_iommu_capable(struct device *dev, enum iommu_cap cap) +{ + switch (cap) { + case IOMMU_CAP_CACHE_COHERENCY: + /* The RISC-V IOMMU is always DMA cache coherent. */ + return true; + default: + return false; + } +} + static int riscv_iommu_of_xlate(struct device *dev, const struct of_phandle_args *args) { return iommu_fwspec_add_ids(dev, args->args, 1); @@ -1531,6 +1542,7 @@ static void riscv_iommu_release_device(struct device *dev) static const struct iommu_ops riscv_iommu_ops = { .of_xlate = riscv_iommu_of_xlate, + .capable = riscv_iommu_capable, .identity_domain = &riscv_iommu_identity_domain, .blocked_domain = &riscv_iommu_blocking_domain, .release_domain = &riscv_iommu_blocking_domain, -- 2.49.0 From: Tomasz Jeznach Enable KVM/VFIO support on RISC-V architecture. Signed-off-by: Tomasz Jeznach Signed-off-by: Andrew Jones --- arch/riscv/kvm/Kconfig | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/riscv/kvm/Kconfig b/arch/riscv/kvm/Kconfig index 5a62091b0809..968a33ab23b8 100644 --- a/arch/riscv/kvm/Kconfig +++ b/arch/riscv/kvm/Kconfig @@ -30,10 +30,12 @@ config KVM select KVM_GENERIC_DIRTYLOG_READ_PROTECT select KVM_GENERIC_HARDWARE_ENABLING select KVM_MMIO + select KVM_VFIO select KVM_XFER_TO_GUEST_WORK select KVM_GENERIC_MMU_NOTIFIER select SCHED_INFO select GUEST_PERF_EVENTS if PERF_EVENTS + select SRCU help Support hosting virtualized guest machines. 
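KVM_VFIO, selected above, provides the kvm-vfio pseudo device through which userspace tells KVM which VFIO files are attached to a VM. A minimal sketch of that wiring using the generic KVM UAPI follows; treat it as illustrative rather than part of this series, and note that KVM_DEV_VFIO_FILE is the current spelling of the attribute group (older kernels call it KVM_DEV_VFIO_GROUP), while kvm_link_vfio() is a hypothetical helper name.

#include <sys/ioctl.h>
#include <linux/kvm.h>

/* Create the VM's kvm-vfio device and register a VFIO device fd with it. */
static int kvm_link_vfio(int vm_fd, int vfio_device_fd)
{
	struct kvm_create_device cd = { .type = KVM_DEV_TYPE_VFIO };
	struct kvm_device_attr attr = {
		.group = KVM_DEV_VFIO_FILE,
		.attr = KVM_DEV_VFIO_FILE_ADD,
		.addr = (__u64)(unsigned long)&vfio_device_fd,
	};

	if (ioctl(vm_fd, KVM_CREATE_DEVICE, &cd) < 0)
		return -1;

	/* cd.fd now refers to this VM's kvm-vfio pseudo device. */
	return ioctl(cd.fd, KVM_SET_DEVICE_ATTR, &attr);
}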
-- 2.49.0 Add all the functions needed to wire up irqbypass support and implement kvm_arch_update_irqfd_routing() which makes irq_set_vcpu_affinity() calls whenever the assigned device updates its target addresses. Also implement calls to irq_set_vcpu_affinity() from kvm_riscv_vcpu_aia_imsic_update() which are needed to update the IOMMU mappings when the hypervisor migrates a VCPU to another CPU (requiring a change to the target guest interrupt file). Signed-off-by: Andrew Jones --- arch/riscv/kvm/Kconfig | 1 + arch/riscv/kvm/aia_imsic.c | 143 ++++++++++++++++++++++++++++++++++++- arch/riscv/kvm/vm.c | 31 ++++++++ 3 files changed, 173 insertions(+), 2 deletions(-) diff --git a/arch/riscv/kvm/Kconfig b/arch/riscv/kvm/Kconfig index 968a33ab23b8..76cfd85c5c40 100644 --- a/arch/riscv/kvm/Kconfig +++ b/arch/riscv/kvm/Kconfig @@ -21,6 +21,7 @@ config KVM tristate "Kernel-based Virtual Machine (KVM) support" depends on RISCV_SBI && MMU select HAVE_KVM_IRQCHIP + select HAVE_KVM_IRQ_BYPASS select HAVE_KVM_IRQ_ROUTING select HAVE_KVM_MSI select HAVE_KVM_VCPU_ASYNC_IOCTL diff --git a/arch/riscv/kvm/aia_imsic.c b/arch/riscv/kvm/aia_imsic.c index fda0346f0ea1..148ae94fa17b 100644 --- a/arch/riscv/kvm/aia_imsic.c +++ b/arch/riscv/kvm/aia_imsic.c @@ -11,11 +11,13 @@ #include #include #include +#include #include #include #include #include #include +#include #include #define IMSIC_MAX_EIX (IMSIC_MAX_ID / BITS_PER_TYPE(u64)) @@ -719,6 +721,14 @@ void kvm_riscv_vcpu_aia_imsic_put(struct kvm_vcpu *vcpu) read_unlock_irqrestore(&imsic->vsfile_lock, flags); } +static u64 kvm_riscv_aia_msi_addr_mask(struct kvm_aia *aia) +{ + u64 group_mask = BIT(aia->nr_group_bits) - 1; + + return (group_mask << (aia->nr_group_shift - IMSIC_MMIO_PAGE_SHIFT)) | + (BIT(aia->nr_hart_bits + aia->nr_guest_bits) - 1); +} + void kvm_riscv_vcpu_aia_imsic_release(struct kvm_vcpu *vcpu) { unsigned long flags; @@ -769,6 +779,132 @@ void kvm_riscv_vcpu_aia_imsic_release(struct kvm_vcpu *vcpu) kvm_riscv_aia_free_hgei(old_vsfile_cpu, old_vsfile_hgei); } +void kvm_arch_update_irqfd_routing(struct kvm_kernel_irqfd *irqfd, + struct kvm_kernel_irq_routing_entry *old, + struct kvm_kernel_irq_routing_entry *new) +{ + struct riscv_iommu_ir_vcpu_info vcpu_info; + struct kvm *kvm = irqfd->kvm; + struct kvm_aia *aia = &kvm->arch.aia; + int host_irq = irqfd->producer->irq; + struct irq_data *irqdata = irq_get_irq_data(host_irq); + unsigned long tmp, flags; + struct kvm_vcpu *vcpu; + struct imsic *imsic; + struct msi_msg msg; + u64 msi_addr_mask; + gpa_t target; + int ret; + + if (old && old->type == KVM_IRQ_ROUTING_MSI && + new && new->type == KVM_IRQ_ROUTING_MSI && + !memcmp(&old->msi, &new->msi, sizeof(new->msi))) + return; + + if (!new) { + if (!WARN_ON_ONCE(!old) && old->type == KVM_IRQ_ROUTING_MSI) { + ret = irq_set_vcpu_affinity(host_irq, NULL); + WARN_ON_ONCE(ret && ret != -EOPNOTSUPP); + } + return; + } + + if (new->type != KVM_IRQ_ROUTING_MSI) + return; + + target = ((gpa_t)new->msi.address_hi << 32) | new->msi.address_lo; + if (WARN_ON_ONCE(target & (IMSIC_MMIO_PAGE_SZ - 1))) + return; + + msg = (struct msi_msg){ + .address_hi = new->msi.address_hi, + .address_lo = new->msi.address_lo, + .data = new->msi.data, + }; + + kvm_for_each_vcpu(tmp, vcpu, kvm) { + if (target == vcpu->arch.aia_context.imsic_addr) + break; + } + if (!vcpu) + return; + + msi_addr_mask = kvm_riscv_aia_msi_addr_mask(aia); + vcpu_info = (struct riscv_iommu_ir_vcpu_info){ + .gpa = target, + .msi_addr_mask = msi_addr_mask, + .msi_addr_pattern = (target >> 
IMSIC_MMIO_PAGE_SHIFT) & ~msi_addr_mask, + .group_index_bits = aia->nr_group_bits, + .group_index_shift = aia->nr_group_shift, + }; + + imsic = vcpu->arch.aia_context.imsic_state; + + read_lock_irqsave(&imsic->vsfile_lock, flags); + + if (WARN_ON_ONCE(imsic->vsfile_cpu < 0)) + goto out; + + vcpu_info.hpa = imsic->vsfile_pa; + + ret = irq_set_vcpu_affinity(host_irq, &vcpu_info); + WARN_ON_ONCE(ret && ret != -EOPNOTSUPP); + if (ret) + goto out; + + irq_data_get_irq_chip(irqdata)->irq_write_msi_msg(irqdata, &msg); + +out: + read_unlock_irqrestore(&imsic->vsfile_lock, flags); +} + +static void kvm_riscv_vcpu_irq_update(struct kvm_vcpu *vcpu) +{ + struct kvm *kvm = vcpu->kvm; + struct imsic *imsic = vcpu->arch.aia_context.imsic_state; + gpa_t gpa = vcpu->arch.aia_context.imsic_addr; + struct kvm_aia *aia = &kvm->arch.aia; + u64 msi_addr_mask = kvm_riscv_aia_msi_addr_mask(aia); + struct riscv_iommu_ir_vcpu_info vcpu_info = { + .gpa = gpa, + .hpa = imsic->vsfile_pa, + .msi_addr_mask = msi_addr_mask, + .msi_addr_pattern = (gpa >> IMSIC_MMIO_PAGE_SHIFT) & ~msi_addr_mask, + .group_index_bits = aia->nr_group_bits, + .group_index_shift = aia->nr_group_shift, + }; + struct kvm_kernel_irq_routing_entry *irq_entry; + struct kvm_kernel_irqfd *irqfd; + gpa_t target; + int host_irq, ret; + + spin_lock_irq(&kvm->irqfds.lock); + + list_for_each_entry(irqfd, &kvm->irqfds.items, list) { + if (!irqfd->producer) + continue; + + irq_entry = &irqfd->irq_entry; + if (irq_entry->type != KVM_IRQ_ROUTING_MSI) + continue; + + target = ((gpa_t)irq_entry->msi.address_hi << 32) | irq_entry->msi.address_lo; + if (WARN_ON_ONCE(target & (IMSIC_MMIO_PAGE_SZ - 1))) + continue; + + if (target != gpa) + continue; + + host_irq = irqfd->producer->irq; + ret = irq_set_vcpu_affinity(host_irq, &vcpu_info); + WARN_ON_ONCE(ret && ret != -EOPNOTSUPP); + if (ret == -EOPNOTSUPP) + break; + } + + spin_unlock_irq(&kvm->irqfds.lock); +} + int kvm_riscv_vcpu_aia_imsic_update(struct kvm_vcpu *vcpu) { unsigned long flags; @@ -836,14 +972,17 @@ int kvm_riscv_vcpu_aia_imsic_update(struct kvm_vcpu *vcpu) if (ret) goto fail_free_vsfile_hgei; - /* TODO: Update the IOMMU mapping ??? 
*/ - /* Update new IMSIC VS-file details in IMSIC context */ write_lock_irqsave(&imsic->vsfile_lock, flags); + imsic->vsfile_hgei = new_vsfile_hgei; imsic->vsfile_cpu = vcpu->cpu; imsic->vsfile_va = new_vsfile_va; imsic->vsfile_pa = new_vsfile_pa; + + /* Update the IOMMU mapping */ + kvm_riscv_vcpu_irq_update(vcpu); + write_unlock_irqrestore(&imsic->vsfile_lock, flags); /* diff --git a/arch/riscv/kvm/vm.c b/arch/riscv/kvm/vm.c index 66d91ae6e9b2..1d33cff73e00 100644 --- a/arch/riscv/kvm/vm.c +++ b/arch/riscv/kvm/vm.c @@ -11,6 +11,8 @@ #include #include #include +#include +#include #include const struct _kvm_stats_desc kvm_vm_stats_desc[] = { @@ -56,6 +58,35 @@ void kvm_arch_destroy_vm(struct kvm *kvm) kvm_riscv_aia_destroy_vm(kvm); } +bool kvm_arch_has_irq_bypass(void) +{ + return true; +} + +int kvm_arch_irq_bypass_add_producer(struct irq_bypass_consumer *cons, + struct irq_bypass_producer *prod) +{ + struct kvm_kernel_irqfd *irqfd = + container_of(cons, struct kvm_kernel_irqfd, consumer); + + irqfd->producer = prod; + kvm_arch_update_irqfd_routing(irqfd, NULL, &irqfd->irq_entry); + + return 0; +} + +void kvm_arch_irq_bypass_del_producer(struct irq_bypass_consumer *cons, + struct irq_bypass_producer *prod) +{ + struct kvm_kernel_irqfd *irqfd = + container_of(cons, struct kvm_kernel_irqfd, consumer); + + WARN_ON(irqfd->producer != prod); + + kvm_arch_update_irqfd_routing(irqfd, &irqfd->irq_entry, NULL); + irqfd->producer = NULL; +} + int kvm_vm_ioctl_irq_line(struct kvm *kvm, struct kvm_irq_level *irql, bool line_status) { -- 2.49.0 From: Tomasz Jeznach Enable VFIO support on RISC-V architecture. Signed-off-by: Tomasz Jeznach Signed-off-by: Andrew Jones --- drivers/vfio/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/vfio/Kconfig b/drivers/vfio/Kconfig index ceae52fd7586..ad62205b4e45 100644 --- a/drivers/vfio/Kconfig +++ b/drivers/vfio/Kconfig @@ -39,7 +39,7 @@ config VFIO_GROUP config VFIO_CONTAINER bool "Support for the VFIO container /dev/vfio/vfio" - select VFIO_IOMMU_TYPE1 if MMU && (X86 || S390 || ARM || ARM64) + select VFIO_IOMMU_TYPE1 if MMU && (X86 || S390 || ARM || ARM64 || RISCV) depends on VFIO_GROUP default y help -- 2.49.0 Add the VFIO modules to the defconfig to complement KVM now that there is IOMMU support. 
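With CONFIG_VFIO and CONFIG_VFIO_PCI available, the /dev/vfio/vfio container path enabled earlier in the series can be probed from userspace. The sketch below uses only the standard VFIO UAPI and is illustrative; vfio_type1_usable() is not part of this series.

#include <fcntl.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/vfio.h>

/* Returns non-zero if the type1 v2 IOMMU backend can back a container. */
static int vfio_type1_usable(void)
{
	int ret = 0;
	int container = open("/dev/vfio/vfio", O_RDWR);

	if (container < 0)
		return 0;

	if (ioctl(container, VFIO_GET_API_VERSION) == VFIO_API_VERSION)
		ret = ioctl(container, VFIO_CHECK_EXTENSION, VFIO_TYPE1v2_IOMMU);

	close(container);
	return ret;
}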
Signed-off-by: Andrew Jones
---
 arch/riscv/configs/defconfig | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/arch/riscv/configs/defconfig b/arch/riscv/configs/defconfig
index 7b5eed17611a..633aed46064f 100644
--- a/arch/riscv/configs/defconfig
+++ b/arch/riscv/configs/defconfig
@@ -242,6 +242,8 @@ CONFIG_DMADEVICES=y
 CONFIG_DMA_SUN6I=m
 CONFIG_DW_AXI_DMAC=y
 CONFIG_DWMAC_THEAD=m
+CONFIG_VFIO=m
+CONFIG_VFIO_PCI=m
 CONFIG_VIRTIO_PCI=y
 CONFIG_VIRTIO_BALLOON=y
 CONFIG_VIRTIO_INPUT=y
-- 2.49.0

Work around a bug that breaks guest booting with device assignment; the bug was introduced by commit 9bca8be646e0 ("RISC-V: KVM: Fix pte settings within kvm_riscv_gstage_ioremap()")
---
 arch/riscv/kvm/mmu.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/riscv/kvm/mmu.c b/arch/riscv/kvm/mmu.c
index 525fb5a330c0..994f18b92143 100644
--- a/arch/riscv/kvm/mmu.c
+++ b/arch/riscv/kvm/mmu.c
@@ -56,7 +56,7 @@ int kvm_riscv_mmu_ioremap(struct kvm *kvm, gpa_t gpa, phys_addr_t hpa,
 	end = (gpa + size + PAGE_SIZE - 1) & PAGE_MASK;
 	pfn = __phys_to_pfn(hpa);
 
-	prot = pgprot_noncached(PAGE_WRITE);
+	prot = pgprot_noncached(__pgprot(_PAGE_PRESENT | _PAGE_ACCESSED | _PAGE_READ | _PAGE_WRITE));
 
 	for (addr = gpa; addr < end; addr += PAGE_SIZE) {
 		map.addr = addr;
-- 2.49.0
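For context on the workaround: assuming the mainline arch/riscv definitions, PAGE_WRITE expands to _PAGE_BASE | _PAGE_READ | _PAGE_WRITE, and _PAGE_BASE includes _PAGE_USER, so the open-coded protection should differ from pgprot_noncached(PAGE_WRITE) only in the user bit. The sketch below can verify that delta in a given tree; it is illustrative only, and show_gstage_prot_delta() is not part of the series.

#include <linux/pgtable.h>
#include <linux/printk.h>

/* Print the bit-level difference between the old and new g-stage prots. */
static void __maybe_unused show_gstage_prot_delta(void)
{
	pgprot_t old = pgprot_noncached(PAGE_WRITE);
	pgprot_t new = pgprot_noncached(__pgprot(_PAGE_PRESENT | _PAGE_ACCESSED |
						 _PAGE_READ | _PAGE_WRITE));

	/* With the assumed definitions this reports only _PAGE_USER. */
	pr_info("g-stage prot delta: %#lx\n", pgprot_val(old) ^ pgprot_val(new));
}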