Wire an MSI-X vector to a dedicated EQ so the mlx5 driver supports
send_msi().

Each EQ can be linked to an MSI-X vector, and the CQ can be set up to
deliver an event to the EQ. Thus, when everything is armed, an RDMA
WRITE posted to the QP generates a CQE, which generates an EQE, which
generates an MSI-X.

To keep things simple this just re-uses the existing QP and CQ, so
memcpy completions also generate single MSIs.

send_msi() drains any accumulated MSI EQ events from prior memcpy
completions, posts a small signaled RDMA Write, then polls the CQ to
consume the resulting CQE (avoiding stale completions on subsequent
test cycles).

Assisted-by: Claude:claude-opus-4.6
Signed-off-by: Jason Gunthorpe
---
 .../selftests/vfio/lib/drivers/mlx5/mlx5.c    | 165 +++++++++++++++++-
 .../selftests/vfio/lib/drivers/mlx5/mlx5_hw.h |   6 +
 2 files changed, 168 insertions(+), 3 deletions(-)

diff --git a/tools/testing/selftests/vfio/lib/drivers/mlx5/mlx5.c b/tools/testing/selftests/vfio/lib/drivers/mlx5/mlx5.c
index e5e75adb253166..c8388aabb8c672 100644
--- a/tools/testing/selftests/vfio/lib/drivers/mlx5/mlx5.c
+++ b/tools/testing/selftests/vfio/lib/drivers/mlx5/mlx5.c
@@ -57,17 +57,23 @@ struct mlx5st_device {
 	/* CQ */
 	u32 cqn;
 	u32 cq_ci;
+	u32 cq_arm_sn;
 
 	/* UAR */
 	u32 uar_page;
 	void __iomem *uar_base;
 	unsigned int uar_bf_offset;
 
-	/* EQ */
+	/* EQ (cmd/pages events - polled, not interrupt-driven) */
 	u32 eqn;
 	u32 eq_cons_index;
 	bool have_eq;
 
+	/* MSI EQ (CQ completion events - fires MSI-X) */
+	u32 msi_eqn;
+	u32 msi_eq_cons_index;
+	bool have_msi_eq;
+
 	/* Async pages slot state */
 	bool pages_slot_in_use;
 	bool pages_slot_is_reclaim;
@@ -91,6 +97,10 @@ struct mlx5st_device {
 	bool fl_supported;
 	u8 log_max_msg;
 
+	/* Buffers used by send_msi() to trigger an interrupt */
+	u64 send_msi_src;
+	u64 send_msi_dst;
+
 	/*
 	 * HW-visible DMA buffers below - device reads/writes via DMA.
 	 */
@@ -113,6 +123,9 @@ struct mlx5st_device {
 	/* EQ does not support page_offset */
 	struct mlx5st_eqe eq_buf[EQ_NENT] __aligned(MLX5_HW_PAGE_SIZE);
 
+	/* MSI EQ buffer - CQ completions generate EQEs here -> MSI-X */
+	struct mlx5st_eqe msi_eq_buf[MSI_EQ_NENT] __aligned(MLX5_HW_PAGE_SIZE);
+
 	u8 fw_pages[MAX_FW_PAGES][MLX5_HW_PAGE_SIZE]
 		__aligned(MLX5_HW_PAGE_SIZE);
 };
@@ -135,6 +148,9 @@ static_assert(offsetof(struct mlx5st_device, qp_dbrec) % 64 == 0,
 static_assert(offsetof(struct mlx5st_device, eq_buf) % MLX5_HW_PAGE_SIZE == 0,
 	      "eq_buf must be page-aligned");
+static_assert(offsetof(struct mlx5st_device, msi_eq_buf) %
+	      MLX5_HW_PAGE_SIZE == 0,
+	      "msi_eq_buf must be page-aligned");
 static_assert(offsetof(struct mlx5st_device, fw_pages) % MLX5_HW_PAGE_SIZE == 0,
 	      "fw_pages must be page-aligned");
@@ -1013,6 +1029,85 @@ static void mlx5st_process_events(struct mlx5st_device *dev)
 	mlx5st_eq_update_ci(dev, cc, 0);
 }
 
+/*
+ * MSI EQ - dedicated EQ for CQ completion events that fires MSI-X.
+ * Separate from the cmd/pages EQ so that only CQ completions (from
+ * send_msi or memcpy) trigger the interrupt vector.
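+ *
+ * EQE ownership, as consumed by mlx5st_msi_eq_drain(): SW initializes
+ * every owner bit to 1 and HW inverts the bit it writes on each pass
+ * around the ring, so the EQE at consumer index ci is valid only while
+ * its owner bit equals the wrap parity, !!(ci & MSI_EQ_NENT).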
+ */
+
+static void mlx5st_msi_eq_drain(struct mlx5st_device *dev)
+{
+	u32 cc = 0;
+	u32 val;
+
+	while (cc < MSI_EQ_NENT) {
+		u32 ci = dev->msi_eq_cons_index + cc;
+		struct mlx5st_eqe *eqe =
+			&dev->msi_eq_buf[ci % MSI_EQ_NENT];
+
+		if (MLX5_GET_ONCE(eqe, eqe, owner) != !!(ci & MSI_EQ_NENT))
+			break;
+		cc++;
+	}
+
+	/* Update consumer index and re-arm for next interrupt */
+	dev->msi_eq_cons_index += cc;
+	val = (dev->msi_eq_cons_index & 0xffffff) | (dev->msi_eqn << 24);
+	iowrite32be(val, (u8 __iomem *)dev->uar_base + MLX5_EQ_DOORBELL_OFFSET);
+}
+
+static void mlx5st_create_msi_eq(struct mlx5st_device *dev)
+{
+	struct vfio_pci_device *device = dev->device;
+	u64 in[MLX5_ST_SZ_QW(create_eq_in) + 1] = {};
+	u32 out[MLX5_ST_SZ_DW(create_eq_out)] = {};
+	struct mlx5_ifc_eqc_bits *eqc;
+	unsigned int i;
+	__be64 *pas;
+
+	/* Initialize EQE owner bits */
+	for (i = 0; i < MSI_EQ_NENT; i++) {
+		struct mlx5st_eqe *eqe = &dev->msi_eq_buf[i];
+
+		MLX5_SET_ONCE(eqe, eqe, owner, 1);
+	}
+
+	MLX5_SET(create_eq_in, in, opcode, MLX5_CMD_OP_CREATE_EQ);
+
+	/*
+	 * No event_bitmask - completion events are routed to this EQ via
+	 * the CQ's c_eqn field, not through CREATE_EQ subscription.
+	 */
+	eqc = MLX5_ADDR_OF(create_eq_in, in, eq_context_entry);
+	MLX5_SET(eqc, eqc, log_eq_size, LOG_MSI_EQ_SIZE);
+	MLX5_SET(eqc, eqc, uar_page, dev->uar_page);
+	MLX5_SET(eqc, eqc, intr, MSI_VECTOR);
+	pas = MLX5_ADDR_OF(create_eq_in, in, pas);
+	VFIO_ASSERT_EQ(mlx5st_fill_pas(device, dev->msi_eq_buf, pas), 0u);
+	MLX5_SET(eqc, eqc, log_page_size, 0);
+
+	mlx5st_cmd_exec(dev, in, sizeof(in), out, sizeof(out));
+
+	dev->msi_eqn = MLX5_GET(create_eq_out, out, eq_number);
+	dev->msi_eq_cons_index = 0;
+	dev->have_msi_eq = true;
+	mlx5st_msi_eq_drain(dev);
+
+	dev_dbg(device,
+		"Created MSI EQ: eqn=%u, %d entries (COMP), vector=%d\n",
+		dev->msi_eqn, MSI_EQ_NENT, MSI_VECTOR);
+}
+
+static void mlx5st_destroy_msi_eq(struct mlx5st_device *dev)
+{
+	u32 out[MLX5_ST_SZ_DW(destroy_eq_out)] = {};
+	u32 in[MLX5_ST_SZ_DW(destroy_eq_in)] = {};
+
+	MLX5_SET(destroy_eq_in, in, opcode, MLX5_CMD_OP_DESTROY_EQ);
+	MLX5_SET(destroy_eq_in, in, eq_number, dev->msi_eqn);
+	mlx5st_cmd_exec(dev, in, sizeof(in), out, sizeof(out));
+}
+
 /*
  * HCA init / teardown
  */
@@ -1369,7 +1464,7 @@ static void mlx5st_create_cq(struct mlx5st_device *dev)
 	cqc = MLX5_ADDR_OF(create_cq_in, in, cq_context);
 	MLX5_SET(cqc, cqc, log_cq_size, LOG_CQ_SIZE);
 	MLX5_SET(cqc, cqc, uar_page, dev->uar_page);
-	MLX5_SET(cqc, cqc, c_eqn_or_apu_element, dev->eqn);
+	MLX5_SET(cqc, cqc, c_eqn_or_apu_element, dev->msi_eqn);
 	MLX5_SET(cqc, cqc, cqe_sz, 0);
 	pas = MLX5_ADDR_OF(create_cq_in, in, pas);
 	MLX5_SET(cqc, cqc, page_offset, mlx5st_fill_pas(device, dev->cq_buf, pas));
@@ -1394,6 +1489,30 @@ static void mlx5st_destroy_cq(struct mlx5st_device *dev)
 	mlx5st_cmd_exec(dev, in, sizeof(in), out, sizeof(out));
 }
 
+/*
+ * Arm CQ for event generation. The CQ event delivery state machine is
+ * single-shot: after generating one EQE the CQ enters "Fired" state and
+ * won't generate another until re-armed via ARM_NEXT. Both the CQ doorbell
+ * record and the UAR CQ doorbell register must be written.
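+ * The arm word is (cmd_sn << 28) | cmd | cq_ci, where cmd = 0 requests
+ * notification on the next completion (ARM_NEXT) and cmd_sn is a 2-bit
+ * sequence number advanced on every arm (cq_arm_sn below).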
+ */
+static void mlx5st_arm_cq(struct mlx5st_device *dev)
+{
+	u32 sn = dev->cq_arm_sn & 3;
+	u32 ci = dev->cq_ci & 0xffffff;
+	u64 doorbell;
+
+	/* Update CQ doorbell record arm word */
+	WRITE_ONCE(dev->cq_dbrec.send_counter,
+		   cpu_to_be32(sn << 28 | ci));
+
+	/* Ring CQ doorbell register, iowrite has an internal dma_wmb() */
+	doorbell = ((u64)(sn << 28 | ci) << 32) | dev->cqn;
+	iowrite64be(doorbell,
+		    (u8 __iomem *)dev->uar_base + MLX5_CQ_DOORBELL_OFFSET);
+
+	dev->cq_arm_sn++;
+}
+
 /*
  * QP create/destroy
  */
@@ -1650,6 +1769,7 @@ static void mlx5st_teardown_datapath(struct mlx5st_device *dev)
 	}
 	dev->sq_pi = 0;
 	dev->sq_ci = 0;
+	dev->cq_arm_sn = 0;
 	memset(&dev->qp_dbrec, 0, sizeof(dev->qp_dbrec));
 	memset(&dev->cq_dbrec, 0, sizeof(dev->cq_dbrec));
 }
@@ -1691,6 +1811,34 @@ static int mlx5st_memcpy_wait(struct vfio_pci_device *device)
 	return ret;
 }
 
+/*
+ * send_msi callback - trigger CQE -> EQE -> MSI-X via a small RDMA Write.
+ *
+ * Both the CQ and MSI EQ use single-shot arming: the CQ must be armed so the
+ * CQE generates an EQE, and the MSI EQ must be armed so the EQE fires MSI-X.
+ */
+static void mlx5st_send_msi(struct vfio_pci_device *device)
+{
+	struct mlx5st_device *dev = to_mlx5st(device);
+
+	/* Drain accumulated MSI EQ events and re-arm for next interrupt */
+	mlx5st_msi_eq_drain(dev);
+
+	/* Arm CQ so the next CQE generates an EQE on the MSI EQ */
+	mlx5st_arm_cq(dev);
+
+	/* Post a signaled RDMA Write to trigger CQE -> EQE -> MSI-X */
+	mlx5st_post_rdma_write(dev,
+			       to_iova(device, &dev->send_msi_src),
+			       dev->global_lkey,
+			       to_iova(device, &dev->send_msi_dst),
+			       dev->global_rkey,
+			       sizeof(dev->send_msi_src), true);
+
+	/* Consume the CQE to avoid stale completions */
+	VFIO_ASSERT_EQ(mlx5st_poll_cq(dev, MLX5ST_MEMCPY_TIMEOUT_MS), 0);
+}
+
 /*
  * Driver ops callbacks
  */
@@ -1721,8 +1869,13 @@ static void mlx5st_init(struct vfio_pci_device *device)
 	mlx5st_alloc_pd(dev);
 	mlx5st_create_mkey(dev);
 
+	/* MSI EQ must be created before CQ so CQ can reference its eqn */
+	mlx5st_create_msi_eq(dev);
 	mlx5st_setup_datapath(dev);
 
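+	/*
+	 * driver.msi tells the test harness which MSI-X vector send_msi()
+	 * raises, so a test can wait on the eventfd it bound to that
+	 * vector.
+	 */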
+	vfio_pci_msix_enable(device, MSI_VECTOR, 1);
+	device->driver.msi = MSI_VECTOR;
+
 	device->driver.max_memcpy_size = 1ULL << dev->log_max_msg;
 	device->driver.max_memcpy_count = SQ_WQE_CNT - 1;
@@ -1733,8 +1886,14 @@ static void mlx5st_remove(struct vfio_pci_device *device)
 {
 	struct mlx5st_device *dev = to_mlx5st(device);
 
+	vfio_pci_msix_disable(device);
 	mlx5st_teardown_datapath(dev);
 
+	if (dev->have_msi_eq) {
+		mlx5st_destroy_msi_eq(dev);
+		dev->have_msi_eq = false;
+	}
+
 	dev_dbg(device, "teardown: destroy_mkey\n");
 	if (dev->mkey_index) {
 		mlx5st_destroy_mkey(dev);
@@ -1765,5 +1924,5 @@ struct vfio_pci_driver_ops mlx5st_ops = {
 	.remove = mlx5st_remove,
 	.memcpy_start = mlx5st_memcpy_start,
 	.memcpy_wait = mlx5st_memcpy_wait,
-	.send_msi = NULL,
+	.send_msi = mlx5st_send_msi,
 };
diff --git a/tools/testing/selftests/vfio/lib/drivers/mlx5/mlx5_hw.h b/tools/testing/selftests/vfio/lib/drivers/mlx5/mlx5_hw.h
index a2506ec8a19523..2c451e411ec13f 100644
--- a/tools/testing/selftests/vfio/lib/drivers/mlx5/mlx5_hw.h
+++ b/tools/testing/selftests/vfio/lib/drivers/mlx5/mlx5_hw.h
@@ -80,6 +80,9 @@ struct mlx5st_dbrec {
 #define MLX5_BF_OFFSET 0x800
 #define MLX5_BF_SIZE 0x100
 
+/* CQ doorbell offset within UAR page */
+#define MLX5_CQ_DOORBELL_OFFSET 0x20
+
 /* EQ doorbell offset within UAR page */
 #define MLX5_EQ_DOORBELL_OFFSET 0x40
 
@@ -94,6 +97,9 @@ struct mlx5st_dbrec {
 #define LOG_CQ_SIZE 4
 #define EQ_NENT 64
 #define LOG_EQ_SIZE 6
+#define MSI_EQ_NENT 16
+#define LOG_MSI_EQ_SIZE 4
+#define MSI_VECTOR 0
 
 #define MAX_FW_PAGES 8192
 #define MAX_FW_PAGES_PER_CMD 512
-- 
2.43.0
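
For reference, a standalone sketch of the doorbell/ownership encodings the
patch relies on; the helper names here are illustrative, not part of the
patch, and the constants mirror the values used above:

	#include <stdint.h>

	/* EQ doorbell word: eqn in bits 31:24, consumer index in bits 23:0 */
	static uint32_t eq_db(uint32_t eqn, uint32_t ci)
	{
		return (ci & 0xffffff) | (eqn << 24);
	}

	/* CQ arm word: cmd_sn in bits 29:28, cmd = 0 (ARM_NEXT), ci in 23:0 */
	static uint32_t cq_arm(uint32_t arm_sn, uint32_t ci)
	{
		return ((arm_sn & 3) << 28) | (ci & 0xffffff);
	}

	/* EQE at index ci is SW-owned while owner equals the wrap parity */
	static int eqe_valid(uint8_t owner, uint32_t ci, uint32_t nent)
	{
		return (owner & 1) == !!(ci & nent); /* nent: power of two */
	}

	int main(void)
	{
		/* e.g. mlx5st_arm_cq() writes 0x10000005 for sn=1, ci=5 */
		return (eq_db(7, 0x123456) == 0x07123456 &&
			cq_arm(1, 5) == 0x10000005 &&
			eqe_valid(0, 0, 16) && !eqe_valid(1, 0, 16)) ? 0 : 1;
	}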