We're going to give more control over rx buffer sizes to user space, and since we can't always rely on driver validation, let's sanitise it in page_pool_init() as well. Note that we only need to reject over MAX_PAGE_ORDER allocations for normal page pools, as current memory providers don't need to use the buddy allocator and must check the order on init.i Suggested-by: Stanislav Fomichev Reviewed-by: Mina Almasry Signed-off-by: Pavel Begunkov --- net/core/page_pool.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/net/core/page_pool.c b/net/core/page_pool.c index 1a5edec485f1..635c77e8050b 100644 --- a/net/core/page_pool.c +++ b/net/core/page_pool.c @@ -301,6 +301,9 @@ static int page_pool_init(struct page_pool *pool, } static_branch_inc(&page_pool_mem_providers); + } else if (pool->p.order > MAX_PAGE_ORDER) { + err = -EINVAL; + goto free_ptr_ring; } return 0; -- 2.52.0 Instead of resetting memory provider parameters one by one in __net_mp_{open,close}_rxq, memzero the entire structure. It'll be used to extend the structure. Signed-off-by: Pavel Begunkov --- net/core/netdev_rx_queue.c | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/net/core/netdev_rx_queue.c b/net/core/netdev_rx_queue.c index c7d9341b7630..a0083f176a9c 100644 --- a/net/core/netdev_rx_queue.c +++ b/net/core/netdev_rx_queue.c @@ -139,10 +139,9 @@ int __net_mp_open_rxq(struct net_device *dev, unsigned int rxq_idx, rxq->mp_params = *p; ret = netdev_rx_queue_restart(dev, rxq_idx); - if (ret) { - rxq->mp_params.mp_ops = NULL; - rxq->mp_params.mp_priv = NULL; - } + if (ret) + memset(&rxq->mp_params, 0, sizeof(rxq->mp_params)); + return ret; } @@ -179,8 +178,7 @@ void __net_mp_close_rxq(struct net_device *dev, unsigned int ifq_idx, rxq->mp_params.mp_priv != old_p->mp_priv)) return; - rxq->mp_params.mp_ops = NULL; - rxq->mp_params.mp_priv = NULL; + memset(&rxq->mp_params, 0, sizeof(rxq->mp_params)); err = netdev_rx_queue_restart(dev, ifq_idx); WARN_ON(err && err != -ENETDOWN); } -- 2.52.0 Allow memory providers to configure rx queues with a specific receive buffer length. Pass it in struct pp_memory_provider_params, which is copied into the queue, so it's preserved across queue restarts. It's an opt-in feature for drivers, which they can enable by setting NDO_QUEUE_RX_BUF_SIZE to their struct netdev_queue_mgmt_ops. Signed-off-by: Pavel Begunkov --- include/net/netdev_queues.h | 9 +++++++++ include/net/page_pool/types.h | 1 + net/core/netdev_rx_queue.c | 4 ++++ 3 files changed, 14 insertions(+) diff --git a/include/net/netdev_queues.h b/include/net/netdev_queues.h index cd00e0406cf4..2e6bcec1e1e3 100644 --- a/include/net/netdev_queues.h +++ b/include/net/netdev_queues.h @@ -111,6 +111,11 @@ void netdev_stat_queue_sum(struct net_device *netdev, int tx_start, int tx_end, struct netdev_queue_stats_tx *tx_sum); +enum { + /* queue restart support custom rx buffer sizes */ + NDO_QUEUE_RX_BUF_SIZE = 0x1, +}; + /** * struct netdev_queue_mgmt_ops - netdev ops for queue management * @@ -130,6 +135,8 @@ void netdev_stat_queue_sum(struct net_device *netdev, * @ndo_queue_get_dma_dev: Get dma device for zero-copy operations to be used * for this queue. Return NULL on error. * + * @supported_params: bitmask of supported features, see NDO_QUEUE_* + * * Note that @ndo_queue_mem_alloc and @ndo_queue_mem_free may be called while * the interface is closed. @ndo_queue_start and @ndo_queue_stop will only * be called for an interface which is open. @@ -149,6 +156,8 @@ struct netdev_queue_mgmt_ops { int idx); struct device * (*ndo_queue_get_dma_dev)(struct net_device *dev, int idx); + + unsigned supported_params; }; bool netif_rxq_has_unreadable_mp(struct net_device *dev, int idx); diff --git a/include/net/page_pool/types.h b/include/net/page_pool/types.h index 1509a536cb85..be74e4aec7b5 100644 --- a/include/net/page_pool/types.h +++ b/include/net/page_pool/types.h @@ -161,6 +161,7 @@ struct memory_provider_ops; struct pp_memory_provider_params { void *mp_priv; const struct memory_provider_ops *mp_ops; + u32 rx_buf_len; }; struct page_pool { diff --git a/net/core/netdev_rx_queue.c b/net/core/netdev_rx_queue.c index a0083f176a9c..09d6f97e910e 100644 --- a/net/core/netdev_rx_queue.c +++ b/net/core/netdev_rx_queue.c @@ -29,6 +29,10 @@ int netdev_rx_queue_restart(struct net_device *dev, unsigned int rxq_idx) !qops->ndo_queue_mem_alloc || !qops->ndo_queue_start) return -EOPNOTSUPP; + if (!(qops->supported_params & NDO_QUEUE_RX_BUF_SIZE) && + rxq->mp_params.rx_buf_len) + return -EOPNOTSUPP; + netdev_assert_locked(dev); new_mem = kvzalloc(qops->ndo_queue_mem_size, GFP_KERNEL); -- 2.52.0 Instead of using a constant buffer length, allow configuring the size for each queue separately. There is no way to change the length yet, and it'll be passed from memory providers in a later patch. Suggested-by: Jakub Kicinski Signed-off-by: Pavel Begunkov --- drivers/net/ethernet/broadcom/bnxt/bnxt.c | 56 +++++++++++-------- drivers/net/ethernet/broadcom/bnxt/bnxt.h | 1 + drivers/net/ethernet/broadcom/bnxt/bnxt_xdp.c | 6 +- drivers/net/ethernet/broadcom/bnxt/bnxt_xdp.h | 2 +- 4 files changed, 38 insertions(+), 27 deletions(-) diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c b/drivers/net/ethernet/broadcom/bnxt/bnxt.c index a625e7c311dd..091da26fdf7c 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c @@ -905,7 +905,7 @@ static void bnxt_tx_int(struct bnxt *bp, struct bnxt_napi *bnapi, int budget) static bool bnxt_separate_head_pool(struct bnxt_rx_ring_info *rxr) { - return rxr->need_head_pool || PAGE_SIZE > BNXT_RX_PAGE_SIZE; + return rxr->need_head_pool || rxr->rx_page_size < PAGE_SIZE; } static struct page *__bnxt_alloc_rx_page(struct bnxt *bp, dma_addr_t *mapping, @@ -915,9 +915,9 @@ static struct page *__bnxt_alloc_rx_page(struct bnxt *bp, dma_addr_t *mapping, { struct page *page; - if (PAGE_SIZE > BNXT_RX_PAGE_SIZE) { + if (rxr->rx_page_size < PAGE_SIZE) { page = page_pool_dev_alloc_frag(rxr->page_pool, offset, - BNXT_RX_PAGE_SIZE); + rxr->rx_page_size); } else { page = page_pool_dev_alloc_pages(rxr->page_pool); *offset = 0; @@ -936,8 +936,9 @@ static netmem_ref __bnxt_alloc_rx_netmem(struct bnxt *bp, dma_addr_t *mapping, { netmem_ref netmem; - if (PAGE_SIZE > BNXT_RX_PAGE_SIZE) { - netmem = page_pool_alloc_frag_netmem(rxr->page_pool, offset, BNXT_RX_PAGE_SIZE, gfp); + if (rxr->rx_page_size < PAGE_SIZE) { + netmem = page_pool_alloc_frag_netmem(rxr->page_pool, offset, + rxr->rx_page_size, gfp); } else { netmem = page_pool_alloc_netmems(rxr->page_pool, gfp); *offset = 0; @@ -1155,9 +1156,9 @@ static struct sk_buff *bnxt_rx_multi_page_skb(struct bnxt *bp, return NULL; } dma_addr -= bp->rx_dma_offset; - dma_sync_single_for_cpu(&bp->pdev->dev, dma_addr, BNXT_RX_PAGE_SIZE, + dma_sync_single_for_cpu(&bp->pdev->dev, dma_addr, rxr->rx_page_size, bp->rx_dir); - skb = napi_build_skb(data_ptr - bp->rx_offset, BNXT_RX_PAGE_SIZE); + skb = napi_build_skb(data_ptr - bp->rx_offset, rxr->rx_page_size); if (!skb) { page_pool_recycle_direct(rxr->page_pool, page); return NULL; @@ -1189,7 +1190,7 @@ static struct sk_buff *bnxt_rx_page_skb(struct bnxt *bp, return NULL; } dma_addr -= bp->rx_dma_offset; - dma_sync_single_for_cpu(&bp->pdev->dev, dma_addr, BNXT_RX_PAGE_SIZE, + dma_sync_single_for_cpu(&bp->pdev->dev, dma_addr, rxr->rx_page_size, bp->rx_dir); if (unlikely(!payload)) @@ -1203,7 +1204,7 @@ static struct sk_buff *bnxt_rx_page_skb(struct bnxt *bp, skb_mark_for_recycle(skb); off = (void *)data_ptr - page_address(page); - skb_add_rx_frag(skb, 0, page, off, len, BNXT_RX_PAGE_SIZE); + skb_add_rx_frag(skb, 0, page, off, len, rxr->rx_page_size); memcpy(skb->data - NET_IP_ALIGN, data_ptr - NET_IP_ALIGN, payload + NET_IP_ALIGN); @@ -1288,7 +1289,7 @@ static u32 __bnxt_rx_agg_netmems(struct bnxt *bp, if (skb) { skb_add_rx_frag_netmem(skb, i, cons_rx_buf->netmem, cons_rx_buf->offset, - frag_len, BNXT_RX_PAGE_SIZE); + frag_len, rxr->rx_page_size); } else { skb_frag_t *frag = &shinfo->frags[i]; @@ -1313,7 +1314,7 @@ static u32 __bnxt_rx_agg_netmems(struct bnxt *bp, if (skb) { skb->len -= frag_len; skb->data_len -= frag_len; - skb->truesize -= BNXT_RX_PAGE_SIZE; + skb->truesize -= rxr->rx_page_size; } --shinfo->nr_frags; @@ -1328,7 +1329,7 @@ static u32 __bnxt_rx_agg_netmems(struct bnxt *bp, } page_pool_dma_sync_netmem_for_cpu(rxr->page_pool, netmem, 0, - BNXT_RX_PAGE_SIZE); + rxr->rx_page_size); total_frag_len += frag_len; prod = NEXT_RX_AGG(prod); @@ -2281,8 +2282,7 @@ static int bnxt_rx_pkt(struct bnxt *bp, struct bnxt_cp_ring_info *cpr, if (!skb) goto oom_next_rx; } else { - skb = bnxt_xdp_build_skb(bp, skb, agg_bufs, - rxr->page_pool, &xdp); + skb = bnxt_xdp_build_skb(bp, skb, agg_bufs, rxr, &xdp); if (!skb) { /* we should be able to free the old skb here */ bnxt_xdp_buff_frags_free(rxr, &xdp); @@ -3828,11 +3828,13 @@ static int bnxt_alloc_rx_page_pool(struct bnxt *bp, pp.pool_size = bp->rx_agg_ring_size / agg_size_fac; if (BNXT_RX_PAGE_MODE(bp)) pp.pool_size += bp->rx_ring_size / rx_size_fac; + + pp.order = get_order(rxr->rx_page_size); pp.nid = numa_node; pp.netdev = bp->dev; pp.dev = &bp->pdev->dev; pp.dma_dir = bp->rx_dir; - pp.max_len = PAGE_SIZE; + pp.max_len = PAGE_SIZE << pp.order; pp.flags = PP_FLAG_DMA_MAP | PP_FLAG_DMA_SYNC_DEV | PP_FLAG_ALLOW_UNREADABLE_NETMEM; pp.queue_idx = rxr->bnapi->index; @@ -3843,7 +3845,10 @@ static int bnxt_alloc_rx_page_pool(struct bnxt *bp, rxr->page_pool = pool; rxr->need_head_pool = page_pool_is_unreadable(pool); + rxr->need_head_pool |= !!pp.order; if (bnxt_separate_head_pool(rxr)) { + pp.order = 0; + pp.max_len = PAGE_SIZE; pp.pool_size = min(bp->rx_ring_size / rx_size_fac, 1024); pp.flags = PP_FLAG_DMA_MAP | PP_FLAG_DMA_SYNC_DEV; pool = page_pool_create(&pp); @@ -4319,6 +4324,8 @@ static void bnxt_init_ring_struct(struct bnxt *bp) if (!rxr) goto skip_rx; + rxr->rx_page_size = BNXT_RX_PAGE_SIZE; + ring = &rxr->rx_ring_struct; rmem = &ring->ring_mem; rmem->nr_pages = bp->rx_nr_pages; @@ -4478,7 +4485,7 @@ static void bnxt_init_one_rx_agg_ring_rxbd(struct bnxt *bp, ring = &rxr->rx_agg_ring_struct; ring->fw_ring_id = INVALID_HW_RING_ID; if ((bp->flags & BNXT_FLAG_AGG_RINGS)) { - type = ((u32)BNXT_RX_PAGE_SIZE << RX_BD_LEN_SHIFT) | + type = ((u32)rxr->rx_page_size << RX_BD_LEN_SHIFT) | RX_BD_TYPE_RX_AGG_BD | RX_BD_FLAGS_SOP; bnxt_init_rxbd_pages(ring, type); @@ -7045,6 +7052,7 @@ static void bnxt_hwrm_ring_grp_free(struct bnxt *bp) static void bnxt_set_rx_ring_params_p5(struct bnxt *bp, u32 ring_type, struct hwrm_ring_alloc_input *req, + struct bnxt_rx_ring_info *rxr, struct bnxt_ring_struct *ring) { struct bnxt_ring_grp_info *grp_info = &bp->grp_info[ring->grp_idx]; @@ -7054,7 +7062,7 @@ static void bnxt_set_rx_ring_params_p5(struct bnxt *bp, u32 ring_type, if (ring_type == HWRM_RING_ALLOC_AGG) { req->ring_type = RING_ALLOC_REQ_RING_TYPE_RX_AGG; req->rx_ring_id = cpu_to_le16(grp_info->rx_fw_ring_id); - req->rx_buf_size = cpu_to_le16(BNXT_RX_PAGE_SIZE); + req->rx_buf_size = cpu_to_le16(rxr->rx_page_size); enables |= RING_ALLOC_REQ_ENABLES_RX_RING_ID_VALID; } else { req->rx_buf_size = cpu_to_le16(bp->rx_buf_use_size); @@ -7068,6 +7076,7 @@ static void bnxt_set_rx_ring_params_p5(struct bnxt *bp, u32 ring_type, } static int hwrm_ring_alloc_send_msg(struct bnxt *bp, + struct bnxt_rx_ring_info *rxr, struct bnxt_ring_struct *ring, u32 ring_type, u32 map_index) { @@ -7124,7 +7133,8 @@ static int hwrm_ring_alloc_send_msg(struct bnxt *bp, cpu_to_le32(bp->rx_ring_mask + 1) : cpu_to_le32(bp->rx_agg_ring_mask + 1); if (bp->flags & BNXT_FLAG_CHIP_P5_PLUS) - bnxt_set_rx_ring_params_p5(bp, ring_type, req, ring); + bnxt_set_rx_ring_params_p5(bp, ring_type, req, + rxr, ring); break; case HWRM_RING_ALLOC_CMPL: req->ring_type = RING_ALLOC_REQ_RING_TYPE_L2_CMPL; @@ -7272,7 +7282,7 @@ static int bnxt_hwrm_rx_ring_alloc(struct bnxt *bp, u32 map_idx = bnapi->index; int rc; - rc = hwrm_ring_alloc_send_msg(bp, ring, type, map_idx); + rc = hwrm_ring_alloc_send_msg(bp, rxr, ring, type, map_idx); if (rc) return rc; @@ -7292,7 +7302,7 @@ static int bnxt_hwrm_rx_agg_ring_alloc(struct bnxt *bp, int rc; map_idx = grp_idx + bp->rx_nr_rings; - rc = hwrm_ring_alloc_send_msg(bp, ring, type, map_idx); + rc = hwrm_ring_alloc_send_msg(bp, rxr, ring, type, map_idx); if (rc) return rc; @@ -7316,7 +7326,7 @@ static int bnxt_hwrm_cp_ring_alloc_p5(struct bnxt *bp, ring = &cpr->cp_ring_struct; ring->handle = BNXT_SET_NQ_HDL(cpr); - rc = hwrm_ring_alloc_send_msg(bp, ring, type, map_idx); + rc = hwrm_ring_alloc_send_msg(bp, NULL, ring, type, map_idx); if (rc) return rc; bnxt_set_db(bp, &cpr->cp_db, type, map_idx, ring->fw_ring_id); @@ -7331,7 +7341,7 @@ static int bnxt_hwrm_tx_ring_alloc(struct bnxt *bp, const u32 type = HWRM_RING_ALLOC_TX; int rc; - rc = hwrm_ring_alloc_send_msg(bp, ring, type, tx_idx); + rc = hwrm_ring_alloc_send_msg(bp, NULL, ring, type, tx_idx); if (rc) return rc; bnxt_set_db(bp, &txr->tx_db, type, tx_idx, ring->fw_ring_id); @@ -7357,7 +7367,7 @@ static int bnxt_hwrm_ring_alloc(struct bnxt *bp) vector = bp->irq_tbl[map_idx].vector; disable_irq_nosync(vector); - rc = hwrm_ring_alloc_send_msg(bp, ring, type, map_idx); + rc = hwrm_ring_alloc_send_msg(bp, NULL, ring, type, map_idx); if (rc) { enable_irq(vector); goto err_out; diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.h b/drivers/net/ethernet/broadcom/bnxt/bnxt.h index 3613a172483a..bfe36ea1e7c5 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt.h +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.h @@ -1106,6 +1106,7 @@ struct bnxt_rx_ring_info { unsigned long *rx_agg_bmap; u16 rx_agg_bmap_size; + u16 rx_page_size; bool need_head_pool; dma_addr_t rx_desc_mapping[MAX_RX_PAGES]; diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_xdp.c b/drivers/net/ethernet/broadcom/bnxt/bnxt_xdp.c index 3e77a96e5a3e..619235b151a4 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt_xdp.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_xdp.c @@ -183,7 +183,7 @@ void bnxt_xdp_buff_init(struct bnxt *bp, struct bnxt_rx_ring_info *rxr, u16 cons, u8 *data_ptr, unsigned int len, struct xdp_buff *xdp) { - u32 buflen = BNXT_RX_PAGE_SIZE; + u32 buflen = rxr->rx_page_size; struct bnxt_sw_rx_bd *rx_buf; struct pci_dev *pdev; dma_addr_t mapping; @@ -461,7 +461,7 @@ int bnxt_xdp(struct net_device *dev, struct netdev_bpf *xdp) struct sk_buff * bnxt_xdp_build_skb(struct bnxt *bp, struct sk_buff *skb, u8 num_frags, - struct page_pool *pool, struct xdp_buff *xdp) + struct bnxt_rx_ring_info *rxr, struct xdp_buff *xdp) { struct skb_shared_info *sinfo = xdp_get_shared_info_from_buff(xdp); @@ -469,7 +469,7 @@ bnxt_xdp_build_skb(struct bnxt *bp, struct sk_buff *skb, u8 num_frags, return NULL; xdp_update_skb_frags_info(skb, num_frags, sinfo->xdp_frags_size, - BNXT_RX_PAGE_SIZE * num_frags, + rxr->rx_page_size * num_frags, xdp_buff_get_skb_flags(xdp)); return skb; } diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_xdp.h b/drivers/net/ethernet/broadcom/bnxt/bnxt_xdp.h index 220285e190fc..8933a0dec09a 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt_xdp.h +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_xdp.h @@ -32,6 +32,6 @@ void bnxt_xdp_buff_init(struct bnxt *bp, struct bnxt_rx_ring_info *rxr, void bnxt_xdp_buff_frags_free(struct bnxt_rx_ring_info *rxr, struct xdp_buff *xdp); struct sk_buff *bnxt_xdp_build_skb(struct bnxt *bp, struct sk_buff *skb, - u8 num_frags, struct page_pool *pool, + u8 num_frags, struct bnxt_rx_ring_info *rxr, struct xdp_buff *xdp); #endif -- 2.52.0 From: Jakub Kicinski The driver tries to provision more agg buffers than header buffers since multiple agg segments can reuse the same header. The calculation / heuristic tries to provide enough pages for 65k of data for each header (or 4 frags per header if the result is too big). This calculation is currently global to the adapter. If we increase the buffer sizes 8x we don't want 8x the amount of memory sitting on the rings. Luckily we don't have to fill the rings completely, adjust the fill level dynamically in case particular queue has buffers larger than the global size. Signed-off-by: Jakub Kicinski [pavel: rebase on top of agg_size_fac, assert agg_size_fac] Signed-off-by: Pavel Begunkov --- drivers/net/ethernet/broadcom/bnxt/bnxt.c | 28 +++++++++++++++++++---- 1 file changed, 24 insertions(+), 4 deletions(-) diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c b/drivers/net/ethernet/broadcom/bnxt/bnxt.c index 091da26fdf7c..fc88566779a4 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c @@ -3816,16 +3816,34 @@ static void bnxt_free_rx_rings(struct bnxt *bp) } } +static int bnxt_rx_agg_ring_fill_level(struct bnxt *bp, + struct bnxt_rx_ring_info *rxr) +{ + /* User may have chosen larger than default rx_page_size, + * we keep the ring sizes uniform and also want uniform amount + * of bytes consumed per ring, so cap how much of the rings we fill. + */ + int fill_level = bp->rx_agg_ring_size; + + if (rxr->rx_page_size > BNXT_RX_PAGE_SIZE) + fill_level /= rxr->rx_page_size / BNXT_RX_PAGE_SIZE; + + return fill_level; +} + static int bnxt_alloc_rx_page_pool(struct bnxt *bp, struct bnxt_rx_ring_info *rxr, int numa_node) { - const unsigned int agg_size_fac = PAGE_SIZE / BNXT_RX_PAGE_SIZE; + unsigned int agg_size_fac = rxr->rx_page_size / BNXT_RX_PAGE_SIZE; const unsigned int rx_size_fac = PAGE_SIZE / SZ_4K; struct page_pool_params pp = { 0 }; struct page_pool *pool; - pp.pool_size = bp->rx_agg_ring_size / agg_size_fac; + if (WARN_ON_ONCE(agg_size_fac == 0)) + agg_size_fac = 1; + + pp.pool_size = bnxt_rx_agg_ring_fill_level(bp, rxr) / agg_size_fac; if (BNXT_RX_PAGE_MODE(bp)) pp.pool_size += bp->rx_ring_size / rx_size_fac; @@ -4403,11 +4421,13 @@ static void bnxt_alloc_one_rx_ring_netmem(struct bnxt *bp, struct bnxt_rx_ring_info *rxr, int ring_nr) { + int fill_level, i; u32 prod; - int i; + + fill_level = bnxt_rx_agg_ring_fill_level(bp, rxr); prod = rxr->rx_agg_prod; - for (i = 0; i < bp->rx_agg_ring_size; i++) { + for (i = 0; i < fill_level; i++) { if (bnxt_alloc_rx_netmem(bp, rxr, prod, GFP_KERNEL)) { netdev_warn(bp->dev, "init'ed rx ring %d with %d/%d pages only\n", ring_nr, i, bp->rx_agg_ring_size); -- 2.52.0 Implement NDO_QUEUE_RX_BUF_SIZE and take the rx buf size from the memory providers. Signed-off-by: Pavel Begunkov --- drivers/net/ethernet/broadcom/bnxt/bnxt.c | 34 +++++++++++++++++++++++ drivers/net/ethernet/broadcom/bnxt/bnxt.h | 1 + 2 files changed, 35 insertions(+) diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c b/drivers/net/ethernet/broadcom/bnxt/bnxt.c index fc88566779a4..698ed30b1dcc 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c @@ -15906,16 +15906,46 @@ static const struct netdev_stat_ops bnxt_stat_ops = { .get_base_stats = bnxt_get_base_stats, }; +static ssize_t bnxt_get_rx_buf_size(struct bnxt *bp, int rxq_idx) +{ + struct netdev_rx_queue *rxq = __netif_get_rx_queue(bp->dev, rxq_idx); + size_t rx_buf_size; + + rx_buf_size = rxq->mp_params.rx_buf_len; + if (!rx_buf_size) + return BNXT_RX_PAGE_SIZE; + + /* Older chips need MSS calc so rx_buf_len is not supported, + * but we don't set queue ops for them so we should never get here. + */ + if (!(bp->flags & BNXT_FLAG_CHIP_P5_PLUS)) + return -EINVAL; + + if (!is_power_of_2(rx_buf_size)) + return -ERANGE; + + if (rx_buf_size < BNXT_RX_PAGE_SIZE || + rx_buf_size > BNXT_MAX_RX_PAGE_SIZE) + return -ERANGE; + + return rx_buf_size; +} + static int bnxt_queue_mem_alloc(struct net_device *dev, void *qmem, int idx) { struct bnxt_rx_ring_info *rxr, *clone; struct bnxt *bp = netdev_priv(dev); struct bnxt_ring_struct *ring; + ssize_t rx_buf_size; int rc; if (!bp->rx_ring) return -ENETDOWN; + rx_buf_size = bnxt_get_rx_buf_size(bp, idx); + if (rx_buf_size < 0) + return rx_buf_size; + rxr = &bp->rx_ring[idx]; clone = qmem; memcpy(clone, rxr, sizeof(*rxr)); @@ -15927,6 +15957,7 @@ static int bnxt_queue_mem_alloc(struct net_device *dev, void *qmem, int idx) clone->rx_sw_agg_prod = 0; clone->rx_next_cons = 0; clone->need_head_pool = false; + clone->rx_page_size = rx_buf_size; rc = bnxt_alloc_rx_page_pool(bp, clone, rxr->page_pool->p.nid); if (rc) @@ -16053,6 +16084,8 @@ static void bnxt_copy_rx_ring(struct bnxt *bp, src_ring = &src->rx_agg_ring_struct; src_rmem = &src_ring->ring_mem; + dst->rx_page_size = src->rx_page_size; + WARN_ON(dst_rmem->nr_pages != src_rmem->nr_pages); WARN_ON(dst_rmem->page_size != src_rmem->page_size); WARN_ON(dst_rmem->flags != src_rmem->flags); @@ -16205,6 +16238,7 @@ static const struct netdev_queue_mgmt_ops bnxt_queue_mgmt_ops = { .ndo_queue_mem_free = bnxt_queue_mem_free, .ndo_queue_start = bnxt_queue_start, .ndo_queue_stop = bnxt_queue_stop, + .supported_params = NDO_QUEUE_RX_BUF_SIZE, }; static void bnxt_remove_one(struct pci_dev *pdev) diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.h b/drivers/net/ethernet/broadcom/bnxt/bnxt.h index bfe36ea1e7c5..b59b8e4984f4 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt.h +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.h @@ -759,6 +759,7 @@ struct nqe_cn { #endif #define BNXT_RX_PAGE_SIZE (1 << BNXT_RX_PAGE_SHIFT) +#define BNXT_MAX_RX_PAGE_SIZE (1 << 15) #define BNXT_MAX_MTU 9500 -- 2.52.0 struct io_uring_zcrx_ifq_reg::rx_buf_len is used as a hint specifying the kernel what buffer size it should use. Document the API and limitations. Signed-off-by: Pavel Begunkov --- Documentation/networking/iou-zcrx.rst | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/Documentation/networking/iou-zcrx.rst b/Documentation/networking/iou-zcrx.rst index 54a72e172bdc..7f3f4b2e6cf2 100644 --- a/Documentation/networking/iou-zcrx.rst +++ b/Documentation/networking/iou-zcrx.rst @@ -196,6 +196,26 @@ Return buffers back to the kernel to be used again:: rqe->len = cqe->res; IO_URING_WRITE_ONCE(*refill_ring.ktail, ++refill_ring.rq_tail); +Area chunking +------------- + +zcrx splits the memory area into fixed-length physically contiguous chunks. +This limits the maximum buffer size returned in a single io_uring CQE. Users +can provide a hint to the kernel to use larger chunks by setting the +``rx_buf_len`` field of ``struct io_uring_zcrx_ifq_reg`` to the desired length +during registration. If this field is set to zero, the kernel defaults to +the system page size. + +To use larger sizes, the memory area must be backed by physically contiguous +ranges whose sizes are multiples of ``rx_buf_len``. It also requires kernel +and hardware support. If registration fails, users are generally expected to +fall back to defaults by setting ``rx_buf_len`` to zero. + +Larger chunks don't give any additional guarantees about buffer sizes returned +in CQEs, and they can vary depending on many factors like traffic pattern, +hardware offload, etc. It doesn't require any application changes beyond zcrx +registration. + Testing ======= -- 2.52.0 Add a test using large chunks for zcrx memory area. Signed-off-by: Pavel Begunkov --- .../selftests/drivers/net/hw/iou-zcrx.c | 72 +++++++++++++++---- .../selftests/drivers/net/hw/iou-zcrx.py | 37 ++++++++++ 2 files changed, 97 insertions(+), 12 deletions(-) diff --git a/tools/testing/selftests/drivers/net/hw/iou-zcrx.c b/tools/testing/selftests/drivers/net/hw/iou-zcrx.c index 62456df947bc..fed5452e2ca3 100644 --- a/tools/testing/selftests/drivers/net/hw/iou-zcrx.c +++ b/tools/testing/selftests/drivers/net/hw/iou-zcrx.c @@ -12,6 +12,7 @@ #include #include +#include #include #include #include @@ -37,6 +38,23 @@ #include +#define SKIP_CODE 42 + +struct t_io_uring_zcrx_ifq_reg { + __u32 if_idx; + __u32 if_rxq; + __u32 rq_entries; + __u32 flags; + + __u64 area_ptr; /* pointer to struct io_uring_zcrx_area_reg */ + __u64 region_ptr; /* struct io_uring_region_desc * */ + + struct io_uring_zcrx_offsets offsets; + __u32 zcrx_id; + __u32 rx_buf_len; + __u64 __resv[3]; +}; + static long page_size; #define AREA_SIZE (8192 * page_size) #define SEND_SIZE (512 * 4096) @@ -65,6 +83,8 @@ static bool cfg_oneshot; static int cfg_oneshot_recvs; static int cfg_send_size = SEND_SIZE; static struct sockaddr_in6 cfg_addr; +static unsigned cfg_rx_buf_len = 0; +static bool cfg_dry_run; static char *payload; static void *area_ptr; @@ -128,14 +148,28 @@ static void setup_zcrx(struct io_uring *ring) if (!ifindex) error(1, 0, "bad interface name: %s", cfg_ifname); - area_ptr = mmap(NULL, - AREA_SIZE, - PROT_READ | PROT_WRITE, - MAP_ANONYMOUS | MAP_PRIVATE, - 0, - 0); - if (area_ptr == MAP_FAILED) - error(1, 0, "mmap(): zero copy area"); + if (cfg_rx_buf_len && cfg_rx_buf_len != page_size) { + area_ptr = mmap(NULL, + AREA_SIZE, + PROT_READ | PROT_WRITE, + MAP_ANONYMOUS | MAP_PRIVATE | + MAP_HUGETLB | MAP_HUGE_2MB, + -1, + 0); + if (area_ptr == MAP_FAILED) { + printf("Can't allocate huge pages\n"); + exit(SKIP_CODE); + } + } else { + area_ptr = mmap(NULL, + AREA_SIZE, + PROT_READ | PROT_WRITE, + MAP_ANONYMOUS | MAP_PRIVATE, + 0, + 0); + if (area_ptr == MAP_FAILED) + error(1, 0, "mmap(): zero copy area"); + } ring_size = get_refill_ring_size(rq_entries); ring_ptr = mmap(NULL, @@ -157,17 +191,23 @@ static void setup_zcrx(struct io_uring *ring) .flags = 0, }; - struct io_uring_zcrx_ifq_reg reg = { + struct t_io_uring_zcrx_ifq_reg reg = { .if_idx = ifindex, .if_rxq = cfg_queue_id, .rq_entries = rq_entries, .area_ptr = (__u64)(unsigned long)&area_reg, .region_ptr = (__u64)(unsigned long)®ion_reg, + .rx_buf_len = cfg_rx_buf_len, }; - ret = io_uring_register_ifq(ring, ®); - if (ret) + ret = io_uring_register_ifq(ring, (void *)®); + if (cfg_rx_buf_len && (ret == -EINVAL || ret == -EOPNOTSUPP || + ret == -ERANGE)) { + printf("Large chunks are not supported %i\n", ret); + exit(SKIP_CODE); + } else if (ret) { error(1, 0, "io_uring_register_ifq(): %d", ret); + } rq_ring.khead = (unsigned int *)((char *)ring_ptr + reg.offsets.head); rq_ring.ktail = (unsigned int *)((char *)ring_ptr + reg.offsets.tail); @@ -323,6 +363,8 @@ static void run_server(void) io_uring_queue_init(512, &ring, flags); setup_zcrx(&ring); + if (cfg_dry_run) + return; add_accept(&ring, fd); @@ -383,7 +425,7 @@ static void parse_opts(int argc, char **argv) usage(argv[0]); cfg_payload_len = max_payload_len; - while ((c = getopt(argc, argv, "sch:p:l:i:q:o:z:")) != -1) { + while ((c = getopt(argc, argv, "sch:p:l:i:q:o:z:x:d")) != -1) { switch (c) { case 's': if (cfg_client) @@ -418,6 +460,12 @@ static void parse_opts(int argc, char **argv) case 'z': cfg_send_size = strtoul(optarg, NULL, 0); break; + case 'x': + cfg_rx_buf_len = page_size * strtoul(optarg, NULL, 0); + break; + case 'd': + cfg_dry_run = true; + break; } } diff --git a/tools/testing/selftests/drivers/net/hw/iou-zcrx.py b/tools/testing/selftests/drivers/net/hw/iou-zcrx.py index 712c806508b5..83061b27f2f2 100755 --- a/tools/testing/selftests/drivers/net/hw/iou-zcrx.py +++ b/tools/testing/selftests/drivers/net/hw/iou-zcrx.py @@ -7,6 +7,7 @@ from lib.py import ksft_run, ksft_exit, KsftSkipEx from lib.py import NetDrvEpEnv from lib.py import bkg, cmd, defer, ethtool, rand_port, wait_port_listen +SKIP_CODE = 42 def _get_current_settings(cfg): output = ethtool(f"-g {cfg.ifname}", json=True)[0] @@ -132,6 +133,42 @@ def test_zcrx_rss(cfg) -> None: cmd(tx_cmd, host=cfg.remote) +def test_zcrx_large_chunks(cfg) -> None: + cfg.require_ipver('6') + + combined_chans = _get_combined_channels(cfg) + if combined_chans < 2: + raise KsftSkipEx('at least 2 combined channels required') + (rx_ring, hds_thresh) = _get_current_settings(cfg) + port = rand_port() + + ethtool(f"-G {cfg.ifname} tcp-data-split on") + defer(ethtool, f"-G {cfg.ifname} tcp-data-split auto") + + ethtool(f"-G {cfg.ifname} hds-thresh 0") + defer(ethtool, f"-G {cfg.ifname} hds-thresh {hds_thresh}") + + ethtool(f"-G {cfg.ifname} rx 64") + defer(ethtool, f"-G {cfg.ifname} rx {rx_ring}") + + ethtool(f"-X {cfg.ifname} equal {combined_chans - 1}") + defer(ethtool, f"-X {cfg.ifname} default") + + flow_rule_id = _set_flow_rule(cfg, port, combined_chans - 1) + defer(ethtool, f"-N {cfg.ifname} delete {flow_rule_id}") + + rx_cmd = f"{cfg.bin_local} -s -p {port} -i {cfg.ifname} -q {combined_chans - 1} -x 2" + tx_cmd = f"{cfg.bin_remote} -c -h {cfg.addr_v['6']} -p {port} -l 12840" + + probe = cmd(rx_cmd + " -d", fail=False) + if probe.ret == SKIP_CODE: + raise KsftSkipEx(probe.stdout) + + with bkg(rx_cmd, exit_wait=True): + wait_port_listen(port, proto="tcp") + cmd(tx_cmd, host=cfg.remote) + + def main() -> None: with NetDrvEpEnv(__file__) as cfg: cfg.bin_local = path.abspath(path.dirname(__file__) + "/../../../drivers/net/hw/iou-zcrx") -- 2.52.0