From: Patrisious Haddad When moving to switchdev mode when the device doesn't support IPsec, we try to clean up the IPsec resources anyway which causes the crash below, fix that by correctly checking for IPsec support before trying to clean up its resources. [27642.515799] WARNING: arch/x86/mm/fault.c:1276 at do_user_addr_fault+0x18a/0x680, CPU#4: devlink/6490 [27642.517159] Modules linked in: xt_conntrack xt_MASQUERADE ip6table_nat ip6table_filter ip6_tables iptable_nat nf_nat xt_addrtype rpcsec_gss_krb5 auth_rpcgss oid_registry overlay mlx5_fwctl nfnetlink zram zsmalloc mlx5_ib fuse rpcrdma rdma_ucm ib_uverbs ib_iser libiscsi scsi_transport_iscsi ib_umad rdma_cm ib_ipoib iw_cm ib_cm mlx5_core ib_core [27642.521358] CPU: 4 UID: 0 PID: 6490 Comm: devlink Not tainted 6.19.0-rc5_for_upstream_min_debug_2026_01_14_16_47 #1 NONE [27642.522923] Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS rel-1.16.3-0-ga6ed6b701f0a-prebuilt.qemu.org 04/01/2014 [27642.524528] RIP: 0010:do_user_addr_fault+0x18a/0x680 [27642.525362] Code: ff 0f 84 75 03 00 00 48 89 ee 4c 89 e7 e8 5e b9 22 00 49 89 c0 48 85 c0 0f 84 a8 02 00 00 f7 c3 60 80 00 00 74 22 31 c9 eb ae <0f> 0b 48 83 c4 10 48 89 ea 48 89 de 4c 89 f7 5b 5d 41 5c 41 5d 41 [27642.528166] RSP: 0018:ffff88810770f6b8 EFLAGS: 00010046 [27642.529038] RAX: 0000000000000000 RBX: 0000000000000002 RCX: ffff88810b980f00 [27642.530158] RDX: 00000000000000a0 RSI: 0000000000000002 RDI: ffff88810770f728 [27642.531270] RBP: 00000000000000a0 R08: 0000000000000000 R09: 0000000000000000 [27642.532383] R10: 0000000000000000 R11: 0000000000000000 R12: ffff888103f3c4c0 [27642.533499] R13: 0000000000000000 R14: ffff88810770f728 R15: 0000000000000000 [27642.534614] FS: 00007f197c741740(0000) GS:ffff88856a94c000(0000) knlGS:0000000000000000 [27642.535915] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [27642.536858] CR2: 00000000000000a0 CR3: 000000011334c003 CR4: 0000000000172eb0 [27642.537982] Call Trace: [27642.538466] [27642.538907] exc_page_fault+0x76/0x140 [27642.539583] asm_exc_page_fault+0x22/0x30 [27642.540282] RIP: 0010:_raw_spin_lock_irqsave+0x10/0x30 [27642.541134] Code: 07 85 c0 75 11 ba ff 00 00 00 f0 0f b1 17 75 06 b8 01 00 00 00 c3 31 c0 c3 90 0f 1f 44 00 00 53 9c 5b fa 31 c0 ba 01 00 00 00 0f b1 17 75 05 48 89 d8 5b c3 89 c6 e8 7e 02 00 00 48 89 d8 5b [27642.543936] RSP: 0018:ffff88810770f7d8 EFLAGS: 00010046 [27642.544803] RAX: 0000000000000000 RBX: 0000000000000202 RCX: ffff888113ad96d8 [27642.545916] RDX: 0000000000000001 RSI: ffff88810770f818 RDI: 00000000000000a0 [27642.547027] RBP: 0000000000000098 R08: 0000000000000400 R09: ffff88810b980f00 [27642.548140] R10: 0000000000000001 R11: ffff888101845a80 R12: 00000000000000a8 [27642.549263] R13: ffffffffa02a9060 R14: 00000000000000a0 R15: ffff8881130d8a40 [27642.550379] complete_all+0x20/0x90 [27642.551010] mlx5e_ipsec_disable_events+0xb6/0xf0 [mlx5_core] [27642.552022] mlx5e_nic_disable+0x12d/0x220 [mlx5_core] [27642.552929] mlx5e_detach_netdev+0x66/0xf0 [mlx5_core] [27642.553822] mlx5e_netdev_change_profile+0x5b/0x120 [mlx5_core] [27642.554821] mlx5e_vport_rep_load+0x419/0x590 [mlx5_core] [27642.555757] ? xa_load+0x53/0x90 [27642.556361] __esw_offloads_load_rep+0x54/0x70 [mlx5_core] [27642.557328] mlx5_esw_offloads_rep_load+0x45/0xd0 [mlx5_core] [27642.558320] esw_offloads_enable+0xb4b/0xc90 [mlx5_core] [27642.559247] mlx5_eswitch_enable_locked+0x34e/0x4f0 [mlx5_core] [27642.560257] ? mlx5_rescan_drivers_locked+0x222/0x2d0 [mlx5_core] [27642.561284] mlx5_devlink_eswitch_mode_set+0x5ac/0x9c0 [mlx5_core] [27642.562334] ? devlink_rate_set_ops_supported+0x21/0x3a0 [27642.563220] devlink_nl_eswitch_set_doit+0x67/0xe0 [27642.564026] genl_family_rcv_msg_doit+0xe0/0x130 [27642.564816] genl_rcv_msg+0x183/0x290 [27642.565466] ? __devlink_nl_pre_doit.isra.0+0x160/0x160 [27642.566329] ? devlink_nl_eswitch_get_doit+0x290/0x290 [27642.567181] ? devlink_nl_pre_doit_parent_dev_optional+0x20/0x20 [27642.568147] ? genl_family_rcv_msg_dumpit+0xf0/0xf0 [27642.568966] netlink_rcv_skb+0x4b/0xf0 [27642.569629] genl_rcv+0x24/0x40 [27642.570215] netlink_unicast+0x255/0x380 [27642.570901] ? __alloc_skb+0xfa/0x1e0 [27642.571560] netlink_sendmsg+0x1f3/0x420 [27642.572249] __sock_sendmsg+0x38/0x60 [27642.572911] __sys_sendto+0x119/0x180 [27642.573561] ? __sys_recvmsg+0x5c/0xb0 [27642.574227] __x64_sys_sendto+0x20/0x30 [27642.574904] do_syscall_64+0x55/0xc10 [27642.575554] entry_SYSCALL_64_after_hwframe+0x4b/0x53 [27642.576391] RIP: 0033:0x7f197c85e807 [27642.577050] Code: c7 c0 ff ff ff ff eb be 66 2e 0f 1f 84 00 00 00 00 00 90 f3 0f 1e fa 80 3d 45 08 0d 00 00 41 89 ca 74 10 b8 2c 00 00 00 0f 05 <48> 3d 00 f0 ff ff 77 69 c3 55 48 89 e5 53 48 83 ec 38 44 89 4d d0 [27642.579846] RSP: 002b:00007ffebd4e2248 EFLAGS: 00000202 ORIG_RAX: 000000000000002c [27642.581082] RAX: ffffffffffffffda RBX: 000055cfcd9cd2a0 RCX: 00007f197c85e807 [27642.582200] RDX: 0000000000000038 RSI: 000055cfcd9cd490 RDI: 0000000000000003 [27642.583320] RBP: 00007ffebd4e2290 R08: 00007f197c942200 R09: 000000000000000c [27642.584437] R10: 0000000000000000 R11: 0000000000000202 R12: 0000000000000000 [27642.585555] R13: 000055cfcd9cd490 R14: 00007ffebd4e45d1 R15: 000055cfcd9cd2a0 [27642.586671] [27642.587121] ---[ end trace 0000000000000000 ]--- [27642.587910] BUG: kernel NULL pointer dereference, address: 00000000000000a0 Fixes: 664f76be38a1 ("net/mlx5: Fix IPsec cleanup over MPV device") Signed-off-by: Patrisious Haddad Reviewed-by: Leon Romanovsky Signed-off-by: Tariq Toukan --- drivers/net/ethernet/mellanox/mlx5/core/en_accel/ipsec_fs.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ipsec_fs.c b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ipsec_fs.c index 197a1c6930c0..329608c59313 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ipsec_fs.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ipsec_fs.c @@ -2912,7 +2912,7 @@ void mlx5e_ipsec_disable_events(struct mlx5e_priv *priv) goto out; peer_priv = mlx5_devcom_get_next_peer_data(priv->devcom, &tmp); - if (peer_priv) + if (peer_priv && peer_priv->ipsec) complete_all(&peer_priv->ipsec->comp); mlx5_devcom_for_each_peer_end(priv->devcom); -- 2.44.0 From: Carolina Jubran The check on mlx5_esw_host_functions_enabled(esw->dev) for adding VF peer miss rules is incorrect. These rules match traffic from peer's VFs, so the local device's host function status is irrelevant. Remove this check to ensure peer VF traffic is properly handled regardless of local host configuration. Also fix the PF peer miss rule deletion to be symmetric with the add path, so only attempt to delete the rule if it was actually created. Fixes: 520369ef43a8 ("net/mlx5: Support disabling host PFs") Signed-off-by: Carolina Jubran Signed-off-by: Tariq Toukan --- .../mellanox/mlx5/core/eswitch_offloads.c | 27 +++++++++---------- 1 file changed, 12 insertions(+), 15 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c b/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c index 1366f6e489bd..2f55ea3f8bf8 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c @@ -1241,21 +1241,17 @@ static int esw_add_fdb_peer_miss_rules(struct mlx5_eswitch *esw, flows[peer_vport->index] = flow; } - if (mlx5_esw_host_functions_enabled(esw->dev)) { - mlx5_esw_for_each_vf_vport(peer_esw, i, peer_vport, - mlx5_core_max_vfs(peer_dev)) { - esw_set_peer_miss_rule_source_port(esw, peer_esw, - spec, - peer_vport->vport); - - flow = mlx5_add_flow_rules(mlx5_eswitch_get_slow_fdb(esw), - spec, &flow_act, &dest, 1); - if (IS_ERR(flow)) { - err = PTR_ERR(flow); - goto add_vf_flow_err; - } - flows[peer_vport->index] = flow; + mlx5_esw_for_each_vf_vport(peer_esw, i, peer_vport, + mlx5_core_max_vfs(peer_dev)) { + esw_set_peer_miss_rule_source_port(esw, peer_esw, spec, + peer_vport->vport); + flow = mlx5_add_flow_rules(mlx5_eswitch_get_slow_fdb(esw), + spec, &flow_act, &dest, 1); + if (IS_ERR(flow)) { + err = PTR_ERR(flow); + goto add_vf_flow_err; } + flows[peer_vport->index] = flow; } if (mlx5_core_ec_sriov_enabled(peer_dev)) { @@ -1347,7 +1343,8 @@ static void esw_del_fdb_peer_miss_rules(struct mlx5_eswitch *esw, mlx5_del_flow_rules(flows[peer_vport->index]); } - if (mlx5_core_is_ecpf_esw_manager(peer_dev)) { + if (mlx5_core_is_ecpf_esw_manager(peer_dev) && + mlx5_esw_host_functions_enabled(peer_dev)) { peer_vport = mlx5_eswitch_get_vport(peer_esw, MLX5_VPORT_PF); mlx5_del_flow_rules(flows[peer_vport->index]); } -- 2.44.0 From: Gal Pressman In case of a TX error CQE, a recovery flow is triggered, mlx5e_reset_txqsq_cc_pc() resets dma_fifo_cc to 0 but not dma_fifo_pc, desyncing the DMA FIFO producer and consumer. After recovery, the producer pushes new DMA entries at the old dma_fifo_pc, while the consumer reads from position 0. This causes us to unmap stale DMA addresses from before the recovery. The DMA FIFO is a purely software construct with no HW counterpart. At the point of reset, all WQEs have been flushed so dma_fifo_cc is already equal to dma_fifo_pc. There is no need to reset either counter, similar to how skb_fifo pc/cc are untouched. Remove the 'dma_fifo_cc = 0' reset. This fixes the following WARNING: WARNING: CPU: 0 PID: 0 at drivers/iommu/dma-iommu.c:1240 iommu_dma_unmap_page+0x79/0x90 Modules linked in: mlx5_vdpa vringh vdpa bonding mlx5_ib mlx5_vfio_pci ipip mlx5_fwctl tunnel4 mlx5_core ib_ipoib geneve ip6_gre ip_gre gre nf_tables ip6_tunnel rdma_ucm ib_uverbs ib_umad vfio_pci vfio_pci_core act_mirred act_skbedit act_vlan vhost_net vhost tap ip6table_mangle ip6table_nat ip6table_filter ip6_tables iptable_mangle cls_matchall nfnetlink_cttimeout act_gact cls_flower sch_ingress vhost_iotlb iptable_raw tunnel6 vfio_iommu_type1 vfio openvswitch nsh rpcsec_gss_krb5 auth_rpcgss oid_registry xt_conntrack xt_MASQUERADE nf_conntrack_netlink nfnetlink iptable_nat nf_nat xt_addrtype br_netfilter overlay zram zsmalloc rpcrdma ib_iser libiscsi scsi_transport_iscsi rdma_cm iw_cm ib_cm ib_core fuse [last unloaded: nf_tables] CPU: 0 UID: 0 PID: 0 Comm: swapper/0 Not tainted 6.13.0-rc5_for_upstream_min_debug_2024_12_30_21_33 #1 Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS rel-1.13.0-0-gf21b5a4aeb02-prebuilt.qemu.org 04/01/2014 RIP: 0010:iommu_dma_unmap_page+0x79/0x90 Code: 2b 4d 3b 21 72 26 4d 3b 61 08 73 20 49 89 d8 44 89 f9 5b 4c 89 f2 4c 89 e6 48 89 ef 5d 41 5c 41 5d 41 5e 41 5f e9 c7 ae 9e ff <0f> 0b 5b 5d 41 5c 41 5d 41 5e 41 5f c3 66 2e 0f 1f 84 00 00 00 00 Call Trace: ? __warn+0x7d/0x110 ? iommu_dma_unmap_page+0x79/0x90 ? report_bug+0x16d/0x180 ? handle_bug+0x4f/0x90 ? exc_invalid_op+0x14/0x70 ? asm_exc_invalid_op+0x16/0x20 ? iommu_dma_unmap_page+0x79/0x90 ? iommu_dma_unmap_page+0x2e/0x90 dma_unmap_page_attrs+0x10d/0x1b0 mlx5e_tx_wi_dma_unmap+0xbe/0x120 [mlx5_core] mlx5e_poll_tx_cq+0x16d/0x690 [mlx5_core] mlx5e_napi_poll+0x8b/0xac0 [mlx5_core] __napi_poll+0x24/0x190 net_rx_action+0x32a/0x3b0 ? mlx5_eq_comp_int+0x7e/0x270 [mlx5_core] ? notifier_call_chain+0x35/0xa0 handle_softirqs+0xc9/0x270 irq_exit_rcu+0x71/0xd0 common_interrupt+0x7f/0xa0 asm_common_interrupt+0x22/0x40 Fixes: db75373c91b0 ("net/mlx5e: Recover Send Queue (SQ) from error state") Signed-off-by: Gal Pressman Reviewed-by: Dragos Tatulea Signed-off-by: Tariq Toukan --- drivers/net/ethernet/mellanox/mlx5/core/en/reporter_tx.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/reporter_tx.c b/drivers/net/ethernet/mellanox/mlx5/core/en/reporter_tx.c index 60ba840e00fa..afdeb1b3d425 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en/reporter_tx.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/reporter_tx.c @@ -47,7 +47,6 @@ static void mlx5e_reset_txqsq_cc_pc(struct mlx5e_txqsq *sq) "SQ 0x%x: cc (0x%x) != pc (0x%x)\n", sq->sqn, sq->cc, sq->pc); sq->cc = 0; - sq->dma_fifo_cc = 0; sq->pc = 0; } -- 2.44.0 From: Dragos Tatulea XDP multi-buf programs can modify the layout of the XDP buffer when the program calls bpf_xdp_pull_data() or bpf_xdp_adjust_tail(). The referenced commit in the fixes tag corrected the assumption in the mlx5 driver that the XDP buffer layout doesn't change during a program execution. However, this fix introduced another issue: the dropped fragments still need to be counted on the driver side to avoid page fragment reference counting issues. The issue was discovered by the drivers/net/xdp.py selftest, more specifically the test_xdp_native_tx_mb: - The mlx5 driver allocates a page_pool page and initializes it with a frag counter of 64 (pp_ref_count=64) and the internal frag counter to 0. - The test sends one packet with no payload. - On RX (mlx5e_skb_from_cqe_mpwrq_nonlinear()), mlx5 configures the XDP buffer with the packet data starting in the first fragment which is the page mentioned above. - The XDP program runs and calls bpf_xdp_pull_data() which moves the header into the linear part of the XDP buffer. As the packet doesn't contain more data, the program drops the tail fragment since it no longer contains any payload (pp_ref_count=63). - mlx5 device skips counting this fragment. Internal frag counter remains 0. - mlx5 releases all 64 fragments of the page but page pp_ref_count is 63 => negative reference counting error. Resulting splat during the test: WARNING: CPU: 0 PID: 188225 at ./include/net/page_pool/helpers.h:297 mlx5e_page_release_fragmented.isra.0+0xbd/0xe0 [mlx5_core] Modules linked in: [...] CPU: 0 UID: 0 PID: 188225 Comm: ip Not tainted 6.18.0-rc7_for_upstream_min_debug_2025_12_08_11_44 #1 NONE Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS rel-1.13.0-0-gf21b5a4aeb02-prebuilt.qemu.org 04/01/2014 RIP: 0010:mlx5e_page_release_fragmented.isra.0+0xbd/0xe0 [mlx5_core] [...] Call Trace: mlx5e_free_rx_mpwqe+0x20a/0x250 [mlx5_core] mlx5e_dealloc_rx_mpwqe+0x37/0xb0 [mlx5_core] mlx5e_free_rx_descs+0x11a/0x170 [mlx5_core] mlx5e_close_rq+0x78/0xa0 [mlx5_core] mlx5e_close_queues+0x46/0x2a0 [mlx5_core] mlx5e_close_channel+0x24/0x90 [mlx5_core] mlx5e_close_channels+0x5d/0xf0 [mlx5_core] mlx5e_safe_switch_params+0x2ec/0x380 [mlx5_core] mlx5e_change_mtu+0x11d/0x490 [mlx5_core] mlx5e_change_nic_mtu+0x19/0x30 [mlx5_core] netif_set_mtu_ext+0xfc/0x240 do_setlink.isra.0+0x226/0x1100 rtnl_newlink+0x7a9/0xba0 rtnetlink_rcv_msg+0x220/0x3c0 netlink_rcv_skb+0x4b/0xf0 netlink_unicast+0x255/0x380 netlink_sendmsg+0x1f3/0x420 __sock_sendmsg+0x38/0x60 ____sys_sendmsg+0x1e8/0x240 ___sys_sendmsg+0x7c/0xb0 [...] __sys_sendmsg+0x5f/0xb0 do_syscall_64+0x55/0xc70 The problem applies for XDP_PASS as well which is handled in a different code path in the driver. This patch fixes the issue by doing page frag counting on all the original XDP buffer fragments for all relevant XDP actions (XDP_TX , XDP_REDIRECT and XDP_PASS). This is basically reverting to the original counting before the commit in the fixes tag. As frag_page is still pointing to the original tail, the nr_frags parameter to xdp_update_skb_frags_info() needs to be calculated in a different way to reflect the new nr_frags. Fixes: 87bcef158ac1 ("net/mlx5e: RX, Fix generating skb from non-linear xdp_buff for striding RQ") Signed-off-by: Dragos Tatulea Cc: Amery Hung Reviewed-by: Nimrod Oren Signed-off-by: Tariq Toukan --- drivers/net/ethernet/mellanox/mlx5/core/en_rx.c | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c index efcfcddab376..40e53a612989 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c @@ -1957,14 +1957,13 @@ mlx5e_skb_from_cqe_mpwrq_nonlinear(struct mlx5e_rq *rq, struct mlx5e_mpw_info *w if (prog) { u8 nr_frags_free, old_nr_frags = sinfo->nr_frags; + u8 new_nr_frags; u32 len; if (mlx5e_xdp_handle(rq, prog, mxbuf)) { if (__test_and_clear_bit(MLX5E_RQ_FLAG_XDP_XMIT, rq->flags)) { struct mlx5e_frag_page *pfp; - frag_page -= old_nr_frags - sinfo->nr_frags; - for (pfp = head_page; pfp < frag_page; pfp++) pfp->frags++; @@ -1975,13 +1974,12 @@ mlx5e_skb_from_cqe_mpwrq_nonlinear(struct mlx5e_rq *rq, struct mlx5e_mpw_info *w return NULL; /* page/packet was consumed by XDP */ } - nr_frags_free = old_nr_frags - sinfo->nr_frags; - if (unlikely(nr_frags_free)) { - frag_page -= nr_frags_free; + new_nr_frags = sinfo->nr_frags; + nr_frags_free = old_nr_frags - new_nr_frags; + if (unlikely(nr_frags_free)) truesize -= (nr_frags_free - 1) * PAGE_SIZE + ALIGN(pg_consumed_bytes, BIT(rq->mpwqe.log_stride_sz)); - } len = mxbuf->xdp.data_end - mxbuf->xdp.data; @@ -2003,7 +2001,7 @@ mlx5e_skb_from_cqe_mpwrq_nonlinear(struct mlx5e_rq *rq, struct mlx5e_mpw_info *w struct mlx5e_frag_page *pagep; /* sinfo->nr_frags is reset by build_skb, calculate again. */ - xdp_update_skb_frags_info(skb, frag_page - head_page, + xdp_update_skb_frags_info(skb, new_nr_frags, sinfo->xdp_frags_size, truesize, xdp_buff_get_skb_flags(&mxbuf->xdp)); -- 2.44.0 From: Dragos Tatulea XDP multi-buf programs can modify the layout of the XDP buffer when the program calls bpf_xdp_pull_data() or bpf_xdp_adjust_tail(). The referenced commit in the fixes tag corrected the assumption in the mlx5 driver that the XDP buffer layout doesn't change during a program execution. However, this fix introduced another issue: the dropped fragments still need to be counted on the driver side to avoid page fragment reference counting issues. Such issue can be observed with the test_xdp_native_adjst_tail_shrnk_data selftest when using a payload of 3600 and shrinking by 256 bytes (an upcoming selftest patch): the last fragment gets released by the XDP code but doesn't get tracked by the driver. This results in a negative pp_ref_count during page release and the following splat: WARNING: include/net/page_pool/helpers.h:297 at mlx5e_page_release_fragmented.isra.0+0x4a/0x50 [mlx5_core], CPU#12: ip/3137 Modules linked in: [...] CPU: 12 UID: 0 PID: 3137 Comm: ip Not tainted 6.19.0-rc3+ #12 NONE Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS rel-1.16.3-0-ga6ed6b701f0a-prebuilt.qemu.org 04/01/2014 RIP: 0010:mlx5e_page_release_fragmented.isra.0+0x4a/0x50 [mlx5_core] [...] Call Trace: mlx5e_dealloc_rx_wqe+0xcb/0x1a0 [mlx5_core] mlx5e_free_rx_descs+0x7f/0x110 [mlx5_core] mlx5e_close_rq+0x50/0x60 [mlx5_core] mlx5e_close_queues+0x36/0x2c0 [mlx5_core] mlx5e_close_channel+0x1c/0x50 [mlx5_core] mlx5e_close_channels+0x45/0x80 [mlx5_core] mlx5e_safe_switch_params+0x1a5/0x230 [mlx5_core] mlx5e_change_mtu+0xf3/0x2f0 [mlx5_core] netif_set_mtu_ext+0xf1/0x230 do_setlink.isra.0+0x219/0x1180 rtnl_newlink+0x79f/0xb60 rtnetlink_rcv_msg+0x213/0x3a0 netlink_rcv_skb+0x48/0xf0 netlink_unicast+0x24a/0x350 netlink_sendmsg+0x1ee/0x410 __sock_sendmsg+0x38/0x60 ____sys_sendmsg+0x232/0x280 ___sys_sendmsg+0x78/0xb0 __sys_sendmsg+0x5f/0xb0 [...] do_syscall_64+0x57/0xc50 This patch fixes the issue by doing page frag counting on all the original XDP buffer fragments for all relevant XDP actions (XDP_TX , XDP_REDIRECT and XDP_PASS). This is basically reverting to the original counting before the commit in the fixes tag. As frag_page is still pointing to the original tail, the nr_frags parameter to xdp_update_skb_frags_info() needs to be calculated in a different way to reflect the new nr_frags. Fixes: afd5ba577c10 ("net/mlx5e: RX, Fix generating skb from non-linear xdp_buff for legacy RQ") Signed-off-by: Dragos Tatulea Cc: Amery Hung Signed-off-by: Tariq Toukan --- drivers/net/ethernet/mellanox/mlx5/core/en_rx.c | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c index 40e53a612989..268e20884757 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c @@ -1589,6 +1589,7 @@ mlx5e_skb_from_cqe_nonlinear(struct mlx5e_rq *rq, struct mlx5e_wqe_frag_info *wi struct skb_shared_info *sinfo; u32 frag_consumed_bytes; struct bpf_prog *prog; + u8 nr_frags_free = 0; struct sk_buff *skb; dma_addr_t addr; u32 truesize; @@ -1631,15 +1632,13 @@ mlx5e_skb_from_cqe_nonlinear(struct mlx5e_rq *rq, struct mlx5e_wqe_frag_info *wi prog = rcu_dereference(rq->xdp_prog); if (prog) { - u8 nr_frags_free, old_nr_frags = sinfo->nr_frags; + u8 old_nr_frags = sinfo->nr_frags; if (mlx5e_xdp_handle(rq, prog, mxbuf)) { if (__test_and_clear_bit(MLX5E_RQ_FLAG_XDP_XMIT, rq->flags)) { struct mlx5e_wqe_frag_info *pwi; - wi -= old_nr_frags - sinfo->nr_frags; - for (pwi = head_wi; pwi < wi; pwi++) pwi->frag_page->frags++; } @@ -1647,10 +1646,8 @@ mlx5e_skb_from_cqe_nonlinear(struct mlx5e_rq *rq, struct mlx5e_wqe_frag_info *wi } nr_frags_free = old_nr_frags - sinfo->nr_frags; - if (unlikely(nr_frags_free)) { - wi -= nr_frags_free; + if (unlikely(nr_frags_free)) truesize -= nr_frags_free * frag_info->frag_stride; - } } skb = mlx5e_build_linear_skb( @@ -1666,7 +1663,7 @@ mlx5e_skb_from_cqe_nonlinear(struct mlx5e_rq *rq, struct mlx5e_wqe_frag_info *wi if (xdp_buff_has_frags(&mxbuf->xdp)) { /* sinfo->nr_frags is reset by build_skb, calculate again. */ - xdp_update_skb_frags_info(skb, wi - head_wi - 1, + xdp_update_skb_frags_info(skb, wi - head_wi - nr_frags_free - 1, sinfo->xdp_frags_size, truesize, xdp_buff_get_skb_flags(&mxbuf->xdp)); -- 2.44.0