Move multiple copies of same code snippet doing `gro_flush` and `gro_normal_list` into separate helper function. Signed-off-by: Samiullah Khawaja Reviewed-by: Willem de Bruijn --- v6: - gro_flush_helper renamed to gro_flush_normal and moved to gro.h. Also used it in kernel/bpf/cpumap.c --- include/net/gro.h | 6 ++++++ kernel/bpf/cpumap.c | 3 +-- net/core/dev.c | 9 +++------ 3 files changed, 10 insertions(+), 8 deletions(-) diff --git a/include/net/gro.h b/include/net/gro.h index 22d3a69e4404..a0fca7ac6e7e 100644 --- a/include/net/gro.h +++ b/include/net/gro.h @@ -534,6 +534,12 @@ static inline void gro_normal_list(struct gro_node *gro) gro->rx_count = 0; } +static inline void gro_flush_normal(struct gro_node *gro, bool flush_old) +{ + gro_flush(gro, flush_old); + gro_normal_list(gro); +} + /* Queue one GRO_NORMAL SKB up for list processing. If batch size exceeded, * pass the whole batch up to the stack. */ diff --git a/kernel/bpf/cpumap.c b/kernel/bpf/cpumap.c index 67e8a2fc1a99..b2b7b8ec2c2a 100644 --- a/kernel/bpf/cpumap.c +++ b/kernel/bpf/cpumap.c @@ -282,8 +282,7 @@ static void cpu_map_gro_flush(struct bpf_cpu_map_entry *rcpu, bool empty) * This is equivalent to how NAPI decides whether to perform a full * flush. */ - gro_flush(&rcpu->gro, !empty && HZ >= 1000); - gro_normal_list(&rcpu->gro); + gro_flush_normal(&rcpu->gro, !empty && HZ >= 1000); } static int cpu_map_kthread_run(void *data) diff --git a/net/core/dev.c b/net/core/dev.c index 621a639aeba1..cc216a461743 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -6574,8 +6574,7 @@ bool napi_complete_done(struct napi_struct *n, int work_done) * it, we need to bound somehow the time packets are kept in * the GRO layer. */ - gro_flush(&n->gro, !!timeout); - gro_normal_list(&n->gro); + gro_flush_normal(&n->gro, !!timeout); if (unlikely(!list_empty(&n->poll_list))) { /* If n->poll_list is not empty, we need to mask irqs */ @@ -6645,8 +6644,7 @@ static void __busy_poll_stop(struct napi_struct *napi, bool skip_schedule) } /* Flush too old packets. If HZ < 1000, flush all packets */ - gro_flush(&napi->gro, HZ >= 1000); - gro_normal_list(&napi->gro); + gro_flush_normal(&napi->gro, HZ >= 1000); clear_bit(NAPI_STATE_SCHED, &napi->state); } @@ -7511,8 +7509,7 @@ static int __napi_poll(struct napi_struct *n, bool *repoll) } /* Flush too old packets. If HZ < 1000, flush all packets */ - gro_flush(&n->gro, HZ >= 1000); - gro_normal_list(&n->gro); + gro_flush_normal(&n->gro, HZ >= 1000); /* Some drivers may have called napi_schedule * prior to exhausting their budget. -- 2.50.0.727.gbf7dc18ff4-goog Prepare for adding an enum type for NAPI threaded states by adding dev_set_threaded_hint API. De-export the existing dev_set_threaded API and only use it internally. Update existing drivers to use dev_set_threaded_hint instead of the de-exported dev_set_threaded. Signed-off-by: Samiullah Khawaja Reviewed-by: Willem de Bruijn --- drivers/net/ethernet/atheros/atl1c/atl1c_main.c | 2 +- drivers/net/ethernet/mellanox/mlxsw/pci.c | 2 +- drivers/net/ethernet/renesas/ravb_main.c | 2 +- drivers/net/wireguard/device.c | 2 +- drivers/net/wireless/ath/ath10k/snoc.c | 2 +- drivers/net/wireless/mediatek/mt76/debugfs.c | 2 +- include/linux/netdevice.h | 2 +- net/core/dev.c | 7 ++++++- net/core/dev.h | 2 ++ 9 files changed, 15 insertions(+), 8 deletions(-) diff --git a/drivers/net/ethernet/atheros/atl1c/atl1c_main.c b/drivers/net/ethernet/atheros/atl1c/atl1c_main.c index ef1a51347351..4519379d284c 100644 --- a/drivers/net/ethernet/atheros/atl1c/atl1c_main.c +++ b/drivers/net/ethernet/atheros/atl1c/atl1c_main.c @@ -2688,7 +2688,7 @@ static int atl1c_probe(struct pci_dev *pdev, const struct pci_device_id *ent) adapter->mii.mdio_write = atl1c_mdio_write; adapter->mii.phy_id_mask = 0x1f; adapter->mii.reg_num_mask = MDIO_CTRL_REG_MASK; - dev_set_threaded(netdev, true); + dev_set_threaded_hint(netdev); for (i = 0; i < adapter->rx_queue_count; ++i) netif_napi_add(netdev, &adapter->rrd_ring[i].napi, atl1c_clean_rx); diff --git a/drivers/net/ethernet/mellanox/mlxsw/pci.c b/drivers/net/ethernet/mellanox/mlxsw/pci.c index 058dcabfaa2e..268b830ce17e 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/pci.c +++ b/drivers/net/ethernet/mellanox/mlxsw/pci.c @@ -156,7 +156,7 @@ static int mlxsw_pci_napi_devs_init(struct mlxsw_pci *mlxsw_pci) } strscpy(mlxsw_pci->napi_dev_rx->name, "mlxsw_rx", sizeof(mlxsw_pci->napi_dev_rx->name)); - dev_set_threaded(mlxsw_pci->napi_dev_rx, true); + dev_set_threaded_hint(mlxsw_pci->napi_dev_rx); return 0; diff --git a/drivers/net/ethernet/renesas/ravb_main.c b/drivers/net/ethernet/renesas/ravb_main.c index c9f4976a3527..31b2cb11764d 100644 --- a/drivers/net/ethernet/renesas/ravb_main.c +++ b/drivers/net/ethernet/renesas/ravb_main.c @@ -3075,7 +3075,7 @@ static int ravb_probe(struct platform_device *pdev) if (info->coalesce_irqs) { netdev_sw_irq_coalesce_default_on(ndev); if (num_present_cpus() == 1) - dev_set_threaded(ndev, true); + dev_set_threaded_hint(ndev); } /* Network device register */ diff --git a/drivers/net/wireguard/device.c b/drivers/net/wireguard/device.c index 4a529f1f9bea..1f3e4e7cc90a 100644 --- a/drivers/net/wireguard/device.c +++ b/drivers/net/wireguard/device.c @@ -366,7 +366,7 @@ static int wg_newlink(struct net_device *dev, if (ret < 0) goto err_free_handshake_queue; - dev_set_threaded(dev, true); + dev_set_threaded_hint(dev); ret = register_netdevice(dev); if (ret < 0) goto err_uninit_ratelimiter; diff --git a/drivers/net/wireless/ath/ath10k/snoc.c b/drivers/net/wireless/ath/ath10k/snoc.c index d51f2e5a79a4..d6412330d8ef 100644 --- a/drivers/net/wireless/ath/ath10k/snoc.c +++ b/drivers/net/wireless/ath/ath10k/snoc.c @@ -936,7 +936,7 @@ static int ath10k_snoc_hif_start(struct ath10k *ar) bitmap_clear(ar_snoc->pending_ce_irqs, 0, CE_COUNT_MAX); - dev_set_threaded(ar->napi_dev, true); + dev_set_threaded_hint(ar->napi_dev); ath10k_core_napi_enable(ar); /* IRQs are left enabled when we restart due to a firmware crash */ if (!test_bit(ATH10K_SNOC_FLAG_RECOVERY, &ar_snoc->flags)) diff --git a/drivers/net/wireless/mediatek/mt76/debugfs.c b/drivers/net/wireless/mediatek/mt76/debugfs.c index b6a2746c187d..bd62a87aabfe 100644 --- a/drivers/net/wireless/mediatek/mt76/debugfs.c +++ b/drivers/net/wireless/mediatek/mt76/debugfs.c @@ -34,7 +34,7 @@ mt76_napi_threaded_set(void *data, u64 val) return -EOPNOTSUPP; if (dev->napi_dev->threaded != val) - return dev_set_threaded(dev->napi_dev, val); + return dev_set_threaded_hint(dev->napi_dev); return 0; } diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index e49d8c98d284..87591448a008 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -589,7 +589,7 @@ static inline bool napi_complete(struct napi_struct *n) return napi_complete_done(n, 0); } -int dev_set_threaded(struct net_device *dev, bool threaded); +int dev_set_threaded_hint(struct net_device *dev); void napi_disable(struct napi_struct *n); void napi_disable_locked(struct napi_struct *n); diff --git a/net/core/dev.c b/net/core/dev.c index cc216a461743..d3f72e5f4904 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -7025,7 +7025,12 @@ int dev_set_threaded(struct net_device *dev, bool threaded) return err; } -EXPORT_SYMBOL(dev_set_threaded); + +int dev_set_threaded_hint(struct net_device *dev) +{ + return dev_set_threaded(dev, true); +} +EXPORT_SYMBOL(dev_set_threaded_hint); /** * netif_queue_set_napi - Associate queue with the napi diff --git a/net/core/dev.h b/net/core/dev.h index a603387fb566..23cbeaad8ca2 100644 --- a/net/core/dev.h +++ b/net/core/dev.h @@ -322,6 +322,8 @@ static inline bool napi_get_threaded(struct napi_struct *n) int napi_set_threaded(struct napi_struct *n, bool threaded); +int dev_set_threaded(struct net_device *dev, bool threaded); + int rps_cpumask_housekeeping(struct cpumask *mask); #if defined(CONFIG_DEBUG_NET) && defined(CONFIG_BPF_SYSCALL) -- 2.50.0.727.gbf7dc18ff4-goog Instead of using '0' and '1' for napi threaded state use an enum with 'disabled' and 'enabled' states. This is to prepare for the next patch to add a new 'busy-poll-enabled' state. Also move and update the 'threaded' field in struct net_device to u8 instead of bool. Tested: ./tools/testing/selftests/net/nl_netdev.py TAP version 13 1..7 ok 1 nl_netdev.empty_check ok 2 nl_netdev.lo_check ok 3 nl_netdev.page_pool_check ok 4 nl_netdev.napi_list_check ok 5 nl_netdev.dev_set_threaded ok 6 nl_netdev.napi_set_threaded ok 7 nl_netdev.nsim_rxq_reset_down # Totals: pass:7 fail:0 xfail:0 xpass:0 skip:0 error:0 Signed-off-by: Samiullah Khawaja v6: - Moved threaded in struct netdevice up to fill the cacheline hole. - Changed dev_set_threaded to dev_set_threaded_hint and removed the second argument that was always set to true by all the drivers. Exported only dev_set_threaded_hint and made dev_set_threaded core only function. This change is done in a separate commit. - Updated documentation comment for threaded in struct netdevice. --- Documentation/netlink/specs/netdev.yaml | 13 ++++--- .../networking/net_cachelines/net_device.rst | 2 +- include/linux/netdevice.h | 7 ++-- include/uapi/linux/netdev.h | 5 +++ net/core/dev.c | 12 ++++--- net/core/dev.h | 13 ++++--- net/core/netdev-genl-gen.c | 2 +- net/core/netdev-genl.c | 2 +- tools/include/uapi/linux/netdev.h | 5 +++ tools/testing/selftests/net/nl_netdev.py | 36 +++++++++---------- 10 files changed, 58 insertions(+), 39 deletions(-) diff --git a/Documentation/netlink/specs/netdev.yaml b/Documentation/netlink/specs/netdev.yaml index 85d0ea6ac426..11edbf9c5727 100644 --- a/Documentation/netlink/specs/netdev.yaml +++ b/Documentation/netlink/specs/netdev.yaml @@ -85,6 +85,10 @@ definitions: name: qstats-scope type: flags entries: [queue] + - + name: napi-threaded + type: enum + entries: [ disabled, enabled ] attribute-sets: - @@ -286,11 +290,10 @@ attribute-sets: - name: threaded doc: Whether the NAPI is configured to operate in threaded polling - mode. If this is set to 1 then the NAPI context operates in - threaded polling mode. - type: uint - checks: - max: 1 + mode. If this is set to `enabled` then the NAPI context operates + in threaded polling mode. + type: u32 + enum: napi-threaded - name: xsk-info attributes: [] diff --git a/Documentation/networking/net_cachelines/net_device.rst b/Documentation/networking/net_cachelines/net_device.rst index c69cc89c958e..cb6daccac0b6 100644 --- a/Documentation/networking/net_cachelines/net_device.rst +++ b/Documentation/networking/net_cachelines/net_device.rst @@ -68,6 +68,7 @@ unsigned_char addr_assign_type unsigned_char addr_len unsigned_char upper_level unsigned_char lower_level +u8 threaded napi_poll(napi_enable,dev_set_threaded) unsigned_short neigh_priv_len unsigned_short padded unsigned_short dev_id @@ -165,7 +166,6 @@ struct sfp_bus* sfp_bus struct lock_class_key* qdisc_tx_busylock bool proto_down unsigned:1 wol_enabled -unsigned:1 threaded napi_poll(napi_enable,dev_set_threaded) unsigned_long:1 see_all_hwtstamp_requests unsigned_long:1 change_proto_down unsigned_long:1 netns_immutable diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 87591448a008..97cf14a9b469 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -369,7 +369,7 @@ struct napi_config { u64 irq_suspend_timeout; u32 defer_hard_irqs; cpumask_t affinity_mask; - bool threaded; + u8 threaded; unsigned int napi_id; }; @@ -1871,6 +1871,7 @@ enum netdev_reg_state { * @addr_len: Hardware address length * @upper_level: Maximum depth level of upper devices. * @lower_level: Maximum depth level of lower devices. + * @threaded: napi threaded state. * @neigh_priv_len: Used in neigh_alloc() * @dev_id: Used to differentiate devices that share * the same link layer address @@ -2010,8 +2011,6 @@ enum netdev_reg_state { * switch driver and used to set the phys state of the * switch port. * - * @threaded: napi threaded mode is enabled - * * @irq_affinity_auto: driver wants the core to store and re-assign the IRQ * affinity. Set by netif_enable_irq_affinity(), then * the driver must create a persistent napi by @@ -2247,6 +2246,7 @@ struct net_device { unsigned char addr_len; unsigned char upper_level; unsigned char lower_level; + u8 threaded; unsigned short neigh_priv_len; unsigned short dev_id; @@ -2428,7 +2428,6 @@ struct net_device { struct sfp_bus *sfp_bus; struct lock_class_key *qdisc_tx_busylock; bool proto_down; - bool threaded; bool irq_affinity_auto; bool rx_cpu_rmap_auto; diff --git a/include/uapi/linux/netdev.h b/include/uapi/linux/netdev.h index 1f3719a9a0eb..48eb49aa03d4 100644 --- a/include/uapi/linux/netdev.h +++ b/include/uapi/linux/netdev.h @@ -77,6 +77,11 @@ enum netdev_qstats_scope { NETDEV_QSTATS_SCOPE_QUEUE = 1, }; +enum netdev_napi_threaded { + NETDEV_NAPI_THREADED_DISABLED, + NETDEV_NAPI_THREADED_ENABLED, +}; + enum { NETDEV_A_DEV_IFINDEX = 1, NETDEV_A_DEV_PAD, diff --git a/net/core/dev.c b/net/core/dev.c index d3f72e5f4904..ec65b03492b1 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -6959,7 +6959,8 @@ static void napi_stop_kthread(struct napi_struct *napi) napi->thread = NULL; } -int napi_set_threaded(struct napi_struct *napi, bool threaded) +int napi_set_threaded(struct napi_struct *napi, + enum netdev_napi_threaded threaded) { if (threaded) { if (!napi->thread) { @@ -6984,7 +6985,8 @@ int napi_set_threaded(struct napi_struct *napi, bool threaded) return 0; } -int dev_set_threaded(struct net_device *dev, bool threaded) +int dev_set_threaded(struct net_device *dev, + enum netdev_napi_threaded threaded) { struct napi_struct *napi; int err = 0; @@ -6996,7 +6998,7 @@ int dev_set_threaded(struct net_device *dev, bool threaded) if (!napi->thread) { err = napi_kthread_create(napi); if (err) { - threaded = false; + threaded = NETDEV_NAPI_THREADED_DISABLED; break; } } @@ -7028,7 +7030,7 @@ int dev_set_threaded(struct net_device *dev, bool threaded) int dev_set_threaded_hint(struct net_device *dev) { - return dev_set_threaded(dev, true); + return dev_set_threaded(dev, NETDEV_NAPI_THREADED_ENABLED); } EXPORT_SYMBOL(dev_set_threaded_hint); @@ -7345,7 +7347,7 @@ void netif_napi_add_weight_locked(struct net_device *dev, * threaded mode will not be enabled in napi_enable(). */ if (dev->threaded && napi_kthread_create(napi)) - dev->threaded = false; + dev->threaded = NETDEV_NAPI_THREADED_DISABLED; netif_napi_set_irq_locked(napi, -1); } EXPORT_SYMBOL(netif_napi_add_weight_locked); diff --git a/net/core/dev.h b/net/core/dev.h index 23cbeaad8ca2..ab6fac65ec24 100644 --- a/net/core/dev.h +++ b/net/core/dev.h @@ -315,14 +315,19 @@ static inline void napi_set_irq_suspend_timeout(struct napi_struct *n, WRITE_ONCE(n->irq_suspend_timeout, timeout); } -static inline bool napi_get_threaded(struct napi_struct *n) +static inline enum netdev_napi_threaded napi_get_threaded(struct napi_struct *n) { - return test_bit(NAPI_STATE_THREADED, &n->state); + if (test_bit(NAPI_STATE_THREADED, &n->state)) + return NETDEV_NAPI_THREADED_ENABLED; + + return NETDEV_NAPI_THREADED_DISABLED; } -int napi_set_threaded(struct napi_struct *n, bool threaded); +int napi_set_threaded(struct napi_struct *n, + enum netdev_napi_threaded threaded); -int dev_set_threaded(struct net_device *dev, bool threaded); +int dev_set_threaded(struct net_device *dev, + enum netdev_napi_threaded threaded); int rps_cpumask_housekeeping(struct cpumask *mask); diff --git a/net/core/netdev-genl-gen.c b/net/core/netdev-genl-gen.c index 0994bd68a7e6..e9a2a6f26cb7 100644 --- a/net/core/netdev-genl-gen.c +++ b/net/core/netdev-genl-gen.c @@ -97,7 +97,7 @@ static const struct nla_policy netdev_napi_set_nl_policy[NETDEV_A_NAPI_THREADED [NETDEV_A_NAPI_DEFER_HARD_IRQS] = NLA_POLICY_FULL_RANGE(NLA_U32, &netdev_a_napi_defer_hard_irqs_range), [NETDEV_A_NAPI_GRO_FLUSH_TIMEOUT] = { .type = NLA_UINT, }, [NETDEV_A_NAPI_IRQ_SUSPEND_TIMEOUT] = { .type = NLA_UINT, }, - [NETDEV_A_NAPI_THREADED] = NLA_POLICY_MAX(NLA_UINT, 1), + [NETDEV_A_NAPI_THREADED] = NLA_POLICY_MAX(NLA_U32, 1), }; /* NETDEV_CMD_BIND_TX - do */ diff --git a/net/core/netdev-genl.c b/net/core/netdev-genl.c index 5875df372415..6314eb7bdf69 100644 --- a/net/core/netdev-genl.c +++ b/net/core/netdev-genl.c @@ -333,7 +333,7 @@ netdev_nl_napi_set_config(struct napi_struct *napi, struct genl_info *info) int ret; threaded = nla_get_uint(info->attrs[NETDEV_A_NAPI_THREADED]); - ret = napi_set_threaded(napi, !!threaded); + ret = napi_set_threaded(napi, threaded); if (ret) return ret; } diff --git a/tools/include/uapi/linux/netdev.h b/tools/include/uapi/linux/netdev.h index 1f3719a9a0eb..48eb49aa03d4 100644 --- a/tools/include/uapi/linux/netdev.h +++ b/tools/include/uapi/linux/netdev.h @@ -77,6 +77,11 @@ enum netdev_qstats_scope { NETDEV_QSTATS_SCOPE_QUEUE = 1, }; +enum netdev_napi_threaded { + NETDEV_NAPI_THREADED_DISABLED, + NETDEV_NAPI_THREADED_ENABLED, +}; + enum { NETDEV_A_DEV_IFINDEX = 1, NETDEV_A_DEV_PAD, diff --git a/tools/testing/selftests/net/nl_netdev.py b/tools/testing/selftests/net/nl_netdev.py index c8ffade79a52..5c66421ab8aa 100755 --- a/tools/testing/selftests/net/nl_netdev.py +++ b/tools/testing/selftests/net/nl_netdev.py @@ -52,14 +52,14 @@ def napi_set_threaded(nf) -> None: napi1_id = napis[1]['id'] # set napi threaded and verify - nf.napi_set({'id': napi0_id, 'threaded': 1}) + nf.napi_set({'id': napi0_id, 'threaded': "enabled"}) napi0 = nf.napi_get({'id': napi0_id}) - ksft_eq(napi0['threaded'], 1) + ksft_eq(napi0['threaded'], "enabled") ksft_ne(napi0.get('pid'), None) # check it is not set for napi1 napi1 = nf.napi_get({'id': napi1_id}) - ksft_eq(napi1['threaded'], 0) + ksft_eq(napi1['threaded'], "disabled") ksft_eq(napi1.get('pid'), None) ip(f"link set dev {nsim.ifname} down") @@ -67,18 +67,18 @@ def napi_set_threaded(nf) -> None: # verify if napi threaded is still set napi0 = nf.napi_get({'id': napi0_id}) - ksft_eq(napi0['threaded'], 1) + ksft_eq(napi0['threaded'], "enabled") ksft_ne(napi0.get('pid'), None) # check it is still not set for napi1 napi1 = nf.napi_get({'id': napi1_id}) - ksft_eq(napi1['threaded'], 0) + ksft_eq(napi1['threaded'], "disabled") ksft_eq(napi1.get('pid'), None) # unset napi threaded and verify - nf.napi_set({'id': napi0_id, 'threaded': 0}) + nf.napi_set({'id': napi0_id, 'threaded': "disabled"}) napi0 = nf.napi_get({'id': napi0_id}) - ksft_eq(napi0['threaded'], 0) + ksft_eq(napi0['threaded'], "disabled") ksft_eq(napi0.get('pid'), None) # set threaded at device level @@ -86,10 +86,10 @@ def napi_set_threaded(nf) -> None: # check napi threaded is set for both napis napi0 = nf.napi_get({'id': napi0_id}) - ksft_eq(napi0['threaded'], 1) + ksft_eq(napi0['threaded'], "enabled") ksft_ne(napi0.get('pid'), None) napi1 = nf.napi_get({'id': napi1_id}) - ksft_eq(napi1['threaded'], 1) + ksft_eq(napi1['threaded'], "enabled") ksft_ne(napi1.get('pid'), None) # unset threaded at device level @@ -97,16 +97,16 @@ def napi_set_threaded(nf) -> None: # check napi threaded is unset for both napis napi0 = nf.napi_get({'id': napi0_id}) - ksft_eq(napi0['threaded'], 0) + ksft_eq(napi0['threaded'], "disabled") ksft_eq(napi0.get('pid'), None) napi1 = nf.napi_get({'id': napi1_id}) - ksft_eq(napi1['threaded'], 0) + ksft_eq(napi1['threaded'], "disabled") ksft_eq(napi1.get('pid'), None) # set napi threaded for napi0 nf.napi_set({'id': napi0_id, 'threaded': 1}) napi0 = nf.napi_get({'id': napi0_id}) - ksft_eq(napi0['threaded'], 1) + ksft_eq(napi0['threaded'], "enabled") ksft_ne(napi0.get('pid'), None) # unset threaded at device level @@ -114,10 +114,10 @@ def napi_set_threaded(nf) -> None: # check napi threaded is unset for both napis napi0 = nf.napi_get({'id': napi0_id}) - ksft_eq(napi0['threaded'], 0) + ksft_eq(napi0['threaded'], "disabled") ksft_eq(napi0.get('pid'), None) napi1 = nf.napi_get({'id': napi1_id}) - ksft_eq(napi1['threaded'], 0) + ksft_eq(napi1['threaded'], "disabled") ksft_eq(napi1.get('pid'), None) def dev_set_threaded(nf) -> None: @@ -141,10 +141,10 @@ def dev_set_threaded(nf) -> None: # check napi threaded is set for both napis napi0 = nf.napi_get({'id': napi0_id}) - ksft_eq(napi0['threaded'], 1) + ksft_eq(napi0['threaded'], "enabled") ksft_ne(napi0.get('pid'), None) napi1 = nf.napi_get({'id': napi1_id}) - ksft_eq(napi1['threaded'], 1) + ksft_eq(napi1['threaded'], "enabled") ksft_ne(napi1.get('pid'), None) # unset threaded @@ -152,10 +152,10 @@ def dev_set_threaded(nf) -> None: # check napi threaded is unset for both napis napi0 = nf.napi_get({'id': napi0_id}) - ksft_eq(napi0['threaded'], 0) + ksft_eq(napi0['threaded'], "disabled") ksft_eq(napi0.get('pid'), None) napi1 = nf.napi_get({'id': napi1_id}) - ksft_eq(napi1['threaded'], 0) + ksft_eq(napi1['threaded'], "disabled") ksft_eq(napi1.get('pid'), None) def nsim_rxq_reset_down(nf) -> None: -- 2.50.0.727.gbf7dc18ff4-goog Add a new state to napi state enum: - NAPI_STATE_THREADED_BUSY_POLL Threaded busy poll is enabled/running for this napi. Following changes are introduced in the napi scheduling and state logic: - When threaded busy poll is enabled through sysfs or netlink it also enables NAPI_STATE_THREADED so a kthread is created per napi. It also sets NAPI_STATE_THREADED_BUSY_POLL bit on each napi to indicate that it is going to busy poll the napi. - When napi is scheduled with NAPI_STATE_SCHED_THREADED and associated kthread is woken up, the kthread owns the context. If NAPI_STATE_THREADED_BUSY_POLL and NAPI_STATE_SCHED_THREADED both are set then it means that kthread can busy poll. - To keep busy polling and to avoid scheduling of the interrupts, the napi_complete_done returns false when both NAPI_STATE_SCHED_THREADED and NAPI_STATE_THREADED_BUSY_POLL flags are set. Also napi_complete_done returns early to avoid the NAPI_STATE_SCHED_THREADED being unset. - If at any point NAPI_STATE_THREADED_BUSY_POLL is unset, the napi_complete_done will run and unset the NAPI_STATE_SCHED_THREADED bit also. This will make the associated kthread go to sleep as per existing logic. Signed-off-by: Samiullah Khawaja Reviewed-by: Willem de Bruijn --- Documentation/ABI/testing/sysfs-class-net | 3 +- Documentation/netlink/specs/netdev.yaml | 5 +- Documentation/networking/napi.rst | 63 +++++++++++++++++++- include/linux/netdevice.h | 11 +++- include/uapi/linux/netdev.h | 1 + net/core/dev.c | 71 +++++++++++++++++++---- net/core/dev.h | 3 + net/core/net-sysfs.c | 2 +- net/core/netdev-genl-gen.c | 2 +- tools/include/uapi/linux/netdev.h | 1 + 10 files changed, 145 insertions(+), 17 deletions(-) diff --git a/Documentation/ABI/testing/sysfs-class-net b/Documentation/ABI/testing/sysfs-class-net index ebf21beba846..15d7d36a8294 100644 --- a/Documentation/ABI/testing/sysfs-class-net +++ b/Documentation/ABI/testing/sysfs-class-net @@ -343,7 +343,7 @@ Date: Jan 2021 KernelVersion: 5.12 Contact: netdev@vger.kernel.org Description: - Boolean value to control the threaded mode per device. User could + Integer value to control the threaded mode per device. User could set this value to enable/disable threaded mode for all napi belonging to this device, without the need to do device up/down. @@ -351,4 +351,5 @@ Description: == ================================== 0 threaded mode disabled for this dev 1 threaded mode enabled for this dev + 2 threaded mode enabled, and busy polling enabled. == ================================== diff --git a/Documentation/netlink/specs/netdev.yaml b/Documentation/netlink/specs/netdev.yaml index 11edbf9c5727..70a4a9c8afef 100644 --- a/Documentation/netlink/specs/netdev.yaml +++ b/Documentation/netlink/specs/netdev.yaml @@ -88,7 +88,7 @@ definitions: - name: napi-threaded type: enum - entries: [ disabled, enabled ] + entries: [ disabled, enabled, busy-poll-enabled ] attribute-sets: - @@ -291,7 +291,8 @@ attribute-sets: name: threaded doc: Whether the NAPI is configured to operate in threaded polling mode. If this is set to `enabled` then the NAPI context operates - in threaded polling mode. + in threaded polling mode. If this is set to `busy-poll-enabled` + then the NAPI kthread also does busypolling. type: u32 enum: napi-threaded - diff --git a/Documentation/networking/napi.rst b/Documentation/networking/napi.rst index a15754adb041..a1e76341a99a 100644 --- a/Documentation/networking/napi.rst +++ b/Documentation/networking/napi.rst @@ -263,7 +263,9 @@ are not well known). Busy polling is enabled by either setting ``SO_BUSY_POLL`` on selected sockets or using the global ``net.core.busy_poll`` and ``net.core.busy_read`` sysctls. An io_uring API for NAPI busy polling -also exists. +also exists. Threaded polling of NAPI also has a mode to busy poll for +packets (:ref:`threaded busy polling`) using the same +thread that is used for NAPI processing. epoll-based busy polling ------------------------ @@ -426,6 +428,65 @@ Therefore, setting ``gro_flush_timeout`` and ``napi_defer_hard_irqs`` is the recommended usage, because otherwise setting ``irq-suspend-timeout`` might not have any discernible effect. +.. _threaded_busy_poll: + +Threaded NAPI busy polling +-------------------------- + +Threaded NAPI allows processing of packets from each NAPI in a kthread in +kernel. Threaded NAPI busy polling extends this and adds support to do +continuous busy polling of this NAPI. This can be used to enable busy polling +independent of userspace application or the API (epoll, io_uring, raw sockets) +being used in userspace to process the packets. + +It can be enabled for each NAPI using netlink interface or at device level using +the threaded NAPI sysctl. + +For example, using following script: + +.. code-block:: bash + + $ ynl --family netdev --do napi-set \ + --json='{"id": 66, "threaded": "busy-poll-enabled"}' + + +Enabling it for each NAPI allows finer control to enable busy pollling for +only a set of NIC queues which will get traffic with low latency requirements. + +Depending on application requirement, user might want to set affinity of the +kthread that is busy polling each NAPI. User might also want to set priority +and the scheduler of the thread depending on the latency requirements. + +For a hard low-latency application, user might want to dedicate the full core +for the NAPI polling so the NIC queue descriptors are picked up from the queue +as soon as they appear. Once enabled, the NAPI thread will poll the NIC queues +continuously without sleeping. This will keep the CPU core busy with 100% +usage. For more relaxed low-latency requirement, user might want to share the +core with other threads by setting thread affinity and priority. + +Once threaded busy polling is enabled for a NAPI, PID of the kthread can be +fetched using netlink interface so the affinity, priority and scheduler +configuration can be done. + +For example, following script can be used to fetch the pid: + +.. code-block:: bash + + $ ynl --family netdev --do napi-get --json='{"id": 66}' + +This will output something like following, the pid `258` is the PID of the +kthread that is polling this NAPI. + +.. code-block:: bash + + $ {'defer-hard-irqs': 0, + 'gro-flush-timeout': 0, + 'id': 66, + 'ifindex': 2, + 'irq-suspend-timeout': 0, + 'pid': 258, + 'threaded': 'enable'} + .. _threaded: Threaded NAPI diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 97cf14a9b469..6682b975febd 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -427,6 +427,8 @@ enum { NAPI_STATE_THREADED, /* The poll is performed inside its own thread*/ NAPI_STATE_SCHED_THREADED, /* Napi is currently scheduled in threaded mode */ NAPI_STATE_HAS_NOTIFIER, /* Napi has an IRQ notifier */ + NAPI_STATE_THREADED_BUSY_POLL, /* The threaded napi poller will busy poll */ + NAPI_STATE_SCHED_THREADED_BUSY_POLL, /* The threaded napi poller is busy polling */ }; enum { @@ -441,8 +443,14 @@ enum { NAPIF_STATE_THREADED = BIT(NAPI_STATE_THREADED), NAPIF_STATE_SCHED_THREADED = BIT(NAPI_STATE_SCHED_THREADED), NAPIF_STATE_HAS_NOTIFIER = BIT(NAPI_STATE_HAS_NOTIFIER), + NAPIF_STATE_THREADED_BUSY_POLL = BIT(NAPI_STATE_THREADED_BUSY_POLL), + NAPIF_STATE_SCHED_THREADED_BUSY_POLL = + BIT(NAPI_STATE_SCHED_THREADED_BUSY_POLL), }; +#define NAPIF_STATE_THREADED_BUSY_POLL_MASK \ + (NAPIF_STATE_THREADED | NAPIF_STATE_THREADED_BUSY_POLL) + enum gro_result { GRO_MERGED, GRO_MERGED_FREE, @@ -1871,7 +1879,8 @@ enum netdev_reg_state { * @addr_len: Hardware address length * @upper_level: Maximum depth level of upper devices. * @lower_level: Maximum depth level of lower devices. - * @threaded: napi threaded state. + * @threaded: napi threaded mode is disabled, enabled or + * enabled with busy polling. * @neigh_priv_len: Used in neigh_alloc() * @dev_id: Used to differentiate devices that share * the same link layer address diff --git a/include/uapi/linux/netdev.h b/include/uapi/linux/netdev.h index 48eb49aa03d4..8163afb15377 100644 --- a/include/uapi/linux/netdev.h +++ b/include/uapi/linux/netdev.h @@ -80,6 +80,7 @@ enum netdev_qstats_scope { enum netdev_napi_threaded { NETDEV_NAPI_THREADED_DISABLED, NETDEV_NAPI_THREADED_ENABLED, + NETDEV_NAPI_THREADED_BUSY_POLL_ENABLED, }; enum { diff --git a/net/core/dev.c b/net/core/dev.c index ec65b03492b1..9511c69dc8e8 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -78,6 +78,7 @@ #include #include #include +#include #include #include #include @@ -6554,7 +6555,8 @@ bool napi_complete_done(struct napi_struct *n, int work_done) * the guarantee we will be called later. */ if (unlikely(n->state & (NAPIF_STATE_NPSVC | - NAPIF_STATE_IN_BUSY_POLL))) + NAPIF_STATE_IN_BUSY_POLL | + NAPIF_STATE_SCHED_THREADED_BUSY_POLL))) return false; if (work_done) { @@ -6959,6 +6961,19 @@ static void napi_stop_kthread(struct napi_struct *napi) napi->thread = NULL; } +static void napi_set_threaded_state(struct napi_struct *napi, + enum netdev_napi_threaded threaded) +{ + unsigned long val; + + val = 0; + if (threaded == NETDEV_NAPI_THREADED_BUSY_POLL_ENABLED) + val |= NAPIF_STATE_THREADED_BUSY_POLL; + if (threaded) + val |= NAPIF_STATE_THREADED; + set_mask_bits(&napi->state, NAPIF_STATE_THREADED_BUSY_POLL_MASK, val); +} + int napi_set_threaded(struct napi_struct *napi, enum netdev_napi_threaded threaded) { @@ -6979,7 +6994,7 @@ int napi_set_threaded(struct napi_struct *napi, } else { /* Make sure kthread is created before THREADED bit is set. */ smp_mb__before_atomic(); - assign_bit(NAPI_STATE_THREADED, &napi->state, threaded); + napi_set_threaded_state(napi, threaded); } return 0; @@ -7017,12 +7032,15 @@ int dev_set_threaded(struct net_device *dev, * polled. In this case, the switch between threaded mode and * softirq mode will happen in the next round of napi_schedule(). * This should not cause hiccups/stalls to the live traffic. + * + * Switch to busy_poll threaded napi will occur after the threaded + * napi is scheduled. */ list_for_each_entry(napi, &dev->napi_list, dev_list) { if (!threaded && napi->thread) napi_stop_kthread(napi); else - assign_bit(NAPI_STATE_THREADED, &napi->state, threaded); + napi_set_threaded_state(napi, threaded); } return err; @@ -7369,7 +7387,9 @@ void napi_disable_locked(struct napi_struct *n) } new = val | NAPIF_STATE_SCHED | NAPIF_STATE_NPSVC; - new &= ~(NAPIF_STATE_THREADED | NAPIF_STATE_PREFER_BUSY_POLL); + new &= ~(NAPIF_STATE_THREADED + | NAPIF_STATE_THREADED_BUSY_POLL + | NAPIF_STATE_PREFER_BUSY_POLL); } while (!try_cmpxchg(&n->state, &val, new)); hrtimer_cancel(&n->timer); @@ -7413,7 +7433,7 @@ void napi_enable_locked(struct napi_struct *n) new = val & ~(NAPIF_STATE_SCHED | NAPIF_STATE_NPSVC); if (n->dev->threaded && n->thread) - new |= NAPIF_STATE_THREADED; + napi_set_threaded_state(n, n->dev->threaded); } while (!try_cmpxchg(&n->state, &val, new)); } EXPORT_SYMBOL(napi_enable_locked); @@ -7581,7 +7601,7 @@ static int napi_thread_wait(struct napi_struct *napi) return -1; } -static void napi_threaded_poll_loop(struct napi_struct *napi) +static void napi_threaded_poll_loop(struct napi_struct *napi, bool busy_poll) { struct bpf_net_context __bpf_net_ctx, *bpf_net_ctx; struct softnet_data *sd; @@ -7610,22 +7630,53 @@ static void napi_threaded_poll_loop(struct napi_struct *napi) } skb_defer_free_flush(sd); bpf_net_ctx_clear(bpf_net_ctx); + + /* Flush too old packets. If HZ < 1000, flush all packets */ + if (busy_poll) + gro_flush_normal(&napi->gro, HZ >= 1000); local_bh_enable(); - if (!repoll) + /* If busy polling then do not break here because we need to + * call cond_resched and rcu_softirq_qs_periodic to prevent + * watchdog warnings. + */ + if (!repoll && !busy_poll) break; rcu_softirq_qs_periodic(last_qs); cond_resched(); + + if (!repoll) + break; } } static int napi_threaded_poll(void *data) { struct napi_struct *napi = data; + bool busy_poll_sched; + unsigned long val; + bool busy_poll; + + while (!napi_thread_wait(napi)) { + /* Once woken up, this means that we are scheduled as threaded + * napi and this thread owns the napi context, if busy poll + * state is set then busy poll this napi. + */ + val = READ_ONCE(napi->state); + busy_poll = val & NAPIF_STATE_THREADED_BUSY_POLL; + busy_poll_sched = val & NAPIF_STATE_SCHED_THREADED_BUSY_POLL; - while (!napi_thread_wait(napi)) - napi_threaded_poll_loop(napi); + /* Do not busy poll if napi is disabled. */ + if (unlikely(val & NAPIF_STATE_DISABLE)) + busy_poll = false; + + if (busy_poll != busy_poll_sched) + assign_bit(NAPI_STATE_SCHED_THREADED_BUSY_POLL, + &napi->state, busy_poll); + + napi_threaded_poll_loop(napi, busy_poll); + } return 0; } @@ -12808,7 +12859,7 @@ static void run_backlog_napi(unsigned int cpu) { struct softnet_data *sd = per_cpu_ptr(&softnet_data, cpu); - napi_threaded_poll_loop(&sd->backlog); + napi_threaded_poll_loop(&sd->backlog, false); } static void backlog_napi_setup(unsigned int cpu) diff --git a/net/core/dev.h b/net/core/dev.h index ab6fac65ec24..082270ed5b92 100644 --- a/net/core/dev.h +++ b/net/core/dev.h @@ -317,6 +317,9 @@ static inline void napi_set_irq_suspend_timeout(struct napi_struct *n, static inline enum netdev_napi_threaded napi_get_threaded(struct napi_struct *n) { + if (test_bit(NAPI_STATE_THREADED_BUSY_POLL, &n->state)) + return NETDEV_NAPI_THREADED_BUSY_POLL_ENABLED; + if (test_bit(NAPI_STATE_THREADED, &n->state)) return NETDEV_NAPI_THREADED_ENABLED; diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c index 8f897e2c8b4f..3ebf8153666b 100644 --- a/net/core/net-sysfs.c +++ b/net/core/net-sysfs.c @@ -754,7 +754,7 @@ static int modify_napi_threaded(struct net_device *dev, unsigned long val) if (list_empty(&dev->napi_list)) return -EOPNOTSUPP; - if (val != 0 && val != 1) + if (val > NETDEV_NAPI_THREADED_BUSY_POLL_ENABLED) return -EOPNOTSUPP; ret = dev_set_threaded(dev, val); diff --git a/net/core/netdev-genl-gen.c b/net/core/netdev-genl-gen.c index e9a2a6f26cb7..ff20435c45d2 100644 --- a/net/core/netdev-genl-gen.c +++ b/net/core/netdev-genl-gen.c @@ -97,7 +97,7 @@ static const struct nla_policy netdev_napi_set_nl_policy[NETDEV_A_NAPI_THREADED [NETDEV_A_NAPI_DEFER_HARD_IRQS] = NLA_POLICY_FULL_RANGE(NLA_U32, &netdev_a_napi_defer_hard_irqs_range), [NETDEV_A_NAPI_GRO_FLUSH_TIMEOUT] = { .type = NLA_UINT, }, [NETDEV_A_NAPI_IRQ_SUSPEND_TIMEOUT] = { .type = NLA_UINT, }, - [NETDEV_A_NAPI_THREADED] = NLA_POLICY_MAX(NLA_U32, 1), + [NETDEV_A_NAPI_THREADED] = NLA_POLICY_MAX(NLA_U32, 2), }; /* NETDEV_CMD_BIND_TX - do */ diff --git a/tools/include/uapi/linux/netdev.h b/tools/include/uapi/linux/netdev.h index 48eb49aa03d4..8163afb15377 100644 --- a/tools/include/uapi/linux/netdev.h +++ b/tools/include/uapi/linux/netdev.h @@ -80,6 +80,7 @@ enum netdev_qstats_scope { enum netdev_napi_threaded { NETDEV_NAPI_THREADED_DISABLED, NETDEV_NAPI_THREADED_ENABLED, + NETDEV_NAPI_THREADED_BUSY_POLL_ENABLED, }; enum { -- 2.50.0.727.gbf7dc18ff4-goog Add testcase to run busy poll test with threaded napi busy poll enabled. Signed-off-by: Samiullah Khawaja Reviewed-by: Willem de Bruijn --- tools/testing/selftests/net/busy_poll_test.sh | 25 ++++++++++++++++++- tools/testing/selftests/net/busy_poller.c | 14 ++++++++--- 2 files changed, 35 insertions(+), 4 deletions(-) diff --git a/tools/testing/selftests/net/busy_poll_test.sh b/tools/testing/selftests/net/busy_poll_test.sh index 7d2d40812074..ab230df1057e 100755 --- a/tools/testing/selftests/net/busy_poll_test.sh +++ b/tools/testing/selftests/net/busy_poll_test.sh @@ -27,6 +27,9 @@ NAPI_DEFER_HARD_IRQS=100 GRO_FLUSH_TIMEOUT=50000 SUSPEND_TIMEOUT=20000000 +# NAPI threaded busy poll config +NAPI_THREADED_POLL=2 + setup_ns() { set -e @@ -62,6 +65,9 @@ cleanup_ns() test_busypoll() { suspend_value=${1:-0} + napi_threaded_value=${2:-0} + prefer_busy_poll_value=${3:-$PREFER_BUSY_POLL} + tmp_file=$(mktemp) out_file=$(mktemp) @@ -73,10 +79,11 @@ test_busypoll() -b${SERVER_IP} \ -m${MAX_EVENTS} \ -u${BUSY_POLL_USECS} \ - -P${PREFER_BUSY_POLL} \ + -P${prefer_busy_poll_value} \ -g${BUSY_POLL_BUDGET} \ -i${NSIM_SV_IFIDX} \ -s${suspend_value} \ + -t${napi_threaded_value} \ -o${out_file}& wait_local_port_listen nssv ${SERVER_PORT} tcp @@ -109,6 +116,15 @@ test_busypoll_with_suspend() return $? } +test_busypoll_with_napi_threaded() +{ + # Only enable napi threaded poll. Set suspend timeout and prefer busy + # poll to 0. + test_busypoll 0 ${NAPI_THREADED_POLL} 0 + + return $? +} + ### ### Code start ### @@ -154,6 +170,13 @@ if [ $? -ne 0 ]; then exit 1 fi +test_busypoll_with_napi_threaded +if [ $? -ne 0 ]; then + echo "test_busypoll_with_napi_threaded failed" + cleanup_ns + exit 1 +fi + echo "$NSIM_SV_FD:$NSIM_SV_IFIDX" > $NSIM_DEV_SYS_UNLINK echo $NSIM_CL_ID > $NSIM_DEV_SYS_DEL diff --git a/tools/testing/selftests/net/busy_poller.c b/tools/testing/selftests/net/busy_poller.c index 04c7ff577bb8..f7407f09f635 100644 --- a/tools/testing/selftests/net/busy_poller.c +++ b/tools/testing/selftests/net/busy_poller.c @@ -65,15 +65,16 @@ static uint32_t cfg_busy_poll_usecs; static uint16_t cfg_busy_poll_budget; static uint8_t cfg_prefer_busy_poll; -/* IRQ params */ +/* NAPI params */ static uint32_t cfg_defer_hard_irqs; static uint64_t cfg_gro_flush_timeout; static uint64_t cfg_irq_suspend_timeout; +static enum netdev_napi_threaded cfg_napi_threaded_poll = NETDEV_NAPI_THREADED_DISABLE; static void usage(const char *filepath) { error(1, 0, - "Usage: %s -p -b -m -u -P -g -o -d -r -s -i", + "Usage: %s -p -b -m -u -P -g -o -d -r -s -t -i", filepath); } @@ -86,7 +87,7 @@ static void parse_opts(int argc, char **argv) if (argc <= 1) usage(argv[0]); - while ((c = getopt(argc, argv, "p:m:b:u:P:g:o:d:r:s:i:")) != -1) { + while ((c = getopt(argc, argv, "p:m:b:u:P:g:o:d:r:s:i:t:")) != -1) { /* most options take integer values, except o and b, so reduce * code duplication a bit for the common case by calling * strtoull here and leave bounds checking and casting per @@ -168,6 +169,12 @@ static void parse_opts(int argc, char **argv) cfg_ifindex = (int)tmp; break; + case 't': + if (tmp == ULLONG_MAX || tmp > 2) + error(1, ERANGE, "napi threaded poll value must be 0-2"); + + cfg_napi_threaded_poll = (enum netdev_napi_threaded)tmp; + break; } } @@ -246,6 +253,7 @@ static void setup_queue(void) cfg_gro_flush_timeout); netdev_napi_set_req_set_irq_suspend_timeout(set_req, cfg_irq_suspend_timeout); + netdev_napi_set_req_set_threaded(set_req, cfg_napi_threaded_poll); if (netdev_napi_set(ys, set_req)) error(1, 0, "can't set NAPI params: %s\n", yerr.msg); -- 2.50.0.727.gbf7dc18ff4-goog