Add an ndo_set_rx_mode_async callback that drivers can implement instead of the legacy ndo_set_rx_mode. The legacy callback runs under the netif_addr_lock spinlock with BHs disabled, preventing drivers from sleeping. The async variant runs from a work queue with rtnl_lock and netdev_lock_ops held, in a fully sleepable context. When __dev_set_rx_mode() sees ndo_set_rx_mode_async, it schedules netdev_rx_mode_work instead of calling the driver inline. The work function takes two snapshots (a work copy and a reference copy) of each address list (uc/mc) under the addr_lock, then drops the lock and calls the driver with the work copies. After the driver returns, it reconciles the snapshots back to the real lists under the lock. Add netif_rx_mode_sync() to opportunistically execute the pending workqueue update inline, so that rx mode changes are committed before returning to userspace: - dev_change_flags (SIOCSIFFLAGS / RTM_NEWLINK) - dev_set_promiscuity - dev_set_allmulti - dev_ifsioc SIOCADDMULTI / SIOCDELMULTI - do_setlink (RTM_SETLINK) Note that some deep hierarchies can still skip the inline update of lower devices via: - dev_uc_sync - dev_mc_sync If we do end up hitting user-visible issues, we can add more calls to netif_rx_mode_sync in specific places. Hopefully that will not be necessary: the actual user-visible lists are still synced; it's just the HW state that might be lagging. 
Signed-off-by: Stanislav Fomichev --- Documentation/networking/netdevices.rst | 9 ++ include/linux/netdevice.h | 18 +++ net/core/dev.c | 43 +----- net/core/dev.h | 3 + net/core/dev_addr_lists.c | 194 ++++++++++++++++++++++++ net/core/dev_api.c | 3 + net/core/dev_ioctl.c | 6 +- net/core/rtnetlink.c | 1 + 8 files changed, 234 insertions(+), 43 deletions(-) diff --git a/Documentation/networking/netdevices.rst b/Documentation/networking/netdevices.rst index 35704d115312..8a488c21fd7c 100644 --- a/Documentation/networking/netdevices.rst +++ b/Documentation/networking/netdevices.rst @@ -289,6 +289,15 @@ struct net_device synchronization rules ndo_set_rx_mode: Synchronization: netif_addr_lock spinlock. Context: BHs disabled + Notes: Deprecated in favor of ndo_set_rx_mode_async which runs + in process context. + +ndo_set_rx_mode_async: + Synchronization: rtnl_lock() semaphore. In addition, netdev instance + lock if the driver implements queue management or shaper API. + Context: process (from a work queue) + Notes: Async version of ndo_set_rx_mode which runs in process + context. Receives snapshots of the unicast and multicast address lists. ndo_setup_tc: ``TC_SETUP_BLOCK`` and ``TC_SETUP_FT`` are running under NFT locks diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 151d6f4fd9b3..0a41b216cbcf 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1119,6 +1119,16 @@ struct netdev_net_notifier { * This function is called device changes address list filtering. * If driver handles unicast address filtering, it should set * IFF_UNICAST_FLT in its priv_flags. + * Cannot sleep, called with netif_addr_lock_bh held. + * Deprecated in favor of ndo_set_rx_mode_async. + * + * void (*ndo_set_rx_mode_async)(struct net_device *dev, + * struct netdev_hw_addr_list *uc, + * struct netdev_hw_addr_list *mc); + * Async version of ndo_set_rx_mode which runs in process context + * with rtnl_lock and netdev_lock_ops(dev) held. 
The uc/mc parameters + * are snapshots of the address lists - iterate with + * netdev_hw_addr_list_for_each(ha, uc). * * int (*ndo_set_mac_address)(struct net_device *dev, void *addr); * This function is called when the Media Access Control address @@ -1439,6 +1449,10 @@ struct net_device_ops { void (*ndo_change_rx_flags)(struct net_device *dev, int flags); void (*ndo_set_rx_mode)(struct net_device *dev); + void (*ndo_set_rx_mode_async)( + struct net_device *dev, + struct netdev_hw_addr_list *uc, + struct netdev_hw_addr_list *mc); int (*ndo_set_mac_address)(struct net_device *dev, void *addr); int (*ndo_validate_addr)(struct net_device *dev); @@ -1903,6 +1917,8 @@ enum netdev_reg_state { * has been enabled due to the need to listen to * additional unicast addresses in a device that * does not implement ndo_set_rx_mode() + * @rx_mode_node: List entry for rx_mode work processing + * @rx_mode_tracker: Refcount tracker for rx_mode work * @uc: unicast mac addresses * @mc: multicast mac addresses * @dev_addrs: list of device hw addresses @@ -2294,6 +2310,8 @@ struct net_device { unsigned int promiscuity; unsigned int allmulti; bool uc_promisc; + struct list_head rx_mode_node; + netdevice_tracker rx_mode_tracker; #ifdef CONFIG_LOCKDEP unsigned char nested_level; #endif diff --git a/net/core/dev.c b/net/core/dev.c index 4519f0e59beb..fe33feacc4f3 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -9579,7 +9579,7 @@ static void dev_change_rx_flags(struct net_device *dev, int flags) ops->ndo_change_rx_flags(dev, flags); } -static int __dev_set_promiscuity(struct net_device *dev, int inc, bool notify) +int __dev_set_promiscuity(struct net_device *dev, int inc, bool notify) { unsigned int old_flags = dev->flags; unsigned int promiscuity, flags; @@ -9683,46 +9683,6 @@ int netif_set_allmulti(struct net_device *dev, int inc, bool notify) return 0; } -/* - * Upload unicast and multicast address lists to device and - * configure RX filtering. 
When the device doesn't support unicast - * filtering it is put in promiscuous mode while unicast addresses - * are present. - */ -void __dev_set_rx_mode(struct net_device *dev) -{ - const struct net_device_ops *ops = dev->netdev_ops; - - /* dev_open will call this function so the list will stay sane. */ - if (!(dev->flags&IFF_UP)) - return; - - if (!netif_device_present(dev)) - return; - - if (!(dev->priv_flags & IFF_UNICAST_FLT)) { - /* Unicast addresses changes may only happen under the rtnl, - * therefore calling __dev_set_promiscuity here is safe. - */ - if (!netdev_uc_empty(dev) && !dev->uc_promisc) { - __dev_set_promiscuity(dev, 1, false); - dev->uc_promisc = true; - } else if (netdev_uc_empty(dev) && dev->uc_promisc) { - __dev_set_promiscuity(dev, -1, false); - dev->uc_promisc = false; - } - } - - if (ops->ndo_set_rx_mode) - ops->ndo_set_rx_mode(dev); -} - -void dev_set_rx_mode(struct net_device *dev) -{ - netif_addr_lock_bh(dev); - __dev_set_rx_mode(dev); - netif_addr_unlock_bh(dev); -} /** * netif_get_flags() - get flags reported to userspace @@ -12113,6 +12073,7 @@ struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name, #endif mutex_init(&dev->lock); + INIT_LIST_HEAD(&dev->rx_mode_node); dev->priv_flags = IFF_XMIT_DST_RELEASE | IFF_XMIT_DST_RELEASE_PERM; setup(dev); diff --git a/net/core/dev.h b/net/core/dev.h index acc925b7b337..50edb380ca94 100644 --- a/net/core/dev.h +++ b/net/core/dev.h @@ -146,6 +146,9 @@ int netif_change_carrier(struct net_device *dev, bool new_carrier); int dev_change_carrier(struct net_device *dev, bool new_carrier); void __dev_set_rx_mode(struct net_device *dev); +int __dev_set_promiscuity(struct net_device *dev, int inc, bool notify); +bool netif_rx_mode_clean(struct net_device *dev); +void netif_rx_mode_sync(struct net_device *dev); void __dev_notify_flags(struct net_device *dev, unsigned int old_flags, unsigned int gchanges, u32 portid, diff --git a/net/core/dev_addr_lists.c b/net/core/dev_addr_lists.c index 
bb4851bc55ce..477392127e8a 100644 --- a/net/core/dev_addr_lists.c +++ b/net/core/dev_addr_lists.c @@ -11,10 +11,18 @@ #include #include #include +#include +#include #include #include "dev.h" +static void netdev_rx_mode_work(struct work_struct *work); + +static LIST_HEAD(rx_mode_list); +static DEFINE_SPINLOCK(rx_mode_lock); +static DECLARE_WORK(rx_mode_work, netdev_rx_mode_work); + /* * General list handling functions */ @@ -1156,3 +1164,189 @@ void dev_mc_init(struct net_device *dev) __hw_addr_init(&dev->mc); } EXPORT_SYMBOL(dev_mc_init); + +static int netif_addr_lists_snapshot(struct net_device *dev, + struct netdev_hw_addr_list *uc_snap, + struct netdev_hw_addr_list *mc_snap, + struct netdev_hw_addr_list *uc_ref, + struct netdev_hw_addr_list *mc_ref) +{ + int err; + + err = __hw_addr_list_snapshot(uc_snap, &dev->uc, dev->addr_len); + if (!err) + err = __hw_addr_list_snapshot(uc_ref, &dev->uc, dev->addr_len); + if (!err) + err = __hw_addr_list_snapshot(mc_snap, &dev->mc, + dev->addr_len); + if (!err) + err = __hw_addr_list_snapshot(mc_ref, &dev->mc, dev->addr_len); + + if (err) { + __hw_addr_flush(uc_snap); + __hw_addr_flush(uc_ref); + __hw_addr_flush(mc_snap); + } + + return err; +} + +static void netif_addr_lists_reconcile(struct net_device *dev, + struct netdev_hw_addr_list *uc_snap, + struct netdev_hw_addr_list *mc_snap, + struct netdev_hw_addr_list *uc_ref, + struct netdev_hw_addr_list *mc_ref) +{ + __hw_addr_list_reconcile(&dev->uc, uc_snap, uc_ref, dev->addr_len); + __hw_addr_list_reconcile(&dev->mc, mc_snap, mc_ref, dev->addr_len); +} + +static void netif_rx_mode_run(struct net_device *dev) +{ + struct netdev_hw_addr_list uc_snap, mc_snap, uc_ref, mc_ref; + const struct net_device_ops *ops = dev->netdev_ops; + int err; + + might_sleep(); + netdev_ops_assert_locked(dev); + + __hw_addr_init(&uc_snap); + __hw_addr_init(&mc_snap); + __hw_addr_init(&uc_ref); + __hw_addr_init(&mc_ref); + + if (!(dev->flags & IFF_UP) || !netif_device_present(dev)) + return; + + 
netif_addr_lock_bh(dev); + err = netif_addr_lists_snapshot(dev, &uc_snap, &mc_snap, + &uc_ref, &mc_ref); + if (err) { + netdev_WARN(dev, "failed to sync uc/mc addresses\n"); + netif_addr_unlock_bh(dev); + return; + } + netif_addr_unlock_bh(dev); + + ops->ndo_set_rx_mode_async(dev, &uc_snap, &mc_snap); + + netif_addr_lock_bh(dev); + netif_addr_lists_reconcile(dev, &uc_snap, &mc_snap, + &uc_ref, &mc_ref); + netif_addr_unlock_bh(dev); +} + +static void netdev_rx_mode_work(struct work_struct *work) +{ + struct net_device *dev; + + rtnl_lock(); + + while (true) { + spin_lock_bh(&rx_mode_lock); + if (list_empty(&rx_mode_list)) { + spin_unlock_bh(&rx_mode_lock); + break; + } + dev = list_first_entry(&rx_mode_list, struct net_device, + rx_mode_node); + list_del_init(&dev->rx_mode_node); + spin_unlock_bh(&rx_mode_lock); + + netdev_lock_ops(dev); + netif_rx_mode_run(dev); + netdev_unlock_ops(dev); + netdev_put(dev, &dev->rx_mode_tracker); + } + + rtnl_unlock(); +} + +static void netif_rx_mode_queue(struct net_device *dev) +{ + spin_lock_bh(&rx_mode_lock); + if (list_empty(&dev->rx_mode_node)) { + list_add_tail(&dev->rx_mode_node, &rx_mode_list); + netdev_hold(dev, &dev->rx_mode_tracker, GFP_ATOMIC); + } + spin_unlock_bh(&rx_mode_lock); + schedule_work(&rx_mode_work); +} + +/** + * __dev_set_rx_mode() - upload unicast and multicast address lists to device + * and configure RX filtering. + * @dev: device + * + * When the device doesn't support unicast filtering it is put in promiscuous + * mode while unicast addresses are present. + */ +void __dev_set_rx_mode(struct net_device *dev) +{ + const struct net_device_ops *ops = dev->netdev_ops; + + /* dev_open will call this function so the list will stay sane. 
*/ + if (!(dev->flags & IFF_UP)) + return; + + if (!netif_device_present(dev)) + return; + + if (ops->ndo_set_rx_mode_async) { + netif_rx_mode_queue(dev); + return; + } + + if (!(dev->priv_flags & IFF_UNICAST_FLT)) { + if (!netdev_uc_empty(dev) && !dev->uc_promisc) { + __dev_set_promiscuity(dev, 1, false); + dev->uc_promisc = true; + } else if (netdev_uc_empty(dev) && dev->uc_promisc) { + __dev_set_promiscuity(dev, -1, false); + dev->uc_promisc = false; + } + } + + if (ops->ndo_set_rx_mode) + ops->ndo_set_rx_mode(dev); +} + +void dev_set_rx_mode(struct net_device *dev) +{ + netif_addr_lock_bh(dev); + __dev_set_rx_mode(dev); + netif_addr_unlock_bh(dev); +} + +bool netif_rx_mode_clean(struct net_device *dev) +{ + bool clean = false; + + spin_lock_bh(&rx_mode_lock); + if (!list_empty(&dev->rx_mode_node)) { + list_del_init(&dev->rx_mode_node); + clean = true; + } + spin_unlock_bh(&rx_mode_lock); + + return clean; +} + +/** + * netif_rx_mode_sync() - sync rx mode inline + * @dev: network device + * + * Drivers implementing ndo_set_rx_mode_async() have their rx mode callback + * executed from a workqueue. This allows the callback to sleep, but means + * the hardware update is deferred and may not be visible to userspace + * by the time the initiating syscall returns. netif_rx_mode_sync() steals + * workqueue update and executes it inline. This preserves the atomicity of + * operations to the userspace. 
+ */ +void netif_rx_mode_sync(struct net_device *dev) +{ + if (netif_rx_mode_clean(dev)) { + netif_rx_mode_run(dev); + netdev_put(dev, &dev->rx_mode_tracker); + } +} diff --git a/net/core/dev_api.c b/net/core/dev_api.c index f28852078aa6..437947dd08ed 100644 --- a/net/core/dev_api.c +++ b/net/core/dev_api.c @@ -66,6 +66,7 @@ int dev_change_flags(struct net_device *dev, unsigned int flags, netdev_lock_ops(dev); ret = netif_change_flags(dev, flags, extack); + netif_rx_mode_sync(dev); netdev_unlock_ops(dev); return ret; @@ -285,6 +286,7 @@ int dev_set_promiscuity(struct net_device *dev, int inc) netdev_lock_ops(dev); ret = netif_set_promiscuity(dev, inc); + netif_rx_mode_sync(dev); netdev_unlock_ops(dev); return ret; @@ -311,6 +313,7 @@ int dev_set_allmulti(struct net_device *dev, int inc) netdev_lock_ops(dev); ret = netif_set_allmulti(dev, inc, true); + netif_rx_mode_sync(dev); netdev_unlock_ops(dev); return ret; diff --git a/net/core/dev_ioctl.c b/net/core/dev_ioctl.c index 7a8966544c9d..f3979b276090 100644 --- a/net/core/dev_ioctl.c +++ b/net/core/dev_ioctl.c @@ -586,24 +586,26 @@ static int dev_ifsioc(struct net *net, struct ifreq *ifr, void __user *data, return err; case SIOCADDMULTI: - if (!ops->ndo_set_rx_mode || + if ((!ops->ndo_set_rx_mode && !ops->ndo_set_rx_mode_async) || ifr->ifr_hwaddr.sa_family != AF_UNSPEC) return -EINVAL; if (!netif_device_present(dev)) return -ENODEV; netdev_lock_ops(dev); err = dev_mc_add_global(dev, ifr->ifr_hwaddr.sa_data); + netif_rx_mode_sync(dev); netdev_unlock_ops(dev); return err; case SIOCDELMULTI: - if (!ops->ndo_set_rx_mode || + if ((!ops->ndo_set_rx_mode && !ops->ndo_set_rx_mode_async) || ifr->ifr_hwaddr.sa_family != AF_UNSPEC) return -EINVAL; if (!netif_device_present(dev)) return -ENODEV; netdev_lock_ops(dev); err = dev_mc_del_global(dev, ifr->ifr_hwaddr.sa_data); + netif_rx_mode_sync(dev); netdev_unlock_ops(dev); return err; diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index fae8034efbff..f4e5ac70709d 100644 
--- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -3431,6 +3431,7 @@ static int do_setlink(const struct sk_buff *skb, struct net_device *dev, dev->name); } + netif_rx_mode_sync(dev); netdev_unlock_ops(dev); return err; -- 2.52.0