From: Jiayuan Chen

__dev_set_rx_mode() is called with addr_list_lock (a spinlock) held from
many places in dev_addr_lists.c. When a device lacks IFF_UNICAST_FLT,
__dev_set_rx_mode() calls __dev_set_promiscuity(), which propagates
through dev_change_rx_flags -> ndo_change_rx_flags ->
dev_set_promiscuity on lower devices. Since commit 78cd408356fe ("net:
add missing instance lock to dev_set_promiscuity"),
dev_set_promiscuity() acquires the netdev instance lock (a mutex) via
netdev_lock_ops(). This leads to a "sleeping function called from
invalid context" / "Invalid wait context" bug when the lower device has
request_ops_lock or queue_mgmt_ops set.

The call chain is:

  dev_uc_add(bridge0)                   # e.g. from macsec_dev_open
    netif_addr_lock_bh(bridge0)         # <- spinlock, BH disabled
    __dev_set_rx_mode(bridge0)          # bridge has no IFF_UNICAST_FLT
      __dev_set_promiscuity(bridge0)
        ndo_change_rx_flags(bridge0)
          br_manage_promisc -> dev_set_promiscuity(team0)
            ndo_change_rx_flags(team0)
              team_change_rx_flags -> dev_set_promiscuity(dummy0)
                netdev_lock_ops(dummy0) # <- mutex! dummy has
                                        #    request_ops_lock=true

This is not limited to bridge/team/dummy. Any combination of stacked
devices (bridge, bond, macvlan, vlan, macsec, team, dsa, netvsc) over
devices with an instance lock (dummy, mlx5, bnxt, gve) can trigger this.

Fix this by deferring __dev_set_promiscuity() until after the spinlock
is released:

1. Change __dev_set_rx_mode() to return a promiscuity increment
   (+1, 0, -1) instead of calling __dev_set_promiscuity() directly.
   The uc_promisc flag is still updated under the lock for correctness.

2. Change dev_set_rx_mode() to call __dev_set_promiscuity() after
   releasing addr_list_lock, based on the returned increment.

3. Change all callers in dev_addr_lists.c to release their spinlock
   first, then call dev_set_rx_mode(), which handles both the rx mode
   update and the deferred promiscuity change safely.

Reproducer:

  ip link add dummy0 type dummy
  ip link add team0 type team
  ip link set dummy0 master team0
  ip link set team0 up
  ip link add bridge0 type bridge vlan_filtering 1
  ip link set bridge0 up
  ip link set team0 master bridge0
  ip link add macsec0 link bridge0 type macsec
  ip link set macsec0 up    # triggers the bug

Fixes: 78cd408356fe ("net: add missing instance lock to dev_set_promiscuity")
Reported-by: syzbot+2b3391f44313b3983e91@syzkaller.appspotmail.com
Signed-off-by: Jiayuan Chen
---
 net/core/dev.c            | 26 +++++++++++++++++++-------
 net/core/dev.h            |  2 +-
 net/core/dev_addr_lists.c | 44 ++++++++++++++++++++++----------------------
 3 files changed, 42 insertions(+), 30 deletions(-)
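Aside for reviewers, not part of the patch: below is a minimal,
self-contained userspace sketch of the deferral pattern applied here --
compute the promiscuity delta while the inner lock is held, apply it
only after that lock is dropped. The pthread locks merely stand in for
addr_list_lock (spinlock) and the netdev instance lock (mutex), and all
identifiers (fake_dev, fake_uc_add, ...) are hypothetical, not kernel
code.

  /* Build with: gcc -Wall -o sketch sketch.c -lpthread */
  #include <pthread.h>
  #include <stdbool.h>
  #include <stdio.h>

  struct fake_dev {
  	pthread_spinlock_t addr_list_lock; /* stand-in for netif_addr_lock_bh() */
  	pthread_mutex_t instance_lock;     /* stand-in for netdev_lock_ops() */
  	int uc_count;                      /* stand-in for the unicast list */
  	bool uc_promisc;
  	int promiscuity;
  };

  /* Like the reworked __dev_set_rx_mode(): update only spinlock-safe
   * state and report the promiscuity delta for the caller to apply. */
  static int fake_set_rx_mode(struct fake_dev *dev)
  {
  	int promisc_inc = 0;

  	if (dev->uc_count > 0 && !dev->uc_promisc) {
  		dev->uc_promisc = true;
  		promisc_inc = 1;
  	} else if (dev->uc_count == 0 && dev->uc_promisc) {
  		dev->uc_promisc = false;
  		promisc_inc = -1;
  	}
  	return promisc_inc;
  }

  /* Like __dev_set_promiscuity(): takes a mutex, so it may sleep and
   * must never run inside the spin-locked section. */
  static void fake_set_promiscuity(struct fake_dev *dev, int inc)
  {
  	pthread_mutex_lock(&dev->instance_lock);
  	dev->promiscuity += inc;
  	pthread_mutex_unlock(&dev->instance_lock);
  }

  /* Like the reworked dev_uc_add() -> dev_set_rx_mode() flow. */
  static void fake_uc_add(struct fake_dev *dev)
  {
  	int promisc_inc;

  	pthread_spin_lock(&dev->addr_list_lock);
  	dev->uc_count++;
  	promisc_inc = fake_set_rx_mode(dev);
  	pthread_spin_unlock(&dev->addr_list_lock);

  	if (promisc_inc)	/* deferred until after the unlock */
  		fake_set_promiscuity(dev, promisc_inc);
  }

  int main(void)
  {
  	struct fake_dev dev = { .uc_count = 0 };

  	pthread_spin_init(&dev.addr_list_lock, PTHREAD_PROCESS_PRIVATE);
  	pthread_mutex_init(&dev.instance_lock, NULL);
  	fake_uc_add(&dev);
  	printf("promiscuity=%d uc_promisc=%d\n", dev.promiscuity,
  	       dev.uc_promisc);
  	return 0;
  }

Calling fake_set_promiscuity() from inside the spin-locked section
instead would recreate the invalid wait context this patch removes.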
diff --git a/net/core/dev.c b/net/core/dev.c
index ac6bcb2a0784..7cd25e850972 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -9639,39 +9639,51 @@ int netif_set_allmulti(struct net_device *dev, int inc, bool notify)
  *	filtering it is put in promiscuous mode while unicast addresses
  *	are present.
  */
-void __dev_set_rx_mode(struct net_device *dev)
+int __dev_set_rx_mode(struct net_device *dev)
 {
 	const struct net_device_ops *ops = dev->netdev_ops;
+	int promisc_inc = 0;
 
 	/* dev_open will call this function so the list will stay sane. */
 	if (!(dev->flags&IFF_UP))
-		return;
+		return 0;
 
 	if (!netif_device_present(dev))
-		return;
+		return 0;
 
 	if (!(dev->priv_flags & IFF_UNICAST_FLT)) {
 		/* Unicast addresses changes may only happen under the rtnl,
-		 * therefore calling __dev_set_promiscuity here is safe.
+		 * therefore changing uc_promisc here is safe. The actual
+		 * __dev_set_promiscuity() call is deferred to the caller
+		 * (after releasing addr_list_lock) because it may trigger
+		 * ndo_change_rx_flags -> dev_set_promiscuity on lower
+		 * devices which can acquire netdev instance lock (mutex).
 		 */
 		if (!netdev_uc_empty(dev) && !dev->uc_promisc) {
-			__dev_set_promiscuity(dev, 1, false);
 			dev->uc_promisc = true;
+			promisc_inc = 1;
 		} else if (netdev_uc_empty(dev) && dev->uc_promisc) {
-			__dev_set_promiscuity(dev, -1, false);
 			dev->uc_promisc = false;
+			promisc_inc = -1;
 		}
 	}
 
 	if (ops->ndo_set_rx_mode)
 		ops->ndo_set_rx_mode(dev);
+
+	return promisc_inc;
 }
 
 void dev_set_rx_mode(struct net_device *dev)
 {
+	int promisc_inc;
+
 	netif_addr_lock_bh(dev);
-	__dev_set_rx_mode(dev);
+	promisc_inc = __dev_set_rx_mode(dev);
 	netif_addr_unlock_bh(dev);
+
+	if (promisc_inc)
+		__dev_set_promiscuity(dev, promisc_inc, false);
 }
 
 /**
diff --git a/net/core/dev.h b/net/core/dev.h
index 98793a738f43..1759384dea8d 100644
--- a/net/core/dev.h
+++ b/net/core/dev.h
@@ -144,7 +144,7 @@ void dev_set_group(struct net_device *dev, int new_group);
 int netif_change_carrier(struct net_device *dev, bool new_carrier);
 int dev_change_carrier(struct net_device *dev, bool new_carrier);
 
-void __dev_set_rx_mode(struct net_device *dev);
+int __dev_set_rx_mode(struct net_device *dev);
 
 void __dev_notify_flags(struct net_device *dev, unsigned int old_flags,
 			unsigned int gchanges, u32 portid,
diff --git a/net/core/dev_addr_lists.c b/net/core/dev_addr_lists.c
index 76c91f224886..04222e7544c1 100644
--- a/net/core/dev_addr_lists.c
+++ b/net/core/dev_addr_lists.c
@@ -667,9 +667,9 @@ int dev_uc_add_excl(struct net_device *dev, const unsigned char *addr)
 	err = __hw_addr_add_ex(&dev->uc, addr, dev->addr_len,
 			       NETDEV_HW_ADDR_T_UNICAST, true, false,
 			       0, true);
-	if (!err)
-		__dev_set_rx_mode(dev);
 	netif_addr_unlock_bh(dev);
+	if (!err)
+		dev_set_rx_mode(dev);
 	return err;
 }
 EXPORT_SYMBOL(dev_uc_add_excl);
@@ -689,9 +689,9 @@
 	netif_addr_lock_bh(dev);
 	err = __hw_addr_add(&dev->uc, addr, dev->addr_len,
 			    NETDEV_HW_ADDR_T_UNICAST);
-	if (!err)
-		__dev_set_rx_mode(dev);
 	netif_addr_unlock_bh(dev);
+	if (!err)
+		dev_set_rx_mode(dev);
 	return err;
 }
 EXPORT_SYMBOL(dev_uc_add);
@@ -711,9 +711,9 @@
 	netif_addr_lock_bh(dev);
 	err = __hw_addr_del(&dev->uc, addr, dev->addr_len,
 			    NETDEV_HW_ADDR_T_UNICAST);
-	if (!err)
-		__dev_set_rx_mode(dev);
 	netif_addr_unlock_bh(dev);
+	if (!err)
+		dev_set_rx_mode(dev);
 	return err;
 }
 EXPORT_SYMBOL(dev_uc_del);
@@ -740,9 +740,9 @@ int dev_uc_sync(struct net_device *to, struct net_device *from)
 
 	netif_addr_lock(to);
 	err = __hw_addr_sync(&to->uc, &from->uc, to->addr_len);
-	if (!err)
-		__dev_set_rx_mode(to);
 	netif_addr_unlock(to);
+	if (!err)
+		dev_set_rx_mode(to);
 	return err;
 }
 EXPORT_SYMBOL(dev_uc_sync);
@@ -770,9 +770,9 @@ int dev_uc_sync_multiple(struct net_device *to, struct net_device *from)
 
 	netif_addr_lock(to);
 	err = __hw_addr_sync_multiple(&to->uc, &from->uc, to->addr_len);
-	if (!err)
-		__dev_set_rx_mode(to);
 	netif_addr_unlock(to);
+	if (!err)
+		dev_set_rx_mode(to);
 	return err;
 }
 EXPORT_SYMBOL(dev_uc_sync_multiple);
@@ -803,9 +803,9 @@ void dev_uc_unsync(struct net_device *to, struct net_device *from)
 	netif_addr_lock_bh(from);
 	netif_addr_lock(to);
 	__hw_addr_unsync(&to->uc, &from->uc, to->addr_len);
-	__dev_set_rx_mode(to);
 	netif_addr_unlock(to);
 	netif_addr_unlock_bh(from);
+	dev_set_rx_mode(to);
 }
 EXPORT_SYMBOL(dev_uc_unsync);
 
@@ -852,9 +852,9 @@ int dev_mc_add_excl(struct net_device *dev, const unsigned char *addr)
 	err = __hw_addr_add_ex(&dev->mc, addr, dev->addr_len,
 			       NETDEV_HW_ADDR_T_MULTICAST, true, false,
 			       0, true);
-	if (!err)
-		__dev_set_rx_mode(dev);
 	netif_addr_unlock_bh(dev);
+	if (!err)
+		dev_set_rx_mode(dev);
 	return err;
 }
 EXPORT_SYMBOL(dev_mc_add_excl);
@@ -868,9 +868,9 @@ static int __dev_mc_add(struct net_device *dev, const unsigned char *addr,
 	err = __hw_addr_add_ex(&dev->mc, addr, dev->addr_len,
 			       NETDEV_HW_ADDR_T_MULTICAST, global, false,
 			       0, false);
-	if (!err)
-		__dev_set_rx_mode(dev);
 	netif_addr_unlock_bh(dev);
+	if (!err)
+		dev_set_rx_mode(dev);
 	return err;
 }
 /**
@@ -908,9 +908,9 @@ static int __dev_mc_del(struct net_device *dev, const unsigned char *addr,
 	netif_addr_lock_bh(dev);
 	err = __hw_addr_del_ex(&dev->mc, addr, dev->addr_len,
 			       NETDEV_HW_ADDR_T_MULTICAST, global, false);
-	if (!err)
-		__dev_set_rx_mode(dev);
 	netif_addr_unlock_bh(dev);
+	if (!err)
+		dev_set_rx_mode(dev);
 	return err;
 }
 
@@ -963,9 +963,9 @@ int dev_mc_sync(struct net_device *to, struct net_device *from)
 
 	netif_addr_lock(to);
 	err = __hw_addr_sync(&to->mc, &from->mc, to->addr_len);
-	if (!err)
-		__dev_set_rx_mode(to);
 	netif_addr_unlock(to);
+	if (!err)
+		dev_set_rx_mode(to);
 	return err;
 }
 EXPORT_SYMBOL(dev_mc_sync);
@@ -993,9 +993,9 @@ int dev_mc_sync_multiple(struct net_device *to, struct net_device *from)
 
 	netif_addr_lock(to);
 	err = __hw_addr_sync_multiple(&to->mc, &from->mc, to->addr_len);
-	if (!err)
-		__dev_set_rx_mode(to);
 	netif_addr_unlock(to);
+	if (!err)
+		dev_set_rx_mode(to);
 	return err;
 }
 EXPORT_SYMBOL(dev_mc_sync_multiple);
@@ -1018,9 +1018,9 @@ void dev_mc_unsync(struct net_device *to, struct net_device *from)
 	netif_addr_lock_bh(from);
 	netif_addr_lock(to);
 	__hw_addr_unsync(&to->mc, &from->mc, to->addr_len);
-	__dev_set_rx_mode(to);
 	netif_addr_unlock(to);
 	netif_addr_unlock_bh(from);
+	dev_set_rx_mode(to);
 }
 EXPORT_SYMBOL(dev_mc_unsync);
 
-- 
2.43.0