This commit extends most test cases in ipmr.c for IP6MR. Note that IP6MR does not provide an rtnetlink interface for MFC, so such tests will be skipped. Signed-off-by: Kuniyuki Iwashima --- tools/testing/selftests/net/forwarding/ipmr.c | 163 ++++++++++++------ 1 file changed, 110 insertions(+), 53 deletions(-) diff --git a/tools/testing/selftests/net/forwarding/ipmr.c b/tools/testing/selftests/net/forwarding/ipmr.c index df870aad9ead..cfd00173bcd6 100644 --- a/tools/testing/selftests/net/forwarding/ipmr.c +++ b/tools/testing/selftests/net/forwarding/ipmr.c @@ -2,7 +2,9 @@ /* Copyright 2026 Google LLC */ #include +#include #include +#include #include #include #include @@ -17,6 +19,14 @@ FIXTURE(ipmr) int netlink_sk; int raw_sk; int veth_ifindex; + union { + struct vifctl vif; + struct mif6ctl vif6; + }; + union { + struct mfcctl mfc; + struct mf6cctl mfc6; + }; }; FIXTURE_VARIANT(ipmr) @@ -25,6 +35,11 @@ FIXTURE_VARIANT(ipmr) int protocol; int level; int opts[MRT_MAX - MRT_BASE + 1]; + int vif_size; + char vif_check_cmd_pimreg[64]; + char vif_check_cmd_veth[64]; + int mfc_size; + char mfc_check_cmd[1024]; }; FIXTURE_VARIANT_ADD(ipmr, ipv4) @@ -47,6 +62,39 @@ FIXTURE_VARIANT_ADD(ipmr, ipv4) MRT_DEL_MFC_PROXY, MRT_FLUSH, }, + .vif_size = sizeof(struct vifctl), + .vif_check_cmd_pimreg = "cat /proc/net/ip_mr_vif | grep -q pimreg", + .vif_check_cmd_veth = "cat /proc/net/ip_mr_vif | grep -q veth", + .mfc_size = sizeof(struct mfcctl), + .mfc_check_cmd = "cat /proc/net/ip_mr_cache | grep -q '00000000 00000000'", +}; + +FIXTURE_VARIANT_ADD(ipmr, ipv6) +{ + .family = AF_INET6, + .protocol = IPPROTO_ICMPV6, + .level = IPPROTO_IPV6, + .opts = { + MRT6_INIT, + MRT6_DONE, + MRT6_ADD_MIF, + MRT6_DEL_MIF, + MRT6_ADD_MFC, + MRT6_DEL_MFC, + MRT6_VERSION, + MRT6_ASSERT, + MRT6_PIM, + MRT6_TABLE, + MRT6_ADD_MFC_PROXY, + MRT6_DEL_MFC_PROXY, + MRT_FLUSH, + }, + .vif_size = sizeof(struct mif6ctl), + .vif_check_cmd_pimreg = "cat /proc/net/ip6_mr_vif | grep -q pim6reg", + .vif_check_cmd_veth 
= "cat /proc/net/ip6_mr_vif | grep -q veth", + .mfc_size = sizeof(struct mf6cctl), + .mfc_check_cmd = "cat /proc/net/ip6_mr_cache | " + "grep -q '0000:0000:0000:0000:0000:0000:0000:0000 0000:0000:0000:0000:0000:0000:0000:0000'", }; struct mfc_attr { @@ -144,6 +192,18 @@ FIXTURE_SETUP(ipmr) ASSERT_EQ(0, err); self->veth_ifindex = ifr.ifr_ifindex; + + if (variant->family == AF_INET) { + self->vif = (struct vifctl){ + .vifc_flags = VIFF_USE_IFINDEX, + .vifc_lcl_ifindex = self->veth_ifindex, + }; + } else { + self->vif6 = (struct mif6ctl){ + .mif6c_flags = 0, + .mif6c_pifi = self->veth_ifindex, + }; + } } FIXTURE_TEARDOWN(ipmr) @@ -169,41 +229,39 @@ TEST_F(ipmr, mrt_init) TEST_F(ipmr, mrt_add_vif_register) { - struct vifctl vif = { - .vifc_vifi = 0, - .vifc_flags = VIFF_REGISTER, - }; int err; + memset(&self->vif, 0, variant->vif_size); + + if (variant->family == AF_INET) + self->vif.vifc_flags = VIFF_REGISTER; + else + self->vif6.mif6c_flags = MIFF_REGISTER; + err = setsockopt(self->raw_sk, variant->level, variant->opts[MRT_ADD_VIF - MRT_BASE], - &vif, sizeof(vif)); + &self->vif, variant->vif_size); ASSERT_EQ(0, err); - err = system("cat /proc/net/ip_mr_vif | grep -q pimreg"); + err = system(variant->vif_check_cmd_pimreg); ASSERT_EQ(0, err); err = setsockopt(self->raw_sk, variant->level, variant->opts[MRT_DEL_VIF - MRT_BASE], - &vif, sizeof(vif)); + &self->vif, variant->vif_size); ASSERT_EQ(0, err); } TEST_F(ipmr, mrt_del_vif_unreg) { - struct vifctl vif = { - .vifc_vifi = 0, - .vifc_flags = VIFF_USE_IFINDEX, - .vifc_lcl_ifindex = self->veth_ifindex, - }; int err; err = setsockopt(self->raw_sk, variant->level, variant->opts[MRT_ADD_VIF - MRT_BASE], - &vif, sizeof(vif)); + &self->vif, variant->vif_size); ASSERT_EQ(0, err); - err = system("cat /proc/net/ip_mr_vif | grep -q veth0"); + err = system(variant->vif_check_cmd_veth); ASSERT_EQ(0, err); /* VIF is removed along with its device. 
*/ @@ -213,23 +271,18 @@ TEST_F(ipmr, mrt_del_vif_unreg) /* mrt->vif_table[veth_ifindex]->dev is NULL. */ err = setsockopt(self->raw_sk, variant->level, variant->opts[MRT_DEL_VIF - MRT_BASE], - &vif, sizeof(vif)); + &self->vif, variant->vif_size); ASSERT_EQ(-1, err); ASSERT_EQ(EADDRNOTAVAIL, errno); } TEST_F(ipmr, mrt_del_vif_netns_dismantle) { - struct vifctl vif = { - .vifc_vifi = 0, - .vifc_flags = VIFF_USE_IFINDEX, - .vifc_lcl_ifindex = self->veth_ifindex, - }; int err; err = setsockopt(self->raw_sk, variant->level, variant->opts[MRT_ADD_VIF - MRT_BASE], - &vif, sizeof(vif)); + &self->vif, variant->vif_size); ASSERT_EQ(0, err); /* Let cleanup_net() remove veth0 and VIF. */ @@ -237,49 +290,49 @@ TEST_F(ipmr, mrt_del_vif_netns_dismantle) TEST_F(ipmr, mrt_add_mfc) { - struct mfcctl mfc = {}; int err; /* MRT_ADD_MFC / MRT_ADD_MFC_PROXY does not need vif to exist (unlike netlink). */ err = setsockopt(self->raw_sk, variant->level, variant->opts[MRT_ADD_MFC - MRT_BASE], - &mfc, sizeof(mfc)); + &self->mfc, variant->mfc_size); ASSERT_EQ(0, err); /* (0.0.0.0 -> 0.0.0.0) */ - err = system("cat /proc/net/ip_mr_cache | grep -q '00000000 00000000' "); + err = system(variant->mfc_check_cmd); ASSERT_EQ(0, err); err = setsockopt(self->raw_sk, variant->level, variant->opts[MRT_DEL_MFC - MRT_BASE], - &mfc, sizeof(mfc)); + &self->mfc, variant->mfc_size); } TEST_F(ipmr, mrt_add_mfc_proxy) { - struct mfcctl mfc = {}; int err; err = setsockopt(self->raw_sk, variant->level, variant->opts[MRT_ADD_MFC_PROXY - MRT_BASE], - &mfc, sizeof(mfc)); + &self->mfc, variant->mfc_size); ASSERT_EQ(0, err); - err = system("cat /proc/net/ip_mr_cache | grep -q '00000000 00000000' "); + err = system(variant->mfc_check_cmd); ASSERT_EQ(0, err); err = setsockopt(self->raw_sk, variant->level, variant->opts[MRT_DEL_MFC_PROXY - MRT_BASE], - &mfc, sizeof(mfc)); + &self->mfc, variant->mfc_size); } +#define SKIP_IPV6() \ + do { \ + if (variant->family == AF_INET6) \ + SKIP(return, \ + "no netlink MFC 
interface"); \ + } while (0) + TEST_F(ipmr, mrt_add_mfc_netlink) { - struct vifctl vif = { - .vifc_vifi = 0, - .vifc_flags = VIFF_USE_IFINDEX, - .vifc_lcl_ifindex = self->veth_ifindex, - }; struct mfc_attr mfc_attr = { .table = RT_TABLE_DEFAULT, .origin = 0, @@ -289,15 +342,17 @@ TEST_F(ipmr, mrt_add_mfc_netlink) }; int err; + SKIP_IPV6(); + err = setsockopt(self->raw_sk, variant->level, variant->opts[MRT_ADD_VIF - MRT_BASE], - &vif, sizeof(vif)); + &self->vif, variant->vif_size); ASSERT_EQ(0, err); err = nl_sendmsg_mfc(_metadata, self, RTM_NEWROUTE, &mfc_attr); ASSERT_EQ(0, err); - err = system("cat /proc/net/ip_mr_cache | grep -q '00000000 00000000' "); + err = system(variant->mfc_check_cmd); ASSERT_EQ(0, err); err = nl_sendmsg_mfc(_metadata, self, RTM_DELROUTE, &mfc_attr); @@ -306,11 +361,6 @@ TEST_F(ipmr, mrt_add_mfc_netlink) TEST_F(ipmr, mrt_add_mfc_netlink_proxy) { - struct vifctl vif = { - .vifc_vifi = 0, - .vifc_flags = VIFF_USE_IFINDEX, - .vifc_lcl_ifindex = self->veth_ifindex, - }; struct mfc_attr mfc_attr = { .table = RT_TABLE_DEFAULT, .origin = 0, @@ -320,15 +370,17 @@ TEST_F(ipmr, mrt_add_mfc_netlink_proxy) }; int err; + SKIP_IPV6(); + err = setsockopt(self->raw_sk, variant->level, variant->opts[MRT_ADD_VIF - MRT_BASE], - &vif, sizeof(vif)); + &self->vif, variant->vif_size); ASSERT_EQ(0, err); err = nl_sendmsg_mfc(_metadata, self, RTM_NEWROUTE, &mfc_attr); ASSERT_EQ(0, err); - err = system("cat /proc/net/ip_mr_cache | grep -q '00000000 00000000' "); + err = system(variant->mfc_check_cmd); ASSERT_EQ(0, err); err = nl_sendmsg_mfc(_metadata, self, RTM_DELROUTE, &mfc_attr); @@ -345,6 +397,8 @@ TEST_F(ipmr, mrt_add_mfc_netlink_no_vif) }; int err; + SKIP_IPV6(); + /* netlink always requires RTA_IIF of an existing vif. 
*/ mfc_attr.ifindex = 0; err = nl_sendmsg_mfc(_metadata, self, RTM_NEWROUTE, &mfc_attr); @@ -378,6 +432,8 @@ TEST_F(ipmr, mrt_del_mfc_netlink_netns_dismantle) }; int i, err; + SKIP_IPV6(); + for (i = 0; i < 2; i++) { /* Create 2 VIFs just to avoid -ENFILE later. */ err = setsockopt(self->raw_sk, @@ -390,7 +446,7 @@ TEST_F(ipmr, mrt_del_mfc_netlink_netns_dismantle) err = nl_sendmsg_mfc(_metadata, self, RTM_NEWROUTE, &mfc_attr); ASSERT_EQ(0, err); - err = system("cat /proc/net/ip_mr_cache | grep -q '00000000 00000000' "); + err = system(variant->mfc_check_cmd); ASSERT_EQ(0, err); /* Remove mrt->vif_table[0]. */ @@ -398,7 +454,7 @@ TEST_F(ipmr, mrt_del_mfc_netlink_netns_dismantle) ASSERT_EQ(0, err); /* MFC entry is NOT removed even if the tied VIF is removed... */ - err = system("cat /proc/net/ip_mr_cache | grep -q '00000000 00000000' "); + err = system(variant->mfc_check_cmd); ASSERT_EQ(0, err); /* ... and netlink is not capable of removing such an entry @@ -412,11 +468,6 @@ TEST_F(ipmr, mrt_del_mfc_netlink_netns_dismantle) TEST_F(ipmr, mrt_table_flush) { - struct vifctl vif = { - .vifc_vifi = 0, - .vifc_flags = VIFF_USE_IFINDEX, - .vifc_lcl_ifindex = self->veth_ifindex, - }; struct mfc_attr mfc_attr = { .origin = 0, .group = 0, @@ -436,11 +487,17 @@ TEST_F(ipmr, mrt_table_flush) err = setsockopt(self->raw_sk, variant->level, variant->opts[MRT_ADD_VIF - MRT_BASE], - &vif, sizeof(vif)); + &self->vif, variant->vif_size); ASSERT_EQ(0, err); - mfc_attr.table = table_id; - err = nl_sendmsg_mfc(_metadata, self, RTM_NEWROUTE, &mfc_attr); + if (variant->family == AF_INET) { + mfc_attr.table = table_id; + err = nl_sendmsg_mfc(_metadata, self, RTM_NEWROUTE, &mfc_attr); + } else { + err = setsockopt(self->raw_sk, + variant->level, variant->opts[MRT_ADD_MFC - MRT_BASE], + &self->mfc, variant->mfc_size); + } ASSERT_EQ(0, err); /* Flush mrt->vif_table[] and all caches. 
*/ -- 2.53.0.1213.gd9a14994de-goog mr_table.cache_resolve_queue_len is always updated under spin_lock_bh(&mfc_unres_lock). Let's convert it to u32. Signed-off-by: Kuniyuki Iwashima --- include/linux/mroute_base.h | 2 +- net/ipv4/ipmr.c | 18 ++++++++++-------- net/ipv6/ip6mr.c | 11 +++++++---- 3 files changed, 18 insertions(+), 13 deletions(-) diff --git a/include/linux/mroute_base.h b/include/linux/mroute_base.h index cf3374580f74..3341acca002f 100644 --- a/include/linux/mroute_base.h +++ b/include/linux/mroute_base.h @@ -254,7 +254,7 @@ struct mr_table { struct rhltable mfc_hash; struct list_head mfc_cache_list; int maxvif; - atomic_t cache_resolve_queue_len; + u32 cache_resolve_queue_len; bool mroute_do_assert; bool mroute_do_pim; bool mroute_do_wrvifwhole; diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c index 8a08d09b4c30..2566b4a1f80b 100644 --- a/net/ipv4/ipmr.c +++ b/net/ipv4/ipmr.c @@ -756,7 +756,8 @@ static void ipmr_destroy_unres(struct mr_table *mrt, struct mfc_cache *c) struct sk_buff *skb; struct nlmsgerr *e; - atomic_dec(&mrt->cache_resolve_queue_len); + WRITE_ONCE(mrt->cache_resolve_queue_len, + mrt->cache_resolve_queue_len - 1); while ((skb = skb_dequeue(&c->_c.mfc_un.unres.unresolved))) { if (ip_hdr(skb)->version == 0) { @@ -1178,11 +1179,12 @@ static int ipmr_cache_unresolved(struct mr_table *mrt, vifi_t vifi, return err; } - atomic_inc(&mrt->cache_resolve_queue_len); + WRITE_ONCE(mrt->cache_resolve_queue_len, + mrt->cache_resolve_queue_len + 1); list_add(&c->_c.list, &mrt->mfc_unres_queue); mroute_netlink_event(mrt, c, RTM_NEWROUTE); - if (atomic_read(&mrt->cache_resolve_queue_len) == 1) + if (mrt->cache_resolve_queue_len == 1) mod_timer(&mrt->ipmr_expire_timer, c->_c.mfc_un.unres.expires); } @@ -1287,7 +1289,8 @@ static int ipmr_mfc_add(struct net *net, struct mr_table *mrt, if (uc->mfc_origin == c->mfc_origin && uc->mfc_mcastgrp == c->mfc_mcastgrp) { list_del(&_uc->list); - atomic_dec(&mrt->cache_resolve_queue_len); + 
WRITE_ONCE(mrt->cache_resolve_queue_len, + mrt->cache_resolve_queue_len - 1); found = true; break; } @@ -1346,7 +1349,7 @@ static void mroute_clean_tables(struct mr_table *mrt, int flags, } if (flags & MRT_FLUSH_MFC) { - if (atomic_read(&mrt->cache_resolve_queue_len) != 0) { + if (READ_ONCE(mrt->cache_resolve_queue_len)) { spin_lock_bh(&mfc_unres_lock); list_for_each_entry_safe(c, tmp, &mrt->mfc_unres_queue, list) { list_del(&c->list); @@ -2954,10 +2957,9 @@ static int ipmr_rtm_route(struct sk_buff *skb, struct nlmsghdr *nlh, static bool ipmr_fill_table(struct mr_table *mrt, struct sk_buff *skb) { - u32 queue_len = atomic_read(&mrt->cache_resolve_queue_len); - if (nla_put_u32(skb, IPMRA_TABLE_ID, mrt->id) || - nla_put_u32(skb, IPMRA_TABLE_CACHE_RES_QUEUE_LEN, queue_len) || + nla_put_u32(skb, IPMRA_TABLE_CACHE_RES_QUEUE_LEN, + READ_ONCE(mrt->cache_resolve_queue_len)) || nla_put_s32(skb, IPMRA_TABLE_MROUTE_REG_VIF_NUM, READ_ONCE(mrt->mroute_reg_vif_num)) || nla_put_u8(skb, IPMRA_TABLE_MROUTE_DO_ASSERT, diff --git a/net/ipv6/ip6mr.c b/net/ipv6/ip6mr.c index 85010ff21c98..5755244e226c 100644 --- a/net/ipv6/ip6mr.c +++ b/net/ipv6/ip6mr.c @@ -792,7 +792,8 @@ static void ip6mr_destroy_unres(struct mr_table *mrt, struct mfc6_cache *c) struct net *net = read_pnet(&mrt->net); struct sk_buff *skb; - atomic_dec(&mrt->cache_resolve_queue_len); + WRITE_ONCE(mrt->cache_resolve_queue_len, + mrt->cache_resolve_queue_len - 1); while ((skb = skb_dequeue(&c->_c.mfc_un.unres.unresolved)) != NULL) { if (ipv6_hdr(skb)->version == 0) { @@ -1205,7 +1206,8 @@ static int ip6mr_cache_unresolved(struct mr_table *mrt, mifi_t mifi, return err; } - atomic_inc(&mrt->cache_resolve_queue_len); + WRITE_ONCE(mrt->cache_resolve_queue_len, + mrt->cache_resolve_queue_len + 1); list_add(&c->_c.list, &mrt->mfc_unres_queue); mr6_netlink_event(mrt, c, RTM_NEWROUTE); @@ -1510,7 +1512,8 @@ static int ip6mr_mfc_add(struct net *net, struct mr_table *mrt, if (ipv6_addr_equal(&uc->mf6c_origin, &c->mf6c_origin) && 
ipv6_addr_equal(&uc->mf6c_mcastgrp, &c->mf6c_mcastgrp)) { list_del(&_uc->list); - atomic_dec(&mrt->cache_resolve_queue_len); + WRITE_ONCE(mrt->cache_resolve_queue_len, + mrt->cache_resolve_queue_len - 1); found = true; break; } @@ -1568,7 +1571,7 @@ static void mroute_clean_tables(struct mr_table *mrt, int flags) } if (flags & MRT6_FLUSH_MFC) { - if (atomic_read(&mrt->cache_resolve_queue_len) != 0) { + if (READ_ONCE(mrt->cache_resolve_queue_len)) { spin_lock_bh(&mfc_unres_lock); list_for_each_entry_safe(c, tmp, &mrt->mfc_unres_queue, list) { list_del(&c->list); -- 2.53.0.1213.gd9a14994de-goog These fields in struct mr_table are updated in ip6_mroute_setsockopt() under RTNL: * mroute_do_pim * mroute_do_assert * mroute_do_wrvifwhole However, ip6_mroute_getsockopt() does not hold RTNL and reads the first two fields locklessly, and ip6_mr_forward() reads all three under RCU. Let's use WRITE_ONCE() and READ_ONCE() for them. Signed-off-by: Kuniyuki Iwashima --- net/ipv6/ip6mr.c | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/net/ipv6/ip6mr.c b/net/ipv6/ip6mr.c index 5755244e226c..5d368ee39b28 100644 --- a/net/ipv6/ip6mr.c +++ b/net/ipv6/ip6mr.c @@ -1783,7 +1783,7 @@ int ip6_mroute_setsockopt(struct sock *sk, int optname, sockptr_t optval, return -EINVAL; if (copy_from_sockptr(&v, optval, sizeof(v))) return -EFAULT; - mrt->mroute_do_assert = v; + WRITE_ONCE(mrt->mroute_do_assert, v); return 0; } @@ -1803,9 +1803,9 @@ int ip6_mroute_setsockopt(struct sock *sk, int optname, sockptr_t optval, rtnl_lock(); ret = 0; if (v != mrt->mroute_do_pim) { - mrt->mroute_do_pim = v; - mrt->mroute_do_assert = v; - mrt->mroute_do_wrvifwhole = do_wrmifwhole; + WRITE_ONCE(mrt->mroute_do_pim, v); + WRITE_ONCE(mrt->mroute_do_assert, v); + WRITE_ONCE(mrt->mroute_do_wrvifwhole, do_wrmifwhole); } rtnl_unlock(); return ret; @@ -1873,11 +1873,11 @@ int ip6_mroute_getsockopt(struct sock *sk, int optname, sockptr_t optval, break; #ifdef CONFIG_IPV6_PIMSM_V2 
case MRT6_PIM: - val = mrt->mroute_do_pim; + val = READ_ONCE(mrt->mroute_do_pim); break; #endif case MRT6_ASSERT: - val = mrt->mroute_do_assert; + val = READ_ONCE(mrt->mroute_do_assert); break; default: return -ENOPROTOOPT; @@ -2180,20 +2180,20 @@ static void ip6_mr_forward(struct net *net, struct mr_table *mrt, if (rcu_access_pointer(mrt->vif_table[vif].dev) != dev) { atomic_long_inc(&c->_c.mfc_un.res.wrong_if); - if (true_vifi >= 0 && mrt->mroute_do_assert && + if (true_vifi >= 0 && READ_ONCE(mrt->mroute_do_assert) && /* pimsm uses asserts, when switching from RPT to SPT, so that we cannot check that packet arrived on an oif. It is bad, but otherwise we would need to move pretty large chunk of pimd to kernel. Ough... --ANK */ - (mrt->mroute_do_pim || + (READ_ONCE(mrt->mroute_do_pim) || c->_c.mfc_un.res.ttls[true_vifi] < 255) && time_after(jiffies, c->_c.mfc_un.res.last_assert + MFC_ASSERT_THRESH)) { c->_c.mfc_un.res.last_assert = jiffies; ip6mr_cache_report(mrt, skb, true_vifi, MRT6MSG_WRONGMIF); - if (mrt->mroute_do_wrvifwhole) + if (READ_ONCE(mrt->mroute_do_wrvifwhole)) ip6mr_cache_report(mrt, skb, true_vifi, MRT6MSG_WRMIFWHOLE); } -- 2.53.0.1213.gd9a14994de-goog mr6_msgsize() calculates skb size needed for ip6mr_fill_mroute(). The size differs based on mrt->maxvif. We will drop RTNL for ip6mr_rtm_getroute() and mrt->maxvif may change under RCU. To avoid -EMSGSIZE, let's calculate the size with the maximum value of mrt->maxvif, MAXMIFS. struct rtnexthop is 8 bytes and MAXMIFS is 32, so the maximum delta is 256 bytes, which is small enough. 
Signed-off-by: Kuniyuki Iwashima --- net/ipv6/ip6mr.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/net/ipv6/ip6mr.c b/net/ipv6/ip6mr.c index 5d368ee39b28..4d78041276c1 100644 --- a/net/ipv6/ip6mr.c +++ b/net/ipv6/ip6mr.c @@ -2542,7 +2542,7 @@ static int _ip6mr_fill_mroute(struct mr_table *mrt, struct sk_buff *skb, cmd, flags); } -static int mr6_msgsize(bool unresolved, int maxvif) +static int mr6_msgsize(bool unresolved) { size_t len = NLMSG_ALIGN(sizeof(struct rtmsg)) @@ -2555,7 +2555,7 @@ static int mr6_msgsize(bool unresolved, int maxvif) len = len + nla_total_size(4) /* RTA_IIF */ + nla_total_size(0) /* RTA_MULTIPATH */ - + maxvif * NLA_ALIGN(sizeof(struct rtnexthop)) + + MAXMIFS * NLA_ALIGN(sizeof(struct rtnexthop)) /* RTA_MFC_STATS */ + nla_total_size_64bit(sizeof(struct rta_mfc_stats)) ; @@ -2570,8 +2570,7 @@ static void mr6_netlink_event(struct mr_table *mrt, struct mfc6_cache *mfc, struct sk_buff *skb; int err = -ENOBUFS; - skb = nlmsg_new(mr6_msgsize(mfc->_c.mfc_parent >= MAXMIFS, mrt->maxvif), - GFP_ATOMIC); + skb = nlmsg_new(mr6_msgsize(mfc->_c.mfc_parent >= MAXMIFS), GFP_ATOMIC); if (!skb) goto errout; @@ -2727,7 +2726,7 @@ static int ip6mr_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh, return -ENOENT; } - skb = nlmsg_new(mr6_msgsize(false, mrt->maxvif), GFP_KERNEL); + skb = nlmsg_new(mr6_msgsize(false), GFP_KERNEL); if (!skb) return -ENOBUFS; -- 2.53.0.1213.gd9a14994de-goog We will convert ip6mr_rtm_getroute() to RCU in the following patch, where __ip6mr_get_table() will be called under RCU. nlmsg_new() uses GFP_KERNEL and needs to be called before holding rcu_read_lock(). As a prep, let's move nlmsg_new() before __ip6mr_get_table(). 
Signed-off-by: Kuniyuki Iwashima --- net/ipv6/ip6mr.c | 24 ++++++++++++++---------- 1 file changed, 14 insertions(+), 10 deletions(-) diff --git a/net/ipv6/ip6mr.c b/net/ipv6/ip6mr.c index 4d78041276c1..db7376cbcc01 100644 --- a/net/ipv6/ip6mr.c +++ b/net/ipv6/ip6mr.c @@ -2705,6 +2705,10 @@ static int ip6mr_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh, if (err < 0) return err; + skb = nlmsg_new(mr6_msgsize(false), GFP_KERNEL); + if (!skb) + return -ENOBUFS; + if (tb[RTA_SRC]) src = nla_get_in6_addr(tb[RTA_SRC]); if (tb[RTA_DST]) @@ -2714,7 +2718,8 @@ static int ip6mr_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh, mrt = __ip6mr_get_table(net, tableid ?: RT_TABLE_DEFAULT); if (!mrt) { NL_SET_ERR_MSG_MOD(extack, "MR table does not exist"); - return -ENOENT; + err = -ENOENT; + goto err; } /* entries are added/deleted only under RTNL */ @@ -2723,21 +2728,20 @@ static int ip6mr_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh, rcu_read_unlock(); if (!cache) { NL_SET_ERR_MSG_MOD(extack, "MR cache entry not found"); - return -ENOENT; + err = -ENOENT; + goto err; } - skb = nlmsg_new(mr6_msgsize(false), GFP_KERNEL); - if (!skb) - return -ENOBUFS; - err = ip6mr_fill_mroute(mrt, skb, NETLINK_CB(in_skb).portid, nlh->nlmsg_seq, cache, RTM_NEWROUTE, 0); - if (err < 0) { - kfree_skb(skb); - return err; - } + if (err < 0) + goto err; return rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid); + +err: + kfree_skb(skb); + return err; } static int ip6mr_rtm_dumproute(struct sk_buff *skb, struct netlink_callback *cb) -- 2.53.0.1213.gd9a14994de-goog ip6mr_rtm_getroute() calls __ip6mr_get_table(), ip6mr_cache_find(), and ip6mr_fill_mroute(). Once created, struct mr_table is not freed until netns dismantle, so it's safe under RCU. ip6mr_cache_find() iterates mrt->mfc_hash with rhl_for_each_entry_rcu(). struct mr_mfc is freed with call_rcu(), so this is also safe under RCU. ip6mr_fill_mroute() calls mr_fill_mroute(), which properly uses RCU helpers. 
Let's call them under RCU and register ip6mr_rtm_getroute() with RTNL_FLAG_DOIT_UNLOCKED. Signed-off-by: Kuniyuki Iwashima --- net/ipv6/ip6mr.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/net/ipv6/ip6mr.c b/net/ipv6/ip6mr.c index db7376cbcc01..389471e740bc 100644 --- a/net/ipv6/ip6mr.c +++ b/net/ipv6/ip6mr.c @@ -1390,7 +1390,8 @@ static struct pernet_operations ip6mr_net_ops = { static const struct rtnl_msg_handler ip6mr_rtnl_msg_handlers[] __initconst_or_module = { {.owner = THIS_MODULE, .protocol = RTNL_FAMILY_IP6MR, .msgtype = RTM_GETROUTE, - .doit = ip6mr_rtm_getroute, .dumpit = ip6mr_rtm_dumproute}, + .doit = ip6mr_rtm_getroute, .dumpit = ip6mr_rtm_dumproute, + .flags = RTNL_FLAG_DOIT_UNLOCKED}, }; int __init ip6_mr_init(void) @@ -2715,6 +2716,8 @@ static int ip6mr_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh, grp = nla_get_in6_addr(tb[RTA_DST]); tableid = nla_get_u32_default(tb[RTA_TABLE], 0); + rcu_read_lock(); + mrt = __ip6mr_get_table(net, tableid ?: RT_TABLE_DEFAULT); if (!mrt) { NL_SET_ERR_MSG_MOD(extack, "MR table does not exist"); @@ -2722,10 +2725,7 @@ static int ip6mr_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh, goto err; } - /* entries are added/deleted only under RTNL */ - rcu_read_lock(); cache = ip6mr_cache_find(mrt, &src, &grp); - rcu_read_unlock(); if (!cache) { NL_SET_ERR_MSG_MOD(extack, "MR cache entry not found"); err = -ENOENT; @@ -2737,9 +2737,12 @@ static int ip6mr_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh, if (err < 0) goto err; + rcu_read_unlock(); + return rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid); err: + rcu_read_unlock(); kfree_skb(skb); return err; } -- 2.53.0.1213.gd9a14994de-goog ip6mr_rtm_dumproute() calls mr_table_dump() or mr_rtm_dumproute(), and mr_rtm_dumproute() finally calls mr_table_dump(). mr_table_dump() calls the passed function, _ip6mr_fill_mroute(). 
_ip6mr_fill_mroute() is a wrapper for ip6mr_fill_mroute() to cast struct mr_mfc * to struct mfc6_cache *. ip6mr_fill_mroute() can already be called safely under RCU. Let's convert ip6mr_rtm_dumproute() to RCU. Now there is no user of the rtnl_held field in struct fib_dump_filter, and the next patch will remove it. Signed-off-by: Kuniyuki Iwashima --- net/ipv6/ip6mr.c | 29 ++++++++++++++++++++--------- 1 file changed, 20 insertions(+), 9 deletions(-) diff --git a/net/ipv6/ip6mr.c b/net/ipv6/ip6mr.c index 389471e740bc..cd28bea8c11a 100644 --- a/net/ipv6/ip6mr.c +++ b/net/ipv6/ip6mr.c @@ -1391,7 +1391,7 @@ static const struct rtnl_msg_handler ip6mr_rtnl_msg_handlers[] __initconst_or_mo {.owner = THIS_MODULE, .protocol = RTNL_FAMILY_IP6MR, .msgtype = RTM_GETROUTE, .doit = ip6mr_rtm_getroute, .dumpit = ip6mr_rtm_dumproute, - .flags = RTNL_FLAG_DOIT_UNLOCKED}, + .flags = RTNL_FLAG_DOIT_UNLOCKED | RTNL_FLAG_DUMP_UNLOCKED}, }; int __init ip6_mr_init(void) @@ -2751,15 +2751,17 @@ static int ip6mr_rtm_dumproute(struct sk_buff *skb, struct netlink_callback *cb) { const struct nlmsghdr *nlh = cb->nlh; struct fib_dump_filter filter = { - .rtnl_held = true, + .rtnl_held = false, }; int err; + rcu_read_lock(); + if (cb->strict_check) { err = ip_valid_fib_dump_req(sock_net(skb->sk), nlh, &filter, cb); if (err < 0) - return err; + goto unlock; } if (filter.table_id) { @@ -2767,17 +2769,26 @@ static int ip6mr_rtm_dumproute(struct sk_buff *skb, struct netlink_callback *cb) mrt = __ip6mr_get_table(sock_net(skb->sk), filter.table_id); if (!mrt) { - if (rtnl_msg_family(cb->nlh) != RTNL_FAMILY_IP6MR) - return skb->len; + if (rtnl_msg_family(cb->nlh) != RTNL_FAMILY_IP6MR) { + err = skb->len; + goto unlock; + } NL_SET_ERR_MSG_MOD(cb->extack, "MR table does not exist"); - return -ENOENT; + err = -ENOENT; + goto unlock; } + err = mr_table_dump(mrt, skb, cb, _ip6mr_fill_mroute, &mfc_unres_lock, &filter); - return skb->len ? : err; + err = skb->len ? 
: err; + goto unlock; } - return mr_rtm_dumproute(skb, cb, ip6mr_mr_table_iter, - _ip6mr_fill_mroute, &mfc_unres_lock, &filter); + err = mr_rtm_dumproute(skb, cb, ip6mr_mr_table_iter, + _ip6mr_fill_mroute, &mfc_unres_lock, &filter); +unlock: + rcu_read_unlock(); + + return err; } -- 2.53.0.1213.gd9a14994de-goog Commit 22e36ea9f5d7 ("inet: allow ip_valid_fib_dump_req() to be called with RTNL or RCU") introduced the rtnl_held field in struct fib_dump_filter to switch __dev_get_by_index() and dev_get_by_index_rcu() depending on the caller's context. This field served as an interim measure while we were incrementally converting all callers of ip_valid_fib_dump_req() to RCU. Now that all users (IPv4, IPv6, ipmr, ip6mr, and MPLS) have been converted to RCU, the field is no longer necessary. Let's remove it. Signed-off-by: Kuniyuki Iwashima --- include/net/ip_fib.h | 1 - net/ipv4/fib_frontend.c | 19 ++++++------------- net/ipv4/ipmr.c | 4 +--- net/ipv6/ip6_fib.c | 1 - net/ipv6/ip6mr.c | 4 +--- net/mpls/af_mpls.c | 6 ++---- 6 files changed, 10 insertions(+), 25 deletions(-) diff --git a/include/net/ip_fib.h b/include/net/ip_fib.h index 318593743b6e..1142ffad7444 100644 --- a/include/net/ip_fib.h +++ b/include/net/ip_fib.h @@ -269,7 +269,6 @@ struct fib_dump_filter { bool filter_set; bool dump_routes; bool dump_exceptions; - bool rtnl_held; unsigned char protocol; unsigned char rt_type; unsigned int flags; diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c index 1dab44e13d3b..ceeb87b13b93 100644 --- a/net/ipv4/fib_frontend.c +++ b/net/ipv4/fib_frontend.c @@ -946,9 +946,6 @@ int ip_valid_fib_dump_req(struct net *net, const struct nlmsghdr *nlh, struct rtmsg *rtm; int err, i; - if (filter->rtnl_held) - ASSERT_RTNL(); - rtm = nlmsg_payload(nlh, sizeof(*rtm)); if (!rtm) { NL_SET_ERR_MSG(extack, "Invalid header for FIB dump request"); @@ -992,10 +989,8 @@ int ip_valid_fib_dump_req(struct net *net, const struct nlmsghdr *nlh, break; case RTA_OIF: ifindex = 
nla_get_u32(tb[i]); - if (filter->rtnl_held) - filter->dev = __dev_get_by_index(net, ifindex); - else - filter->dev = dev_get_by_index_rcu(net, ifindex); + + filter->dev = dev_get_by_index_rcu(net, ifindex); if (!filter->dev) return -ENODEV; break; @@ -1017,18 +1012,16 @@ EXPORT_SYMBOL_GPL(ip_valid_fib_dump_req); static int inet_dump_fib(struct sk_buff *skb, struct netlink_callback *cb) { + const struct nlmsghdr *nlh = cb->nlh; + struct net *net = sock_net(skb->sk); struct fib_dump_filter filter = { .dump_routes = true, .dump_exceptions = true, - .rtnl_held = false, }; - const struct nlmsghdr *nlh = cb->nlh; - struct net *net = sock_net(skb->sk); - unsigned int h, s_h; - unsigned int e = 0, s_e; - struct fib_table *tb; + unsigned int e = 0, s_e, h, s_h; struct hlist_head *head; int dumped = 0, err = 0; + struct fib_table *tb; rcu_read_lock(); if (cb->strict_check) { diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c index 2566b4a1f80b..c0fc606b0ae9 100644 --- a/net/ipv4/ipmr.c +++ b/net/ipv4/ipmr.c @@ -2770,9 +2770,7 @@ static int ipmr_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh, static int ipmr_rtm_dumproute(struct sk_buff *skb, struct netlink_callback *cb) { - struct fib_dump_filter filter = { - .rtnl_held = false, - }; + struct fib_dump_filter filter = {}; int err; rcu_read_lock(); diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c index b897b3c5023b..fc95738ded76 100644 --- a/net/ipv6/ip6_fib.c +++ b/net/ipv6/ip6_fib.c @@ -633,7 +633,6 @@ static int inet6_dump_fib(struct sk_buff *skb, struct netlink_callback *cb) struct rt6_rtnl_dump_arg arg = { .filter.dump_exceptions = true, .filter.dump_routes = true, - .filter.rtnl_held = false, }; const struct nlmsghdr *nlh = cb->nlh; struct net *net = sock_net(skb->sk); diff --git a/net/ipv6/ip6mr.c b/net/ipv6/ip6mr.c index cd28bea8c11a..b9c048b6f1ca 100644 --- a/net/ipv6/ip6mr.c +++ b/net/ipv6/ip6mr.c @@ -2750,9 +2750,7 @@ static int ip6mr_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh, static int 
ip6mr_rtm_dumproute(struct sk_buff *skb, struct netlink_callback *cb) { const struct nlmsghdr *nlh = cb->nlh; - struct fib_dump_filter filter = { - .rtnl_held = false, - }; + struct fib_dump_filter filter = {}; int err; rcu_read_lock(); diff --git a/net/mpls/af_mpls.c b/net/mpls/af_mpls.c index 26340a7306b5..ca504d9626cf 100644 --- a/net/mpls/af_mpls.c +++ b/net/mpls/af_mpls.c @@ -2221,12 +2221,10 @@ static bool mpls_rt_uses_dev(struct mpls_route *rt, static int mpls_dump_routes(struct sk_buff *skb, struct netlink_callback *cb) { + struct mpls_route __rcu **platform_label; const struct nlmsghdr *nlh = cb->nlh; struct net *net = sock_net(skb->sk); - struct mpls_route __rcu **platform_label; - struct fib_dump_filter filter = { - .rtnl_held = false, - }; + struct fib_dump_filter filter = {}; unsigned int flags = NLM_F_MULTI; size_t platform_labels; unsigned int index; -- 2.53.0.1213.gd9a14994de-goog This is a prep commit to convert ip6mr_net_exit_batch() to ->exit_rtnl(). Let's move unregister_netdevice_many() in mroute_clean_tables() to its callers. 
Signed-off-by: Kuniyuki Iwashima --- net/ipv6/ip6mr.c | 25 +++++++++++++++++-------- 1 file changed, 17 insertions(+), 8 deletions(-) diff --git a/net/ipv6/ip6mr.c b/net/ipv6/ip6mr.c index b9c048b6f1ca..c46f6a430d26 100644 --- a/net/ipv6/ip6mr.c +++ b/net/ipv6/ip6mr.c @@ -99,7 +99,8 @@ static int ip6mr_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack); static int ip6mr_rtm_dumproute(struct sk_buff *skb, struct netlink_callback *cb); -static void mroute_clean_tables(struct mr_table *mrt, int flags); +static void mroute_clean_tables(struct mr_table *mrt, int flags, + struct list_head *dev_kill_list); static void ipmr_expire_process(struct timer_list *t); #ifdef CONFIG_IPV6_MROUTE_MULTIPLE_TABLES @@ -405,12 +406,15 @@ static struct mr_table *ip6mr_new_table(struct net *net, u32 id) static void ip6mr_free_table(struct mr_table *mrt) { struct net *net = read_pnet(&mrt->net); + LIST_HEAD(dev_kill_list); WARN_ON_ONCE(!mr_can_free_table(net)); timer_shutdown_sync(&mrt->ipmr_expire_timer); mroute_clean_tables(mrt, MRT6_FLUSH_MIFS | MRT6_FLUSH_MIFS_STATIC | - MRT6_FLUSH_MFC | MRT6_FLUSH_MFC_STATIC); + MRT6_FLUSH_MFC | MRT6_FLUSH_MFC_STATIC, + &dev_kill_list); + unregister_netdevice_many(&dev_kill_list); rhltable_destroy(&mrt->mfc_hash); kfree(mrt); } @@ -1537,10 +1541,10 @@ static int ip6mr_mfc_add(struct net *net, struct mr_table *mrt, * Close the multicast socket, and clear the vif tables etc */ -static void mroute_clean_tables(struct mr_table *mrt, int flags) +static void mroute_clean_tables(struct mr_table *mrt, int flags, + struct list_head *dev_kill_list) { struct mr_mfc *c, *tmp; - LIST_HEAD(list); int i; /* Shut down all active vif entries */ @@ -1550,9 +1554,8 @@ static void mroute_clean_tables(struct mr_table *mrt, int flags) !(flags & MRT6_FLUSH_MIFS_STATIC)) || (!(mrt->vif_table[i].flags & VIFF_STATIC) && !(flags & MRT6_FLUSH_MIFS))) continue; - mif6_delete(mrt, i, 0, &list); + mif6_delete(mrt, i, 0, dev_kill_list); } - 
unregister_netdevice_many(&list); } /* Wipe the cache */ @@ -1615,6 +1618,7 @@ int ip6mr_sk_done(struct sock *sk) { struct net *net = sock_net(sk); struct ipv6_devconf *devconf; + LIST_HEAD(dev_kill_list); struct mr_table *mrt; int err = -EACCES; @@ -1642,11 +1646,13 @@ int ip6mr_sk_done(struct sock *sk) NETCONFA_IFINDEX_ALL, net->ipv6.devconf_all); - mroute_clean_tables(mrt, MRT6_FLUSH_MIFS | MRT6_FLUSH_MFC); + mroute_clean_tables(mrt, MRT6_FLUSH_MIFS | MRT6_FLUSH_MFC, + &dev_kill_list); err = 0; break; } } + unregister_netdevice_many(&dev_kill_list); rtnl_unlock(); return err; @@ -1761,14 +1767,17 @@ int ip6_mroute_setsockopt(struct sock *sk, int optname, sockptr_t optval, case MRT6_FLUSH: { + LIST_HEAD(dev_kill_list); int flags; if (optlen != sizeof(flags)) return -EINVAL; if (copy_from_sockptr(&flags, optval, sizeof(flags))) return -EFAULT; + rtnl_lock(); - mroute_clean_tables(mrt, flags); + mroute_clean_tables(mrt, flags, &dev_kill_list); + unregister_netdevice_many(&dev_kill_list); rtnl_unlock(); return 0; } -- 2.53.0.1213.gd9a14994de-goog This is a prep commit to convert ip6mr_net_exit_batch() to ->exit_rtnl(). Let's move unregister_netdevice_many() in ip6mr_free_table() to its callers. Now ip6mr_rules_exit() can batch all tables per netns. Note that later we will remove RTNL and unregister_netdevice_many() in ip6mr_rules_init(). 
Signed-off-by: Kuniyuki Iwashima --- net/ipv6/ip6mr.c | 25 +++++++++++++++++-------- 1 file changed, 17 insertions(+), 8 deletions(-) diff --git a/net/ipv6/ip6mr.c b/net/ipv6/ip6mr.c index c46f6a430d26..30e1aece5f53 100644 --- a/net/ipv6/ip6mr.c +++ b/net/ipv6/ip6mr.c @@ -85,7 +85,8 @@ static DEFINE_SPINLOCK(mfc_unres_lock); static struct kmem_cache *mrt_cachep __read_mostly; static struct mr_table *ip6mr_new_table(struct net *net, u32 id); -static void ip6mr_free_table(struct mr_table *mrt); +static void ip6mr_free_table(struct mr_table *mrt, + struct list_head *dev_kill_list); static void ip6_mr_forward(struct net *net, struct mr_table *mrt, struct net_device *dev, struct sk_buff *skb, @@ -238,6 +239,7 @@ static const struct fib_rules_ops __net_initconst ip6mr_rules_ops_template = { static int __net_init ip6mr_rules_init(struct net *net) { struct fib_rules_ops *ops; + LIST_HEAD(dev_kill_list); struct mr_table *mrt; int err; @@ -262,7 +264,8 @@ static int __net_init ip6mr_rules_init(struct net *net) err2: rtnl_lock(); - ip6mr_free_table(mrt); + ip6mr_free_table(mrt, &dev_kill_list); + unregister_netdevice_many(&dev_kill_list); rtnl_unlock(); err1: fib_rules_unregister(ops); @@ -272,12 +275,15 @@ static int __net_init ip6mr_rules_init(struct net *net) static void __net_exit ip6mr_rules_exit(struct net *net) { struct mr_table *mrt, *next; + LIST_HEAD(dev_kill_list); ASSERT_RTNL(); list_for_each_entry_safe(mrt, next, &net->ipv6.mr6_tables, list) { list_del(&mrt->list); - ip6mr_free_table(mrt); + ip6mr_free_table(mrt, &dev_kill_list); } + + unregister_netdevice_many(&dev_kill_list); fib_rules_unregister(net->ipv6.mr6_rules_ops); } @@ -337,8 +343,12 @@ static int __net_init ip6mr_rules_init(struct net *net) static void __net_exit ip6mr_rules_exit(struct net *net) { + LIST_HEAD(dev_kill_list); + ASSERT_RTNL(); - ip6mr_free_table(net->ipv6.mrt6); + ip6mr_free_table(net->ipv6.mrt6, &dev_kill_list); + unregister_netdevice_many(&dev_kill_list); + net->ipv6.mrt6 = NULL; } @@ 
-403,18 +413,17 @@ static struct mr_table *ip6mr_new_table(struct net *net, u32 id) ipmr_expire_process, ip6mr_new_table_set); } -static void ip6mr_free_table(struct mr_table *mrt) +static void ip6mr_free_table(struct mr_table *mrt, + struct list_head *dev_kill_list) { struct net *net = read_pnet(&mrt->net); - LIST_HEAD(dev_kill_list); WARN_ON_ONCE(!mr_can_free_table(net)); timer_shutdown_sync(&mrt->ipmr_expire_timer); mroute_clean_tables(mrt, MRT6_FLUSH_MIFS | MRT6_FLUSH_MIFS_STATIC | MRT6_FLUSH_MFC | MRT6_FLUSH_MFC_STATIC, - &dev_kill_list); - unregister_netdevice_many(&dev_kill_list); + dev_kill_list); rhltable_destroy(&mrt->mfc_hash); kfree(mrt); } -- 2.53.0.1213.gd9a14994de-goog ip6mr_net_ops uses ->exit_batch() to acquire RTNL only once for dying network namespaces. ip6mr does not depend on the ordering of ->exit_rtnl() and ->exit_batch() of other pernet_operations (unlike fib_net_ops). Once ip6mr_free_table() is called and all devices are queued for destruction in ->exit_rtnl(), later during NETDEV_UNREGISTER, ip6mr_device_event() will not see anything in vif table and just do nothing. Let's convert ip6mr_net_exit_batch() to ->exit_rtnl(). Note that fib_rules_unregister() does not need RTNL and we will remove RTNL and unregister_netdevice_many() in ip6mr_rules_init(). 
Signed-off-by: Kuniyuki Iwashima --- net/ipv6/ip6mr.c | 31 +++++++++++++------------------ 1 file changed, 13 insertions(+), 18 deletions(-) diff --git a/net/ipv6/ip6mr.c b/net/ipv6/ip6mr.c index 30e1aece5f53..2220a2049781 100644 --- a/net/ipv6/ip6mr.c +++ b/net/ipv6/ip6mr.c @@ -272,18 +272,17 @@ static int __net_init ip6mr_rules_init(struct net *net) return err; } -static void __net_exit ip6mr_rules_exit(struct net *net) +static void __net_exit ip6mr_rules_exit_rtnl(struct net *net, + struct list_head *dev_kill_list) { struct mr_table *mrt, *next; - LIST_HEAD(dev_kill_list); ASSERT_RTNL(); list_for_each_entry_safe(mrt, next, &net->ipv6.mr6_tables, list) { list_del(&mrt->list); - ip6mr_free_table(mrt, &dev_kill_list); + ip6mr_free_table(mrt, dev_kill_list); } - unregister_netdevice_many(&dev_kill_list); fib_rules_unregister(net->ipv6.mr6_rules_ops); } @@ -341,13 +340,11 @@ static int __net_init ip6mr_rules_init(struct net *net) return 0; } -static void __net_exit ip6mr_rules_exit(struct net *net) +static void __net_exit ip6mr_rules_exit_rtnl(struct net *net, + struct list_head *dev_kill_list) { - LIST_HEAD(dev_kill_list); - ASSERT_RTNL(); - ip6mr_free_table(net->ipv6.mrt6, &dev_kill_list); - unregister_netdevice_many(&dev_kill_list); + ip6mr_free_table(net->ipv6.mrt6, dev_kill_list); net->ipv6.mrt6 = NULL; } @@ -1340,6 +1337,7 @@ static void __net_exit ip6mr_notifier_exit(struct net *net) /* Setup for IP multicast routing */ static int __net_init ip6mr_net_init(struct net *net) { + LIST_HEAD(dev_kill_list); int err; err = ip6mr_notifier_init(net); @@ -1367,7 +1365,8 @@ static int __net_init ip6mr_net_init(struct net *net) remove_proc_entry("ip6_mr_vif", net->proc_net); proc_vif_fail: rtnl_lock(); - ip6mr_rules_exit(net); + ip6mr_rules_exit_rtnl(net, &dev_kill_list); + unregister_netdevice_many(&dev_kill_list); rtnl_unlock(); #endif ip6mr_rules_fail: @@ -1384,20 +1383,16 @@ static void __net_exit ip6mr_net_exit(struct net *net) ip6mr_notifier_exit(net); } -static 
void __net_exit ip6mr_net_exit_batch(struct list_head *net_list) +static void __net_exit ip6mr_net_exit_rtnl(struct net *net, + struct list_head *dev_kill_list) { - struct net *net; - - rtnl_lock(); - list_for_each_entry(net, net_list, exit_list) - ip6mr_rules_exit(net); - rtnl_unlock(); + ip6mr_rules_exit_rtnl(net, dev_kill_list); } static struct pernet_operations ip6mr_net_ops = { .init = ip6mr_net_init, .exit = ip6mr_net_exit, - .exit_batch = ip6mr_net_exit_batch, + .exit_rtnl = ip6mr_net_exit_rtnl, }; static const struct rtnl_msg_handler ip6mr_rtnl_msg_handlers[] __initconst_or_module = { -- 2.53.0.1213.gd9a14994de-goog When ip6mr_free_table() is called from ip6mr_rules_init() or ip6mr_net_init(), the netns is not yet published. Thus, no device should have been registered, and mroute_clean_tables() will not call mif6_delete(), so unregister_netdevice_many() is unnecessary. unregister_netdevice_many() does nothing if the list is empty, but it requires RTNL due to the unconditional ASSERT_RTNL() at the entry of unregister_netdevice_many_notify(). Let's remove unnecessary RTNL and ASSERT_RTNL() and instead add WARN_ON_ONCE() in ip6mr_free_table(). Note that we use a local list for the new WARN_ON_ONCE() because dev_kill_list passed from ip6mr_rules_exit_rtnl() may have some devices when other ops->init() fails after ipmr during setup_net(). 
Signed-off-by: Kuniyuki Iwashima --- net/ipv6/ip6mr.c | 14 +++++--------- 1 file changed, 5 insertions(+), 9 deletions(-) diff --git a/net/ipv6/ip6mr.c b/net/ipv6/ip6mr.c index 2220a2049781..e5a1d2c48b1b 100644 --- a/net/ipv6/ip6mr.c +++ b/net/ipv6/ip6mr.c @@ -263,10 +263,7 @@ static int __net_init ip6mr_rules_init(struct net *net) return 0; err2: - rtnl_lock(); ip6mr_free_table(mrt, &dev_kill_list); - unregister_netdevice_many(&dev_kill_list); - rtnl_unlock(); err1: fib_rules_unregister(ops); return err; @@ -277,7 +274,6 @@ static void __net_exit ip6mr_rules_exit_rtnl(struct net *net, { struct mr_table *mrt, *next; - ASSERT_RTNL(); list_for_each_entry_safe(mrt, next, &net->ipv6.mr6_tables, list) { list_del(&mrt->list); ip6mr_free_table(mrt, dev_kill_list); @@ -343,7 +339,6 @@ static int __net_init ip6mr_rules_init(struct net *net) static void __net_exit ip6mr_rules_exit_rtnl(struct net *net, struct list_head *dev_kill_list) { - ASSERT_RTNL(); ip6mr_free_table(net->ipv6.mrt6, dev_kill_list); net->ipv6.mrt6 = NULL; @@ -414,15 +409,19 @@ static void ip6mr_free_table(struct mr_table *mrt, struct list_head *dev_kill_list) { struct net *net = read_pnet(&mrt->net); + LIST_HEAD(ip6mr_dev_kill_list); WARN_ON_ONCE(!mr_can_free_table(net)); timer_shutdown_sync(&mrt->ipmr_expire_timer); mroute_clean_tables(mrt, MRT6_FLUSH_MIFS | MRT6_FLUSH_MIFS_STATIC | MRT6_FLUSH_MFC | MRT6_FLUSH_MFC_STATIC, - dev_kill_list); + &ip6mr_dev_kill_list); rhltable_destroy(&mrt->mfc_hash); kfree(mrt); + + WARN_ON_ONCE(!net_initialized(net) && !list_empty(&ip6mr_dev_kill_list)); + list_splice(&ip6mr_dev_kill_list, dev_kill_list); } #ifdef CONFIG_PROC_FS @@ -1364,10 +1363,7 @@ static int __net_init ip6mr_net_init(struct net *net) proc_cache_fail: remove_proc_entry("ip6_mr_vif", net->proc_net); proc_vif_fail: - rtnl_lock(); ip6mr_rules_exit_rtnl(net, &dev_kill_list); - unregister_netdevice_many(&dev_kill_list); - rtnl_unlock(); #endif ip6mr_rules_fail: ip6mr_notifier_exit(net); -- 
2.53.0.1213.gd9a14994de-goog fib_rules_unregister() removes ops from net->rules_ops under spinlock, calls ops->delete() for each rule, and frees the ops. ip6mr_rules_ops_template does not have ->delete(), and no operation there requires RTNL. Let's move fib_rules_unregister() from ip6mr_rules_exit_rtnl() to ip6mr_net_exit(). Signed-off-by: Kuniyuki Iwashima --- net/ipv6/ip6mr.c | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/net/ipv6/ip6mr.c b/net/ipv6/ip6mr.c index e5a1d2c48b1b..b1443fb65b40 100644 --- a/net/ipv6/ip6mr.c +++ b/net/ipv6/ip6mr.c @@ -269,6 +269,11 @@ static int __net_init ip6mr_rules_init(struct net *net) return err; } +static void __net_exit ip6mr_rules_exit(struct net *net) +{ + fib_rules_unregister(net->ipv6.mr6_rules_ops); +} + static void __net_exit ip6mr_rules_exit_rtnl(struct net *net, struct list_head *dev_kill_list) { @@ -278,8 +283,6 @@ static void __net_exit ip6mr_rules_exit_rtnl(struct net *net, list_del(&mrt->list); ip6mr_free_table(mrt, dev_kill_list); } - - fib_rules_unregister(net->ipv6.mr6_rules_ops); } static int ip6mr_rules_dump(struct net *net, struct notifier_block *nb, @@ -336,6 +339,10 @@ static int __net_init ip6mr_rules_init(struct net *net) return 0; } +static void __net_exit ip6mr_rules_exit(struct net *net) +{ +} + static void __net_exit ip6mr_rules_exit_rtnl(struct net *net, struct list_head *dev_kill_list) { @@ -1364,6 +1371,7 @@ static int __net_init ip6mr_net_init(struct net *net) remove_proc_entry("ip6_mr_vif", net->proc_net); proc_vif_fail: ip6mr_rules_exit_rtnl(net, &dev_kill_list); + ip6mr_rules_exit(net); #endif ip6mr_rules_fail: ip6mr_notifier_exit(net); @@ -1376,6 +1384,7 @@ static void __net_exit ip6mr_net_exit(struct net *net) remove_proc_entry("ip6_mr_cache", net->proc_net); remove_proc_entry("ip6_mr_vif", net->proc_net); #endif + ip6mr_rules_exit(net); ip6mr_notifier_exit(net); } -- 2.53.0.1213.gd9a14994de-goog net->ipv6.ip6mr_notifier_ops and net->ipv6.ipmr_seq are 
used only in net/ipv6/ip6mr.c. Let's move these definitions under CONFIG_IPV6_MROUTE. Signed-off-by: Kuniyuki Iwashima --- include/net/netns/ipv6.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/net/netns/ipv6.h b/include/net/netns/ipv6.h index 499e4288170f..df00567374f4 100644 --- a/include/net/netns/ipv6.h +++ b/include/net/netns/ipv6.h @@ -112,13 +112,13 @@ struct netns_ipv6 { struct list_head mr6_tables; struct fib_rules_ops *mr6_rules_ops; #endif + struct fib_notifier_ops *ip6mr_notifier_ops; + atomic_t ipmr_seq; #endif atomic_t dev_addr_genid; atomic_t fib6_sernum; struct seg6_pernet_data *seg6_data; struct fib_notifier_ops *notifier_ops; - struct fib_notifier_ops *ip6mr_notifier_ops; - atomic_t ipmr_seq; struct { struct hlist_head head; spinlock_t lock; -- 2.53.0.1213.gd9a14994de-goog ip6mr does not have rtnetlink interface for MFC unlike ipmr, which uses dev_get_by_index_rcu() to set struct mfcctl.mfcc_parent. ip6mr_mfc_add() and ip6mr_mfc_delete() are called under RTNL from ip6_mroute_setsockopt() only. There is no RTNL dependency, but ip6_mroute_setsockopt() reuses RTNL just for mrt->mfc_hash and mrt->mfc_cache_list. Let's replace RTNL with a new per-netns mutex. 
Signed-off-by: Kuniyuki Iwashima --- include/net/netns/ipv6.h | 1 + net/ipv6/ip6mr.c | 21 ++++++++++++++------- 2 files changed, 15 insertions(+), 7 deletions(-) diff --git a/include/net/netns/ipv6.h b/include/net/netns/ipv6.h index df00567374f4..6453d70d5946 100644 --- a/include/net/netns/ipv6.h +++ b/include/net/netns/ipv6.h @@ -114,6 +114,7 @@ struct netns_ipv6 { #endif struct fib_notifier_ops *ip6mr_notifier_ops; atomic_t ipmr_seq; + struct mutex mfc_mutex; #endif atomic_t dev_addr_genid; atomic_t fib6_sernum; diff --git a/net/ipv6/ip6mr.c b/net/ipv6/ip6mr.c index b1443fb65b40..e4c31d05744b 100644 --- a/net/ipv6/ip6mr.c +++ b/net/ipv6/ip6mr.c @@ -1256,7 +1256,6 @@ static int ip6mr_mfc_delete(struct mr_table *mrt, struct mf6cctl *mfc, { struct mfc6_cache *c; - /* The entries are added/deleted only under RTNL */ rcu_read_lock(); c = ip6mr_cache_find_parent(mrt, &mfc->mf6cc_origin.sin6_addr, &mfc->mf6cc_mcastgrp.sin6_addr, parent); @@ -1346,6 +1345,8 @@ static int __net_init ip6mr_net_init(struct net *net) LIST_HEAD(dev_kill_list); int err; + mutex_init(&net->ipv6.mfc_mutex); + err = ip6mr_notifier_init(net); if (err) return err; @@ -1474,7 +1475,6 @@ static int ip6mr_mfc_add(struct net *net, struct mr_table *mrt, ttls[i] = 1; } - /* The entries are added/deleted only under RTNL */ rcu_read_lock(); c = ip6mr_cache_find_parent(mrt, &mfc->mf6cc_origin.sin6_addr, &mfc->mf6cc_mcastgrp.sin6_addr, parent); @@ -1553,6 +1553,7 @@ static int ip6mr_mfc_add(struct net *net, struct mr_table *mrt, static void mroute_clean_tables(struct mr_table *mrt, int flags, struct list_head *dev_kill_list) { + struct net *net = read_pnet(&mrt->net); struct mr_mfc *c, *tmp; int i; @@ -1569,18 +1570,21 @@ static void mroute_clean_tables(struct mr_table *mrt, int flags, /* Wipe the cache */ if (flags & (MRT6_FLUSH_MFC | MRT6_FLUSH_MFC_STATIC)) { + mutex_lock(&net->ipv6.mfc_mutex); + list_for_each_entry_safe(c, tmp, &mrt->mfc_cache_list, list) { if (((c->mfc_flags & MFC_STATIC) && !(flags & 
MRT6_FLUSH_MFC_STATIC)) || (!(c->mfc_flags & MFC_STATIC) && !(flags & MRT6_FLUSH_MFC))) continue; rhltable_remove(&mrt->mfc_hash, &c->mnode, ip6mr_rht_params); list_del_rcu(&c->list); - call_ip6mr_mfc_entry_notifiers(read_pnet(&mrt->net), - FIB_EVENT_ENTRY_DEL, + call_ip6mr_mfc_entry_notifiers(net, FIB_EVENT_ENTRY_DEL, (struct mfc6_cache *)c, mrt->id); mr6_netlink_event(mrt, (struct mfc6_cache *)c, RTM_DELROUTE); mr_cache_put(c); } + + mutex_unlock(&net->ipv6.mfc_mutex); } if (flags & MRT6_FLUSH_MFC) { @@ -1763,15 +1767,18 @@ int ip6_mroute_setsockopt(struct sock *sk, int optname, sockptr_t optval, return -EFAULT; if (parent == 0) parent = mfc.mf6cc_parent; - rtnl_lock(); + + mutex_lock(&net->ipv6.mfc_mutex); + if (optname == MRT6_DEL_MFC || optname == MRT6_DEL_MFC_PROXY) ret = ip6mr_mfc_delete(mrt, &mfc, parent); else ret = ip6mr_mfc_add(net, mrt, &mfc, sk == - rtnl_dereference(mrt->mroute_sk), + rcu_access_pointer(mrt->mroute_sk), parent); - rtnl_unlock(); + + mutex_unlock(&net->ipv6.mfc_mutex); return ret; case MRT6_FLUSH: -- 2.53.0.1213.gd9a14994de-goog