neigh_get() passes 4 local variable pointers to neigh_valid_get_req(). If it returns a pointer of struct ndmsg, we do not need to pass two of them. Signed-off-by: Kuniyuki Iwashima --- net/core/neighbour.c | 51 +++++++++++++++++++++++--------------------- 1 file changed, 27 insertions(+), 24 deletions(-) diff --git a/net/core/neighbour.c b/net/core/neighbour.c index 43a5dcbb5f9c7..d35399de640d0 100644 --- a/net/core/neighbour.c +++ b/net/core/neighbour.c @@ -2910,10 +2910,9 @@ static int neigh_dump_info(struct sk_buff *skb, struct netlink_callback *cb) return err; } -static int neigh_valid_get_req(const struct nlmsghdr *nlh, - struct neigh_table **tbl, - void **dst, int *dev_idx, u8 *ndm_flags, - struct netlink_ext_ack *extack) +static struct ndmsg *neigh_valid_get_req(const struct nlmsghdr *nlh, + struct neigh_table **tbl, void **dst, + struct netlink_ext_ack *extack) { struct nlattr *tb[NDA_MAX + 1]; struct ndmsg *ndm; @@ -2922,31 +2921,33 @@ static int neigh_valid_get_req(const struct nlmsghdr *nlh, ndm = nlmsg_payload(nlh, sizeof(*ndm)); if (!ndm) { NL_SET_ERR_MSG(extack, "Invalid header for neighbor get request"); - return -EINVAL; + err = -EINVAL; + goto err; } if (ndm->ndm_pad1 || ndm->ndm_pad2 || ndm->ndm_state || ndm->ndm_type) { NL_SET_ERR_MSG(extack, "Invalid values in header for neighbor get request"); - return -EINVAL; + err = -EINVAL; + goto err; } if (ndm->ndm_flags & ~NTF_PROXY) { NL_SET_ERR_MSG(extack, "Invalid flags in header for neighbor get request"); - return -EINVAL; + err = -EINVAL; + goto err; } err = nlmsg_parse_deprecated_strict(nlh, sizeof(struct ndmsg), tb, NDA_MAX, nda_policy, extack); if (err < 0) - return err; + goto err; - *ndm_flags = ndm->ndm_flags; - *dev_idx = ndm->ndm_ifindex; *tbl = neigh_find_table(ndm->ndm_family); - if (*tbl == NULL) { + if (!*tbl) { NL_SET_ERR_MSG(extack, "Unsupported family in header for neighbor get request"); - return -EAFNOSUPPORT; + err = -EAFNOSUPPORT; + goto err; } for (i = 0; i <= NDA_MAX; ++i) { @@ -2957,17 +2958,21 @@ static int neigh_valid_get_req(const struct nlmsghdr *nlh, case NDA_DST: if (nla_len(tb[i]) != (int)(*tbl)->key_len) { NL_SET_ERR_MSG(extack, "Invalid network address in neighbor get request"); - return -EINVAL; + err = -EINVAL; + goto err; } *dst = nla_data(tb[i]); break; default: NL_SET_ERR_MSG(extack, "Unsupported attribute in neighbor get request"); - return -EINVAL; + err = -EINVAL; + goto err; } } - return 0; + return ndm; +err: + return ERR_PTR(err); } static inline size_t neigh_nlmsg_size(void) @@ -3038,18 +3043,16 @@ static int neigh_get(struct sk_buff *in_skb, struct nlmsghdr *nlh, struct net_device *dev = NULL; struct neigh_table *tbl = NULL; struct neighbour *neigh; + struct ndmsg *ndm; void *dst = NULL; - u8 ndm_flags = 0; - int dev_idx = 0; int err; - err = neigh_valid_get_req(nlh, &tbl, &dst, &dev_idx, &ndm_flags, - extack); - if (err < 0) - return err; + ndm = neigh_valid_get_req(nlh, &tbl, &dst, extack); + if (IS_ERR(ndm)) + return PTR_ERR(ndm); - if (dev_idx) { - dev = __dev_get_by_index(net, dev_idx); + if (ndm->ndm_ifindex) { + dev = __dev_get_by_index(net, ndm->ndm_ifindex); if (!dev) { NL_SET_ERR_MSG(extack, "Unknown device ifindex"); return -ENODEV; @@ -3061,7 +3064,7 @@ static int neigh_get(struct sk_buff *in_skb, struct nlmsghdr *nlh, return -EINVAL; } - if (ndm_flags & NTF_PROXY) { + if (ndm->ndm_flags & NTF_PROXY) { struct pneigh_entry *pn; pn = pneigh_lookup(tbl, net, dst, dev, 0); -- 2.50.0.727.gbf7dc18ff4-goog We will remove RTNL for neigh_get() and run it under RCU instead. neigh_get() returns -EINVAL in the following cases: * NDA_DST is not specified * Both ndm->ndm_ifindex and NTF_PROXY are not specified These validations do not require RCU. Let's move them to neigh_valid_get_req(). Signed-off-by: Kuniyuki Iwashima --- net/core/neighbour.c | 28 +++++++++++++++------------- 1 file changed, 15 insertions(+), 13 deletions(-) diff --git a/net/core/neighbour.c b/net/core/neighbour.c index d35399de640d0..2c3e0f3615e20 100644 --- a/net/core/neighbour.c +++ b/net/core/neighbour.c @@ -2938,6 +2938,12 @@ static struct ndmsg *neigh_valid_get_req(const struct nlmsghdr *nlh, goto err; } + if (!(ndm->ndm_flags & NTF_PROXY) && !ndm->ndm_ifindex) { + NL_SET_ERR_MSG(extack, "No device specified"); + err = -EINVAL; + goto err; + } + err = nlmsg_parse_deprecated_strict(nlh, sizeof(struct ndmsg), tb, NDA_MAX, nda_policy, extack); if (err < 0) @@ -2951,11 +2957,14 @@ static struct ndmsg *neigh_valid_get_req(const struct nlmsghdr *nlh, } for (i = 0; i <= NDA_MAX; ++i) { - if (!tb[i]) - continue; - switch (i) { case NDA_DST: + if (!tb[i]) { + NL_SET_ERR_MSG(extack, "Network address not specified"); + err = -EINVAL; + goto err; + } + if (nla_len(tb[i]) != (int)(*tbl)->key_len) { NL_SET_ERR_MSG(extack, "Invalid network address in neighbor get request"); err = -EINVAL; @@ -2964,6 +2973,9 @@ static struct ndmsg *neigh_valid_get_req(const struct nlmsghdr *nlh, *dst = nla_data(tb[i]); break; default: + if (!tb[i]) + continue; + NL_SET_ERR_MSG(extack, "Unsupported attribute in neighbor get request"); err = -EINVAL; goto err; @@ -3059,11 +3071,6 @@ static int neigh_get(struct sk_buff *in_skb, struct nlmsghdr *nlh, } } - if (!dst) { - NL_SET_ERR_MSG(extack, "Network address not specified"); - return -EINVAL; - } - if (ndm->ndm_flags & NTF_PROXY) { struct pneigh_entry *pn; @@ -3076,11 +3083,6 @@ static int neigh_get(struct sk_buff *in_skb, struct nlmsghdr *nlh, nlh->nlmsg_seq, tbl); } - if (!dev) { - NL_SET_ERR_MSG(extack, "No device specified"); - return -EINVAL; - } - neigh = neigh_lookup(tbl, dst, dev); if (!neigh) { NL_SET_ERR_MSG(extack, "Neighbour entry not found"); -- 2.50.0.727.gbf7dc18ff4-goog We will remove RTNL for neigh_get() and run it under RCU instead. neigh_get_reply() and pneigh_get_reply() allocate skb with GFP_KERNEL. Let's move the allocation before __dev_get_by_index() in neigh_get(). Now, neigh_get_reply() and pneigh_get_reply() are inlined and rtnl_unicast() is factorised. We will convert pneigh_lookup() to __pneigh_lookup() later. Signed-off-by: Kuniyuki Iwashima --- net/core/neighbour.c | 88 ++++++++++++++++---------------------------- 1 file changed, 32 insertions(+), 56 deletions(-) diff --git a/net/core/neighbour.c b/net/core/neighbour.c index 2c3e0f3615e20..df5938b6020f1 100644 --- a/net/core/neighbour.c +++ b/net/core/neighbour.c @@ -2998,27 +2998,6 @@ static inline size_t neigh_nlmsg_size(void) + nla_total_size(1); /* NDA_PROTOCOL */ } -static int neigh_get_reply(struct net *net, struct neighbour *neigh, - u32 pid, u32 seq) -{ - struct sk_buff *skb; - int err = 0; - - skb = nlmsg_new(neigh_nlmsg_size(), GFP_KERNEL); - if (!skb) - return -ENOBUFS; - - err = neigh_fill_info(skb, neigh, pid, seq, RTM_NEWNEIGH, 0); - if (err) { - kfree_skb(skb); - goto errout; - } - - err = rtnl_unicast(skb, net, pid); -errout: - return err; -} - static inline size_t pneigh_nlmsg_size(void) { return NLMSG_ALIGN(sizeof(struct ndmsg)) @@ -3027,34 +3006,16 @@ static inline size_t pneigh_nlmsg_size(void) + nla_total_size(1); /* NDA_PROTOCOL */ } -static int pneigh_get_reply(struct net *net, struct pneigh_entry *neigh, - u32 pid, u32 seq, struct neigh_table *tbl) -{ - struct sk_buff *skb; - int err = 0; - - skb = nlmsg_new(pneigh_nlmsg_size(), GFP_KERNEL); - if (!skb) - return -ENOBUFS; - - err = pneigh_fill_info(skb, neigh, pid, seq, RTM_NEWNEIGH, 0, tbl); - if (err) { - kfree_skb(skb); - goto errout; - } - - err = rtnl_unicast(skb, net, pid); -errout: - return err; -} - static int neigh_get(struct sk_buff *in_skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { struct net *net = sock_net(in_skb->sk); + u32 pid = NETLINK_CB(in_skb).portid; struct net_device *dev = NULL; struct neigh_table *tbl = NULL; + u32 seq = nlh->nlmsg_seq; struct neighbour *neigh; + struct sk_buff *skb; struct ndmsg *ndm; void *dst = NULL; int err; @@ -3063,11 +3024,19 @@ static int neigh_get(struct sk_buff *in_skb, struct nlmsghdr *nlh, if (IS_ERR(ndm)) return PTR_ERR(ndm); + if (ndm->ndm_flags & NTF_PROXY) + skb = nlmsg_new(neigh_nlmsg_size(), GFP_KERNEL); + else + skb = nlmsg_new(pneigh_nlmsg_size(), GFP_KERNEL); + if (!skb) + return -ENOBUFS; + if (ndm->ndm_ifindex) { dev = __dev_get_by_index(net, ndm->ndm_ifindex); if (!dev) { NL_SET_ERR_MSG(extack, "Unknown device ifindex"); - return -ENODEV; + err = -ENODEV; + goto err; } } @@ -3077,23 +3046,30 @@ static int neigh_get(struct sk_buff *in_skb, struct nlmsghdr *nlh, pn = pneigh_lookup(tbl, net, dst, dev, 0); if (!pn) { NL_SET_ERR_MSG(extack, "Proxy neighbour entry not found"); - return -ENOENT; + err = -ENOENT; + goto err; } - return pneigh_get_reply(net, pn, NETLINK_CB(in_skb).portid, - nlh->nlmsg_seq, tbl); - } - - neigh = neigh_lookup(tbl, dst, dev); - if (!neigh) { - NL_SET_ERR_MSG(extack, "Neighbour entry not found"); - return -ENOENT; - } - err = neigh_get_reply(net, neigh, NETLINK_CB(in_skb).portid, - nlh->nlmsg_seq); + err = pneigh_fill_info(skb, pn, pid, seq, RTM_NEWNEIGH, 0, tbl); + if (err) + goto err; + } else { + neigh = neigh_lookup(tbl, dst, dev); + if (!neigh) { + NL_SET_ERR_MSG(extack, "Neighbour entry not found"); + err = -ENOENT; + goto err; + } - neigh_release(neigh); + err = neigh_fill_info(skb, neigh, pid, seq, RTM_NEWNEIGH, 0); + neigh_release(neigh); + if (err) + goto err; + } + return rtnl_unicast(skb, net, pid); +err: + kfree_skb(skb); return err; } -- 2.50.0.727.gbf7dc18ff4-goog neigh_valid_get_req() calls neigh_find_table() to fetch neigh_tables[]. neigh_find_table() uses rcu_dereference_rtnl(), but RTNL actually does not protect it at all; neigh_table_clear() can be called without RTNL and only waits for RCU readers by synchronize_rcu(). Fortunately, there is no bug because IPv4 is built-in, IPv6 cannot be unloaded, and DECNET was removed. To fetch neigh_tables[] by rcu_dereference() later, let's move neigh_find_table() from neigh_valid_get_req() to neigh_get(). Signed-off-by: Kuniyuki Iwashima --- net/core/neighbour.c | 39 ++++++++++++++++++++------------------- 1 file changed, 20 insertions(+), 19 deletions(-) diff --git a/net/core/neighbour.c b/net/core/neighbour.c index df5938b6020f1..ad79f173e6229 100644 --- a/net/core/neighbour.c +++ b/net/core/neighbour.c @@ -2911,10 +2911,9 @@ static int neigh_dump_info(struct sk_buff *skb, struct netlink_callback *cb) } static struct ndmsg *neigh_valid_get_req(const struct nlmsghdr *nlh, - struct neigh_table **tbl, void **dst, + struct nlattr **tb, struct netlink_ext_ack *extack) { - struct nlattr *tb[NDA_MAX + 1]; struct ndmsg *ndm; int err, i; @@ -2949,13 +2948,6 @@ static struct ndmsg *neigh_valid_get_req(const struct nlmsghdr *nlh, if (err < 0) goto err; - *tbl = neigh_find_table(ndm->ndm_family); - if (!*tbl) { - NL_SET_ERR_MSG(extack, "Unsupported family in header for neighbor get request"); - err = -EAFNOSUPPORT; - goto err; - } - for (i = 0; i <= NDA_MAX; ++i) { switch (i) { case NDA_DST: @@ -2964,13 +2956,6 @@ static struct ndmsg *neigh_valid_get_req(const struct nlmsghdr *nlh, err = -EINVAL; goto err; } - - if (nla_len(tb[i]) != (int)(*tbl)->key_len) { - NL_SET_ERR_MSG(extack, "Invalid network address in neighbor get request"); - err = -EINVAL; - goto err; - } - *dst = nla_data(tb[i]); break; default: if (!tb[i]) @@ -3011,16 +2996,17 @@ static int neigh_get(struct sk_buff *in_skb, struct nlmsghdr *nlh, { struct net *net = sock_net(in_skb->sk); u32 pid = NETLINK_CB(in_skb).portid; + struct nlattr *tb[NDA_MAX + 1]; struct net_device *dev = NULL; - struct neigh_table *tbl = NULL; u32 seq = nlh->nlmsg_seq; + struct neigh_table *tbl; struct neighbour *neigh; struct sk_buff *skb; struct ndmsg *ndm; - void *dst = NULL; + void *dst; int err; - ndm = neigh_valid_get_req(nlh, &tbl, &dst, extack); + ndm = neigh_valid_get_req(nlh, tb, extack); if (IS_ERR(ndm)) return PTR_ERR(ndm); @@ -3031,6 +3017,21 @@ static int neigh_get(struct sk_buff *in_skb, struct nlmsghdr *nlh, if (!skb) return -ENOBUFS; + tbl = neigh_find_table(ndm->ndm_family); + if (!tbl) { + NL_SET_ERR_MSG(extack, "Unsupported family in header for neighbor get request"); + err = -EAFNOSUPPORT; + goto err; + } + + if (nla_len(tb[NDA_DST]) != (int)tbl->key_len) { + NL_SET_ERR_MSG(extack, "Invalid network address in neighbor get request"); + err = -EINVAL; + goto err; + } + + dst = nla_data(tb[NDA_DST]); + if (ndm->ndm_ifindex) { dev = __dev_get_by_index(net, ndm->ndm_ifindex); if (!dev) { -- 2.50.0.727.gbf7dc18ff4-goog pneigh_lookup() has ASSERT_RTNL() in the middle of the function, which is confusing. When called with the last argument, creat, 0, pneigh_lookup() literally looks up a proxy neighbour entry. This is the case of the reader path as the fast path and RTM_GETNEIGH. pneigh_lookup(), however, creates a pneigh_entry when called with creat 1 from RTM_NEWNEIGH and SIOCSARP, which require RTNL. Let's split pneigh_lookup() into two functions. We will convert all the reader paths to RCU, and read_lock_bh(&tbl->lock) in the new pneigh_lookup() will be dropped. Signed-off-by: Kuniyuki Iwashima --- include/net/neighbour.h | 5 +++-- net/core/neighbour.c | 39 +++++++++++++++++++++++++++++---------- net/ipv4/arp.c | 4 ++-- net/ipv6/ip6_output.c | 2 +- net/ipv6/ndisc.c | 2 +- 5 files changed, 36 insertions(+), 16 deletions(-) diff --git a/include/net/neighbour.h b/include/net/neighbour.h index 7e865b14749d6..7f3d57da5689a 100644 --- a/include/net/neighbour.h +++ b/include/net/neighbour.h @@ -376,10 +376,11 @@ unsigned long neigh_rand_reach_time(unsigned long base); void pneigh_enqueue(struct neigh_table *tbl, struct neigh_parms *p, struct sk_buff *skb); struct pneigh_entry *pneigh_lookup(struct neigh_table *tbl, struct net *net, - const void *key, struct net_device *dev, - int creat); + const void *key, struct net_device *dev); struct pneigh_entry *__pneigh_lookup(struct neigh_table *tbl, struct net *net, const void *key, struct net_device *dev); +struct pneigh_entry *pneigh_create(struct neigh_table *tbl, struct net *net, + const void *key, struct net_device *dev); int pneigh_delete(struct neigh_table *tbl, struct net *net, const void *key, struct net_device *dev); diff --git a/net/core/neighbour.c b/net/core/neighbour.c index ad79f173e6229..814a45fb1962e 100644 --- a/net/core/neighbour.c +++ b/net/core/neighbour.c @@ -28,6 +28,7 @@ #include #include #include +#include #include #include #include @@ -746,24 +747,44 @@ struct pneigh_entry *__pneigh_lookup(struct neigh_table *tbl, } EXPORT_SYMBOL_GPL(__pneigh_lookup); -struct pneigh_entry * pneigh_lookup(struct neigh_table *tbl, - struct net *net, const void *pkey, - struct net_device *dev, int creat) +struct pneigh_entry *pneigh_lookup(struct neigh_table *tbl, + struct net *net, const void *pkey, + struct net_device *dev) +{ + struct pneigh_entry *n; + unsigned int key_len; + u32 hash_val; + + key_len = tbl->key_len; + hash_val = pneigh_hash(pkey, key_len); + + read_lock_bh(&tbl->lock); + n = __pneigh_lookup_1(tbl->phash_buckets[hash_val], + net, pkey, key_len, dev); + read_unlock_bh(&tbl->lock); + + return n; +} +EXPORT_IPV6_MOD(pneigh_lookup); + +struct pneigh_entry *pneigh_create(struct neigh_table *tbl, + struct net *net, const void *pkey, + struct net_device *dev) { struct pneigh_entry *n; unsigned int key_len = tbl->key_len; u32 hash_val = pneigh_hash(pkey, key_len); + ASSERT_RTNL(); + read_lock_bh(&tbl->lock); n = __pneigh_lookup_1(tbl->phash_buckets[hash_val], net, pkey, key_len, dev); read_unlock_bh(&tbl->lock); - if (n || !creat) + if (n) goto out; - ASSERT_RTNL(); - n = kzalloc(sizeof(*n) + key_len, GFP_KERNEL); if (!n) goto out; @@ -787,8 +808,6 @@ struct pneigh_entry * pneigh_lookup(struct neigh_table *tbl, out: return n; } -EXPORT_SYMBOL(pneigh_lookup); - int pneigh_delete(struct neigh_table *tbl, struct net *net, const void *pkey, struct net_device *dev) @@ -2007,7 +2026,7 @@ static int neigh_add(struct sk_buff *skb, struct nlmsghdr *nlh, } err = -ENOBUFS; - pn = pneigh_lookup(tbl, net, dst, dev, 1); + pn = pneigh_create(tbl, net, dst, dev); if (pn) { pn->flags = ndm_flags; pn->permanent = !!(ndm->ndm_state & NUD_PERMANENT); @@ -3044,7 +3063,7 @@ static int neigh_get(struct sk_buff *in_skb, struct nlmsghdr *nlh, if (ndm->ndm_flags & NTF_PROXY) { struct pneigh_entry *pn; - pn = pneigh_lookup(tbl, net, dst, dev, 0); + pn = pneigh_lookup(tbl, net, dst, dev); if (!pn) { NL_SET_ERR_MSG(extack, "Proxy neighbour entry not found"); err = -ENOENT; diff --git a/net/ipv4/arp.c b/net/ipv4/arp.c index c0440d61cf2ff..d93b5735b0ba4 100644 --- a/net/ipv4/arp.c +++ b/net/ipv4/arp.c @@ -864,7 +864,7 @@ static int arp_process(struct net *net, struct sock *sk, struct sk_buff *skb) (arp_fwd_proxy(in_dev, dev, rt) || arp_fwd_pvlan(in_dev, dev, rt, sip, tip) || (rt->dst.dev != dev && - pneigh_lookup(&arp_tbl, net, &tip, dev, 0)))) { + pneigh_lookup(&arp_tbl, net, &tip, dev)))) { n = neigh_event_ns(&arp_tbl, sha, &sip, dev); if (n) neigh_release(n); @@ -1089,7 +1089,7 @@ static int arp_req_set_public(struct net *net, struct arpreq *r, if (mask) { __be32 ip = ((struct sockaddr_in *)&r->arp_pa)->sin_addr.s_addr; - if (!pneigh_lookup(&arp_tbl, net, &ip, dev, 1)) + if (!pneigh_create(&arp_tbl, net, &ip, dev)) return -ENOBUFS; return 0; } diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c index fcc20c7250eb0..0412f85446958 100644 --- a/net/ipv6/ip6_output.c +++ b/net/ipv6/ip6_output.c @@ -563,7 +563,7 @@ int ip6_forward(struct sk_buff *skb) /* XXX: idev->cnf.proxy_ndp? */ if (READ_ONCE(net->ipv6.devconf_all->proxy_ndp) && - pneigh_lookup(&nd_tbl, net, &hdr->daddr, skb->dev, 0)) { + pneigh_lookup(&nd_tbl, net, &hdr->daddr, skb->dev)) { int proxied = ip6_forward_proxy_check(skb); if (proxied > 0) { /* It's tempting to decrease the hop limit diff --git a/net/ipv6/ndisc.c b/net/ipv6/ndisc.c index d4c5876e17718..a3ac26c1df6d8 100644 --- a/net/ipv6/ndisc.c +++ b/net/ipv6/ndisc.c @@ -1100,7 +1100,7 @@ static enum skb_drop_reason ndisc_recv_na(struct sk_buff *skb) if (lladdr && !memcmp(lladdr, dev->dev_addr, dev->addr_len) && READ_ONCE(net->ipv6.devconf_all->forwarding) && READ_ONCE(net->ipv6.devconf_all->proxy_ndp) && - pneigh_lookup(&nd_tbl, net, &msg->target, dev, 0)) { + pneigh_lookup(&nd_tbl, net, &msg->target, dev)) { /* XXX: idev->cnf.proxy_ndp */ goto out; } -- 2.50.0.727.gbf7dc18ff4-goog We will convert RTM_GETNEIGH to RCU. neigh_get() looks up pneigh_entry by pneigh_lookup() and passes it to pneigh_fill_info(). Then, we must ensure that the entry is alive till pneigh_fill_info() completes, but read_lock_bh(&tbl->lock) in pneigh_lookup() does not guarantee that. Also, we will convert all readers of tbl->phash_buckets[] to RCU. Let's use call_rcu() to free pneigh_entry and update phash_buckets[] and ->next by rcu_assign_pointer(). pneigh_ifdown_and_unlock() uses list_head to avoid overwriting ->next and moving RCU iterators to another list. pndisc_destructor() (only IPv6 ndisc uses this) uses a mutex, so it is not delayed to call_rcu(), where we cannot sleep. This is fine because the mcast code works with RCU and ipv6_dev_mc_dec() frees mcast objects after RCU grace period. While at it, we change the return type of pneigh_ifdown_and_unlock() to void. Signed-off-by: Kuniyuki Iwashima --- include/net/neighbour.h | 4 ++++ net/core/neighbour.c | 51 +++++++++++++++++++++++++---------------- 2 files changed, 35 insertions(+), 20 deletions(-) diff --git a/include/net/neighbour.h b/include/net/neighbour.h index 7f3d57da5689a..a877e56210b22 100644 --- a/include/net/neighbour.h +++ b/include/net/neighbour.h @@ -180,6 +180,10 @@ struct pneigh_entry { possible_net_t net; struct net_device *dev; netdevice_tracker dev_tracker; + union { + struct list_head free_node; + struct rcu_head rcu; + }; u32 flags; u8 protocol; bool permanent; diff --git a/net/core/neighbour.c b/net/core/neighbour.c index 814a45fb1962e..6725a40b2db3a 100644 --- a/net/core/neighbour.c +++ b/net/core/neighbour.c @@ -54,9 +54,9 @@ static void neigh_timer_handler(struct timer_list *t); static void __neigh_notify(struct neighbour *n, int type, int flags, u32 pid); static void neigh_update_notify(struct neighbour *neigh, u32 nlmsg_pid); -static int pneigh_ifdown_and_unlock(struct neigh_table *tbl, - struct net_device *dev, - bool skip_perm); +static void pneigh_ifdown_and_unlock(struct neigh_table *tbl, + struct net_device *dev, + bool skip_perm); #ifdef CONFIG_PROC_FS static const struct seq_operations neigh_stat_seq_ops; @@ -803,12 +803,20 @@ struct pneigh_entry *pneigh_create(struct neigh_table *tbl, write_lock_bh(&tbl->lock); n->next = tbl->phash_buckets[hash_val]; - tbl->phash_buckets[hash_val] = n; + rcu_assign_pointer(tbl->phash_buckets[hash_val], n); write_unlock_bh(&tbl->lock); out: return n; } +static void pneigh_destroy(struct rcu_head *rcu) +{ + struct pneigh_entry *n = container_of(rcu, struct pneigh_entry, rcu); + + netdev_put(n->dev, &n->dev_tracker); + kfree(n); +} + int pneigh_delete(struct neigh_table *tbl, struct net *net, const void *pkey, struct net_device *dev) { @@ -821,12 +829,13 @@ int pneigh_delete(struct neigh_table *tbl, struct net *net, const void *pkey, np = &n->next) { if (!memcmp(n->key, pkey, key_len) && n->dev == dev && net_eq(pneigh_net(n), net)) { - *np = n->next; + rcu_assign_pointer(*np, n->next); write_unlock_bh(&tbl->lock); + if (tbl->pdestructor) tbl->pdestructor(n); - netdev_put(n->dev, &n->dev_tracker); - kfree(n); + + call_rcu(&n->rcu, pneigh_destroy); return 0; } } @@ -834,11 +843,12 @@ int pneigh_delete(struct neigh_table *tbl, struct net *net, const void *pkey, return -ENOENT; } -static int pneigh_ifdown_and_unlock(struct neigh_table *tbl, - struct net_device *dev, - bool skip_perm) +static void pneigh_ifdown_and_unlock(struct neigh_table *tbl, + struct net_device *dev, + bool skip_perm) { - struct pneigh_entry *n, **np, *freelist = NULL; + struct pneigh_entry *n, **np; + LIST_HEAD(head); u32 h; for (h = 0; h <= PNEIGH_HASHMASK; h++) { @@ -847,25 +857,26 @@ static int pneigh_ifdown_and_unlock(struct neigh_table *tbl, if (skip_perm && n->permanent) goto skip; if (!dev || n->dev == dev) { - *np = n->next; - n->next = freelist; - freelist = n; + rcu_assign_pointer(*np, n->next); + list_add(&n->free_node, &head); continue; } skip: np = &n->next; } } + write_unlock_bh(&tbl->lock); - while ((n = freelist)) { - freelist = n->next; - n->next = NULL; + + while (!list_empty(&head)) { + n = list_first_entry(&head, typeof(*n), free_node); + list_del(&n->free_node); + if (tbl->pdestructor) tbl->pdestructor(n); - netdev_put(n->dev, &n->dev_tracker); - kfree(n); + + call_rcu(&n->rcu, pneigh_destroy); } - return -ENOENT; } static inline void neigh_parms_put(struct neigh_parms *parms) -- 2.50.0.727.gbf7dc18ff4-goog We will convert pneigh readers to RCU, and its flags and protocol will be read locklessly. Let's annotate the access to the two fields. Note that all access to pn->permanent is under RTNL (neigh_add() and pneigh_ifdown_and_unlock()), so WRITE_ONCE() and READ_ONCE() are not needed. Signed-off-by: Kuniyuki Iwashima --- net/core/neighbour.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/net/core/neighbour.c b/net/core/neighbour.c index 6725a40b2db3a..e88b9a4bfe6ea 100644 --- a/net/core/neighbour.c +++ b/net/core/neighbour.c @@ -2039,10 +2039,10 @@ static int neigh_add(struct sk_buff *skb, struct nlmsghdr *nlh, err = -ENOBUFS; pn = pneigh_create(tbl, net, dst, dev); if (pn) { - pn->flags = ndm_flags; + WRITE_ONCE(pn->flags, ndm_flags); pn->permanent = !!(ndm->ndm_state & NUD_PERMANENT); if (protocol) - pn->protocol = protocol; + WRITE_ONCE(pn->protocol, protocol); err = 0; } goto out; @@ -2678,8 +2678,9 @@ static int pneigh_fill_info(struct sk_buff *skb, struct pneigh_entry *pn, if (nlh == NULL) return -EMSGSIZE; - neigh_flags_ext = pn->flags >> NTF_EXT_SHIFT; - neigh_flags = pn->flags & NTF_OLD_MASK; + neigh_flags = READ_ONCE(pn->flags); + neigh_flags_ext = neigh_flags >> NTF_EXT_SHIFT; + neigh_flags &= NTF_OLD_MASK; ndm = nlmsg_data(nlh); ndm->ndm_family = tbl->family; @@ -2693,7 +2694,7 @@ static int pneigh_fill_info(struct sk_buff *skb, struct pneigh_entry *pn, if (nla_put(skb, NDA_DST, tbl->key_len, pn->key)) goto nla_put_failure; - if (pn->protocol && nla_put_u8(skb, NDA_PROTOCOL, pn->protocol)) + if (pn->protocol && nla_put_u8(skb, NDA_PROTOCOL, READ_ONCE(pn->protocol))) goto nla_put_failure; if (neigh_flags_ext && nla_put_u32(skb, NDA_FLAGS_EXT, neigh_flags_ext)) goto nla_put_failure; -- 2.50.0.727.gbf7dc18ff4-goog Only __dev_get_by_index() is the RTNL dependant in neigh_get(). Let's replace it with dev_get_by_index_rcu() and convert RTM_GETNEIGH to RCU. Signed-off-by: Kuniyuki Iwashima --- net/core/neighbour.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/net/core/neighbour.c b/net/core/neighbour.c index e88b9a4bfe6ea..c5bd52dfd3e5b 100644 --- a/net/core/neighbour.c +++ b/net/core/neighbour.c @@ -3048,6 +3048,8 @@ static int neigh_get(struct sk_buff *in_skb, struct nlmsghdr *nlh, if (!skb) return -ENOBUFS; + rcu_read_lock(); + tbl = neigh_find_table(ndm->ndm_family); if (!tbl) { NL_SET_ERR_MSG(extack, "Unsupported family in header for neighbor get request"); @@ -3064,7 +3066,7 @@ static int neigh_get(struct sk_buff *in_skb, struct nlmsghdr *nlh, dst = nla_data(tb[NDA_DST]); if (ndm->ndm_ifindex) { - dev = __dev_get_by_index(net, ndm->ndm_ifindex); + dev = dev_get_by_index_rcu(net, ndm->ndm_ifindex); if (!dev) { NL_SET_ERR_MSG(extack, "Unknown device ifindex"); err = -ENODEV; @@ -3099,8 +3101,11 @@ static int neigh_get(struct sk_buff *in_skb, struct nlmsghdr *nlh, goto err; } + rcu_read_unlock(); + return rtnl_unicast(skb, net, pid); err: + rcu_read_unlock(); kfree_skb(skb); return err; } @@ -3900,7 +3905,7 @@ static const struct rtnl_msg_handler neigh_rtnl_msg_handlers[] __initconst = { {.msgtype = RTM_NEWNEIGH, .doit = neigh_add}, {.msgtype = RTM_DELNEIGH, .doit = neigh_delete}, {.msgtype = RTM_GETNEIGH, .doit = neigh_get, .dumpit = neigh_dump_info, - .flags = RTNL_FLAG_DUMP_UNLOCKED}, + .flags = RTNL_FLAG_DOIT_UNLOCKED | RTNL_FLAG_DUMP_UNLOCKED}, {.msgtype = RTM_GETNEIGHTBL, .dumpit = neightbl_dump_info}, {.msgtype = RTM_SETNEIGHTBL, .doit = neightbl_set}, }; -- 2.50.0.727.gbf7dc18ff4-goog Now pneigh_entry is guaranteed to be alive during the RCU critical section even without holding tbl->lock. Let's drop read_lock_bh(&tbl->lock) and use rcu_dereference() to iterate tbl->phash_buckets[] in pneigh_dump_table() Signed-off-by: Kuniyuki Iwashima --- net/core/neighbour.c | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/net/core/neighbour.c b/net/core/neighbour.c index c5bd52dfd3e5b..e8ca84e2ddf30 100644 --- a/net/core/neighbour.c +++ b/net/core/neighbour.c @@ -2801,12 +2801,11 @@ static int pneigh_dump_table(struct neigh_table *tbl, struct sk_buff *skb, if (filter->dev_idx || filter->master_idx) flags |= NLM_F_DUMP_FILTERED; - read_lock_bh(&tbl->lock); - for (h = s_h; h <= PNEIGH_HASHMASK; h++) { if (h > s_h) s_idx = 0; - for (n = tbl->phash_buckets[h], idx = 0; n; n = n->next) { + for (n = rcu_dereference(tbl->phash_buckets[h]), idx = 0; n; + n = rcu_dereference(n->next)) { if (idx < s_idx || pneigh_net(n) != net) goto next; if (neigh_ifindex_filtered(n->dev, filter->dev_idx) || @@ -2815,16 +2814,13 @@ static int pneigh_dump_table(struct neigh_table *tbl, struct sk_buff *skb, err = pneigh_fill_info(skb, n, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, RTM_NEWNEIGH, flags, tbl); - if (err < 0) { - read_unlock_bh(&tbl->lock); + if (err < 0) goto out; - } next: idx++; } } - read_unlock_bh(&tbl->lock); out: cb->args[3] = h; cb->args[4] = idx; -- 2.50.0.727.gbf7dc18ff4-goog Now pneigh_entry is guaranteed to be alive during the RCU critical section even without holding tbl->lock. Let's use rcu_dereference() in pneigh_get_{first,next}(). Note that neigh_seq_start() still holds tbl->lock for the normal neighbour entry. Signed-off-by: Kuniyuki Iwashima --- net/core/neighbour.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/net/core/neighbour.c b/net/core/neighbour.c index e8ca84e2ddf30..e762e88328255 100644 --- a/net/core/neighbour.c +++ b/net/core/neighbour.c @@ -3309,9 +3309,9 @@ static struct pneigh_entry *pneigh_get_first(struct seq_file *seq) state->flags |= NEIGH_SEQ_IS_PNEIGH; for (bucket = 0; bucket <= PNEIGH_HASHMASK; bucket++) { - pn = tbl->phash_buckets[bucket]; + pn = rcu_dereference(tbl->phash_buckets[bucket]); while (pn && !net_eq(pneigh_net(pn), net)) - pn = pn->next; + pn = rcu_dereference(pn->next); if (pn) break; } @@ -3329,15 +3329,15 @@ static struct pneigh_entry *pneigh_get_next(struct seq_file *seq, struct neigh_table *tbl = state->tbl; do { - pn = pn->next; + pn = rcu_dereference(pn->next); } while (pn && !net_eq(pneigh_net(pn), net)); while (!pn) { if (++state->bucket > PNEIGH_HASHMASK) break; - pn = tbl->phash_buckets[state->bucket]; + pn = rcu_dereference(tbl->phash_buckets[state->bucket]); while (pn && !net_eq(pneigh_net(pn), net)) - pn = pn->next; + pn = rcu_dereference(pn->next); if (pn) break; } -- 2.50.0.727.gbf7dc18ff4-goog __pneigh_lookup() is the lockless version of pneigh_lookup(), but its only caller pndisc_is_router() holds the table lock and reads pneigh_netry.flags. This is because accessing pneigh_entry after pneigh_lookup() was illegal unless the caller holds RTNL or the table lock. Now, pneigh_entry is guaranteed to be alive during the RCU critical section. Let's call pneigh_lookup() and use READ_ONCE() for n->flags in pndisc_is_router() and remove __pneigh_lookup(). Signed-off-by: Kuniyuki Iwashima --- include/net/neighbour.h | 2 -- net/core/neighbour.c | 11 ----------- net/ipv6/ndisc.c | 6 ++---- 3 files changed, 2 insertions(+), 17 deletions(-) diff --git a/include/net/neighbour.h b/include/net/neighbour.h index a877e56210b22..1670e2a388556 100644 --- a/include/net/neighbour.h +++ b/include/net/neighbour.h @@ -381,8 +381,6 @@ void pneigh_enqueue(struct neigh_table *tbl, struct neigh_parms *p, struct sk_buff *skb); struct pneigh_entry *pneigh_lookup(struct neigh_table *tbl, struct net *net, const void *key, struct net_device *dev); -struct pneigh_entry *__pneigh_lookup(struct neigh_table *tbl, struct net *net, - const void *key, struct net_device *dev); struct pneigh_entry *pneigh_create(struct neigh_table *tbl, struct net *net, const void *key, struct net_device *dev); int pneigh_delete(struct neigh_table *tbl, struct net *net, const void *key, diff --git a/net/core/neighbour.c b/net/core/neighbour.c index e762e88328255..c1e5fda2cf628 100644 --- a/net/core/neighbour.c +++ b/net/core/neighbour.c @@ -736,17 +736,6 @@ static struct pneigh_entry *__pneigh_lookup_1(struct pneigh_entry *n, return NULL; } -struct pneigh_entry *__pneigh_lookup(struct neigh_table *tbl, - struct net *net, const void *pkey, struct net_device *dev) -{ - unsigned int key_len = tbl->key_len; - u32 hash_val = pneigh_hash(pkey, key_len); - - return __pneigh_lookup_1(tbl->phash_buckets[hash_val], - net, pkey, key_len, dev); -} -EXPORT_SYMBOL_GPL(__pneigh_lookup); - struct pneigh_entry *pneigh_lookup(struct neigh_table *tbl, struct net *net, const void *pkey, struct net_device *dev) diff --git a/net/ipv6/ndisc.c b/net/ipv6/ndisc.c index a3ac26c1df6d8..7d5abb3158ec9 100644 --- a/net/ipv6/ndisc.c +++ b/net/ipv6/ndisc.c @@ -768,11 +768,9 @@ static int pndisc_is_router(const void *pkey, struct pneigh_entry *n; int ret = -1; - read_lock_bh(&nd_tbl.lock); - n = __pneigh_lookup(&nd_tbl, dev_net(dev), pkey, dev); + n = pneigh_lookup(&nd_tbl, dev_net(dev), pkey, dev); if (n) - ret = !!(n->flags & NTF_ROUTER); - read_unlock_bh(&nd_tbl.lock); + ret = !!(READ_ONCE(n->flags) & NTF_ROUTER); return ret; } -- 2.50.0.727.gbf7dc18ff4-goog Now, all callers of pneigh_lookup() are under RCU, and the read lock there is no longer needed. Let's drop the lock, inline __pneigh_lookup_1() to pneigh_lookup(), and call it from pneigh_create(). The next patch will remove tbl->lock from pneigh_create(). Signed-off-by: Kuniyuki Iwashima --- net/core/neighbour.c | 42 ++++++++++++++++-------------------------- 1 file changed, 16 insertions(+), 26 deletions(-) diff --git a/net/core/neighbour.c b/net/core/neighbour.c index c1e5fda2cf628..9d716852e0e7d 100644 --- a/net/core/neighbour.c +++ b/net/core/neighbour.c @@ -720,22 +720,6 @@ static u32 pneigh_hash(const void *pkey, unsigned int key_len) return hash_val; } -static struct pneigh_entry *__pneigh_lookup_1(struct pneigh_entry *n, - struct net *net, - const void *pkey, - unsigned int key_len, - struct net_device *dev) -{ - while (n) { - if (!memcmp(n->key, pkey, key_len) && - net_eq(pneigh_net(n), net) && - (n->dev == dev || !n->dev)) - return n; - n = n->next; - } - return NULL; -} - struct pneigh_entry *pneigh_lookup(struct neigh_table *tbl, struct net *net, const void *pkey, struct net_device *dev) @@ -746,13 +730,19 @@ struct pneigh_entry *pneigh_lookup(struct neigh_table *tbl, key_len = tbl->key_len; hash_val = pneigh_hash(pkey, key_len); + n = rcu_dereference_check(tbl->phash_buckets[hash_val], + lockdep_is_held(&tbl->lock)); - read_lock_bh(&tbl->lock); - n = __pneigh_lookup_1(tbl->phash_buckets[hash_val], - net, pkey, key_len, dev); - read_unlock_bh(&tbl->lock); + while (n) { + if (!memcmp(n->key, pkey, key_len) && + net_eq(pneigh_net(n), net) && + (n->dev == dev || !n->dev)) + return n; - return n; + n = rcu_dereference_check(n->next, lockdep_is_held(&tbl->lock)); + } + + return NULL; } EXPORT_IPV6_MOD(pneigh_lookup); @@ -761,19 +751,18 @@ struct pneigh_entry *pneigh_create(struct neigh_table *tbl, struct net_device *dev) { struct pneigh_entry *n; - unsigned int key_len = tbl->key_len; - u32 hash_val = pneigh_hash(pkey, key_len); + unsigned int key_len; + u32 hash_val; ASSERT_RTNL(); read_lock_bh(&tbl->lock); - n = __pneigh_lookup_1(tbl->phash_buckets[hash_val], - net, pkey, key_len, dev); + n = pneigh_lookup(tbl, net, pkey, dev); read_unlock_bh(&tbl->lock); - if (n) goto out; + key_len = tbl->key_len; n = kzalloc(sizeof(*n) + key_len, GFP_KERNEL); if (!n) goto out; @@ -790,6 +779,7 @@ struct pneigh_entry *pneigh_create(struct neigh_table *tbl, goto out; } + hash_val = pneigh_hash(pkey, key_len); write_lock_bh(&tbl->lock); n->next = tbl->phash_buckets[hash_val]; rcu_assign_pointer(tbl->phash_buckets[hash_val], n); -- 2.50.0.727.gbf7dc18ff4-goog tbl->phash_buckets[] is only modified in the slow path by pneigh_create() and pneigh_delete() under the table lock. Both of them are called under RTNL, so no extra lock is needed, but we will remove RTNL from the paths. pneigh_create() looks up a pneigh_entry, and this part can be lockless, but it would complicate the logic like 1. lookup 2. allocate pengih_entry for GFP_KERNEL 3. lookup again but under lock 4. if found, return it after freeing the allocated memory 5. else, return the new one Instead, let's add a per-table mutex and run lookup and allocation under it. Even though RTNL is removed, the neigh table is per-protocol one, so this locking granularity is fine until we make the table per-netns. Note that updating pneigh_entry part in neigh_add() is still protected by RTNL and will be moved to pneigh_create() in the next patch. Signed-off-by: Kuniyuki Iwashima --- include/net/neighbour.h | 1 + net/core/neighbour.c | 39 +++++++++++++++++++++------------------ 2 files changed, 22 insertions(+), 18 deletions(-) diff --git a/include/net/neighbour.h b/include/net/neighbour.h index 1670e2a388556..af6fe50703041 100644 --- a/include/net/neighbour.h +++ b/include/net/neighbour.h @@ -240,6 +240,7 @@ struct neigh_table { unsigned long last_rand; struct neigh_statistics __percpu *stats; struct neigh_hash_table __rcu *nht; + struct mutex phash_lock; struct pneigh_entry **phash_buckets; }; diff --git a/net/core/neighbour.c b/net/core/neighbour.c index 9d716852e0e7d..78f2457a101c4 100644 --- a/net/core/neighbour.c +++ b/net/core/neighbour.c @@ -54,9 +54,8 @@ static void neigh_timer_handler(struct timer_list *t); static void __neigh_notify(struct neighbour *n, int type, int flags, u32 pid); static void neigh_update_notify(struct neighbour *neigh, u32 nlmsg_pid); -static void pneigh_ifdown_and_unlock(struct neigh_table *tbl, - struct net_device *dev, - bool skip_perm); +static void pneigh_ifdown(struct neigh_table *tbl, struct net_device *dev, + bool skip_perm); #ifdef CONFIG_PROC_FS static const struct seq_operations neigh_stat_seq_ops; @@ -437,7 +436,9 @@ static int __neigh_ifdown(struct neigh_table *tbl, struct net_device *dev, { write_lock_bh(&tbl->lock); neigh_flush_dev(tbl, dev, skip_perm); - pneigh_ifdown_and_unlock(tbl, dev, skip_perm); + write_unlock_bh(&tbl->lock); + + pneigh_ifdown(tbl, dev, skip_perm); pneigh_queue_purge(&tbl->proxy_queue, dev ? dev_net(dev) : NULL, tbl->family); if (skb_queue_empty_lockless(&tbl->proxy_queue)) @@ -731,7 +732,7 @@ struct pneigh_entry *pneigh_lookup(struct neigh_table *tbl, key_len = tbl->key_len; hash_val = pneigh_hash(pkey, key_len); n = rcu_dereference_check(tbl->phash_buckets[hash_val], - lockdep_is_held(&tbl->lock)); + lockdep_is_held(&tbl->phash_lock)); while (n) { if (!memcmp(n->key, pkey, key_len) && @@ -739,7 +740,7 @@ struct pneigh_entry *pneigh_lookup(struct neigh_table *tbl, (n->dev == dev || !n->dev)) return n; - n = rcu_dereference_check(n->next, lockdep_is_held(&tbl->lock)); + n = rcu_dereference_check(n->next, lockdep_is_held(&tbl->phash_lock)); } return NULL; @@ -754,11 +755,9 @@ struct pneigh_entry *pneigh_create(struct neigh_table *tbl, unsigned int key_len; u32 hash_val; - ASSERT_RTNL(); + mutex_lock(&tbl->phash_lock); - read_lock_bh(&tbl->lock); n = pneigh_lookup(tbl, net, pkey, dev); - read_unlock_bh(&tbl->lock); if (n) goto out; @@ -780,11 +779,10 @@ struct pneigh_entry *pneigh_create(struct neigh_table *tbl, } hash_val = pneigh_hash(pkey, key_len); - write_lock_bh(&tbl->lock); n->next = tbl->phash_buckets[hash_val]; rcu_assign_pointer(tbl->phash_buckets[hash_val], n); - write_unlock_bh(&tbl->lock); out: + mutex_unlock(&tbl->phash_lock); return n; } @@ -803,13 +801,15 @@ int pneigh_delete(struct neigh_table *tbl, struct net *net, const void *pkey, unsigned int key_len = tbl->key_len; u32 hash_val = pneigh_hash(pkey, key_len); - write_lock_bh(&tbl->lock); + mutex_lock(&tbl->phash_lock); + for (np = &tbl->phash_buckets[hash_val]; (n = *np) != NULL; np = &n->next) { if (!memcmp(n->key, pkey, key_len) && n->dev == dev && net_eq(pneigh_net(n), net)) { rcu_assign_pointer(*np, n->next); - write_unlock_bh(&tbl->lock); + + mutex_unlock(&tbl->phash_lock); if (tbl->pdestructor) tbl->pdestructor(n); @@ -818,18 +818,20 @@ int pneigh_delete(struct neigh_table *tbl, struct net *net, const void *pkey, return 0; } } - write_unlock_bh(&tbl->lock); + + mutex_unlock(&tbl->phash_lock); return -ENOENT; } -static void pneigh_ifdown_and_unlock(struct neigh_table *tbl, - struct net_device *dev, - bool skip_perm) +static void pneigh_ifdown(struct neigh_table *tbl, struct net_device *dev, + bool skip_perm) { struct pneigh_entry *n, **np; LIST_HEAD(head); u32 h; + mutex_lock(&tbl->phash_lock); + for (h = 0; h <= PNEIGH_HASHMASK; h++) { np = &tbl->phash_buckets[h]; while ((n = *np) != NULL) { @@ -845,7 +847,7 @@ static void pneigh_ifdown_and_unlock(struct neigh_table *tbl, } } - write_unlock_bh(&tbl->lock); + mutex_unlock(&tbl->phash_lock); while (!list_empty(&head)) { n = list_first_entry(&head, typeof(*n), free_node); @@ -1792,6 +1794,7 @@ void neigh_table_init(int index, struct neigh_table *tbl) WARN_ON(tbl->entry_size % NEIGH_PRIV_ALIGN); rwlock_init(&tbl->lock); + mutex_init(&tbl->phash_lock); INIT_DEFERRABLE_WORK(&tbl->gc_work, neigh_periodic_work); queue_delayed_work(system_power_efficient_wq, &tbl->gc_work, -- 2.50.0.727.gbf7dc18ff4-goog neigh_add() updates pneigh_entry() found or created by pneigh_create(). This update is serialised by RTNL, but we will remove it. Let's move the update part to pneigh_create() and make it return errno instead of a pointer of pneigh_entry. Now, the pneigh code is RTNL free. Signed-off-by: Kuniyuki Iwashima --- include/net/neighbour.h | 5 +++-- net/core/neighbour.c | 34 ++++++++++++++++------------------ net/ipv4/arp.c | 4 +--- 3 files changed, 20 insertions(+), 23 deletions(-) diff --git a/include/net/neighbour.h b/include/net/neighbour.h index af6fe50703041..a4f72db41ed69 100644 --- a/include/net/neighbour.h +++ b/include/net/neighbour.h @@ -382,8 +382,9 @@ void pneigh_enqueue(struct neigh_table *tbl, struct neigh_parms *p, struct sk_buff *skb); struct pneigh_entry *pneigh_lookup(struct neigh_table *tbl, struct net *net, const void *key, struct net_device *dev); -struct pneigh_entry *pneigh_create(struct neigh_table *tbl, struct net *net, - const void *key, struct net_device *dev); +int pneigh_create(struct neigh_table *tbl, struct net *net, const void *key, + struct net_device *dev, u32 flags, u8 protocol, + bool permanent); int pneigh_delete(struct neigh_table *tbl, struct net *net, const void *key, struct net_device *dev); diff --git a/net/core/neighbour.c b/net/core/neighbour.c index 78f2457a101c4..636fbdda0cb2e 100644 --- a/net/core/neighbour.c +++ b/net/core/neighbour.c @@ -747,24 +747,27 @@ struct pneigh_entry *pneigh_lookup(struct neigh_table *tbl, } EXPORT_IPV6_MOD(pneigh_lookup); -struct pneigh_entry *pneigh_create(struct neigh_table *tbl, - struct net *net, const void *pkey, - struct net_device *dev) +int pneigh_create(struct neigh_table *tbl, struct net *net, + const void *pkey, struct net_device *dev, + u32 flags, u8 protocol, bool permanent) { struct pneigh_entry *n; unsigned int key_len; u32 hash_val; + int err = 0; mutex_lock(&tbl->phash_lock); n = pneigh_lookup(tbl, net, pkey, dev); if (n) - goto out; + goto update; key_len = tbl->key_len; n = kzalloc(sizeof(*n) + key_len, GFP_KERNEL); - if (!n) + if (!n) { + err = -ENOBUFS; goto out; + } write_pnet(&n->net, net); memcpy(n->key, pkey, key_len); @@ -774,16 +777,20 @@ struct pneigh_entry *pneigh_create(struct neigh_table *tbl, if (tbl->pconstructor && tbl->pconstructor(n)) { netdev_put(dev, &n->dev_tracker); kfree(n); - n = NULL; + err = -ENOBUFS; goto out; } hash_val = pneigh_hash(pkey, key_len); n->next = tbl->phash_buckets[hash_val]; rcu_assign_pointer(tbl->phash_buckets[hash_val], n); +update: + WRITE_ONCE(n->flags, flags); + n->permanent = permanent; + WRITE_ONCE(n->protocol, protocol); out: mutex_unlock(&tbl->phash_lock); - return n; + return err; } static void pneigh_destroy(struct rcu_head *rcu) @@ -2011,22 +2018,13 @@ static int neigh_add(struct sk_buff *skb, struct nlmsghdr *nlh, if (tb[NDA_PROTOCOL]) protocol = nla_get_u8(tb[NDA_PROTOCOL]); if (ndm_flags & NTF_PROXY) { - struct pneigh_entry *pn; - if (ndm_flags & (NTF_MANAGED | NTF_EXT_VALIDATED)) { NL_SET_ERR_MSG(extack, "Invalid NTF_* flag combination"); goto out; } - err = -ENOBUFS; - pn = pneigh_create(tbl, net, dst, dev); - if (pn) { - WRITE_ONCE(pn->flags, ndm_flags); - pn->permanent = !!(ndm->ndm_state & NUD_PERMANENT); - if (protocol) - WRITE_ONCE(pn->protocol, protocol); - err = 0; - } + err = pneigh_create(tbl, net, dst, dev, ndm_flags, protocol, + !!(ndm->ndm_state & NUD_PERMANENT)); goto out; } diff --git a/net/ipv4/arp.c b/net/ipv4/arp.c index d93b5735b0ba4..5cfc1c9396732 100644 --- a/net/ipv4/arp.c +++ b/net/ipv4/arp.c @@ -1089,9 +1089,7 @@ static int arp_req_set_public(struct net *net, struct arpreq *r, if (mask) { __be32 ip = ((struct sockaddr_in *)&r->arp_pa)->sin_addr.s_addr; - if (!pneigh_create(&arp_tbl, net, &ip, dev)) - return -ENOBUFS; - return 0; + return pneigh_create(&arp_tbl, net, &ip, dev, 0, 0, false); } return arp_req_set_proxy(net, dev, 1); -- 2.50.0.727.gbf7dc18ff4-goog