neigh_get() passes 4 local variable pointers to neigh_valid_get_req(). If it returns a pointer of struct ndmsg, we do not need to pass two of them. Signed-off-by: Kuniyuki Iwashima --- v3: Return ERR_PTR() direclty instead of goto in neigh_valid_get_req(). --- net/core/neighbour.c | 43 +++++++++++++++++++------------------------ 1 file changed, 19 insertions(+), 24 deletions(-) diff --git a/net/core/neighbour.c b/net/core/neighbour.c index 43a5dcbb5f9c7..eb074d602ed08 100644 --- a/net/core/neighbour.c +++ b/net/core/neighbour.c @@ -2910,10 +2910,9 @@ static int neigh_dump_info(struct sk_buff *skb, struct netlink_callback *cb) return err; } -static int neigh_valid_get_req(const struct nlmsghdr *nlh, - struct neigh_table **tbl, - void **dst, int *dev_idx, u8 *ndm_flags, - struct netlink_ext_ack *extack) +static struct ndmsg *neigh_valid_get_req(const struct nlmsghdr *nlh, + struct neigh_table **tbl, void **dst, + struct netlink_ext_ack *extack) { struct nlattr *tb[NDA_MAX + 1]; struct ndmsg *ndm; @@ -2922,31 +2921,29 @@ static int neigh_valid_get_req(const struct nlmsghdr *nlh, ndm = nlmsg_payload(nlh, sizeof(*ndm)); if (!ndm) { NL_SET_ERR_MSG(extack, "Invalid header for neighbor get request"); - return -EINVAL; + return ERR_PTR(-EINVAL); } if (ndm->ndm_pad1 || ndm->ndm_pad2 || ndm->ndm_state || ndm->ndm_type) { NL_SET_ERR_MSG(extack, "Invalid values in header for neighbor get request"); - return -EINVAL; + return ERR_PTR(-EINVAL); } if (ndm->ndm_flags & ~NTF_PROXY) { NL_SET_ERR_MSG(extack, "Invalid flags in header for neighbor get request"); - return -EINVAL; + return ERR_PTR(-EINVAL); } err = nlmsg_parse_deprecated_strict(nlh, sizeof(struct ndmsg), tb, NDA_MAX, nda_policy, extack); if (err < 0) - return err; + return ERR_PTR(err); - *ndm_flags = ndm->ndm_flags; - *dev_idx = ndm->ndm_ifindex; *tbl = neigh_find_table(ndm->ndm_family); - if (*tbl == NULL) { + if (!*tbl) { NL_SET_ERR_MSG(extack, "Unsupported family in header for neighbor get request"); - return -EAFNOSUPPORT; + return ERR_PTR(-EAFNOSUPPORT); } for (i = 0; i <= NDA_MAX; ++i) { @@ -2957,17 +2954,17 @@ static int neigh_valid_get_req(const struct nlmsghdr *nlh, case NDA_DST: if (nla_len(tb[i]) != (int)(*tbl)->key_len) { NL_SET_ERR_MSG(extack, "Invalid network address in neighbor get request"); - return -EINVAL; + return ERR_PTR(-EINVAL); } *dst = nla_data(tb[i]); break; default: NL_SET_ERR_MSG(extack, "Unsupported attribute in neighbor get request"); - return -EINVAL; + return ERR_PTR(-EINVAL); } } - return 0; + return ndm; } static inline size_t neigh_nlmsg_size(void) @@ -3038,18 +3035,16 @@ static int neigh_get(struct sk_buff *in_skb, struct nlmsghdr *nlh, struct net_device *dev = NULL; struct neigh_table *tbl = NULL; struct neighbour *neigh; + struct ndmsg *ndm; void *dst = NULL; - u8 ndm_flags = 0; - int dev_idx = 0; int err; - err = neigh_valid_get_req(nlh, &tbl, &dst, &dev_idx, &ndm_flags, - extack); - if (err < 0) - return err; + ndm = neigh_valid_get_req(nlh, &tbl, &dst, extack); + if (IS_ERR(ndm)) + return PTR_ERR(ndm); - if (dev_idx) { - dev = __dev_get_by_index(net, dev_idx); + if (ndm->ndm_ifindex) { + dev = __dev_get_by_index(net, ndm->ndm_ifindex); if (!dev) { NL_SET_ERR_MSG(extack, "Unknown device ifindex"); return -ENODEV; @@ -3061,7 +3056,7 @@ static int neigh_get(struct sk_buff *in_skb, struct nlmsghdr *nlh, return -EINVAL; } - if (ndm_flags & NTF_PROXY) { + if (ndm->ndm_flags & NTF_PROXY) { struct pneigh_entry *pn; pn = pneigh_lookup(tbl, net, dst, dev, 0); -- 2.50.0.727.gbf7dc18ff4-goog We will remove RTNL for neigh_get() and run it under RCU instead. neigh_get() returns -EINVAL in the following cases: * NDA_DST is not specified * Both ndm->ndm_ifindex and NTF_PROXY are not specified These validations do not require RCU. Let's move them to neigh_valid_get_req(). While at it, the extack string for the first case is replaced with NL_SET_ERR_ATTR_MISS(). Signed-off-by: Kuniyuki Iwashima --- v3: Use NL_SET_ERR_ATTR_MISS() for NDA_DST --- net/core/neighbour.c | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/net/core/neighbour.c b/net/core/neighbour.c index eb074d602ed08..babedf5e68c44 100644 --- a/net/core/neighbour.c +++ b/net/core/neighbour.c @@ -2935,6 +2935,11 @@ static struct ndmsg *neigh_valid_get_req(const struct nlmsghdr *nlh, return ERR_PTR(-EINVAL); } + if (!(ndm->ndm_flags & NTF_PROXY) && !ndm->ndm_ifindex) { + NL_SET_ERR_MSG(extack, "No device specified"); + return ERR_PTR(-EINVAL); + } + err = nlmsg_parse_deprecated_strict(nlh, sizeof(struct ndmsg), tb, NDA_MAX, nda_policy, extack); if (err < 0) @@ -2947,11 +2952,13 @@ static struct ndmsg *neigh_valid_get_req(const struct nlmsghdr *nlh, } for (i = 0; i <= NDA_MAX; ++i) { - if (!tb[i]) - continue; - switch (i) { case NDA_DST: + if (!tb[i]) { + NL_SET_ERR_ATTR_MISS(extack, NULL, NDA_DST); + return ERR_PTR(-EINVAL); + } + if (nla_len(tb[i]) != (int)(*tbl)->key_len) { NL_SET_ERR_MSG(extack, "Invalid network address in neighbor get request"); return ERR_PTR(-EINVAL); @@ -2959,6 +2966,9 @@ static struct ndmsg *neigh_valid_get_req(const struct nlmsghdr *nlh, *dst = nla_data(tb[i]); break; default: + if (!tb[i]) + continue; + NL_SET_ERR_MSG(extack, "Unsupported attribute in neighbor get request"); return ERR_PTR(-EINVAL); } @@ -3051,11 +3061,6 @@ static int neigh_get(struct sk_buff *in_skb, struct nlmsghdr *nlh, } } - if (!dst) { - NL_SET_ERR_MSG(extack, "Network address not specified"); - return -EINVAL; - } - if (ndm->ndm_flags & NTF_PROXY) { struct pneigh_entry *pn; @@ -3068,11 +3073,6 @@ static int neigh_get(struct sk_buff *in_skb, struct nlmsghdr *nlh, nlh->nlmsg_seq, tbl); } - if (!dev) { - NL_SET_ERR_MSG(extack, "No device specified"); - return -EINVAL; - } - neigh = neigh_lookup(tbl, dst, dev); if (!neigh) { NL_SET_ERR_MSG(extack, "Neighbour entry not found"); -- 2.50.0.727.gbf7dc18ff4-goog We will remove RTNL for neigh_get() and run it under RCU instead. neigh_get_reply() and pneigh_get_reply() allocate skb with GFP_KERNEL. Let's move the allocation before __dev_get_by_index() in neigh_get(). Now, neigh_get_reply() and pneigh_get_reply() are inlined and rtnl_unicast() is factorised. We will convert pneigh_lookup() to __pneigh_lookup() later. Signed-off-by: Kuniyuki Iwashima --- v3: Rename label 'err' to 'err_free_skb' --- net/core/neighbour.c | 88 ++++++++++++++++---------------------------- 1 file changed, 32 insertions(+), 56 deletions(-) diff --git a/net/core/neighbour.c b/net/core/neighbour.c index babedf5e68c44..e73a4e37ad0a0 100644 --- a/net/core/neighbour.c +++ b/net/core/neighbour.c @@ -2988,27 +2988,6 @@ static inline size_t neigh_nlmsg_size(void) + nla_total_size(1); /* NDA_PROTOCOL */ } -static int neigh_get_reply(struct net *net, struct neighbour *neigh, - u32 pid, u32 seq) -{ - struct sk_buff *skb; - int err = 0; - - skb = nlmsg_new(neigh_nlmsg_size(), GFP_KERNEL); - if (!skb) - return -ENOBUFS; - - err = neigh_fill_info(skb, neigh, pid, seq, RTM_NEWNEIGH, 0); - if (err) { - kfree_skb(skb); - goto errout; - } - - err = rtnl_unicast(skb, net, pid); -errout: - return err; -} - static inline size_t pneigh_nlmsg_size(void) { return NLMSG_ALIGN(sizeof(struct ndmsg)) @@ -3017,34 +2996,16 @@ static inline size_t pneigh_nlmsg_size(void) + nla_total_size(1); /* NDA_PROTOCOL */ } -static int pneigh_get_reply(struct net *net, struct pneigh_entry *neigh, - u32 pid, u32 seq, struct neigh_table *tbl) -{ - struct sk_buff *skb; - int err = 0; - - skb = nlmsg_new(pneigh_nlmsg_size(), GFP_KERNEL); - if (!skb) - return -ENOBUFS; - - err = pneigh_fill_info(skb, neigh, pid, seq, RTM_NEWNEIGH, 0, tbl); - if (err) { - kfree_skb(skb); - goto errout; - } - - err = rtnl_unicast(skb, net, pid); -errout: - return err; -} - static int neigh_get(struct sk_buff *in_skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { struct net *net = sock_net(in_skb->sk); + u32 pid = NETLINK_CB(in_skb).portid; struct net_device *dev = NULL; struct neigh_table *tbl = NULL; + u32 seq = nlh->nlmsg_seq; struct neighbour *neigh; + struct sk_buff *skb; struct ndmsg *ndm; void *dst = NULL; int err; @@ -3053,11 +3014,19 @@ static int neigh_get(struct sk_buff *in_skb, struct nlmsghdr *nlh, if (IS_ERR(ndm)) return PTR_ERR(ndm); + if (ndm->ndm_flags & NTF_PROXY) + skb = nlmsg_new(neigh_nlmsg_size(), GFP_KERNEL); + else + skb = nlmsg_new(pneigh_nlmsg_size(), GFP_KERNEL); + if (!skb) + return -ENOBUFS; + if (ndm->ndm_ifindex) { dev = __dev_get_by_index(net, ndm->ndm_ifindex); if (!dev) { NL_SET_ERR_MSG(extack, "Unknown device ifindex"); - return -ENODEV; + err = -ENODEV; + goto err_free_skb; } } @@ -3067,23 +3036,30 @@ static int neigh_get(struct sk_buff *in_skb, struct nlmsghdr *nlh, pn = pneigh_lookup(tbl, net, dst, dev, 0); if (!pn) { NL_SET_ERR_MSG(extack, "Proxy neighbour entry not found"); - return -ENOENT; + err = -ENOENT; + goto err_free_skb; } - return pneigh_get_reply(net, pn, NETLINK_CB(in_skb).portid, - nlh->nlmsg_seq, tbl); - } - - neigh = neigh_lookup(tbl, dst, dev); - if (!neigh) { - NL_SET_ERR_MSG(extack, "Neighbour entry not found"); - return -ENOENT; - } - err = neigh_get_reply(net, neigh, NETLINK_CB(in_skb).portid, - nlh->nlmsg_seq); + err = pneigh_fill_info(skb, pn, pid, seq, RTM_NEWNEIGH, 0, tbl); + if (err) + goto err_free_skb; + } else { + neigh = neigh_lookup(tbl, dst, dev); + if (!neigh) { + NL_SET_ERR_MSG(extack, "Neighbour entry not found"); + err = -ENOENT; + goto err_free_skb; + } - neigh_release(neigh); + err = neigh_fill_info(skb, neigh, pid, seq, RTM_NEWNEIGH, 0); + neigh_release(neigh); + if (err) + goto err_free_skb; + } + return rtnl_unicast(skb, net, pid); +err_free_skb: + kfree_skb(skb); return err; } -- 2.50.0.727.gbf7dc18ff4-goog neigh_valid_get_req() calls neigh_find_table() to fetch neigh_tables[]. neigh_find_table() uses rcu_dereference_rtnl(), but RTNL actually does not protect it at all; neigh_table_clear() can be called without RTNL and only waits for RCU readers by synchronize_rcu(). Fortunately, there is no bug because IPv4 is built-in, IPv6 cannot be unloaded, and DECNET was removed. To fetch neigh_tables[] by rcu_dereference() later, let's move neigh_find_table() from neigh_valid_get_req() to neigh_get(). Signed-off-by: Kuniyuki Iwashima --- net/core/neighbour.c | 37 ++++++++++++++++++++----------------- 1 file changed, 20 insertions(+), 17 deletions(-) diff --git a/net/core/neighbour.c b/net/core/neighbour.c index e73a4e37ad0a0..c6ff53df9a012 100644 --- a/net/core/neighbour.c +++ b/net/core/neighbour.c @@ -2911,10 +2911,9 @@ static int neigh_dump_info(struct sk_buff *skb, struct netlink_callback *cb) } static struct ndmsg *neigh_valid_get_req(const struct nlmsghdr *nlh, - struct neigh_table **tbl, void **dst, + struct nlattr **tb, struct netlink_ext_ack *extack) { - struct nlattr *tb[NDA_MAX + 1]; struct ndmsg *ndm; int err, i; @@ -2945,12 +2944,6 @@ static struct ndmsg *neigh_valid_get_req(const struct nlmsghdr *nlh, if (err < 0) return ERR_PTR(err); - *tbl = neigh_find_table(ndm->ndm_family); - if (!*tbl) { - NL_SET_ERR_MSG(extack, "Unsupported family in header for neighbor get request"); - return ERR_PTR(-EAFNOSUPPORT); - } - for (i = 0; i <= NDA_MAX; ++i) { switch (i) { case NDA_DST: @@ -2958,12 +2951,6 @@ static struct ndmsg *neigh_valid_get_req(const struct nlmsghdr *nlh, NL_SET_ERR_ATTR_MISS(extack, NULL, NDA_DST); return ERR_PTR(-EINVAL); } - - if (nla_len(tb[i]) != (int)(*tbl)->key_len) { - NL_SET_ERR_MSG(extack, "Invalid network address in neighbor get request"); - return ERR_PTR(-EINVAL); - } - *dst = nla_data(tb[i]); break; default: if (!tb[i]) @@ -3001,16 +2988,17 @@ static int neigh_get(struct sk_buff *in_skb, struct nlmsghdr *nlh, { struct net *net = sock_net(in_skb->sk); u32 pid = NETLINK_CB(in_skb).portid; + struct nlattr *tb[NDA_MAX + 1]; struct net_device *dev = NULL; - struct neigh_table *tbl = NULL; u32 seq = nlh->nlmsg_seq; + struct neigh_table *tbl; struct neighbour *neigh; struct sk_buff *skb; struct ndmsg *ndm; - void *dst = NULL; + void *dst; int err; - ndm = neigh_valid_get_req(nlh, &tbl, &dst, extack); + ndm = neigh_valid_get_req(nlh, tb, extack); if (IS_ERR(ndm)) return PTR_ERR(ndm); @@ -3021,6 +3009,21 @@ static int neigh_get(struct sk_buff *in_skb, struct nlmsghdr *nlh, if (!skb) return -ENOBUFS; + tbl = neigh_find_table(ndm->ndm_family); + if (!tbl) { + NL_SET_ERR_MSG(extack, "Unsupported family in header for neighbor get request"); + err = -EAFNOSUPPORT; + goto err_free_skb; + } + + if (nla_len(tb[NDA_DST]) != (int)tbl->key_len) { + NL_SET_ERR_MSG(extack, "Invalid network address in neighbor get request"); + err = -EINVAL; + goto err_free_skb; + } + + dst = nla_data(tb[NDA_DST]); + if (ndm->ndm_ifindex) { dev = __dev_get_by_index(net, ndm->ndm_ifindex); if (!dev) { -- 2.50.0.727.gbf7dc18ff4-goog pneigh_lookup() has ASSERT_RTNL() in the middle of the function, which is confusing. When called with the last argument, creat, 0, pneigh_lookup() literally looks up a proxy neighbour entry. This is the case of the reader path as the fast path and RTM_GETNEIGH. pneigh_lookup(), however, creates a pneigh_entry when called with creat 1 from RTM_NEWNEIGH and SIOCSARP, which require RTNL. Let's split pneigh_lookup() into two functions. We will convert all the reader paths to RCU, and read_lock_bh(&tbl->lock) in the new pneigh_lookup() will be dropped. Signed-off-by: Kuniyuki Iwashima --- include/net/neighbour.h | 5 +++-- net/core/neighbour.c | 39 +++++++++++++++++++++++++++++---------- net/ipv4/arp.c | 4 ++-- net/ipv6/ip6_output.c | 2 +- net/ipv6/ndisc.c | 2 +- 5 files changed, 36 insertions(+), 16 deletions(-) diff --git a/include/net/neighbour.h b/include/net/neighbour.h index 7e865b14749d6..7f3d57da5689a 100644 --- a/include/net/neighbour.h +++ b/include/net/neighbour.h @@ -376,10 +376,11 @@ unsigned long neigh_rand_reach_time(unsigned long base); void pneigh_enqueue(struct neigh_table *tbl, struct neigh_parms *p, struct sk_buff *skb); struct pneigh_entry *pneigh_lookup(struct neigh_table *tbl, struct net *net, - const void *key, struct net_device *dev, - int creat); + const void *key, struct net_device *dev); struct pneigh_entry *__pneigh_lookup(struct neigh_table *tbl, struct net *net, const void *key, struct net_device *dev); +struct pneigh_entry *pneigh_create(struct neigh_table *tbl, struct net *net, + const void *key, struct net_device *dev); int pneigh_delete(struct neigh_table *tbl, struct net *net, const void *key, struct net_device *dev); diff --git a/net/core/neighbour.c b/net/core/neighbour.c index c6ff53df9a012..b073b14799ce1 100644 --- a/net/core/neighbour.c +++ b/net/core/neighbour.c @@ -28,6 +28,7 @@ #include #include #include +#include #include #include #include @@ -746,24 +747,44 @@ struct pneigh_entry *__pneigh_lookup(struct neigh_table *tbl, } EXPORT_SYMBOL_GPL(__pneigh_lookup); -struct pneigh_entry * pneigh_lookup(struct neigh_table *tbl, - struct net *net, const void *pkey, - struct net_device *dev, int creat) +struct pneigh_entry *pneigh_lookup(struct neigh_table *tbl, + struct net *net, const void *pkey, + struct net_device *dev) +{ + struct pneigh_entry *n; + unsigned int key_len; + u32 hash_val; + + key_len = tbl->key_len; + hash_val = pneigh_hash(pkey, key_len); + + read_lock_bh(&tbl->lock); + n = __pneigh_lookup_1(tbl->phash_buckets[hash_val], + net, pkey, key_len, dev); + read_unlock_bh(&tbl->lock); + + return n; +} +EXPORT_IPV6_MOD(pneigh_lookup); + +struct pneigh_entry *pneigh_create(struct neigh_table *tbl, + struct net *net, const void *pkey, + struct net_device *dev) { struct pneigh_entry *n; unsigned int key_len = tbl->key_len; u32 hash_val = pneigh_hash(pkey, key_len); + ASSERT_RTNL(); + read_lock_bh(&tbl->lock); n = __pneigh_lookup_1(tbl->phash_buckets[hash_val], net, pkey, key_len, dev); read_unlock_bh(&tbl->lock); - if (n || !creat) + if (n) goto out; - ASSERT_RTNL(); - n = kzalloc(sizeof(*n) + key_len, GFP_KERNEL); if (!n) goto out; @@ -787,8 +808,6 @@ struct pneigh_entry * pneigh_lookup(struct neigh_table *tbl, out: return n; } -EXPORT_SYMBOL(pneigh_lookup); - int pneigh_delete(struct neigh_table *tbl, struct net *net, const void *pkey, struct net_device *dev) @@ -2007,7 +2026,7 @@ static int neigh_add(struct sk_buff *skb, struct nlmsghdr *nlh, } err = -ENOBUFS; - pn = pneigh_lookup(tbl, net, dst, dev, 1); + pn = pneigh_create(tbl, net, dst, dev); if (pn) { pn->flags = ndm_flags; pn->permanent = !!(ndm->ndm_state & NUD_PERMANENT); @@ -3036,7 +3055,7 @@ static int neigh_get(struct sk_buff *in_skb, struct nlmsghdr *nlh, if (ndm->ndm_flags & NTF_PROXY) { struct pneigh_entry *pn; - pn = pneigh_lookup(tbl, net, dst, dev, 0); + pn = pneigh_lookup(tbl, net, dst, dev); if (!pn) { NL_SET_ERR_MSG(extack, "Proxy neighbour entry not found"); err = -ENOENT; diff --git a/net/ipv4/arp.c b/net/ipv4/arp.c index c0440d61cf2ff..d93b5735b0ba4 100644 --- a/net/ipv4/arp.c +++ b/net/ipv4/arp.c @@ -864,7 +864,7 @@ static int arp_process(struct net *net, struct sock *sk, struct sk_buff *skb) (arp_fwd_proxy(in_dev, dev, rt) || arp_fwd_pvlan(in_dev, dev, rt, sip, tip) || (rt->dst.dev != dev && - pneigh_lookup(&arp_tbl, net, &tip, dev, 0)))) { + pneigh_lookup(&arp_tbl, net, &tip, dev)))) { n = neigh_event_ns(&arp_tbl, sha, &sip, dev); if (n) neigh_release(n); @@ -1089,7 +1089,7 @@ static int arp_req_set_public(struct net *net, struct arpreq *r, if (mask) { __be32 ip = ((struct sockaddr_in *)&r->arp_pa)->sin_addr.s_addr; - if (!pneigh_lookup(&arp_tbl, net, &ip, dev, 1)) + if (!pneigh_create(&arp_tbl, net, &ip, dev)) return -ENOBUFS; return 0; } diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c index fcc20c7250eb0..0412f85446958 100644 --- a/net/ipv6/ip6_output.c +++ b/net/ipv6/ip6_output.c @@ -563,7 +563,7 @@ int ip6_forward(struct sk_buff *skb) /* XXX: idev->cnf.proxy_ndp? */ if (READ_ONCE(net->ipv6.devconf_all->proxy_ndp) && - pneigh_lookup(&nd_tbl, net, &hdr->daddr, skb->dev, 0)) { + pneigh_lookup(&nd_tbl, net, &hdr->daddr, skb->dev)) { int proxied = ip6_forward_proxy_check(skb); if (proxied > 0) { /* It's tempting to decrease the hop limit diff --git a/net/ipv6/ndisc.c b/net/ipv6/ndisc.c index d4c5876e17718..a3ac26c1df6d8 100644 --- a/net/ipv6/ndisc.c +++ b/net/ipv6/ndisc.c @@ -1100,7 +1100,7 @@ static enum skb_drop_reason ndisc_recv_na(struct sk_buff *skb) if (lladdr && !memcmp(lladdr, dev->dev_addr, dev->addr_len) && READ_ONCE(net->ipv6.devconf_all->forwarding) && READ_ONCE(net->ipv6.devconf_all->proxy_ndp) && - pneigh_lookup(&nd_tbl, net, &msg->target, dev, 0)) { + pneigh_lookup(&nd_tbl, net, &msg->target, dev)) { /* XXX: idev->cnf.proxy_ndp */ goto out; } -- 2.50.0.727.gbf7dc18ff4-goog The next patch will free pneigh_entry with call_rcu(). Then, we need to annotate neigh_table.phash_buckets[] and pneigh_entry.next with __rcu. To make the next patch cleaner, let's annotate the fields in advance. Currently, all accesses to the fields are under the neigh table lock, so rcu_dereference_protected() is used with 1 for now, but most of them (except in pneigh_delete() and pneigh_ifdown_and_unlock()) will be replaced with rcu_dereference() and rcu_dereference_check(). Note that pneigh_ifdown_and_unlock() changes pneigh_entry.next to a local list, which is illegal because the RCU iterator could be moved to another list. This part will be fixed in the next patch. Signed-off-by: Kuniyuki Iwashima --- include/net/neighbour.h | 4 ++-- net/core/neighbour.c | 52 ++++++++++++++++++++++++----------------- 2 files changed, 33 insertions(+), 23 deletions(-) diff --git a/include/net/neighbour.h b/include/net/neighbour.h index 7f3d57da5689a..1ddc44a042000 100644 --- a/include/net/neighbour.h +++ b/include/net/neighbour.h @@ -176,7 +176,7 @@ struct neigh_ops { }; struct pneigh_entry { - struct pneigh_entry *next; + struct pneigh_entry __rcu *next; possible_net_t net; struct net_device *dev; netdevice_tracker dev_tracker; @@ -236,7 +236,7 @@ struct neigh_table { unsigned long last_rand; struct neigh_statistics __percpu *stats; struct neigh_hash_table __rcu *nht; - struct pneigh_entry **phash_buckets; + struct pneigh_entry __rcu **phash_buckets; }; static inline int neigh_parms_family(struct neigh_parms *p) diff --git a/net/core/neighbour.c b/net/core/neighbour.c index b073b14799ce1..8bbf3ec592906 100644 --- a/net/core/neighbour.c +++ b/net/core/neighbour.c @@ -731,7 +731,8 @@ static struct pneigh_entry *__pneigh_lookup_1(struct pneigh_entry *n, net_eq(pneigh_net(n), net) && (n->dev == dev || !n->dev)) return n; - n = n->next; + + n = rcu_dereference_protected(n->next, 1); } return NULL; } @@ -742,7 +743,7 @@ struct pneigh_entry *__pneigh_lookup(struct neigh_table *tbl, unsigned int key_len = tbl->key_len; u32 hash_val = pneigh_hash(pkey, key_len); - return __pneigh_lookup_1(tbl->phash_buckets[hash_val], + return __pneigh_lookup_1(rcu_dereference_protected(tbl->phash_buckets[hash_val], 1), net, pkey, key_len, dev); } EXPORT_SYMBOL_GPL(__pneigh_lookup); @@ -759,7 +760,7 @@ struct pneigh_entry *pneigh_lookup(struct neigh_table *tbl, hash_val = pneigh_hash(pkey, key_len); read_lock_bh(&tbl->lock); - n = __pneigh_lookup_1(tbl->phash_buckets[hash_val], + n = __pneigh_lookup_1(rcu_dereference_protected(tbl->phash_buckets[hash_val], 1), net, pkey, key_len, dev); read_unlock_bh(&tbl->lock); @@ -778,7 +779,7 @@ struct pneigh_entry *pneigh_create(struct neigh_table *tbl, ASSERT_RTNL(); read_lock_bh(&tbl->lock); - n = __pneigh_lookup_1(tbl->phash_buckets[hash_val], + n = __pneigh_lookup_1(rcu_dereference_protected(tbl->phash_buckets[hash_val], 1), net, pkey, key_len, dev); read_unlock_bh(&tbl->lock); @@ -803,7 +804,7 @@ struct pneigh_entry *pneigh_create(struct neigh_table *tbl, write_lock_bh(&tbl->lock); n->next = tbl->phash_buckets[hash_val]; - tbl->phash_buckets[hash_val] = n; + rcu_assign_pointer(tbl->phash_buckets[hash_val], n); write_unlock_bh(&tbl->lock); out: return n; @@ -812,16 +813,20 @@ struct pneigh_entry *pneigh_create(struct neigh_table *tbl, int pneigh_delete(struct neigh_table *tbl, struct net *net, const void *pkey, struct net_device *dev) { - struct pneigh_entry *n, **np; - unsigned int key_len = tbl->key_len; - u32 hash_val = pneigh_hash(pkey, key_len); + struct pneigh_entry *n, __rcu **np; + unsigned int key_len; + u32 hash_val; + + key_len = tbl->key_len; + hash_val = pneigh_hash(pkey, key_len); write_lock_bh(&tbl->lock); - for (np = &tbl->phash_buckets[hash_val]; (n = *np) != NULL; + for (np = &tbl->phash_buckets[hash_val]; + (n = rcu_dereference_protected(*np, 1)) != NULL; np = &n->next) { if (!memcmp(n->key, pkey, key_len) && n->dev == dev && net_eq(pneigh_net(n), net)) { - *np = n->next; + rcu_assign_pointer(*np, n->next); write_unlock_bh(&tbl->lock); if (tbl->pdestructor) tbl->pdestructor(n); @@ -838,17 +843,17 @@ static int pneigh_ifdown_and_unlock(struct neigh_table *tbl, struct net_device *dev, bool skip_perm) { - struct pneigh_entry *n, **np, *freelist = NULL; + struct pneigh_entry *n, __rcu **np, *freelist = NULL; u32 h; for (h = 0; h <= PNEIGH_HASHMASK; h++) { np = &tbl->phash_buckets[h]; - while ((n = *np) != NULL) { + while ((n = rcu_dereference_protected(*np, 1)) != NULL) { if (skip_perm && n->permanent) goto skip; if (!dev || n->dev == dev) { - *np = n->next; - n->next = freelist; + rcu_assign_pointer(*np, n->next); + rcu_assign_pointer(n->next, freelist); freelist = n; continue; } @@ -858,7 +863,7 @@ static int pneigh_ifdown_and_unlock(struct neigh_table *tbl, } write_unlock_bh(&tbl->lock); while ((n = freelist)) { - freelist = n->next; + freelist = rcu_dereference_protected(n->next, 1); n->next = NULL; if (tbl->pdestructor) tbl->pdestructor(n); @@ -2794,7 +2799,9 @@ static int pneigh_dump_table(struct neigh_table *tbl, struct sk_buff *skb, for (h = s_h; h <= PNEIGH_HASHMASK; h++) { if (h > s_h) s_idx = 0; - for (n = tbl->phash_buckets[h], idx = 0; n; n = n->next) { + for (n = rcu_dereference_protected(tbl->phash_buckets[h], 1), idx = 0; + n; + n = rcu_dereference_protected(n->next, 1)) { if (idx < s_idx || pneigh_net(n) != net) goto next; if (neigh_ifindex_filtered(n->dev, filter->dev_idx) || @@ -3288,9 +3295,10 @@ static struct pneigh_entry *pneigh_get_first(struct seq_file *seq) state->flags |= NEIGH_SEQ_IS_PNEIGH; for (bucket = 0; bucket <= PNEIGH_HASHMASK; bucket++) { - pn = tbl->phash_buckets[bucket]; + pn = rcu_dereference_protected(tbl->phash_buckets[bucket], 1); + while (pn && !net_eq(pneigh_net(pn), net)) - pn = pn->next; + pn = rcu_dereference_protected(pn->next, 1); if (pn) break; } @@ -3308,15 +3316,17 @@ static struct pneigh_entry *pneigh_get_next(struct seq_file *seq, struct neigh_table *tbl = state->tbl; do { - pn = pn->next; + pn = rcu_dereference_protected(pn->next, 1); } while (pn && !net_eq(pneigh_net(pn), net)); while (!pn) { if (++state->bucket > PNEIGH_HASHMASK) break; - pn = tbl->phash_buckets[state->bucket]; + + pn = rcu_dereference_protected(tbl->phash_buckets[state->bucket], 1); + while (pn && !net_eq(pneigh_net(pn), net)) - pn = pn->next; + pn = rcu_dereference_protected(pn->next, 1); if (pn) break; } -- 2.50.0.727.gbf7dc18ff4-goog We will convert RTM_GETNEIGH to RCU. neigh_get() looks up pneigh_entry by pneigh_lookup() and passes it to pneigh_fill_info(). Then, we must ensure that the entry is alive till pneigh_fill_info() completes, but read_lock_bh(&tbl->lock) in pneigh_lookup() does not guarantee that. Also, we will convert all readers of tbl->phash_buckets[] to RCU. Let's use call_rcu() to free pneigh_entry and update phash_buckets[] and ->next by rcu_assign_pointer(). pneigh_ifdown_and_unlock() uses list_head to avoid overwriting ->next and moving RCU iterators to another list. pndisc_destructor() (only IPv6 ndisc uses this) uses a mutex, so it is not delayed to call_rcu(), where we cannot sleep. This is fine because the mcast code works with RCU and ipv6_dev_mc_dec() frees mcast objects after RCU grace period. While at it, we change the return type of pneigh_ifdown_and_unlock() to void. Signed-off-by: Kuniyuki Iwashima --- include/net/neighbour.h | 4 ++++ net/core/neighbour.c | 45 +++++++++++++++++++++++++---------------- 2 files changed, 32 insertions(+), 17 deletions(-) diff --git a/include/net/neighbour.h b/include/net/neighbour.h index 1ddc44a042000..6d7f9aa53a7a9 100644 --- a/include/net/neighbour.h +++ b/include/net/neighbour.h @@ -180,6 +180,10 @@ struct pneigh_entry { possible_net_t net; struct net_device *dev; netdevice_tracker dev_tracker; + union { + struct list_head free_node; + struct rcu_head rcu; + }; u32 flags; u8 protocol; bool permanent; diff --git a/net/core/neighbour.c b/net/core/neighbour.c index 8bbf3ec592906..f5690e5904cba 100644 --- a/net/core/neighbour.c +++ b/net/core/neighbour.c @@ -54,9 +54,9 @@ static void neigh_timer_handler(struct timer_list *t); static void __neigh_notify(struct neighbour *n, int type, int flags, u32 pid); static void neigh_update_notify(struct neighbour *neigh, u32 nlmsg_pid); -static int pneigh_ifdown_and_unlock(struct neigh_table *tbl, - struct net_device *dev, - bool skip_perm); +static void pneigh_ifdown_and_unlock(struct neigh_table *tbl, + struct net_device *dev, + bool skip_perm); #ifdef CONFIG_PROC_FS static const struct seq_operations neigh_stat_seq_ops; @@ -810,6 +810,14 @@ struct pneigh_entry *pneigh_create(struct neigh_table *tbl, return n; } +static void pneigh_destroy(struct rcu_head *rcu) +{ + struct pneigh_entry *n = container_of(rcu, struct pneigh_entry, rcu); + + netdev_put(n->dev, &n->dev_tracker); + kfree(n); +} + int pneigh_delete(struct neigh_table *tbl, struct net *net, const void *pkey, struct net_device *dev) { @@ -828,10 +836,11 @@ int pneigh_delete(struct neigh_table *tbl, struct net *net, const void *pkey, net_eq(pneigh_net(n), net)) { rcu_assign_pointer(*np, n->next); write_unlock_bh(&tbl->lock); + if (tbl->pdestructor) tbl->pdestructor(n); - netdev_put(n->dev, &n->dev_tracker); - kfree(n); + + call_rcu(&n->rcu, pneigh_destroy); return 0; } } @@ -839,11 +848,12 @@ int pneigh_delete(struct neigh_table *tbl, struct net *net, const void *pkey, return -ENOENT; } -static int pneigh_ifdown_and_unlock(struct neigh_table *tbl, - struct net_device *dev, - bool skip_perm) +static void pneigh_ifdown_and_unlock(struct neigh_table *tbl, + struct net_device *dev, + bool skip_perm) { - struct pneigh_entry *n, __rcu **np, *freelist = NULL; + struct pneigh_entry *n, __rcu **np; + LIST_HEAD(head); u32 h; for (h = 0; h <= PNEIGH_HASHMASK; h++) { @@ -853,24 +863,25 @@ static int pneigh_ifdown_and_unlock(struct neigh_table *tbl, goto skip; if (!dev || n->dev == dev) { rcu_assign_pointer(*np, n->next); - rcu_assign_pointer(n->next, freelist); - freelist = n; + list_add(&n->free_node, &head); continue; } skip: np = &n->next; } } + write_unlock_bh(&tbl->lock); - while ((n = freelist)) { - freelist = rcu_dereference_protected(n->next, 1); - n->next = NULL; + + while (!list_empty(&head)) { + n = list_first_entry(&head, typeof(*n), free_node); + list_del(&n->free_node); + if (tbl->pdestructor) tbl->pdestructor(n); - netdev_put(n->dev, &n->dev_tracker); - kfree(n); + + call_rcu(&n->rcu, pneigh_destroy); } - return -ENOENT; } static inline void neigh_parms_put(struct neigh_parms *parms) -- 2.50.0.727.gbf7dc18ff4-goog We will convert pneigh readers to RCU, and its flags and protocol will be read locklessly. Let's annotate the access to the two fields. Note that all access to pn->permanent is under RTNL (neigh_add() and pneigh_ifdown_and_unlock()), so WRITE_ONCE() and READ_ONCE() are not needed. Signed-off-by: Kuniyuki Iwashima --- v3: Cache pn->protocol in pneigh_fill_info() --- net/core/neighbour.c | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/net/core/neighbour.c b/net/core/neighbour.c index f5690e5904cba..f58b534a706a6 100644 --- a/net/core/neighbour.c +++ b/net/core/neighbour.c @@ -2044,10 +2044,10 @@ static int neigh_add(struct sk_buff *skb, struct nlmsghdr *nlh, err = -ENOBUFS; pn = pneigh_create(tbl, net, dst, dev); if (pn) { - pn->flags = ndm_flags; + WRITE_ONCE(pn->flags, ndm_flags); pn->permanent = !!(ndm->ndm_state & NUD_PERMANENT); if (protocol) - pn->protocol = protocol; + WRITE_ONCE(pn->protocol, protocol); err = 0; } goto out; @@ -2678,13 +2678,15 @@ static int pneigh_fill_info(struct sk_buff *skb, struct pneigh_entry *pn, u32 neigh_flags, neigh_flags_ext; struct nlmsghdr *nlh; struct ndmsg *ndm; + u8 protocol; nlh = nlmsg_put(skb, pid, seq, type, sizeof(*ndm), flags); if (nlh == NULL) return -EMSGSIZE; - neigh_flags_ext = pn->flags >> NTF_EXT_SHIFT; - neigh_flags = pn->flags & NTF_OLD_MASK; + neigh_flags = READ_ONCE(pn->flags); + neigh_flags_ext = neigh_flags >> NTF_EXT_SHIFT; + neigh_flags &= NTF_OLD_MASK; ndm = nlmsg_data(nlh); ndm->ndm_family = tbl->family; @@ -2698,7 +2700,8 @@ static int pneigh_fill_info(struct sk_buff *skb, struct pneigh_entry *pn, if (nla_put(skb, NDA_DST, tbl->key_len, pn->key)) goto nla_put_failure; - if (pn->protocol && nla_put_u8(skb, NDA_PROTOCOL, pn->protocol)) + protocol = READ_ONCE(pn->protocol); + if (protocol && nla_put_u8(skb, NDA_PROTOCOL, protocol)) goto nla_put_failure; if (neigh_flags_ext && nla_put_u32(skb, NDA_FLAGS_EXT, neigh_flags_ext)) goto nla_put_failure; -- 2.50.0.727.gbf7dc18ff4-goog Only __dev_get_by_index() is the RTNL dependant in neigh_get(). Let's replace it with dev_get_by_index_rcu() and convert RTM_GETNEIGH to RCU. Signed-off-by: Kuniyuki Iwashima --- v3: Rename err label to err_unlock --- net/core/neighbour.c | 25 +++++++++++++++---------- 1 file changed, 15 insertions(+), 10 deletions(-) diff --git a/net/core/neighbour.c b/net/core/neighbour.c index f58b534a706a6..017f41792332b 100644 --- a/net/core/neighbour.c +++ b/net/core/neighbour.c @@ -3049,27 +3049,29 @@ static int neigh_get(struct sk_buff *in_skb, struct nlmsghdr *nlh, if (!skb) return -ENOBUFS; + rcu_read_lock(); + tbl = neigh_find_table(ndm->ndm_family); if (!tbl) { NL_SET_ERR_MSG(extack, "Unsupported family in header for neighbor get request"); err = -EAFNOSUPPORT; - goto err_free_skb; + goto err_unlock; } if (nla_len(tb[NDA_DST]) != (int)tbl->key_len) { NL_SET_ERR_MSG(extack, "Invalid network address in neighbor get request"); err = -EINVAL; - goto err_free_skb; + goto err_unlock; } dst = nla_data(tb[NDA_DST]); if (ndm->ndm_ifindex) { - dev = __dev_get_by_index(net, ndm->ndm_ifindex); + dev = dev_get_by_index_rcu(net, ndm->ndm_ifindex); if (!dev) { NL_SET_ERR_MSG(extack, "Unknown device ifindex"); err = -ENODEV; - goto err_free_skb; + goto err_unlock; } } @@ -3080,28 +3082,31 @@ static int neigh_get(struct sk_buff *in_skb, struct nlmsghdr *nlh, if (!pn) { NL_SET_ERR_MSG(extack, "Proxy neighbour entry not found"); err = -ENOENT; - goto err_free_skb; + goto err_unlock; } err = pneigh_fill_info(skb, pn, pid, seq, RTM_NEWNEIGH, 0, tbl); if (err) - goto err_free_skb; + goto err_unlock; } else { neigh = neigh_lookup(tbl, dst, dev); if (!neigh) { NL_SET_ERR_MSG(extack, "Neighbour entry not found"); err = -ENOENT; - goto err_free_skb; + goto err_unlock; } err = neigh_fill_info(skb, neigh, pid, seq, RTM_NEWNEIGH, 0); neigh_release(neigh); if (err) - goto err_free_skb; + goto err_unlock; } + rcu_read_unlock(); + return rtnl_unicast(skb, net, pid); -err_free_skb: +err_unlock: + rcu_read_unlock(); kfree_skb(skb); return err; } @@ -3904,7 +3909,7 @@ static const struct rtnl_msg_handler neigh_rtnl_msg_handlers[] __initconst = { {.msgtype = RTM_NEWNEIGH, .doit = neigh_add}, {.msgtype = RTM_DELNEIGH, .doit = neigh_delete}, {.msgtype = RTM_GETNEIGH, .doit = neigh_get, .dumpit = neigh_dump_info, - .flags = RTNL_FLAG_DUMP_UNLOCKED}, + .flags = RTNL_FLAG_DOIT_UNLOCKED | RTNL_FLAG_DUMP_UNLOCKED}, {.msgtype = RTM_GETNEIGHTBL, .dumpit = neightbl_dump_info}, {.msgtype = RTM_SETNEIGHTBL, .doit = neightbl_set}, }; -- 2.50.0.727.gbf7dc18ff4-goog Now pneigh_entry is guaranteed to be alive during the RCU critical section even without holding tbl->lock. Let's drop read_lock_bh(&tbl->lock) and use rcu_dereference() to iterate tbl->phash_buckets[] in pneigh_dump_table() Signed-off-by: Kuniyuki Iwashima --- net/core/neighbour.c | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) diff --git a/net/core/neighbour.c b/net/core/neighbour.c index 017f41792332b..c7e0611219710 100644 --- a/net/core/neighbour.c +++ b/net/core/neighbour.c @@ -2808,14 +2808,12 @@ static int pneigh_dump_table(struct neigh_table *tbl, struct sk_buff *skb, if (filter->dev_idx || filter->master_idx) flags |= NLM_F_DUMP_FILTERED; - read_lock_bh(&tbl->lock); - for (h = s_h; h <= PNEIGH_HASHMASK; h++) { if (h > s_h) s_idx = 0; - for (n = rcu_dereference_protected(tbl->phash_buckets[h], 1), idx = 0; + for (n = rcu_dereference(tbl->phash_buckets[h]), idx = 0; n; - n = rcu_dereference_protected(n->next, 1)) { + n = rcu_dereference(n->next)) { if (idx < s_idx || pneigh_net(n) != net) goto next; if (neigh_ifindex_filtered(n->dev, filter->dev_idx) || @@ -2824,16 +2822,13 @@ static int pneigh_dump_table(struct neigh_table *tbl, struct sk_buff *skb, err = pneigh_fill_info(skb, n, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, RTM_NEWNEIGH, flags, tbl); - if (err < 0) { - read_unlock_bh(&tbl->lock); + if (err < 0) goto out; - } next: idx++; } } - read_unlock_bh(&tbl->lock); out: cb->args[3] = h; cb->args[4] = idx; -- 2.50.0.727.gbf7dc18ff4-goog Now pneigh_entry is guaranteed to be alive during the RCU critical section even without holding tbl->lock. Let's use rcu_dereference() in pneigh_get_{first,next}(). Note that neigh_seq_start() still holds tbl->lock for the normal neighbour entry. Signed-off-by: Kuniyuki Iwashima --- net/core/neighbour.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/net/core/neighbour.c b/net/core/neighbour.c index c7e0611219710..27650f52d659d 100644 --- a/net/core/neighbour.c +++ b/net/core/neighbour.c @@ -3309,10 +3309,10 @@ static struct pneigh_entry *pneigh_get_first(struct seq_file *seq) state->flags |= NEIGH_SEQ_IS_PNEIGH; for (bucket = 0; bucket <= PNEIGH_HASHMASK; bucket++) { - pn = rcu_dereference_protected(tbl->phash_buckets[bucket], 1); + pn = rcu_dereference(tbl->phash_buckets[bucket]); while (pn && !net_eq(pneigh_net(pn), net)) - pn = rcu_dereference_protected(pn->next, 1); + pn = rcu_dereference(pn->next); if (pn) break; } @@ -3330,17 +3330,17 @@ static struct pneigh_entry *pneigh_get_next(struct seq_file *seq, struct neigh_table *tbl = state->tbl; do { - pn = rcu_dereference_protected(pn->next, 1); + pn = rcu_dereference(pn->next); } while (pn && !net_eq(pneigh_net(pn), net)); while (!pn) { if (++state->bucket > PNEIGH_HASHMASK) break; - pn = rcu_dereference_protected(tbl->phash_buckets[state->bucket], 1); + pn = rcu_dereference(tbl->phash_buckets[state->bucket]); while (pn && !net_eq(pneigh_net(pn), net)) - pn = rcu_dereference_protected(pn->next, 1); + pn = rcu_dereference(pn->next); if (pn) break; } -- 2.50.0.727.gbf7dc18ff4-goog __pneigh_lookup() is the lockless version of pneigh_lookup(), but its only caller pndisc_is_router() holds the table lock and reads pneigh_netry.flags. This is because accessing pneigh_entry after pneigh_lookup() was illegal unless the caller holds RTNL or the table lock. Now, pneigh_entry is guaranteed to be alive during the RCU critical section. Let's call pneigh_lookup() and use READ_ONCE() for n->flags in pndisc_is_router() and remove __pneigh_lookup(). Signed-off-by: Kuniyuki Iwashima --- include/net/neighbour.h | 2 -- net/core/neighbour.c | 11 ----------- net/ipv6/ndisc.c | 6 ++---- 3 files changed, 2 insertions(+), 17 deletions(-) diff --git a/include/net/neighbour.h b/include/net/neighbour.h index 6d7f9aa53a7a9..f8c7261cd4ebb 100644 --- a/include/net/neighbour.h +++ b/include/net/neighbour.h @@ -381,8 +381,6 @@ void pneigh_enqueue(struct neigh_table *tbl, struct neigh_parms *p, struct sk_buff *skb); struct pneigh_entry *pneigh_lookup(struct neigh_table *tbl, struct net *net, const void *key, struct net_device *dev); -struct pneigh_entry *__pneigh_lookup(struct neigh_table *tbl, struct net *net, - const void *key, struct net_device *dev); struct pneigh_entry *pneigh_create(struct neigh_table *tbl, struct net *net, const void *key, struct net_device *dev); int pneigh_delete(struct neigh_table *tbl, struct net *net, const void *key, diff --git a/net/core/neighbour.c b/net/core/neighbour.c index 27650f52d659d..e607bda7355ea 100644 --- a/net/core/neighbour.c +++ b/net/core/neighbour.c @@ -737,17 +737,6 @@ static struct pneigh_entry *__pneigh_lookup_1(struct pneigh_entry *n, return NULL; } -struct pneigh_entry *__pneigh_lookup(struct neigh_table *tbl, - struct net *net, const void *pkey, struct net_device *dev) -{ - unsigned int key_len = tbl->key_len; - u32 hash_val = pneigh_hash(pkey, key_len); - - return __pneigh_lookup_1(rcu_dereference_protected(tbl->phash_buckets[hash_val], 1), - net, pkey, key_len, dev); -} -EXPORT_SYMBOL_GPL(__pneigh_lookup); - struct pneigh_entry *pneigh_lookup(struct neigh_table *tbl, struct net *net, const void *pkey, struct net_device *dev) diff --git a/net/ipv6/ndisc.c b/net/ipv6/ndisc.c index a3ac26c1df6d8..7d5abb3158ec9 100644 --- a/net/ipv6/ndisc.c +++ b/net/ipv6/ndisc.c @@ -768,11 +768,9 @@ static int pndisc_is_router(const void *pkey, struct pneigh_entry *n; int ret = -1; - read_lock_bh(&nd_tbl.lock); - n = __pneigh_lookup(&nd_tbl, dev_net(dev), pkey, dev); + n = pneigh_lookup(&nd_tbl, dev_net(dev), pkey, dev); if (n) - ret = !!(n->flags & NTF_ROUTER); - read_unlock_bh(&nd_tbl.lock); + ret = !!(READ_ONCE(n->flags) & NTF_ROUTER); return ret; } -- 2.50.0.727.gbf7dc18ff4-goog Now, all callers of pneigh_lookup() are under RCU, and the read lock there is no longer needed. Let's drop the lock, inline __pneigh_lookup_1() to pneigh_lookup(), and call it from pneigh_create(). The next patch will remove tbl->lock from pneigh_create(). Signed-off-by: Kuniyuki Iwashima --- net/core/neighbour.c | 43 ++++++++++++++++--------------------------- 1 file changed, 16 insertions(+), 27 deletions(-) diff --git a/net/core/neighbour.c b/net/core/neighbour.c index e607bda7355ea..6096c8d41fc95 100644 --- a/net/core/neighbour.c +++ b/net/core/neighbour.c @@ -720,23 +720,6 @@ static u32 pneigh_hash(const void *pkey, unsigned int key_len) return hash_val; } -static struct pneigh_entry *__pneigh_lookup_1(struct pneigh_entry *n, - struct net *net, - const void *pkey, - unsigned int key_len, - struct net_device *dev) -{ - while (n) { - if (!memcmp(n->key, pkey, key_len) && - net_eq(pneigh_net(n), net) && - (n->dev == dev || !n->dev)) - return n; - - n = rcu_dereference_protected(n->next, 1); - } - return NULL; -} - struct pneigh_entry *pneigh_lookup(struct neigh_table *tbl, struct net *net, const void *pkey, struct net_device *dev) @@ -747,13 +730,19 @@ struct pneigh_entry *pneigh_lookup(struct neigh_table *tbl, key_len = tbl->key_len; hash_val = pneigh_hash(pkey, key_len); + n = rcu_dereference_check(tbl->phash_buckets[hash_val], + lockdep_is_held(&tbl->lock)); - read_lock_bh(&tbl->lock); - n = __pneigh_lookup_1(rcu_dereference_protected(tbl->phash_buckets[hash_val], 1), - net, pkey, key_len, dev); - read_unlock_bh(&tbl->lock); + while (n) { + if (!memcmp(n->key, pkey, key_len) && + net_eq(pneigh_net(n), net) && + (n->dev == dev || !n->dev)) + return n; - return n; + n = rcu_dereference_check(n->next, lockdep_is_held(&tbl->lock)); + } + + return NULL; } EXPORT_IPV6_MOD(pneigh_lookup); @@ -762,19 +751,18 @@ struct pneigh_entry *pneigh_create(struct neigh_table *tbl, struct net_device *dev) { struct pneigh_entry *n; - unsigned int key_len = tbl->key_len; - u32 hash_val = pneigh_hash(pkey, key_len); + unsigned int key_len; + u32 hash_val; ASSERT_RTNL(); read_lock_bh(&tbl->lock); - n = __pneigh_lookup_1(rcu_dereference_protected(tbl->phash_buckets[hash_val], 1), - net, pkey, key_len, dev); + n = pneigh_lookup(tbl, net, pkey, dev); read_unlock_bh(&tbl->lock); - if (n) goto out; + key_len = tbl->key_len; n = kzalloc(sizeof(*n) + key_len, GFP_KERNEL); if (!n) goto out; @@ -791,6 +779,7 @@ struct pneigh_entry *pneigh_create(struct neigh_table *tbl, goto out; } + hash_val = pneigh_hash(pkey, key_len); write_lock_bh(&tbl->lock); n->next = tbl->phash_buckets[hash_val]; rcu_assign_pointer(tbl->phash_buckets[hash_val], n); -- 2.50.0.727.gbf7dc18ff4-goog tbl->phash_buckets[] is only modified in the slow path by pneigh_create() and pneigh_delete() under the table lock. Both of them are called under RTNL, so no extra lock is needed, but we will remove RTNL from the paths. pneigh_create() looks up a pneigh_entry, and this part can be lockless, but it would complicate the logic like 1. lookup 2. allocate pengih_entry for GFP_KERNEL 3. lookup again but under lock 4. if found, return it after freeing the allocated memory 5. else, return the new one Instead, let's add a per-table mutex and run lookup and allocation under it. Note that updating pneigh_entry part in neigh_add() is still protected by RTNL and will be moved to pneigh_create() in the next patch. Signed-off-by: Kuniyuki Iwashima --- include/net/neighbour.h | 1 + net/core/neighbour.c | 39 +++++++++++++++++++++------------------ 2 files changed, 22 insertions(+), 18 deletions(-) diff --git a/include/net/neighbour.h b/include/net/neighbour.h index f8c7261cd4ebb..f333f9ebc4259 100644 --- a/include/net/neighbour.h +++ b/include/net/neighbour.h @@ -240,6 +240,7 @@ struct neigh_table { unsigned long last_rand; struct neigh_statistics __percpu *stats; struct neigh_hash_table __rcu *nht; + struct mutex phash_lock; struct pneigh_entry __rcu **phash_buckets; }; diff --git a/net/core/neighbour.c b/net/core/neighbour.c index 6096c8d41fc95..7e8a672dad6d1 100644 --- a/net/core/neighbour.c +++ b/net/core/neighbour.c @@ -54,9 +54,8 @@ static void neigh_timer_handler(struct timer_list *t); static void __neigh_notify(struct neighbour *n, int type, int flags, u32 pid); static void neigh_update_notify(struct neighbour *neigh, u32 nlmsg_pid); -static void pneigh_ifdown_and_unlock(struct neigh_table *tbl, - struct net_device *dev, - bool skip_perm); +static void pneigh_ifdown(struct neigh_table *tbl, struct net_device *dev, + bool skip_perm); #ifdef CONFIG_PROC_FS static const struct seq_operations neigh_stat_seq_ops; @@ -437,7 +436,9 @@ static int __neigh_ifdown(struct neigh_table *tbl, struct net_device *dev, { write_lock_bh(&tbl->lock); neigh_flush_dev(tbl, dev, skip_perm); - pneigh_ifdown_and_unlock(tbl, dev, skip_perm); + write_unlock_bh(&tbl->lock); + + pneigh_ifdown(tbl, dev, skip_perm); pneigh_queue_purge(&tbl->proxy_queue, dev ? dev_net(dev) : NULL, tbl->family); if (skb_queue_empty_lockless(&tbl->proxy_queue)) @@ -731,7 +732,7 @@ struct pneigh_entry *pneigh_lookup(struct neigh_table *tbl, key_len = tbl->key_len; hash_val = pneigh_hash(pkey, key_len); n = rcu_dereference_check(tbl->phash_buckets[hash_val], - lockdep_is_held(&tbl->lock)); + lockdep_is_held(&tbl->phash_lock)); while (n) { if (!memcmp(n->key, pkey, key_len) && @@ -739,7 +740,7 @@ struct pneigh_entry *pneigh_lookup(struct neigh_table *tbl, (n->dev == dev || !n->dev)) return n; - n = rcu_dereference_check(n->next, lockdep_is_held(&tbl->lock)); + n = rcu_dereference_check(n->next, lockdep_is_held(&tbl->phash_lock)); } return NULL; @@ -754,11 +755,9 @@ struct pneigh_entry *pneigh_create(struct neigh_table *tbl, unsigned int key_len; u32 hash_val; - ASSERT_RTNL(); + mutex_lock(&tbl->phash_lock); - read_lock_bh(&tbl->lock); n = pneigh_lookup(tbl, net, pkey, dev); - read_unlock_bh(&tbl->lock); if (n) goto out; @@ -780,11 +779,10 @@ struct pneigh_entry *pneigh_create(struct neigh_table *tbl, } hash_val = pneigh_hash(pkey, key_len); - write_lock_bh(&tbl->lock); n->next = tbl->phash_buckets[hash_val]; rcu_assign_pointer(tbl->phash_buckets[hash_val], n); - write_unlock_bh(&tbl->lock); out: + mutex_unlock(&tbl->phash_lock); return n; } @@ -806,14 +804,16 @@ int pneigh_delete(struct neigh_table *tbl, struct net *net, const void *pkey, key_len = tbl->key_len; hash_val = pneigh_hash(pkey, key_len); - write_lock_bh(&tbl->lock); + mutex_lock(&tbl->phash_lock); + for (np = &tbl->phash_buckets[hash_val]; (n = rcu_dereference_protected(*np, 1)) != NULL; np = &n->next) { if (!memcmp(n->key, pkey, key_len) && n->dev == dev && net_eq(pneigh_net(n), net)) { rcu_assign_pointer(*np, n->next); - write_unlock_bh(&tbl->lock); + + mutex_unlock(&tbl->phash_lock); if (tbl->pdestructor) tbl->pdestructor(n); @@ -822,18 +822,20 @@ int pneigh_delete(struct neigh_table *tbl, struct net *net, const void *pkey, return 0; } } - write_unlock_bh(&tbl->lock); + + mutex_unlock(&tbl->phash_lock); return -ENOENT; } -static void pneigh_ifdown_and_unlock(struct neigh_table *tbl, - struct net_device *dev, - bool skip_perm) +static void pneigh_ifdown(struct neigh_table *tbl, struct net_device *dev, + bool skip_perm) { struct pneigh_entry *n, __rcu **np; LIST_HEAD(head); u32 h; + mutex_lock(&tbl->phash_lock); + for (h = 0; h <= PNEIGH_HASHMASK; h++) { np = &tbl->phash_buckets[h]; while ((n = rcu_dereference_protected(*np, 1)) != NULL) { @@ -849,7 +851,7 @@ static void pneigh_ifdown_and_unlock(struct neigh_table *tbl, } } - write_unlock_bh(&tbl->lock); + mutex_unlock(&tbl->phash_lock); while (!list_empty(&head)) { n = list_first_entry(&head, typeof(*n), free_node); @@ -1796,6 +1798,7 @@ void neigh_table_init(int index, struct neigh_table *tbl) WARN_ON(tbl->entry_size % NEIGH_PRIV_ALIGN); rwlock_init(&tbl->lock); + mutex_init(&tbl->phash_lock); INIT_DEFERRABLE_WORK(&tbl->gc_work, neigh_periodic_work); queue_delayed_work(system_power_efficient_wq, &tbl->gc_work, -- 2.50.0.727.gbf7dc18ff4-goog neigh_add() updates pneigh_entry() found or created by pneigh_create(). This update is serialised by RTNL, but we will remove it. Let's move the update part to pneigh_create() and make it return errno instead of a pointer of pneigh_entry. Now, the pneigh code is RTNL free. Signed-off-by: Kuniyuki Iwashima --- include/net/neighbour.h | 5 +++-- net/core/neighbour.c | 34 ++++++++++++++++------------------ net/ipv4/arp.c | 4 +--- 3 files changed, 20 insertions(+), 23 deletions(-) diff --git a/include/net/neighbour.h b/include/net/neighbour.h index f333f9ebc4259..4a30bd458c5a9 100644 --- a/include/net/neighbour.h +++ b/include/net/neighbour.h @@ -382,8 +382,9 @@ void pneigh_enqueue(struct neigh_table *tbl, struct neigh_parms *p, struct sk_buff *skb); struct pneigh_entry *pneigh_lookup(struct neigh_table *tbl, struct net *net, const void *key, struct net_device *dev); -struct pneigh_entry *pneigh_create(struct neigh_table *tbl, struct net *net, - const void *key, struct net_device *dev); +int pneigh_create(struct neigh_table *tbl, struct net *net, const void *key, + struct net_device *dev, u32 flags, u8 protocol, + bool permanent); int pneigh_delete(struct neigh_table *tbl, struct net *net, const void *key, struct net_device *dev); diff --git a/net/core/neighbour.c b/net/core/neighbour.c index 7e8a672dad6d1..7cd3cfe08b312 100644 --- a/net/core/neighbour.c +++ b/net/core/neighbour.c @@ -747,24 +747,27 @@ struct pneigh_entry *pneigh_lookup(struct neigh_table *tbl, } EXPORT_IPV6_MOD(pneigh_lookup); -struct pneigh_entry *pneigh_create(struct neigh_table *tbl, - struct net *net, const void *pkey, - struct net_device *dev) +int pneigh_create(struct neigh_table *tbl, struct net *net, + const void *pkey, struct net_device *dev, + u32 flags, u8 protocol, bool permanent) { struct pneigh_entry *n; unsigned int key_len; u32 hash_val; + int err = 0; mutex_lock(&tbl->phash_lock); n = pneigh_lookup(tbl, net, pkey, dev); if (n) - goto out; + goto update; key_len = tbl->key_len; n = kzalloc(sizeof(*n) + key_len, GFP_KERNEL); - if (!n) + if (!n) { + err = -ENOBUFS; goto out; + } write_pnet(&n->net, net); memcpy(n->key, pkey, key_len); @@ -774,16 +777,20 @@ struct pneigh_entry *pneigh_create(struct neigh_table *tbl, if (tbl->pconstructor && tbl->pconstructor(n)) { netdev_put(dev, &n->dev_tracker); kfree(n); - n = NULL; + err = -ENOBUFS; goto out; } hash_val = pneigh_hash(pkey, key_len); n->next = tbl->phash_buckets[hash_val]; rcu_assign_pointer(tbl->phash_buckets[hash_val], n); +update: + WRITE_ONCE(n->flags, flags); + n->permanent = permanent; + WRITE_ONCE(n->protocol, protocol); out: mutex_unlock(&tbl->phash_lock); - return n; + return err; } static void pneigh_destroy(struct rcu_head *rcu) @@ -2015,22 +2022,13 @@ static int neigh_add(struct sk_buff *skb, struct nlmsghdr *nlh, if (tb[NDA_PROTOCOL]) protocol = nla_get_u8(tb[NDA_PROTOCOL]); if (ndm_flags & NTF_PROXY) { - struct pneigh_entry *pn; - if (ndm_flags & (NTF_MANAGED | NTF_EXT_VALIDATED)) { NL_SET_ERR_MSG(extack, "Invalid NTF_* flag combination"); goto out; } - err = -ENOBUFS; - pn = pneigh_create(tbl, net, dst, dev); - if (pn) { - WRITE_ONCE(pn->flags, ndm_flags); - pn->permanent = !!(ndm->ndm_state & NUD_PERMANENT); - if (protocol) - WRITE_ONCE(pn->protocol, protocol); - err = 0; - } + err = pneigh_create(tbl, net, dst, dev, ndm_flags, protocol, + !!(ndm->ndm_state & NUD_PERMANENT)); goto out; } diff --git a/net/ipv4/arp.c b/net/ipv4/arp.c index d93b5735b0ba4..5cfc1c9396732 100644 --- a/net/ipv4/arp.c +++ b/net/ipv4/arp.c @@ -1089,9 +1089,7 @@ static int arp_req_set_public(struct net *net, struct arpreq *r, if (mask) { __be32 ip = ((struct sockaddr_in *)&r->arp_pa)->sin_addr.s_addr; - if (!pneigh_create(&arp_tbl, net, &ip, dev)) - return -ENOBUFS; - return 0; + return pneigh_create(&arp_tbl, net, &ip, dev, 0, 0, false); } return arp_req_set_proxy(net, dev, 1); -- 2.50.0.727.gbf7dc18ff4-goog