From: Jesper Dangaard Brouer

The per-netns IPv4 local address hash table (inet_addr_lst) is a
fixed-size hlist whose bucket count is chosen at build time
(IN4_ADDR_HSIZE, derived from CONFIG_INET_ADDR_HASH_BUCKETS, default
256). On hosts with many addresses -- e.g. ~700 on Cloudflare edge
nodes -- the average chain length with the default size reaches ~2.8,
making inet_lookup_ifaddr_rcu() visible in perf profiles on the
unconnected UDP sendmsg path via __ip_dev_find().

Replace the fixed hlist with an rhltable (resizable hash linked table)
that grows and shrinks automatically as addresses are added or removed.
Since the table now sizes itself, the CONFIG_INET_ADDR_HASH_BUCKETS
Kconfig option and the __ipv4_addr_hash() helper are removed.

The rhl variant is needed because the same IP can exist on multiple
interfaces. A plain rhashtable would reject the second insert with
-EEXIST, and removing one interface's address would silently drop the
other from the table. All current callers only need first-match
semantics, which rhltable_lookup() provides.

The rhashtable_params are tuned for this use case:

 - No explicit .hashfn: with key_len = sizeof(__be32), the default path
   calls jhash2(key, 1, seed) which the compiler fully inlines.

 - .obj_cmpfn: a direct __be32 comparison replacing the generic memcmp()
   in the default rhashtable_compare(). The compiler inlines this to a
   single cmp instruction.

 - .min_size = 32: most network namespaces only have loopback, so 32
   buckets (256 bytes) is sufficient and saves memory compared to the
   old default 256-bucket table (2048 bytes per netns).

With these settings, objdump confirms zero indirect calls and zero
function calls to hashfn or cmpfn in the lookup path.

The check_lifetime() work function previously iterated all hash buckets
directly. Convert it to walk for_each_netdev -> in_dev->ifa_list, which
is the natural way to enumerate all addresses and avoids coupling the
lifetime logic to hash table internals.

The rhltable serves as a lookup cache for __ip_dev_find(). If
rhltable_insert() fails (e.g. -ENOMEM during table resize), the address
remains on in_dev->ifa_list and lookups fall back to the slower but
always-correct fib_table_lookup() path.
A pr_warn is emitted on insert failure for diagnostics. On remove, -ENOENT is tolerated since the preceding insert may have failed. Reported-by: Ivan Babrou Signed-off-by: Jesper Dangaard Brouer --- include/linux/inetdevice.h | 3 +- include/net/ip.h | 5 -- include/net/netns/ipv4.h | 4 +- net/ipv4/Kconfig | 16 ---- net/ipv4/devinet.c | 149 +++++++++++++++++++++---------------- 5 files changed, 88 insertions(+), 89 deletions(-) diff --git a/include/linux/inetdevice.h b/include/linux/inetdevice.h index dccbeb25f701..e2f7a2f721c9 100644 --- a/include/linux/inetdevice.h +++ b/include/linux/inetdevice.h @@ -13,6 +13,7 @@ #include #include #include +#include struct ipv4_devconf { void *sysctl; @@ -141,7 +142,7 @@ static inline void ipv4_devconf_setall(struct in_device *in_dev) ARP_EVICT_NOCARRIER) struct in_ifaddr { - struct hlist_node addr_lst; + struct rhlist_head addr_lst; struct in_ifaddr __rcu *ifa_next; struct in_device *ifa_dev; struct rcu_head rcu_head; diff --git a/include/net/ip.h b/include/net/ip.h index f39a3787fedd..03932ec93d67 100644 --- a/include/net/ip.h +++ b/include/net/ip.h @@ -705,11 +705,6 @@ static inline unsigned int ipv4_addr_hash(__be32 ip) return (__force unsigned int) ip; } -static inline u32 __ipv4_addr_hash(const __be32 ip, const u32 initval) -{ - return jhash_1word((__force u32)ip, initval); -} - static inline u32 ipv4_portaddr_hash(const struct net *net, __be32 saddr, unsigned int port) diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h index 80ccd4dda8e0..f956ea1b23ca 100644 --- a/include/net/netns/ipv4.h +++ b/include/net/netns/ipv4.h @@ -11,11 +11,11 @@ #include #include #include +#include struct ctl_table_header; struct ipv4_devconf; struct fib_rules_ops; -struct hlist_head; struct fib_table; struct sock; struct local_ports { @@ -296,7 +296,7 @@ struct netns_ipv4 { atomic_t rt_genid; siphash_key_t ip_id_key; - struct hlist_head *inet_addr_lst; + struct rhltable inet_addr_lst; struct delayed_work addr_chk_work; }; diff 
--git a/net/ipv4/Kconfig b/net/ipv4/Kconfig index 3c5e5e74b3e4..df922f9f5289 100644 --- a/net/ipv4/Kconfig +++ b/net/ipv4/Kconfig @@ -402,22 +402,6 @@ config INET_IPCOMP If unsure, say Y. -config INET_ADDR_HASH_BUCKETS - int "IPv4 address hash table size" if EXPERT - range 64 16384 - default 256 - help - Number of hash buckets for looking up local IPv4 addresses, - e.g. during route output to validate the source address via - __ip_dev_find(). Rounded up to the nearest power of 2. - - Hosts with many IPv4 addresses benefit from a larger table to reduce - hash chain lengths. This is particularly relevant when sending using - unconnected UDP sockets. - - The default of 256 is fine for most systems. A value of 1024 - suits hosts with ~500+ addresses. - config INET_TABLE_PERTURB_ORDER int "INET: Source port perturbation table size (as power of 2)" if EXPERT default 16 diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c index 9e3da06fb618..a02a31d68b2f 100644 --- a/net/ipv4/devinet.c +++ b/net/ipv4/devinet.c @@ -49,6 +49,7 @@ #include "igmp_internal.h" #include #include +#include #ifdef CONFIG_SYSCTL #include #endif @@ -108,28 +109,45 @@ static const struct nla_policy ifa_ipv4_policy[IFA_MAX+1] = { [IFA_PROTO] = { .type = NLA_U8 }, }; -#define IN4_ADDR_HSIZE_SHIFT order_base_2(CONFIG_INET_ADDR_HASH_BUCKETS) -#define IN4_ADDR_HSIZE (1U << IN4_ADDR_HSIZE_SHIFT) - -static u32 inet_addr_hash(const struct net *net, __be32 addr) +static int inet_addr_cmpfn(struct rhashtable_compare_arg *arg, const void *obj) { - u32 val = __ipv4_addr_hash(addr, net_hash_mix(net)); + const struct in_ifaddr *ifa = obj; + const __be32 *key = arg->key; - return hash_32(val, IN4_ADDR_HSIZE_SHIFT); + return *key != ifa->ifa_local; } +static const struct rhashtable_params inet_addr_rht_params = { + .head_offset = offsetof(struct in_ifaddr, addr_lst), + .key_offset = offsetof(struct in_ifaddr, ifa_local), + .key_len = sizeof(__be32), + .min_size = 32, + .obj_cmpfn = inet_addr_cmpfn, + 
.automatic_shrinking = true, +}; + static void inet_hash_insert(struct net *net, struct in_ifaddr *ifa) { - u32 hash = inet_addr_hash(net, ifa->ifa_local); + int err; ASSERT_RTNL(); - hlist_add_head_rcu(&ifa->addr_lst, &net->ipv4.inet_addr_lst[hash]); + err = rhltable_insert(&net->ipv4.inet_addr_lst, &ifa->addr_lst, + inet_addr_rht_params); + /* Non-fatal: lookups fall back to fib_table_lookup() */ + if (unlikely(err)) + pr_warn("%s() failed for %pI4: %d\n", + __func__, &ifa->ifa_local, err); } -static void inet_hash_remove(struct in_ifaddr *ifa) +static void inet_hash_remove(struct net *net, struct in_ifaddr *ifa) { + int err; + ASSERT_RTNL(); - hlist_del_init_rcu(&ifa->addr_lst); + err = rhltable_remove(&net->ipv4.inet_addr_lst, &ifa->addr_lst, + inet_addr_rht_params); + /* -ENOENT is fine: insert may have failed earlier (e.g. -ENOMEM) */ + WARN_ON_ONCE(err && err != -ENOENT); } /** @@ -173,12 +191,12 @@ EXPORT_SYMBOL(__ip_dev_find); /* called under RCU lock */ struct in_ifaddr *inet_lookup_ifaddr_rcu(struct net *net, __be32 addr) { - u32 hash = inet_addr_hash(net, addr); - struct in_ifaddr *ifa; + struct rhlist_head *rhl; - hlist_for_each_entry_rcu(ifa, &net->ipv4.inet_addr_lst[hash], addr_lst) - if (ifa->ifa_local == addr) - return ifa; + rhl = rhltable_lookup(&net->ipv4.inet_addr_lst, &addr, + inet_addr_rht_params); + if (rhl) + return container_of(rhl, struct in_ifaddr, addr_lst); return NULL; } @@ -216,7 +234,7 @@ static struct in_ifaddr *inet_alloc_ifa(struct in_device *in_dev) in_dev_hold(in_dev); ifa->ifa_dev = in_dev; - INIT_HLIST_NODE(&ifa->addr_lst); + memset(&ifa->addr_lst, 0, sizeof(ifa->addr_lst)); return ifa; } @@ -405,7 +423,7 @@ static void __inet_del_ifa(struct in_device *in_dev, } if (!do_promote) { - inet_hash_remove(ifa); + inet_hash_remove(dev_net(in_dev->dev), ifa); *ifap1 = ifa->ifa_next; rtmsg_ifa(RTM_DELADDR, ifa, nlh, portid); @@ -434,7 +452,7 @@ static void __inet_del_ifa(struct in_device *in_dev, /* 2. 
Unlink it */ *ifap = ifa1->ifa_next; - inet_hash_remove(ifa1); + inet_hash_remove(dev_net(in_dev->dev), ifa1); /* 3. Announce address deletion */ @@ -709,21 +727,24 @@ static int inet_rtm_deladdr(struct sk_buff *skb, struct nlmsghdr *nlh, static void check_lifetime(struct work_struct *work) { unsigned long now, next, next_sec, next_sched; + bool change_needed = false; + struct in_device *in_dev; + struct net_device *dev; struct in_ifaddr *ifa; - struct hlist_node *n; struct net *net; - int i; net = container_of(to_delayed_work(work), struct net, ipv4.addr_chk_work); now = jiffies; next = round_jiffies_up(now + ADDR_CHECK_FREQUENCY); - for (i = 0; i < IN4_ADDR_HSIZE; i++) { - struct hlist_head *head = &net->ipv4.inet_addr_lst[i]; - bool change_needed = false; + rcu_read_lock(); + for_each_netdev_rcu(net, dev) { + in_dev = __in_dev_get_rcu(dev); + if (!in_dev) + continue; - rcu_read_lock(); - hlist_for_each_entry_rcu(ifa, head, addr_lst) { + for (ifa = rcu_dereference(in_dev->ifa_list); ifa; + ifa = rcu_dereference(ifa->ifa_next)) { unsigned long age, tstamp; u32 preferred_lft; u32 valid_lft; @@ -757,43 +778,47 @@ static void check_lifetime(struct work_struct *work) next = tstamp + preferred_lft * HZ; } } - rcu_read_unlock(); - if (!change_needed) - continue; + } + rcu_read_unlock(); + if (change_needed) { rtnl_net_lock(net); - hlist_for_each_entry_safe(ifa, n, head, addr_lst) { - unsigned long age; + for_each_netdev(net, dev) { + struct in_ifaddr __rcu **ifap; - if (ifa->ifa_flags & IFA_F_PERMANENT) + in_dev = __in_dev_get_rtnl_net(dev); + if (!in_dev) continue; - /* We try to batch several events at once. 
*/ - age = (now - ifa->ifa_tstamp + - ADDRCONF_TIMER_FUZZ_MINUS) / HZ; + ifap = &in_dev->ifa_list; + ifa = rtnl_net_dereference(net, *ifap); + while (ifa) { + unsigned long age; - if (ifa->ifa_valid_lft != INFINITY_LIFE_TIME && - age >= ifa->ifa_valid_lft) { - struct in_ifaddr __rcu **ifap; - struct in_ifaddr *tmp; - - ifap = &ifa->ifa_dev->ifa_list; - tmp = rtnl_net_dereference(net, *ifap); - while (tmp) { - if (tmp == ifa) { - inet_del_ifa(ifa->ifa_dev, - ifap, 1); - break; - } - ifap = &tmp->ifa_next; - tmp = rtnl_net_dereference(net, *ifap); + if (ifa->ifa_flags & IFA_F_PERMANENT) { + ifap = &ifa->ifa_next; + ifa = rtnl_net_dereference(net, *ifap); + continue; } - } else if (ifa->ifa_preferred_lft != - INFINITY_LIFE_TIME && - age >= ifa->ifa_preferred_lft && - !(ifa->ifa_flags & IFA_F_DEPRECATED)) { - ifa->ifa_flags |= IFA_F_DEPRECATED; - rtmsg_ifa(RTM_NEWADDR, ifa, NULL, 0); + + /* We try to batch several events at once. */ + age = (now - ifa->ifa_tstamp + + ADDRCONF_TIMER_FUZZ_MINUS) / HZ; + + if (ifa->ifa_valid_lft != INFINITY_LIFE_TIME && + age >= ifa->ifa_valid_lft) { + inet_del_ifa(in_dev, ifap, 1); + ifa = rtnl_net_dereference(net, *ifap); + continue; + } else if (ifa->ifa_preferred_lft != + INFINITY_LIFE_TIME && + age >= ifa->ifa_preferred_lft && + !(ifa->ifa_flags & IFA_F_DEPRECATED)) { + ifa->ifa_flags |= IFA_F_DEPRECATED; + rtmsg_ifa(RTM_NEWADDR, ifa, NULL, 0); + } + ifap = &ifa->ifa_next; + ifa = rtnl_net_dereference(net, *ifap); } } rtnl_net_unlock(net); @@ -2786,12 +2811,9 @@ static __net_init int devinet_init_net(struct net *net) #endif struct ipv4_devconf *all, *dflt; int err; - int i; - err = -ENOMEM; - net->ipv4.inet_addr_lst = kmalloc_objs(struct hlist_head, - IN4_ADDR_HSIZE); - if (!net->ipv4.inet_addr_lst) + err = rhltable_init(&net->ipv4.inet_addr_lst, &inet_addr_rht_params); + if (err) goto err_alloc_hash; all = kmemdup(&ipv4_devconf, sizeof(ipv4_devconf), GFP_KERNEL); @@ -2854,9 +2876,6 @@ static __net_init int devinet_init_net(struct 
net *net) net->ipv4.forw_hdr = forw_hdr; #endif - for (i = 0; i < IN4_ADDR_HSIZE; i++) - INIT_HLIST_HEAD(&net->ipv4.inet_addr_lst[i]); - INIT_DEFERRABLE_WORK(&net->ipv4.addr_chk_work, check_lifetime); net->ipv4.devconf_all = all; @@ -2876,7 +2895,7 @@ static __net_init int devinet_init_net(struct net *net) err_alloc_dflt: kfree(all); err_alloc_all: - kfree(net->ipv4.inet_addr_lst); + rhltable_destroy(&net->ipv4.inet_addr_lst); err_alloc_hash: return err; } @@ -2900,7 +2919,7 @@ static __net_exit void devinet_exit_net(struct net *net) #endif kfree(net->ipv4.devconf_dflt); kfree(net->ipv4.devconf_all); - kfree(net->ipv4.inet_addr_lst); + rhltable_destroy(&net->ipv4.inet_addr_lst); } static __net_initdata struct pernet_operations devinet_ops = { -- 2.43.0