As conn_tab is per-net, better to show the current hash table size to users instead of the ip_vs_conn_tab_size (max). Signed-off-by: Julian Anastasov --- net/netfilter/ipvs/ip_vs_ctl.c | 26 ++++++++++++++++++++++---- 1 file changed, 22 insertions(+), 4 deletions(-) diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c index b472e564b769..3129b15dadc2 100644 --- a/net/netfilter/ipvs/ip_vs_ctl.c +++ b/net/netfilter/ipvs/ip_vs_ctl.c @@ -281,6 +281,13 @@ static void est_reload_work_handler(struct work_struct *work) mutex_unlock(&ipvs->est_mutex); } +static int get_conn_tab_size(struct netns_ipvs *ipvs) +{ + struct ip_vs_rht *t = rcu_dereference(ipvs->conn_tab); + + return t? t->size : 0; +} + int ip_vs_use_count_inc(void) { @@ -2742,10 +2749,14 @@ static void ip_vs_info_seq_stop(struct seq_file *seq, void *v) static int ip_vs_info_seq_show(struct seq_file *seq, void *v) { + struct net *net = seq_file_net(seq); + struct netns_ipvs *ipvs = net_ipvs(net); + if (v == SEQ_START_TOKEN) { seq_printf(seq, "IP Virtual Server version %d.%d.%d (size=%d)\n", - NVERSION(IP_VS_VERSION_CODE), ip_vs_conn_tab_size); + NVERSION(IP_VS_VERSION_CODE), + get_conn_tab_size(ipvs)); seq_puts(seq, "Prot LocalAddress:Port Scheduler Flags\n"); seq_puts(seq, @@ -3424,9 +3435,13 @@ do_ip_vs_get_ctl(struct sock *sk, int cmd, void __user *user, int *len) case IP_VS_SO_GET_VERSION: { char buf[64]; + int csize; + rcu_read_lock(); + csize = get_conn_tab_size(ipvs); + rcu_read_unlock(); sprintf(buf, "IP Virtual Server version %d.%d.%d (size=%d)", - NVERSION(IP_VS_VERSION_CODE), ip_vs_conn_tab_size); + NVERSION(IP_VS_VERSION_CODE), csize); if (copy_to_user(user, buf, strlen(buf)+1) != 0) { ret = -EFAULT; goto out; @@ -3438,8 +3453,11 @@ do_ip_vs_get_ctl(struct sock *sk, int cmd, void __user *user, int *len) case IP_VS_SO_GET_INFO: { struct ip_vs_getinfo info; + info.version = IP_VS_VERSION_CODE; - info.size = ip_vs_conn_tab_size; + rcu_read_lock(); + info.size = get_conn_tab_size(ipvs); + rcu_read_unlock(); info.num_services = atomic_read(&ipvs->num_services[IP_VS_AF_INET]); if (copy_to_user(user, &info, sizeof(info)) != 0) @@ -4448,7 +4466,7 @@ static int ip_vs_genl_get_cmd(struct sk_buff *skb, struct genl_info *info) if (nla_put_u32(msg, IPVS_INFO_ATTR_VERSION, IP_VS_VERSION_CODE) || nla_put_u32(msg, IPVS_INFO_ATTR_CONN_TAB_SIZE, - ip_vs_conn_tab_size)) + get_conn_tab_size(ipvs))) goto nla_put_failure; break; } -- 2.53.0 Add /proc/net/ip_vs_status to show current state of IPVS. The motivation for this new /proc interface is to provide the output for the users to help them decide when to tune the load factor for hash tables, which is possible with the new sysctl knobs coming in followup patch. The output also includes information for the kthreads used for stats. Signed-off-by: Julian Anastasov --- net/netfilter/ipvs/ip_vs_ctl.c | 145 +++++++++++++++++++++++++++++++++ 1 file changed, 145 insertions(+) diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c index 3129b15dadc2..88c3f145fa4a 100644 --- a/net/netfilter/ipvs/ip_vs_ctl.c +++ b/net/netfilter/ipvs/ip_vs_ctl.c @@ -2919,6 +2919,144 @@ static int ip_vs_stats_percpu_show(struct seq_file *seq, void *v) return 0; } + +static int ip_vs_status_show(struct seq_file *seq, void *v) +{ + struct net *net = seq_file_single_net(seq); + struct netns_ipvs *ipvs = net_ipvs(net); + unsigned int resched_score = 0; + struct ip_vs_conn_hnode *hn; + struct hlist_bl_head *head; + struct ip_vs_service *svc; + struct ip_vs_rht *t, *pt; + struct hlist_bl_node *e; + int old_gen, new_gen; + u32 counts[8]; + u32 bucket; + int count; + u32 sum1; + u32 sum; + int i; + + rcu_read_lock(); + + t = rcu_dereference(ipvs->conn_tab); + + seq_printf(seq, "Conns:\t%d\n", atomic_read(&ipvs->conn_count)); + seq_printf(seq, "Conn buckets:\t%d (%d bits, lfactor %d)\n", + t ? t->size : 0, t ? t->bits : 0, t ? t->lfactor : 0); + + if (!atomic_read(&ipvs->conn_count)) + goto after_conns; + old_gen = atomic_read(&ipvs->conn_tab_changes); + +repeat_conn: + smp_rmb(); /* ipvs->conn_tab and conn_tab_changes */ + memset(counts, 0, sizeof(counts)); + ip_vs_rht_for_each_table_rcu(ipvs->conn_tab, t, pt) { + for (bucket = 0; bucket < t->size; bucket++) { + DECLARE_IP_VS_RHT_WALK_BUCKET_RCU(); + + count = 0; + resched_score++; + ip_vs_rht_walk_bucket_rcu(t, bucket, head) { + count = 0; + hlist_bl_for_each_entry_rcu(hn, e, head, node) + count++; + } + resched_score += count; + if (resched_score >= 100) { + resched_score = 0; + cond_resched_rcu(); + new_gen = atomic_read(&ipvs->conn_tab_changes); + /* New table installed ? */ + if (old_gen != new_gen) { + old_gen = new_gen; + goto repeat_conn; + } + } + counts[min(count, (int)ARRAY_SIZE(counts) - 1)]++; + } + } + for (sum = 0, i = 0; i < ARRAY_SIZE(counts); i++) + sum += counts[i]; + sum1 = sum - counts[0]; + seq_printf(seq, "Conn buckets empty:\t%u (%lu%%)\n", + counts[0], (unsigned long)counts[0] * 100 / max(sum, 1U)); + for (i = 1; i < ARRAY_SIZE(counts); i++) { + if (!counts[i]) + continue; + seq_printf(seq, "Conn buckets len-%d:\t%u (%lu%%)\n", + i, counts[i], + (unsigned long)counts[i] * 100 / max(sum1, 1U)); + } + +after_conns: + t = rcu_dereference(ipvs->svc_table); + + count = ip_vs_get_num_services(ipvs); + seq_printf(seq, "Services:\t%d\n", count); + seq_printf(seq, "Service buckets:\t%d (%d bits, lfactor %d)\n", + t ? t->size : 0, t ? t->bits : 0, t ? t->lfactor : 0); + + if (!count) + goto after_svc; + old_gen = atomic_read(&ipvs->svc_table_changes); + +repeat_svc: + smp_rmb(); /* ipvs->svc_table and svc_table_changes */ + memset(counts, 0, sizeof(counts)); + ip_vs_rht_for_each_table_rcu(ipvs->svc_table, t, pt) { + for (bucket = 0; bucket < t->size; bucket++) { + DECLARE_IP_VS_RHT_WALK_BUCKET_RCU(); + + count = 0; + resched_score++; + ip_vs_rht_walk_bucket_rcu(t, bucket, head) { + count = 0; + hlist_bl_for_each_entry_rcu(svc, e, head, + s_list) + count++; + } + resched_score += count; + if (resched_score >= 100) { + resched_score = 0; + cond_resched_rcu(); + new_gen = atomic_read(&ipvs->svc_table_changes); + /* New table installed ? */ + if (old_gen != new_gen) { + old_gen = new_gen; + goto repeat_svc; + } + } + counts[min(count, (int)ARRAY_SIZE(counts) - 1)]++; + } + } + for (sum = 0, i = 0; i < ARRAY_SIZE(counts); i++) + sum += counts[i]; + sum1 = sum - counts[0]; + seq_printf(seq, "Service buckets empty:\t%u (%lu%%)\n", + counts[0], (unsigned long)counts[0] * 100 / max(sum, 1U)); + for (i = 1; i < ARRAY_SIZE(counts); i++) { + if (!counts[i]) + continue; + seq_printf(seq, "Service buckets len-%d:\t%u (%lu%%)\n", + i, counts[i], + (unsigned long)counts[i] * 100 / max(sum1, 1U)); + } + +after_svc: + seq_printf(seq, "Stats thread slots:\t%d (max %lu)\n", + ipvs->est_kt_count, ipvs->est_max_threads); + seq_printf(seq, "Stats chain max len:\t%d\n", ipvs->est_chain_max); + seq_printf(seq, "Stats thread ests:\t%d\n", + ipvs->est_chain_max * IPVS_EST_CHAIN_FACTOR * + IPVS_EST_NTICKS); + + rcu_read_unlock(); + return 0; +} + #endif /* @@ -4826,6 +4964,9 @@ int __net_init ip_vs_control_net_init(struct netns_ipvs *ipvs) ipvs->net->proc_net, ip_vs_stats_percpu_show, NULL)) goto err_percpu; + if (!proc_create_net_single("ip_vs_status", 0, ipvs->net->proc_net, + ip_vs_status_show, NULL)) + goto err_status; #endif ret = ip_vs_control_net_init_sysctl(ipvs); @@ -4836,6 +4977,9 @@ int __net_init ip_vs_control_net_init(struct netns_ipvs *ipvs) err: #ifdef CONFIG_PROC_FS + remove_proc_entry("ip_vs_status", ipvs->net->proc_net); + +err_status: remove_proc_entry("ip_vs_stats_percpu", ipvs->net->proc_net); err_percpu: @@ -4861,6 +5005,7 @@ void __net_exit ip_vs_control_net_cleanup(struct netns_ipvs *ipvs) ip_vs_control_net_cleanup_sysctl(ipvs); cancel_delayed_work_sync(&ipvs->est_reload_work); #ifdef CONFIG_PROC_FS + remove_proc_entry("ip_vs_status", ipvs->net->proc_net); remove_proc_entry("ip_vs_stats_percpu", ipvs->net->proc_net); remove_proc_entry("ip_vs_stats", ipvs->net->proc_net); remove_proc_entry("ip_vs", ipvs->net->proc_net); -- 2.53.0 Allow the default load factor for the connection and service tables to be configured. Signed-off-by: Julian Anastasov --- Documentation/networking/ipvs-sysctl.rst | 35 +++++++++++ net/netfilter/ipvs/ip_vs_ctl.c | 76 ++++++++++++++++++++++++ 2 files changed, 111 insertions(+) diff --git a/Documentation/networking/ipvs-sysctl.rst b/Documentation/networking/ipvs-sysctl.rst index 3fb5fa142eef..3c43857d7dbd 100644 --- a/Documentation/networking/ipvs-sysctl.rst +++ b/Documentation/networking/ipvs-sysctl.rst @@ -29,6 +29,31 @@ backup_only - BOOLEAN If set, disable the director function while the server is in backup mode to avoid packet loops for DR/TUN methods. +conn_lfactor - INTEGER + Possible values: -8 (larger table) .. 8 (smaller table) + + Default: -4 + + Controls the sizing of the connection hash table based on the + load factor (number of connections per table buckets): + 2^conn_lfactor = nodes / buckets + As result, the table grows if load increases and shrinks when + load decreases in the range of 2^8 - 2^conn_tab_bits (module + parameter). + The value is a shift count where negative values select + buckets = (connection hash nodes << -value) while positive + values select buckets = (connection hash nodes >> value). The + negative values reduce the collisions and reduce the time for + lookups but increase the table size. Positive values will + tolerate load above 100% when using smaller table is + preferred with the cost of more collisions. If using NAT + connections consider decreasing the value with one because + they add two nodes in the hash table. + + Example: + -4: grow if load goes above 6% (buckets = nodes * 16) + 2: grow if load goes above 400% (buckets = nodes / 4) + conn_reuse_mode - INTEGER 1 - default @@ -219,6 +244,16 @@ secure_tcp - INTEGER The value definition is the same as that of drop_entry and drop_packet. +svc_lfactor - INTEGER + Possible values: -8 (larger table) .. 8 (smaller table) + + Default: -3 + + Controls the sizing of the service hash table based on the + load factor (number of services per table buckets). The table + will grow and shrink in the range of 2^4 - 2^20. + See conn_lfactor for explanation. + sync_threshold - vector of 2 INTEGERs: sync_threshold, sync_period default 3 50 diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c index 88c3f145fa4a..df7430639e79 100644 --- a/net/netfilter/ipvs/ip_vs_ctl.c +++ b/net/netfilter/ipvs/ip_vs_ctl.c @@ -2439,6 +2439,60 @@ static int ipvs_proc_run_estimation(const struct ctl_table *table, int write, return ret; } +static int ipvs_proc_conn_lfactor(const struct ctl_table *table, int write, + void *buffer, size_t *lenp, loff_t *ppos) +{ + struct netns_ipvs *ipvs = table->extra2; + int *valp = table->data; + int val = *valp; + int ret; + + struct ctl_table tmp_table = { + .data = &val, + .maxlen = sizeof(int), + }; + + ret = proc_dointvec(&tmp_table, write, buffer, lenp, ppos); + if (write && ret >= 0) { + if (val < -8 || val > 8) { + ret = -EINVAL; + } else { + *valp = val; + if (rcu_dereference_protected(ipvs->conn_tab, 1)) + mod_delayed_work(system_unbound_wq, + &ipvs->conn_resize_work, 0); + } + } + return ret; +} + +static int ipvs_proc_svc_lfactor(const struct ctl_table *table, int write, + void *buffer, size_t *lenp, loff_t *ppos) +{ + struct netns_ipvs *ipvs = table->extra2; + int *valp = table->data; + int val = *valp; + int ret; + + struct ctl_table tmp_table = { + .data = &val, + .maxlen = sizeof(int), + }; + + ret = proc_dointvec(&tmp_table, write, buffer, lenp, ppos); + if (write && ret >= 0) { + if (val < -8 || val > 8) { + ret = -EINVAL; + } else { + *valp = val; + if (rcu_dereference_protected(ipvs->svc_table, 1)) + mod_delayed_work(system_unbound_wq, + &ipvs->svc_resize_work, 0); + } + } + return ret; +} + /* * IPVS sysctl table (under the /proc/sys/net/ipv4/vs/) * Do not change order or insert new entries without @@ -2627,6 +2681,18 @@ static struct ctl_table vs_vars[] = { .mode = 0644, .proc_handler = ipvs_proc_est_nice, }, + { + .procname = "conn_lfactor", + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = ipvs_proc_conn_lfactor, + }, + { + .procname = "svc_lfactor", + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = ipvs_proc_svc_lfactor, + }, #ifdef CONFIG_IP_VS_DEBUG { .procname = "debug_level", @@ -4854,6 +4920,16 @@ static int __net_init ip_vs_control_net_init_sysctl(struct netns_ipvs *ipvs) tbl[idx].extra2 = ipvs; tbl[idx++].data = &ipvs->sysctl_est_nice; + if (unpriv) + tbl[idx].mode = 0444; + tbl[idx].extra2 = ipvs; + tbl[idx++].data = &ipvs->sysctl_conn_lfactor; + + if (unpriv) + tbl[idx].mode = 0444; + tbl[idx].extra2 = ipvs; + tbl[idx++].data = &ipvs->sysctl_svc_lfactor; + #ifdef CONFIG_IP_VS_DEBUG /* Global sysctls must be ro in non-init netns */ if (!net_eq(net, &init_net)) -- 2.53.0