The atomic cmpxchg operation for nth-mode matching is a scaling concern on
our production servers with 192 CPUs. The iptables rules that sample every
10000 packets sit on both the INPUT and OUTPUT chains. Thus, these
nth-counter rules are hit for every packet on the system, with high
concurrency.

Our use case is statistical sampling, where we don't need an exact packet
count across all CPUs in the system. Thus, we implement per-CPU counters
for the nth-mode match. This replaces the implementation behind
XT_STATISTIC_MODE_NTH, to avoid having to change userspace tooling. The
atomic variant is kept and moved under the new XT_STATISTIC_MODE_NTH_ATOMIC
mode, which userspace can easily be extended to use if necessary.

Signed-off-by: Jesper Dangaard Brouer
---
 include/uapi/linux/netfilter/xt_statistic.h |  1 +
 net/netfilter/xt_statistic.c                | 37 ++++++++++++++++++++++++++-
 2 files changed, 37 insertions(+), 1 deletion(-)

diff --git a/include/uapi/linux/netfilter/xt_statistic.h b/include/uapi/linux/netfilter/xt_statistic.h
index bbce6fcb26e3..f399dd27ff61 100644
--- a/include/uapi/linux/netfilter/xt_statistic.h
+++ b/include/uapi/linux/netfilter/xt_statistic.h
@@ -7,6 +7,7 @@ enum xt_statistic_mode {
 	XT_STATISTIC_MODE_RANDOM,
 	XT_STATISTIC_MODE_NTH,
+	XT_STATISTIC_MODE_NTH_ATOMIC,
 	__XT_STATISTIC_MODE_MAX
 };
 #define XT_STATISTIC_MODE_MAX (__XT_STATISTIC_MODE_MAX - 1)

diff --git a/net/netfilter/xt_statistic.c b/net/netfilter/xt_statistic.c
index d352c171f24d..165bff0a76e5 100644
--- a/net/netfilter/xt_statistic.c
+++ b/net/netfilter/xt_statistic.c
@@ -17,6 +17,7 @@

 struct xt_statistic_priv {
 	atomic_t count;
+	u32 __percpu *cnt_pcpu;
 } ____cacheline_aligned_in_smp;

 MODULE_LICENSE("GPL");
@@ -63,6 +64,21 @@ statistic_mt(const struct sk_buff *skb, struct xt_action_param *par)
 			ret = !ret;
 		break;
 	case XT_STATISTIC_MODE_NTH:
+		pkt_cnt = gso_pkt_cnt(skb);
+		do {
+			match = false;
+			oval = this_cpu_read(*priv->cnt_pcpu);
+			nval = oval + pkt_cnt;
+			if (nval > info->u.nth.every) {
+				match = true;
+				nval = nval - info->u.nth.every - 1;
+				nval = min(nval, info->u.nth.every);
+			}
+		} while (this_cpu_cmpxchg(*priv->cnt_pcpu, oval, nval) != oval);
+		if (match)
+			ret = !ret;
+		break;
+	case XT_STATISTIC_MODE_NTH_ATOMIC:
 		pkt_cnt = gso_pkt_cnt(skb);
 		do {
 			match = false;
@@ -85,6 +101,10 @@ statistic_mt(const struct sk_buff *skb, struct xt_action_param *par)
 static int statistic_mt_check(const struct xt_mtchk_param *par)
 {
 	struct xt_statistic_info *info = par->matchinfo;
+	struct xt_statistic_priv *priv;
+	u32 *this_cpu;
+	u32 nth_count;
+	int cpu;

 	if (info->mode > XT_STATISTIC_MODE_MAX ||
 	    info->flags & ~XT_STATISTIC_MASK)
@@ -93,7 +113,21 @@ static int statistic_mt_check(const struct xt_mtchk_param *par)
 	info->master = kzalloc(sizeof(*info->master), GFP_KERNEL);
 	if (info->master == NULL)
 		return -ENOMEM;
-	atomic_set(&info->master->count, info->u.nth.count);
+	priv = info->master;
+
+	priv->cnt_pcpu = alloc_percpu(u32);
+	if (!priv->cnt_pcpu) {
+		kfree(priv);
+		return -ENOMEM;
+	}
+
+	/* Userspace specifies start nth.count value */
+	nth_count = info->u.nth.count;
+	for_each_possible_cpu(cpu) {
+		this_cpu = per_cpu_ptr(priv->cnt_pcpu, cpu);
+		(*this_cpu) = nth_count;
+	}
+	atomic_set(&priv->count, nth_count);

 	return 0;
 }
@@ -102,6 +136,7 @@ static void statistic_mt_destroy(const struct xt_mtdtor_param *par)
 {
 	const struct xt_statistic_info *info = par->matchinfo;

+	free_percpu(info->master->cnt_pcpu);
 	kfree(info->master);
 }
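
Note (not part of the patch): an nth-mode sampling rule is typically written
along the lines of '-m statistic --mode nth --every 10000 --packet 0'. For
anyone who wants to poke at the wrap arithmetic used in the new per-CPU
XT_STATISTIC_MODE_NTH branch outside the kernel, below is a minimal userspace
C sketch. The names here (nth_match(), cnt, every) are made up for
illustration, a plain uint32_t stands in for the per-CPU counter, and the
this_cpu_read()/this_cpu_cmpxchg() retry loop and the GSO packet count are
deliberately left out, so it only shows how the counter wraps and when a
packet matches:

/*
 * Hypothetical standalone sketch of the nth-counter arithmetic above.
 * NOT part of the patch; single-threaded, no per-CPU or cmpxchg handling.
 */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

static uint32_t cnt;	/* stands in for *priv->cnt_pcpu on one CPU */

static bool nth_match(uint32_t every, uint32_t pkt_cnt)
{
	uint32_t nval = cnt + pkt_cnt;
	bool match = false;

	if (nval > every) {
		match = true;
		nval = nval - every - 1;
		if (nval > every)	/* min(nval, every) */
			nval = every;
	}
	cnt = nval;
	return match;
}

int main(void)
{
	uint32_t every = 10;	/* small value so the output is easy to follow */
	uint32_t i, matches = 0;

	for (i = 1; i <= 100; i++)
		if (nth_match(every, 1))	/* non-GSO: one packet per skb */
			matches++;

	printf("matched %u of 100 packets (every=%u)\n", matches, every);
	return 0;
}

With this arithmetic a match fires once per (every + 1) packets on a given
CPU; the per-CPU version keeps that local behavior while dropping the
cross-CPU atomic cmpxchg.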