The atomic cmpxchg operation for nth-mode matching is a scaling concern on
our production servers with 192 CPUs. The iptables rules that sample every
10000 packets sit on both the INPUT and OUTPUT chains. Thus, these
nth-counter rules are hit for every packet on the system, with high
concurrency.

Our use case is statistical sampling, where we don't need an exact packet
count across all CPUs in the system. Thus, we implement per-CPU counters
for the nth-mode match. This replaces the implementation behind
XT_STATISTIC_MODE_NTH, to avoid having to change userspace tooling. The
atomic variant is kept and moved under the new XT_STATISTIC_MODE_NTH_ATOMIC
mode, which userspace can easily be extended to use if necessary.

Signed-off-by: Jesper Dangaard Brouer
---
 include/uapi/linux/netfilter/xt_statistic.h |  1 +
 net/netfilter/xt_statistic.c                | 37 ++++++++++++++++++++++++++-
 2 files changed, 37 insertions(+), 1 deletion(-)

diff --git a/include/uapi/linux/netfilter/xt_statistic.h b/include/uapi/linux/netfilter/xt_statistic.h
index bbce6fcb26e3..f399dd27ff61 100644
--- a/include/uapi/linux/netfilter/xt_statistic.h
+++ b/include/uapi/linux/netfilter/xt_statistic.h
@@ -7,6 +7,7 @@ enum xt_statistic_mode {
 	XT_STATISTIC_MODE_RANDOM,
 	XT_STATISTIC_MODE_NTH,
+	XT_STATISTIC_MODE_NTH_ATOMIC,
 	__XT_STATISTIC_MODE_MAX
 };
 #define XT_STATISTIC_MODE_MAX (__XT_STATISTIC_MODE_MAX - 1)

diff --git a/net/netfilter/xt_statistic.c b/net/netfilter/xt_statistic.c
index d352c171f24d..165bff0a76e5 100644
--- a/net/netfilter/xt_statistic.c
+++ b/net/netfilter/xt_statistic.c
@@ -17,6 +17,7 @@

 struct xt_statistic_priv {
 	atomic_t count;
+	u32 __percpu *cnt_pcpu;
 } ____cacheline_aligned_in_smp;

 MODULE_LICENSE("GPL");
@@ -63,6 +64,21 @@ statistic_mt(const struct sk_buff *skb, struct xt_action_param *par)
 			ret = !ret;
 		break;
 	case XT_STATISTIC_MODE_NTH:
+		pkt_cnt = gso_pkt_cnt(skb);
+		do {
+			match = false;
+			oval = this_cpu_read(*priv->cnt_pcpu);
+			nval = oval + pkt_cnt;
+			if (nval > info->u.nth.every) {
+				match = true;
+				nval = nval - info->u.nth.every - 1;
+				nval = min(nval, info->u.nth.every);
+			}
+		} while (this_cpu_cmpxchg(*priv->cnt_pcpu, oval, nval) != oval);
+		if (match)
+			ret = !ret;
+		break;
+	case XT_STATISTIC_MODE_NTH_ATOMIC:
 		pkt_cnt = gso_pkt_cnt(skb);
 		do {
 			match = false;
@@ -85,6 +101,10 @@ statistic_mt(const struct sk_buff *skb, struct xt_action_param *par)
 static int statistic_mt_check(const struct xt_mtchk_param *par)
 {
 	struct xt_statistic_info *info = par->matchinfo;
+	struct xt_statistic_priv *priv;
+	u32 *this_cpu;
+	u32 nth_count;
+	int cpu;

 	if (info->mode > XT_STATISTIC_MODE_MAX ||
 	    info->flags & ~XT_STATISTIC_MASK)
@@ -93,7 +113,21 @@ static int statistic_mt_check(const struct xt_mtchk_param *par)
 	info->master = kzalloc(sizeof(*info->master), GFP_KERNEL);
 	if (info->master == NULL)
 		return -ENOMEM;
-	atomic_set(&info->master->count, info->u.nth.count);
+	priv = info->master;
+
+	priv->cnt_pcpu = alloc_percpu(u32);
+	if (!priv->cnt_pcpu) {
+		kfree(priv);
+		return -ENOMEM;
+	}
+
+	/* Userspace specifies start nth.count value */
+	nth_count = info->u.nth.count;
+	for_each_possible_cpu(cpu) {
+		this_cpu = per_cpu_ptr(priv->cnt_pcpu, cpu);
+		(*this_cpu) = nth_count;
+	}
+	atomic_set(&priv->count, nth_count);

 	return 0;
 }
@@ -102,6 +136,7 @@ static void statistic_mt_destroy(const struct xt_mtdtor_param *par)
 {
 	const struct xt_statistic_info *info = par->matchinfo;

+	free_percpu(info->master->cnt_pcpu);
 	kfree(info->master);
 }
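
Note (not part of the patch): an nth-mode sampling rule is typically written
along the lines of '-m statistic --mode nth --every 10000 --packet 0'. For
anyone who wants to poke at the wrap arithmetic used in the new per-CPU
XT_STATISTIC_MODE_NTH branch outside the kernel, below is a minimal userspace
C sketch. The names here (nth_match(), cnt, every) are made up for
illustration, a plain uint32_t stands in for the per-CPU counter, and the
this_cpu_read()/this_cpu_cmpxchg() retry loop and the GSO packet count are
deliberately left out, so it only shows how the counter wraps and when a
packet matches:

/*
 * Hypothetical standalone sketch of the nth-counter arithmetic above.
 * NOT part of the patch; single-threaded, no per-CPU or cmpxchg handling.
 */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

static uint32_t cnt;	/* stands in for *priv->cnt_pcpu on one CPU */

static bool nth_match(uint32_t every, uint32_t pkt_cnt)
{
	uint32_t nval = cnt + pkt_cnt;
	bool match = false;

	if (nval > every) {
		match = true;
		nval = nval - every - 1;
		if (nval > every)	/* min(nval, every) */
			nval = every;
	}
	cnt = nval;
	return match;
}

int main(void)
{
	uint32_t every = 10;	/* small value so the output is easy to follow */
	uint32_t i, matches = 0;

	for (i = 1; i <= 100; i++)
		if (nth_match(every, 1))	/* non-GSO: one packet per skb */
			matches++;

	printf("matched %u of 100 packets (every=%u)\n", matches, every);
	return 0;
}

With this arithmetic a match fires once per (every + 1) packets on a given
CPU; the per-CPU version keeps that local behavior while dropping the
cross-CPU atomic cmpxchg.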