The iptables statistic nth mode is documented to match one packet every nth packets. When packets gets GRO/GSO aggregated before traversing the statistic nth match, then they get accounted as a single packet. This patch takes into account the number of packet frags a GRO/GSO packet contains for the xt_statistic match. Signed-off-by: Jesper Dangaard Brouer --- net/netfilter/xt_statistic.c | 42 +++++++++++++++++++++++++++++++++++++----- 1 file changed, 37 insertions(+), 5 deletions(-) diff --git a/net/netfilter/xt_statistic.c b/net/netfilter/xt_statistic.c index b26c1dcfc27b..d352c171f24d 100644 --- a/net/netfilter/xt_statistic.c +++ b/net/netfilter/xt_statistic.c @@ -25,12 +25,37 @@ MODULE_DESCRIPTION("Xtables: statistics-based matching (\"Nth\", random)"); MODULE_ALIAS("ipt_statistic"); MODULE_ALIAS("ip6t_statistic"); +static int gso_pkt_cnt(const struct sk_buff *skb) +{ + int pkt_cnt = 1; + + if (!skb_is_gso(skb)) + return pkt_cnt; + + /* GSO packets contain many smaller packets. This makes the probability + * incorrect, when wanting the probability to be per packet based. + */ + if (skb_has_frag_list(skb)) { + struct sk_buff *iter; + + skb_walk_frags(skb, iter) + pkt_cnt++; + } else { + pkt_cnt += skb_shinfo(skb)->nr_frags; + } + + return pkt_cnt; +} + static bool statistic_mt(const struct sk_buff *skb, struct xt_action_param *par) { const struct xt_statistic_info *info = par->matchinfo; + struct xt_statistic_priv *priv = info->master; bool ret = info->flags & XT_STATISTIC_INVERT; - int nval, oval; + u32 nval, oval; + int pkt_cnt; + bool match; switch (info->mode) { case XT_STATISTIC_MODE_RANDOM: @@ -38,11 +63,18 @@ statistic_mt(const struct sk_buff *skb, struct xt_action_param *par) ret = !ret; break; case XT_STATISTIC_MODE_NTH: + pkt_cnt = gso_pkt_cnt(skb); do { - oval = atomic_read(&info->master->count); - nval = (oval == info->u.nth.every) ? 0 : oval + 1; - } while (atomic_cmpxchg(&info->master->count, oval, nval) != oval); - if (nval == 0) + match = false; + oval = atomic_read(&priv->count); + nval = oval + pkt_cnt; + if (nval > info->u.nth.every) { + match = true; + nval = nval - info->u.nth.every - 1; + nval = min(nval, info->u.nth.every); + } + } while (atomic_cmpxchg(&priv->count, oval, nval) != oval); + if (match) ret = !ret; break; } The atomic cmpxchg operations for the nth-mode matching is a scaling concern, on our production servers with 192 CPUs. The iptables rules that does sampling of every 10000 packets exists on INPUT and OUTPUT chains. Thus, these nth-counter rules are hit for every packets on the system with high concurrency. Our use-case is statistical sampling, where we don't need an accurate packet across all CPUs in the system. Thus, we implement per-CPU counters for the nth-mode match. This replaces the XT_STATISTIC_MODE_NTH, to avoid having to change userspace tooling. We keep and move atomic variant under XT_STATISTIC_MODE_NTH_ATOMIC mode, which userspace can easily be extended to leverage if this is necessary. Signed-off-by: Jesper Dangaard Brouer --- include/uapi/linux/netfilter/xt_statistic.h | 1 + net/netfilter/xt_statistic.c | 37 ++++++++++++++++++++++++++- 2 files changed, 37 insertions(+), 1 deletion(-) diff --git a/include/uapi/linux/netfilter/xt_statistic.h b/include/uapi/linux/netfilter/xt_statistic.h index bbce6fcb26e3..f399dd27ff61 100644 --- a/include/uapi/linux/netfilter/xt_statistic.h +++ b/include/uapi/linux/netfilter/xt_statistic.h @@ -7,6 +7,7 @@ enum xt_statistic_mode { XT_STATISTIC_MODE_RANDOM, XT_STATISTIC_MODE_NTH, + XT_STATISTIC_MODE_NTH_ATOMIC, __XT_STATISTIC_MODE_MAX }; #define XT_STATISTIC_MODE_MAX (__XT_STATISTIC_MODE_MAX - 1) diff --git a/net/netfilter/xt_statistic.c b/net/netfilter/xt_statistic.c index d352c171f24d..165bff0a76e5 100644 --- a/net/netfilter/xt_statistic.c +++ b/net/netfilter/xt_statistic.c @@ -17,6 +17,7 @@ struct xt_statistic_priv { atomic_t count; + u32 __percpu *cnt_pcpu; } ____cacheline_aligned_in_smp; MODULE_LICENSE("GPL"); @@ -63,6 +64,21 @@ statistic_mt(const struct sk_buff *skb, struct xt_action_param *par) ret = !ret; break; case XT_STATISTIC_MODE_NTH: + pkt_cnt = gso_pkt_cnt(skb); + do { + match = false; + oval = this_cpu_read(*priv->cnt_pcpu); + nval = oval + pkt_cnt; + if (nval > info->u.nth.every) { + match = true; + nval = nval - info->u.nth.every - 1; + nval = min(nval, info->u.nth.every); + } + } while (this_cpu_cmpxchg(*priv->cnt_pcpu, oval, nval) != oval); + if (match) + ret = !ret; + break; + case XT_STATISTIC_MODE_NTH_ATOMIC: pkt_cnt = gso_pkt_cnt(skb); do { match = false; @@ -85,6 +101,10 @@ statistic_mt(const struct sk_buff *skb, struct xt_action_param *par) static int statistic_mt_check(const struct xt_mtchk_param *par) { struct xt_statistic_info *info = par->matchinfo; + struct xt_statistic_priv *priv; + u32 *this_cpu; + u32 nth_count; + int cpu; if (info->mode > XT_STATISTIC_MODE_MAX || info->flags & ~XT_STATISTIC_MASK) @@ -93,7 +113,21 @@ static int statistic_mt_check(const struct xt_mtchk_param *par) info->master = kzalloc(sizeof(*info->master), GFP_KERNEL); if (info->master == NULL) return -ENOMEM; - atomic_set(&info->master->count, info->u.nth.count); + priv = info->master; + + priv->cnt_pcpu = alloc_percpu(u32); + if (!priv->cnt_pcpu) { + kfree(priv); + return -ENOMEM; + } + + /* Userspace specifies start nth.count value */ + nth_count = info->u.nth.count; + for_each_possible_cpu(cpu) { + this_cpu = per_cpu_ptr(priv->cnt_pcpu, cpu); + (*this_cpu) = nth_count; + } + atomic_set(&priv->count, nth_count); return 0; } @@ -102,6 +136,7 @@ static void statistic_mt_destroy(const struct xt_mtdtor_param *par) { const struct xt_statistic_info *info = par->matchinfo; + free_percpu(info->master->cnt_pcpu); kfree(info->master); } This is not for upstream comsumption. Used this while developing the patch to valid the two cases was exercised. Nacked-by: Jesper Dangaard Brouer --- net/netfilter/xt_statistic.c | 23 +++++++++++++++++++---- 1 file changed, 19 insertions(+), 4 deletions(-) diff --git a/net/netfilter/xt_statistic.c b/net/netfilter/xt_statistic.c index 165bff0a76e5..016669a71f2a 100644 --- a/net/netfilter/xt_statistic.c +++ b/net/netfilter/xt_statistic.c @@ -4,6 +4,7 @@ * * Based on ipt_random and ipt_nth by Fabrice MARIE . */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include #include @@ -26,7 +27,12 @@ MODULE_DESCRIPTION("Xtables: statistics-based matching (\"Nth\", random)"); MODULE_ALIAS("ipt_statistic"); MODULE_ALIAS("ip6t_statistic"); -static int gso_pkt_cnt(const struct sk_buff *skb) +enum gso_type { + SKB_GSO_FRAGS_LIST, + SKB_GSO_FRAGS_ARRAY +}; + +static int gso_pkt_cnt(const struct sk_buff *skb, enum gso_type *type) { int pkt_cnt = 1; @@ -39,9 +45,11 @@ static int gso_pkt_cnt(const struct sk_buff *skb) if (skb_has_frag_list(skb)) { struct sk_buff *iter; + *type = SKB_GSO_FRAGS_LIST; skb_walk_frags(skb, iter) pkt_cnt++; } else { + *type = SKB_GSO_FRAGS_ARRAY; pkt_cnt += skb_shinfo(skb)->nr_frags; } @@ -54,9 +62,10 @@ statistic_mt(const struct sk_buff *skb, struct xt_action_param *par) const struct xt_statistic_info *info = par->matchinfo; struct xt_statistic_priv *priv = info->master; bool ret = info->flags & XT_STATISTIC_INVERT; + enum gso_type gso_type; + bool match = false; u32 nval, oval; int pkt_cnt; - bool match; switch (info->mode) { case XT_STATISTIC_MODE_RANDOM: @@ -64,7 +73,7 @@ statistic_mt(const struct sk_buff *skb, struct xt_action_param *par) ret = !ret; break; case XT_STATISTIC_MODE_NTH: - pkt_cnt = gso_pkt_cnt(skb); + pkt_cnt = gso_pkt_cnt(skb, &gso_type); do { match = false; oval = this_cpu_read(*priv->cnt_pcpu); @@ -79,7 +88,7 @@ statistic_mt(const struct sk_buff *skb, struct xt_action_param *par) ret = !ret; break; case XT_STATISTIC_MODE_NTH_ATOMIC: - pkt_cnt = gso_pkt_cnt(skb); + pkt_cnt = gso_pkt_cnt(skb, &gso_type); do { match = false; oval = atomic_read(&priv->count); @@ -95,6 +104,10 @@ statistic_mt(const struct sk_buff *skb, struct xt_action_param *par) break; } + if (match) + pr_info("debug XXX: SKB is GRO type:%d contains %d packets\n", + gso_type, pkt_cnt); + return ret; } @@ -154,11 +167,13 @@ static struct xt_match xt_statistic_mt_reg __read_mostly = { static int __init statistic_mt_init(void) { + pr_info("module init\n"); return xt_register_match(&xt_statistic_mt_reg); } static void __exit statistic_mt_exit(void) { + pr_info("module exit\n"); xt_unregister_match(&xt_statistic_mt_reg); }