The current implementation of rps_record_sock_flow() updates the flow
table every time a socket is processed on a different CPU. In high-load
scenarios, especially with Accelerated RFS (ARFS), this triggers
frequent flow steering updates via ndo_rx_flow_steer.

For drivers like mlx5 that implement hardware flow steering, these
constant updates lead to significant contention on internal driver
locks (e.g., arfs_lock). This contention often becomes a performance
bottleneck that outweighs the steering benefits.

This patch introduces a cache-aware update strategy: the flow record is
only updated if the flow migrates across Last Level Cache (LLC)
boundaries. This minimizes expensive hardware reconfigurations while
preserving cache locality for the application.

Signed-off-by: Chuang Wang
---
 include/net/rps.h | 17 +--------------
 net/core/dev.c    | 54 +++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 55 insertions(+), 16 deletions(-)

diff --git a/include/net/rps.h b/include/net/rps.h
index e33c6a2fa8bb..2cd8698a79d5 100644
--- a/include/net/rps.h
+++ b/include/net/rps.h
@@ -55,22 +55,7 @@ struct rps_sock_flow_table {
 
 #define RPS_NO_CPU 0xffff
 
-static inline void rps_record_sock_flow(rps_tag_ptr tag_ptr, u32 hash)
-{
-	unsigned int index = hash & rps_tag_to_mask(tag_ptr);
-	u32 val = hash & ~net_hotdata.rps_cpu_mask;
-	struct rps_sock_flow_table *table;
-
-	/* We only give a hint, preemption can change CPU under us */
-	val |= raw_smp_processor_id();
-
-	table = rps_tag_to_table(tag_ptr);
-	/* The following WRITE_ONCE() is paired with the READ_ONCE()
-	 * here, and another one in get_rps_cpu().
-	 */
-	if (READ_ONCE(table[index].ent) != val)
-		WRITE_ONCE(table[index].ent, val);
-}
+void rps_record_sock_flow(rps_tag_ptr tag_ptr, u32 hash);
 
 static inline void _sock_rps_record_flow_hash(__u32 hash)
 {
diff --git a/net/core/dev.c b/net/core/dev.c
index 203dc36aaed5..770cfb6fe06b 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -5175,6 +5175,60 @@ static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,
 	return cpu;
 }
 
+/**
+ * rps_record_cond - Determine if RPS flow table should be updated
+ * @old_val: Previous flow record value
+ * @new_val: Target flow record value
+ *
+ * Return: true if the record needs an update.
+ */
+static inline bool rps_record_cond(u32 old_val, u32 new_val)
+{
+	u32 old_cpu = old_val & net_hotdata.rps_cpu_mask;
+	u32 new_cpu = new_val & net_hotdata.rps_cpu_mask;
+
+	if (old_val == new_val)
+		return false;
+
+	/* Force update if the recorded CPU is invalid or has gone offline */
+	if (old_cpu >= nr_cpu_ids || !cpu_active(old_cpu))
+		return true;
+
+	/*
+	 * Force an update if the current task is no longer permitted
+	 * to run on the old_cpu.
+	 */
+	if (!cpumask_test_cpu(old_cpu, current->cpus_ptr))
+		return true;
+
+	/*
+	 * If CPUs do not share a cache, allow the update to prevent
+	 * expensive remote memory accesses and cache misses.
+	 */
+	if (!cpus_share_cache(old_cpu, new_cpu))
+		return true;
+
+	return false;
+}
+
+void rps_record_sock_flow(rps_tag_ptr tag_ptr, u32 hash)
+{
+	unsigned int index = hash & rps_tag_to_mask(tag_ptr);
+	u32 val = hash & ~net_hotdata.rps_cpu_mask;
+	struct rps_sock_flow_table *table;
+
+	/* We only give a hint, preemption can change CPU under us */
+	val |= raw_smp_processor_id();
+
+	table = rps_tag_to_table(tag_ptr);
+	/* The following WRITE_ONCE() is paired with the READ_ONCE()
+	 * here, and another one in get_rps_cpu().
+	 */
+	if (rps_record_cond(READ_ONCE(table[index].ent), val))
+		WRITE_ONCE(table[index].ent, val);
+}
+EXPORT_SYMBOL(rps_record_sock_flow);
+
 #ifdef CONFIG_RFS_ACCEL
 /**
-- 
2.47.3