Instead of storing the @log at the beginning of rps_dev_flow_table, use 5 low order bits of the rps_tag_ptr to store the log of the size. This removes a potential cache line miss (for light traffic). This allows us to switch to one high-order allocation instead of vmalloc() when CONFIG_RFS_ACCEL is not set. Signed-off-by: Eric Dumazet <edumazet@google.com> --- include/net/netdev_rx_queue.h | 3 ++- include/net/rps.h | 4 +-- net/core/dev.c | 43 ++++++++++++++++++------------ net/core/net-sysfs.c | 49 ++++++++++++++++++++--------------- 4 files changed, 57 insertions(+), 42 deletions(-) diff --git a/include/net/netdev_rx_queue.h b/include/net/netdev_rx_queue.h index cfa72c4853876c6fcb84b5c551580d9205f7b29d..08f81329fc11dc86767f9da661be8c7194dc1da2 100644 --- a/include/net/netdev_rx_queue.h +++ b/include/net/netdev_rx_queue.h @@ -8,13 +8,14 @@ #include <linux/sysfs.h> #include <net/xdp.h> #include <net/page_pool/types.h> +#include <net/rps.h> /* This structure contains an instance of an RX queue. */ struct netdev_rx_queue { struct xdp_rxq_info xdp_rxq; #ifdef CONFIG_RPS struct rps_map __rcu *rps_map; - struct rps_dev_flow_table __rcu *rps_flow_table; + rps_tag_ptr rps_flow_table; #endif struct kobject kobj; const struct attribute_group **groups; diff --git a/include/net/rps.h b/include/net/rps.h index 98f96d6c606e63f5ac31a9cf98ce8a0fb6486ba9..d065de46148f38f395e503ea6f086a283f819489 100644 --- a/include/net/rps.h +++ b/include/net/rps.h @@ -43,11 +43,9 @@ struct rps_dev_flow { * The rps_dev_flow_table structure contains a table of flow mappings. 
*/ struct rps_dev_flow_table { - u8 log; struct rps_dev_flow flows[]; }; -#define RPS_DEV_FLOW_TABLE_SIZE(_num) (sizeof(struct rps_dev_flow_table) + \ - ((_num) * sizeof(struct rps_dev_flow))) +#define RPS_DEV_FLOW_TABLE_SIZE(_num) (offsetof(struct rps_dev_flow_table, flows[_num])) /* * The rps_sock_flow_table contains mappings of flows to the last CPU diff --git a/net/core/dev.c b/net/core/dev.c index 23cc1fc5e608310d12956286c7b0199a481f19ab..d37de5de75da84d04b7c7bc8dfa6d96f990442b2 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -4968,9 +4968,9 @@ EXPORT_SYMBOL(rps_needed); struct static_key_false rfs_needed __read_mostly; EXPORT_SYMBOL(rfs_needed); -static u32 rfs_slot(u32 hash, const struct rps_dev_flow_table *flow_table) +static u32 rfs_slot(u32 hash, rps_tag_ptr tag_ptr) { - return hash_32(hash, flow_table->log); + return hash_32(hash, rps_tag_to_log(tag_ptr)); } #ifdef CONFIG_RFS_ACCEL @@ -4986,7 +4986,7 @@ static u32 rfs_slot(u32 hash, const struct rps_dev_flow_table *flow_table) * Return: true if flow was recently active. 
*/ static bool rps_flow_is_active(struct rps_dev_flow *rflow, - struct rps_dev_flow_table *flow_table, + u8 log, unsigned int cpu) { unsigned int flow_last_active; @@ -4999,7 +4999,7 @@ static bool rps_flow_is_active(struct rps_dev_flow *rflow, flow_last_active = READ_ONCE(rflow->last_qtail); return (int)(sd_input_head - flow_last_active) < - (int)(10 << flow_table->log); + (int)(10 << log); } #endif @@ -5014,6 +5014,7 @@ set_rps_cpu(struct net_device *dev, struct sk_buff *skb, struct rps_dev_flow_table *flow_table; struct rps_dev_flow *old_rflow; struct rps_dev_flow *tmp_rflow; + rps_tag_ptr q_tag_ptr; unsigned int tmp_cpu; u16 rxq_index; u32 flow_id; @@ -5028,16 +5029,18 @@ set_rps_cpu(struct net_device *dev, struct sk_buff *skb, goto out; rxqueue = dev->_rx + rxq_index; - flow_table = rcu_dereference(rxqueue->rps_flow_table); - if (!flow_table) + q_tag_ptr = READ_ONCE(rxqueue->rps_flow_table); + if (!q_tag_ptr) goto out; - flow_id = rfs_slot(hash, flow_table); + flow_id = rfs_slot(hash, q_tag_ptr); + flow_table = rps_tag_to_table(q_tag_ptr); tmp_rflow = &flow_table->flows[flow_id]; tmp_cpu = READ_ONCE(tmp_rflow->cpu); if (READ_ONCE(tmp_rflow->filter) != RPS_NO_FILTER) { - if (rps_flow_is_active(tmp_rflow, flow_table, + if (rps_flow_is_active(tmp_rflow, + rps_tag_to_log(q_tag_ptr), tmp_cpu)) { if (hash != READ_ONCE(tmp_rflow->hash) || next_cpu == tmp_cpu) @@ -5076,8 +5079,7 @@ static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb, struct rps_dev_flow **rflowp) { struct netdev_rx_queue *rxqueue = dev->_rx; - struct rps_dev_flow_table *flow_table; - rps_tag_ptr global_tag_ptr; + rps_tag_ptr global_tag_ptr, q_tag_ptr; struct rps_map *map; int cpu = -1; u32 tcpu; @@ -5098,9 +5100,9 @@ static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb, /* Avoid computing hash if RFS/RPS is not active for this rxqueue */ - flow_table = rcu_dereference(rxqueue->rps_flow_table); + q_tag_ptr = READ_ONCE(rxqueue->rps_flow_table); map = 
rcu_dereference(rxqueue->rps_map); - if (!flow_table && !map) + if (!q_tag_ptr && !map) goto done; skb_reset_network_header(skb); @@ -5109,8 +5111,9 @@ static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb, goto done; global_tag_ptr = READ_ONCE(net_hotdata.rps_sock_flow_table); - if (flow_table && global_tag_ptr) { + if (q_tag_ptr && global_tag_ptr) { struct rps_sock_flow_table *sock_flow_table; + struct rps_dev_flow_table *flow_table; struct rps_dev_flow *rflow; u32 next_cpu; u32 flow_id; @@ -5130,7 +5133,9 @@ static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb, /* OK, now we know there is a match, * we can look at the local (per receive queue) flow table */ - rflow = &flow_table->flows[rfs_slot(hash, flow_table)]; + flow_id = rfs_slot(hash, q_tag_ptr); + flow_table = rps_tag_to_table(q_tag_ptr); + rflow = &flow_table->flows[flow_id]; tcpu = rflow->cpu; /* @@ -5192,17 +5197,21 @@ bool rps_may_expire_flow(struct net_device *dev, u16 rxq_index, struct netdev_rx_queue *rxqueue = dev->_rx + rxq_index; struct rps_dev_flow_table *flow_table; struct rps_dev_flow *rflow; + rps_tag_ptr q_tag_ptr; bool expire = true; + u8 log; rcu_read_lock(); - flow_table = rcu_dereference(rxqueue->rps_flow_table); - if (flow_table && flow_id < (1UL << flow_table->log)) { + q_tag_ptr = READ_ONCE(rxqueue->rps_flow_table); + log = rps_tag_to_log(q_tag_ptr); + if (q_tag_ptr && flow_id < (1UL << log)) { unsigned int cpu; + flow_table = rps_tag_to_table(q_tag_ptr); rflow = &flow_table->flows[flow_id]; cpu = READ_ONCE(rflow->cpu); if (READ_ONCE(rflow->filter) == filter_id && - rps_flow_is_active(rflow, flow_table, cpu)) + rps_flow_is_active(rflow, log, cpu)) expire = false; } rcu_read_unlock(); diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c index fd6f81930bc6437957f32206c84db87ee242fede..77950dd8de55802383cddc74fd74e14c7cc4a02a 100644 --- a/net/core/net-sysfs.c +++ b/net/core/net-sysfs.c @@ -1060,14 +1060,12 @@ static ssize_t store_rps_map(struct 
netdev_rx_queue *queue, static ssize_t show_rps_dev_flow_table_cnt(struct netdev_rx_queue *queue, char *buf) { - struct rps_dev_flow_table *flow_table; unsigned long val = 0; + rps_tag_ptr tag_ptr; - rcu_read_lock(); - flow_table = rcu_dereference(queue->rps_flow_table); - if (flow_table) - val = 1UL << flow_table->log; - rcu_read_unlock(); + tag_ptr = READ_ONCE(queue->rps_flow_table); + if (tag_ptr) + val = 1UL << rps_tag_to_log(tag_ptr); return sysfs_emit(buf, "%lu\n", val); } @@ -1075,8 +1073,10 @@ static ssize_t show_rps_dev_flow_table_cnt(struct netdev_rx_queue *queue, static ssize_t store_rps_dev_flow_table_cnt(struct netdev_rx_queue *queue, const char *buf, size_t len) { + struct rps_dev_flow_table *table; + rps_tag_ptr otag, tag_ptr = 0UL; unsigned long mask, count; - struct rps_dev_flow_table *table, *old_table; + size_t sz; int rc; if (!capable(CAP_NET_ADMIN)) @@ -1107,24 +1107,31 @@ static ssize_t store_rps_dev_flow_table_cnt(struct netdev_rx_queue *queue, return -EINVAL; } #endif - table = vmalloc(RPS_DEV_FLOW_TABLE_SIZE(mask + 1)); + sz = max_t(size_t, RPS_DEV_FLOW_TABLE_SIZE(mask + 1), + PAGE_SIZE); + if (sz <= (PAGE_SIZE << PAGE_ALLOC_COSTLY_ORDER) || + is_power_of_2(sz)) + table = kvmalloc(sz, GFP_KERNEL); + else + table = vmalloc(sz); if (!table) return -ENOMEM; - - table->log = ilog2(mask) + 1; + tag_ptr = (rps_tag_ptr)table; + if (rps_tag_to_log(tag_ptr)) { + pr_err_once("store_rps_dev_flow_table_cnt() got a non page aligned allocation.\n"); + kvfree(table); + return -ENOMEM; + } + tag_ptr |= (ilog2(mask) + 1); for (count = 0; count <= mask; count++) { table->flows[count].cpu = RPS_NO_CPU; table->flows[count].filter = RPS_NO_FILTER; } - } else { - table = NULL; } - old_table = unrcu_pointer(xchg(&queue->rps_flow_table, - RCU_INITIALIZER(table))); - - if (old_table) - kvfree_rcu_mightsleep(old_table); + otag = xchg(&queue->rps_flow_table, tag_ptr); + if (otag) + kvfree_rcu_mightsleep(rps_tag_to_table(otag)); return len; } @@ -1150,7 +1157,7 @@ 
static void rx_queue_release(struct kobject *kobj) { struct netdev_rx_queue *queue = to_rx_queue(kobj); #ifdef CONFIG_RPS - struct rps_dev_flow_table *old_table; + rps_tag_ptr tag_ptr; struct rps_map *map; map = rcu_dereference_protected(queue->rps_map, 1); @@ -1159,9 +1166,9 @@ static void rx_queue_release(struct kobject *kobj) kfree_rcu(map, rcu); } - old_table = unrcu_pointer(xchg(&queue->rps_flow_table, NULL)); - if (old_table) - kvfree_rcu_mightsleep(old_table); + tag_ptr = xchg(&queue->rps_flow_table, 0UL); + if (tag_ptr) + kvfree_rcu_mightsleep(rps_tag_to_table(tag_ptr)); #endif memset(kobj, 0, sizeof(*kobj)); -- 2.53.0.473.g4a7958ca14-goog