Instead of storing @log at the beginning of struct rps_dev_flow_table, use the 5 low-order bits of the rps_tag_ptr to store the log2 of the table size. This removes a potential cache line miss (for light traffic). This allows us to switch to one high-order allocation instead of vmalloc() when CONFIG_RFS_ACCEL is not set. Signed-off-by: Eric Dumazet --- include/net/netdev_rx_queue.h | 3 +- include/net/rps.h | 10 ----- net/core/dev.c | 53 +++++++++++++++----------- net/core/net-sysfs.c | 70 +++++++++++++++++------------------ 4 files changed, 67 insertions(+), 69 deletions(-) diff --git a/include/net/netdev_rx_queue.h b/include/net/netdev_rx_queue.h index cfa72c4853876c6fcb84b5c551580d9205f7b29d..08f81329fc11dc86767f9da661be8c7194dc1da2 100644 --- a/include/net/netdev_rx_queue.h +++ b/include/net/netdev_rx_queue.h @@ -8,13 +8,14 @@ #include #include #include +#include /* This structure contains an instance of an RX queue. */ struct netdev_rx_queue { struct xdp_rxq_info xdp_rxq; #ifdef CONFIG_RPS struct rps_map __rcu *rps_map; - struct rps_dev_flow_table __rcu *rps_flow_table; + rps_tag_ptr rps_flow_table; #endif struct kobject kobj; const struct attribute_group **groups; diff --git a/include/net/rps.h b/include/net/rps.h index e900480e828b487c721b3ef392f4abb427ad442c..e33c6a2fa8bbca3555ecccbbf9132d01cc433c36 100644 --- a/include/net/rps.h +++ b/include/net/rps.h @@ -39,16 +39,6 @@ struct rps_dev_flow { }; #define RPS_NO_FILTER 0xffff -/* - * The rps_dev_flow_table structure contains a table of flow mappings. - */ -struct rps_dev_flow_table { - u8 log; - struct rps_dev_flow flows[]; -}; -#define RPS_DEV_FLOW_TABLE_SIZE(_num) (sizeof(struct rps_dev_flow_table) + \ - ((_num) * sizeof(struct rps_dev_flow))) - /* * The rps_sock_flow_table contains mappings of flows to the last CPU * on which they were processed by the application (set in recvmsg). 
diff --git a/net/core/dev.c b/net/core/dev.c index d4837b058b2ff02e94f9590e310edbcb06dad0f2..053a30a8c0ea4464d3b61c7dde8ad916eeef1c19 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -4968,16 +4968,16 @@ EXPORT_SYMBOL(rps_needed); struct static_key_false rfs_needed __read_mostly; EXPORT_SYMBOL(rfs_needed); -static u32 rfs_slot(u32 hash, const struct rps_dev_flow_table *flow_table) +static u32 rfs_slot(u32 hash, rps_tag_ptr tag_ptr) { - return hash_32(hash, flow_table->log); + return hash_32(hash, rps_tag_to_log(tag_ptr)); } #ifdef CONFIG_RFS_ACCEL /** * rps_flow_is_active - check whether the flow is recently active. * @rflow: Specific flow to check activity. - * @flow_table: per-queue flowtable that @rflow belongs to. + * @log: ilog2(hashsize). * @cpu: CPU saved in @rflow. * * If the CPU has processed many packets since the flow's last activity @@ -4986,7 +4986,7 @@ static u32 rfs_slot(u32 hash, const struct rps_dev_flow_table *flow_table) * Return: true if flow was recently active. */ static bool rps_flow_is_active(struct rps_dev_flow *rflow, - struct rps_dev_flow_table *flow_table, + u8 log, unsigned int cpu) { unsigned int flow_last_active; @@ -4999,7 +4999,7 @@ static bool rps_flow_is_active(struct rps_dev_flow *rflow, flow_last_active = READ_ONCE(rflow->last_qtail); return (int)(sd_input_head - flow_last_active) < - (int)(10 << flow_table->log); + (int)(10 << log); } #endif @@ -5011,9 +5011,10 @@ set_rps_cpu(struct net_device *dev, struct sk_buff *skb, u32 head; #ifdef CONFIG_RFS_ACCEL struct netdev_rx_queue *rxqueue; - struct rps_dev_flow_table *flow_table; + struct rps_dev_flow *flow_table; struct rps_dev_flow *old_rflow; struct rps_dev_flow *tmp_rflow; + rps_tag_ptr q_tag_ptr; unsigned int tmp_cpu; u16 rxq_index; u32 flow_id; @@ -5028,16 +5029,18 @@ set_rps_cpu(struct net_device *dev, struct sk_buff *skb, goto out; rxqueue = dev->_rx + rxq_index; - flow_table = rcu_dereference(rxqueue->rps_flow_table); - if (!flow_table) + q_tag_ptr = 
READ_ONCE(rxqueue->rps_flow_table); + if (!q_tag_ptr) goto out; - flow_id = rfs_slot(hash, flow_table); - tmp_rflow = &flow_table->flows[flow_id]; + flow_id = rfs_slot(hash, q_tag_ptr); + flow_table = rps_tag_to_table(q_tag_ptr); + tmp_rflow = flow_table + flow_id; tmp_cpu = READ_ONCE(tmp_rflow->cpu); if (READ_ONCE(tmp_rflow->filter) != RPS_NO_FILTER) { - if (rps_flow_is_active(tmp_rflow, flow_table, + if (rps_flow_is_active(tmp_rflow, + rps_tag_to_log(q_tag_ptr), tmp_cpu)) { if (hash != READ_ONCE(tmp_rflow->hash) || next_cpu == tmp_cpu) @@ -5076,8 +5079,7 @@ static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb, struct rps_dev_flow **rflowp) { struct netdev_rx_queue *rxqueue = dev->_rx; - struct rps_dev_flow_table *flow_table; - rps_tag_ptr global_tag_ptr; + rps_tag_ptr global_tag_ptr, q_tag_ptr; struct rps_map *map; int cpu = -1; u32 tcpu; @@ -5098,9 +5100,9 @@ static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb, /* Avoid computing hash if RFS/RPS is not active for this rxqueue */ - flow_table = rcu_dereference(rxqueue->rps_flow_table); + q_tag_ptr = READ_ONCE(rxqueue->rps_flow_table); map = rcu_dereference(rxqueue->rps_map); - if (!flow_table && !map) + if (!q_tag_ptr && !map) goto done; skb_reset_network_header(skb); @@ -5109,8 +5111,9 @@ static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb, goto done; global_tag_ptr = READ_ONCE(net_hotdata.rps_sock_flow_table); - if (flow_table && global_tag_ptr) { + if (q_tag_ptr && global_tag_ptr) { struct rps_sock_flow_table *sock_flow_table; + struct rps_dev_flow *flow_table; struct rps_dev_flow *rflow; u32 next_cpu; u32 flow_id; @@ -5130,7 +5133,9 @@ static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb, /* OK, now we know there is a match, * we can look at the local (per receive queue) flow table */ - rflow = &flow_table->flows[rfs_slot(hash, flow_table)]; + flow_id = rfs_slot(hash, q_tag_ptr); + flow_table = rps_tag_to_table(q_tag_ptr); + rflow = flow_table + 
flow_id; tcpu = rflow->cpu; /* @@ -5190,19 +5195,23 @@ bool rps_may_expire_flow(struct net_device *dev, u16 rxq_index, u32 flow_id, u16 filter_id) { struct netdev_rx_queue *rxqueue = dev->_rx + rxq_index; - struct rps_dev_flow_table *flow_table; + struct rps_dev_flow *flow_table; struct rps_dev_flow *rflow; + rps_tag_ptr q_tag_ptr; bool expire = true; + u8 log; rcu_read_lock(); - flow_table = rcu_dereference(rxqueue->rps_flow_table); - if (flow_table && flow_id < (1UL << flow_table->log)) { + q_tag_ptr = READ_ONCE(rxqueue->rps_flow_table); + log = rps_tag_to_log(q_tag_ptr); + if (q_tag_ptr && flow_id < (1UL << log)) { unsigned int cpu; - rflow = &flow_table->flows[flow_id]; + flow_table = rps_tag_to_table(q_tag_ptr); + rflow = flow_table + flow_id; cpu = READ_ONCE(rflow->cpu); if (READ_ONCE(rflow->filter) == filter_id && - rps_flow_is_active(rflow, flow_table, cpu)) + rps_flow_is_active(rflow, log, cpu)) expire = false; } rcu_read_unlock(); diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c index fd6f81930bc6437957f32206c84db87ee242fede..2ce011fae2490b3bd950cf8d9089e7d71cc0fd7a 100644 --- a/net/core/net-sysfs.c +++ b/net/core/net-sysfs.c @@ -1060,14 +1060,12 @@ static ssize_t store_rps_map(struct netdev_rx_queue *queue, static ssize_t show_rps_dev_flow_table_cnt(struct netdev_rx_queue *queue, char *buf) { - struct rps_dev_flow_table *flow_table; unsigned long val = 0; + rps_tag_ptr tag_ptr; - rcu_read_lock(); - flow_table = rcu_dereference(queue->rps_flow_table); - if (flow_table) - val = 1UL << flow_table->log; - rcu_read_unlock(); + tag_ptr = READ_ONCE(queue->rps_flow_table); + if (tag_ptr) + val = 1UL << rps_tag_to_log(tag_ptr); return sysfs_emit(buf, "%lu\n", val); } @@ -1075,8 +1073,10 @@ static ssize_t show_rps_dev_flow_table_cnt(struct netdev_rx_queue *queue, static ssize_t store_rps_dev_flow_table_cnt(struct netdev_rx_queue *queue, const char *buf, size_t len) { + rps_tag_ptr otag, tag_ptr = 0UL; + struct rps_dev_flow *table; unsigned long mask, 
count; - struct rps_dev_flow_table *table, *old_table; + size_t sz; int rc; if (!capable(CAP_NET_ADMIN)) @@ -1093,38 +1093,36 @@ static ssize_t store_rps_dev_flow_table_cnt(struct netdev_rx_queue *queue, */ while ((mask | (mask >> 1)) != mask) mask |= (mask >> 1); - /* On 64 bit arches, must check mask fits in table->mask (u32), - * and on 32bit arches, must check - * RPS_DEV_FLOW_TABLE_SIZE(mask + 1) doesn't overflow. - */ -#if BITS_PER_LONG > 32 - if (mask > (unsigned long)(u32)mask) - return -EINVAL; -#else - if (mask > (ULONG_MAX - RPS_DEV_FLOW_TABLE_SIZE(1)) - / sizeof(struct rps_dev_flow)) { - /* Enforce a limit to prevent overflow */ + + /* Do not accept too large tables. */ + if (mask > (INT_MAX / sizeof(*table) - 1)) return -EINVAL; - } -#endif - table = vmalloc(RPS_DEV_FLOW_TABLE_SIZE(mask + 1)); + + sz = max_t(size_t, sizeof(*table) * (mask + 1), + PAGE_SIZE); + if (sz <= (PAGE_SIZE << PAGE_ALLOC_COSTLY_ORDER) || + is_power_of_2(sizeof(*table))) + table = kvmalloc(sz, GFP_KERNEL); + else + table = vmalloc(sz); if (!table) return -ENOMEM; - - table->log = ilog2(mask) + 1; + tag_ptr = (rps_tag_ptr)table; + if (rps_tag_to_log(tag_ptr)) { + pr_err_once("store_rps_dev_flow_table_cnt() got a non page aligned allocation.\n"); + kvfree(table); + return -ENOMEM; + } + tag_ptr |= (ilog2(mask) + 1); for (count = 0; count <= mask; count++) { - table->flows[count].cpu = RPS_NO_CPU; - table->flows[count].filter = RPS_NO_FILTER; + table[count].cpu = RPS_NO_CPU; + table[count].filter = RPS_NO_FILTER; } - } else { - table = NULL; } - old_table = unrcu_pointer(xchg(&queue->rps_flow_table, - RCU_INITIALIZER(table))); - - if (old_table) - kvfree_rcu_mightsleep(old_table); + otag = xchg(&queue->rps_flow_table, tag_ptr); + if (otag) + kvfree_rcu_mightsleep(rps_tag_to_table(otag)); return len; } @@ -1150,7 +1148,7 @@ static void rx_queue_release(struct kobject *kobj) { struct netdev_rx_queue *queue = to_rx_queue(kobj); #ifdef CONFIG_RPS - struct rps_dev_flow_table *old_table; 
+ rps_tag_ptr tag_ptr; struct rps_map *map; map = rcu_dereference_protected(queue->rps_map, 1); @@ -1159,9 +1157,9 @@ static void rx_queue_release(struct kobject *kobj) kfree_rcu(map, rcu); } - old_table = unrcu_pointer(xchg(&queue->rps_flow_table, NULL)); - if (old_table) - kvfree_rcu_mightsleep(old_table); + tag_ptr = xchg(&queue->rps_flow_table, 0UL); + if (tag_ptr) + kvfree_rcu_mightsleep(rps_tag_to_table(tag_ptr)); #endif memset(kobj, 0, sizeof(*kobj)); -- 2.53.0.473.g4a7958ca14-goog