In order to protect flow operations from RTNL contention, this patch decouples flow_table modifications from ovs_mutex by means of the following: 1 - Create a new mutex inside the flow_table that protects it from concurrent modifications. Putting the mutex inside flow_table makes it easier to consume for functions inside flow_table.c that do not currently take pointers to the datapath. Some function signatures need to be changed to accept flow_table so that lockdep checks can be performed. 2 - Create a reference count to temporarily extend rcu protection from the datapath to the flow_table. One reference is held by the datapath, the other is temporarily increased during flow modifications. Signed-off-by: Adrian Moreno --- net/openvswitch/datapath.c | 174 ++++++++++++++++++++++------------- net/openvswitch/flow_table.c | 46 +++++++-- net/openvswitch/flow_table.h | 37 +++++++- 3 files changed, 181 insertions(+), 76 deletions(-) diff --git a/net/openvswitch/datapath.c b/net/openvswitch/datapath.c index 72ad3ed12675..700ea79a324e 100644 --- a/net/openvswitch/datapath.c +++ b/net/openvswitch/datapath.c @@ -88,13 +88,17 @@ static void ovs_notify(struct genl_family *family, * DOC: Locking: * * All writes e.g. Writes to device state (add/remove datapath, port, set - * operations on vports, etc.), Writes to other state (flow table - * modifications, set miscellaneous datapath parameters, etc.) are protected - * by ovs_lock. + * operations on vports, etc.) and writes to other datapath parameters + * are protected by ovs_lock. + * + * Writes to the flow table are NOT protected by ovs_lock. Instead, a per-table + * mutex and reference count are used (see comment above "struct flow_table" + * definition). On some few occasions, the per-flow table mutex is nested + * inside ovs_mutex. * * Reads are protected by RCU. * - * There are a few special cases (mostly stats) that have their own + * There are a few other special cases (mostly stats) that have their own * synchronization but they nest under all of above and don't interact with * each other. * @@ -759,16 +763,19 @@ static struct genl_family dp_packet_genl_family __ro_after_init = { static void get_dp_stats(const struct datapath *dp, struct ovs_dp_stats *stats, struct ovs_dp_megaflow_stats *mega_stats) { - struct flow_table *table = ovsl_dereference(dp->table); + struct flow_table *table; int i; memset(mega_stats, 0, sizeof(*mega_stats)); memset(stats, 0, sizeof(*stats)); + rcu_read_lock(); + table = rcu_dereference(dp->table); if (table) { stats->n_flows = ovs_flow_tbl_count(table); mega_stats->n_masks = ovs_flow_tbl_num_masks(table); } + rcu_read_unlock(); stats->n_hit = stats->n_missed = stats->n_lost = 0; @@ -1081,17 +1088,25 @@ static int ovs_flow_cmd_new(struct sk_buff *skb, struct genl_info *info) goto err_kfree_acts; } - ovs_lock(); + rcu_read_lock(); dp = get_dp(net, ovs_header->dp_ifindex); if (unlikely(!dp)) { error = -ENODEV; - goto err_unlock_ovs; + rcu_read_unlock(); + goto err_kfree_reply; } - table = ovsl_dereference(dp->table); - if (!table) { + table = rcu_dereference(dp->table); + if (!table || !ovs_flow_tbl_get(table)) { error = -ENODEV; - goto err_unlock_ovs; + rcu_read_unlock(); + goto err_kfree_reply; } + rcu_read_unlock(); + + /* It is safe to dereference "table" after leaving rcu read-protected + * region because it's pinned by refcount. + */ + mutex_lock(&table->lock); /* Check if this is a duplicate flow */ if (ovs_identifier_is_ufid(&new_flow->id)) @@ -1105,7 +1120,7 @@ static int ovs_flow_cmd_new(struct sk_buff *skb, struct genl_info *info) error = ovs_flow_tbl_insert(table, new_flow, &mask); if (unlikely(error)) { acts = NULL; - goto err_unlock_ovs; + goto err_unlock_tbl; } if (unlikely(reply)) { @@ -1117,7 +1132,6 @@ static int ovs_flow_cmd_new(struct sk_buff *skb, struct genl_info *info) ufid_flags); BUG_ON(error < 0); } - ovs_unlock(); } else { struct sw_flow_actions *old_acts; @@ -1130,7 +1144,7 @@ static int ovs_flow_cmd_new(struct sk_buff *skb, struct genl_info *info) if (unlikely(info->nlhdr->nlmsg_flags & (NLM_F_CREATE | NLM_F_EXCL))) { error = -EEXIST; - goto err_unlock_ovs; + goto err_unlock_tbl; } /* The flow identifier has to be the same for flow updates. * Look for any overlapping flow. @@ -1143,7 +1157,7 @@ static int ovs_flow_cmd_new(struct sk_buff *skb, struct genl_info *info) flow = NULL; if (!flow) { error = -ENOENT; - goto err_unlock_ovs; + goto err_unlock_tbl; } } /* Update actions. */ @@ -1159,20 +1173,23 @@ static int ovs_flow_cmd_new(struct sk_buff *skb, struct genl_info *info) ufid_flags); BUG_ON(error < 0); } - ovs_unlock(); - ovs_nla_free_flow_actions_rcu(old_acts); ovs_flow_free(new_flow, false); } + mutex_unlock(&table->lock); + ovs_flow_tbl_put(table); + if (reply) ovs_notify(&dp_flow_genl_family, reply, info); kfree(key); return 0; -err_unlock_ovs: - ovs_unlock(); +err_unlock_tbl: + mutex_unlock(&table->lock); + ovs_flow_tbl_put(table); +err_kfree_reply: kfree_skb(reply); err_kfree_acts: ovs_nla_free_flow_actions(acts); @@ -1301,17 +1318,26 @@ static int ovs_flow_cmd_set(struct sk_buff *skb, struct genl_info *info) } } - ovs_lock(); + rcu_read_lock(); dp = get_dp(net, ovs_header->dp_ifindex); if (unlikely(!dp)) { error = -ENODEV; - goto err_unlock_ovs; + rcu_read_unlock(); + goto err_free_reply; } - table = ovsl_dereference(dp->table); - if (!table) { + table = rcu_dereference(dp->table); + if (!table || !ovs_flow_tbl_get(table)) { + rcu_read_unlock(); error = -ENODEV; - goto err_unlock_ovs; + goto err_free_reply; } + rcu_read_unlock(); + + /* It is safe to dereference "table" after leaving rcu read-protected + * region because it's pinned by refcount. + */ + mutex_lock(&table->lock); + /* Check that the flow exists. */ if (ufid_present) flow = ovs_flow_tbl_lookup_ufid(table, &sfid); @@ -1319,7 +1345,7 @@ static int ovs_flow_cmd_set(struct sk_buff *skb, struct genl_info *info) flow = ovs_flow_tbl_lookup_exact(table, &match); if (unlikely(!flow)) { error = -ENOENT; - goto err_unlock_ovs; + goto err_unlock_tbl; } /* Update actions, if present. */ @@ -1345,14 +1371,16 @@ static int ovs_flow_cmd_set(struct sk_buff *skb, struct genl_info *info) if (IS_ERR(reply)) { error = PTR_ERR(reply); - goto err_unlock_ovs; + goto err_unlock_tbl; } } /* Clear stats. */ if (a[OVS_FLOW_ATTR_CLEAR]) ovs_flow_stats_clear(flow, table); - ovs_unlock(); + + mutex_unlock(&table->lock); + ovs_flow_tbl_put(table); if (reply) ovs_notify(&dp_flow_genl_family, reply, info); @@ -1361,8 +1389,10 @@ static int ovs_flow_cmd_set(struct sk_buff *skb, struct genl_info *info) return 0; -err_unlock_ovs: - ovs_unlock(); +err_unlock_tbl: + mutex_unlock(&table->lock); + ovs_flow_tbl_put(table); +err_free_reply: kfree_skb(reply); err_kfree_acts: ovs_nla_free_flow_actions(acts); @@ -1400,17 +1430,24 @@ static int ovs_flow_cmd_get(struct sk_buff *skb, struct genl_info *info) if (err) return err; - ovs_lock(); + rcu_read_lock(); dp = get_dp(sock_net(skb->sk), ovs_header->dp_ifindex); if (!dp) { - err = -ENODEV; - goto unlock; + rcu_read_unlock(); + return -ENODEV; } - table = ovsl_dereference(dp->table); - if (!table) { - err = -ENODEV; - goto unlock; + table = rcu_dereference(dp->table); + if (!table || !ovs_flow_tbl_get(table)) { + rcu_read_unlock(); + return -ENODEV; } + rcu_read_unlock(); + + /* It is safe to dereference "table" after leaving rcu read-protected + * region because it's pinned by refcount. + */ + mutex_lock(&table->lock); + if (ufid_present) flow = ovs_flow_tbl_lookup_ufid(table, &ufid); @@ -1429,10 +1466,12 @@ static int ovs_flow_cmd_get(struct sk_buff *skb, struct genl_info *info) goto unlock; } - ovs_unlock(); + mutex_unlock(&table->lock); + ovs_flow_tbl_put(table); return genlmsg_reply(reply, info); unlock: - ovs_unlock(); + mutex_unlock(&table->lock); + ovs_flow_tbl_put(table); return err; } @@ -1462,17 +1501,24 @@ static int ovs_flow_cmd_del(struct sk_buff *skb, struct genl_info *info) return err; } - ovs_lock(); + rcu_read_lock(); dp = get_dp(sock_net(skb->sk), ovs_header->dp_ifindex); if (unlikely(!dp)) { - err = -ENODEV; - goto unlock; + rcu_read_unlock(); + return -ENODEV; } - table = ovsl_dereference(dp->table); - if (!table) { - err = -ENODEV; - goto unlock; + table = rcu_dereference(dp->table); + if (!table || !ovs_flow_tbl_get(table)) { + rcu_read_unlock(); + return -ENODEV; } + rcu_read_unlock(); + + /* It is safe to dereference "table" after leaving rcu read-protected + * region because it's pinned by refcount. + */ + mutex_lock(&table->lock); + if (unlikely(!a[OVS_FLOW_ATTR_KEY] && !ufid_present)) { err = ovs_flow_tbl_flush(table); @@ -1489,7 +1535,7 @@ static int ovs_flow_cmd_del(struct sk_buff *skb, struct genl_info *info) } ovs_flow_tbl_remove(table, flow); - ovs_unlock(); + mutex_unlock(&table->lock); reply = ovs_flow_cmd_alloc_info((const struct sw_flow_actions __force *) flow->sf_acts, &flow->id, info, false, ufid_flags); @@ -1516,10 +1562,12 @@ static int ovs_flow_cmd_del(struct sk_buff *skb, struct genl_info *info) } out_free: + ovs_flow_tbl_put(table); ovs_flow_free(flow, true); return 0; unlock: - ovs_unlock(); + mutex_unlock(&table->lock); + ovs_flow_tbl_put(table); return err; } @@ -1545,7 +1593,7 @@ static int ovs_flow_cmd_dump(struct sk_buff *skb, struct netlink_callback *cb) rcu_read_unlock(); return -ENODEV; } - table = rcu_dereference_ovsl(dp->table); + table = rcu_dereference(dp->table); if (!table) { rcu_read_unlock(); return -ENODEV; @@ -1650,10 +1698,6 @@ static int ovs_dp_cmd_fill_info(struct datapath *dp, struct sk_buff *skb, struct flow_table *table; int err, pids_len; - table = ovsl_dereference(dp->table); - if (!table) - return -ENODEV; - ovs_header = genlmsg_put(skb, portid, seq, &dp_datapath_genl_family, flags, cmd); if (!ovs_header) @@ -1678,8 +1722,12 @@ static int ovs_dp_cmd_fill_info(struct datapath *dp, struct sk_buff *skb, if (nla_put_u32(skb, OVS_DP_ATTR_USER_FEATURES, dp->user_features)) goto nla_put_failure; - if (nla_put_u32(skb, OVS_DP_ATTR_MASKS_CACHE_SIZE, - ovs_flow_tbl_masks_cache_size(table))) + rcu_read_lock(); + table = rcu_dereference(dp->table); + err = table ? nla_put_u32(skb, OVS_DP_ATTR_MASKS_CACHE_SIZE, + ovs_flow_tbl_masks_cache_size(table)) : 0; + rcu_read_unlock(); + if (err) goto nla_put_failure; if (dp->user_features & OVS_DP_F_DISPATCH_UPCALL_PER_CPU && pids) { @@ -1817,7 +1865,9 @@ static int ovs_dp_change(struct datapath *dp, struct nlattr *a[]) return -ENODEV; cache_size = nla_get_u32(a[OVS_DP_ATTR_MASKS_CACHE_SIZE]); + mutex_lock(&table->lock); err = ovs_flow_tbl_masks_cache_resize(table, cache_size); + mutex_unlock(&table->lock); if (err) return err; } @@ -1968,7 +2018,7 @@ static int ovs_dp_cmd_new(struct sk_buff *skb, struct genl_info *info) err_destroy_stats: free_percpu(dp->stats_percpu); err_destroy_table: - call_rcu(&table->rcu, ovs_flow_tbl_destroy_rcu); + ovs_flow_tbl_put(table); err_destroy_dp: kfree(dp); err_destroy_reply: @@ -2003,16 +2053,11 @@ static void __dp_destroy(struct datapath *dp) */ ovs_dp_detach_port(ovs_vport_ovsl(dp, OVSP_LOCAL)); - /* Flush sw_flow in the tables. RCU cb only releases resource - * such as dp, ports and tables. That may avoid some issues - * such as RCU usage warning. - */ - table_instance_flow_flush(table, ovsl_dereference(table->ti), - ovsl_dereference(table->ufid_ti)); + rcu_assign_pointer(dp->table, NULL); + ovs_flow_tbl_put(table); - /* RCU destroy the ports, meters and flow tables. */ + /* RCU destroy the ports and meters. */ call_rcu(&dp->rcu, destroy_dp_rcu); - call_rcu(&table->rcu, ovs_flow_tbl_destroy_rcu); } static int ovs_dp_cmd_del(struct sk_buff *skb, struct genl_info *info) @@ -2656,9 +2701,12 @@ static void ovs_dp_masks_rebalance(struct work_struct *work) ovs_lock(); list_for_each_entry(dp, &ovs_net->dps, list_node) { table = ovsl_dereference(dp->table); - if (!table) + if (!table || !ovs_flow_tbl_get(table)) continue; + mutex_lock(&table->lock); ovs_flow_masks_rebalance(table); + mutex_unlock(&table->lock); + ovs_flow_tbl_put(table); } ovs_unlock(); diff --git a/net/openvswitch/flow_table.c b/net/openvswitch/flow_table.c index 2ae168f31f50..3934873a44c3 100644 --- a/net/openvswitch/flow_table.c +++ b/net/openvswitch/flow_table.c @@ -45,6 +45,16 @@ static struct kmem_cache *flow_cache; struct kmem_cache *flow_stats_cache __read_mostly; +#ifdef CONFIG_LOCKDEP +int lockdep_ovs_tbl_is_held(const struct flow_table *table) +{ + if (debug_locks) + return lockdep_is_held(&table->lock); + else + return 1; +} +#endif + static u16 range_n_bytes(const struct sw_flow_key_range *range) { return range->end - range->start; @@ -102,7 +112,7 @@ struct sw_flow *ovs_flow_alloc(void) int ovs_flow_tbl_count(const struct flow_table *table) { - return table->count; + return READ_ONCE(table->count); } static void flow_free(struct sw_flow *flow) @@ -417,6 +427,10 @@ struct flow_table *ovs_flow_tbl_alloc(void) table = kzalloc_obj(*table, GFP_KERNEL); if (!table) return ERR_PTR(-ENOMEM); + + mutex_init(&table->lock); + refcount_set(&table->refcnt, 1); + mc = tbl_mask_cache_alloc(MC_DEFAULT_HASH_ENTRIES); if (!mc) goto free_table; @@ -449,6 +463,7 @@ struct flow_table *ovs_flow_tbl_alloc(void) free_mask_cache: __mask_cache_destroy(mc); free_table: + mutex_destroy(&table->lock); kfree(table); return ERR_PTR(-ENOMEM); } @@ -467,7 +482,7 @@ static void table_instance_flow_free(struct flow_table *table, struct sw_flow *flow) { hlist_del_rcu(&flow->flow_table.node[ti->node_ver]); - table->count--; + WRITE_ONCE(table->count, table->count - 1); if (ovs_identifier_is_ufid(&flow->id)) { hlist_del_rcu(&flow->ufid_table.node[ufid_ti->node_ver]); @@ -477,10 +492,10 @@ static void table_instance_flow_free(struct flow_table *table, flow_mask_remove(table, flow->mask); } -/* Must be called with OVS mutex held. */ -void table_instance_flow_flush(struct flow_table *table, - struct table_instance *ti, - struct table_instance *ufid_ti) +/* Must be called with table mutex held. */ +static void table_instance_flow_flush(struct flow_table *table, + struct table_instance *ti, + struct table_instance *ufid_ti) { int i; @@ -500,7 +515,7 @@ void table_instance_flow_flush(struct flow_table *table, if (WARN_ON(table->count != 0 || table->ufid_count != 0)) { - table->count = 0; + WRITE_ONCE(table->count, 0); table->ufid_count = 0; } } @@ -513,7 +528,7 @@ static void table_instance_destroy(struct table_instance *ti, } /* No need for locking this function is called from RCU callback. */ -void ovs_flow_tbl_destroy_rcu(struct rcu_head *rcu) +static void ovs_flow_tbl_destroy_rcu(struct rcu_head *rcu) { struct flow_table *table = container_of(rcu, struct flow_table, rcu); @@ -525,9 +540,22 @@ void ovs_flow_tbl_destroy_rcu(struct rcu_head *rcu) call_rcu(&mc->rcu, mask_cache_rcu_cb); call_rcu(&ma->rcu, mask_array_rcu_cb); table_instance_destroy(ti, ufid_ti); + mutex_destroy(&table->lock); kfree(table); } +void ovs_flow_tbl_put(struct flow_table *table) +{ + if (refcount_dec_and_test(&table->refcnt)) { + mutex_lock(&table->lock); + table_instance_flow_flush(table, + ovs_tbl_dereference(table->ti, table), + ovs_tbl_dereference(table->ufid_ti, table)); + mutex_unlock(&table->lock); + call_rcu(&table->rcu, ovs_flow_tbl_destroy_rcu); + } +} + struct sw_flow *ovs_flow_tbl_dump_next(struct table_instance *ti, u32 *bucket, u32 *last) { @@ -1060,7 +1088,7 @@ static void flow_key_insert(struct flow_table *table, struct sw_flow *flow) flow->flow_table.hash = flow_hash(&flow->key, &flow->mask->range); ti = ovs_tbl_dereference(table->ti, table); table_instance_insert(ti, flow); - table->count++; + WRITE_ONCE(table->count, table->count + 1); /* Expand table, if necessary, to make room. */ if (table->count > ti->n_buckets) diff --git a/net/openvswitch/flow_table.h b/net/openvswitch/flow_table.h index 3e5e9845c28a..1b5242a97813 100644 --- a/net/openvswitch/flow_table.h +++ b/net/openvswitch/flow_table.h @@ -59,7 +59,31 @@ struct table_instance { u32 hash_seed; }; +/* Locking: + * + * flow_table is _not_ protected by ovs_lock (see comment above ovs_mutex + * in datapath.c). + * + * All writes to flow_table are protected by the embedded "lock". + * In order to ensure datapath destruction does not trigger the destruction + * of the flow_table, "refcnt" is used. Therefore, writers must: + * 1 - Enter rcu read-protected section + * 2 - Increase "table->refcnt" + * 3 - Leave rcu read-protected section (to avoid using mutexes inside rcu) + * 4 - Lock "table->lock" + * 5 - Perform modifications + * 6 - Release "table->lock" + * 7 - Decrease "table->refcnt" + * + * Reads are protected by RCU. + * + * Note with this schema, it's possible that a flow operation is performed on a + * flow_table that is about to be freed. + */ struct flow_table { + /* Locks flow table writes. */ + struct mutex lock; + refcount_t refcnt; struct rcu_head rcu; struct table_instance __rcu *ti; struct table_instance __rcu *ufid_ti; @@ -72,11 +96,15 @@ struct flow_table { extern struct kmem_cache *flow_stats_cache; +#ifdef CONFIG_LOCKDEP +int lockdep_ovs_tbl_is_held(const struct flow_table *table); +#else static inline int lockdep_ovs_tbl_is_held(const struct flow_table *table __always_unused) { return 1; } +#endif #define ASSERT_OVS_TBL(tbl) WARN_ON(!lockdep_ovs_tbl_is_held(tbl)) @@ -95,7 +123,11 @@ struct sw_flow *ovs_flow_alloc(void); void ovs_flow_free(struct sw_flow *, bool deferred); struct flow_table *ovs_flow_tbl_alloc(void); -void ovs_flow_tbl_destroy_rcu(struct rcu_head *table); +void ovs_flow_tbl_put(struct flow_table *table); +static inline bool ovs_flow_tbl_get(struct flow_table *table) +{ + return refcount_inc_not_zero(&table->refcnt); +} int ovs_flow_tbl_count(const struct flow_table *table); int ovs_flow_tbl_flush(struct flow_table *flow_table); @@ -125,8 +157,5 @@ void ovs_flow_mask_key(struct sw_flow_key *dst, const struct sw_flow_key *src, bool full, const struct sw_flow_mask *mask); void ovs_flow_masks_rebalance(struct flow_table *table); -void table_instance_flow_flush(struct flow_table *table, - struct table_instance *ti, - struct table_instance *ufid_ti); #endif /* flow_table.h */ -- 2.54.0