From: Bobby Eshleman

Rename the 'tx_vec' field in struct net_devmem_dmabuf_binding to 'vec'.
This field holds pointers to net_iov structures. The rename prepares
for reusing 'vec' for both TX and RX directions.

No functional change intended.

Reviewed-by: Mina Almasry
Signed-off-by: Bobby Eshleman
---
 net/core/devmem.c | 22 +++++++++++-----------
 net/core/devmem.h |  2 +-
 2 files changed, 12 insertions(+), 12 deletions(-)

diff --git a/net/core/devmem.c b/net/core/devmem.c
index 1d04754bc756..4dee2666dd07 100644
--- a/net/core/devmem.c
+++ b/net/core/devmem.c
@@ -75,7 +75,7 @@ void __net_devmem_dmabuf_binding_free(struct work_struct *wq)
 	dma_buf_detach(binding->dmabuf, binding->attachment);
 	dma_buf_put(binding->dmabuf);
 	xa_destroy(&binding->bound_rxqs);
-	kvfree(binding->tx_vec);
+	kvfree(binding->vec);
 	kfree(binding);
 }
 
@@ -232,10 +232,10 @@ net_devmem_bind_dmabuf(struct net_device *dev,
 	}
 
 	if (direction == DMA_TO_DEVICE) {
-		binding->tx_vec = kvmalloc_array(dmabuf->size / PAGE_SIZE,
-						 sizeof(struct net_iov *),
-						 GFP_KERNEL);
-		if (!binding->tx_vec) {
+		binding->vec = kvmalloc_array(dmabuf->size / PAGE_SIZE,
+					      sizeof(struct net_iov *),
+					      GFP_KERNEL);
+		if (!binding->vec) {
 			err = -ENOMEM;
 			goto err_unmap;
 		}
@@ -249,7 +249,7 @@ net_devmem_bind_dmabuf(struct net_device *dev,
 					      dev_to_node(&dev->dev));
 	if (!binding->chunk_pool) {
 		err = -ENOMEM;
-		goto err_tx_vec;
+		goto err_vec;
 	}
 
 	virtual = 0;
@@ -295,7 +295,7 @@ net_devmem_bind_dmabuf(struct net_device *dev,
 		page_pool_set_dma_addr_netmem(net_iov_to_netmem(niov),
 					      net_devmem_get_dma_addr(niov));
 		if (direction == DMA_TO_DEVICE)
-			binding->tx_vec[owner->area.base_virtual / PAGE_SIZE + i] = niov;
+			binding->vec[owner->area.base_virtual / PAGE_SIZE + i] = niov;
 	}
 
 	virtual += len;
@@ -315,8 +315,8 @@ net_devmem_bind_dmabuf(struct net_device *dev,
 	gen_pool_for_each_chunk(binding->chunk_pool,
 				net_devmem_dmabuf_free_chunk_owner, NULL);
 	gen_pool_destroy(binding->chunk_pool);
-err_tx_vec:
-	kvfree(binding->tx_vec);
+err_vec:
+	kvfree(binding->vec);
 err_unmap:
 	dma_buf_unmap_attachment_unlocked(binding->attachment, binding->sgt,
 					  direction);
@@ -363,7 +363,7 @@ struct net_devmem_dmabuf_binding *net_devmem_get_binding(struct sock *sk,
 	int err = 0;
 
 	binding = net_devmem_lookup_dmabuf(dmabuf_id);
-	if (!binding || !binding->tx_vec) {
+	if (!binding || !binding->vec) {
 		err = -EINVAL;
 		goto out_err;
 	}
@@ -414,7 +414,7 @@ net_devmem_get_niov_at(struct net_devmem_dmabuf_binding *binding,
 	*off = virt_addr % PAGE_SIZE;
 	*size = PAGE_SIZE - *off;
 
-	return binding->tx_vec[virt_addr / PAGE_SIZE];
+	return binding->vec[virt_addr / PAGE_SIZE];
 }
 
 /*** "Dmabuf devmem memory provider" ***/
diff --git a/net/core/devmem.h b/net/core/devmem.h
index 101150d761af..2ada54fb63d7 100644
--- a/net/core/devmem.h
+++ b/net/core/devmem.h
@@ -63,7 +63,7 @@ struct net_devmem_dmabuf_binding {
 	 * address. This array is convenient to map the virtual addresses to
 	 * net_iovs in the TX path.
 	 */
-	struct net_iov **tx_vec;
+	struct net_iov **vec;
 
 	struct work_struct unbind_w;
 };
-- 
2.47.3
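[For readers new to this structure, here is a minimal userspace sketch of
the O(1) page-index lookup that binding->vec enables, mirroring the
net_devmem_get_niov_at() shape above. The types and the niov_at() helper
are simplified illustrative stand-ins, not the kernel's definitions.]

	#include <stddef.h>
	#include <stdio.h>

	#define PAGE_SIZE 4096UL

	struct fake_niov { size_t index; };	/* stand-in for struct net_iov */

	static struct fake_niov *niov_at(struct fake_niov **vec, size_t npages,
					 unsigned long virt_addr)
	{
		size_t idx = virt_addr / PAGE_SIZE;

		if (idx >= npages)
			return NULL;	/* address beyond the dmabuf */
		return vec[idx];	/* one array index, no tree walk */
	}

	int main(void)
	{
		struct fake_niov pages[4] = { {0}, {1}, {2}, {3} };
		struct fake_niov *vec[4] = {
			&pages[0], &pages[1], &pages[2], &pages[3]
		};
		struct fake_niov *n = niov_at(vec, 4, 2 * PAGE_SIZE + 123);

		printf("virt 0x%lx -> page %zu\n", 2 * PAGE_SIZE + 123, n->index);
		return 0;
	}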
From: Bobby Eshleman

Refactor sock_devmem_dontneed() in preparation for supporting both
autorelease and manual token release modes.

Split the function into two parts:

- sock_devmem_dontneed(): handles input validation, token allocation,
  and copying from userspace
- sock_devmem_dontneed_autorelease(): performs the actual token release
  via xarray lookup and page pool put

This separation allows a future commit to add a parallel
sock_devmem_dontneed_manual_release() function that uses a different
token tracking mechanism (per-niov reference counting) without
duplicating the input validation logic.

The refactoring is purely mechanical, with no functional change; it is
only intended to minimize noise in subsequent patches.

Reviewed-by: Mina Almasry
Signed-off-by: Bobby Eshleman
---
 net/core/sock.c | 52 ++++++++++++++++++++++++++++++++--------------------
 1 file changed, 32 insertions(+), 20 deletions(-)

diff --git a/net/core/sock.c b/net/core/sock.c
index 7a9bbc2afcf0..5562f517d889 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -1082,30 +1082,13 @@ static int sock_reserve_memory(struct sock *sk, int bytes)
 #define MAX_DONTNEED_FRAGS 1024
 
 static noinline_for_stack int
-sock_devmem_dontneed(struct sock *sk, sockptr_t optval, unsigned int optlen)
+sock_devmem_dontneed_autorelease(struct sock *sk, struct dmabuf_token *tokens,
+				 unsigned int num_tokens)
 {
-	unsigned int num_tokens, i, j, k, netmem_num = 0;
-	struct dmabuf_token *tokens;
+	unsigned int i, j, k, netmem_num = 0;
 	int ret = 0, num_frags = 0;
 	netmem_ref netmems[16];
 
-	if (!sk_is_tcp(sk))
-		return -EBADF;
-
-	if (optlen % sizeof(*tokens) ||
-	    optlen > sizeof(*tokens) * MAX_DONTNEED_TOKENS)
-		return -EINVAL;
-
-	num_tokens = optlen / sizeof(*tokens);
-	tokens = kvmalloc_array(num_tokens, sizeof(*tokens), GFP_KERNEL);
-	if (!tokens)
-		return -ENOMEM;
-
-	if (copy_from_sockptr(tokens, optval, optlen)) {
-		kvfree(tokens);
-		return -EFAULT;
-	}
-
 	xa_lock_bh(&sk->sk_user_frags);
 	for (i = 0; i < num_tokens; i++) {
 		for (j = 0; j < tokens[i].token_count; j++) {
@@ -1135,6 +1118,35 @@ sock_devmem_dontneed(struct sock *sk, sockptr_t optval, unsigned int optlen)
 	for (k = 0; k < netmem_num; k++)
 		WARN_ON_ONCE(!napi_pp_put_page(netmems[k]));
 
+	return ret;
+}
+
+static noinline_for_stack int
+sock_devmem_dontneed(struct sock *sk, sockptr_t optval, unsigned int optlen)
+{
+	struct dmabuf_token *tokens;
+	unsigned int num_tokens;
+	int ret;
+
+	if (!sk_is_tcp(sk))
+		return -EBADF;
+
+	if (optlen % sizeof(*tokens) ||
+	    optlen > sizeof(*tokens) * MAX_DONTNEED_TOKENS)
+		return -EINVAL;
+
+	num_tokens = optlen / sizeof(*tokens);
+	tokens = kvmalloc_array(num_tokens, sizeof(*tokens), GFP_KERNEL);
+	if (!tokens)
+		return -ENOMEM;
+
+	if (copy_from_sockptr(tokens, optval, optlen)) {
+		kvfree(tokens);
+		return -EFAULT;
+	}
+
+	ret = sock_devmem_dontneed_autorelease(sk, tokens, num_tokens);
+	kvfree(tokens);
 	return ret;
 }
 
-- 
2.47.3
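[The validation rules this split keeps in sock_devmem_dontneed() are
user-visible, so a hedged userspace sketch of a conforming caller may
help: optlen must be a whole number of struct dmabuf_token entries
(token_start/token_count, from <linux/uio.h>) covering at most
MAX_DONTNEED_TOKENS (128) entries. 'fd' is assumed to be a devmem TCP
socket; the fallback define mirrors the existing UAPI value.]

	#include <sys/socket.h>
	#include <linux/uio.h>

	#ifndef SO_DEVMEM_DONTNEED
	#define SO_DEVMEM_DONTNEED 80
	#endif
	#define MAX_DONTNEED_TOKENS 128

	static int return_token_batch(int fd, const struct dmabuf_token *toks,
				      unsigned int n)
	{
		while (n) {
			unsigned int chunk = n > MAX_DONTNEED_TOKENS ?
					     MAX_DONTNEED_TOKENS : n;

			/* A larger or misaligned optlen would be rejected
			 * with -EINVAL by the validation path above.
			 */
			if (setsockopt(fd, SOL_SOCKET, SO_DEVMEM_DONTNEED,
				       toks, chunk * sizeof(*toks)) < 0)
				return -1;
			toks += chunk;
			n -= chunk;
		}
		return 0;
	}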
From: Bobby Eshleman

Add an alternative token management implementation (autorelease vs
non-autorelease) that replaces xarray-based token lookups with direct
array access using page offsets as dmabuf tokens. When enabled, this
eliminates xarray overhead and reduces CPU utilization in devmem RX
threads by approximately 13%.

This patch changes the meaning of tokens when this option is used.
Tokens previously referred to unique fragments of pages. With this
option, tokens instead represent references to pages, not fragments.
Because of this, multiple tokens may refer to the same page and so
have identical value (e.g., two small fragments may coexist on the
same page). The token and offset pair that the user receives uniquely
identifies fragments if needed. This assumes that the user is not
attempting to sort / uniq the token list using tokens alone.

This introduces a restriction: devmem RX sockets cannot switch dmabuf
bindings when autorelease is disabled. This is necessary because
32-bit tokens lack sufficient bits to encode both large dmabuf page
counts and binding/queue IDs. For example, a system with 8 NICs and 32
queues needs 8 bits for binding IDs, leaving only 24 bits for pages
(64GB max). This restriction aligns with common usage, as steering
flows to different queues/devices is often undesirable for TCP.

This patch adds an atomic uref counter to net_iov for tracking user
references via binding->vec. The pp_ref_count is only updated on uref
transitions from zero to one or from one to zero, to minimize atomic
overhead. If a user fails to refill and closes before returning all
tokens, the binding will finish the uref release when unbound.

A per-socket "autorelease" flag is added. This will be used for
enabling the old behavior of the kernel releasing references for the
sockets upon close(2) (autorelease), instead of requiring that socket
users do this themselves. The autorelease flag is always true in this
patch, meaning that the old (non-optimized) behavior is kept
unconditionally. A future patch adds a user-facing knob to toggle this
feature and will change the default to false for the improved
performance.

An outstanding_urefs counter is added per-socket so that changes to
the autorelease mode can be rejected for active sockets. The dmabuf
unbind path always checks for any leaked urefs.

Signed-off-by: Bobby Eshleman
---
Changes in v6:
- remove sk_devmem_info.autorelease, using binding->autorelease instead
- move binding->autorelease check to outside of
  net_devmem_dmabuf_binding_put_urefs() (Mina)
- remove overly defensive net_is_devmem_iov() (Mina)
- add comment about multiple urefs mapping to a single netmem ref (Mina)
- remove overly defensive netmem NULL and netmem_is_net_iov checks (Mina)
- use niov without casting back and forth with netmem (Mina)
- move the autorelease flag from per-binding to per-socket (Mina)
- remove the batching logic in sock_devmem_dontneed_manual_release() (Mina)
- move autorelease check inside tcp_xa_pool_commit() (Mina)
- remove single-binding restriction for autorelease mode (Mina)
- unbind always checks for leaked urefs

Changes in v5:
- remove unused variables
- introduce autorelease flag, preparing for future patch to toggle new
  behavior

Changes in v3:
- make urefs per-binding instead of per-socket, reducing memory footprint
- fallback to cleaning up references in dmabuf unbind if socket leaked
  tokens
- drop ethtool patch

Changes in v2:
- always use GFP_ZERO for binding->vec (Mina)
- remove WARN for changed binding (Mina)
- remove extraneous binding ref get (Mina)
- remove WARNs on invalid user input (Mina)
- pre-assign niovs in binding->vec for RX case (Mina)
- use atomic_set(, 0) to initialize sk_user_frags.urefs
- fix length of alloc for urefs
---
 include/net/netmem.h     |  1 +
 include/net/sock.h       | 13 +++++++--
 net/core/devmem.c        | 42 ++++++++++++++++++++++-------
 net/core/devmem.h        |  2 +-
 net/core/sock.c          | 55 +++++++++++++++++++++++++++++++-----
 net/ipv4/tcp.c           | 69 ++++++++++++++++++++++++++++++++++++++----------
 net/ipv4/tcp_ipv4.c      | 11 +++++---
 net/ipv4/tcp_minisocks.c |  5 +++-
 8 files changed, 161 insertions(+), 37 deletions(-)
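[To make the new token encoding concrete, a hedged illustration of the
relationship stated above, assuming the usual 4K page size; token_of()
is an illustrative helper, not kernel or UAPI code:]

	#include <stdio.h>

	#define PAGE_SHIFT 12

	/* With autorelease off, a token is the page index of the fragment
	 * within the dmabuf, so fragments sharing a page share a token and
	 * only the (token, frag_offset) pair is unique.
	 */
	static unsigned int token_of(unsigned long long frag_offset)
	{
		return frag_offset >> PAGE_SHIFT;	/* page index */
	}

	int main(void)
	{
		/* Two small fragments that landed on the same page ... */
		unsigned long long frag_a = 0x1000, frag_b = 0x1800;

		/* ... carry the same token value, so sorting/uniq-ing by
		 * token alone would wrongly collapse them; each delivered
		 * fragment must still be released once.
		 */
		printf("a: token %u, b: token %u\n",
		       token_of(frag_a), token_of(frag_b));
		return 0;
	}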
diff --git a/include/net/netmem.h b/include/net/netmem.h
index 9e10f4ac50c3..80d2263ba4ed 100644
--- a/include/net/netmem.h
+++ b/include/net/netmem.h
@@ -112,6 +112,7 @@ struct net_iov {
 	};
 	struct net_iov_area *owner;
 	enum net_iov_type type;
+	atomic_t uref;
 };
 
 struct net_iov_area {
diff --git a/include/net/sock.h b/include/net/sock.h
index c7e58b8e8a90..548fabacff7c 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -350,7 +350,11 @@ struct sk_filter;
  *	@sk_scm_rights: flagged by SO_PASSRIGHTS to recv SCM_RIGHTS
  *	@sk_scm_unused: unused flags for scm_recv()
  *	@ns_tracker: tracker for netns reference
- *	@sk_user_frags: xarray of pages the user is holding a reference on.
+ *	@sk_devmem_info: the devmem binding information for the socket
+ *	@frags: xarray of tokens for autorelease mode
+ *	@binding: pointer to the dmabuf binding
+ *	@outstanding_urefs: count of outstanding user references
+ *	@autorelease: if true, tokens released on close; if false, user must release
  *	@sk_owner: reference to the real owner of the socket that calls
  *		   sock_lock_init_class_and_name().
  */
@@ -579,7 +583,12 @@ struct sock {
 	struct numa_drop_counters	*sk_drop_counters;
 	struct rcu_head			sk_rcu;
 	netns_tracker			ns_tracker;
-	struct xarray			sk_user_frags;
+	struct {
+		struct xarray			frags;
+		struct net_devmem_dmabuf_binding *binding;
+		atomic_t			outstanding_urefs;
+		bool				autorelease;
+	} sk_devmem_info;
 
 #if IS_ENABLED(CONFIG_PROVE_LOCKING) && IS_ENABLED(CONFIG_MODULES)
 	struct module			*sk_owner;
diff --git a/net/core/devmem.c b/net/core/devmem.c
index 4dee2666dd07..904d19e58f4b 100644
--- a/net/core/devmem.c
+++ b/net/core/devmem.c
@@ -11,6 +11,7 @@
 #include
 #include
 #include
+#include
 #include
 #include
 #include
@@ -116,6 +117,24 @@ void net_devmem_free_dmabuf(struct net_iov *niov)
 	gen_pool_free(binding->chunk_pool, dma_addr, PAGE_SIZE);
 }
 
+static void
+net_devmem_dmabuf_binding_put_urefs(struct net_devmem_dmabuf_binding *binding)
+{
+	int i;
+
+	for (i = 0; i < binding->dmabuf->size / PAGE_SIZE; i++) {
+		struct net_iov *niov;
+		netmem_ref netmem;
+
+		niov = binding->vec[i];
+		netmem = net_iov_to_netmem(niov);
+
+		/* Multiple urefs map to only a single netmem ref. */
+		if (atomic_xchg(&niov->uref, 0) > 0)
+			WARN_ON_ONCE(!napi_pp_put_page(netmem));
+	}
+}
+
 void net_devmem_unbind_dmabuf(struct net_devmem_dmabuf_binding *binding)
 {
 	struct netdev_rx_queue *rxq;
@@ -143,6 +162,10 @@ void net_devmem_unbind_dmabuf(struct net_devmem_dmabuf_binding *binding)
 		__net_mp_close_rxq(binding->dev, rxq_idx, &mp_params);
 	}
 
+	/* Clean up any lingering urefs from sockets that had autorelease
+	 * disabled.
+	 */
+	net_devmem_dmabuf_binding_put_urefs(binding);
 	net_devmem_dmabuf_binding_put(binding);
 }
@@ -231,14 +254,13 @@ net_devmem_bind_dmabuf(struct net_device *dev,
 		goto err_detach;
 	}
 
-	if (direction == DMA_TO_DEVICE) {
-		binding->vec = kvmalloc_array(dmabuf->size / PAGE_SIZE,
-					      sizeof(struct net_iov *),
-					      GFP_KERNEL);
-		if (!binding->vec) {
-			err = -ENOMEM;
-			goto err_unmap;
-		}
+	/* Used by TX and also by RX when socket has autorelease disabled */
+	binding->vec = kvmalloc_array(dmabuf->size / PAGE_SIZE,
+				      sizeof(struct net_iov *),
+				      GFP_KERNEL | __GFP_ZERO);
+	if (!binding->vec) {
+		err = -ENOMEM;
+		goto err_unmap;
 	}
 
 	/* For simplicity we expect to make PAGE_SIZE allocations, but the
@@ -292,10 +314,10 @@ net_devmem_bind_dmabuf(struct net_device *dev,
 			niov = &owner->area.niovs[i];
 			niov->type = NET_IOV_DMABUF;
 			niov->owner = &owner->area;
+			atomic_set(&niov->uref, 0);
 			page_pool_set_dma_addr_netmem(net_iov_to_netmem(niov),
 						      net_devmem_get_dma_addr(niov));
-			if (direction == DMA_TO_DEVICE)
-				binding->vec[owner->area.base_virtual / PAGE_SIZE + i] = niov;
+			binding->vec[owner->area.base_virtual / PAGE_SIZE + i] = niov;
 		}
 
 		virtual += len;
diff --git a/net/core/devmem.h b/net/core/devmem.h
index 2ada54fb63d7..d4eb28d079bb 100644
--- a/net/core/devmem.h
+++ b/net/core/devmem.h
@@ -61,7 +61,7 @@ struct net_devmem_dmabuf_binding {
 
 	/* Array of net_iov pointers for this binding, sorted by virtual
 	 * address. This array is convenient to map the virtual addresses to
-	 * net_iovs in the TX path.
+	 * net_iovs.
 	 */
 	struct net_iov **vec;
 
diff --git a/net/core/sock.c b/net/core/sock.c
index 5562f517d889..465645c1d74f 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -87,6 +87,7 @@
 
 #include
 #include
+#include
 #include
 #include
 #include
@@ -151,6 +152,7 @@
 #include
 
 #include "dev.h"
+#include "devmem.h"
 
 static DEFINE_MUTEX(proto_list_mutex);
 static LIST_HEAD(proto_list);
@@ -1081,6 +1083,43 @@ static int sock_reserve_memory(struct sock *sk, int bytes)
 #define MAX_DONTNEED_TOKENS 128
 #define MAX_DONTNEED_FRAGS 1024
 
+static noinline_for_stack int
+sock_devmem_dontneed_manual_release(struct sock *sk, struct dmabuf_token *tokens,
+				    unsigned int num_tokens)
+{
+	struct net_iov *niov;
+	unsigned int i, j;
+	netmem_ref netmem;
+	unsigned int token;
+	int num_frags = 0;
+	int ret = 0;
+
+	if (!sk->sk_devmem_info.binding)
+		return -EINVAL;
+
+	for (i = 0; i < num_tokens; i++) {
+		for (j = 0; j < tokens[i].token_count; j++) {
+			token = tokens[i].token_start + j;
+			if (token >= sk->sk_devmem_info.binding->dmabuf->size / PAGE_SIZE)
+				break;
+
+			if (++num_frags > MAX_DONTNEED_FRAGS)
+				goto out;
+
+			niov = sk->sk_devmem_info.binding->vec[token];
+			if (atomic_dec_and_test(&niov->uref)) {
+				netmem = net_iov_to_netmem(niov);
+				WARN_ON_ONCE(!napi_pp_put_page(netmem));
+			}
+			ret++;
+		}
+	}
+out:
+	atomic_sub(ret, &sk->sk_devmem_info.outstanding_urefs);
+
+	return ret;
+}
+
 static noinline_for_stack int
 sock_devmem_dontneed_autorelease(struct sock *sk, struct dmabuf_token *tokens,
 				 unsigned int num_tokens)
@@ -1089,32 +1128,32 @@ sock_devmem_dontneed_autorelease(struct sock *sk, struct dmabuf_token *tokens,
 	int ret = 0, num_frags = 0;
 	netmem_ref netmems[16];
 
-	xa_lock_bh(&sk->sk_user_frags);
+	xa_lock_bh(&sk->sk_devmem_info.frags);
 	for (i = 0; i < num_tokens; i++) {
 		for (j = 0; j < tokens[i].token_count; j++) {
 			if (++num_frags > MAX_DONTNEED_FRAGS)
 				goto frag_limit_reached;
 
 			netmem_ref netmem = (__force netmem_ref)__xa_erase(
-				&sk->sk_user_frags, tokens[i].token_start + j);
+				&sk->sk_devmem_info.frags, tokens[i].token_start + j);
 
 			if (!netmem || WARN_ON_ONCE(!netmem_is_net_iov(netmem)))
 				continue;
 
 			netmems[netmem_num++] = netmem;
 			if (netmem_num == ARRAY_SIZE(netmems)) {
-				xa_unlock_bh(&sk->sk_user_frags);
+				xa_unlock_bh(&sk->sk_devmem_info.frags);
 				for (k = 0; k < netmem_num; k++)
 					WARN_ON_ONCE(!napi_pp_put_page(netmems[k]));
 				netmem_num = 0;
-				xa_lock_bh(&sk->sk_user_frags);
+				xa_lock_bh(&sk->sk_devmem_info.frags);
 			}
 			ret++;
 		}
 	}
 
 frag_limit_reached:
-	xa_unlock_bh(&sk->sk_user_frags);
+	xa_unlock_bh(&sk->sk_devmem_info.frags);
 	for (k = 0; k < netmem_num; k++)
 		WARN_ON_ONCE(!napi_pp_put_page(netmems[k]));
 
@@ -1145,7 +1184,11 @@ sock_devmem_dontneed(struct sock *sk, sockptr_t optval, unsigned int optlen)
 		return -EFAULT;
 	}
 
-	ret = sock_devmem_dontneed_autorelease(sk, tokens, num_tokens);
+	if (sk->sk_devmem_info.autorelease)
+		ret = sock_devmem_dontneed_autorelease(sk, tokens, num_tokens);
+	else
+		ret = sock_devmem_dontneed_manual_release(sk, tokens,
+							  num_tokens);
 	kvfree(tokens);
 	return ret;
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index a9345aa5a2e5..052875c1b547 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -260,6 +260,7 @@
 #include
 #include
 #include
+#include
 #include
 #include
 #include
@@ -492,7 +493,10 @@ void tcp_init_sock(struct sock *sk)
 	set_bit(SOCK_SUPPORT_ZC, &sk->sk_socket->flags);
 	sk_sockets_allocated_inc(sk);
 
-	xa_init_flags(&sk->sk_user_frags, XA_FLAGS_ALLOC1);
+	xa_init_flags(&sk->sk_devmem_info.frags, XA_FLAGS_ALLOC1);
+	sk->sk_devmem_info.binding = NULL;
+	atomic_set(&sk->sk_devmem_info.outstanding_urefs, 0);
+	sk->sk_devmem_info.autorelease = true;
 }
 EXPORT_IPV6_MOD(tcp_init_sock);
 
@@ -2424,11 +2428,11 @@ static void tcp_xa_pool_commit_locked(struct sock *sk, struct tcp_xa_pool *p)
 
 	/* Commit part that has been copied to user space. */
 	for (i = 0; i < p->idx; i++)
-		__xa_cmpxchg(&sk->sk_user_frags, p->tokens[i], XA_ZERO_ENTRY,
+		__xa_cmpxchg(&sk->sk_devmem_info.frags, p->tokens[i], XA_ZERO_ENTRY,
 			     (__force void *)p->netmems[i], GFP_KERNEL);
 	/* Rollback what has been pre-allocated and is no longer needed. */
 	for (; i < p->max; i++)
-		__xa_erase(&sk->sk_user_frags, p->tokens[i]);
+		__xa_erase(&sk->sk_devmem_info.frags, p->tokens[i]);
 
 	p->max = 0;
 	p->idx = 0;
@@ -2436,14 +2440,17 @@ static void tcp_xa_pool_commit_locked(struct sock *sk, struct tcp_xa_pool *p)
 
 static void tcp_xa_pool_commit(struct sock *sk, struct tcp_xa_pool *p)
 {
+	if (!sk->sk_devmem_info.autorelease)
+		return;
+
 	if (!p->max)
 		return;
 
-	xa_lock_bh(&sk->sk_user_frags);
+	xa_lock_bh(&sk->sk_devmem_info.frags);
 
 	tcp_xa_pool_commit_locked(sk, p);
 
-	xa_unlock_bh(&sk->sk_user_frags);
+	xa_unlock_bh(&sk->sk_devmem_info.frags);
 }
 
 static int tcp_xa_pool_refill(struct sock *sk, struct tcp_xa_pool *p,
@@ -2454,18 +2461,18 @@ static int tcp_xa_pool_refill(struct sock *sk, struct tcp_xa_pool *p,
 	if (p->idx < p->max)
 		return 0;
 
-	xa_lock_bh(&sk->sk_user_frags);
+	xa_lock_bh(&sk->sk_devmem_info.frags);
 
 	tcp_xa_pool_commit_locked(sk, p);
 
 	for (k = 0; k < max_frags; k++) {
-		err = __xa_alloc(&sk->sk_user_frags, &p->tokens[k],
+		err = __xa_alloc(&sk->sk_devmem_info.frags, &p->tokens[k],
 				 XA_ZERO_ENTRY, xa_limit_31b, GFP_KERNEL);
 		if (err)
 			break;
 	}
 
-	xa_unlock_bh(&sk->sk_user_frags);
+	xa_unlock_bh(&sk->sk_devmem_info.frags);
 
 	p->max = k;
 	p->idx = 0;
@@ -2479,12 +2486,14 @@ static int tcp_recvmsg_dmabuf(struct sock *sk, const struct sk_buff *skb,
 			      unsigned int offset, struct msghdr *msg,
 			      int remaining_len)
 {
+	struct net_devmem_dmabuf_binding *binding = NULL;
 	struct dmabuf_cmsg dmabuf_cmsg = { 0 };
 	struct tcp_xa_pool tcp_xa_pool;
 	unsigned int start;
 	int i, copy, n;
 	int sent = 0;
 	int err = 0;
+	int refs = 0;
 
 	tcp_xa_pool.max = 0;
 	tcp_xa_pool.idx = 0;
@@ -2536,6 +2545,7 @@ static int tcp_recvmsg_dmabuf(struct sock *sk, const struct sk_buff *skb,
 			skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
 			struct net_iov *niov;
 			u64 frag_offset;
+			u32 token;
 			int end;
 
 			/* !skb_frags_readable() should indicate that ALL the
@@ -2568,13 +2578,32 @@ static int tcp_recvmsg_dmabuf(struct sock *sk, const struct sk_buff *skb,
 					      start;
 			dmabuf_cmsg.frag_offset = frag_offset;
 			dmabuf_cmsg.frag_size = copy;
-			err = tcp_xa_pool_refill(sk, &tcp_xa_pool,
-						 skb_shinfo(skb)->nr_frags - i);
-			if (err)
+
+			binding = net_devmem_iov_binding(niov);
+
+			if (!sk->sk_devmem_info.binding)
+				sk->sk_devmem_info.binding = binding;
+
+			if (sk->sk_devmem_info.binding != binding) {
+				err = -EFAULT;
 				goto out;
+			}
+
+			if (sk->sk_devmem_info.autorelease) {
+				err = tcp_xa_pool_refill(sk, &tcp_xa_pool,
+							 skb_shinfo(skb)->nr_frags - i);
+				if (err)
+					goto out;
+
+				dmabuf_cmsg.frag_token =
+					tcp_xa_pool.tokens[tcp_xa_pool.idx];
+			} else {
+				token = net_iov_virtual_addr(niov) >> PAGE_SHIFT;
+				dmabuf_cmsg.frag_token = token;
+			}
 
 			/* Will perform the exchange later */
-			dmabuf_cmsg.frag_token = tcp_xa_pool.tokens[tcp_xa_pool.idx];
 			dmabuf_cmsg.dmabuf_id = net_devmem_iov_binding_id(niov);
 
 			offset += copy;
@@ -2587,8 +2616,15 @@ static int tcp_recvmsg_dmabuf(struct sock *sk, const struct sk_buff *skb,
 			if (err)
 				goto out;
 
-			atomic_long_inc(&niov->pp_ref_count);
-			tcp_xa_pool.netmems[tcp_xa_pool.idx++] = skb_frag_netmem(frag);
+			if (sk->sk_devmem_info.autorelease) {
+				atomic_long_inc(&niov->pp_ref_count);
+				tcp_xa_pool.netmems[tcp_xa_pool.idx++] =
+					skb_frag_netmem(frag);
+			} else {
+				if (atomic_inc_return(&niov->uref) == 1)
+					atomic_long_inc(&niov->pp_ref_count);
+				refs++;
+			}
 
 			sent += copy;
 
@@ -2599,6 +2635,7 @@ static int tcp_recvmsg_dmabuf(struct sock *sk, const struct sk_buff *skb,
 			}
 
 			tcp_xa_pool_commit(sk, &tcp_xa_pool);
+
 			if (!remaining_len)
 				goto out;
 
@@ -2617,9 +2654,13 @@ static int tcp_recvmsg_dmabuf(struct sock *sk, const struct sk_buff *skb,
 out:
 	tcp_xa_pool_commit(sk, &tcp_xa_pool);
+
 	if (!sent)
 		sent = err;
 
+	if (refs > 0)
+		atomic_add(refs, &sk->sk_devmem_info.outstanding_urefs);
+
 	return sent;
 }
 
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 40a76da5364a..dbb7a71e3cce 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -89,6 +89,9 @@
 
 #include
 
+#include
+#include "../core/devmem.h"
+
 #include
 
 #ifdef CONFIG_TCP_MD5SIG
@@ -2493,7 +2496,7 @@ static void tcp_release_user_frags(struct sock *sk)
 	unsigned long index;
 	void *netmem;
 
-	xa_for_each(&sk->sk_user_frags, index, netmem)
+	xa_for_each(&sk->sk_devmem_info.frags, index, netmem)
 		WARN_ON_ONCE(!napi_pp_put_page((__force netmem_ref)netmem));
 #endif
 }
@@ -2502,9 +2505,11 @@ void tcp_v4_destroy_sock(struct sock *sk)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
 
-	tcp_release_user_frags(sk);
+	if (sk->sk_devmem_info.binding && sk->sk_devmem_info.autorelease)
+		tcp_release_user_frags(sk);
 
-	xa_destroy(&sk->sk_user_frags);
+	xa_destroy(&sk->sk_devmem_info.frags);
+	sk->sk_devmem_info.binding = NULL;
 
 	trace_tcp_destroy_sock(sk);
 
diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c
index ded2cf1f6006..a017dea35bb8 100644
--- a/net/ipv4/tcp_minisocks.c
+++ b/net/ipv4/tcp_minisocks.c
@@ -663,7 +663,10 @@ struct sock *tcp_create_openreq_child(const struct sock *sk,
 
 	__TCP_INC_STATS(sock_net(sk), TCP_MIB_PASSIVEOPENS);
 
-	xa_init_flags(&newsk->sk_user_frags, XA_FLAGS_ALLOC1);
+	xa_init_flags(&newsk->sk_devmem_info.frags, XA_FLAGS_ALLOC1);
+	newsk->sk_devmem_info.binding = NULL;
+	atomic_set(&newsk->sk_devmem_info.outstanding_urefs, 0);
+	newsk->sk_devmem_info.autorelease = true;
 
 	return newsk;
 }
-- 
2.47.3
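[A hedged sketch of the RX loop a manual-release (autorelease off)
consumer might use with the above: parse SCM_DEVMEM_DMABUF cmsgs, take
each frag_token once per delivered fragment (duplicate token values
included, per the commit message), and hand it back via
SO_DEVMEM_DONTNEED. struct dmabuf_cmsg and struct dmabuf_token come
from <linux/uio.h>; fd is assumed to be a devmem TCP socket and error
handling is trimmed.]

	#include <sys/socket.h>
	#include <linux/uio.h>
	#include <string.h>

	#ifndef SO_DEVMEM_DONTNEED
	#define SO_DEVMEM_DONTNEED 80
	#endif
	#ifndef SCM_DEVMEM_DMABUF
	#define SCM_DEVMEM_DMABUF 79
	#endif
	#ifndef MSG_SOCK_DEVMEM
	#define MSG_SOCK_DEVMEM 0x2000000
	#endif

	static int recv_and_release_once(int fd)
	{
		char ctrl[2048];
		char linear[4096];
		struct iovec iov = { .iov_base = linear,
				     .iov_len = sizeof(linear) };
		struct msghdr msg = { .msg_iov = &iov, .msg_iovlen = 1,
				      .msg_control = ctrl,
				      .msg_controllen = sizeof(ctrl) };
		struct dmabuf_token tok;
		struct cmsghdr *cm;
		ssize_t n;

		n = recvmsg(fd, &msg, MSG_SOCK_DEVMEM);
		if (n <= 0)
			return (int)n;

		for (cm = CMSG_FIRSTHDR(&msg); cm; cm = CMSG_NXTHDR(&msg, cm)) {
			struct dmabuf_cmsg dc;

			if (cm->cmsg_level != SOL_SOCKET ||
			    cm->cmsg_type != SCM_DEVMEM_DMABUF)
				continue;
			memcpy(&dc, CMSG_DATA(cm), sizeof(dc));

			/* ... consume dc.frag_offset / dc.frag_size here ... */

			/* One token per delivered fragment, even when the
			 * page (and thus the token value) repeats.
			 */
			tok.token_start = dc.frag_token;
			tok.token_count = 1;
			if (setsockopt(fd, SOL_SOCKET, SO_DEVMEM_DONTNEED,
				       &tok, sizeof(tok)) < 0)
				return -1;
		}
		return 0;
	}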
From: Bobby Eshleman

Add the SO_DEVMEM_AUTORELEASE socket option to allow applications to
control token release behavior on a per-socket basis.

The socket option accepts boolean values (0 or 1):

- 1 (true): outstanding tokens are automatically released when the
  socket closes
- 0 (false): outstanding tokens are released when the dmabuf is unbound

The option can only be changed when the socket has no outstanding
tokens, enforced by checking:

1. The frags xarray is empty (no tokens in autorelease mode)
2. The outstanding_urefs counter is zero (no tokens in manual mode)

This restriction prevents inconsistent token tracking state between
acquisition and release calls. If either condition fails, setsockopt
returns -EBUSY.

The default state is autorelease off.

Signed-off-by: Bobby Eshleman
---
 include/uapi/asm-generic/socket.h       |  2 ++
 net/core/sock.c                         | 51 +++++++++++++++++++++++++++++++++
 net/ipv4/tcp.c                          |  2 +-
 tools/include/uapi/asm-generic/socket.h |  2 ++
 4 files changed, 56 insertions(+), 1 deletion(-)

diff --git a/include/uapi/asm-generic/socket.h b/include/uapi/asm-generic/socket.h
index 53b5a8c002b1..59302318bb34 100644
--- a/include/uapi/asm-generic/socket.h
+++ b/include/uapi/asm-generic/socket.h
@@ -150,6 +150,8 @@
 #define SO_INQ			84
 #define SCM_INQ			SO_INQ
 
+#define SO_DEVMEM_AUTORELEASE	85
+
 #if !defined(__KERNEL__)
 
 #if __BITS_PER_LONG == 64 || (defined(__x86_64__) && defined(__ILP32__))
diff --git a/net/core/sock.c b/net/core/sock.c
index 465645c1d74f..27af476f3cd3 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -1160,6 +1160,46 @@ sock_devmem_dontneed_autorelease(struct sock *sk, struct dmabuf_token *tokens,
 	return ret;
 }
 
+static noinline_for_stack int
+sock_devmem_set_autorelease(struct sock *sk, sockptr_t optval, unsigned int optlen)
+{
+	int val;
+
+	if (!sk_is_tcp(sk))
+		return -EBADF;
+
+	if (optlen < sizeof(int))
+		return -EINVAL;
+
+	if (copy_from_sockptr(&val, optval, sizeof(val)))
+		return -EFAULT;
+
+	/* Validate that val is 0 or 1 */
+	if (val != 0 && val != 1)
+		return -EINVAL;
+
+	sockopt_lock_sock(sk);
+
+	/* Can only change autorelease if:
+	 * 1. No tokens in the frags xarray (autorelease mode)
+	 * 2. No outstanding urefs (manual release mode)
+	 */
+	if (!xa_empty(&sk->sk_devmem_info.frags)) {
+		sockopt_release_sock(sk);
+		return -EBUSY;
+	}
+
+	if (atomic_read(&sk->sk_devmem_info.outstanding_urefs) > 0) {
+		sockopt_release_sock(sk);
+		return -EBUSY;
+	}
+
+	sk->sk_devmem_info.autorelease = !!val;
+
+	sockopt_release_sock(sk);
+	return 0;
+}
+
 static noinline_for_stack int
 sock_devmem_dontneed(struct sock *sk, sockptr_t optval, unsigned int optlen)
 {
@@ -1351,6 +1391,9 @@ int sk_setsockopt(struct sock *sk, int level, int optname,
 #ifdef CONFIG_PAGE_POOL
 	case SO_DEVMEM_DONTNEED:
 		return sock_devmem_dontneed(sk, optval, optlen);
+
+	case SO_DEVMEM_AUTORELEASE:
+		return sock_devmem_set_autorelease(sk, optval, optlen);
 #endif
 	case SO_SNDTIMEO_OLD:
 	case SO_SNDTIMEO_NEW:
@@ -2208,6 +2251,14 @@ int sk_getsockopt(struct sock *sk, int level, int optname,
 		v.val = READ_ONCE(sk->sk_txrehash);
 		break;
 
+#ifdef CONFIG_PAGE_POOL
+	case SO_DEVMEM_AUTORELEASE:
+		if (!sk_is_tcp(sk))
+			return -EBADF;
+		v.val = sk->sk_devmem_info.autorelease;
+		break;
+#endif
+
 	default:
 		/* We implement the SO_SNDLOWAT etc to not be settable
 		 * (1003.1g 7).
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 052875c1b547..8226ba892b36 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -496,7 +496,7 @@ void tcp_init_sock(struct sock *sk)
 	xa_init_flags(&sk->sk_devmem_info.frags, XA_FLAGS_ALLOC1);
 	sk->sk_devmem_info.binding = NULL;
 	atomic_set(&sk->sk_devmem_info.outstanding_urefs, 0);
-	sk->sk_devmem_info.autorelease = true;
+	sk->sk_devmem_info.autorelease = false;
 }
 EXPORT_IPV6_MOD(tcp_init_sock);
 
diff --git a/tools/include/uapi/asm-generic/socket.h b/tools/include/uapi/asm-generic/socket.h
index f333a0ac4ee4..9710a3d7cc4d 100644
--- a/tools/include/uapi/asm-generic/socket.h
+++ b/tools/include/uapi/asm-generic/socket.h
@@ -147,6 +147,8 @@
 
 #define SO_PASSRIGHTS		83
 
+#define SO_DEVMEM_AUTORELEASE	85
+
 #if !defined(__KERNEL__)
 
 #if __BITS_PER_LONG == 64 || (defined(__x86_64__) && defined(__ILP32__))
-- 
2.47.3
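[A hedged usage sketch for the option above. Because the kernel refuses
the change with -EBUSY once tokens are outstanding, the mode is best
pinned explicitly right after the socket is created or accepted, before
any devmem payload arrives; the fallback define mirrors the UAPI value
added in this patch.]

	#include <sys/socket.h>
	#include <errno.h>
	#include <stdio.h>

	#ifndef SO_DEVMEM_AUTORELEASE
	#define SO_DEVMEM_AUTORELEASE 85
	#endif

	/* Explicitly select manual release (0); setting it even though it
	 * is the default after this series keeps the program's assumptions
	 * visible and fails loudly on old kernels.
	 */
	static int pin_manual_release(int fd)
	{
		int off = 0;

		if (setsockopt(fd, SOL_SOCKET, SO_DEVMEM_AUTORELEASE,
			       &off, sizeof(off)) == 0)
			return 0;

		if (errno == EBUSY)
			fprintf(stderr, "tokens outstanding; set the option earlier\n");
		else if (errno == ENOPROTOOPT)
			fprintf(stderr, "kernel without SO_DEVMEM_AUTORELEASE\n");
		return -1;
	}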
From: Bobby Eshleman

Update devmem.rst documentation to describe the new
SO_DEVMEM_AUTORELEASE socket option and its usage.

Document the following:

- The two token release modes (automatic vs manual)
- How to use SO_DEVMEM_AUTORELEASE to control the behavior
- Performance benefits of disabling autorelease (~10% CPU reduction)
- Restrictions and caveats of manual token release
- Usage examples for both getsockopt and setsockopt

Signed-off-by: Bobby Eshleman
---
 Documentation/networking/devmem.rst | 70 +++++++++++++++++++++++++++++++++++--
 1 file changed, 68 insertions(+), 2 deletions(-)

diff --git a/Documentation/networking/devmem.rst b/Documentation/networking/devmem.rst
index a6cd7236bfbd..1bfce686dce6 100644
--- a/Documentation/networking/devmem.rst
+++ b/Documentation/networking/devmem.rst
@@ -215,8 +215,8 @@ Freeing frags
 -------------
 
 Frags received via SCM_DEVMEM_DMABUF are pinned by the kernel while the user
-processes the frag. The user must return the frag to the kernel via
-SO_DEVMEM_DONTNEED::
+processes the frag. Users should return tokens to the kernel via
+SO_DEVMEM_DONTNEED when they are done processing the data::
 
 	ret = setsockopt(client_fd, SOL_SOCKET, SO_DEVMEM_DONTNEED, &token,
 			 sizeof(token));
@@ -235,6 +235,72 @@ can be less than the tokens provided by the user in case of:
 (a) an internal kernel leak bug.
 (b) the user passed more than 1024 frags.
 
+
+Autorelease Control
+~~~~~~~~~~~~~~~~~~~
+
+The SO_DEVMEM_AUTORELEASE socket option controls what happens to outstanding
+tokens (tokens not released via SO_DEVMEM_DONTNEED) when the socket closes::
+
+	int autorelease = 0;	/* 0 = manual release, 1 = automatic release */
+	ret = setsockopt(client_fd, SOL_SOCKET, SO_DEVMEM_AUTORELEASE,
+			 &autorelease, sizeof(autorelease));
+
+	/* Query current setting */
+	int current_val;
+	socklen_t len = sizeof(current_val);
+	ret = getsockopt(client_fd, SOL_SOCKET, SO_DEVMEM_AUTORELEASE,
+			 &current_val, &len);
+
+When autorelease is disabled (the default):
+
+- Outstanding tokens are NOT released when the socket closes
+- Outstanding tokens are only released when the dmabuf is unbound
+- Provides better performance by eliminating xarray overhead (~10% CPU
+  reduction)
+- Kernel tracks tokens via atomic reference counters in net_iov structures
+
+When autorelease is enabled:
+
+- Outstanding tokens are automatically released when the socket closes
+- Backwards compatible behavior
+- Kernel tracks tokens in an xarray per socket
+
+Important: In both modes, applications should call SO_DEVMEM_DONTNEED to
+return tokens as soon as they are done processing. The autorelease setting
+only affects what happens to tokens that are still outstanding when close()
+is called.
+
+The autorelease setting can only be changed when the socket has no
+outstanding tokens. If tokens are present, setsockopt returns -EBUSY.
+
+
+Performance Considerations
+~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Disabling autorelease provides approximately 10% CPU utilization improvement
+in RX workloads by:
+
+- Eliminating xarray allocations and lookups for token tracking
+- Using atomic reference counters instead
+- Reducing lock contention on the xarray spinlock
+
+However, applications must ensure all tokens are released via
+SO_DEVMEM_DONTNEED before closing the socket, otherwise the backing pages
+will remain pinned until the dmabuf is unbound.
+
+
+Caveats
+~~~~~~~
+
+- With autorelease disabled, sockets cannot switch between different dmabuf
+  bindings. This restriction exists because tokens in this mode do not encode
+  the binding information necessary to perform the token release.
+
+- Applications using manual release mode (autorelease=0) must ensure all
+  tokens are returned via SO_DEVMEM_DONTNEED before socket close to avoid
+  resource leaks during the lifetime of the dmabuf binding. Tokens not
+  released before close() will only be freed when the dmabuf is unbound.
+
+
 TX Interface
 ============
 
-- 
2.47.3
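[A hedged C counterpart of the feature probe the Python selftests in the
next patch perform: query SO_DEVMEM_AUTORELEASE on a throwaway TCP
socket, treating ENOPROTOOPT as "kernel predates the option". The
autorelease_supported() helper is illustrative, not part of any API.]

	#include <sys/socket.h>
	#include <unistd.h>
	#include <errno.h>

	#ifndef SO_DEVMEM_AUTORELEASE
	#define SO_DEVMEM_AUTORELEASE 85
	#endif

	/* Returns 1 if supported, 0 if not, -1 on unrelated error. */
	static int autorelease_supported(void)
	{
		int val = 0;
		socklen_t len = sizeof(val);
		int fd = socket(AF_INET, SOCK_STREAM, 0);
		int ret;

		if (fd < 0)
			return -1;
		ret = getsockopt(fd, SOL_SOCKET, SO_DEVMEM_AUTORELEASE,
				 &val, &len);
		close(fd);
		if (ret == 0)
			return 1;
		return errno == ENOPROTOOPT ? 0 : -1;
	}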
From: Bobby Eshleman

Add an -A flag to ncdevmem to set the autorelease mode, and add tests
for the SO_DEVMEM_AUTORELEASE socket option.

New tests include:

- check_sockopt_autorelease_default: Verifies default value is 0
- check_sockopt_autorelease_set_0: Tests setting to 0 and reading back
- check_sockopt_autorelease_set_1: Tests toggling from 0 to 1
- check_sockopt_autorelease_invalid: Tests invalid value (2) returns EINVAL
- check_autorelease_disabled: Tests ncdevmem in manual token release mode
- check_autorelease_enabled: Tests ncdevmem in autorelease mode

All check_sockopt tests gracefully skip with KsftSkipEx if
SO_DEVMEM_AUTORELEASE is not supported by the kernel.

Signed-off-by: Bobby Eshleman
---
 tools/testing/selftests/drivers/net/hw/devmem.py  | 115 +++++++++++++++++++++-
 tools/testing/selftests/drivers/net/hw/ncdevmem.c |  20 +++-
 2 files changed, 133 insertions(+), 2 deletions(-)

diff --git a/tools/testing/selftests/drivers/net/hw/devmem.py b/tools/testing/selftests/drivers/net/hw/devmem.py
index 45c2d49d55b6..29ec179d651f 100755
--- a/tools/testing/selftests/drivers/net/hw/devmem.py
+++ b/tools/testing/selftests/drivers/net/hw/devmem.py
@@ -1,6 +1,9 @@
 #!/usr/bin/env python3
 # SPDX-License-Identifier: GPL-2.0
 
+import socket
+import errno
+
 from os import path
 from lib.py import ksft_run, ksft_exit
 from lib.py import ksft_eq, KsftSkipEx
@@ -63,12 +66,122 @@ def check_tx_chunks(cfg) -> None:
     ksft_eq(socat.stdout.strip(), "hello\nworld")
 
 
+@ksft_disruptive
+def check_autorelease_disabled(cfg) -> None:
+    """Test RX with autorelease disabled (requires manual token release in ncdevmem)"""
+    require_devmem(cfg)
+
+    port = rand_port()
+    socat = f"socat -u - TCP{cfg.addr_ipver}:{cfg.baddr}:{port},bind={cfg.remote_baddr}:{port}"
+    listen_cmd = f"{cfg.bin_local} -l -f {cfg.ifname} -s {cfg.addr} -p {port} -c {cfg.remote_addr} -v 7 -A 0"
+
+    with bkg(listen_cmd, exit_wait=True) as ncdevmem:
+        wait_port_listen(port)
+        cmd(f"yes $(echo -e \x01\x02\x03\x04\x05\x06) | \
+            head -c 1K | {socat}", host=cfg.remote, shell=True)
+
+    ksft_eq(ncdevmem.ret, 0)
+
+
+@ksft_disruptive
+def check_autorelease_enabled(cfg) -> None:
+    """Test RX with autorelease enabled (requires token autorelease in ncdevmem)"""
+    require_devmem(cfg)
+
+    port = rand_port()
+    socat = f"socat -u - TCP{cfg.addr_ipver}:{cfg.baddr}:{port},bind={cfg.remote_baddr}:{port}"
+    listen_cmd = f"{cfg.bin_local} -l -f {cfg.ifname} -s {cfg.addr} -p {port} -c {cfg.remote_addr} -v 7 -A 1"
+
+    with bkg(listen_cmd, exit_wait=True) as ncdevmem:
+        wait_port_listen(port)
+        cmd(f"yes $(echo -e \x01\x02\x03\x04\x05\x06) | \
+            head -c 1K | {socat}", host=cfg.remote, shell=True)
+
+    ksft_eq(ncdevmem.ret, 0)
+
+
+def check_sockopt_autorelease_default(cfg) -> None:
+    """Test that SO_DEVMEM_AUTORELEASE default is 0"""
+    SO_DEVMEM_AUTORELEASE = 85
+
+    sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
+    try:
+        val = sock.getsockopt(socket.SOL_SOCKET, SO_DEVMEM_AUTORELEASE)
+        ksft_eq(val, 0, "Default autorelease should be 0")
+    except OSError as e:
+        if e.errno == errno.ENOPROTOOPT:
+            raise KsftSkipEx("SO_DEVMEM_AUTORELEASE not supported")
+        raise
+    finally:
+        sock.close()
+
+
+def check_sockopt_autorelease_set_0(cfg) -> None:
+    """Test setting SO_DEVMEM_AUTORELEASE to 0"""
+    SO_DEVMEM_AUTORELEASE = 85
+
+    sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
+    try:
+        sock.setsockopt(socket.SOL_SOCKET, SO_DEVMEM_AUTORELEASE, 0)
+        val = sock.getsockopt(socket.SOL_SOCKET, SO_DEVMEM_AUTORELEASE)
+        ksft_eq(val, 0, "Autorelease should be 0 after setting")
+    except OSError as e:
+        if e.errno == errno.ENOPROTOOPT:
+            raise KsftSkipEx("SO_DEVMEM_AUTORELEASE not supported")
+        raise
+    finally:
+        sock.close()
+
+
+def check_sockopt_autorelease_set_1(cfg) -> None:
+    """Test setting SO_DEVMEM_AUTORELEASE to 1"""
+    SO_DEVMEM_AUTORELEASE = 85
+
+    sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
+    try:
+        # First set to 0
+        sock.setsockopt(socket.SOL_SOCKET, SO_DEVMEM_AUTORELEASE, 0)
+        # Then set back to 1
+        sock.setsockopt(socket.SOL_SOCKET, SO_DEVMEM_AUTORELEASE, 1)
+        val = sock.getsockopt(socket.SOL_SOCKET, SO_DEVMEM_AUTORELEASE)
+        ksft_eq(val, 1, "Autorelease should be 1 after setting")
+    except OSError as e:
+        if e.errno == errno.ENOPROTOOPT:
+            raise KsftSkipEx("SO_DEVMEM_AUTORELEASE not supported")
+        raise
+    finally:
+        sock.close()
+
+
+def check_sockopt_autorelease_invalid(cfg) -> None:
+    """Test that SO_DEVMEM_AUTORELEASE rejects invalid values"""
+    SO_DEVMEM_AUTORELEASE = 85
+
+    sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
+    try:
+        try:
+            sock.setsockopt(socket.SOL_SOCKET, SO_DEVMEM_AUTORELEASE, 2)
+            raise Exception("setsockopt should have failed with EINVAL")
+        except OSError as e:
+            if e.errno == errno.ENOPROTOOPT:
+                raise KsftSkipEx("SO_DEVMEM_AUTORELEASE not supported")
+            ksft_eq(e.errno, errno.EINVAL,
+                    "Should fail with EINVAL for invalid value")
+    finally:
+        sock.close()
+
+
 def main() -> None:
     with NetDrvEpEnv(__file__) as cfg:
         cfg.bin_local = path.abspath(path.dirname(__file__) + "/ncdevmem")
         cfg.bin_remote = cfg.remote.deploy(cfg.bin_local)
 
-        ksft_run([check_rx, check_tx, check_tx_chunks],
+        ksft_run([check_rx, check_tx, check_tx_chunks,
+                  check_autorelease_enabled,
+                  check_autorelease_disabled,
+                  check_sockopt_autorelease_default,
+                  check_sockopt_autorelease_set_0,
+                  check_sockopt_autorelease_set_1,
+                  check_sockopt_autorelease_invalid],
                  args=(cfg, ))
     ksft_exit()
 
diff --git a/tools/testing/selftests/drivers/net/hw/ncdevmem.c b/tools/testing/selftests/drivers/net/hw/ncdevmem.c
index 3288ed04ce08..34d608d07bec 100644
--- a/tools/testing/selftests/drivers/net/hw/ncdevmem.c
+++ b/tools/testing/selftests/drivers/net/hw/ncdevmem.c
@@ -83,6 +83,10 @@
 #define MSG_SOCK_DEVMEM 0x2000000
 #endif
 
+#ifndef SO_DEVMEM_AUTORELEASE
+#define SO_DEVMEM_AUTORELEASE 85
+#endif
+
 #define MAX_IOV 1024
 
 static size_t max_chunk;
@@ -97,6 +101,7 @@ static unsigned int ifindex;
 static unsigned int dmabuf_id;
 static uint32_t tx_dmabuf_id;
 static int waittime_ms = 500;
+static int autorelease = -1;
 
 /* System state loaded by current_config_load() */
 #define MAX_FLOWS 8
@@ -890,6 +895,16 @@ static int do_server(struct memory_buffer *mem)
 	if (enable_reuseaddr(socket_fd))
 		goto err_close_socket;
 
+	if (autorelease >= 0) {
+		ret = setsockopt(socket_fd, SOL_SOCKET, SO_DEVMEM_AUTORELEASE,
+				 &autorelease, sizeof(autorelease));
+		if (ret) {
+			pr_err("SO_DEVMEM_AUTORELEASE failed");
+			goto err_close_socket;
+		}
+		fprintf(stderr, "Set SO_DEVMEM_AUTORELEASE to %d\n", autorelease);
+	}
+
 	fprintf(stderr, "binding to address %s:%d\n", server_ip,
 		ntohs(server_sin.sin6_port));
 
@@ -1397,7 +1412,7 @@ int main(int argc, char *argv[])
 	int is_server = 0, opt;
 	int ret, err = 1;
 
-	while ((opt = getopt(argc, argv, "ls:c:p:v:q:t:f:z:")) != -1) {
+	while ((opt = getopt(argc, argv, "ls:c:p:v:q:t:f:z:A:")) != -1) {
 		switch (opt) {
 		case 'l':
 			is_server = 1;
 			break;
@@ -1426,6 +1441,9 @@ int main(int argc, char *argv[])
 		case 'z':
 			max_chunk = atoi(optarg);
 			break;
+		case 'A':
+			autorelease = atoi(optarg);
+			break;
 		case '?':
 			fprintf(stderr, "unknown option: %c\n", optopt);
 			break;
-- 
2.47.3