__netlink_dump_start() installs cb->skb, takes the module reference, and sets cb_running before calling netlink_dump(sk, true). If that first call returns via errout_skb the callback state is left behind: cb_running stays set, module_put() and consume_skb(cb->skb) are deferred until recvmsg() drives the dump back through the success path, or netlink_release() on close runs the catch-all cleanup. On sustained alloc failure neither fires. Factor the teardown into netlink_dump_cleanup(nlk, drop) shared by the dump success path, the lock_taken=true errout_skb path, and netlink_release(). The @drop flag preserves the existing split: consume_skb() on normal completion, kfree_skb() on abort. Validation on a UML guest: an unprivileged task opens NETLINK_ROUTE, preloads sk_rmem_alloc, then issues RTM_GETLINK | NLM_F_DUMP. Stock kernel leaves cb_running stuck at 1 until recvmsg() or close() drives it. Patched kernel clears cb_running immediately on the lock_taken=true failure; the recvmsg continuation path is unchanged. At scale: 3500 wedged sockets in a 256M guest show about 3.8-3.9 MiB of extra unreclaimable slab (~1.1 KiB/sock) on stock vs zero on patched. RLIMIT_NOFILE bounds the test before OOM, so this is a local availability cleanup rather than an exhaustion primitive. Fixes: 16b304f3404f ("netlink: Eliminate kmalloc in netlink dump operation.") Assisted-by: Claude:claude-opus-4-7 Signed-off-by: Michael Bommarito --- v2 (per Jakub's review <20260420103715.347fbd4a@kernel.org>): * commit message names both paths that do clear the state (recvmsg-driven retry on drain, netlink_release() on close) and notes that neither fires on sustained alloc failure * moved the UML validation into the commit message * extracted netlink_dump_cleanup(nlk, bool drop); shared with netlink_release() and the success path. The bool preserves the existing kfree_skb / consume_skb split. v1: https://lore.kernel.org/netdev/20260420162734.854587-1-michael.bommarito@gmail.com/ net/netlink/af_netlink.c | 47 ++++++++++++++++++++++++++-------------- 1 file changed, 31 insertions(+), 16 deletions(-) diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c index 4d609d5cf406..ab21a6218631 100644 --- a/net/netlink/af_netlink.c +++ b/net/netlink/af_netlink.c @@ -131,6 +131,7 @@ static const char *const nlk_cb_mutex_key_strings[MAX_LINKS + 1] = { }; static int netlink_dump(struct sock *sk, bool lock_taken); +static void netlink_dump_cleanup(struct netlink_sock *nlk, bool drop); /* nl_table locking explained: * Lookup and traversal are protected with an RCU read-side lock. Insertion @@ -763,13 +764,8 @@ static int netlink_release(struct socket *sock) } /* Terminate any outstanding dump */ - if (nlk->cb_running) { - if (nlk->cb.done) - nlk->cb.done(&nlk->cb); - module_put(nlk->cb.module); - kfree_skb(nlk->cb.skb); - WRITE_ONCE(nlk->cb_running, false); - } + if (nlk->cb_running) + netlink_dump_cleanup(nlk, true); module_put(nlk->module); @@ -2250,6 +2246,26 @@ static int netlink_dump_done(struct netlink_sock *nlk, struct sk_buff *skb, return 0; } +/* Must be called with nl_cb_mutex NOT held. @drop=true frees the skb + * via kfree_skb() so drop-monitor sees the teardown; @drop=false uses + * consume_skb() for the normal-completion path. + */ +static void netlink_dump_cleanup(struct netlink_sock *nlk, bool drop) +{ + struct module *module = nlk->cb.module; + struct sk_buff *skb = nlk->cb.skb; + + if (nlk->cb.done) + nlk->cb.done(&nlk->cb); + + WRITE_ONCE(nlk->cb_running, false); + module_put(module); + if (drop) + kfree_skb(skb); + else + consume_skb(skb); +} + static int netlink_dump(struct sock *sk, bool lock_taken) { struct netlink_sock *nlk = nlk_sk(sk); @@ -2258,7 +2274,6 @@ static int netlink_dump(struct sock *sk, bool lock_taken) struct sk_buff *skb = NULL; unsigned int rmem, rcvbuf; size_t max_recvmsg_len; - struct module *module; int err = -ENOBUFS; int alloc_min_size; int alloc_size; @@ -2366,19 +2381,19 @@ static int netlink_dump(struct sock *sk, bool lock_taken) else __netlink_sendskb(sk, skb); - if (cb->done) - cb->done(cb); - - WRITE_ONCE(nlk->cb_running, false); - module = cb->module; - skb = cb->skb; mutex_unlock(&nlk->nl_cb_mutex); - module_put(module); - consume_skb(skb); + netlink_dump_cleanup(nlk, false); return 0; errout_skb: mutex_unlock(&nlk->nl_cb_mutex); + /* The recvmsg() retry path (lock_taken=false) keeps cb_running so + * the next recvmsg() can drive the dump forward once receive room + * is available; only the initial __netlink_dump_start() failure + * owns the teardown. + */ + if (lock_taken) + netlink_dump_cleanup(nlk, true); kfree_skb(skb); return err; } -- 2.53.0