From: Chuck Lever Evaluating a dangling pointer is undefined behavior under the C standard. In tls_sw_read_sock(), consume_skb(skb) in the fully- consumed path frees the skb, but the "do { } while (skb)" loop condition then evaluates that freed pointer. Although the value is never dereferenced -- the loop either continues and overwrites skb, or exits -- any future change that adds a dereference between consume_skb() and the loop condition would produce a silent use-after-free. Fixes: 662fbcec32f4 ("net/tls: implement ->read_sock()") Reviewed-by: Hannes Reinecke Reviewed-by: Alistair Francis Signed-off-by: Chuck Lever --- net/tls/tls_sw.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/net/tls/tls_sw.c b/net/tls/tls_sw.c index a656ce235758..108d417dcfb7 100644 --- a/net/tls/tls_sw.c +++ b/net/tls/tls_sw.c @@ -2355,7 +2355,7 @@ int tls_sw_read_sock(struct sock *sk, read_descriptor_t *desc, goto read_sock_end; decrypted = 0; - do { + for (;;) { if (!skb_queue_empty(&ctx->rx_list)) { skb = __skb_dequeue(&ctx->rx_list); rxm = strp_msg(skb); @@ -2406,10 +2406,11 @@ int tls_sw_read_sock(struct sock *sk, read_descriptor_t *desc, goto read_sock_requeue; } else { consume_skb(skb); + skb = NULL; if (!desc->count) - skb = NULL; + break; } - } while (skb); + } read_sock_end: tls_rx_reader_release(sk, ctx); -- 2.53.0 From: Chuck Lever tls_strp_msg_done() conflates releasing the current record with checking for the next one via tls_strp_check_rcv(). Batch processing requires releasing a record without immediately triggering that check, so the release step is separated into tls_strp_msg_release(). tls_strp_msg_done() is preserved as a wrapper for existing callers. Reviewed-by: Hannes Reinecke Reviewed-by: Alistair Francis Signed-off-by: Chuck Lever --- net/tls/tls.h | 1 + net/tls/tls_strp.c | 15 ++++++++++++++- 2 files changed, 15 insertions(+), 1 deletion(-) diff --git a/net/tls/tls.h b/net/tls/tls.h index e8f81a006520..a97f1acef31d 100644 --- a/net/tls/tls.h +++ b/net/tls/tls.h @@ -193,6 +193,7 @@ int tls_strp_init(struct tls_strparser *strp, struct sock *sk); void tls_strp_data_ready(struct tls_strparser *strp); void tls_strp_check_rcv(struct tls_strparser *strp); +void tls_strp_msg_release(struct tls_strparser *strp); void tls_strp_msg_done(struct tls_strparser *strp); int tls_rx_msg_size(struct tls_strparser *strp, struct sk_buff *skb); diff --git a/net/tls/tls_strp.c b/net/tls/tls_strp.c index 98e12f0ff57e..a7648ebde162 100644 --- a/net/tls/tls_strp.c +++ b/net/tls/tls_strp.c @@ -581,7 +581,16 @@ static void tls_strp_work(struct work_struct *w) release_sock(strp->sk); } -void tls_strp_msg_done(struct tls_strparser *strp) +/** + * tls_strp_msg_release - release the current strparser message + * @strp: TLS stream parser instance + * + * Release the current record without triggering a check for the + * next record. Callers must invoke tls_strp_check_rcv() before + * releasing the socket lock, or queued data will stall until + * the next tls_strp_data_ready() event. + */ +void tls_strp_msg_release(struct tls_strparser *strp) { WARN_ON(!strp->stm.full_len); @@ -592,7 +601,11 @@ void tls_strp_msg_done(struct tls_strparser *strp) WRITE_ONCE(strp->msg_ready, 0); memset(&strp->stm, 0, sizeof(strp->stm)); +} +void tls_strp_msg_done(struct tls_strparser *strp) +{ + tls_strp_msg_release(strp); tls_strp_check_rcv(strp); } -- 2.53.0 From: Chuck Lever During tls_sw_read_sock(), each record release triggers tls_strp_msg_done(), which calls back through the strparser into saved_data_ready(). For a batch of N records, the first N-1 wakeups are pure overhead: the read_sock callback is already running and will pick up subsequent records on the next iteration. Remove the per-record wakeup from the record-release path by introducing tls_rx_rec_release(), which calls tls_strp_msg_release() instead of tls_strp_msg_done(). Factor tls_rx_msg_ready() out of tls_strp_read_sock() so that parsing a record no longer fires the callback directly, and introduce tls_strp_check_rcv_quiet() for use in tls_rx_rec_wait(), which parses queued data without notifying. A single tls_strp_check_rcv() at the read_sock_end exit point fires at most one notification, and only when data remains after the loop exits. Acked-by: Alistair Francis Reviewed-by: Hannes Reinecke Signed-off-by: Chuck Lever --- net/tls/tls.h | 1 + net/tls/tls_strp.c | 21 +++++++++++++++++++-- net/tls/tls_sw.c | 13 +++++++++++-- 3 files changed, 31 insertions(+), 4 deletions(-) diff --git a/net/tls/tls.h b/net/tls/tls.h index a97f1acef31d..d58d86e8e43e 100644 --- a/net/tls/tls.h +++ b/net/tls/tls.h @@ -193,6 +193,7 @@ int tls_strp_init(struct tls_strparser *strp, struct sock *sk); void tls_strp_data_ready(struct tls_strparser *strp); void tls_strp_check_rcv(struct tls_strparser *strp); +void tls_strp_check_rcv_quiet(struct tls_strparser *strp); void tls_strp_msg_release(struct tls_strparser *strp); void tls_strp_msg_done(struct tls_strparser *strp); diff --git a/net/tls/tls_strp.c b/net/tls/tls_strp.c index a7648ebde162..7b9f5051becb 100644 --- a/net/tls/tls_strp.c +++ b/net/tls/tls_strp.c @@ -368,7 +368,6 @@ static int tls_strp_copyin(read_descriptor_t *desc, struct sk_buff *in_skb, desc->count = 0; WRITE_ONCE(strp->msg_ready, 1); - tls_rx_msg_ready(strp); } return ret; @@ -539,11 +538,27 @@ static int tls_strp_read_sock(struct tls_strparser *strp) return tls_strp_read_copy(strp, false); WRITE_ONCE(strp->msg_ready, 1); - tls_rx_msg_ready(strp); return 0; } +/** + * tls_strp_check_rcv_quiet - parse without consumer notification + * @strp: TLS stream parser instance + * + * Parse queued data without firing the consumer notification. A subsequent + * tls_strp_check_rcv() is required before the socket lock is released; + * otherwise queued data stalls until the next tls_strp_data_ready() event. + */ +void tls_strp_check_rcv_quiet(struct tls_strparser *strp) +{ + if (unlikely(strp->stopped) || strp->msg_ready) + return; + + if (tls_strp_read_sock(strp) == -ENOMEM) + queue_work(tls_strp_wq, &strp->work); +} + void tls_strp_check_rcv(struct tls_strparser *strp) { if (unlikely(strp->stopped) || strp->msg_ready) @@ -551,6 +566,8 @@ void tls_strp_check_rcv(struct tls_strparser *strp) if (tls_strp_read_sock(strp) == -ENOMEM) queue_work(tls_strp_wq, &strp->work); + else if (strp->msg_ready) + tls_rx_msg_ready(strp); } /* Lower sock lock held */ diff --git a/net/tls/tls_sw.c b/net/tls/tls_sw.c index 108d417dcfb7..a5905f4c1ae2 100644 --- a/net/tls/tls_sw.c +++ b/net/tls/tls_sw.c @@ -1372,7 +1372,10 @@ tls_rx_rec_wait(struct sock *sk, struct sk_psock *psock, bool nonblock, return ret; if (!skb_queue_empty(&sk->sk_receive_queue)) { - tls_strp_check_rcv(&ctx->strp); + /* tls_strp_check_rcv() is called on the read_sock_end + * path before the socket lock is released. + */ + tls_strp_check_rcv_quiet(&ctx->strp); if (tls_strp_msg_ready(ctx)) break; } @@ -1853,6 +1856,11 @@ static int tls_record_content_type(struct msghdr *msg, struct tls_msg *tlm, return 1; } +static void tls_rx_rec_release(struct tls_sw_context_rx *ctx) +{ + tls_strp_msg_release(&ctx->strp); +} + static void tls_rx_rec_done(struct tls_sw_context_rx *ctx) { tls_strp_msg_done(&ctx->strp); @@ -2383,7 +2391,7 @@ int tls_sw_read_sock(struct sock *sk, read_descriptor_t *desc, tlm = tls_msg(skb); decrypted += rxm->full_len; - tls_rx_rec_done(ctx); + tls_rx_rec_release(ctx); } /* read_sock does not support reading control messages */ @@ -2413,6 +2421,7 @@ int tls_sw_read_sock(struct sock *sk, read_descriptor_t *desc, } read_sock_end: + tls_strp_check_rcv(&ctx->strp); tls_rx_reader_release(sk, ctx); return copied ? : err; -- 2.53.0 From: Chuck Lever While lock_sock is held during read_sock, incoming TCP segments land on sk->sk_backlog rather than sk->sk_receive_queue. tls_rx_rec_wait() inspects only sk_receive_queue, so backlog data remains invisible until release_sock() drains it, forcing an extra workqueue cycle for records that arrive during decryption. Calling sk_flush_backlog() before tls_rx_rec_wait() moves backlog data into sk_receive_queue, where tls_strp_check_rcv() can parse it immediately. The existing tls_read_flush_backlog call after decryption is retained for TCP window management. Acked-by: Alistair Francis Reviewed-by: Hannes Reinecke Signed-off-by: Chuck Lever --- net/tls/tls_sw.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/net/tls/tls_sw.c b/net/tls/tls_sw.c index a5905f4c1ae2..70a9c2402ea1 100644 --- a/net/tls/tls_sw.c +++ b/net/tls/tls_sw.c @@ -2371,6 +2371,11 @@ int tls_sw_read_sock(struct sock *sk, read_descriptor_t *desc, } else { struct tls_decrypt_arg darg; + /* Drain backlog so segments that arrived while the + * lock was held appear on sk_receive_queue before + * tls_rx_rec_wait waits for a new record. + */ + sk_flush_backlog(sk); err = tls_rx_rec_wait(sk, NULL, true, released); if (err <= 0) goto read_sock_end; -- 2.53.0 From: Chuck Lever Pipelining multiple AEAD operations requires separating decryption from delivery so that several records can be submitted before any are passed to the read_actor callback. The main loop in tls_sw_read_sock() is split into two explicit phases: a submit phase that decrypts one record onto ctx->rx_list, and a deliver phase that drains rx_list and passes each cleartext skb to the read_actor callback. With a single record per submit phase, behavior is identical to the previous code. A subsequent patch will extend the submit phase to pipeline multiple AEAD operations. Reviewed-by: Hannes Reinecke Signed-off-by: Chuck Lever --- net/tls/tls_sw.c | 79 +++++++++++++++++++++++++----------------------- 1 file changed, 41 insertions(+), 38 deletions(-) diff --git a/net/tls/tls_sw.c b/net/tls/tls_sw.c index 70a9c2402ea1..0e72a74ab1ee 100644 --- a/net/tls/tls_sw.c +++ b/net/tls/tls_sw.c @@ -2338,8 +2338,8 @@ int tls_sw_read_sock(struct sock *sk, read_descriptor_t *desc, struct tls_context *tls_ctx = tls_get_ctx(sk); struct tls_sw_context_rx *ctx = tls_sw_ctx_rx(tls_ctx); struct tls_prot_info *prot = &tls_ctx->prot_info; - struct strp_msg *rxm = NULL; struct sk_buff *skb = NULL; + struct strp_msg *rxm; struct sk_psock *psock; size_t flushed_at = 0; bool released = true; @@ -2364,17 +2364,15 @@ int tls_sw_read_sock(struct sock *sk, read_descriptor_t *desc, decrypted = 0; for (;;) { - if (!skb_queue_empty(&ctx->rx_list)) { - skb = __skb_dequeue(&ctx->rx_list); - rxm = strp_msg(skb); - tlm = tls_msg(skb); - } else { - struct tls_decrypt_arg darg; + struct tls_decrypt_arg darg; - /* Drain backlog so segments that arrived while the - * lock was held appear on sk_receive_queue before - * tls_rx_rec_wait waits for a new record. - */ + /* Phase 1: Submit -- decrypt one record onto rx_list. + * Flush the backlog first so that segments that + * arrived while the lock was held appear on + * sk_receive_queue before tls_rx_rec_wait waits + * for a new record. + */ + if (skb_queue_empty(&ctx->rx_list)) { sk_flush_backlog(sk); err = tls_rx_rec_wait(sk, NULL, true, released); if (err <= 0) @@ -2391,38 +2389,43 @@ int tls_sw_read_sock(struct sock *sk, read_descriptor_t *desc, released = tls_read_flush_backlog(sk, prot, INT_MAX, 0, decrypted, &flushed_at); - skb = darg.skb; + decrypted += strp_msg(darg.skb)->full_len; + tls_rx_rec_release(ctx); + __skb_queue_tail(&ctx->rx_list, darg.skb); + } + + /* Phase 2: Deliver -- drain rx_list to read_actor */ + while ((skb = __skb_dequeue(&ctx->rx_list)) != NULL) { rxm = strp_msg(skb); tlm = tls_msg(skb); - decrypted += rxm->full_len; - tls_rx_rec_release(ctx); - } - - /* read_sock does not support reading control messages */ - if (tlm->control != TLS_RECORD_TYPE_DATA) { - err = -EINVAL; - goto read_sock_requeue; - } - - used = read_actor(desc, skb, rxm->offset, rxm->full_len); - if (used <= 0) { - if (!copied) - err = used; - goto read_sock_requeue; - } - copied += used; - if (used < rxm->full_len) { - rxm->offset += used; - rxm->full_len -= used; - if (!desc->count) + /* read_sock does not support reading control messages */ + if (tlm->control != TLS_RECORD_TYPE_DATA) { + err = -EINVAL; goto read_sock_requeue; - } else { - consume_skb(skb); - skb = NULL; - if (!desc->count) - break; + } + + used = read_actor(desc, skb, rxm->offset, + rxm->full_len); + if (used <= 0) { + if (!copied) + err = used; + goto read_sock_requeue; + } + copied += used; + if (used < rxm->full_len) { + rxm->offset += used; + rxm->full_len -= used; + if (!desc->count) + goto read_sock_requeue; + } else { + consume_skb(skb); + skb = NULL; + } } + /* Drain all of rx_list before honoring !desc->count */ + if (!desc->count) + break; } read_sock_end: -- 2.53.0 From: Chuck Lever tls_sw_read_sock() decrypts one TLS record at a time, blocking until each AEAD operation completes before proceeding. Hardware async crypto engines depend on pipelining multiple operations to achieve full throughput, and the one-at-a-time model prevents that. Kernel consumers such as NVMe-TCP and NFSD (when using TLS) are therefore unable to benefit from hardware offload. When ctx->async_capable is true, the submit phase now loops up to TLS_READ_SOCK_BATCH (16) records. The first record waits via tls_rx_rec_wait(); subsequent iterations use tls_strp_msg_ready() and tls_strp_check_rcv() to collect records already queued on the socket without blocking. Each record is submitted with darg.async set, and all resulting skbs are appended to rx_list. After the submit loop, a single tls_decrypt_async_wait() collects all pending AEAD completions before the deliver phase passes cleartext records to the consumer. The batch bound of 16 limits concurrent memory consumption to 16 cleartext skbs plus their AEAD contexts. If async_capable is false, the loop exits after one record and the async wait is skipped, preserving prior behavior. Reviewed-by: Hannes Reinecke Signed-off-by: Chuck Lever --- net/tls/tls_sw.c | 81 ++++++++++++++++++++++++++++++++++++++---------- 1 file changed, 65 insertions(+), 16 deletions(-) diff --git a/net/tls/tls_sw.c b/net/tls/tls_sw.c index 0e72a74ab1ee..d5b9ee7aca57 100644 --- a/net/tls/tls_sw.c +++ b/net/tls/tls_sw.c @@ -2332,6 +2332,8 @@ ssize_t tls_sw_splice_read(struct socket *sock, loff_t *ppos, goto splice_read_end; } +#define TLS_READ_SOCK_BATCH 16 + int tls_sw_read_sock(struct sock *sk, read_descriptor_t *desc, sk_read_actor_t read_actor) { @@ -2343,6 +2345,7 @@ int tls_sw_read_sock(struct sock *sk, read_descriptor_t *desc, struct sk_psock *psock; size_t flushed_at = 0; bool released = true; + bool async = false; struct tls_msg *tlm; ssize_t copied = 0; ssize_t decrypted; @@ -2365,33 +2368,71 @@ int tls_sw_read_sock(struct sock *sk, read_descriptor_t *desc, decrypted = 0; for (;;) { struct tls_decrypt_arg darg; + int nr_async = 0; - /* Phase 1: Submit -- decrypt one record onto rx_list. + /* Phase 1: Submit -- decrypt records onto rx_list. * Flush the backlog first so that segments that * arrived while the lock was held appear on * sk_receive_queue before tls_rx_rec_wait waits * for a new record. */ if (skb_queue_empty(&ctx->rx_list)) { - sk_flush_backlog(sk); - err = tls_rx_rec_wait(sk, NULL, true, released); - if (err <= 0) - goto read_sock_end; + while (nr_async < TLS_READ_SOCK_BATCH) { + if (nr_async == 0) { + sk_flush_backlog(sk); + err = tls_rx_rec_wait(sk, NULL, + true, + released); + if (err <= 0) + goto read_sock_end; + } else { + if (!tls_strp_msg_ready(ctx)) { + tls_strp_check_rcv_quiet(&ctx->strp); + if (!tls_strp_msg_ready(ctx)) + break; + } + if (!tls_strp_msg_load(&ctx->strp, + released)) + break; + } - memset(&darg.inargs, 0, sizeof(darg.inargs)); + memset(&darg.inargs, 0, + sizeof(darg.inargs)); + darg.async = ctx->async_capable; - err = tls_rx_one_record(sk, NULL, &darg); - if (err < 0) { - tls_err_abort(sk, -EBADMSG); + err = tls_rx_one_record(sk, NULL, &darg); + if (err < 0) { + tls_err_abort(sk, -EBADMSG); + goto read_sock_end; + } + + async |= darg.async; + released = tls_read_flush_backlog(sk, prot, + INT_MAX, + 0, + decrypted, + &flushed_at); + decrypted += strp_msg(darg.skb)->full_len; + tls_rx_rec_release(ctx); + __skb_queue_tail(&ctx->rx_list, darg.skb); + nr_async++; + + if (!ctx->async_capable) + break; + } + } + + /* Async wait -- collect pending AEAD completions */ + if (async) { + int ret = tls_decrypt_async_wait(ctx); + + __skb_queue_purge(&ctx->async_hold); + async = false; + if (ret) { + __skb_queue_purge(&ctx->rx_list); + err = ret; goto read_sock_end; } - - released = tls_read_flush_backlog(sk, prot, INT_MAX, - 0, decrypted, - &flushed_at); - decrypted += strp_msg(darg.skb)->full_len; - tls_rx_rec_release(ctx); - __skb_queue_tail(&ctx->rx_list, darg.skb); } /* Phase 2: Deliver -- drain rx_list to read_actor */ @@ -2429,6 +2470,14 @@ int tls_sw_read_sock(struct sock *sk, read_descriptor_t *desc, } read_sock_end: + if (async) { + int ret = tls_decrypt_async_wait(ctx); + + __skb_queue_purge(&ctx->async_hold); + __skb_queue_purge(&ctx->rx_list); + if (ret && !err) + err = ret; + } tls_strp_check_rcv(&ctx->strp); tls_rx_reader_release(sk, ctx); return copied ? : err; -- 2.53.0