From: Geliang Tang This patch extracts the free skb related code in __mptcp_recvmsg_mskq() into a new helper mptcp_eat_recv_skb(). This new helper will be used in the next patch. Signed-off-by: Geliang Tang Reviewed-by: Mat Martineau Signed-off-by: Matthieu Baerts (NGI0) --- net/mptcp/protocol.c | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index 8d3233667418..5eb2a6182d4b 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -1995,6 +1995,17 @@ static int mptcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) static void mptcp_rcv_space_adjust(struct mptcp_sock *msk, int copied); +static void mptcp_eat_recv_skb(struct sock *sk, struct sk_buff *skb) +{ + /* avoid the indirect call, we know the destructor is sock_rfree */ + skb->destructor = NULL; + skb->sk = NULL; + atomic_sub(skb->truesize, &sk->sk_rmem_alloc); + sk_mem_uncharge(sk, skb->truesize); + __skb_unlink(skb, &sk->sk_receive_queue); + skb_attempt_defer_free(skb); +} + static int __mptcp_recvmsg_mskq(struct sock *sk, struct msghdr *msg, size_t len, int flags, int copied_total, struct scm_timestamping_internal *tss, @@ -2049,13 +2060,7 @@ static int __mptcp_recvmsg_mskq(struct sock *sk, struct msghdr *msg, break; } - /* avoid the indirect call, we know the destructor is sock_rfree */ - skb->destructor = NULL; - skb->sk = NULL; - atomic_sub(skb->truesize, &sk->sk_rmem_alloc); - sk_mem_uncharge(sk, skb->truesize); - __skb_unlink(skb, &sk->sk_receive_queue); - skb_attempt_defer_free(skb); + mptcp_eat_recv_skb(sk, skb); } if (copied >= len) -- 2.51.0 From: Geliang Tang Current in-kernel TCP sockets -- i.e. from nvme_tcp_try_recv() -- need to call .read_sock interface of struct proto_ops, but it's not implemented in MPTCP. This patch implements it with reference to __tcp_read_sock() and __mptcp_recvmsg_mskq(). Corresponding to tcp_recv_skb(), a new helper for MPTCP named mptcp_recv_skb() is added to peek a skb from sk->sk_receive_queue. Compared with __mptcp_recvmsg_mskq(), mptcp_read_sock() uses sk->sk_rcvbuf as the max read length. The LISTEN status is checked before the while loop, and mptcp_recv_skb() and mptcp_cleanup_rbuf() are invoked after the loop. In the loop, all flags checks for __mptcp_recvmsg_mskq() are removed. Reviewed-by: Hannes Reinecke Signed-off-by: Geliang Tang Reviewed-by: Mat Martineau Signed-off-by: Matthieu Baerts (NGI0) --- net/mptcp/protocol.c | 82 ++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 82 insertions(+) diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index 5eb2a6182d4b..2071f984da12 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -4317,6 +4317,86 @@ static __poll_t mptcp_poll(struct file *file, struct socket *sock, return mask; } +static struct sk_buff *mptcp_recv_skb(struct sock *sk, u32 *off) +{ + struct mptcp_sock *msk = mptcp_sk(sk); + struct sk_buff *skb; + u32 offset; + + if (!list_empty(&msk->backlog_list)) + mptcp_move_skbs(sk); + + while ((skb = skb_peek(&sk->sk_receive_queue)) != NULL) { + offset = MPTCP_SKB_CB(skb)->offset; + if (offset < skb->len) { + *off = offset; + return skb; + } + mptcp_eat_recv_skb(sk, skb); + } + return NULL; +} + +/* + * Note: + * - It is assumed that the socket was locked by the caller. + */ +static int __mptcp_read_sock(struct sock *sk, read_descriptor_t *desc, + sk_read_actor_t recv_actor, bool noack) +{ + struct mptcp_sock *msk = mptcp_sk(sk); + struct sk_buff *skb; + int copied = 0; + u32 offset; + + msk_owned_by_me(msk); + + if (sk->sk_state == TCP_LISTEN) + return -ENOTCONN; + while ((skb = mptcp_recv_skb(sk, &offset)) != NULL) { + u32 data_len = skb->len - offset; + int count; + u32 size; + + size = min_t(size_t, data_len, INT_MAX); + count = recv_actor(desc, skb, offset, size); + if (count <= 0) { + if (!copied) + copied = count; + break; + } + + copied += count; + + msk->bytes_consumed += count; + if (count < data_len) { + MPTCP_SKB_CB(skb)->offset += count; + MPTCP_SKB_CB(skb)->map_seq += count; + break; + } + + mptcp_eat_recv_skb(sk, skb); + } + + if (noack) + goto out; + + mptcp_rcv_space_adjust(msk, copied); + + if (copied > 0) { + mptcp_recv_skb(sk, &offset); + mptcp_cleanup_rbuf(msk, copied); + } +out: + return copied; +} + +static int mptcp_read_sock(struct sock *sk, read_descriptor_t *desc, + sk_read_actor_t recv_actor) +{ + return __mptcp_read_sock(sk, desc, recv_actor, false); +} + static const struct proto_ops mptcp_stream_ops = { .family = PF_INET, .owner = THIS_MODULE, @@ -4337,6 +4417,7 @@ static const struct proto_ops mptcp_stream_ops = { .recvmsg = inet_recvmsg, .mmap = sock_no_mmap, .set_rcvlowat = mptcp_set_rcvlowat, + .read_sock = mptcp_read_sock, }; static struct inet_protosw mptcp_protosw = { @@ -4441,6 +4522,7 @@ static const struct proto_ops mptcp_v6_stream_ops = { .compat_ioctl = inet6_compat_ioctl, #endif .set_rcvlowat = mptcp_set_rcvlowat, + .read_sock = mptcp_read_sock, }; static struct proto mptcp_v6_prot; -- 2.51.0 From: Geliang Tang Export struct tcp_splice_state and tcp_splice_data_recv() in net/tcp.h so that they can be used by MPTCP in the next patch. Suggested-by: Paolo Abeni Signed-off-by: Geliang Tang Acked-by: Mat Martineau Signed-off-by: Matthieu Baerts (NGI0) --- include/net/tcp.h | 11 +++++++++++ net/ipv4/tcp.c | 13 ++----------- 2 files changed, 13 insertions(+), 11 deletions(-) diff --git a/include/net/tcp.h b/include/net/tcp.h index f1cf9e6730c8..cecec1a92d5e 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -347,6 +347,15 @@ extern struct proto tcp_prot; #define TCP_DEC_STATS(net, field) SNMP_DEC_STATS((net)->mib.tcp_statistics, field) #define TCP_ADD_STATS(net, field, val) SNMP_ADD_STATS((net)->mib.tcp_statistics, field, val) +/* + * TCP splice context + */ +struct tcp_splice_state { + struct pipe_inode_info *pipe; + size_t len; + unsigned int flags; +}; + void tcp_tsq_work_init(void); int tcp_v4_err(struct sk_buff *skb, u32); @@ -378,6 +387,8 @@ void tcp_rcv_space_adjust(struct sock *sk); int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp); void tcp_twsk_destructor(struct sock *sk); void tcp_twsk_purge(struct list_head *net_exit_list); +int tcp_splice_data_recv(read_descriptor_t *rd_desc, struct sk_buff *skb, + unsigned int offset, size_t len); ssize_t tcp_splice_read(struct socket *sk, loff_t *ppos, struct pipe_inode_info *pipe, size_t len, unsigned int flags); diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index e4009158b908..6e94c5859f4b 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -318,15 +318,6 @@ EXPORT_SYMBOL(tcp_have_smc); struct percpu_counter tcp_sockets_allocated ____cacheline_aligned_in_smp; EXPORT_IPV6_MOD(tcp_sockets_allocated); -/* - * TCP splice context - */ -struct tcp_splice_state { - struct pipe_inode_info *pipe; - size_t len; - unsigned int flags; -}; - /* * Pressure flag: try to collapse. * Technical note: it is used by multiple contexts non atomically. @@ -791,8 +782,8 @@ void tcp_push(struct sock *sk, int flags, int mss_now, __tcp_push_pending_frames(sk, mss_now, nonagle); } -static int tcp_splice_data_recv(read_descriptor_t *rd_desc, struct sk_buff *skb, - unsigned int offset, size_t len) +int tcp_splice_data_recv(read_descriptor_t *rd_desc, struct sk_buff *skb, + unsigned int offset, size_t len) { struct tcp_splice_state *tss = rd_desc->arg.data; int ret; -- 2.51.0 From: Geliang Tang This patch implements .splice_read interface of mptcp struct proto_ops as mptcp_splice_read() with reference to tcp_splice_read(). Corresponding to __tcp_splice_read(), __mptcp_splice_read() is defined, invoking mptcp_read_sock() instead of tcp_read_sock(). mptcp_splice_read() is almost the same as tcp_splice_read(), except for sock_rps_record_flow(). Signed-off-by: Geliang Tang Reviewed-by: Mat Martineau Signed-off-by: Matthieu Baerts (NGI0) --- v2: Fix kdoc warning reported by NIPA --- net/mptcp/protocol.c | 117 +++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 117 insertions(+) diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index 2071f984da12..9b8c51937eb2 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -4397,6 +4397,121 @@ static int mptcp_read_sock(struct sock *sk, read_descriptor_t *desc, return __mptcp_read_sock(sk, desc, recv_actor, false); } +static int __mptcp_splice_read(struct sock *sk, struct tcp_splice_state *tss) +{ + /* Store TCP splice context information in read_descriptor_t. */ + read_descriptor_t rd_desc = { + .arg.data = tss, + .count = tss->len, + }; + + return mptcp_read_sock(sk, &rd_desc, tcp_splice_data_recv); +} + +/** + * mptcp_splice_read - splice data from MPTCP socket to a pipe + * @sock: socket to splice from + * @ppos: position (not valid) + * @pipe: pipe to splice to + * @len: number of bytes to splice + * @flags: splice modifier flags + * + * Description: + * Will read pages from given socket and fill them into a pipe. + * + * Return: + * Amount of bytes that have been spliced. + * + **/ +static ssize_t mptcp_splice_read(struct socket *sock, loff_t *ppos, + struct pipe_inode_info *pipe, size_t len, + unsigned int flags) +{ + struct tcp_splice_state tss = { + .pipe = pipe, + .len = len, + .flags = flags, + }; + struct sock *sk = sock->sk; + ssize_t spliced = 0; + int ret = 0; + long timeo; + + /* + * We can't seek on a socket input + */ + if (unlikely(*ppos)) + return -ESPIPE; + + lock_sock(sk); + + mptcp_rps_record_subflows(mptcp_sk(sk)); + + timeo = sock_rcvtimeo(sk, sock->file->f_flags & O_NONBLOCK); + while (tss.len) { + ret = __mptcp_splice_read(sk, &tss); + if (ret < 0) { + break; + } else if (!ret) { + if (spliced) + break; + if (sock_flag(sk, SOCK_DONE)) + break; + if (sk->sk_err) { + ret = sock_error(sk); + break; + } + if (sk->sk_shutdown & RCV_SHUTDOWN) + break; + if (sk->sk_state == TCP_CLOSE) { + /* + * This occurs when user tries to read + * from never connected socket. + */ + ret = -ENOTCONN; + break; + } + if (!timeo) { + ret = -EAGAIN; + break; + } + /* if __mptcp_splice_read() got nothing while we have + * an skb in receive queue, we do not want to loop. + * This might happen with URG data. + */ + if (!skb_queue_empty(&sk->sk_receive_queue)) + break; + ret = sk_wait_data(sk, &timeo, NULL); + if (ret < 0) + break; + if (signal_pending(current)) { + ret = sock_intr_errno(timeo); + break; + } + continue; + } + tss.len -= ret; + spliced += ret; + + if (!tss.len || !timeo) + break; + release_sock(sk); + lock_sock(sk); + + if (sk->sk_err || sk->sk_state == TCP_CLOSE || + (sk->sk_shutdown & RCV_SHUTDOWN) || + signal_pending(current)) + break; + } + + release_sock(sk); + + if (spliced) + return spliced; + + return ret; +} + static const struct proto_ops mptcp_stream_ops = { .family = PF_INET, .owner = THIS_MODULE, @@ -4418,6 +4533,7 @@ static const struct proto_ops mptcp_stream_ops = { .mmap = sock_no_mmap, .set_rcvlowat = mptcp_set_rcvlowat, .read_sock = mptcp_read_sock, + .splice_read = mptcp_splice_read, }; static struct inet_protosw mptcp_protosw = { @@ -4523,6 +4639,7 @@ static const struct proto_ops mptcp_v6_stream_ops = { #endif .set_rcvlowat = mptcp_set_rcvlowat, .read_sock = mptcp_read_sock, + .splice_read = mptcp_splice_read, }; static struct proto mptcp_v6_prot; -- 2.51.0 From: Geliang Tang This patch adds a new 'splice' io mode for mptcp_connect to test the newly added read_sock() and splice_read() functions of MPTCP. do_splice() efficiently transfers data directly between two file descriptors (infd and outfd) without copying to userspace, using Linux's splice() system call. Usage: ./mptcp_connect.sh -m splice Signed-off-by: Geliang Tang Reviewed-by: Mat Martineau Co-developed-by: Matthieu Baerts (NGI0) Signed-off-by: Matthieu Baerts (NGI0) --- v2: catch possible errors with splice() (NIPA review assistant) --- tools/testing/selftests/net/mptcp/mptcp_connect.c | 79 ++++++++++++++++++++++- 1 file changed, 78 insertions(+), 1 deletion(-) diff --git a/tools/testing/selftests/net/mptcp/mptcp_connect.c b/tools/testing/selftests/net/mptcp/mptcp_connect.c index 10f6f99cfd4e..a74b13e42ecd 100644 --- a/tools/testing/selftests/net/mptcp/mptcp_connect.c +++ b/tools/testing/selftests/net/mptcp/mptcp_connect.c @@ -52,6 +52,7 @@ enum cfg_mode { CFG_MODE_POLL, CFG_MODE_MMAP, CFG_MODE_SENDFILE, + CFG_MODE_SPLICE, }; enum cfg_peek { @@ -124,7 +125,7 @@ static void die_usage(void) fprintf(stderr, "\t-j -- add additional sleep at connection start and tear down " "-- for MPJ tests\n"); fprintf(stderr, "\t-l -- listens mode, accepts incoming connection\n"); - fprintf(stderr, "\t-m [poll|mmap|sendfile] -- use poll(default)/mmap+write/sendfile\n"); + fprintf(stderr, "\t-m [poll|mmap|sendfile|splice] -- use poll(default)/mmap+write/sendfile/splice\n"); fprintf(stderr, "\t-M mark -- set socket packet mark\n"); fprintf(stderr, "\t-o option -- test sockopt