From: Gang Yan The 'overhead' in struct mptcp_data_frag can safely use u8, as it represents 'alignment + sizeof(mptcp_data_frag)'. With a maximum alignment of 7('ALIGN(1, sizeof(long)) - 1'), the overhead is at most 47, well below U8_MAX and validated with BUILD_BUG_ON(). This patch also adds a field named 'unused' for further extensions. Signed-off-by: Gang Yan Reviewed-by: Matthieu Baerts (NGI0) Signed-off-by: Matthieu Baerts (NGI0) --- net/mptcp/protocol.c | 6 ++++++ net/mptcp/protocol.h | 3 ++- 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index 239fb9e75c7c..79315e575d07 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -4616,6 +4616,12 @@ void __init mptcp_proto_init(void) inet_register_protosw(&mptcp_protosw); BUILD_BUG_ON(sizeof(struct mptcp_skb_cb) > sizeof_field(struct sk_buff, cb)); + + /* struct mptcp_data_frag: 'overhead' corresponds to the alignment + * (ALIGN(1, sizeof(long)) - 1, so 8-1) + the struct's size + */ + BUILD_BUG_ON(ALIGN(1, sizeof(long)) - 1 + sizeof(struct mptcp_data_frag) + > U8_MAX); } #if IS_ENABLED(CONFIG_MPTCP_IPV6) diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h index 0bd1ee860316..02031007100b 100644 --- a/net/mptcp/protocol.h +++ b/net/mptcp/protocol.h @@ -263,7 +263,8 @@ struct mptcp_data_frag { u64 data_seq; u16 data_len; u16 offset; - u16 overhead; + u8 overhead; + u8 __unused; u16 already_sent; struct page *page; }; -- 2.53.0 From: Gang Yan Extend MPTCP's sendmsg handling to recognize and honor the MSG_EOR flag, which marks the end of a record for application-level message boundaries. Data fragments tagged with MSG_EOR are explicitly marked in the mptcp_data_frag structure and skb context to prevent unintended coalescing with subsequent data chunks. This ensures the intent of applications using MSG_EOR is preserved across MPTCP subflows, maintaining consistent message segmentation behavior. Signed-off-by: Gang Yan Reviewed-by: Matthieu Baerts (NGI0) Signed-off-by: Matthieu Baerts (NGI0) --- net/mptcp/protocol.c | 24 +++++++++++++++++++++--- net/mptcp/protocol.h | 2 +- 2 files changed, 22 insertions(+), 4 deletions(-) diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index 79315e575d07..e21e416cd19a 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -1005,7 +1005,8 @@ static bool mptcp_frag_can_collapse_to(const struct mptcp_sock *msk, const struct page_frag *pfrag, const struct mptcp_data_frag *df) { - return df && pfrag->page == df->page && + return df && !df->eor && + pfrag->page == df->page && pfrag->size - pfrag->offset > 0 && pfrag->offset == (df->offset + df->data_len) && df->data_seq + df->data_len == msk->write_seq; @@ -1147,6 +1148,7 @@ mptcp_carve_data_frag(const struct mptcp_sock *msk, struct page_frag *pfrag, dfrag->offset = offset + sizeof(struct mptcp_data_frag); dfrag->already_sent = 0; dfrag->page = pfrag->page; + dfrag->eor = 0; return dfrag; } @@ -1408,6 +1410,13 @@ static int mptcp_sendmsg_frag(struct sock *sk, struct sock *ssk, mptcp_update_infinite_map(msk, ssk, mpext); trace_mptcp_sendmsg_frag(mpext); mptcp_subflow_ctx(ssk)->rel_write_seq += copy; + + /* if this is the last chunk of a dfrag with MSG_EOR set, + * mark the skb to prevent coalescing with subsequent data. + */ + if (dfrag->eor && info->sent + copy >= dfrag->data_len) + TCP_SKB_CB(skb)->eor = 1; + return copy; } @@ -1868,7 +1877,8 @@ static int mptcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) long timeo; /* silently ignore everything else */ - msg->msg_flags &= MSG_MORE | MSG_DONTWAIT | MSG_NOSIGNAL | MSG_FASTOPEN; + msg->msg_flags &= MSG_MORE | MSG_DONTWAIT | MSG_NOSIGNAL | + MSG_FASTOPEN | MSG_EOR; lock_sock(sk); @@ -1975,8 +1985,16 @@ static int mptcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) goto do_error; } - if (copied) + if (copied) { + /* mark the last dfrag with EOR if MSG_EOR was set */ + if (msg->msg_flags & MSG_EOR) { + struct mptcp_data_frag *dfrag = mptcp_pending_tail(sk); + + if (dfrag) + dfrag->eor = 1; + } __mptcp_push_pending(sk, msg->msg_flags); + } out: release_sock(sk); diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h index 02031007100b..1208f317ac33 100644 --- a/net/mptcp/protocol.h +++ b/net/mptcp/protocol.h @@ -264,7 +264,7 @@ struct mptcp_data_frag { u16 data_len; u16 offset; u8 overhead; - u8 __unused; + u8 eor; /* currently using 1 bit */ u16 already_sent; struct page *page; }; -- 2.53.0 From: Geliang Tang Factor out a new helper tcp_recv_should_stop() from tcp_recvmsg_locked() and tcp_splice_read() to check whether to stop receiving. And use this helper in mptcp_recvmsg() and mptcp_splice_read() to reduce redundant code. Suggested-by: Paolo Abeni Acked-by: Mat Martineau Signed-off-by: Geliang Tang Acked-by: Matthieu Baerts (NGI0) Signed-off-by: Matthieu Baerts (NGI0) --- To: Neal Cardwell To: Kuniyuki Iwashima To: David Ahern --- include/net/tcp.h | 8 ++++++++ net/ipv4/tcp.c | 9 ++------- net/mptcp/protocol.c | 11 +++-------- 3 files changed, 13 insertions(+), 15 deletions(-) diff --git a/include/net/tcp.h b/include/net/tcp.h index 565943c34b7e..6156d1d068e1 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -3077,4 +3077,12 @@ enum skb_drop_reason tcp_inbound_hash(struct sock *sk, const void *saddr, const void *daddr, int family, int dif, int sdif); +static inline int tcp_recv_should_stop(struct sock *sk) +{ + return sk->sk_err || + sk->sk_state == TCP_CLOSE || + (sk->sk_shutdown & RCV_SHUTDOWN) || + signal_pending(current); +} + #endif /* _TCP_H */ diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index bd2c3c4587e1..e57eaffc007a 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -888,9 +888,7 @@ ssize_t tcp_splice_read(struct socket *sock, loff_t *ppos, release_sock(sk); lock_sock(sk); - if (sk->sk_err || sk->sk_state == TCP_CLOSE || - (sk->sk_shutdown & RCV_SHUTDOWN) || - signal_pending(current)) + if (tcp_recv_should_stop(sk)) break; } @@ -2755,10 +2753,7 @@ static int tcp_recvmsg_locked(struct sock *sk, struct msghdr *msg, size_t len, if (copied) { if (!timeo || - sk->sk_err || - sk->sk_state == TCP_CLOSE || - (sk->sk_shutdown & RCV_SHUTDOWN) || - signal_pending(current)) + tcp_recv_should_stop(sk)) break; } else { if (sock_flag(sk, SOCK_DONE)) diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index e21e416cd19a..2f4776a4f06a 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -2333,11 +2333,8 @@ static int mptcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, break; if (copied) { - if (sk->sk_err || - sk->sk_state == TCP_CLOSE || - (sk->sk_shutdown & RCV_SHUTDOWN) || - !timeo || - signal_pending(current)) + if (tcp_recv_should_stop(sk) || + !timeo) break; } else { if (sk->sk_err) { @@ -4520,9 +4517,7 @@ static ssize_t mptcp_splice_read(struct socket *sock, loff_t *ppos, release_sock(sk); lock_sock(sk); - if (sk->sk_err || sk->sk_state == TCP_CLOSE || - (sk->sk_shutdown & RCV_SHUTDOWN) || - signal_pending(current)) + if (tcp_recv_should_stop(sk)) break; } -- 2.53.0 There is no need to call this helper: it will check if the address ID attribute is set, but this attribute has already been parsed previously. Indeed, the value has been set in 'entry->addr.id' if it was set and positive, which is what we were looking at. Then only looking at this already parsed value is enough, not need to re-extract all Netlink attributes again. Reviewed-by: Geliang Tang Signed-off-by: Matthieu Baerts (NGI0) --- net/mptcp/pm_kernel.c | 24 +++++------------------- 1 file changed, 5 insertions(+), 19 deletions(-) diff --git a/net/mptcp/pm_kernel.c b/net/mptcp/pm_kernel.c index 82e59f9c6dd9..0ebf43be9939 100644 --- a/net/mptcp/pm_kernel.c +++ b/net/mptcp/pm_kernel.c @@ -720,7 +720,7 @@ static void __mptcp_pm_release_addr_entry(struct mptcp_pm_addr_entry *entry) static int mptcp_pm_nl_append_new_local_addr(struct pm_nl_pernet *pernet, struct mptcp_pm_addr_entry *entry, - bool needs_id, bool replace) + bool replace) { struct mptcp_pm_addr_entry *cur, *del_entry = NULL; int ret = -EINVAL; @@ -779,7 +779,7 @@ static int mptcp_pm_nl_append_new_local_addr(struct pm_nl_pernet *pernet, } } - if (!entry->addr.id && needs_id) { + if (!entry->addr.id) { find_next: entry->addr.id = find_next_zero_bit(pernet->id_bitmap, MPTCP_PM_MAX_ADDR_ID + 1, @@ -790,7 +790,7 @@ static int mptcp_pm_nl_append_new_local_addr(struct pm_nl_pernet *pernet, } } - if (!entry->addr.id && needs_id) + if (!entry->addr.id) goto out; __set_bit(entry->addr.id, pernet->id_bitmap); @@ -923,7 +923,7 @@ int mptcp_pm_nl_get_local_id(struct mptcp_sock *msk, return -ENOMEM; entry->addr.port = 0; - ret = mptcp_pm_nl_append_new_local_addr(pernet, entry, true, false); + ret = mptcp_pm_nl_append_new_local_addr(pernet, entry, false); if (ret < 0) kfree(entry); @@ -977,18 +977,6 @@ static int mptcp_nl_add_subflow_or_signal_addr(struct net *net, return 0; } -static bool mptcp_pm_has_addr_attr_id(const struct nlattr *attr, - struct genl_info *info) -{ - struct nlattr *tb[MPTCP_PM_ADDR_ATTR_MAX + 1]; - - if (!nla_parse_nested_deprecated(tb, MPTCP_PM_ADDR_ATTR_MAX, attr, - mptcp_pm_address_nl_policy, info->extack) && - tb[MPTCP_PM_ADDR_ATTR_ID]) - return true; - return false; -} - /* Add an MPTCP endpoint */ int mptcp_pm_nl_add_addr_doit(struct sk_buff *skb, struct genl_info *info) { @@ -1037,9 +1025,7 @@ int mptcp_pm_nl_add_addr_doit(struct sk_buff *skb, struct genl_info *info) goto out_free; } } - ret = mptcp_pm_nl_append_new_local_addr(pernet, entry, - !mptcp_pm_has_addr_attr_id(attr, info), - true); + ret = mptcp_pm_nl_append_new_local_addr(pernet, entry, true); if (ret < 0) { GENL_SET_ERR_MSG_FMT(info, "too many addresses or duplicate one: %d", ret); goto out_free; -- 2.53.0 In this "delete re-add signal" MPTCP Join subtest, the endpoint linked to the initial subflow is removed, but readded once with different ID. It appears that there was an issue when reusing the same ID, recently fixed by commit d191101dee25 ("mptcp: pm: in-kernel: always set ID as avail when rm endp"). The test then now reuses the same ID the first time, but continue to use another one (88) the second time. This should then cover more cases. Closes: https://github.com/multipath-tcp/mptcp_net-next/issues/615 Reviewed-by: Geliang Tang Signed-off-by: Matthieu Baerts (NGI0) --- To: Shuah Khan Cc: linux-kselftest@vger.kernel.org --- tools/testing/selftests/net/mptcp/mptcp_join.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/testing/selftests/net/mptcp/mptcp_join.sh b/tools/testing/selftests/net/mptcp/mptcp_join.sh index a3144d7298a5..beec41f6662a 100755 --- a/tools/testing/selftests/net/mptcp/mptcp_join.sh +++ b/tools/testing/selftests/net/mptcp/mptcp_join.sh @@ -4343,13 +4343,13 @@ endpoint_tests() chk_mptcp_info add_addr_signal 2 add_addr_accepted 2 [ ${ipt} = 1 ] && ip netns exec "${ns1}" ${iptables} -D OUTPUT 1 - pm_nl_add_endpoint $ns1 10.0.1.1 id 99 flags signal + pm_nl_add_endpoint $ns1 10.0.1.1 id 42 flags signal wait_mpj 4 chk_subflow_nr "after re-add ID 0" 3 chk_mptcp_info subflows 3 subflows 3 chk_mptcp_info add_addr_signal 3 add_addr_accepted 2 - pm_nl_del_endpoint $ns1 99 10.0.1.1 + pm_nl_del_endpoint $ns1 42 10.0.1.1 sleep 0.5 chk_subflow_nr "after re-delete ID 0" 2 chk_mptcp_info subflows 2 subflows 2 -- 2.53.0