Both BPF_SOCK_OPS_RCVQ_CB and SOCKMAP can intercept and handle socket receive queues, leading to overlapping use cases. While BPF_SOCK_OPS_RCVQ_CB focuses on optimizing single-socket performance by reducing EPOLLIN wakeups and fully preserves TCP zerocopy support, SOCKMAP is designed to facilitate multi-socket routing at the cost of higher overhead and no zerocopy support. Enabling both features on the same socket makes no sense and results in unexpected interference between them. For instance, SOCKMAP calls __tcp_cleanup_rbuf(), where we will add a BPF_SOCK_OPS_RCVQ_CB hook, and bpf_sock_ops_tcp_set_rcvlowat() calls sk->sk_data_ready(), which would trigger SOCKMAP. Let's make BPF_SOCK_OPS_RCVQ_CB and SOCKMAP mutually exclusive. Both bpf_sol_tcp_setsockopt(TCP_BPF_SOCK_OPS_CB_FLAGS) and bpf_sock_ops_cb_flags_set() now check if sk->sk_prot is &tcp_prot or tcpv6_prot, while tcp_bpf_update_proto() checks if BPF_SOCK_OPS_RCVQ_CB_FLAG is already set. Both checks are performed under lock_sock(). Signed-off-by: Kuniyuki Iwashima --- net/core/filter.c | 29 +++++++++++++++++++++++++++-- net/ipv4/tcp_bpf.c | 2 ++ 2 files changed, 29 insertions(+), 2 deletions(-) diff --git a/net/core/filter.c b/net/core/filter.c index 3608036632a8..ff7fd415486a 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -5382,12 +5382,27 @@ static int bpf_sol_tcp_getsockopt(struct sock *sk, int optname, return 0; } +static int bpf_sock_ops_check_rcvq_cb(struct sock *sk, int val) +{ + if (val & BPF_SOCK_OPS_RCVQ_CB_FLAG) { + bool not_tcp_prot = sk->sk_prot != &tcp_prot; + +#if IS_ENABLED(CONFIG_IPV6) + not_tcp_prot &= sk->sk_prot != &tcpv6_prot; +#endif + if (not_tcp_prot) + return -EBUSY; + } + + return 0; +} + static int bpf_sol_tcp_setsockopt(struct sock *sk, int optname, char *optval, int optlen) { struct tcp_sock *tp = tcp_sk(sk); unsigned long timeout; - int val; + int val, err; if (optlen != sizeof(int)) return -EINVAL; @@ -5424,6 +5439,11 @@ static int bpf_sol_tcp_setsockopt(struct sock *sk, int optname, case TCP_BPF_SOCK_OPS_CB_FLAGS: if (val & ~(BPF_SOCK_OPS_ALL_CB_FLAGS)) return -EINVAL; + + err = bpf_sock_ops_check_rcvq_cb(sk, val); + if (err) + return err; + tp->bpf_sock_ops_cb_flags = val; break; default: @@ -5999,8 +6019,9 @@ static const struct bpf_func_proto bpf_sock_ops_getsockopt_proto = { BPF_CALL_2(bpf_sock_ops_cb_flags_set, struct bpf_sock_ops_kern *, bpf_sock, int, argval) { - struct sock *sk = bpf_sock->sk; int val = argval & BPF_SOCK_OPS_ALL_CB_FLAGS; + struct sock *sk = bpf_sock->sk; + int err; if (!is_locked_tcp_sock_ops(bpf_sock) && bpf_sock->op != BPF_SOCK_OPS_RCVQ_CB) @@ -6009,6 +6030,10 @@ BPF_CALL_2(bpf_sock_ops_cb_flags_set, struct bpf_sock_ops_kern *, bpf_sock, if (!IS_ENABLED(CONFIG_INET) || !sk_fullsock(sk)) return -EINVAL; + err = bpf_sock_ops_check_rcvq_cb(sk, val); + if (err) + return err; + tcp_sk(sk)->bpf_sock_ops_cb_flags = val; return argval & (~BPF_SOCK_OPS_ALL_CB_FLAGS); diff --git a/net/ipv4/tcp_bpf.c b/net/ipv4/tcp_bpf.c index cc0bd73f36b6..5c5c67080740 100644 --- a/net/ipv4/tcp_bpf.c +++ b/net/ipv4/tcp_bpf.c @@ -729,6 +729,8 @@ int tcp_bpf_update_proto(struct sock *sk, struct sk_psock *psock, bool restore) sock_replace_proto(sk, psock->sk_proto); } return 0; + } else if (BPF_SOCK_OPS_TEST_FLAG(tcp_sk(sk), BPF_SOCK_OPS_RCVQ_CB_FLAG)) { + return -EBUSY; } if (sk->sk_family == AF_INET6) { -- 2.54.0.746.g67dd491aae-goog