Move and rename nf_reject_fill_skb_dst from ipv4/netfilter/nf_reject_ipv4 to ip_route_reply_fetch_dst in ipv4/route.c so that it can be reused in the following patches by BPF kfuncs. Netfilter uses nf_ip_route that is almost a transparent wrapper around ip_route_output_key so this patch inlines it. Signed-off-by: Mahe Tardy --- include/net/route.h | 1 + net/ipv4/netfilter/nf_reject_ipv4.c | 19 ++----------------- net/ipv4/route.c | 15 +++++++++++++++ 3 files changed, 18 insertions(+), 17 deletions(-) diff --git a/include/net/route.h b/include/net/route.h index 8e39aa822cf9..1f032f768d52 100644 --- a/include/net/route.h +++ b/include/net/route.h @@ -173,6 +173,7 @@ struct rtable *ip_route_output_flow(struct net *, struct flowi4 *flp, const struct sock *sk); struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_orig); +int ip_route_reply_fetch_dst(struct sk_buff *skb); static inline struct rtable *ip_route_output_key(struct net *net, struct flowi4 *flp) { diff --git a/net/ipv4/netfilter/nf_reject_ipv4.c b/net/ipv4/netfilter/nf_reject_ipv4.c index 87fd945a0d27..76beb78f556a 100644 --- a/net/ipv4/netfilter/nf_reject_ipv4.c +++ b/net/ipv4/netfilter/nf_reject_ipv4.c @@ -220,21 +220,6 @@ void nf_reject_ip_tcphdr_put(struct sk_buff *nskb, const struct sk_buff *oldskb, } EXPORT_SYMBOL_GPL(nf_reject_ip_tcphdr_put); -static int nf_reject_fill_skb_dst(struct sk_buff *skb_in) -{ - struct dst_entry *dst = NULL; - struct flowi fl; - - memset(&fl, 0, sizeof(struct flowi)); - fl.u.ip4.daddr = ip_hdr(skb_in)->saddr; - nf_ip_route(dev_net(skb_in->dev), &dst, &fl, false); - if (!dst) - return -1; - - skb_dst_set(skb_in, dst); - return 0; -} - /* Send RST reply */ void nf_send_reset(struct net *net, struct sock *sk, struct sk_buff *oldskb, int hook) @@ -248,7 +233,7 @@ void nf_send_reset(struct net *net, struct sock *sk, struct sk_buff *oldskb, return; if ((hook == NF_INET_PRE_ROUTING || hook == NF_INET_INGRESS) && - nf_reject_fill_skb_dst(oldskb) < 0) + 
ip_route_reply_fetch_dst(oldskb) < 0) return; if (skb_rtable(oldskb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) @@ -322,7 +307,7 @@ void nf_send_unreach(struct sk_buff *skb_in, int code, int hook) return; if ((hook == NF_INET_PRE_ROUTING || hook == NF_INET_INGRESS) && - nf_reject_fill_skb_dst(skb_in) < 0) + ip_route_reply_fetch_dst(skb_in) < 0) return; if (skb_csum_unnecessary(skb_in) || diff --git a/net/ipv4/route.c b/net/ipv4/route.c index fccb05fb3a79..59b8fc3c01c0 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -2934,6 +2934,21 @@ struct rtable *ip_route_output_flow(struct net *net, struct flowi4 *flp4, } EXPORT_SYMBOL_GPL(ip_route_output_flow); +int ip_route_reply_fetch_dst(struct sk_buff *skb) +{ + struct rtable *rt; + struct flowi4 fl4 = { + .daddr = ip_hdr(skb)->saddr + }; + + rt = ip_route_output_key(dev_net(skb->dev), &fl4); + if (IS_ERR(rt)) + return PTR_ERR(rt); + skb_dst_set(skb, &rt->dst); + return 0; +} +EXPORT_SYMBOL_GPL(ip_route_reply_fetch_dst); + /* called with rcu_read_lock held */ static int rt_fill_info(struct net *net, __be32 dst, __be32 src, struct rtable *rt, u32 table_id, dscp_t dscp, -- 2.34.1 Move and rename nf_reject6_fill_skb_dst from ipv6/netfilter/nf_reject_ipv6 to ip6_route_reply_fetch_dst in ipv6/route.c so that it can be reused in the following patches by BPF kfuncs. Netfilter uses nf_ip6_route that is almost a transparent wrapper around ip6_route_output so this patch inlines it. 
Signed-off-by: Mahe Tardy --- include/net/ip6_route.h | 2 ++ net/ipv6/netfilter/nf_reject_ipv6.c | 17 +---------------- net/ipv6/route.c | 18 ++++++++++++++++++ 3 files changed, 21 insertions(+), 16 deletions(-) diff --git a/include/net/ip6_route.h b/include/net/ip6_route.h index 6dbdf60b342f..1426467df547 100644 --- a/include/net/ip6_route.h +++ b/include/net/ip6_route.h @@ -93,6 +93,8 @@ static inline struct dst_entry *ip6_route_output(struct net *net, return ip6_route_output_flags(net, sk, fl6, 0); } +int ip6_route_reply_fetch_dst(struct sk_buff *skb); + /* Only conditionally release dst if flags indicates * !RT6_LOOKUP_F_DST_NOREF or dst is in uncached_list. */ diff --git a/net/ipv6/netfilter/nf_reject_ipv6.c b/net/ipv6/netfilter/nf_reject_ipv6.c index 9ae2b2725bf9..994a3b88ac52 100644 --- a/net/ipv6/netfilter/nf_reject_ipv6.c +++ b/net/ipv6/netfilter/nf_reject_ipv6.c @@ -250,21 +250,6 @@ void nf_reject_ip6_tcphdr_put(struct sk_buff *nskb, } EXPORT_SYMBOL_GPL(nf_reject_ip6_tcphdr_put); -static int nf_reject6_fill_skb_dst(struct sk_buff *skb_in) -{ - struct dst_entry *dst = NULL; - struct flowi fl; - - memset(&fl, 0, sizeof(struct flowi)); - fl.u.ip6.daddr = ipv6_hdr(skb_in)->saddr; - nf_ip6_route(dev_net(skb_in->dev), &dst, &fl, false); - if (!dst) - return -1; - - skb_dst_set(skb_in, dst); - return 0; -} - void nf_send_reset6(struct net *net, struct sock *sk, struct sk_buff *oldskb, int hook) { @@ -398,7 +383,7 @@ void nf_send_unreach6(struct net *net, struct sk_buff *skb_in, skb_in->dev = net->loopback_dev; if ((hooknum == NF_INET_PRE_ROUTING || hooknum == NF_INET_INGRESS) && - nf_reject6_fill_skb_dst(skb_in) < 0) + ip6_route_reply_fetch_dst(skb_in) < 0) return; icmpv6_send(skb_in, ICMPV6_DEST_UNREACH, code, 0); diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 0d5464c64965..de61540f9524 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -2705,6 +2705,24 @@ struct dst_entry *ip6_route_output_flags(struct net *net, } 
EXPORT_SYMBOL_GPL(ip6_route_output_flags); +int ip6_route_reply_fetch_dst(struct sk_buff *skb) +{ + struct dst_entry *result; + struct flowi6 fl = { + .daddr = ipv6_hdr(skb)->saddr + }; + int err; + + result = ip6_route_output(dev_net(skb->dev), NULL, &fl); + err = result->error; + if (err) + dst_release(result); + else + skb_dst_set(skb, result); + return err; +} +EXPORT_SYMBOL_GPL(ip6_route_reply_fetch_dst); + struct dst_entry *ip6_blackhole_route(struct net *net, struct dst_entry *dst_orig) { struct rt6_info *rt, *ort = dst_rt6_info(dst_orig); -- 2.34.1 This is needed in the context of Tetragon to provide improved feedback (in contrast to just dropping packets) to east-west traffic when blocked by policies using cgroup_skb programs. This reuses concepts from netfilter reject target codepath with the differences that: * Packets are cloned since the BPF user can still return SK_PASS from the cgroup_skb progs and the current skb needs to stay untouched (cgroup_skb hooks only allow read-only skb payload). * Since cgroup_skb programs are called late in the stack, checksums do not need to be computed or verified, and IPv4 fragmentation does not need to be checked (ip_local_deliver should take care of that earlier). 
Signed-off-by: Mahe Tardy --- net/core/filter.c | 61 +++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 61 insertions(+) diff --git a/net/core/filter.c b/net/core/filter.c index 7a72f766aacf..050872324575 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -85,6 +85,10 @@ #include #include #include +#include +#include +#include +#include #include "dev.h" @@ -12148,6 +12152,53 @@ __bpf_kfunc int bpf_sock_ops_enable_tx_tstamp(struct bpf_sock_ops_kern *skops, return 0; } +__bpf_kfunc int bpf_icmp_send_unreach(struct __sk_buff *__skb, int code) +{ + struct sk_buff *skb = (struct sk_buff *)__skb; + struct sk_buff *nskb; + + switch (skb->protocol) { + case htons(ETH_P_IP): + if (code < 0 || code > NR_ICMP_UNREACH) + return -EINVAL; + + nskb = skb_clone(skb, GFP_ATOMIC); + if (!nskb) + return -ENOMEM; + + if (ip_route_reply_fetch_dst(nskb) < 0) { + kfree_skb(nskb); + return -EHOSTUNREACH; + } + + icmp_send(nskb, ICMP_DEST_UNREACH, code, 0); + kfree_skb(nskb); + break; +#if IS_ENABLED(CONFIG_IPV6) + case htons(ETH_P_IPV6): + if (code < 0 || code > ICMPV6_REJECT_ROUTE) + return -EINVAL; + + nskb = skb_clone(skb, GFP_ATOMIC); + if (!nskb) + return -ENOMEM; + + if (ip6_route_reply_fetch_dst(nskb) < 0) { + kfree_skb(nskb); + return -EHOSTUNREACH; + } + + icmpv6_send(nskb, ICMPV6_DEST_UNREACH, code, 0); + kfree_skb(nskb); + break; +#endif + default: + return -EPROTONOSUPPORT; + } + + return SK_DROP; +} + __bpf_kfunc_end_defs(); int bpf_dynptr_from_skb_rdonly(struct __sk_buff *skb, u64 flags, @@ -12185,6 +12236,10 @@ BTF_KFUNCS_START(bpf_kfunc_check_set_sock_ops) BTF_ID_FLAGS(func, bpf_sock_ops_enable_tx_tstamp, KF_TRUSTED_ARGS) BTF_KFUNCS_END(bpf_kfunc_check_set_sock_ops) +BTF_KFUNCS_START(bpf_kfunc_check_set_icmp_send_unreach) +BTF_ID_FLAGS(func, bpf_icmp_send_unreach, KF_TRUSTED_ARGS) +BTF_KFUNCS_END(bpf_kfunc_check_set_icmp_send_unreach) + static const struct btf_kfunc_id_set bpf_kfunc_set_skb = { .owner = THIS_MODULE, .set = &bpf_kfunc_check_set_skb, @@ 
-12210,6 +12265,11 @@ static const struct btf_kfunc_id_set bpf_kfunc_set_sock_ops = { .set = &bpf_kfunc_check_set_sock_ops, }; +static const struct btf_kfunc_id_set bpf_kfunc_set_icmp_send_unreach = { + .owner = THIS_MODULE, + .set = &bpf_kfunc_check_set_icmp_send_unreach, +}; + static int __init bpf_kfunc_init(void) { int ret; @@ -12229,6 +12289,7 @@ static int __init bpf_kfunc_init(void) ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_CGROUP_SOCK_ADDR, &bpf_kfunc_set_sock_addr); ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_SCHED_CLS, &bpf_kfunc_set_tcp_reqsk); + ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_CGROUP_SKB, &bpf_kfunc_set_icmp_send_unreach); return ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_SOCK_OPS, &bpf_kfunc_set_sock_ops); } late_initcall(bpf_kfunc_init); -- 2.34.1 This test opens a server and client, attaches a cgroup_skb program on egress and calls the icmp_send_unreach function from the client egress so that an ICMP unreach control message is sent back to the client. It then fetches the message from the error queue to confirm the correct ICMP unreach code has been sent. Note that the BPF program returns SK_PASS to let the connection be established so that the test cases finish quicker. Otherwise, you have to wait for the TCP three-way handshake to time out in the kernel and retrieve the errno translated from the unreach code set by the ICMP control message. 
Signed-off-by: Mahe Tardy --- .../bpf/prog_tests/icmp_send_unreach_kfunc.c | 96 +++++++++++++++++++ .../selftests/bpf/progs/icmp_send_unreach.c | 36 +++++++ 2 files changed, 132 insertions(+) create mode 100644 tools/testing/selftests/bpf/prog_tests/icmp_send_unreach_kfunc.c create mode 100644 tools/testing/selftests/bpf/progs/icmp_send_unreach.c diff --git a/tools/testing/selftests/bpf/prog_tests/icmp_send_unreach_kfunc.c b/tools/testing/selftests/bpf/prog_tests/icmp_send_unreach_kfunc.c new file mode 100644 index 000000000000..414c1ed8ced3 --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/icmp_send_unreach_kfunc.c @@ -0,0 +1,96 @@ +// SPDX-License-Identifier: GPL-2.0 +#include +#include +#include +#include "icmp_send_unreach.skel.h" + +#define TIMEOUT_MS 1000 +#define SRV_PORT 54321 + +#define ICMP_DEST_UNREACH 3 + +#define ICMP_FRAG_NEEDED 4 +#define NR_ICMP_UNREACH 15 + +static void read_icmp_errqueue(int sockfd, int expected_code) +{ + ssize_t n; + struct sock_extended_err *sock_err; + struct cmsghdr *cm; + char ctrl_buf[512]; + struct msghdr msg = { + .msg_control = ctrl_buf, + .msg_controllen = sizeof(ctrl_buf), + }; + + n = recvmsg(sockfd, &msg, MSG_ERRQUEUE); + if (!ASSERT_GE(n, 0, "recvmsg_errqueue")) + return; + + for (cm = CMSG_FIRSTHDR(&msg); cm; cm = CMSG_NXTHDR(&msg, cm)) { + if (!ASSERT_EQ(cm->cmsg_level, IPPROTO_IP, "cmsg_level") || + !ASSERT_EQ(cm->cmsg_type, IP_RECVERR, "cmsg_type")) + continue; + + sock_err = (struct sock_extended_err *)CMSG_DATA(cm); + + if (!ASSERT_EQ(sock_err->ee_origin, SO_EE_ORIGIN_ICMP, + "sock_err_origin_icmp")) + return; + if (!ASSERT_EQ(sock_err->ee_type, ICMP_DEST_UNREACH, + "sock_err_type_dest_unreach")) + return; + ASSERT_EQ(sock_err->ee_code, expected_code, "sock_err_code"); + } +} + +void test_icmp_send_unreach_kfunc(void) +{ + struct icmp_send_unreach *skel; + int cgroup_fd = -1, client_fd = -1, srv_fd = -1; + int *code; + + skel = icmp_send_unreach__open_and_load(); + if (!ASSERT_OK_PTR(skel, 
"skel_open")) + goto cleanup; + + cgroup_fd = test__join_cgroup("/icmp_send_unreach_cgroup"); + if (!ASSERT_GE(cgroup_fd, 0, "join_cgroup")) + goto cleanup; + + skel->links.egress = + bpf_program__attach_cgroup(skel->progs.egress, cgroup_fd); + if (!ASSERT_OK_PTR(skel->links.egress, "prog_attach_cgroup")) + goto cleanup; + + code = &skel->bss->unreach_code; + + for (*code = 0; *code <= NR_ICMP_UNREACH; (*code)++) { + // The TCP stack reacts differently when asking for + // fragmentation, let's ignore it for now + if (*code == ICMP_FRAG_NEEDED) + continue; + + skel->bss->kfunc_ret = -1; + + srv_fd = start_server(AF_INET, SOCK_STREAM, "127.0.0.1", + SRV_PORT, TIMEOUT_MS); + if (!ASSERT_GE(srv_fd, 0, "start_server")) + goto for_cleanup; + + client_fd = connect_to_fd(srv_fd, 0); + if (!ASSERT_GE(client_fd, 0, "client_connect")) + goto for_cleanup; + + read_icmp_errqueue(client_fd, *code); + + ASSERT_EQ(skel->bss->kfunc_ret, SK_DROP, "kfunc_ret"); +for_cleanup: + close(client_fd); + close(srv_fd); + } + +cleanup: + icmp_send_unreach__destroy(skel); + close(cgroup_fd); +} diff --git a/tools/testing/selftests/bpf/progs/icmp_send_unreach.c b/tools/testing/selftests/bpf/progs/icmp_send_unreach.c new file mode 100644 index 000000000000..15783e5d1d65 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/icmp_send_unreach.c @@ -0,0 +1,36 @@ +// SPDX-License-Identifier: GPL-2.0 +#include "vmlinux.h" +#include +#include + +char LICENSE[] SEC("license") = "Dual BSD/GPL"; + +int unreach_code = 0; +int kfunc_ret = 0; + +#define SERVER_PORT 54321 +#define SERVER_IP 0x7F000001 + +SEC("cgroup_skb/egress") +int egress(struct __sk_buff *skb) +{ + void *data = (void *)(long)skb->data; + void *data_end = (void *)(long)skb->data_end; + struct iphdr *iph; + struct tcphdr *tcph; + + iph = data; + if ((void *)(iph + 1) > data_end || iph->version != 4 || + iph->protocol != IPPROTO_TCP || iph->daddr != 
bpf_htonl(SERVER_IP)) + return SK_PASS; + + tcph = (void *)iph + iph->ihl * 4; + if ((void *)(tcph + 1) > data_end || + tcph->dest != bpf_htons(SERVER_PORT)) + return SK_PASS; + + kfunc_ret = bpf_icmp_send_unreach(skb, unreach_code); + + /* returns SK_PASS to execute the test case quicker */ + return SK_PASS; +} -- 2.34.1