This is needed in the context of Tetragon to provide improved feedback (in contrast to just dropping packets) to east-west traffic when it is blocked by policies using cgroup_skb programs. We also extend this kfunc to tc programs as a convenience. This reuses concepts from the netfilter reject target codepath, with the following differences: * Packets are cloned, since the BPF user can still let the packet pass (SK_PASS from the cgroup_skb progs, for example) and the current skb needs to stay untouched (cgroup_skb hooks only allow read-only access to the skb payload). * We protect against recursion, since the kfunc, by generating an ICMP error message, could retrigger the BPF prog that invoked it. For now, we support the cgroup_skb and tc program types. For cgroup_skb and tc egress, almost everything should be good. However, for tc ingress: - the packet will not have been routed yet: we need to set the net device for icmp_send, hence the call to ip[6]_route_reply_fill_dst. - fragments could trigger the hook: icmp_send will only reply to fragment 0. - the IP header must be linearized before processing, and the SKB control block must be zeroed out after cloning, to prevent icmp_send()/icmpv6_send() from misinterpreting garbage data as IP options. Only ICMP_DEST_UNREACH and ICMPV6_DEST_UNREACH are currently supported. The interface accepts a type parameter to facilitate future extension to other ICMP control message types. 
Reviewed-by: Jordan Rife Signed-off-by: Mahe Tardy --- net/core/filter.c | 109 ++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 109 insertions(+) diff --git a/net/core/filter.c b/net/core/filter.c index 2e96b4b847ce..fc69a14650e4 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -84,6 +84,8 @@ #include #include #include +#include +#include #include "dev.h" @@ -12546,6 +12548,101 @@ __bpf_kfunc int bpf_xdp_pull_data(struct xdp_md *x, u32 len) return 0; } +/** + * bpf_icmp_send - Send an ICMP control message + * @skb_ctx: Packet that triggered the control message + * @type: ICMP type (only ICMP_DEST_UNREACH/ICMPV6_DEST_UNREACH supported) + * @code: ICMP code (0-15 for IPv4, 0-6 for IPv6) + * + * Sends an ICMP control message in response to the packet. The original packet + * is cloned before sending the ICMP message, so the BPF program can still let + * the packet pass if desired. + * + * Currently only ICMP_DEST_UNREACH (IPv4) and ICMPV6_DEST_UNREACH (IPv6) are + * supported. 
+ * + * Return: 0 on success, negative error code on failure: + * -EINVAL: Invalid code parameter + * -EBADMSG: Packet too short or malformed + * -ENOMEM: Memory allocation failed + * -EBUSY: Recursion detected + * -EHOSTUNREACH: Routing failed + * -EPROTONOSUPPORT: Non-IP protocol + * -EOPNOTSUPP: Unsupported ICMP type + */ +__bpf_kfunc int bpf_icmp_send(struct __sk_buff *skb_ctx, int type, int code) +{ + struct sk_buff *skb = (struct sk_buff *)skb_ctx; + struct sk_buff *nskb; + struct sock *sk; + + sk = skb_to_full_sk(skb); + if (sk && sk->sk_kern_sock && + (sk->sk_protocol == IPPROTO_ICMP || sk->sk_protocol == IPPROTO_ICMPV6)) + return -EBUSY; + + switch (skb->protocol) { +#if IS_ENABLED(CONFIG_INET) + case htons(ETH_P_IP): + if (type != ICMP_DEST_UNREACH) + return -EOPNOTSUPP; + if (code < 0 || code > NR_ICMP_UNREACH) + return -EINVAL; + + nskb = skb_clone(skb, GFP_ATOMIC); + if (!nskb) + return -ENOMEM; + + if (!pskb_network_may_pull(nskb, sizeof(struct iphdr))) { + kfree_skb(nskb); + return -EBADMSG; + } + + if (!skb_dst(nskb) && ip_route_reply_fill_dst(nskb) < 0) { + kfree_skb(nskb); + return -EHOSTUNREACH; + } + + memset(IPCB(nskb), 0, sizeof(struct inet_skb_parm)); + + icmp_send(nskb, type, code, 0); + consume_skb(nskb); + break; +#endif +#if IS_ENABLED(CONFIG_IPV6) + case htons(ETH_P_IPV6): + if (type != ICMPV6_DEST_UNREACH) + return -EOPNOTSUPP; + if (code < 0 || code > ICMPV6_REJECT_ROUTE) + return -EINVAL; + + nskb = skb_clone(skb, GFP_ATOMIC); + if (!nskb) + return -ENOMEM; + + if (!pskb_network_may_pull(nskb, sizeof(struct ipv6hdr))) { + kfree_skb(nskb); + return -EBADMSG; + } + + if (!skb_dst(nskb) && ip6_route_reply_fill_dst(nskb) < 0) { + kfree_skb(nskb); + return -EHOSTUNREACH; + } + + memset(IP6CB(nskb), 0, sizeof(struct inet6_skb_parm)); + + icmpv6_send(nskb, type, code, 0); + consume_skb(nskb); + break; +#endif + default: + return -EPROTONOSUPPORT; + } + + return 0; +} + __bpf_kfunc_end_defs(); int bpf_dynptr_from_skb_rdonly(struct __sk_buff 
*skb, u64 flags, @@ -12588,6 +12685,10 @@ BTF_KFUNCS_START(bpf_kfunc_check_set_sock_ops) BTF_ID_FLAGS(func, bpf_sock_ops_enable_tx_tstamp) BTF_KFUNCS_END(bpf_kfunc_check_set_sock_ops) +BTF_KFUNCS_START(bpf_kfunc_check_set_icmp_send) +BTF_ID_FLAGS(func, bpf_icmp_send) +BTF_KFUNCS_END(bpf_kfunc_check_set_icmp_send) + static const struct btf_kfunc_id_set bpf_kfunc_set_skb = { .owner = THIS_MODULE, .set = &bpf_kfunc_check_set_skb, @@ -12618,6 +12719,11 @@ static const struct btf_kfunc_id_set bpf_kfunc_set_sock_ops = { .set = &bpf_kfunc_check_set_sock_ops, }; +static const struct btf_kfunc_id_set bpf_kfunc_set_icmp_send = { + .owner = THIS_MODULE, + .set = &bpf_kfunc_check_set_icmp_send, +}; + static int __init bpf_kfunc_init(void) { int ret; @@ -12639,6 +12745,9 @@ static int __init bpf_kfunc_init(void) ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_CGROUP_SOCK_ADDR, &bpf_kfunc_set_sock_addr); ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_SCHED_CLS, &bpf_kfunc_set_tcp_reqsk); + ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_CGROUP_SKB, &bpf_kfunc_set_icmp_send); + ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_SCHED_CLS, &bpf_kfunc_set_icmp_send); + ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_SCHED_ACT, &bpf_kfunc_set_icmp_send); return ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_SOCK_OPS, &bpf_kfunc_set_sock_ops); } late_initcall(bpf_kfunc_init); -- 2.34.1