Enable BPF programs to handle GSO state correctly when decapsulating
tunneled packets by adding selective GSO flag clearing and a trusted
mode for GSO handling.

New decapsulation flags:

- BPF_F_ADJ_ROOM_DECAP_L4_UDP: Clear UDP tunnel GSO flags
  (SKB_GSO_UDP_TUNNEL, SKB_GSO_UDP_TUNNEL_CSUM)
- BPF_F_ADJ_ROOM_DECAP_L4_GRE: Clear GRE tunnel GSO flags
  (SKB_GSO_GRE, SKB_GSO_GRE_CSUM)
- BPF_F_ADJ_ROOM_DECAP_IPXIP4: Clear the SKB_GSO_IPXIP4 flag for
  IPv4-in-IPv4 (IPIP) and IPv6-in-IPv4 (SIT) tunnels
- BPF_F_ADJ_ROOM_DECAP_IPXIP6: Clear the SKB_GSO_IPXIP6 flag for
  IPv6-in-IPv6 and IPv4-in-IPv6 tunnels
- BPF_F_ADJ_ROOM_NO_DODGY: Preserve gso_segs and don't set
  SKB_GSO_DODGY when the BPF program is trusted and the modifications
  are known to be valid

The existing anonymous enum for BPF_FUNC_skb_adjust_room flags is
renamed to enum bpf_adj_room_flags to enable CO-RE (Compile Once - Run
Everywhere) lookups in BPF programs.

By default, bpf_skb_adjust_room sets SKB_GSO_DODGY and resets gso_segs
to 0, forcing revalidation. The NO_DODGY flag bypasses this for trusted
programs that guarantee GSO correctness.

Usage example (decapsulating UDP tunnel with IPv4 inner packet): bpf_skb_adjust_room(skb, -hdr_len, BPF_ADJ_ROOM_NET, BPF_F_ADJ_ROOM_DECAP_L3_IPV4 | BPF_F_ADJ_ROOM_DECAP_L4_UDP); Co-developed-by: Anna Glasgall Signed-off-by: Anna Glasgall Co-developed-by: Max Tottenham Signed-off-by: Max Tottenham Signed-off-by: Josh Hunt Signed-off-by: Nick Hudson --- include/uapi/linux/bpf.h | 45 +++++++++++++++++++-- net/core/filter.c | 73 ++++++++++++++++++++++++++++------ tools/include/uapi/linux/bpf.h | 45 +++++++++++++++++++-- 3 files changed, 145 insertions(+), 18 deletions(-) diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index c8d400b7680a..0cb24ab70af7 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -3010,8 +3010,42 @@ union bpf_attr { * * * **BPF_F_ADJ_ROOM_DECAP_L3_IPV4**, * **BPF_F_ADJ_ROOM_DECAP_L3_IPV6**: - * Indicate the new IP header version after decapsulating the outer - * IP header. Used when the inner and outer IP versions are different. + * Indicate the new IP header version after decapsulating the + * outer IP header. Used when the inner and outer IP versions + * are different. These flags only trigger a protocol change + * without clearing any tunnel-specific GSO flags. + * + * * **BPF_F_ADJ_ROOM_DECAP_L4_GRE**: + * Clear GRE tunnel GSO flags (SKB_GSO_GRE and SKB_GSO_GRE_CSUM) + * when decapsulating a GRE tunnel. + * + * * **BPF_F_ADJ_ROOM_DECAP_L4_UDP**: + * Clear UDP tunnel GSO flags (SKB_GSO_UDP_TUNNEL and + * SKB_GSO_UDP_TUNNEL_CSUM) when decapsulating a UDP tunnel. + * + * * **BPF_F_ADJ_ROOM_DECAP_IPXIP4**: + * Clear IPIP/SIT tunnel GSO flag (SKB_GSO_IPXIP4) when decapsulating + * a tunnel with an outer IPv4 header (IPv4-in-IPv4 or IPv6-in-IPv4). + * + * * **BPF_F_ADJ_ROOM_DECAP_IPXIP6**: + * Clear IPv6 encapsulation tunnel GSO flag (SKB_GSO_IPXIP6) when + * decapsulating a tunnel with an outer IPv6 header (IPv6-in-IPv6 + * or IPv4-in-IPv6). 
+ * + * When using the decapsulation flags above, the skb->encapsulation + * flag is automatically cleared if all tunnel-specific GSO flags + * (SKB_GSO_UDP_TUNNEL, SKB_GSO_UDP_TUNNEL_CSUM, SKB_GSO_GRE, + * SKB_GSO_GRE_CSUM, SKB_GSO_IPXIP4, SKB_GSO_IPXIP6) have been + * removed from the packet. This handles cases where all tunnel + * layers have been decapsulated. + * + * * **BPF_F_ADJ_ROOM_NO_DODGY**: + * Do not mark the packet as dodgy (untrusted) and preserve + * the existing gso_segs count. By default, packet modifications + * set SKB_GSO_DODGY and reset gso_segs to 0, forcing + * revalidation. This flag is useful when decapsulating the + * tunnel, the BPF program is trusted, and the modifications + * are known to be valid. * * A call to this helper is susceptible to change the underlying * packet buffer. Therefore, at load time, all checks on pointers @@ -6209,7 +6243,7 @@ enum { }; /* BPF_FUNC_skb_adjust_room flags. */ -enum { +enum bpf_adj_room_flags { BPF_F_ADJ_ROOM_FIXED_GSO = (1ULL << 0), BPF_F_ADJ_ROOM_ENCAP_L3_IPV4 = (1ULL << 1), BPF_F_ADJ_ROOM_ENCAP_L3_IPV6 = (1ULL << 2), @@ -6219,6 +6253,11 @@ enum { BPF_F_ADJ_ROOM_ENCAP_L2_ETH = (1ULL << 6), BPF_F_ADJ_ROOM_DECAP_L3_IPV4 = (1ULL << 7), BPF_F_ADJ_ROOM_DECAP_L3_IPV6 = (1ULL << 8), + BPF_F_ADJ_ROOM_DECAP_L4_GRE = (1ULL << 9), + BPF_F_ADJ_ROOM_DECAP_L4_UDP = (1ULL << 10), + BPF_F_ADJ_ROOM_DECAP_IPXIP4 = (1ULL << 11), + BPF_F_ADJ_ROOM_DECAP_IPXIP6 = (1ULL << 12), + BPF_F_ADJ_ROOM_NO_DODGY = (1ULL << 13), }; enum { diff --git a/net/core/filter.c b/net/core/filter.c index ba019ded773d..681dd53ab841 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -3484,14 +3484,28 @@ static u32 bpf_skb_net_base_len(const struct sk_buff *skb) #define BPF_F_ADJ_ROOM_DECAP_L3_MASK (BPF_F_ADJ_ROOM_DECAP_L3_IPV4 | \ BPF_F_ADJ_ROOM_DECAP_L3_IPV6) -#define BPF_F_ADJ_ROOM_MASK (BPF_F_ADJ_ROOM_FIXED_GSO | \ - BPF_F_ADJ_ROOM_ENCAP_L3_MASK | \ +#define BPF_F_ADJ_ROOM_DECAP_L4_MASK (BPF_F_ADJ_ROOM_DECAP_L4_UDP | \ + 
BPF_F_ADJ_ROOM_DECAP_L4_GRE) + +#define BPF_F_ADJ_ROOM_DECAP_IPXIP_MASK (BPF_F_ADJ_ROOM_DECAP_IPXIP4 | \ + BPF_F_ADJ_ROOM_DECAP_IPXIP6) + +#define BPF_F_ADJ_ROOM_ENCAP_MASK (BPF_F_ADJ_ROOM_ENCAP_L3_MASK | \ BPF_F_ADJ_ROOM_ENCAP_L4_GRE | \ BPF_F_ADJ_ROOM_ENCAP_L4_UDP | \ BPF_F_ADJ_ROOM_ENCAP_L2_ETH | \ BPF_F_ADJ_ROOM_ENCAP_L2( \ - BPF_ADJ_ROOM_ENCAP_L2_MASK) | \ - BPF_F_ADJ_ROOM_DECAP_L3_MASK) + BPF_ADJ_ROOM_ENCAP_L2_MASK)) + +#define BPF_F_ADJ_ROOM_DECAP_MASK (BPF_F_ADJ_ROOM_DECAP_L3_MASK | \ + BPF_F_ADJ_ROOM_DECAP_L4_MASK | \ + BPF_F_ADJ_ROOM_DECAP_IPXIP_MASK) + +#define BPF_F_ADJ_ROOM_MASK (BPF_F_ADJ_ROOM_FIXED_GSO | \ + BPF_F_ADJ_ROOM_ENCAP_MASK | \ + BPF_F_ADJ_ROOM_DECAP_MASK | \ + BPF_F_ADJ_ROOM_NO_CSUM_RESET | \ + BPF_F_ADJ_ROOM_NO_DODGY) static int bpf_skb_net_grow(struct sk_buff *skb, u32 off, u32 len_diff, u64 flags) @@ -3503,6 +3517,10 @@ static int bpf_skb_net_grow(struct sk_buff *skb, u32 off, u32 len_diff, unsigned int gso_type = SKB_GSO_DODGY; int ret; + if (unlikely(flags & (BPF_F_ADJ_ROOM_DECAP_MASK | + BPF_F_ADJ_ROOM_NO_DODGY))) + return -EINVAL; + if (skb_is_gso(skb) && !skb_is_gso_tcp(skb)) { /* udp gso_size delineates datagrams, only allow if fixed */ if (!(skb_shinfo(skb)->gso_type & SKB_GSO_UDP_L4) || @@ -3588,8 +3606,10 @@ static int bpf_skb_net_grow(struct sk_buff *skb, u32 off, u32 len_diff, if (skb_is_gso(skb)) { struct skb_shared_info *shinfo = skb_shinfo(skb); - /* Header must be checked, and gso_segs recomputed. */ + /* Add tunnel GSO type flags as appropriate. */ shinfo->gso_type |= gso_type; + + /* Header must be checked, and gso_segs recomputed. */ shinfo->gso_segs = 0; /* Due to header growth, MSS needs to be downgraded. 
@@ -3610,11 +3630,14 @@ static int bpf_skb_net_grow(struct sk_buff *skb, u32 off, u32 len_diff, static int bpf_skb_net_shrink(struct sk_buff *skb, u32 off, u32 len_diff, u64 flags) { + bool no_dodgy = flags & BPF_F_ADJ_ROOM_NO_DODGY; int ret; if (unlikely(flags & ~(BPF_F_ADJ_ROOM_FIXED_GSO | BPF_F_ADJ_ROOM_DECAP_L3_MASK | - BPF_F_ADJ_ROOM_NO_CSUM_RESET))) + BPF_F_ADJ_ROOM_DECAP_MASK | + BPF_F_ADJ_ROOM_NO_CSUM_RESET | + BPF_F_ADJ_ROOM_NO_DODGY))) return -EINVAL; if (skb_is_gso(skb) && !skb_is_gso_tcp(skb)) { @@ -3647,9 +3670,36 @@ static int bpf_skb_net_shrink(struct sk_buff *skb, u32 off, u32 len_diff, if (!(flags & BPF_F_ADJ_ROOM_FIXED_GSO)) skb_increase_gso_size(shinfo, len_diff); - /* Header must be checked, and gso_segs recomputed. */ - shinfo->gso_type |= SKB_GSO_DODGY; - shinfo->gso_segs = 0; + /* Selective GSO flag clearing based on decap type. + * Only clear the flags for the tunnel layer being removed. + */ + if (flags & BPF_F_ADJ_ROOM_DECAP_L4_UDP) + shinfo->gso_type &= ~(SKB_GSO_UDP_TUNNEL | + SKB_GSO_UDP_TUNNEL_CSUM); + if (flags & BPF_F_ADJ_ROOM_DECAP_L4_GRE) + shinfo->gso_type &= ~(SKB_GSO_GRE | + SKB_GSO_GRE_CSUM); + if (flags & BPF_F_ADJ_ROOM_DECAP_IPXIP4) + shinfo->gso_type &= ~SKB_GSO_IPXIP4; + if (flags & BPF_F_ADJ_ROOM_DECAP_IPXIP6) + shinfo->gso_type &= ~SKB_GSO_IPXIP6; + + /* Clear encapsulation flag only when no tunnel GSO flags remain */ + if (flags & BPF_F_ADJ_ROOM_DECAP_MASK) { + if (!(shinfo->gso_type & (SKB_GSO_UDP_TUNNEL | + SKB_GSO_UDP_TUNNEL_CSUM | + SKB_GSO_GRE | + SKB_GSO_GRE_CSUM | + SKB_GSO_IPXIP4 | + SKB_GSO_IPXIP6))) + skb->encapsulation = 0; + } + + /* NO_DODGY: preserve gso_segs, don't mark as dodgy. 
*/ + if (!no_dodgy) { + shinfo->gso_type |= SKB_GSO_DODGY; + shinfo->gso_segs = 0; + } } return 0; @@ -3709,8 +3759,7 @@ BPF_CALL_4(bpf_skb_adjust_room, struct sk_buff *, skb, s32, len_diff, u32 off; int ret; - if (unlikely(flags & ~(BPF_F_ADJ_ROOM_MASK | - BPF_F_ADJ_ROOM_NO_CSUM_RESET))) + if (unlikely(flags & ~BPF_F_ADJ_ROOM_MASK)) return -EINVAL; if (unlikely(len_diff_abs > 0xfffU)) return -EFAULT; @@ -3729,7 +3778,7 @@ BPF_CALL_4(bpf_skb_adjust_room, struct sk_buff *, skb, s32, len_diff, return -ENOTSUPP; } - if (flags & BPF_F_ADJ_ROOM_DECAP_L3_MASK) { + if (flags & BPF_F_ADJ_ROOM_DECAP_MASK) { if (!shrink) return -EINVAL; diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 5e38b4887de6..664bc8438186 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -3010,8 +3010,42 @@ union bpf_attr { * * * **BPF_F_ADJ_ROOM_DECAP_L3_IPV4**, * **BPF_F_ADJ_ROOM_DECAP_L3_IPV6**: - * Indicate the new IP header version after decapsulating the outer - * IP header. Used when the inner and outer IP versions are different. + * Indicate the new IP header version after decapsulating the + * outer IP header. Used when the inner and outer IP versions + * are different. These flags only trigger a protocol change + * without clearing any tunnel-specific GSO flags. + * + * * **BPF_F_ADJ_ROOM_DECAP_L4_GRE**: + * Clear GRE tunnel GSO flags (SKB_GSO_GRE and SKB_GSO_GRE_CSUM) + * when decapsulating a GRE tunnel. + * + * * **BPF_F_ADJ_ROOM_DECAP_L4_UDP**: + * Clear UDP tunnel GSO flags (SKB_GSO_UDP_TUNNEL and + * SKB_GSO_UDP_TUNNEL_CSUM) when decapsulating a UDP tunnel. + * + * * **BPF_F_ADJ_ROOM_DECAP_IPXIP4**: + * Clear IPIP/SIT tunnel GSO flag (SKB_GSO_IPXIP4) when decapsulating + * a tunnel with an outer IPv4 header (IPv4-in-IPv4 or IPv6-in-IPv4). 
+ * + * * **BPF_F_ADJ_ROOM_DECAP_IPXIP6**: + * Clear IPv6 encapsulation tunnel GSO flag (SKB_GSO_IPXIP6) when + * decapsulating a tunnel with an outer IPv6 header (IPv6-in-IPv6 + * or IPv4-in-IPv6). + * + * When using the decapsulation flags above, the skb->encapsulation + * flag is automatically cleared if all tunnel-specific GSO flags + * (SKB_GSO_UDP_TUNNEL, SKB_GSO_UDP_TUNNEL_CSUM, SKB_GSO_GRE, + * SKB_GSO_GRE_CSUM, SKB_GSO_IPXIP4, SKB_GSO_IPXIP6) have been + * removed from the packet. This handles cases where all tunnel + * layers have been decapsulated. + * + * * **BPF_F_ADJ_ROOM_NO_DODGY**: + * Do not mark the packet as dodgy (untrusted) and preserve + * the existing gso_segs count. By default, packet modifications + * set SKB_GSO_DODGY and reset gso_segs to 0, forcing + * revalidation. This flag is useful when decapsulating the + * tunnel, the BPF program is trusted, and the modifications + * are known to be valid. * * A call to this helper is susceptible to change the underlying * packet buffer. Therefore, at load time, all checks on pointers @@ -6209,7 +6243,7 @@ enum { }; /* BPF_FUNC_skb_adjust_room flags. */ -enum { +enum bpf_adj_room_flags { BPF_F_ADJ_ROOM_FIXED_GSO = (1ULL << 0), BPF_F_ADJ_ROOM_ENCAP_L3_IPV4 = (1ULL << 1), BPF_F_ADJ_ROOM_ENCAP_L3_IPV6 = (1ULL << 2), @@ -6219,6 +6253,11 @@ enum { BPF_F_ADJ_ROOM_ENCAP_L2_ETH = (1ULL << 6), BPF_F_ADJ_ROOM_DECAP_L3_IPV4 = (1ULL << 7), BPF_F_ADJ_ROOM_DECAP_L3_IPV6 = (1ULL << 8), + BPF_F_ADJ_ROOM_DECAP_L4_GRE = (1ULL << 9), + BPF_F_ADJ_ROOM_DECAP_L4_UDP = (1ULL << 10), + BPF_F_ADJ_ROOM_DECAP_IPXIP4 = (1ULL << 11), + BPF_F_ADJ_ROOM_DECAP_IPXIP6 = (1ULL << 12), + BPF_F_ADJ_ROOM_NO_DODGY = (1ULL << 13), }; enum { -- 2.34.1