pskb_expand_head() copies headroom (including skb metadata) into the newly
allocated head, but then clears the metadata. As a result, metadata is lost
when BPF helpers trigger a headroom reallocation.

Leave the skb metadata in place in the newly created copy of the head.

Signed-off-by: Jakub Sitnicki
---
 net/core/skbuff.c | 2 --
 1 file changed, 2 deletions(-)

diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index ee0274417948..dd58cc9c997f 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -2288,8 +2288,6 @@ int pskb_expand_head(struct sk_buff *skb, int nhead, int ntail,
 	skb->nohdr = 0;
 	atomic_set(&skb_shinfo(skb)->dataref, 1);
 
-	skb_metadata_clear(skb);
-
 	/* It is not generally safe to change skb->truesize.
 	 * For the moment, we really care of rx path, or
 	 * when skb is orphaned (not attached to a socket).
-- 
2.43.0
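To see why simply dropping the clear is enough, here is a minimal userspace
model of the copy that pskb_expand_head() performs. This is an illustration
only: the layout, sizes, and names below are made up; all that carries over
from the kernel code is that the headroom copy already brings the metadata
along, so the subsequent clear was what discarded it.

    /* Model: metadata sits at the top of the headroom, immediately
     * before the packet data, which is where skb metadata lives.
     */
    #include <assert.h>
    #include <stdlib.h>
    #include <string.h>

    #define META_LEN  8
    #define HEADROOM 16
    #define DATA_LEN  4

    int main(void)
    {
            unsigned char *head = calloc(1, HEADROOM + DATA_LEN);
            unsigned char *data = head + HEADROOM;
            unsigned char *meta = data - META_LEN;
            unsigned int nhead = 32;        /* extra headroom to add */
            unsigned char *new_head;
            int i;

            for (i = 0; i < META_LEN; i++)
                    meta[i] = 0xa0 | i;     /* metadata to preserve */
            memset(data, 0xff, DATA_LEN);   /* packet bytes */

            /* "Expand the head": old headroom, metadata included, is
             * copied into the bigger buffer as-is.
             */
            new_head = calloc(1, nhead + HEADROOM + DATA_LEN);
            memcpy(new_head + nhead, head, HEADROOM + DATA_LEN);

            /* Metadata survived the copy; clearing it afterwards is
             * what used to lose it.
             */
            assert(!memcmp(new_head + nhead + HEADROOM - META_LEN,
                           meta, META_LEN));

            free(head);
            free(new_head);
            return 0;
    }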
Lay groundwork for fixing BPF helpers available to TC(X) programs. When
skb_push() or skb_pull() is called in a TC(X) ingress BPF program, the skb
metadata must be kept in front of the MAC header. Otherwise, BPF programs
using the __sk_buff->data_meta pseudo-pointer lose access to it.

Introduce a helper that moves both metadata and a specified number of packet
data bytes together, suitable as a drop-in replacement for memmove().

Signed-off-by: Jakub Sitnicki
---
 include/linux/skbuff.h | 74 ++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 74 insertions(+)

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index fa633657e4c0..d32b62ab9f31 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -4514,6 +4514,80 @@ static inline void skb_metadata_clear(struct sk_buff *skb)
 	skb_metadata_set(skb, 0);
 }
 
+/**
+ * skb_data_move - Move packet data and metadata after skb_push() or skb_pull().
+ * @skb: packet to operate on
+ * @len: number of bytes pushed or pulled from &sk_buff->data
+ * @n: number of bytes to memmove() from pre-push/pull &sk_buff->data
+ *
+ * Moves both packet data (@n bytes) and metadata. Assumes metadata is
+ * located immediately before &sk_buff->data prior to the push/pull, and
+ * that sufficient headroom exists to hold it after an skb_push().
+ * Otherwise, metadata is cleared and a one-time warning is issued.
+ *
+ * Use skb_postpull_data_move() or skb_postpush_data_move() instead of
+ * calling this helper directly.
+ */
+static inline void skb_data_move(struct sk_buff *skb, const int len,
+				 const unsigned int n)
+{
+	const u8 meta_len = skb_metadata_len(skb);
+	u8 *meta, *meta_end;
+
+	if (!len || (!n && !meta_len))
+		return;
+
+	if (!meta_len)
+		goto no_metadata;
+
+	meta_end = skb_metadata_end(skb);
+	meta = meta_end - meta_len;
+
+	if (WARN_ON_ONCE(meta_end + len != skb->data ||
+			 meta + len < skb->head)) {
+		skb_metadata_clear(skb);
+		goto no_metadata;
+	}
+
+	memmove(meta + len, meta, meta_len + n);
+	return;
+
+no_metadata:
+	memmove(skb->data, skb->data - len, n);
+}
+
+/**
+ * skb_postpull_data_move - Move packet data and metadata after skb_pull().
+ * @skb: packet to operate on
+ * @len: number of bytes pulled from &sk_buff->data
+ * @n: number of bytes to memmove() from pre-pull &sk_buff->data
+ *
+ * See skb_data_move() for details.
+ */
+static inline void skb_postpull_data_move(struct sk_buff *skb,
+					  const unsigned int len,
+					  const unsigned int n)
+{
+	DEBUG_NET_WARN_ON_ONCE(len > INT_MAX);
+	skb_data_move(skb, len, n);
+}
+
+/**
+ * skb_postpush_data_move - Move packet data and metadata after skb_push().
+ * @skb: packet to operate on
+ * @len: number of bytes pushed onto &sk_buff->data
+ * @n: number of bytes to memmove() from pre-push &sk_buff->data
+ *
+ * See skb_data_move() for details.
+ */
+static inline void skb_postpush_data_move(struct sk_buff *skb,
+					  const unsigned int len,
+					  const unsigned int n)
+{
+	DEBUG_NET_WARN_ON_ONCE(len > INT_MAX);
+	skb_data_move(skb, -len, n);
+}
+
 struct sk_buff *skb_clone_sk(struct sk_buff *skb);
 
 #ifdef CONFIG_NETWORK_PHY_TIMESTAMPING
-- 
2.43.0

All callers ignore the return value. Prepare to reorder the memmove() to
come after the skb_pull(), which is a common pattern.

Signed-off-by: Jakub Sitnicki
---
 include/linux/if_vlan.h | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/include/linux/if_vlan.h b/include/linux/if_vlan.h
index 15e01935d3fa..afa5cc61a0fa 100644
--- a/include/linux/if_vlan.h
+++ b/include/linux/if_vlan.h
@@ -731,10 +731,8 @@ static inline void vlan_set_encap_proto(struct sk_buff *skb,
  *
  * Expects the skb to contain a VLAN tag in the payload, and to have skb->data
  * pointing at the MAC header.
- *
- * Returns: a new pointer to skb->data, or NULL on failure to pull.
  */
-static inline void *vlan_remove_tag(struct sk_buff *skb, u16 *vlan_tci)
+static inline void vlan_remove_tag(struct sk_buff *skb, u16 *vlan_tci)
 {
 	struct vlan_hdr *vhdr = (struct vlan_hdr *)(skb->data + ETH_HLEN);
 
@@ -742,7 +740,7 @@ static inline void *vlan_remove_tag(struct sk_buff *skb, u16 *vlan_tci)
 	memmove(skb->data + VLAN_HLEN, skb->data, 2 * ETH_ALEN);
 
 	vlan_set_encap_proto(skb, vhdr);
-	return __skb_pull(skb, VLAN_HLEN);
+	__skb_pull(skb, VLAN_HLEN);
 }
 
 /**
-- 
2.43.0

Use the metadata-aware helper to move packet bytes after skb_pull(),
ensuring metadata remains valid after calling the BPF helper.

Signed-off-by: Jakub Sitnicki
---
 include/linux/if_vlan.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/include/linux/if_vlan.h b/include/linux/if_vlan.h
index afa5cc61a0fa..4ecc2509b0d4 100644
--- a/include/linux/if_vlan.h
+++ b/include/linux/if_vlan.h
@@ -738,9 +738,9 @@ static inline void vlan_remove_tag(struct sk_buff *skb, u16 *vlan_tci)
 
 	*vlan_tci = ntohs(vhdr->h_vlan_TCI);
 
-	memmove(skb->data + VLAN_HLEN, skb->data, 2 * ETH_ALEN);
 	vlan_set_encap_proto(skb, vhdr);
 	__skb_pull(skb, VLAN_HLEN);
+	skb_postpull_data_move(skb, VLAN_HLEN, 2 * ETH_ALEN);
 }
 
 /**
-- 
2.43.0
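Purely as an illustration of the move that skb_postpull_data_move() performs
in vlan_remove_tag() above (metadata length and byte values are invented;
only the memmove() arithmetic matches the helper): the metadata and the two
MAC addresses travel together over the VLAN tag in a single memmove(),
exactly memmove(meta + len, meta, meta_len + n).

    #include <assert.h>
    #include <string.h>

    #define META_LEN  4
    #define ETH_ALEN  6
    #define VLAN_HLEN 4

    int main(void)
    {
            /* [meta][dst mac][src mac][vlan tag][encap proto] */
            unsigned char buf[META_LEN + 2 * ETH_ALEN + VLAN_HLEN + 2] = {
                    0xa1, 0xa2, 0xa3, 0xa4,                 /* metadata */
                    1, 1, 1, 1, 1, 1,  2, 2, 2, 2, 2, 2,    /* MACs */
                    0x81, 0x00, 0x00, 0x05,                 /* VLAN tag */
                    0x08, 0x00,                             /* proto */
            };
            unsigned char *meta = buf;
            int len = VLAN_HLEN;            /* bytes pulled */
            int n = 2 * ETH_ALEN;           /* data bytes to move */

            memmove(meta + len, meta, META_LEN + n);

            /* Metadata now ends right where the pulled data starts,
             * and the MAC addresses sit directly after it.
             */
            assert(buf[VLAN_HLEN] == 0xa1);
            assert(buf[VLAN_HLEN + META_LEN] == 1);
            assert(buf[META_LEN + 2 * ETH_ALEN + VLAN_HLEN] == 0x08);
            return 0;
    }

One memmove() instead of two keeps the copy contiguous, which is the reason
the helper takes the data byte count @n rather than doing a second move.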
Use the metadata-aware helper to move packet bytes after skb_push(),
ensuring metadata remains valid after calling the BPF helper. Also, take
care to reserve sufficient headroom for the metadata to fit.

Signed-off-by: Jakub Sitnicki
---
 include/linux/if_vlan.h | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/include/linux/if_vlan.h b/include/linux/if_vlan.h
index 4ecc2509b0d4..b0e1f57d51aa 100644
--- a/include/linux/if_vlan.h
+++ b/include/linux/if_vlan.h
@@ -355,16 +355,17 @@ static inline int __vlan_insert_inner_tag(struct sk_buff *skb,
 					  __be16 vlan_proto, u16 vlan_tci,
 					  unsigned int mac_len)
 {
+	const u8 meta_len = mac_len > ETH_TLEN ? skb_metadata_len(skb) : 0;
 	struct vlan_ethhdr *veth;
 
-	if (skb_cow_head(skb, VLAN_HLEN) < 0)
+	if (skb_cow_head(skb, meta_len + VLAN_HLEN) < 0)
 		return -ENOMEM;
 
 	skb_push(skb, VLAN_HLEN);
 
 	/* Move the mac header sans proto to the beginning of the new header.
 	 */
 	if (likely(mac_len > ETH_TLEN))
-		memmove(skb->data, skb->data + VLAN_HLEN, mac_len - ETH_TLEN);
+		skb_postpush_data_move(skb, VLAN_HLEN, mac_len - ETH_TLEN);
 
 	if (skb_mac_header_was_set(skb))
 		skb->mac_header -= VLAN_HLEN;
-- 
2.43.0

bpf_skb_adjust_room() may push or pull bytes from skb->data. In both cases,
skb metadata must be moved accordingly to stay accessible. Replace existing
memmove() calls, which only move payload, with a helper that also handles
metadata. Reserve enough space for metadata to fit after skb_push().

Signed-off-by: Jakub Sitnicki
---
 net/core/filter.c | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/net/core/filter.c b/net/core/filter.c
index 2af0a5f1d748..030349179b5a 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -3251,11 +3251,11 @@ static void bpf_skb_change_protocol(struct sk_buff *skb, u16 proto)
 
 static int bpf_skb_generic_push(struct sk_buff *skb, u32 off, u32 len)
 {
-	/* Caller already did skb_cow() with len as headroom,
+	/* Caller already did skb_cow() with meta_len+len as headroom,
 	 * so no need to do it here.
 	 */
 	skb_push(skb, len);
-	memmove(skb->data, skb->data + len, off);
+	skb_postpush_data_move(skb, len, off);
 	memset(skb->data + off, 0, len);
 
 	/* No skb_postpush_rcsum(skb, skb->data + off, len)
@@ -3279,7 +3279,7 @@ static int bpf_skb_generic_pop(struct sk_buff *skb, u32 off, u32 len)
 	old_data = skb->data;
 	__skb_pull(skb, len);
 	skb_postpull_rcsum(skb, old_data + off, len);
-	memmove(skb->data, old_data, off);
+	skb_postpull_data_move(skb, len, off);
 
 	return 0;
 }
@@ -3487,6 +3487,7 @@ static int bpf_skb_net_grow(struct sk_buff *skb, u32 off, u32 len_diff,
 	u8 inner_mac_len = flags >> BPF_ADJ_ROOM_ENCAP_L2_SHIFT;
 	bool encap = flags & BPF_F_ADJ_ROOM_ENCAP_L3_MASK;
 	u16 mac_len = 0, inner_net = 0, inner_trans = 0;
+	const u8 meta_len = skb_metadata_len(skb);
 	unsigned int gso_type = SKB_GSO_DODGY;
 	int ret;
 
@@ -3497,7 +3498,7 @@ static int bpf_skb_net_grow(struct sk_buff *skb, u32 off, u32 len_diff,
 		return -ENOTSUPP;
 	}
 
-	ret = skb_cow_head(skb, len_diff);
+	ret = skb_cow_head(skb, meta_len + len_diff);
 	if (unlikely(ret < 0))
 		return ret;
-- 
2.43.0

bpf_skb_change_proto() reuses the same headroom operations as
bpf_skb_adjust_room(), already updated to handle metadata safely. The
remaining step is to ensure that there is sufficient headroom to
accommodate metadata on skb_push().

Signed-off-by: Jakub Sitnicki
---
 net/core/filter.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/net/core/filter.c b/net/core/filter.c
index 030349179b5a..c4b18b7fa95e 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -3324,10 +3324,11 @@ static int bpf_skb_net_hdr_pop(struct sk_buff *skb, u32 off, u32 len)
 static int bpf_skb_proto_4_to_6(struct sk_buff *skb)
 {
 	const u32 len_diff = sizeof(struct ipv6hdr) - sizeof(struct iphdr);
+	const u8 meta_len = skb_metadata_len(skb);
 	u32 off = skb_mac_header_len(skb);
 	int ret;
 
-	ret = skb_cow(skb, len_diff);
+	ret = skb_cow(skb, meta_len + len_diff);
 	if (unlikely(ret < 0))
 		return ret;
-- 
2.43.0
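The headroom arithmetic behind the skb_cow()/skb_cow_head() changes in the
last three patches can be sketched in isolation. A userspace toy, with
invented names and numbers; the inequality is the same one skb_data_move()
enforces via its WARN_ON_ONCE():

    #include <stdio.h>

    /* After skb_push(push_len), metadata must slide down by push_len
     * as well, so it needs headroom >= meta_len + push_len up front.
     */
    static int meta_fits(unsigned int headroom, unsigned int meta_len,
                         unsigned int push_len)
    {
            return headroom >= meta_len + push_len;
    }

    int main(void)
    {
            /* Reserving only push_len (old behavior) can be too little. */
            printf("reserve len only:     %s\n",
                   meta_fits(4, 8, 4) ? "fits" : "metadata lost");
            /* Reserving meta_len + push_len (patched) always fits. */
            printf("reserve meta_len+len: %s\n",
                   meta_fits(8 + 4, 8, 4) ? "fits" : "metadata lost");
            return 0;
    }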
Although bpf_skb_change_head() doesn't move packet data after skb_push(),
skb metadata still needs to be relocated. Use the dedicated helper to
handle it.

Signed-off-by: Jakub Sitnicki
---
 net/core/filter.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/net/core/filter.c b/net/core/filter.c
index c4b18b7fa95e..7a5ee02098dc 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -3873,6 +3873,7 @@ static const struct bpf_func_proto sk_skb_change_tail_proto = {
 static inline int __bpf_skb_change_head(struct sk_buff *skb, u32 head_room,
 					u64 flags)
 {
+	const u8 meta_len = skb_metadata_len(skb);
 	u32 max_len = BPF_SKB_MAX_LEN;
 	u32 new_len = skb->len + head_room;
 	int ret;
@@ -3881,7 +3882,7 @@ static inline int __bpf_skb_change_head(struct sk_buff *skb, u32 head_room,
 		new_len < skb->len))
 		return -EINVAL;
 
-	ret = skb_cow(skb, head_room);
+	ret = skb_cow(skb, meta_len + head_room);
 	if (likely(!ret)) {
 		/* Idea for this helper is that we currently only
 		 * allow to expand on mac header. This means that
@@ -3893,6 +3894,7 @@ static inline int __bpf_skb_change_head(struct sk_buff *skb, u32 head_room,
 		 * for redirection into L2 device.
 		 */
 		__skb_push(skb, head_room);
+		skb_postpush_data_move(skb, head_room, 0);
 		memset(skb->data, 0, head_room);
 		skb_reset_mac_header(skb);
 		skb_reset_mac_len(skb);
-- 
2.43.0
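For illustration, the patched __bpf_skb_change_head() sequence reduces to
the following userspace model (layout and sizes invented; @n is 0 because
the helper only grows the headroom, so no packet bytes need to move, only
the metadata):

    #include <assert.h>
    #include <string.h>

    int main(void)
    {
            unsigned char buf[32] = {0};
            unsigned int meta_len = 4, head_room = 8;
            unsigned char *data = buf + 16;         /* pre-push skb->data */
            unsigned char *meta = data - meta_len;

            memset(meta, 0xee, meta_len);           /* metadata */

            data -= head_room;                      /* __skb_push() */
            /* skb_postpush_data_move(skb, head_room, 0) */
            memmove(meta - head_room, meta, meta_len + 0);
            memset(data, 0, head_room);             /* zero new bytes */

            assert(*(data - 1) == 0xee);    /* meta ends at new data */
            assert(*data == 0);             /* pushed bytes zeroed */
            return 0;
    }

Note the ordering: the metadata move happens before the memset(), so the
zero-fill of the new MAC header gap never overwrites the relocated metadata.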
Since pskb_expand_head() no longer clears metadata on unclone, update tests
for cloned packets to expect metadata to remain intact. Verify metadata
contents directly in the BPF program. This allows for multiple checks as
the packet passes through a chain of BPF programs, rather than a one-time
check in user-space.

Signed-off-by: Jakub Sitnicki
---
 .../bpf/prog_tests/xdp_context_test_run.c          | 20 ++--
 tools/testing/selftests/bpf/progs/test_xdp_meta.c  | 94 +++++++++++++--------
 2 files changed, 66 insertions(+), 48 deletions(-)

diff --git a/tools/testing/selftests/bpf/prog_tests/xdp_context_test_run.c b/tools/testing/selftests/bpf/prog_tests/xdp_context_test_run.c
index 178292d1251a..6fac79416b70 100644
--- a/tools/testing/selftests/bpf/prog_tests/xdp_context_test_run.c
+++ b/tools/testing/selftests/bpf/prog_tests/xdp_context_test_run.c
@@ -462,25 +462,25 @@ void test_xdp_context_tuntap(void)
 				     skel->progs.ing_cls_dynptr_offset_oob,
 				     skel->progs.ing_cls,
 				     skel->maps.test_result);
-	if (test__start_subtest("clone_data_meta_empty_on_data_write"))
+	if (test__start_subtest("clone_data_meta_kept_on_data_write"))
 		test_tuntap_mirred(skel->progs.ing_xdp,
-				   skel->progs.clone_data_meta_empty_on_data_write,
+				   skel->progs.clone_data_meta_kept_on_data_write,
 				   &skel->bss->test_pass);
-	if (test__start_subtest("clone_data_meta_empty_on_meta_write"))
+	if (test__start_subtest("clone_data_meta_kept_on_meta_write"))
 		test_tuntap_mirred(skel->progs.ing_xdp,
-				   skel->progs.clone_data_meta_empty_on_meta_write,
+				   skel->progs.clone_data_meta_kept_on_meta_write,
 				   &skel->bss->test_pass);
-	if (test__start_subtest("clone_dynptr_empty_on_data_slice_write"))
+	if (test__start_subtest("clone_dynptr_kept_on_data_slice_write"))
 		test_tuntap_mirred(skel->progs.ing_xdp,
-				   skel->progs.clone_dynptr_empty_on_data_slice_write,
+				   skel->progs.clone_dynptr_kept_on_data_slice_write,
 				   &skel->bss->test_pass);
-	if (test__start_subtest("clone_dynptr_empty_on_meta_slice_write"))
+	if (test__start_subtest("clone_dynptr_kept_on_meta_slice_write"))
 		test_tuntap_mirred(skel->progs.ing_xdp,
-				   skel->progs.clone_dynptr_empty_on_meta_slice_write,
+				   skel->progs.clone_dynptr_kept_on_meta_slice_write,
 				   &skel->bss->test_pass);
-	if (test__start_subtest("clone_dynptr_rdonly_before_data_dynptr_write"))
+	if (test__start_subtest("clone_dynptr_rdonly_before_data_dynptr_write_then_rw"))
 		test_tuntap_mirred(skel->progs.ing_xdp,
-				   skel->progs.clone_dynptr_rdonly_before_data_dynptr_write,
+				   skel->progs.clone_dynptr_rdonly_before_data_dynptr_write_then_rw,
 				   &skel->bss->test_pass);
 	if (test__start_subtest("clone_dynptr_rdonly_before_meta_dynptr_write"))
 		test_tuntap_mirred(skel->progs.ing_xdp,
diff --git a/tools/testing/selftests/bpf/progs/test_xdp_meta.c b/tools/testing/selftests/bpf/progs/test_xdp_meta.c
index d79cb74b571e..3de85b5668c9 100644
--- a/tools/testing/selftests/bpf/progs/test_xdp_meta.c
+++ b/tools/testing/selftests/bpf/progs/test_xdp_meta.c
@@ -28,6 +28,13 @@ struct {
 
 bool test_pass;
 
+static const __u8 meta_want[META_SIZE] = {
+	0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08,
+	0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18,
+	0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, 0x28,
+	0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, 0x38,
+};
+
 SEC("tc")
 int ing_cls(struct __sk_buff *ctx)
 {
@@ -304,12 +311,13 @@ int ing_xdp(struct xdp_md *ctx)
 }
 
 /*
- * Check that skb->data_meta..skb->data is empty if prog writes to packet
+ * Check that skb->data_meta..skb->data is kept intact if prog writes to packet
  * _payload_ using packet pointers. Applies only to cloned skbs.
  */
 SEC("tc")
-int clone_data_meta_empty_on_data_write(struct __sk_buff *ctx)
+int clone_data_meta_kept_on_data_write(struct __sk_buff *ctx)
 {
+	__u8 *meta_have = ctx_ptr(ctx, data_meta);
 	struct ethhdr *eth = ctx_ptr(ctx, data);
 
 	if (eth + 1 > ctx_ptr(ctx, data_end))
@@ -318,8 +326,10 @@ int clone_data_meta_empty_on_data_write(struct __sk_buff *ctx)
 		goto out;
 
 	if (eth->h_proto != 0)
 		goto out;
 
-	/* Expect no metadata */
-	if (ctx->data_meta != ctx->data)
+	if (meta_have + META_SIZE > eth)
+		goto out;
+
+	if (__builtin_memcmp(meta_want, meta_have, META_SIZE))
 		goto out;
 
 	/* Packet write to trigger unclone in prologue */
@@ -331,14 +341,14 @@ int clone_data_meta_empty_on_data_write(struct __sk_buff *ctx)
 }
 
 /*
- * Check that skb->data_meta..skb->data is empty if prog writes to packet
+ * Check that skb->data_meta..skb->data is kept intact if prog writes to packet
  * _metadata_ using packet pointers. Applies only to cloned skbs.
  */
 SEC("tc")
-int clone_data_meta_empty_on_meta_write(struct __sk_buff *ctx)
+int clone_data_meta_kept_on_meta_write(struct __sk_buff *ctx)
 {
+	__u8 *meta_have = ctx_ptr(ctx, data_meta);
 	struct ethhdr *eth = ctx_ptr(ctx, data);
-	__u8 *md = ctx_ptr(ctx, data_meta);
 
 	if (eth + 1 > ctx_ptr(ctx, data_end))
 		goto out;
@@ -346,25 +356,29 @@ int clone_data_meta_empty_on_meta_write(struct __sk_buff *ctx)
 	if (eth->h_proto != 0)
 		goto out;
 
-	if (md + 1 > ctx_ptr(ctx, data)) {
-		/* Expect no metadata */
-		test_pass = true;
-	} else {
-		/* Metadata write to trigger unclone in prologue */
-		*md = 42;
-	}
+	if (meta_have + META_SIZE > eth)
+		goto out;
+
+	if (__builtin_memcmp(meta_want, meta_have, META_SIZE))
+		goto out;
+
+	/* Metadata write to trigger unclone in prologue */
+	*meta_have = 42;
+
+	test_pass = true;
 out:
 	return TC_ACT_SHOT;
 }
 
 /*
- * Check that skb_meta dynptr is writable but empty if prog writes to packet
- * _payload_ using a dynptr slice. Applies only to cloned skbs.
+ * Check that skb_meta dynptr is writable and was kept intact if prog creates
+ * an r/w slice to packet _payload_. Applies only to cloned skbs.
  */
 SEC("tc")
-int clone_dynptr_empty_on_data_slice_write(struct __sk_buff *ctx)
+int clone_dynptr_kept_on_data_slice_write(struct __sk_buff *ctx)
 {
 	struct bpf_dynptr data, meta;
+	__u8 meta_have[META_SIZE];
 	struct ethhdr *eth;
 
 	bpf_dynptr_from_skb(ctx, 0, &data);
@@ -375,29 +389,26 @@ int clone_dynptr_empty_on_data_slice_write(struct __sk_buff *ctx)
 	if (eth->h_proto != 0)
 		goto out;
 
-	/* Expect no metadata */
 	bpf_dynptr_from_skb_meta(ctx, 0, &meta);
-	if (bpf_dynptr_is_rdonly(&meta) || bpf_dynptr_size(&meta) > 0)
+	bpf_dynptr_read(meta_have, META_SIZE, &meta, 0, 0);
+	if (__builtin_memcmp(meta_want, meta_have, META_SIZE))
 		goto out;
 
-	/* Packet write to trigger unclone in prologue */
-	eth->h_proto = 42;
-
 	test_pass = true;
 out:
 	return TC_ACT_SHOT;
 }
 
 /*
- * Check that skb_meta dynptr is writable but empty if prog writes to packet
- * _metadata_ using a dynptr slice. Applies only to cloned skbs.
+ * Check that skb_meta dynptr is writable and was kept intact if prog creates
+ * an r/w slice to packet _metadata_. Applies only to cloned skbs.
  */
 SEC("tc")
-int clone_dynptr_empty_on_meta_slice_write(struct __sk_buff *ctx)
+int clone_dynptr_kept_on_meta_slice_write(struct __sk_buff *ctx)
 {
 	struct bpf_dynptr data, meta;
 	const struct ethhdr *eth;
-	__u8 *md;
+	__u8 *meta_have;
 
 	bpf_dynptr_from_skb(ctx, 0, &data);
 	eth = bpf_dynptr_slice(&data, 0, NULL, sizeof(*eth));
@@ -407,16 +418,13 @@ int clone_dynptr_empty_on_meta_slice_write(struct __sk_buff *ctx)
 	if (eth->h_proto != 0)
 		goto out;
 
-	/* Expect no metadata */
 	bpf_dynptr_from_skb_meta(ctx, 0, &meta);
-	if (bpf_dynptr_is_rdonly(&meta) || bpf_dynptr_size(&meta) > 0)
+	meta_have = bpf_dynptr_slice_rdwr(&meta, 0, NULL, META_SIZE);
+	if (!meta_have)
 		goto out;
 
-	/* Metadata write to trigger unclone in prologue */
-	bpf_dynptr_from_skb_meta(ctx, 0, &meta);
-	md = bpf_dynptr_slice_rdwr(&meta, 0, NULL, sizeof(*md));
-	if (md)
-		*md = 42;
+	if (__builtin_memcmp(meta_want, meta_have, META_SIZE))
+		goto out;
 
 	test_pass = true;
 out:
@@ -425,12 +433,14 @@ int clone_dynptr_empty_on_meta_slice_write(struct __sk_buff *ctx)
 
 /*
  * Check that skb_meta dynptr is read-only before prog writes to packet payload
- * using dynptr_write helper. Applies only to cloned skbs.
+ * using dynptr_write helper, and becomes read-write afterwards. Applies only
+ * to cloned skbs.
  */
 SEC("tc")
-int clone_dynptr_rdonly_before_data_dynptr_write(struct __sk_buff *ctx)
+int clone_dynptr_rdonly_before_data_dynptr_write_then_rw(struct __sk_buff *ctx)
 {
 	struct bpf_dynptr data, meta;
+	__u8 meta_have[META_SIZE];
 	const struct ethhdr *eth;
 
 	bpf_dynptr_from_skb(ctx, 0, &data);
@@ -443,15 +453,23 @@ int clone_dynptr_rdonly_before_data_dynptr_write(struct __sk_buff *ctx)
 
 	/* Expect read-only metadata before unclone */
 	bpf_dynptr_from_skb_meta(ctx, 0, &meta);
-	if (!bpf_dynptr_is_rdonly(&meta) || bpf_dynptr_size(&meta) != META_SIZE)
+	if (!bpf_dynptr_is_rdonly(&meta))
+		goto out;
+
+	bpf_dynptr_read(meta_have, META_SIZE, &meta, 0, 0);
+	if (__builtin_memcmp(meta_want, meta_have, META_SIZE))
 		goto out;
 
 	/* Helper write to payload will unclone the packet */
 	bpf_dynptr_write(&data, offsetof(struct ethhdr, h_proto), "x", 1, 0);
 
-	/* Expect no metadata after unclone */
+	/* Expect r/w metadata after unclone */
 	bpf_dynptr_from_skb_meta(ctx, 0, &meta);
-	if (bpf_dynptr_is_rdonly(&meta) || bpf_dynptr_size(&meta) != 0)
+	if (bpf_dynptr_is_rdonly(&meta))
+		goto out;
+
+	bpf_dynptr_read(meta_have, META_SIZE, &meta, 0, 0);
+	if (__builtin_memcmp(meta_want, meta_have, META_SIZE))
 		goto out;
 
 	test_pass = true;
-- 
2.43.0
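Taken together, the series makes programs like the following sketch behave
reliably. This is not part of the patches: it assumes an earlier XDP program
stored a 0xdeadbeef marker in 4 bytes of metadata via bpf_xdp_adjust_meta(),
and it uses only existing UAPI (bpf_skb_vlan_pop(), __sk_buff->data_meta).

    #include <linux/bpf.h>
    #include <linux/pkt_cls.h>
    #include <bpf/bpf_helpers.h>

    SEC("tc")
    int meta_after_vlan_pop(struct __sk_buff *ctx)
    {
            __u32 *meta;
            void *data;

            /* Pulls VLAN_HLEN bytes and moves the MAC header; with
             * this series the metadata now travels along with it.
             */
            if (bpf_skb_vlan_pop(ctx))
                    return TC_ACT_SHOT;

            /* Packet pointers are invalidated by the helper; re-load. */
            meta = (void *)(long)ctx->data_meta;
            data = (void *)(long)ctx->data;
            if ((void *)(meta + 1) > data)
                    return TC_ACT_SHOT; /* no metadata: pre-series behavior */

            return *meta == 0xdeadbeef ? TC_ACT_OK : TC_ACT_SHOT;
    }

    char _license[] SEC("license") = "GPL";

Before this series, the skb_pull() inside the VLAN pop would leave the
metadata behind (or a headroom reallocation would clear it outright), so the
data_meta..data range would come back empty after the helper call.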