After decoupling metadata location from MAC header offset, a gap can appear between metadata and skb->data on L2 decapsulation (e.g., VLAN, GRE). This breaks the BPF data_meta pointer, which assumes that metadata sits directly before skb->data. Introduce the bpf_skb_meta_realign() kfunc, which closes the gap by moving metadata so that it immediately precedes the MAC header. Inject a call to it in tc_cls_act_prologue() when the verifier detects data_meta access (PA_F_DATA_META_LOAD flag). Update skb_data_move() to handle the gap case: on skb_push(), move metadata to the top of the head buffer; on skb_pull(), when metadata is already detached from skb->data, leave it in place. This restores data_meta functionality for TC programs while keeping the performance benefit of avoiding memmove on L2 decapsulation for programs that don't use data_meta. Signed-off-by: Jakub Sitnicki --- include/linux/skbuff.h | 25 +++++++++++++++-------- net/core/filter.c | 55 ++++++++++++++++++++++++++++++++++++++++++++++++-- 2 files changed, 70 insertions(+), 10 deletions(-) diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 8868db976e1f..24c4e216d0cb 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -4600,19 +4600,28 @@ static inline void skb_data_move(struct sk_buff *skb, const int len, if (!meta_len) goto no_metadata; - meta_end = skb_metadata_end(skb); - meta = meta_end - meta_len; - - if (WARN_ON_ONCE(meta_end + len != skb->data || - meta_len > skb_headroom(skb))) { + /* Not enough headroom left for metadata. Drop it. */ + if (WARN_ONCE(meta_len > skb_headroom(skb), + "skb headroom smaller than metadata")) { skb_metadata_clear(skb); goto no_metadata; } - memmove(meta + len, meta, meta_len + n); - skb_shinfo(skb)->meta_end += len; - return; + meta_end = skb_metadata_end(skb); + meta = meta_end - meta_len; + /* Metadata in front of data before push/pull. Keep it that way. 
*/ + if (meta_end == skb->data - len) { + memmove(meta + len, meta, meta_len + n); + skb_shinfo(skb)->meta_end += len; + return; + } + + if (len < 0) { + /* Data pushed. Move metadata to the top. */ + memmove(skb->head, meta, meta_len); + skb_shinfo(skb)->meta_end = meta_len; + } no_metadata: memmove(skb->data, skb->data - len, n); } diff --git a/net/core/filter.c b/net/core/filter.c index 334421910107..91100c923f2c 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -9125,11 +9125,62 @@ static int bpf_gen_ld_abs(const struct bpf_insn *orig, return insn - insn_buf; } +__bpf_kfunc_start_defs(); + +__bpf_kfunc void bpf_skb_meta_realign(struct __sk_buff *skb_) +{ + struct sk_buff *skb = (typeof(skb))skb_; + u8 *meta_end = skb_metadata_end(skb); + u8 meta_len = skb_metadata_len(skb); + u8 *meta; + int gap; + + gap = skb_mac_header(skb) - meta_end; + if (!meta_len || !gap) + return; + + if (WARN_ONCE(gap < 0, "skb metadata end past mac header")) { + skb_metadata_clear(skb); + return; + } + + meta = meta_end - meta_len; + memmove(meta + gap, meta, meta_len); + skb_shinfo(skb)->meta_end += gap; + + bpf_compute_data_pointers(skb); +} + +__bpf_kfunc_end_defs(); + +BTF_KFUNCS_START(tc_cls_act_hidden_ids) +BTF_ID_FLAGS(func, bpf_skb_meta_realign, KF_TRUSTED_ARGS) +BTF_KFUNCS_END(tc_cls_act_hidden_ids) + +BTF_ID_LIST_SINGLE(bpf_skb_meta_realign_ids, func, bpf_skb_meta_realign) + static int tc_cls_act_prologue(struct bpf_insn *insn_buf, u32 pkt_access_flags, const struct bpf_prog *prog) { - return bpf_unclone_prologue(insn_buf, pkt_access_flags, prog, - TC_ACT_SHOT); + struct bpf_insn *insn = insn_buf; + int cnt; + + if (pkt_access_flags & PA_F_DATA_META_LOAD) { + /* Realign skb metadata for access through data_meta pointer. 
+ * + * r6 = r1; // r6 will be "u64 *ctx" + * r0 = bpf_skb_meta_realign(r1); // r0 is undefined + * r1 = r6; + */ + *insn++ = BPF_MOV64_REG(BPF_REG_6, BPF_REG_1); + *insn++ = BPF_CALL_KFUNC(0, bpf_skb_meta_realign_ids[0]); + *insn++ = BPF_MOV64_REG(BPF_REG_1, BPF_REG_6); + } + cnt = bpf_unclone_prologue(insn, pkt_access_flags, prog, TC_ACT_SHOT); + if (!cnt && insn > insn_buf) + *insn++ = prog->insnsi[0]; + + return cnt + insn - insn_buf; } static bool tc_cls_act_is_valid_access(int off, int size, -- 2.43.0