Prepare to track skb metadata location independently of MAC header offset. Following changes will make skb_metadata_set() record where metadata ends relative to skb->head. Hence the helper must be called when skb->data already points past the metadata area. Adjust the driver to pull from skb->data before calling skb_metadata_set(). Signed-off-by: Jakub Sitnicki --- drivers/net/ethernet/broadcom/bnxt/bnxt.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c b/drivers/net/ethernet/broadcom/bnxt/bnxt.c index a625e7c311dd..93e05a0f1905 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c @@ -1440,8 +1440,8 @@ static struct sk_buff *bnxt_copy_xdp(struct bnxt_napi *bnapi, return skb; if (metasize) { - skb_metadata_set(skb, metasize); __skb_pull(skb, metasize); + skb_metadata_set(skb, metasize); } return skb; -- 2.43.0 Prepare to track skb metadata location independently of MAC header offset. Following changes will make skb_metadata_set() record where metadata ends relative to skb->head. Hence the helper must be called when skb->data already points past the metadata area. Adjust the driver to pull from skb->data before calling skb_metadata_set(). Signed-off-by: Jakub Sitnicki --- drivers/net/ethernet/intel/i40e/i40e_xsk.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/intel/i40e/i40e_xsk.c b/drivers/net/ethernet/intel/i40e/i40e_xsk.c index 9f47388eaba5..11eff5bd840b 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_xsk.c +++ b/drivers/net/ethernet/intel/i40e/i40e_xsk.c @@ -310,8 +310,8 @@ static struct sk_buff *i40e_construct_skb_zc(struct i40e_ring *rx_ring, ALIGN(totalsize, sizeof(long))); if (metasize) { - skb_metadata_set(skb, metasize); __skb_pull(skb, metasize); + skb_metadata_set(skb, metasize); } if (likely(!xdp_buff_has_frags(xdp))) -- 2.43.0 Prepare to track skb metadata location independently of MAC header offset. Following changes will make skb_metadata_set() record where metadata ends relative to skb->head. Hence the helper must be called when skb->data already points past the metadata area. Adjust the driver to pull from skb->data before calling skb_metadata_set(). Signed-off-by: Jakub Sitnicki --- drivers/net/ethernet/intel/igb/igb_xsk.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/intel/igb/igb_xsk.c b/drivers/net/ethernet/intel/igb/igb_xsk.c index 30ce5fbb5b77..9202da66e32c 100644 --- a/drivers/net/ethernet/intel/igb/igb_xsk.c +++ b/drivers/net/ethernet/intel/igb/igb_xsk.c @@ -284,8 +284,8 @@ static struct sk_buff *igb_construct_skb_zc(struct igb_ring *rx_ring, ALIGN(totalsize, sizeof(long))); if (metasize) { - skb_metadata_set(skb, metasize); __skb_pull(skb, metasize); + skb_metadata_set(skb, metasize); } return skb; -- 2.43.0 Prepare to track skb metadata location independently of MAC header offset. Following changes will make skb_metadata_set() record where metadata ends relative to skb->head. Hence the helper must be called when skb->data already points past the metadata area. Adjust the driver to pull from skb->data before calling skb_metadata_set(). Signed-off-by: Jakub Sitnicki --- drivers/net/ethernet/intel/igc/igc_main.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/intel/igc/igc_main.c b/drivers/net/ethernet/intel/igc/igc_main.c index 728d7ca5338b..426b2cef5a1c 100644 --- a/drivers/net/ethernet/intel/igc/igc_main.c +++ b/drivers/net/ethernet/intel/igc/igc_main.c @@ -2024,8 +2024,8 @@ static struct sk_buff *igc_construct_skb(struct igc_ring *rx_ring, ALIGN(headlen + metasize, sizeof(long))); if (metasize) { - skb_metadata_set(skb, metasize); __skb_pull(skb, metasize); + skb_metadata_set(skb, metasize); } /* update all of the pointers */ @@ -2752,8 +2752,8 @@ static struct sk_buff *igc_construct_skb_zc(struct igc_ring *ring, ALIGN(totalsize, sizeof(long))); if (metasize) { - skb_metadata_set(skb, metasize); __skb_pull(skb, metasize); + skb_metadata_set(skb, metasize); } if (ctx->rx_ts) { -- 2.43.0 Prepare to track skb metadata location independently of MAC header offset. Following changes will make skb_metadata_set() record where metadata ends relative to skb->head. Hence the helper must be called when skb->data already points past the metadata area. Adjust the driver to pull from skb->data before calling skb_metadata_set(). Signed-off-by: Jakub Sitnicki --- drivers/net/ethernet/intel/ixgbe/ixgbe_xsk.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_xsk.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_xsk.c index 7b941505a9d0..69104f432f8d 100644 --- a/drivers/net/ethernet/intel/ixgbe/ixgbe_xsk.c +++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_xsk.c @@ -228,8 +228,8 @@ static struct sk_buff *ixgbe_construct_skb_zc(struct ixgbe_ring *rx_ring, ALIGN(totalsize, sizeof(long))); if (metasize) { - skb_metadata_set(skb, metasize); __skb_pull(skb, metasize); + skb_metadata_set(skb, metasize); } return skb; -- 2.43.0 Prepare to track skb metadata location independently of MAC header offset. Following changes will make skb_metadata_set() record where metadata ends relative to skb->head. Hence the helper must be called when skb->data already points past the metadata area. Adjust the driver to pull from skb->data before calling skb_metadata_set(). Signed-off-by: Jakub Sitnicki --- drivers/net/ethernet/mellanox/mlx5/core/en/xsk/rx.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/rx.c b/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/rx.c index 2b05536d564a..20c983c3ce62 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/rx.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/rx.c @@ -237,8 +237,8 @@ static struct sk_buff *mlx5e_xsk_construct_skb(struct mlx5e_rq *rq, struct xdp_b skb_put_data(skb, xdp->data_meta, totallen); if (metalen) { - skb_metadata_set(skb, metalen); __skb_pull(skb, metalen); + skb_metadata_set(skb, metalen); } return skb; -- 2.43.0 Prepare to track skb metadata location independently of MAC header offset. Following changes will make skb_metadata_set() record where metadata ends relative to skb->head. Hence the helper must be called when skb->data points right past the metadata area. Unlike other drivers, veth calls skb_metadata_set() after eth_type_trans(), which pulls the Ethernet header and moves skb->data. This violates the future calling convention. Adjust the driver to pull the MAC header after calling skb_metadata_set(). Signed-off-by: Jakub Sitnicki --- drivers/net/veth.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/net/veth.c b/drivers/net/veth.c index 87a63c4bee77..61f218d83f80 100644 --- a/drivers/net/veth.c +++ b/drivers/net/veth.c @@ -876,11 +876,11 @@ static struct sk_buff *veth_xdp_rcv_skb(struct veth_rq *rq, else skb->data_len = 0; - skb->protocol = eth_type_trans(skb, rq->dev); - metalen = xdp->data - xdp->data_meta; if (metalen) skb_metadata_set(skb, metalen); + + skb->protocol = eth_type_trans(skb, rq->dev); out: return skb; drop: -- 2.43.0 Prepare to track skb metadata location independently of MAC header offset. Following changes will make skb_metadata_set() record where metadata ends relative to skb->head. Hence the helper must be called when skb->data already points past the metadata area. Adjust AF_XDP to pull from skb->data before calling skb_metadata_set(). Signed-off-by: Jakub Sitnicki --- net/core/xdp.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/core/xdp.c b/net/core/xdp.c index 9100e160113a..e86ac1d6ad6d 100644 --- a/net/core/xdp.c +++ b/net/core/xdp.c @@ -768,8 +768,8 @@ struct sk_buff *xdp_build_skb_from_zc(struct xdp_buff *xdp) metalen = xdp->data - xdp->data_meta; if (metalen > 0) { - skb_metadata_set(skb, metalen); __skb_pull(skb, metalen); + skb_metadata_set(skb, metalen); } skb_record_rx_queue(skb, rxq->queue_index); -- 2.43.0 Prepare to track skb metadata location independently of MAC header offset. Following changes will make skb_metadata_set() record where metadata ends relative to skb->head. Hence the helper must be called when skb->data points just past the metadata area. Tweak XDP generic mode accordingly. Signed-off-by: Jakub Sitnicki --- net/core/dev.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/net/core/dev.c b/net/core/dev.c index 69515edd17bc..70cf90d5c73c 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -5461,8 +5461,11 @@ u32 bpf_prog_run_generic_xdp(struct sk_buff *skb, struct xdp_buff *xdp, break; case XDP_PASS: metalen = xdp->data - xdp->data_meta; - if (metalen) + if (metalen) { + __skb_push(skb, mac_len); skb_metadata_set(skb, metalen); + __skb_pull(skb, mac_len); + } break; } -- 2.43.0 Currently skb metadata location is derived from the MAC header offset. This breaks when L2 tunnel/tagging devices (VLAN, GRE, etc.) reset the MAC offset after pulling the encapsulation header, making the metadata inaccessible. A naive fix would be to move metadata on every skb_pull() path. However, we can avoid a memmove on L2 decapsulation if we can locate metadata independently of the MAC offset. Introduce a meta_end field in skb_shared_info to track where metadata ends, decoupling it from mac_header. The new field takes 2 bytes out of the existing 4 byte hole, with structure size unchanged if we reorder the gso_type field. Update skb_metadata_set() to record meta_end at the time of the call, and adjust skb_data_move() and pskb_expand_head() to keep meta_end in sync with head buffer layout. Remove the now-unneeded metadata adjustment in skb_reorder_vlan_header(). Note that this breaks BPF skb metadata access through skb->data_meta when there is a gap between meta_end and skb->data. Following BPF verifier changes address this. Also, we still need to relocate the metadata on encapsulation on forward path. VLAN and QinQ have already been patched when fixing TC BPF helpers [1], but other tagging/tunnel code still requires similar changes. This will be done as a follow up. Signed-off-by: Jakub Sitnicki --- include/linux/skbuff.h | 14 ++++++++++++-- net/core/skbuff.c | 10 ++-------- 2 files changed, 14 insertions(+), 10 deletions(-) diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index ff90281ddf90..8868db976e1f 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -595,15 +595,16 @@ struct skb_shared_info { __u8 meta_len; __u8 nr_frags; __u8 tx_flags; + u16 meta_end; unsigned short gso_size; /* Warning: this field is not always filled in (UFO)! */ unsigned short gso_segs; + unsigned int gso_type; struct sk_buff *frag_list; union { struct skb_shared_hwtstamps hwtstamps; struct xsk_tx_metadata_compl xsk_meta; }; - unsigned int gso_type; u32 tskey; /* @@ -4499,7 +4500,7 @@ static inline u8 skb_metadata_len(const struct sk_buff *skb) static inline void *skb_metadata_end(const struct sk_buff *skb) { - return skb_mac_header(skb); + return skb->head + skb_shinfo(skb)->meta_end; } static inline bool __skb_metadata_differs(const struct sk_buff *skb_a, @@ -4554,8 +4555,16 @@ static inline bool skb_metadata_differs(const struct sk_buff *skb_a, true : __skb_metadata_differs(skb_a, skb_b, len_a); } +/** + * skb_metadata_set - Record packet metadata length and location. + * @skb: packet carrying the metadata + * @meta_len: number of bytes of metadata preceding skb->data + * + * Must be called when skb->data already points past the metadata area. + */ static inline void skb_metadata_set(struct sk_buff *skb, u8 meta_len) { + skb_shinfo(skb)->meta_end = skb_headroom(skb); skb_shinfo(skb)->meta_len = meta_len; } @@ -4601,6 +4610,7 @@ static inline void skb_data_move(struct sk_buff *skb, const int len, } memmove(meta + len, meta, meta_len + n); + skb_shinfo(skb)->meta_end += len; return; no_metadata: diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 4f4d7ab7057f..7142487644c3 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -2306,6 +2306,7 @@ int pskb_expand_head(struct sk_buff *skb, int nhead, int ntail, #endif skb->tail += off; skb_headers_offset_update(skb, nhead); + skb_shinfo(skb)->meta_end += nhead; skb->cloned = 0; skb->hdr_len = 0; skb->nohdr = 0; @@ -6219,8 +6220,7 @@ EXPORT_SYMBOL_GPL(skb_scrub_packet); static struct sk_buff *skb_reorder_vlan_header(struct sk_buff *skb) { - int mac_len, meta_len; - void *meta; + int mac_len; if (skb_cow(skb, skb_headroom(skb)) < 0) { kfree_skb(skb); @@ -6233,12 +6233,6 @@ static struct sk_buff *skb_reorder_vlan_header(struct sk_buff *skb) mac_len - VLAN_HLEN - ETH_TLEN); } - meta_len = skb_metadata_len(skb); - if (meta_len) { - meta = skb_metadata_end(skb) - meta_len; - memmove(meta + VLAN_HLEN, meta, meta_len); - } - skb->mac_header += VLAN_HLEN; return skb; } -- 2.43.0 The may_access_direct_pkt_data() helper sets env->seen_direct_write as a side effect, which creates awkward calling patterns: - check_special_kfunc() has a comment warning readers about the side effect - specialize_kfunc() must save and restore the flag around the call Make the helper a pure function by moving the seen_direct_write flag setting to call sites that need it. Signed-off-by: Jakub Sitnicki --- kernel/bpf/verifier.c | 33 ++++++++++++--------------------- 1 file changed, 12 insertions(+), 21 deletions(-) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index ff40e5e65c43..64a04b7dd500 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -6085,13 +6085,9 @@ static bool may_access_direct_pkt_data(struct bpf_verifier_env *env, if (meta) return meta->pkt_access; - env->seen_direct_write = true; return true; case BPF_PROG_TYPE_CGROUP_SOCKOPT: - if (t == BPF_WRITE) - env->seen_direct_write = true; - return true; default: @@ -7619,15 +7615,17 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn err = check_stack_write(env, regno, off, size, value_regno, insn_idx); } else if (reg_is_pkt_pointer(reg)) { - if (t == BPF_WRITE && !may_access_direct_pkt_data(env, NULL, t)) { - verbose(env, "cannot write into packet\n"); - return -EACCES; - } - if (t == BPF_WRITE && value_regno >= 0 && - is_pointer_value(env, value_regno)) { - verbose(env, "R%d leaks addr into packet\n", - value_regno); - return -EACCES; + if (t == BPF_WRITE) { + if (!may_access_direct_pkt_data(env, NULL, BPF_WRITE)) { + verbose(env, "cannot write into packet\n"); + return -EACCES; + } + if (value_regno >= 0 && is_pointer_value(env, value_regno)) { + verbose(env, "R%d leaks addr into packet\n", + value_regno); + return -EACCES; + } + env->seen_direct_write = true; } err = check_packet_access(env, regno, off, size, false); if (!err && t == BPF_READ && value_regno >= 0) @@ -13766,11 +13764,11 @@ static int check_special_kfunc(struct bpf_verifier_env *env, struct bpf_kfunc_ca if (meta->func_id == special_kfunc_list[KF_bpf_dynptr_slice]) { regs[BPF_REG_0].type |= MEM_RDONLY; } else { - /* this will set env->seen_direct_write to true */ if (!may_access_direct_pkt_data(env, NULL, BPF_WRITE)) { verbose(env, "the prog does not allow writes to packet data\n"); return -EINVAL; } + env->seen_direct_write = true; } if (!meta->initialized_dynptr.id) { @@ -21810,7 +21808,6 @@ static void specialize_kfunc(struct bpf_verifier_env *env, u32 func_id, u16 offset, unsigned long *addr) { struct bpf_prog *prog = env->prog; - bool seen_direct_write; void *xdp_kfunc; bool is_rdonly; @@ -21827,16 +21824,10 @@ static void specialize_kfunc(struct bpf_verifier_env *env, return; if (func_id == special_kfunc_list[KF_bpf_dynptr_from_skb]) { - seen_direct_write = env->seen_direct_write; is_rdonly = !may_access_direct_pkt_data(env, NULL, BPF_WRITE); if (is_rdonly) *addr = (unsigned long)bpf_dynptr_from_skb_rdonly; - - /* restore env->seen_direct_write to its original value, since - * may_access_direct_pkt_data mutates it - */ - env->seen_direct_write = seen_direct_write; } if (func_id == special_kfunc_list[KF_bpf_set_dentry_xattr] && -- 2.43.0 Convert seen_direct_write from a boolean to a bitmap (seen_packet_access) in preparation for tracking additional packet access patterns. No functional change. Signed-off-by: Jakub Sitnicki --- include/linux/bpf_verifier.h | 6 +++++- kernel/bpf/verifier.c | 11 ++++++----- 2 files changed, 11 insertions(+), 6 deletions(-) diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index 4c497e839526..42ce94ce96ba 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -637,6 +637,10 @@ enum priv_stack_mode { PRIV_STACK_ADAPTIVE, }; +enum packet_access_flags { + PA_F_DIRECT_WRITE = BIT(0), +}; + struct bpf_subprog_info { /* 'start' has to be the first field otherwise find_subprog() won't work */ u32 start; /* insn idx of function entry point */ @@ -760,7 +764,7 @@ struct bpf_verifier_env { bool bpf_capable; bool bypass_spec_v1; bool bypass_spec_v4; - bool seen_direct_write; + u8 seen_packet_access; /* combination of enum packet_access_flags */ bool seen_exception; struct bpf_insn_aux_data *insn_aux_data; /* array of per-insn state */ const struct bpf_line_info *prev_linfo; diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 64a04b7dd500..4c84b0cd399e 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -7625,7 +7625,7 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn value_regno); return -EACCES; } - env->seen_direct_write = true; + env->seen_packet_access |= PA_F_DIRECT_WRITE; } err = check_packet_access(env, regno, off, size, false); if (!err && t == BPF_READ && value_regno >= 0) @@ -13768,7 +13768,7 @@ static int check_special_kfunc(struct bpf_verifier_env *env, struct bpf_kfunc_ca verbose(env, "the prog does not allow writes to packet data\n"); return -EINVAL; } - env->seen_direct_write = true; + env->seen_packet_access |= PA_F_DIRECT_WRITE; } if (!meta->initialized_dynptr.id) { @@ -21200,6 +21200,7 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env) struct bpf_prog *new_prog; enum bpf_access_type type; bool is_narrower_load; + bool seen_direct_write; int epilogue_idx = 0; if (ops->gen_epilogue) { @@ -21227,13 +21228,13 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env) } } - if (ops->gen_prologue || env->seen_direct_write) { + seen_direct_write = env->seen_packet_access & PA_F_DIRECT_WRITE; + if (ops->gen_prologue || seen_direct_write) { if (!ops->gen_prologue) { verifier_bug(env, "gen_prologue is null"); return -EFAULT; } - cnt = ops->gen_prologue(insn_buf, env->seen_direct_write, - env->prog); + cnt = ops->gen_prologue(insn_buf, seen_direct_write, env->prog); if (cnt >= INSN_BUF_SIZE) { verifier_bug(env, "prologue is too long"); return -EFAULT; -- 2.43.0 Change gen_prologue() to accept the packet access flags bitmap. This allows gen_prologue() to inspect multiple access patterns when needed. No functional change. Signed-off-by: Jakub Sitnicki --- include/linux/bpf.h | 2 +- kernel/bpf/cgroup.c | 2 +- kernel/bpf/verifier.c | 6 ++---- net/core/filter.c | 15 ++++++++------- net/sched/bpf_qdisc.c | 3 ++- tools/testing/selftests/bpf/test_kmods/bpf_testmod.c | 6 +++--- 6 files changed, 17 insertions(+), 17 deletions(-) diff --git a/include/linux/bpf.h b/include/linux/bpf.h index d808253f2e94..d16987e94d98 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1076,7 +1076,7 @@ struct bpf_verifier_ops { bool (*is_valid_access)(int off, int size, enum bpf_access_type type, const struct bpf_prog *prog, struct bpf_insn_access_aux *info); - int (*gen_prologue)(struct bpf_insn *insn, bool direct_write, + int (*gen_prologue)(struct bpf_insn *insn, u32 pkt_access_flags, const struct bpf_prog *prog); int (*gen_epilogue)(struct bpf_insn *insn, const struct bpf_prog *prog, s16 ctx_stack_off); diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c index 69988af44b37..d96465cd7d43 100644 --- a/kernel/bpf/cgroup.c +++ b/kernel/bpf/cgroup.c @@ -2694,7 +2694,7 @@ static u32 cg_sockopt_convert_ctx_access(enum bpf_access_type type, } static int cg_sockopt_get_prologue(struct bpf_insn *insn_buf, - bool direct_write, + u32 pkt_access_flags, const struct bpf_prog *prog) { /* Nothing to do for sockopt argument. The data is kzalloc'ated. diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 4c84b0cd399e..bb4e70913ab4 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -21200,7 +21200,6 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env) struct bpf_prog *new_prog; enum bpf_access_type type; bool is_narrower_load; - bool seen_direct_write; int epilogue_idx = 0; if (ops->gen_epilogue) { @@ -21228,13 +21227,12 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env) } } - seen_direct_write = env->seen_packet_access & PA_F_DIRECT_WRITE; - if (ops->gen_prologue || seen_direct_write) { + if (ops->gen_prologue || (env->seen_packet_access & PA_F_DIRECT_WRITE)) { if (!ops->gen_prologue) { verifier_bug(env, "gen_prologue is null"); return -EFAULT; } - cnt = ops->gen_prologue(insn_buf, seen_direct_write, env->prog); + cnt = ops->gen_prologue(insn_buf, env->seen_packet_access, env->prog); if (cnt >= INSN_BUF_SIZE) { verifier_bug(env, "prologue is too long"); return -EFAULT; diff --git a/net/core/filter.c b/net/core/filter.c index 4124becf8604..334421910107 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -9042,7 +9042,7 @@ static bool sock_filter_is_valid_access(int off, int size, prog->expected_attach_type); } -static int bpf_noop_prologue(struct bpf_insn *insn_buf, bool direct_write, +static int bpf_noop_prologue(struct bpf_insn *insn_buf, u32 pkt_access_flags, const struct bpf_prog *prog) { /* Neither direct read nor direct write requires any preliminary @@ -9051,12 +9051,12 @@ static int bpf_noop_prologue(struct bpf_insn *insn_buf, bool direct_write, return 0; } -static int bpf_unclone_prologue(struct bpf_insn *insn_buf, bool direct_write, +static int bpf_unclone_prologue(struct bpf_insn *insn_buf, u32 pkt_access_flags, const struct bpf_prog *prog, int drop_verdict) { struct bpf_insn *insn = insn_buf; - if (!direct_write) + if (!(pkt_access_flags & PA_F_DIRECT_WRITE)) return 0; /* if (!skb->cloned) @@ -9125,10 +9125,11 @@ static int bpf_gen_ld_abs(const struct bpf_insn *orig, return insn - insn_buf; } -static int tc_cls_act_prologue(struct bpf_insn *insn_buf, bool direct_write, +static int tc_cls_act_prologue(struct bpf_insn *insn_buf, u32 pkt_access_flags, const struct bpf_prog *prog) { - return bpf_unclone_prologue(insn_buf, direct_write, prog, TC_ACT_SHOT); + return bpf_unclone_prologue(insn_buf, pkt_access_flags, prog, + TC_ACT_SHOT); } static bool tc_cls_act_is_valid_access(int off, int size, @@ -9466,10 +9467,10 @@ static bool sock_ops_is_valid_access(int off, int size, return true; } -static int sk_skb_prologue(struct bpf_insn *insn_buf, bool direct_write, +static int sk_skb_prologue(struct bpf_insn *insn_buf, u32 pkt_access_flags, const struct bpf_prog *prog) { - return bpf_unclone_prologue(insn_buf, direct_write, prog, SK_DROP); + return bpf_unclone_prologue(insn_buf, pkt_access_flags, prog, SK_DROP); } static bool sk_skb_is_valid_access(int off, int size, diff --git a/net/sched/bpf_qdisc.c b/net/sched/bpf_qdisc.c index adcb618a2bfc..ae44d0dab073 100644 --- a/net/sched/bpf_qdisc.c +++ b/net/sched/bpf_qdisc.c @@ -132,7 +132,8 @@ static int bpf_qdisc_btf_struct_access(struct bpf_verifier_log *log, BTF_ID_LIST_SINGLE(bpf_qdisc_init_prologue_ids, func, bpf_qdisc_init_prologue) -static int bpf_qdisc_gen_prologue(struct bpf_insn *insn_buf, bool direct_write, +static int bpf_qdisc_gen_prologue(struct bpf_insn *insn_buf, + u32 direct_access_flags, const struct bpf_prog *prog) { struct bpf_insn *insn = insn_buf; diff --git a/tools/testing/selftests/bpf/test_kmods/bpf_testmod.c b/tools/testing/selftests/bpf/test_kmods/bpf_testmod.c index 8eeebaa951f0..286cbb8c5623 100644 --- a/tools/testing/selftests/bpf/test_kmods/bpf_testmod.c +++ b/tools/testing/selftests/bpf/test_kmods/bpf_testmod.c @@ -1369,7 +1369,7 @@ static int bpf_test_mod_st_ops__test_pro_epilogue(struct st_ops_args *args) static int bpf_cgroup_from_id_id; static int bpf_cgroup_release_id; -static int st_ops_gen_prologue_with_kfunc(struct bpf_insn *insn_buf, bool direct_write, +static int st_ops_gen_prologue_with_kfunc(struct bpf_insn *insn_buf, const struct bpf_prog *prog) { struct bpf_insn *insn = insn_buf; @@ -1445,7 +1445,7 @@ static int st_ops_gen_epilogue_with_kfunc(struct bpf_insn *insn_buf, const struc } #define KFUNC_PRO_EPI_PREFIX "test_kfunc_" -static int st_ops_gen_prologue(struct bpf_insn *insn_buf, bool direct_write, +static int st_ops_gen_prologue(struct bpf_insn *insn_buf, u32 pkt_access_flags, const struct bpf_prog *prog) { struct bpf_insn *insn = insn_buf; @@ -1455,7 +1455,7 @@ static int st_ops_gen_prologue(struct bpf_insn *insn_buf, bool direct_write, return 0; if (!strncmp(prog->aux->name, KFUNC_PRO_EPI_PREFIX, strlen(KFUNC_PRO_EPI_PREFIX))) - return st_ops_gen_prologue_with_kfunc(insn_buf, direct_write, prog); + return st_ops_gen_prologue_with_kfunc(insn_buf, prog); /* r6 = r1[0]; // r6 will be "struct st_ops *args". r1 is "u64 *ctx". * r7 = r6->a; -- 2.43.0 Introduce PA_F_DATA_META_LOAD flag to track when a BPF program loads the skb->data_meta pointer. This information will be used by gen_prologue() to handle cases where there is a gap between metadata end and skb->data, requiring metadata to be realigned. Signed-off-by: Jakub Sitnicki --- include/linux/bpf_verifier.h | 1 + kernel/bpf/verifier.c | 4 ++++ 2 files changed, 5 insertions(+) diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index 42ce94ce96ba..fa330e4dc14a 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -639,6 +639,7 @@ enum priv_stack_mode { enum packet_access_flags { PA_F_DIRECT_WRITE = BIT(0), + PA_F_DATA_META_LOAD = BIT(1), }; struct bpf_subprog_info { diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index bb4e70913ab4..32989e29a5e1 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -6160,6 +6160,10 @@ static int check_ctx_access(struct bpf_verifier_env *env, int insn_idx, int off, } else { env->insn_aux_data[insn_idx].ctx_field_size = info->ctx_field_size; } + + if (base_type(info->reg_type) == PTR_TO_PACKET_META) + env->seen_packet_access |= PA_F_DATA_META_LOAD; + /* remember the offset of last byte accessed in ctx */ if (env->prog->aux->max_ctx_offset < off + size) env->prog->aux->max_ctx_offset = off + size; -- 2.43.0 After decoupling metadata location from MAC header offset, a gap can appear between metadata and skb->data on L2 decapsulation (e.g., VLAN, GRE). This breaks the BPF data_meta pointer which assumes metadata is directly before skb->data. Introduce bpf_skb_meta_realign() kfunc to close the gap by moving metadata to immediately precede the MAC header. Inject a call to it in tc_cls_act_prologue() when the verifier detects data_meta access (PA_F_DATA_META_LOAD flag). Update skb_data_move() to handle the gap case: on skb_push(), move metadata to the top of the head buffer; on skb_pull() where metadata is already detached, leave it in place. This restores data_meta functionality for TC programs while keeping the performance benefit of avoiding memmove on L2 decapsulation for programs that don't use data_meta. Signed-off-by: Jakub Sitnicki --- include/linux/skbuff.h | 25 +++++++++++++++-------- net/core/filter.c | 55 ++++++++++++++++++++++++++++++++++++++++++++++++-- 2 files changed, 70 insertions(+), 10 deletions(-) diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 8868db976e1f..24c4e216d0cb 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -4600,19 +4600,28 @@ static inline void skb_data_move(struct sk_buff *skb, const int len, if (!meta_len) goto no_metadata; - meta_end = skb_metadata_end(skb); - meta = meta_end - meta_len; - - if (WARN_ON_ONCE(meta_end + len != skb->data || - meta_len > skb_headroom(skb))) { + /* Not enough headroom left for metadata. Drop it. */ + if (WARN_ONCE(meta_len > skb_headroom(skb), + "skb headroom smaller than metadata")) { skb_metadata_clear(skb); goto no_metadata; } - memmove(meta + len, meta, meta_len + n); - skb_shinfo(skb)->meta_end += len; - return; + meta_end = skb_metadata_end(skb); + meta = meta_end - meta_len; + /* Metadata in front of data before push/pull. Keep it that way. */ + if (meta_end == skb->data - len) { + memmove(meta + len, meta, meta_len + n); + skb_shinfo(skb)->meta_end += len; + return; + } + + if (len < 0) { + /* Data pushed. Move metadata to the top. */ + memmove(skb->head, meta, meta_len); + skb_shinfo(skb)->meta_end = meta_len; + } no_metadata: memmove(skb->data, skb->data - len, n); } diff --git a/net/core/filter.c b/net/core/filter.c index 334421910107..91100c923f2c 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -9125,11 +9125,62 @@ static int bpf_gen_ld_abs(const struct bpf_insn *orig, return insn - insn_buf; } +__bpf_kfunc_start_defs(); + +__bpf_kfunc void bpf_skb_meta_realign(struct __sk_buff *skb_) +{ + struct sk_buff *skb = (typeof(skb))skb_; + u8 *meta_end = skb_metadata_end(skb); + u8 meta_len = skb_metadata_len(skb); + u8 *meta; + int gap; + + gap = skb_mac_header(skb) - meta_end; + if (!meta_len || !gap) + return; + + if (WARN_ONCE(gap < 0, "skb metadata end past mac header")) { + skb_metadata_clear(skb); + return; + } + + meta = meta_end - meta_len; + memmove(meta + gap, meta, meta_len); + skb_shinfo(skb)->meta_end += gap; + + bpf_compute_data_pointers(skb); +} + +__bpf_kfunc_end_defs(); + +BTF_KFUNCS_START(tc_cls_act_hidden_ids) +BTF_ID_FLAGS(func, bpf_skb_meta_realign, KF_TRUSTED_ARGS) +BTF_KFUNCS_END(tc_cls_act_hidden_ids) + +BTF_ID_LIST_SINGLE(bpf_skb_meta_realign_ids, func, bpf_skb_meta_realign) + static int tc_cls_act_prologue(struct bpf_insn *insn_buf, u32 pkt_access_flags, const struct bpf_prog *prog) { - return bpf_unclone_prologue(insn_buf, pkt_access_flags, prog, - TC_ACT_SHOT); + struct bpf_insn *insn = insn_buf; + int cnt; + + if (pkt_access_flags & PA_F_DATA_META_LOAD) { + /* Realign skb metadata for access through data_meta pointer. + * + * r6 = r1; // r6 will be "u64 *ctx" + * r0 = bpf_skb_meta_realign(r1); // r0 is undefined + * r1 = r6; + */ + *insn++ = BPF_MOV64_REG(BPF_REG_6, BPF_REG_1); + *insn++ = BPF_CALL_KFUNC(0, bpf_skb_meta_realign_ids[0]); + *insn++ = BPF_MOV64_REG(BPF_REG_1, BPF_REG_6); + } + cnt = bpf_unclone_prologue(insn, pkt_access_flags, prog, TC_ACT_SHOT); + if (!cnt && insn > insn_buf) + *insn++ = prog->insnsi[0]; + + return cnt + insn - insn_buf; } static bool tc_cls_act_is_valid_access(int off, int size, -- 2.43.0