From: Lorenzo Bianconi Introduce the `xdp_rx_meta` structure to serve as a container for XDP RX hardware hints within XDP packet buffers. Initially, this structure will accommodate `rx_hash` and `rx_vlan` metadata. (The `rx_timestamp` hint is stored in `skb_shared_info` instead.) A key design aspect is making this metadata accessible both during BPF program execution (via `struct xdp_buff`) and later if a `struct xdp_frame` is materialized (e.g., for XDP_REDIRECT). To achieve this: - The `struct xdp_frame` embeds an `xdp_rx_meta` field directly for storage. - The `struct xdp_buff` includes an `xdp_rx_meta` pointer. This pointer is initialized (in `xdp_prepare_buff`) to point to the memory location within the packet buffer's headroom where the `xdp_frame`'s embedded `rx_meta` field would reside. This setup allows BPF kfuncs, operating on `xdp_buff`, to populate the metadata in the precise location where it will be found if an `xdp_frame` is subsequently created. The availability of this metadata storage area within the buffer is indicated by the `XDP_FLAGS_META_AREA` flag in `xdp_buff->flags` (and propagated to `xdp_frame->flags`). This flag is only set if sufficient headroom (at least `XDP_MIN_HEADROOM`, currently 192 bytes) is present. Specific hints like `XDP_FLAGS_META_RX_HASH` and `XDP_FLAGS_META_RX_VLAN` will then denote which types of metadata have been populated into the `xdp_rx_meta` structure. This patch is a step toward enabling the preservation and use of XDP RX hints across operations like XDP_REDIRECT. 
Signed-off-by: Lorenzo Bianconi Signed-off-by: Jesper Dangaard Brouer --- include/net/xdp.h | 57 +++++++++++++++++++++++++++++++++++------------ net/core/xdp.c | 1 + net/xdp/xsk_buff_pool.c | 4 ++- 3 files changed, 47 insertions(+), 15 deletions(-) diff --git a/include/net/xdp.h b/include/net/xdp.h index b40f1f96cb11..f52742a25212 100644 --- a/include/net/xdp.h +++ b/include/net/xdp.h @@ -71,11 +71,31 @@ struct xdp_txq_info { struct net_device *dev; }; +struct xdp_rx_meta { + struct xdp_rx_meta_hash { + u32 val; + u32 type; /* enum xdp_rss_hash_type */ + } hash; + struct xdp_rx_meta_vlan { + __be16 proto; + u16 tci; + } vlan; +}; + +/* Storage area for HW RX metadata only available with reasonable headroom + * available. Less than XDP_PACKET_HEADROOM due to Intel drivers. + */ +#define XDP_MIN_HEADROOM 192 + enum xdp_buff_flags { XDP_FLAGS_HAS_FRAGS = BIT(0), /* non-linear xdp buff */ XDP_FLAGS_FRAGS_PF_MEMALLOC = BIT(1), /* xdp paged memory is under * pressure */ + XDP_FLAGS_META_AREA = BIT(2), /* storage area available */ + XDP_FLAGS_META_RX_HASH = BIT(3), /* hw rx hash */ + XDP_FLAGS_META_RX_VLAN = BIT(4), /* hw rx vlan */ + XDP_FLAGS_META_RX_TS = BIT(5), /* hw rx timestamp */ }; struct xdp_buff { @@ -87,6 +107,24 @@ struct xdp_buff { struct xdp_txq_info *txq; u32 frame_sz; /* frame size to deduce data_hard_end/reserved tailroom*/ u32 flags; /* supported values defined in xdp_buff_flags */ + struct xdp_rx_meta *rx_meta; /* rx hw metadata pointer in the + * buffer headroom + */ +}; + +struct xdp_frame { + void *data; + u32 len; + u32 headroom; + u32 metasize; /* uses lower 8-bits */ + /* Lifetime of xdp_rxq_info is limited to NAPI/enqueue time, + * while mem_type is valid on remote CPU. 
+ */ + enum xdp_mem_type mem_type:32; + struct net_device *dev_rx; /* used by cpumap */ + u32 frame_sz; + u32 flags; /* supported values defined in xdp_buff_flags */ + struct xdp_rx_meta rx_meta; /* rx hw metadata */ }; static __always_inline bool xdp_buff_has_frags(const struct xdp_buff *xdp) @@ -133,6 +171,9 @@ xdp_prepare_buff(struct xdp_buff *xdp, unsigned char *hard_start, xdp->data = data; xdp->data_end = data + data_len; xdp->data_meta = meta_valid ? data : data + 1; + xdp->flags = (headroom < XDP_MIN_HEADROOM) ? 0 : XDP_FLAGS_META_AREA; + xdp->rx_meta = (void *)(hard_start + + offsetof(struct xdp_frame, rx_meta)); } /* Reserve memory area at end-of data area. @@ -253,20 +294,6 @@ static inline bool xdp_buff_add_frag(struct xdp_buff *xdp, netmem_ref netmem, return true; } -struct xdp_frame { - void *data; - u32 len; - u32 headroom; - u32 metasize; /* uses lower 8-bits */ - /* Lifetime of xdp_rxq_info is limited to NAPI/enqueue time, - * while mem_type is valid on remote CPU. - */ - enum xdp_mem_type mem_type:32; - struct net_device *dev_rx; /* used by cpumap */ - u32 frame_sz; - u32 flags; /* supported values defined in xdp_buff_flags */ -}; - static __always_inline bool xdp_frame_has_frags(const struct xdp_frame *frame) { return !!(frame->flags & XDP_FLAGS_HAS_FRAGS); @@ -355,6 +382,8 @@ void xdp_convert_frame_to_buff(const struct xdp_frame *frame, xdp->data_meta = frame->data - frame->metasize; xdp->frame_sz = frame->frame_sz; xdp->flags = frame->flags; + xdp->rx_meta = xdp->data_hard_start + + offsetof(struct xdp_frame, rx_meta); } static inline diff --git a/net/core/xdp.c b/net/core/xdp.c index 491334b9b8be..bd3110fc7ef8 100644 --- a/net/core/xdp.c +++ b/net/core/xdp.c @@ -606,6 +606,7 @@ struct xdp_frame *xdp_convert_zc_to_xdp_frame(struct xdp_buff *xdp) xdpf->metasize = metasize; xdpf->frame_sz = PAGE_SIZE; xdpf->mem_type = MEM_TYPE_PAGE_ORDER0; + memcpy(&xdpf->rx_meta, xdp->rx_meta, sizeof(*xdp->rx_meta)); xsk_buff_free(xdp); return xdpf; diff --git 
a/net/xdp/xsk_buff_pool.c b/net/xdp/xsk_buff_pool.c index aa9788f20d0d..de42dacdcb25 100644 --- a/net/xdp/xsk_buff_pool.c +++ b/net/xdp/xsk_buff_pool.c @@ -574,7 +574,9 @@ struct xdp_buff *xp_alloc(struct xsk_buff_pool *pool) xskb->xdp.data = xskb->xdp.data_hard_start + XDP_PACKET_HEADROOM; xskb->xdp.data_meta = xskb->xdp.data; - xskb->xdp.flags = 0; + xskb->xdp.flags = XDP_FLAGS_META_AREA; + xskb->xdp.rx_meta = (void *)(xskb->xdp.data_hard_start + + offsetof(struct xdp_frame, rx_meta)); if (pool->dev) xp_dma_sync_for_device(pool, xskb->dma, pool->frame_len); Patchset increased xdp_buff with a pointer 8 bytes, and the bpf/test_run struct xdp_page_head have two xdp_buff's. Thus adjust test with 16 bytes. Signed-off-by: Jesper Dangaard Brouer --- .../selftests/bpf/prog_tests/xdp_do_redirect.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tools/testing/selftests/bpf/prog_tests/xdp_do_redirect.c b/tools/testing/selftests/bpf/prog_tests/xdp_do_redirect.c index dd34b0cc4b4e..35c65518f55a 100644 --- a/tools/testing/selftests/bpf/prog_tests/xdp_do_redirect.c +++ b/tools/testing/selftests/bpf/prog_tests/xdp_do_redirect.c @@ -59,12 +59,12 @@ static int attach_tc_prog(struct bpf_tc_hook *hook, int fd) /* The maximum permissible size is: PAGE_SIZE - sizeof(struct xdp_page_head) - * SKB_DATA_ALIGN(sizeof(struct skb_shared_info)) - XDP_PACKET_HEADROOM = - * 3408 bytes for 64-byte cacheline and 3216 for 256-byte one. + * 3392 bytes for 64-byte cacheline and 3200 for 256-byte one. 
*/ #if defined(__s390x__) -#define MAX_PKT_SIZE 3216 +#define MAX_PKT_SIZE 3200 #else -#define MAX_PKT_SIZE 3408 +#define MAX_PKT_SIZE 3392 #endif #define PAGE_SIZE_4K 4096 From: Lorenzo Bianconi Introduce the following kfuncs to store hw metadata provided by the NIC into the xdp_buff struct: - rx-hash: bpf_xdp_store_rx_hash - rx-vlan: bpf_xdp_store_rx_vlan - rx-hw-ts: bpf_xdp_store_rx_ts Signed-off-by: Lorenzo Bianconi Signed-off-by: Jesper Dangaard Brouer --- include/net/xdp.h | 5 +++++ net/core/xdp.c | 45 +++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 50 insertions(+) diff --git a/include/net/xdp.h b/include/net/xdp.h index f52742a25212..8c7d47e3609b 100644 --- a/include/net/xdp.h +++ b/include/net/xdp.h @@ -153,6 +153,11 @@ static __always_inline void xdp_buff_set_frag_pfmemalloc(struct xdp_buff *xdp) xdp->flags |= XDP_FLAGS_FRAGS_PF_MEMALLOC; } +static __always_inline bool xdp_buff_has_valid_meta_area(struct xdp_buff *xdp) +{ + return !!(xdp->flags & XDP_FLAGS_META_AREA); +} + static __always_inline void xdp_init_buff(struct xdp_buff *xdp, u32 frame_sz, struct xdp_rxq_info *rxq) { diff --git a/net/core/xdp.c b/net/core/xdp.c index bd3110fc7ef8..1ffba57714ea 100644 --- a/net/core/xdp.c +++ b/net/core/xdp.c @@ -963,12 +963,57 @@ __bpf_kfunc int bpf_xdp_metadata_rx_vlan_tag(const struct xdp_md *ctx, return -EOPNOTSUPP; } +__bpf_kfunc int bpf_xdp_store_rx_hash(struct xdp_md *ctx, u32 hash, + enum xdp_rss_hash_type rss_type) +{ + struct xdp_buff *xdp = (struct xdp_buff *)ctx; + + if (!xdp_buff_has_valid_meta_area(xdp)) + return -ENOSPC; + + xdp->rx_meta->hash.val = hash; + xdp->rx_meta->hash.type = rss_type; + xdp->flags |= XDP_FLAGS_META_RX_HASH; + + return 0; +} + +__bpf_kfunc int bpf_xdp_store_rx_vlan(struct xdp_md *ctx, __be16 vlan_proto, + u16 vlan_tci) +{ + struct xdp_buff *xdp = (struct xdp_buff *)ctx; + + if (!xdp_buff_has_valid_meta_area(xdp)) + return -ENOSPC; + + xdp->rx_meta->vlan.proto = vlan_proto; + xdp->rx_meta->vlan.tci = vlan_tci; 
+ xdp->flags |= XDP_FLAGS_META_RX_VLAN; + + return 0; +} + +__bpf_kfunc int bpf_xdp_store_rx_ts(struct xdp_md *ctx, u64 ts) +{ + struct xdp_buff *xdp = (struct xdp_buff *)ctx; + struct skb_shared_info *sinfo = xdp_get_shared_info_from_buff(xdp); + struct skb_shared_hwtstamps *shwt = &sinfo->hwtstamps; + + shwt->hwtstamp = ts; + xdp->flags |= XDP_FLAGS_META_RX_TS; + + return 0; +} + __bpf_kfunc_end_defs(); BTF_KFUNCS_START(xdp_metadata_kfunc_ids) #define XDP_METADATA_KFUNC(_, __, name, ___) BTF_ID_FLAGS(func, name, KF_TRUSTED_ARGS) XDP_METADATA_KFUNC_xxx #undef XDP_METADATA_KFUNC +BTF_ID_FLAGS(func, bpf_xdp_store_rx_hash) +BTF_ID_FLAGS(func, bpf_xdp_store_rx_vlan) +BTF_ID_FLAGS(func, bpf_xdp_store_rx_ts) BTF_KFUNCS_END(xdp_metadata_kfunc_ids) static const struct btf_kfunc_id_set xdp_metadata_kfunc_set = { From: Lorenzo Bianconi Update the following hw metadata provided by the NIC building the skb from a xdp_frame. - rx hash - rx vlan - rx hw-ts Signed-off-by: Lorenzo Bianconi --- include/net/xdp.h | 15 +++++++++++++++ net/core/xdp.c | 29 ++++++++++++++++++++++++++++- 2 files changed, 43 insertions(+), 1 deletion(-) diff --git a/include/net/xdp.h b/include/net/xdp.h index 8c7d47e3609b..3d1a9711fe82 100644 --- a/include/net/xdp.h +++ b/include/net/xdp.h @@ -310,6 +310,21 @@ xdp_frame_is_frag_pfmemalloc(const struct xdp_frame *frame) return !!(frame->flags & XDP_FLAGS_FRAGS_PF_MEMALLOC); } +static __always_inline bool xdp_frame_has_rx_meta_hash(struct xdp_frame *frame) +{ + return !!(frame->flags & XDP_FLAGS_META_RX_HASH); +} + +static __always_inline bool xdp_frame_has_rx_meta_vlan(struct xdp_frame *frame) +{ + return !!(frame->flags & XDP_FLAGS_META_RX_VLAN); +} + +static __always_inline bool xdp_frame_has_rx_meta_ts(struct xdp_frame *frame) +{ + return !!(frame->flags & XDP_FLAGS_META_RX_TS); +} + #define XDP_BULK_QUEUE_SIZE 16 struct xdp_frame_bulk { int count; diff --git a/net/core/xdp.c b/net/core/xdp.c index 1ffba57714ea..f1b2a3b4ba95 100644 --- a/net/core/xdp.c 
+++ b/net/core/xdp.c @@ -792,6 +792,23 @@ struct sk_buff *xdp_build_skb_from_zc(struct xdp_buff *xdp) } EXPORT_SYMBOL_GPL(xdp_build_skb_from_zc); +static void xdp_set_skb_rx_hash_from_meta(struct xdp_frame *frame, + struct sk_buff *skb) +{ + enum pkt_hash_types hash_type = PKT_HASH_TYPE_NONE; + + if (!xdp_frame_has_rx_meta_hash(frame)) + return; + + if (frame->rx_meta.hash.type & XDP_RSS_TYPE_L4_ANY) + hash_type = PKT_HASH_TYPE_L4; + else if (frame->rx_meta.hash.type & (XDP_RSS_TYPE_L3_IPV4 | + XDP_RSS_TYPE_L3_IPV6)) + hash_type = PKT_HASH_TYPE_L3; + + skb_set_hash(skb, frame->rx_meta.hash.val, hash_type); +} + struct sk_buff *__xdp_build_skb_from_frame(struct xdp_frame *xdpf, struct sk_buff *skb, struct net_device *dev) @@ -800,11 +817,15 @@ struct sk_buff *__xdp_build_skb_from_frame(struct xdp_frame *xdpf, unsigned int headroom, frame_size; void *hard_start; u8 nr_frags; + u64 ts; /* xdp frags frame */ if (unlikely(xdp_frame_has_frags(xdpf))) nr_frags = sinfo->nr_frags; + if (unlikely(xdp_frame_has_rx_meta_ts(xdpf))) + ts = sinfo->hwtstamps.hwtstamp; + /* Part of headroom was reserved to xdpf */ headroom = sizeof(*xdpf) + xdpf->headroom; @@ -832,9 +853,15 @@ struct sk_buff *__xdp_build_skb_from_frame(struct xdp_frame *xdpf, /* Essential SKB info: protocol and skb->dev */ skb->protocol = eth_type_trans(skb, dev); + xdp_set_skb_rx_hash_from_meta(xdpf, skb); + if (xdp_frame_has_rx_meta_vlan(xdpf)) + __vlan_hwaccel_put_tag(skb, xdpf->rx_meta.vlan.proto, + xdpf->rx_meta.vlan.tci); + if (unlikely(xdp_frame_has_rx_meta_ts(xdpf))) + skb_hwtstamps(skb)->hwtstamp = ts; + /* Optional SKB info, currently missing: * - HW checksum info (skb->ip_summed) - * - HW RX hash (skb_set_hash) * - RX ring dev queue index (skb_record_rx_queue) */ From: Lorenzo Bianconi Report xdp_rx_meta info if available in xdp_buff struct in xdp_metadata_ops callbacks for veth driver Signed-off-by: Lorenzo Bianconi --- drivers/net/veth.c | 12 +++++++++++ include/net/xdp.h | 57 
++++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 69 insertions(+) diff --git a/drivers/net/veth.c b/drivers/net/veth.c index a3046142cb8e..c3a08b7d8192 100644 --- a/drivers/net/veth.c +++ b/drivers/net/veth.c @@ -1651,6 +1651,10 @@ static int veth_xdp(struct net_device *dev, struct netdev_bpf *xdp) static int veth_xdp_rx_timestamp(const struct xdp_md *ctx, u64 *timestamp) { struct veth_xdp_buff *_ctx = (void *)ctx; + const struct xdp_buff *xdp = &_ctx->xdp; + + if (!xdp_load_rx_ts_from_buff(xdp, timestamp)) + return 0; if (!_ctx->skb) return -ENODATA; @@ -1663,8 +1667,12 @@ static int veth_xdp_rx_hash(const struct xdp_md *ctx, u32 *hash, enum xdp_rss_hash_type *rss_type) { struct veth_xdp_buff *_ctx = (void *)ctx; + const struct xdp_buff *xdp = &_ctx->xdp; struct sk_buff *skb = _ctx->skb; + if (!xdp_load_rx_hash_from_buff(xdp, hash, rss_type)) + return 0; + if (!skb) return -ENODATA; @@ -1678,9 +1686,13 @@ static int veth_xdp_rx_vlan_tag(const struct xdp_md *ctx, __be16 *vlan_proto, u16 *vlan_tci) { const struct veth_xdp_buff *_ctx = (void *)ctx; + const struct xdp_buff *xdp = &_ctx->xdp; const struct sk_buff *skb = _ctx->skb; int err; + if (!xdp_load_rx_vlan_tag_from_buff(xdp, vlan_proto, vlan_tci)) + return 0; + if (!skb) return -ENODATA; diff --git a/include/net/xdp.h b/include/net/xdp.h index 3d1a9711fe82..2b495feedfb0 100644 --- a/include/net/xdp.h +++ b/include/net/xdp.h @@ -158,6 +158,23 @@ static __always_inline bool xdp_buff_has_valid_meta_area(struct xdp_buff *xdp) return !!(xdp->flags & XDP_FLAGS_META_AREA); } +static __always_inline bool +xdp_buff_has_rx_meta_hash(const struct xdp_buff *xdp) +{ + return !!(xdp->flags & XDP_FLAGS_META_RX_HASH); +} + +static __always_inline bool +xdp_buff_has_rx_meta_vlan(const struct xdp_buff *xdp) +{ + return !!(xdp->flags & XDP_FLAGS_META_RX_VLAN); +} + +static __always_inline bool xdp_buff_has_rx_meta_ts(const struct xdp_buff *xdp) +{ + return !!(xdp->flags & XDP_FLAGS_META_RX_TS); +} + static 
__always_inline void xdp_init_buff(struct xdp_buff *xdp, u32 frame_sz, struct xdp_rxq_info *rxq) { @@ -712,4 +729,44 @@ static __always_inline u32 bpf_prog_run_xdp(const struct bpf_prog *prog, return act; } + +static inline int xdp_load_rx_hash_from_buff(const struct xdp_buff *xdp, + u32 *hash, + enum xdp_rss_hash_type *rss_type) +{ + if (!xdp_buff_has_rx_meta_hash(xdp)) + return -ENODATA; + + *hash = xdp->rx_meta->hash.val; + *rss_type = xdp->rx_meta->hash.type; + + return 0; +} + +static inline int xdp_load_rx_vlan_tag_from_buff(const struct xdp_buff *xdp, + __be16 *vlan_proto, + u16 *vlan_tci) +{ + if (!xdp_buff_has_rx_meta_vlan(xdp)) + return -ENODATA; + + *vlan_proto = xdp->rx_meta->vlan.proto; + *vlan_tci = xdp->rx_meta->vlan.tci; + + return 0; +} + +static inline int xdp_load_rx_ts_from_buff(const struct xdp_buff *xdp, u64 *ts) +{ + struct skb_shared_info *sinfo; + + if (!xdp_buff_has_rx_meta_ts(xdp)) + return -ENODATA; + + sinfo = xdp_get_shared_info_from_buff(xdp); + *ts = sinfo->hwtstamps.hwtstamp; + + return 0; +} + #endif /* __LINUX_NET_XDP_H__ */ From: Lorenzo Bianconi Introduce bpf selftests for the XDP rx_meta store kfuncs. 
Signed-off-by: Lorenzo Bianconi --- .../testing/selftests/bpf/prog_tests/xdp_rxmeta.c | 166 ++++++++++++++++++++ .../selftests/bpf/progs/xdp_rxmeta_receiver.c | 44 +++++ .../selftests/bpf/progs/xdp_rxmeta_redirect.c | 43 +++++ 3 files changed, 253 insertions(+) create mode 100644 tools/testing/selftests/bpf/prog_tests/xdp_rxmeta.c create mode 100644 tools/testing/selftests/bpf/progs/xdp_rxmeta_receiver.c create mode 100644 tools/testing/selftests/bpf/progs/xdp_rxmeta_redirect.c diff --git a/tools/testing/selftests/bpf/prog_tests/xdp_rxmeta.c b/tools/testing/selftests/bpf/prog_tests/xdp_rxmeta.c new file mode 100644 index 000000000000..d5c181684ff8 --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/xdp_rxmeta.c @@ -0,0 +1,166 @@ +// SPDX-License-Identifier: GPL-2.0 +#include +#include +#include +#include + +#include "xdp_rxmeta_redirect.skel.h" +#include "xdp_rxmeta_receiver.skel.h" + +#define LOCAL_NETNS_NAME "local" +#define FWD_NETNS_NAME "forward" +#define DST_NETNS_NAME "dest" + +#define LOCAL_NAME "local" +#define FWD0_NAME "fwd0" +#define FWD1_NAME "fwd1" +#define DST_NAME "dest" + +#define LOCAL_MAC "00:00:00:00:00:01" +#define FWD0_MAC "00:00:00:00:00:02" +#define FWD1_MAC "00:00:00:00:01:01" +#define DST_MAC "00:00:00:00:01:02" + +#define LOCAL_ADDR "10.0.0.1" +#define FWD0_ADDR "10.0.0.2" +#define FWD1_ADDR "20.0.0.1" +#define DST_ADDR "20.0.0.2" + +#define PREFIX_LEN "8" +#define NUM_PACKETS 10 + +static int run_ping(const char *dst, int num_ping) +{ + SYS(fail, "ping -c%d -W1 -i0.5 %s >/dev/null", num_ping, dst); + return 0; +fail: + return -1; +} + +void test_xdp_rxmeta(void) +{ + struct xdp_rxmeta_redirect *skel_redirect = NULL; + struct xdp_rxmeta_receiver *skel_receiver = NULL; + struct bpf_devmap_val val = {}; + struct nstoken *tok = NULL; + struct bpf_program *prog; + __u32 key = 0, stats; + int ret, index; + + SYS(out, "ip netns add " LOCAL_NETNS_NAME); + SYS(out, "ip netns add " FWD_NETNS_NAME); + SYS(out, "ip netns add " 
DST_NETNS_NAME); + + tok = open_netns(LOCAL_NETNS_NAME); + if (!ASSERT_OK_PTR(tok, "setns")) + goto out; + + SYS(out, "ip link add " LOCAL_NAME " type veth peer " FWD0_NAME); + SYS(out, "ip link set " FWD0_NAME " netns " FWD_NETNS_NAME); + SYS(out, "ip link set dev " LOCAL_NAME " address " LOCAL_MAC); + SYS(out, "ip addr add " LOCAL_ADDR "/" PREFIX_LEN " dev " LOCAL_NAME); + SYS(out, "ip link set dev " LOCAL_NAME " up"); + SYS(out, "ip route add default via " FWD0_ADDR); + close_netns(tok); + + tok = open_netns(DST_NETNS_NAME); + if (!ASSERT_OK_PTR(tok, "setns")) + goto out; + + SYS(out, "ip link add " DST_NAME " type veth peer " FWD1_NAME); + SYS(out, "ip link set " FWD1_NAME " netns " FWD_NETNS_NAME); + SYS(out, "ip link set dev " DST_NAME " address " DST_MAC); + SYS(out, "ip addr add " DST_ADDR "/" PREFIX_LEN " dev " DST_NAME); + SYS(out, "ip link set dev " DST_NAME " up"); + SYS(out, "ip route add default via " FWD1_ADDR); + + skel_receiver = xdp_rxmeta_receiver__open(); + if (!ASSERT_OK_PTR(skel_receiver, "open skel_receiver")) + goto out; + + prog = bpf_object__find_program_by_name(skel_receiver->obj, + "xdp_rxmeta_receiver"); + index = if_nametoindex(DST_NAME); + bpf_program__set_ifindex(prog, index); + bpf_program__set_flags(prog, BPF_F_XDP_DEV_BOUND_ONLY); + + if (!ASSERT_OK(xdp_rxmeta_receiver__load(skel_receiver), + "load skel_receiver")) + goto out; + + ret = bpf_xdp_attach(index, + bpf_program__fd(skel_receiver->progs.xdp_rxmeta_receiver), + XDP_FLAGS_DRV_MODE, NULL); + if (!ASSERT_GE(ret, 0, "bpf_xdp_attach rx_meta_redirect")) + goto out; + + close_netns(tok); + tok = open_netns(FWD_NETNS_NAME); + if (!ASSERT_OK_PTR(tok, "setns")) + goto out; + + SYS(out, "ip link set dev " FWD0_NAME " address " FWD0_MAC); + SYS(out, "ip addr add " FWD0_ADDR "/" PREFIX_LEN " dev " FWD0_NAME); + SYS(out, "ip link set dev " FWD0_NAME " up"); + + SYS(out, "ip link set dev " FWD1_NAME " address " FWD1_MAC); + SYS(out, "ip addr add " FWD1_ADDR "/" PREFIX_LEN " dev " 
FWD1_NAME); + SYS(out, "ip link set dev " FWD1_NAME " up"); + + SYS(out, "sysctl -qw net.ipv4.conf.all.forwarding=1"); + + skel_redirect = xdp_rxmeta_redirect__open(); + if (!ASSERT_OK_PTR(skel_redirect, "open skel_redirect")) + goto out; + + prog = bpf_object__find_program_by_name(skel_redirect->obj, + "xdp_rxmeta_redirect"); + index = if_nametoindex(FWD0_NAME); + bpf_program__set_ifindex(prog, index); + bpf_program__set_flags(prog, BPF_F_XDP_DEV_BOUND_ONLY); + + if (!ASSERT_OK(xdp_rxmeta_redirect__load(skel_redirect), + "load skel_redirect")) + goto out; + + val.ifindex = if_nametoindex(FWD1_NAME); + ret = bpf_map_update_elem(bpf_map__fd(skel_redirect->maps.dev_map), + &key, &val, 0); + if (!ASSERT_GE(ret, 0, "bpf_map_update_elem")) + goto out; + + ret = bpf_xdp_attach(index, + bpf_program__fd(skel_redirect->progs.xdp_rxmeta_redirect), + XDP_FLAGS_DRV_MODE, NULL); + if (!ASSERT_GE(ret, 0, "bpf_xdp_attach rxmeta_redirect")) + goto out; + + close_netns(tok); + tok = open_netns(LOCAL_NETNS_NAME); + if (!ASSERT_OK_PTR(tok, "setns")) + goto out; + + if (!ASSERT_OK(run_ping(DST_ADDR, NUM_PACKETS), "ping")) + goto out; + + close_netns(tok); + tok = open_netns(DST_NETNS_NAME); + if (!ASSERT_OK_PTR(tok, "setns")) + goto out; + + ret = bpf_map__lookup_elem(skel_receiver->maps.stats, + &key, sizeof(key), + &stats, sizeof(stats), 0); + if (!ASSERT_GE(ret, 0, "bpf_map_update_elem")) + goto out; + + ASSERT_EQ(stats, NUM_PACKETS, "rx_meta stats"); +out: + xdp_rxmeta_redirect__destroy(skel_redirect); + xdp_rxmeta_receiver__destroy(skel_receiver); + if (tok) + close_netns(tok); + SYS_NOFAIL("ip netns del " LOCAL_NETNS_NAME); + SYS_NOFAIL("ip netns del " FWD_NETNS_NAME); + SYS_NOFAIL("ip netns del " DST_NETNS_NAME); +} diff --git a/tools/testing/selftests/bpf/progs/xdp_rxmeta_receiver.c b/tools/testing/selftests/bpf/progs/xdp_rxmeta_receiver.c new file mode 100644 index 000000000000..1033fa558970 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/xdp_rxmeta_receiver.c @@ -0,0 
+1,44 @@ +// SPDX-License-Identifier: GPL-2.0 +#define BPF_NO_KFUNC_PROTOTYPES +#include +#include +#include + +extern int bpf_xdp_metadata_rx_hash(const struct xdp_md *ctx, __u32 *hash, + enum xdp_rss_hash_type *rss_type) __ksym; +extern int bpf_xdp_metadata_rx_timestamp(const struct xdp_md *ctx, + __u64 *timestamp) __ksym; + +#define RX_TIMESTAMP 0x12345678 +#define RX_HASH 0x1234 + +struct { + __uint(type, BPF_MAP_TYPE_ARRAY); + __type(key, __u32); + __type(value, __u32); + __uint(max_entries, 1); +} stats SEC(".maps"); + +SEC("xdp") +int xdp_rxmeta_receiver(struct xdp_md *ctx) +{ + enum xdp_rss_hash_type rss_type; + __u64 timestamp; + __u32 hash; + + if (!bpf_xdp_metadata_rx_hash(ctx, &hash, &rss_type) && + !bpf_xdp_metadata_rx_timestamp(ctx, &timestamp)) { + if (hash == RX_HASH && rss_type == XDP_RSS_L4_TCP && + timestamp == RX_TIMESTAMP) { + __u32 *val, key = 0; + + val = bpf_map_lookup_elem(&stats, &key); + if (val) + __sync_add_and_fetch(val, 1); + } + } + + return XDP_PASS; +} + +char _license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/progs/xdp_rxmeta_redirect.c b/tools/testing/selftests/bpf/progs/xdp_rxmeta_redirect.c new file mode 100644 index 000000000000..635cbae64f53 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/xdp_rxmeta_redirect.c @@ -0,0 +1,43 @@ +// SPDX-License-Identifier: GPL-2.0 +#include +#include +#include + +#define RX_TIMESTAMP 0x12345678 +#define RX_HASH 0x1234 + +#define ETH_ALEN 6 +#define ETH_P_IP 0x0800 + +struct { + __uint(type, BPF_MAP_TYPE_DEVMAP); + __uint(key_size, sizeof(__u32)); + __uint(value_size, sizeof(struct bpf_devmap_val)); + __uint(max_entries, 1); +} dev_map SEC(".maps"); + +SEC("xdp") +int xdp_rxmeta_redirect(struct xdp_md *ctx) +{ + __u8 src_mac[] = { 0x00, 0x00, 0x00, 0x00, 0x01, 0x01 }; + __u8 dst_mac[] = { 0x00, 0x00, 0x00, 0x00, 0x01, 0x02 }; + void *data_end = (void *)(long)ctx->data_end; + void *data = (void *)(long)ctx->data; + struct ethhdr *eh = data; + + if (eh + 1 > (struct 
ethhdr *)data_end) + return XDP_DROP; + + if (eh->h_proto != bpf_htons(ETH_P_IP)) + return XDP_PASS; + + __builtin_memcpy(eh->h_source, src_mac, ETH_ALEN); + __builtin_memcpy(eh->h_dest, dst_mac, ETH_ALEN); + + bpf_xdp_store_rx_hash(ctx, RX_HASH, XDP_RSS_L4_TCP); + bpf_xdp_store_rx_ts(ctx, RX_TIMESTAMP); + + return bpf_redirect_map(&dev_map, ctx->rx_queue_index, XDP_PASS); +} + +char _license[] SEC("license") = "GPL"; Update the documentation[1] based on the changes in this patchset. [1] https://docs.kernel.org/networking/xdp-rx-metadata.html Signed-off-by: Jesper Dangaard Brouer --- Documentation/networking/xdp-rx-metadata.rst | 77 +++++++++++++++++++++----- net/core/xdp.c | 32 +++++++++++ 2 files changed, 93 insertions(+), 16 deletions(-) diff --git a/Documentation/networking/xdp-rx-metadata.rst b/Documentation/networking/xdp-rx-metadata.rst index a6e0ece18be5..e2b89c066a82 100644 --- a/Documentation/networking/xdp-rx-metadata.rst +++ b/Documentation/networking/xdp-rx-metadata.rst @@ -90,22 +90,67 @@ the ``data_meta`` pointer. In the future, we'd like to support a case where an XDP program can override some of the metadata used for building ``skbs``. -bpf_redirect_map -================ - -``bpf_redirect_map`` can redirect the frame to a different device. -Some devices (like virtual ethernet links) support running a second XDP -program after the redirect. However, the final consumer doesn't have -access to the original hardware descriptor and can't access any of -the original metadata. The same applies to XDP programs installed -into devmaps and cpumaps. - -This means that for redirected packets only custom metadata is -currently supported, which has to be prepared by the initial XDP program -before redirect. If the frame is eventually passed to the kernel, the -``skb`` created from such a frame won't have any hardware metadata populated -in its ``skb``. If such a packet is later redirected into an ``XSK``, -that will also only have access to the custom metadata. 
+XDP_REDIRECT +============ + +The ``XDP_REDIRECT`` action forwards an XDP frame (``xdp_frame``) to another net +device or a CPU (via cpumap/devmap) for further processing. It is invoked using +BPF helpers like ``bpf_redirect_map()`` or ``bpf_redirect()``. When an XDP +frame is redirected, the recipient (e.g., an XDP program on a veth device, or +the kernel stack via cpumap) naturally loses direct access to the original NIC's +hardware descriptor and thus its hardware metadata hints. + +By default, if an ``xdp_frame`` is redirected and then converted to an ``skb``, +its fields for hardware-derived metadata like ``skb->hash`` are not +populated. When this occurs, the network stack recalculates the hash in +software. This is particularly problematic for encapsulated tunnel traffic +(e.g., IPsec, GRE), as the software hash is based on the outer headers. For a +single tunnel, this can cause all flows to receive the same hash, leading to +poor load balancing when redirected to a veth device or processed by cpumap. + +To solve this, a BPF program can calculate a more appropriate hint from the +packet data (e.g., from the inner headers of a tunnel) and store it for later +use. While it is also possible for the BPF program to propagate existing +hardware hints, this is not useful for the tunnel use case; it is unnecessary to +read the existing hardware metadata hint, as it is based on the outer headers +and must be recalculated to correctly reflect the inner flow. + +For example, a BPF program can perform partial decryption on an IPsec packet, +calculate a hash from the inner headers, and use ``bpf_xdp_store_rx_hash()`` to +save it. This ensures that when the packet is redirected to a veth device, it is +placed on the correct RX queue, achieving proper load balancing. 
+ +When these kfuncs are used to store hints before redirection: + +* If the ``xdp_frame`` is converted to an ``skb``, the networking stack will use + the stored hints to populate the corresponding ``skb`` fields (e.g., + ``skb->hash``, ``skb->vlan_tci``, timestamps). + +* When running a second XDP program after the redirect, the veth driver supports + access to the previously stored metadata through the normal reader + kfuncs. + +The BPF programmer must explicitly call these "store" kfuncs to save the desired +hints. The NIC driver is responsible for ensuring sufficient headroom is +available; kfuncs may return ``-ENOSPC`` if space is inadequate. + +Kfuncs are available for storing RX hash (``bpf_xdp_store_rx_hash()``), +VLAN information (``bpf_xdp_store_rx_vlan()``), and hardware timestamps +(``bpf_xdp_store_rx_ts()``). Consult the kfunc API documentation for usage +details, expected data, return codes, and relevant XDP flags that may +indicate success or metadata availability. + +Kfuncs for **store** operations: + +.. kernel-doc:: net/core/xdp.c + :identifiers: bpf_xdp_store_rx_ts + +.. kernel-doc:: net/core/xdp.c + :identifiers: bpf_xdp_store_rx_hash + +.. kernel-doc:: net/core/xdp.c + :identifiers: bpf_xdp_store_rx_vlan + bpf_tail_call ============= diff --git a/net/core/xdp.c b/net/core/xdp.c index f1b2a3b4ba95..e8faf9f6fc7e 100644 --- a/net/core/xdp.c +++ b/net/core/xdp.c @@ -990,6 +990,18 @@ __bpf_kfunc int bpf_xdp_metadata_rx_vlan_tag(const struct xdp_md *ctx, return -EOPNOTSUPP; } +/** + * bpf_xdp_store_rx_hash - Store XDP frame RX hash. + * @ctx: XDP context pointer. + * @hash: 32-bit hash value. + * @rss_type: RSS hash type. + * + * The RSS hash type (@rss_type) is as described in bpf_xdp_metadata_rx_hash(). + * + * Return: + * * Returns 0 on success or ``-errno`` on error. 
+ * * ``-ENOSPC`` : means the device driver doesn't provide enough headroom for storing the metadata + */ __bpf_kfunc int bpf_xdp_store_rx_hash(struct xdp_md *ctx, u32 hash, enum xdp_rss_hash_type rss_type) { @@ -1005,6 +1017,18 @@ __bpf_kfunc int bpf_xdp_store_rx_hash(struct xdp_md *ctx, u32 hash, return 0; } +/** + * bpf_xdp_store_rx_vlan - Store XDP packet outermost VLAN tag + * @ctx: XDP context pointer. + * @vlan_proto: VLAN protocol stored in **network byte order (BE)** + * @vlan_tci: VLAN TCI (VID + DEI + PCP) stored in **host byte order** + * + * See bpf_xdp_metadata_rx_vlan_tag() for byte order reasoning. + * + * Return: + * * Returns 0 on success or ``-errno`` on error. + * * ``-ENOSPC`` : means the device driver doesn't provide enough headroom for storing the metadata + */ __bpf_kfunc int bpf_xdp_store_rx_vlan(struct xdp_md *ctx, __be16 vlan_proto, u16 vlan_tci) { @@ -1020,6 +1044,14 @@ __bpf_kfunc int bpf_xdp_store_rx_vlan(struct xdp_md *ctx, __be16 vlan_proto, return 0; } +/** + * bpf_xdp_store_rx_ts - Store XDP frame RX timestamp. + * @ctx: XDP context pointer. + * @ts: Timestamp value. + * + * Return: + * * Returns 0 on success or ``-errno`` on error. + */ __bpf_kfunc int bpf_xdp_store_rx_ts(struct xdp_md *ctx, u64 ts) { struct xdp_buff *xdp = (struct xdp_buff *)ctx;