Currently, the generic XDP hook uses xdp_rxq_info from netstack Rx queues, which do not have their XDP memory model registered. There is a case when an XDP program calls the bpf_xdp_adjust_tail() BPF helper, which in turn releases the underlying memory. This happens when it consumes enough bytes and when the XDP buffer has fragments. For this action, knowledge of the memory model passed to the XDP program is crucial so that the core can call a suitable function for freeing/recycling the page. For netstack queues it defaults to MEM_TYPE_PAGE_SHARED (0) due to the lack of memory model registration. The problem we are fixing here arises when the kernel has copied the skb to a new buffer backed by the system's page_pool and the XDP buffer is built around it. Then, when bpf_xdp_adjust_tail() calls __xdp_return(), it acts incorrectly because the mem type is not set to MEM_TYPE_PAGE_POOL, and this causes a page leak. Pull the existing code that inits/prepares the xdp_buff out of bpf_prog_run_generic_xdp() into a new helper, xdp_convert_skb_to_buff(), and embed there the rxq mem_type initialization that is assigned to the xdp_buff. This problem was triggered by syzbot as well as by the AF_XDP test suite, which is about to be integrated into BPF CI. 
Reported-by: syzbot+ff145014d6b0ce64a173@syzkaller.appspotmail.com Closes: https://lore.kernel.org/netdev/6756c37b.050a0220.a30f1.019a.GAE@google.com/ Fixes: e6d5dbdd20aa ("xdp: add multi-buff support for xdp running in generic mode") Tested-by: Ihor Solodrai Co-developed-by: Octavian Purdila Signed-off-by: Octavian Purdila # whole analysis, testing, initiating a fix Signed-off-by: Maciej Fijalkowski # commit msg and proposed more robust fix --- include/net/xdp.h | 31 +++++++++++++++++++++++++++++++ net/core/dev.c | 25 ++++--------------------- 2 files changed, 35 insertions(+), 21 deletions(-) diff --git a/include/net/xdp.h b/include/net/xdp.h index aa742f413c35..51f3321e4f94 100644 --- a/include/net/xdp.h +++ b/include/net/xdp.h @@ -384,6 +384,37 @@ struct sk_buff *xdp_build_skb_from_frame(struct xdp_frame *xdpf, struct net_device *dev); struct xdp_frame *xdpf_clone(struct xdp_frame *xdpf); +static inline +void xdp_convert_skb_to_buff(struct sk_buff *skb, struct xdp_buff *xdp, + struct xdp_rxq_info *xdp_rxq) +{ + u32 frame_sz, mac_len; + void *hard_start; + + /* The XDP program wants to see the packet starting at the MAC + * header. + */ + mac_len = skb->data - skb_mac_header(skb); + hard_start = skb->data - skb_headroom(skb); + + /* SKB "head" area always have tailroom for skb_shared_info */ + frame_sz = skb_end_pointer(skb) - skb->head; + frame_sz += SKB_DATA_ALIGN(sizeof(struct skb_shared_info)); + xdp_init_buff(xdp, frame_sz, xdp_rxq); + xdp_prepare_buff(xdp, hard_start, skb_headroom(skb) - mac_len, + skb_headlen(skb) + mac_len, true); + + if (skb_is_nonlinear(skb)) { + skb_shinfo(skb)->xdp_frags_size = skb->data_len; + xdp_buff_set_frags_flag(xdp); + } else { + xdp_buff_clear_frags_flag(xdp); + } + + xdp->rxq->mem.type = page_pool_page_is_pp(virt_to_page(xdp->data)) ? 
+ MEM_TYPE_PAGE_POOL : MEM_TYPE_PAGE_SHARED; +} + static inline void xdp_convert_frame_to_buff(const struct xdp_frame *frame, struct xdp_buff *xdp) diff --git a/net/core/dev.c b/net/core/dev.c index a64cef2c537e..3d607dce292b 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -5320,35 +5320,18 @@ static struct netdev_rx_queue *netif_get_rxqueue(struct sk_buff *skb) u32 bpf_prog_run_generic_xdp(struct sk_buff *skb, struct xdp_buff *xdp, const struct bpf_prog *xdp_prog) { - void *orig_data, *orig_data_end, *hard_start; + void *orig_data, *orig_data_end; struct netdev_rx_queue *rxqueue; bool orig_bcast, orig_host; - u32 mac_len, frame_sz; + u32 metalen, act, mac_len; __be16 orig_eth_type; struct ethhdr *eth; - u32 metalen, act; int off; - /* The XDP program wants to see the packet starting at the MAC - * header. - */ + rxqueue = netif_get_rxqueue(skb); mac_len = skb->data - skb_mac_header(skb); - hard_start = skb->data - skb_headroom(skb); - - /* SKB "head" area always have tailroom for skb_shared_info */ - frame_sz = (void *)skb_end_pointer(skb) - hard_start; - frame_sz += SKB_DATA_ALIGN(sizeof(struct skb_shared_info)); - rxqueue = netif_get_rxqueue(skb); - xdp_init_buff(xdp, frame_sz, &rxqueue->xdp_rxq); - xdp_prepare_buff(xdp, hard_start, skb_headroom(skb) - mac_len, - skb_headlen(skb) + mac_len, true); - if (skb_is_nonlinear(skb)) { - skb_shinfo(skb)->xdp_frags_size = skb->data_len; - xdp_buff_set_frags_flag(xdp); - } else { - xdp_buff_clear_frags_flag(xdp); - } + xdp_convert_skb_to_buff(skb, xdp, &rxqueue->xdp_rxq); orig_data_end = xdp->data_end; orig_data = xdp->data; -- 2.43.0 Veth calls skb_pp_cow_data() which makes the underlying memory to originate from system page_pool. 
For CONFIG_DEBUG_VM=y and XDP program that uses bpf_xdp_adjust_tail(), following splat was observed: [ 32.204881] BUG: Bad page state in process test_progs pfn:11c98b [ 32.207167] page: refcount:0 mapcount:0 mapping:0000000000000000 index:0x0 pfn:0x11c98b [ 32.210084] flags: 0x1fffe0000000000(node=0|zone=1|lastcpupid=0x7fff) [ 32.212493] raw: 01fffe0000000000 dead000000000040 ff11000123c9b000 0000000000000000 [ 32.218056] raw: 0000000000000000 0000000000000001 00000000ffffffff 0000000000000000 [ 32.220900] page dumped because: page_pool leak [ 32.222636] Modules linked in: bpf_testmod(O) bpf_preload [ 32.224632] CPU: 6 UID: 0 PID: 3612 Comm: test_progs Tainted: G O 6.17.0-rc5-gfec474d29325 #6969 PREEMPT [ 32.224638] Tainted: [O]=OOT_MODULE [ 32.224639] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.14.0-0-g155821a1990b-prebuilt.qemu.org 04/01/2014 [ 32.224641] Call Trace: [ 32.224644] [ 32.224646] dump_stack_lvl+0x4b/0x70 [ 32.224653] bad_page.cold+0xbd/0xe0 [ 32.224657] __free_frozen_pages+0x838/0x10b0 [ 32.224660] ? skb_pp_cow_data+0x782/0xc30 [ 32.224665] bpf_xdp_shrink_data+0x221/0x530 [ 32.224668] ? skb_pp_cow_data+0x6d1/0xc30 [ 32.224671] bpf_xdp_adjust_tail+0x598/0x810 [ 32.224673] ? xsk_destruct_skb+0x321/0x800 [ 32.224678] bpf_prog_004ac6bb21de57a7_xsk_xdp_adjust_tail+0x52/0xd6 [ 32.224681] veth_xdp_rcv_skb+0x45d/0x15a0 [ 32.224684] ? get_stack_info_noinstr+0x16/0xe0 [ 32.224688] ? veth_set_channels+0x920/0x920 [ 32.224691] ? get_stack_info+0x2f/0x80 [ 32.224693] ? unwind_next_frame+0x3af/0x1df0 [ 32.224697] veth_xdp_rcv.constprop.0+0x38a/0xbe0 [ 32.224700] ? common_startup_64+0x13e/0x148 [ 32.224703] ? veth_xdp_rcv_one+0xcd0/0xcd0 [ 32.224706] ? stack_trace_save+0x84/0xa0 [ 32.224709] ? stack_depot_save_flags+0x28/0x820 [ 32.224713] ? __resched_curr.constprop.0+0x332/0x3b0 [ 32.224716] ? timerqueue_add+0x217/0x320 [ 32.224719] veth_poll+0x115/0x5e0 [ 32.224722] ? veth_xdp_rcv.constprop.0+0xbe0/0xbe0 [ 32.224726] ? 
update_load_avg+0x1cb/0x12d0 [ 32.224730] ? update_cfs_group+0x121/0x2c0 [ 32.224733] __napi_poll+0xa0/0x420 [ 32.224736] net_rx_action+0x901/0xe90 [ 32.224740] ? run_backlog_napi+0x50/0x50 [ 32.224743] ? clockevents_program_event+0x1cc/0x280 [ 32.224746] ? hrtimer_interrupt+0x31e/0x7c0 [ 32.224749] handle_softirqs+0x151/0x430 [ 32.224752] do_softirq+0x3f/0x60 [ 32.224755] It's because xdp_rxq with mem model set to MEM_TYPE_PAGE_SHARED was used when initializing xdp_buff. Fix this by using new helper xdp_convert_skb_to_buff() that, besides init/prepare xdp_buff, will check if page used for linear part of xdp_buff comes from page_pool. We assume that linear data and frags will have same memory provider as currently XDP API does not provide us a way to distinguish it (the mem model is registered for *whole* Rx queue and here we speak about single buffer granularity). In order to meet expected skb layout by new helper, pull the mac header before conversion from skb to xdp_buff. However, that is not enough as before releasing xdp_buff out of veth via XDP_{TX,REDIRECT}, mem type on xdp_rxq associated with xdp_buff is restored to its original model. We need to respect previous setting at least until buff is converted to frame, as frame carries the mem_type. Add a page_pool variant of veth_xdp_get() so that we avoid refcount underflow when draining page frag. 
Fixes: 0ebab78cbcbf ("net: veth: add page_pool for page recycling") Reported-by: Alexei Starovoitov Closes: https://lore.kernel.org/bpf/CAADnVQ+bBofJDfieyOYzSmSujSfJwDTQhiz3aJw7hE+4E2_iPA@mail.gmail.com/ Signed-off-by: Maciej Fijalkowski --- drivers/net/veth.c | 43 +++++++++++++++++++++++++++---------------- 1 file changed, 27 insertions(+), 16 deletions(-) diff --git a/drivers/net/veth.c b/drivers/net/veth.c index a3046142cb8e..eeeee7bba685 100644 --- a/drivers/net/veth.c +++ b/drivers/net/veth.c @@ -733,7 +733,7 @@ static void veth_xdp_rcv_bulk_skb(struct veth_rq *rq, void **frames, } } -static void veth_xdp_get(struct xdp_buff *xdp) +static void veth_xdp_get_shared(struct xdp_buff *xdp) { struct skb_shared_info *sinfo = xdp_get_shared_info_from_buff(xdp); int i; @@ -746,12 +746,33 @@ static void veth_xdp_get(struct xdp_buff *xdp) __skb_frag_ref(&sinfo->frags[i]); } +static void veth_xdp_get_pp(struct xdp_buff *xdp) +{ + struct skb_shared_info *sinfo = xdp_get_shared_info_from_buff(xdp); + int i; + + page_pool_ref_page(virt_to_page(xdp->data)); + if (likely(!xdp_buff_has_frags(xdp))) + return; + + for (i = 0; i < sinfo->nr_frags; i++) { + skb_frag_t *frag = &sinfo->frags[i]; + + page_pool_ref_page(netmem_to_page(frag->netmem)); + } +} + +static void veth_xdp_get(struct xdp_buff *xdp) +{ + xdp->rxq->mem.type == MEM_TYPE_PAGE_POOL ? 
+ veth_xdp_get_pp(xdp) : veth_xdp_get_shared(xdp); +} + static int veth_convert_skb_to_xdp_buff(struct veth_rq *rq, struct xdp_buff *xdp, struct sk_buff **pskb) { struct sk_buff *skb = *pskb; - u32 frame_sz; if (skb_shared(skb) || skb_head_is_locked(skb) || skb_shinfo(skb)->nr_frags || @@ -762,19 +783,9 @@ static int veth_convert_skb_to_xdp_buff(struct veth_rq *rq, skb = *pskb; } - /* SKB "head" area always have tailroom for skb_shared_info */ - frame_sz = skb_end_pointer(skb) - skb->head; - frame_sz += SKB_DATA_ALIGN(sizeof(struct skb_shared_info)); - xdp_init_buff(xdp, frame_sz, &rq->xdp_rxq); - xdp_prepare_buff(xdp, skb->head, skb_headroom(skb), - skb_headlen(skb), true); + __skb_pull(*pskb, skb->data - skb_mac_header(skb)); - if (skb_is_nonlinear(skb)) { - skb_shinfo(skb)->xdp_frags_size = skb->data_len; - xdp_buff_set_frags_flag(xdp); - } else { - xdp_buff_clear_frags_flag(xdp); - } + xdp_convert_skb_to_buff(skb, xdp, &rq->xdp_rxq); *pskb = skb; return 0; @@ -822,24 +833,24 @@ static struct sk_buff *veth_xdp_rcv_skb(struct veth_rq *rq, case XDP_TX: veth_xdp_get(xdp); consume_skb(skb); - xdp->rxq->mem = rq->xdp_mem; if (unlikely(veth_xdp_tx(rq, xdp, bq) < 0)) { trace_xdp_exception(rq->dev, xdp_prog, act); stats->rx_drops++; goto err_xdp; } stats->xdp_tx++; + rq->xdp_rxq.mem = rq->xdp_mem; rcu_read_unlock(); goto xdp_xmit; case XDP_REDIRECT: veth_xdp_get(xdp); consume_skb(skb); - xdp->rxq->mem = rq->xdp_mem; if (xdp_do_redirect(rq->dev, xdp, xdp_prog)) { stats->rx_drops++; goto err_xdp; } stats->xdp_redirect++; + rq->xdp_rxq.mem = rq->xdp_mem; rcu_read_unlock(); goto xdp_xmit; default: -- 2.43.0