From: Pavel Begunkov

There is a better way to handle the problem IORING_REGISTER_ZCRX_REFILL
solves. Disable it for now and remove the relevant uapi; it'll be reworked
for the next release.

Signed-off-by: Pavel Begunkov
Signed-off-by: David Wei
---
 include/uapi/linux/io_uring.h | 12 ------------
 io_uring/register.c           |  3 ---
 io_uring/zcrx.c               | 11 +++++++++++
 3 files changed, 11 insertions(+), 15 deletions(-)

diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h
index 04797a9b76bc..e96080db3e4d 100644
--- a/include/uapi/linux/io_uring.h
+++ b/include/uapi/linux/io_uring.h
@@ -697,9 +697,6 @@ enum io_uring_register_op {
 	/* query various aspects of io_uring, see linux/io_uring/query.h */
 	IORING_REGISTER_QUERY		= 35,
 
-	/* return zcrx buffers back into circulation */
-	IORING_REGISTER_ZCRX_REFILL	= 36,
-
 	/* this goes last */
 	IORING_REGISTER_LAST,
 
@@ -1081,15 +1078,6 @@ struct io_uring_zcrx_ifq_reg {
 	__u64	__resv[3];
 };
 
-struct io_uring_zcrx_sync_refill {
-	__u32	zcrx_id;
-	/* the number of entries to return */
-	__u32	nr_entries;
-	/* pointer to an array of struct io_uring_zcrx_rqe */
-	__u64	rqes;
-	__u64	__resv[2];
-};
-
 #ifdef __cplusplus
 }
 #endif
diff --git a/io_uring/register.c b/io_uring/register.c
index 1a3e05be6e7b..d8ce1b5cc3a2 100644
--- a/io_uring/register.c
+++ b/io_uring/register.c
@@ -826,9 +826,6 @@ static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode,
 	case IORING_REGISTER_QUERY:
 		ret = io_query(ctx, arg, nr_args);
 		break;
-	case IORING_REGISTER_ZCRX_REFILL:
-		ret = io_zcrx_return_bufs(ctx, arg, nr_args);
-		break;
 	default:
 		ret = -EINVAL;
 		break;
diff --git a/io_uring/zcrx.c b/io_uring/zcrx.c
index a816f5902091..b694fa582d29 100644
--- a/io_uring/zcrx.c
+++ b/io_uring/zcrx.c
@@ -931,6 +931,16 @@ static const struct memory_provider_ops io_uring_pp_zc_ops = {
 #define IO_ZCRX_MAX_SYS_REFILL_BUFS	(1 << 16)
 #define IO_ZCRX_SYS_REFILL_BATCH	32
 
+struct io_uring_zcrx_sync_refill {
+	__u32	zcrx_id;
+	/* the number of entries to return */
+	__u32	nr_entries;
+	/* pointer to an array of struct io_uring_zcrx_rqe */
+	__u64	rqes;
+	__u64	__resv[2];
+};
+
+
 static void io_return_buffers(struct io_zcrx_ifq *ifq,
 			      struct io_uring_zcrx_rqe *rqes, unsigned nr)
 {
@@ -955,6 +965,7 @@ static void io_return_buffers(struct io_zcrx_ifq *ifq,
 	}
 }
 
+__maybe_unused
 int io_zcrx_return_bufs(struct io_ring_ctx *ctx,
 			void __user *arg, unsigned nr_arg)
 {
-- 
2.47.3

From: Pavel Begunkov

Introduce IORING_REGISTER_ZCRX_CTRL and add some boilerplate code
forwarding it to zcrx. There are no actual users in this patch; it'll be
used for refill queue flushing and other features.

Signed-off-by: Pavel Begunkov
Signed-off-by: David Wei
---
 include/uapi/linux/io_uring.h | 13 +++++++++++++
 io_uring/register.c           |  3 +++
 io_uring/zcrx.c               | 21 ++++++++++++++++++++-
 io_uring/zcrx.h               |  7 +++----
 4 files changed, 39 insertions(+), 5 deletions(-)

diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h
index e96080db3e4d..8b4935b983e7 100644
--- a/include/uapi/linux/io_uring.h
+++ b/include/uapi/linux/io_uring.h
@@ -697,6 +697,9 @@ enum io_uring_register_op {
 	/* query various aspects of io_uring, see linux/io_uring/query.h */
 	IORING_REGISTER_QUERY		= 35,
 
+	/* auxiliary zcrx configuration, see enum zcrx_ctrl_op */
+	IORING_REGISTER_ZCRX_CTRL	= 36,
+
 	/* this goes last */
 	IORING_REGISTER_LAST,
 
@@ -1078,6 +1081,16 @@ struct io_uring_zcrx_ifq_reg {
 	__u64	__resv[3];
 };
 
+enum zcrx_ctrl_op {
+	__ZCRX_CTRL_LAST,
+};
+
+struct zcrx_ctrl {
+	__u32	zcrx_id;
+	__u32	op;	/* see enum zcrx_ctrl_op */
+	__u64	resv[8];
+};
+
 #ifdef __cplusplus
 }
 #endif
diff --git a/io_uring/register.c b/io_uring/register.c
index d8ce1b5cc3a2..38b20a7a34db 100644
--- a/io_uring/register.c
+++ b/io_uring/register.c
@@ -826,6 +826,9 @@ static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode,
 	case IORING_REGISTER_QUERY:
 		ret = io_query(ctx, arg, nr_args);
 		break;
+	case IORING_REGISTER_ZCRX_CTRL:
+		ret = io_zcrx_ctrl(ctx, arg, nr_args);
+		break;
 	default:
 		ret = -EINVAL;
 		break;
diff --git a/io_uring/zcrx.c b/io_uring/zcrx.c
index b694fa582d29..3e9d8333a301 100644
--- a/io_uring/zcrx.c
+++ b/io_uring/zcrx.c
@@ -541,6 +541,25 @@ struct io_mapped_region *io_zcrx_get_region(struct io_ring_ctx *ctx,
 	return ifq ? &ifq->region : NULL;
 }
 
+int io_zcrx_ctrl(struct io_ring_ctx *ctx, void __user *arg, unsigned nr_args)
+{
+	struct zcrx_ctrl ctrl;
+	struct io_zcrx_ifq *ifq;
+
+	if (nr_args)
+		return -EINVAL;
+	if (copy_from_user(&ctrl, arg, sizeof(ctrl)))
+		return -EFAULT;
+	if (ctrl.op >= __ZCRX_CTRL_LAST)
+		return -EOPNOTSUPP;
+
+	ifq = xa_load(&ctx->zcrx_ctxs, ctrl.zcrx_id);
+	if (!ifq)
+		return -ENXIO;
+
+	return -EINVAL;
+}
+
 int io_register_zcrx_ifq(struct io_ring_ctx *ctx,
 			 struct io_uring_zcrx_ifq_reg __user *arg)
 {
@@ -966,7 +985,7 @@ static void io_return_buffers(struct io_zcrx_ifq *ifq,
 }
 
 __maybe_unused
-int io_zcrx_return_bufs(struct io_ring_ctx *ctx,
+static int io_zcrx_return_bufs(struct io_ring_ctx *ctx,
 			void __user *arg, unsigned nr_arg)
 {
 	struct io_uring_zcrx_rqe rqes[IO_ZCRX_SYS_REFILL_BATCH];
diff --git a/io_uring/zcrx.h b/io_uring/zcrx.h
index 33ef61503092..d7bef619e8ad 100644
--- a/io_uring/zcrx.h
+++ b/io_uring/zcrx.h
@@ -63,8 +63,7 @@ struct io_zcrx_ifq {
 };
 
 #if defined(CONFIG_IO_URING_ZCRX)
-int io_zcrx_return_bufs(struct io_ring_ctx *ctx,
-			void __user *arg, unsigned nr_arg);
+int io_zcrx_ctrl(struct io_ring_ctx *ctx, void __user *arg, unsigned nr_arg);
 int io_register_zcrx_ifq(struct io_ring_ctx *ctx,
 			 struct io_uring_zcrx_ifq_reg __user *arg);
 void io_unregister_zcrx_ifqs(struct io_ring_ctx *ctx);
@@ -97,8 +96,8 @@ static inline struct io_mapped_region *io_zcrx_get_region(struct io_ring_ctx *ct
 {
 	return NULL;
 }
-static inline int io_zcrx_return_bufs(struct io_ring_ctx *ctx,
-				      void __user *arg, unsigned nr_arg)
+static inline int io_zcrx_ctrl(struct io_ring_ctx *ctx,
+			       void __user *arg, unsigned nr_arg)
 {
 	return -EOPNOTSUPP;
 }
-- 
2.47.3

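For illustration only: a minimal userspace sketch of how the new registration op could be driven once real ops exist, assuming a uapi header that already carries struct zcrx_ctrl. The raw io_uring_register(2) syscall is used since liburing has no helper for this yet, and the wrapper name do_zcrx_ctrl() is made up for the example.

/* hypothetical userspace sketch, not part of the series */
#include <linux/io_uring.h>
#include <string.h>
#include <sys/syscall.h>
#include <unistd.h>

static int do_zcrx_ctrl(int ring_fd, __u32 zcrx_id, __u32 op)
{
	struct zcrx_ctrl ctrl;

	memset(&ctrl, 0, sizeof(ctrl));	/* resv[] must stay zeroed */
	ctrl.zcrx_id = zcrx_id;
	ctrl.op = op;

	/* nr_args must be 0, io_zcrx_ctrl() rejects anything else */
	return syscall(__NR_io_uring_register, ring_fd,
		       IORING_REGISTER_ZCRX_CTRL, &ctrl, 0);
}

With this patch alone every op value trips the ctrl.op >= __ZCRX_CTRL_LAST check and the call fails with -EOPNOTSUPP; real ops arrive in later patches.
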
Remove the unused io_ring_ctx arg from io_region_pin_pages() and
io_region_allocate_pages().

Signed-off-by: David Wei
---
 io_uring/memmap.c | 12 +++++-------
 1 file changed, 5 insertions(+), 7 deletions(-)

diff --git a/io_uring/memmap.c b/io_uring/memmap.c
index aa388ecd4754..d1318079c337 100644
--- a/io_uring/memmap.c
+++ b/io_uring/memmap.c
@@ -131,9 +131,8 @@ static int io_region_init_ptr(struct io_mapped_region *mr)
 	return 0;
 }
 
-static int io_region_pin_pages(struct io_ring_ctx *ctx,
-			       struct io_mapped_region *mr,
-			       struct io_uring_region_desc *reg)
+static int io_region_pin_pages(struct io_mapped_region *mr,
+			       struct io_uring_region_desc *reg)
 {
 	unsigned long size = mr->nr_pages << PAGE_SHIFT;
 	struct page **pages;
@@ -150,8 +149,7 @@ static int io_region_pin_pages(struct io_ring_ctx *ctx,
 	return 0;
 }
 
-static int io_region_allocate_pages(struct io_ring_ctx *ctx,
-				    struct io_mapped_region *mr,
+static int io_region_allocate_pages(struct io_mapped_region *mr,
 				    struct io_uring_region_desc *reg,
 				    unsigned long mmap_offset)
 {
@@ -219,9 +217,9 @@ int io_create_region(struct io_ring_ctx *ctx, struct io_mapped_region *mr,
 	mr->nr_pages = nr_pages;
 
 	if (reg->flags & IORING_MEM_REGION_TYPE_USER)
-		ret = io_region_pin_pages(ctx, mr, reg);
+		ret = io_region_pin_pages(mr, reg);
 	else
-		ret = io_region_allocate_pages(ctx, mr, reg, mmap_offset);
+		ret = io_region_allocate_pages(mr, reg, mmap_offset);
 	if (ret)
 		goto out_free;
 
-- 
2.47.3

Refactor io_free_region() to take user_struct directly, instead of
accessing it from the ring ctx.

Signed-off-by: David Wei
---
 io_uring/io_uring.c | 6 +++---
 io_uring/kbuf.c     | 4 ++--
 io_uring/memmap.c   | 8 ++++----
 io_uring/memmap.h   | 2 +-
 io_uring/register.c | 6 +++---
 io_uring/zcrx.c     | 2 +-
 6 files changed, 14 insertions(+), 14 deletions(-)

diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
index 200b6c4bb2cc..7d42748774f8 100644
--- a/io_uring/io_uring.c
+++ b/io_uring/io_uring.c
@@ -2798,8 +2798,8 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events, u32 flags,
 
 static void io_rings_free(struct io_ring_ctx *ctx)
 {
-	io_free_region(ctx, &ctx->sq_region);
-	io_free_region(ctx, &ctx->ring_region);
+	io_free_region(ctx->user, &ctx->sq_region);
+	io_free_region(ctx->user, &ctx->ring_region);
 	ctx->rings = NULL;
 	ctx->sq_sqes = NULL;
 }
@@ -2884,7 +2884,7 @@ static __cold void io_ring_ctx_free(struct io_ring_ctx *ctx)
 	io_eventfd_unregister(ctx);
 	io_free_alloc_caches(ctx);
 	io_destroy_buffers(ctx);
-	io_free_region(ctx, &ctx->param_region);
+	io_free_region(ctx->user, &ctx->param_region);
 	mutex_unlock(&ctx->uring_lock);
 	if (ctx->sq_creds)
 		put_cred(ctx->sq_creds);
diff --git a/io_uring/kbuf.c b/io_uring/kbuf.c
index c034c90396bc..8a329556f8df 100644
--- a/io_uring/kbuf.c
+++ b/io_uring/kbuf.c
@@ -428,7 +428,7 @@ static int io_remove_buffers_legacy(struct io_ring_ctx *ctx,
 static void io_put_bl(struct io_ring_ctx *ctx, struct io_buffer_list *bl)
 {
 	if (bl->flags & IOBL_BUF_RING)
-		io_free_region(ctx, &bl->region);
+		io_free_region(ctx->user, &bl->region);
 	else
 		io_remove_buffers_legacy(ctx, bl, -1U);
 
@@ -672,7 +672,7 @@ int io_register_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg)
 	io_buffer_add_list(ctx, bl, reg.bgid);
 	return 0;
 fail:
-	io_free_region(ctx, &bl->region);
+	io_free_region(ctx->user, &bl->region);
 	kfree(bl);
 	return ret;
 }
diff --git a/io_uring/memmap.c b/io_uring/memmap.c
index d1318079c337..b1054fe94568 100644
--- a/io_uring/memmap.c
+++ b/io_uring/memmap.c
@@ -88,7 +88,7 @@ enum {
 	IO_REGION_F_SINGLE_REF		= 4,
 };
 
-void io_free_region(struct io_ring_ctx *ctx, struct io_mapped_region *mr)
+void io_free_region(struct user_struct *user, struct io_mapped_region *mr)
 {
 	if (mr->pages) {
 		long nr_refs = mr->nr_pages;
@@ -105,8 +105,8 @@ void io_free_region(struct io_ring_ctx *ctx, struct io_mapped_region *mr)
 	}
 	if ((mr->flags & IO_REGION_F_VMAP) && mr->ptr)
 		vunmap(mr->ptr);
-	if (mr->nr_pages && ctx->user)
-		__io_unaccount_mem(ctx->user, mr->nr_pages);
+	if (mr->nr_pages && user)
+		__io_unaccount_mem(user, mr->nr_pages);
 
 	memset(mr, 0, sizeof(*mr));
 }
@@ -228,7 +228,7 @@ int io_create_region(struct io_ring_ctx *ctx, struct io_mapped_region *mr,
 		goto out_free;
 
 	return 0;
 out_free:
-	io_free_region(ctx, mr);
+	io_free_region(ctx->user, mr);
 	return ret;
 }
diff --git a/io_uring/memmap.h b/io_uring/memmap.h
index 58002976e0c3..a7c476f499d5 100644
--- a/io_uring/memmap.h
+++ b/io_uring/memmap.h
@@ -16,7 +16,7 @@ unsigned long io_uring_get_unmapped_area(struct file *file, unsigned long addr,
 					 unsigned long flags);
 int io_uring_mmap(struct file *file, struct vm_area_struct *vma);
 
-void io_free_region(struct io_ring_ctx *ctx, struct io_mapped_region *mr);
+void io_free_region(struct user_struct *user, struct io_mapped_region *mr);
 int io_create_region(struct io_ring_ctx *ctx, struct io_mapped_region *mr,
 		     struct io_uring_region_desc *reg,
 		     unsigned long mmap_offset);
diff --git a/io_uring/register.c b/io_uring/register.c
index 38b20a7a34db..244d523d1bad 100644
--- a/io_uring/register.c
+++ b/io_uring/register.c
@@ -381,8 +381,8 @@ struct io_ring_ctx_rings {
 static void io_register_free_rings(struct io_ring_ctx *ctx,
 				   struct io_ring_ctx_rings *r)
 {
-	io_free_region(ctx, &r->sq_region);
-	io_free_region(ctx, &r->ring_region);
+	io_free_region(ctx->user, &r->sq_region);
+	io_free_region(ctx->user, &r->ring_region);
 }
 
 #define swap_old(ctx, o, n, field)					\
@@ -604,7 +604,7 @@ static int io_register_mem_region(struct io_ring_ctx *ctx, void __user *uarg)
 	if (ret)
 		return ret;
 	if (copy_to_user(rd_uptr, &rd, sizeof(rd))) {
-		io_free_region(ctx, &region);
+		io_free_region(ctx->user, &region);
 		return -EFAULT;
 	}
 
diff --git a/io_uring/zcrx.c b/io_uring/zcrx.c
index 3e9d8333a301..ec0a76b4f199 100644
--- a/io_uring/zcrx.c
+++ b/io_uring/zcrx.c
@@ -378,7 +378,7 @@ static int io_allocate_rbuf_ring(struct io_zcrx_ifq *ifq,
 
 static void io_free_rbuf_ring(struct io_zcrx_ifq *ifq)
 {
-	io_free_region(ifq->ctx, &ifq->region);
+	io_free_region(ifq->ctx->user, &ifq->region);
 	ifq->rq_ring = NULL;
 	ifq->rqes = NULL;
 }
-- 
2.47.3

Refactor io_{un}account_mem() to take user_struct and mm_struct directly,
instead of accessing them from the ring ctx.

Signed-off-by: David Wei
---
 io_uring/rsrc.c | 26 ++++++++++++++------------
 io_uring/rsrc.h |  6 ++++--
 io_uring/zcrx.c |  5 +++--
 3 files changed, 21 insertions(+), 16 deletions(-)

diff --git a/io_uring/rsrc.c b/io_uring/rsrc.c
index d787c16dc1c3..59135fe84082 100644
--- a/io_uring/rsrc.c
+++ b/io_uring/rsrc.c
@@ -56,27 +56,29 @@ int __io_account_mem(struct user_struct *user, unsigned long nr_pages)
 	return 0;
 }
 
-void io_unaccount_mem(struct io_ring_ctx *ctx, unsigned long nr_pages)
+void io_unaccount_mem(struct user_struct *user, struct mm_struct *mm_account,
+		      unsigned long nr_pages)
 {
-	if (ctx->user)
-		__io_unaccount_mem(ctx->user, nr_pages);
+	if (user)
+		__io_unaccount_mem(user, nr_pages);
 
-	if (ctx->mm_account)
-		atomic64_sub(nr_pages, &ctx->mm_account->pinned_vm);
+	if (mm_account)
+		atomic64_sub(nr_pages, &mm_account->pinned_vm);
 }
 
-int io_account_mem(struct io_ring_ctx *ctx, unsigned long nr_pages)
+int io_account_mem(struct user_struct *user, struct mm_struct *mm_account,
+		   unsigned long nr_pages)
 {
 	int ret;
 
-	if (ctx->user) {
-		ret = __io_account_mem(ctx->user, nr_pages);
+	if (user) {
+		ret = __io_account_mem(user, nr_pages);
 		if (ret)
 			return ret;
 	}
 
-	if (ctx->mm_account)
-		atomic64_add(nr_pages, &ctx->mm_account->pinned_vm);
+	if (mm_account)
+		atomic64_add(nr_pages, &mm_account->pinned_vm);
 
 	return 0;
 }
@@ -145,7 +147,7 @@ static void io_buffer_unmap(struct io_ring_ctx *ctx, struct io_mapped_ubuf *imu)
 	}
 
 	if (imu->acct_pages)
-		io_unaccount_mem(ctx, imu->acct_pages);
+		io_unaccount_mem(ctx->user, ctx->mm_account, imu->acct_pages);
 	imu->release(imu->priv);
 	io_free_imu(ctx, imu);
 }
@@ -684,7 +686,7 @@ static int io_buffer_account_pin(struct io_ring_ctx *ctx, struct page **pages,
 	if (!imu->acct_pages)
 		return 0;
 
-	ret = io_account_mem(ctx, imu->acct_pages);
+	ret = io_account_mem(ctx->user, ctx->mm_account, imu->acct_pages);
 	if (ret)
 		imu->acct_pages = 0;
 	return ret;
diff --git a/io_uring/rsrc.h b/io_uring/rsrc.h
index a3ca6ba66596..d603f6a47f5e 100644
--- a/io_uring/rsrc.h
+++ b/io_uring/rsrc.h
@@ -120,8 +120,10 @@ int io_files_update(struct io_kiocb *req, unsigned int issue_flags);
 int io_files_update_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe);
 
 int __io_account_mem(struct user_struct *user, unsigned long nr_pages);
-int io_account_mem(struct io_ring_ctx *ctx, unsigned long nr_pages);
-void io_unaccount_mem(struct io_ring_ctx *ctx, unsigned long nr_pages);
+int io_account_mem(struct user_struct *user, struct mm_struct *mm_account,
+		   unsigned long nr_pages);
+void io_unaccount_mem(struct user_struct *user, struct mm_struct *mm_account,
+		      unsigned long nr_pages);
 
 static inline void __io_unaccount_mem(struct user_struct *user,
 				      unsigned long nr_pages)
diff --git a/io_uring/zcrx.c b/io_uring/zcrx.c
index ec0a76b4f199..ac9abfd54799 100644
--- a/io_uring/zcrx.c
+++ b/io_uring/zcrx.c
@@ -200,7 +200,7 @@ static int io_import_umem(struct io_zcrx_ifq *ifq,
 	}
 
 	mem->account_pages = io_count_account_pages(pages, nr_pages);
-	ret = io_account_mem(ifq->ctx, mem->account_pages);
+	ret = io_account_mem(ifq->ctx->user, ifq->ctx->mm_account, mem->account_pages);
 	if (ret < 0)
 		mem->account_pages = 0;
 
@@ -389,7 +389,8 @@ static void io_zcrx_free_area(struct io_zcrx_area *area)
 	io_release_area_mem(&area->mem);
 
 	if (area->mem.account_pages)
-		io_unaccount_mem(area->ifq->ctx, area->mem.account_pages);
+		io_unaccount_mem(area->ifq->ctx->user, area->ifq->ctx->mm_account,
+				 area->mem.account_pages);
 
 	kvfree(area->freelist);
 	kvfree(area->nia.niovs);
-- 
2.47.3

Add io_zcrx_ifq arg to io_zcrx_free_area(). A QOL change to reduce line
widths.

Signed-off-by: David Wei
---
 io_uring/zcrx.c | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/io_uring/zcrx.c b/io_uring/zcrx.c
index ac9abfd54799..5dd93e4e0ee7 100644
--- a/io_uring/zcrx.c
+++ b/io_uring/zcrx.c
@@ -383,9 +383,10 @@ static void io_free_rbuf_ring(struct io_zcrx_ifq *ifq)
 	ifq->rqes = NULL;
 }
 
-static void io_zcrx_free_area(struct io_zcrx_area *area)
+static void io_zcrx_free_area(struct io_zcrx_ifq *ifq,
+			      struct io_zcrx_area *area)
 {
-	io_zcrx_unmap_area(area->ifq, area);
+	io_zcrx_unmap_area(ifq, area);
 	io_release_area_mem(&area->mem);
 
 	if (area->mem.account_pages)
@@ -464,7 +465,7 @@ static int io_zcrx_create_area(struct io_zcrx_ifq *ifq,
 	return 0;
 err:
 	if (area)
-		io_zcrx_free_area(area);
+		io_zcrx_free_area(ifq, area);
 	return ret;
 }
 
@@ -523,7 +524,7 @@ static void io_zcrx_ifq_free(struct io_zcrx_ifq *ifq)
 	io_close_queue(ifq);
 
 	if (ifq->area)
-		io_zcrx_free_area(ifq->area);
+		io_zcrx_free_area(ifq, ifq->area);
 
 	if (ifq->dev)
 		put_device(ifq->dev);
-- 
2.47.3

In preparation for removing ifq->ctx and making ifq lifetime independent
of ring ctx, add user_struct and mm_struct to io_zcrx_ifq. In the ifq
cleanup path, these are the only fields used from the main ring ctx to do
accounting. Taking a copy in the ifq allows ifq->ctx to be removed later,
including the ctx->refs held by the ifq.

Signed-off-by: David Wei
---
 io_uring/zcrx.c | 24 ++++++++++++++++++------
 io_uring/zcrx.h |  2 ++
 2 files changed, 20 insertions(+), 6 deletions(-)

diff --git a/io_uring/zcrx.c b/io_uring/zcrx.c
index 5dd93e4e0ee7..dcf5297c0330 100644
--- a/io_uring/zcrx.c
+++ b/io_uring/zcrx.c
@@ -200,7 +200,7 @@ static int io_import_umem(struct io_zcrx_ifq *ifq,
 	}
 
 	mem->account_pages = io_count_account_pages(pages, nr_pages);
-	ret = io_account_mem(ifq->ctx->user, ifq->ctx->mm_account, mem->account_pages);
+	ret = io_account_mem(ifq->user, ifq->mm_account, mem->account_pages);
 	if (ret < 0)
 		mem->account_pages = 0;
 
@@ -344,7 +344,8 @@ static void io_zcrx_get_niov_uref(struct net_iov *niov)
 	atomic_inc(io_get_user_counter(niov));
 }
 
-static int io_allocate_rbuf_ring(struct io_zcrx_ifq *ifq,
+static int io_allocate_rbuf_ring(struct io_ring_ctx *ctx,
+				 struct io_zcrx_ifq *ifq,
 				 struct io_uring_zcrx_ifq_reg *reg,
 				 struct io_uring_region_desc *rd,
 				 u32 id)
@@ -362,7 +363,7 @@ static int io_allocate_rbuf_ring(struct io_zcrx_ifq *ifq,
 	mmap_offset = IORING_MAP_OFF_ZCRX_REGION;
 	mmap_offset += id << IORING_OFF_PBUF_SHIFT;
 
-	ret = io_create_region(ifq->ctx, &ifq->region, rd, mmap_offset);
+	ret = io_create_region(ctx, &ifq->region, rd, mmap_offset);
 	if (ret < 0)
 		return ret;
 
@@ -378,7 +379,7 @@ static int io_allocate_rbuf_ring(struct io_zcrx_ifq *ifq,
 
 static void io_free_rbuf_ring(struct io_zcrx_ifq *ifq)
 {
-	io_free_region(ifq->ctx->user, &ifq->region);
+	io_free_region(ifq->user, &ifq->region);
 	ifq->rq_ring = NULL;
 	ifq->rqes = NULL;
 }
@@ -390,7 +391,7 @@ static void io_zcrx_free_area(struct io_zcrx_ifq *ifq,
 	io_release_area_mem(&area->mem);
 
 	if (area->mem.account_pages)
-		io_unaccount_mem(area->ifq->ctx->user, area->ifq->ctx->mm_account,
+		io_unaccount_mem(ifq->user, ifq->mm_account,
 				 area->mem.account_pages);
 
 	kvfree(area->freelist);
@@ -525,6 +526,9 @@ static void io_zcrx_ifq_free(struct io_zcrx_ifq *ifq)
 	if (ifq->area)
 		io_zcrx_free_area(ifq, ifq->area);
 
+	free_uid(ifq->user);
+	if (ifq->mm_account)
+		mmdrop(ifq->mm_account);
 	if (ifq->dev)
 		put_device(ifq->dev);
 
@@ -607,6 +611,14 @@ int io_register_zcrx_ifq(struct io_ring_ctx *ctx,
 	ifq = io_zcrx_ifq_alloc(ctx);
 	if (!ifq)
 		return -ENOMEM;
+	if (ctx->user) {
+		get_uid(ctx->user);
+		ifq->user = ctx->user;
+	}
+	if (ctx->mm_account) {
+		mmgrab(ctx->mm_account);
+		ifq->mm_account = ctx->mm_account;
+	}
 	ifq->rq_entries = reg.rq_entries;
 
 	scoped_guard(mutex, &ctx->mmap_lock) {
@@ -616,7 +628,7 @@ int io_register_zcrx_ifq(struct io_ring_ctx *ctx,
 		goto ifq_free;
 	}
 
-	ret = io_allocate_rbuf_ring(ifq, &reg, &rd, id);
+	ret = io_allocate_rbuf_ring(ctx, ifq, &reg, &rd, id);
 	if (ret)
 		goto err;
 
diff --git a/io_uring/zcrx.h b/io_uring/zcrx.h
index d7bef619e8ad..2396436643e5 100644
--- a/io_uring/zcrx.h
+++ b/io_uring/zcrx.h
@@ -42,6 +42,8 @@ struct io_zcrx_ifq {
 	struct io_ring_ctx	*ctx;
 	struct io_zcrx_area	*area;
 	unsigned		niov_shift;
+	struct user_struct	*user;
+	struct mm_struct	*mm_account;
 
 	spinlock_t		rq_lock ____cacheline_aligned_in_smp;
 	struct io_uring		*rq_ring;
-- 
2.47.3

In preparation for removing the ctx->refs reference held by an ifq and
removing io_shutdown_zcrx_ifqs(), move io_unregister_zcrx_ifqs() down such
that it can call io_zcrx_scrub().

Signed-off-by: David Wei
---
 io_uring/zcrx.c | 44 ++++++++++++++++++++++----------------------
 1 file changed, 22 insertions(+), 22 deletions(-)

diff --git a/io_uring/zcrx.c b/io_uring/zcrx.c
index dcf5297c0330..bb5cc6ec5b9b 100644
--- a/io_uring/zcrx.c
+++ b/io_uring/zcrx.c
@@ -681,28 +681,6 @@ int io_register_zcrx_ifq(struct io_ring_ctx *ctx,
 	return ret;
 }
 
-void io_unregister_zcrx_ifqs(struct io_ring_ctx *ctx)
-{
-	struct io_zcrx_ifq *ifq;
-
-	lockdep_assert_held(&ctx->uring_lock);
-
-	while (1) {
-		scoped_guard(mutex, &ctx->mmap_lock) {
-			unsigned long id = 0;
-
-			ifq = xa_find(&ctx->zcrx_ctxs, &id, ULONG_MAX, XA_PRESENT);
-			if (ifq)
-				xa_erase(&ctx->zcrx_ctxs, id);
-		}
-		if (!ifq)
-			break;
-		io_zcrx_ifq_free(ifq);
-	}
-
-	xa_destroy(&ctx->zcrx_ctxs);
-}
-
 static struct net_iov *__io_zcrx_get_free_niov(struct io_zcrx_area *area)
 {
 	unsigned niov_idx;
@@ -768,6 +746,28 @@ void io_shutdown_zcrx_ifqs(struct io_ring_ctx *ctx)
 	}
 }
 
+void io_unregister_zcrx_ifqs(struct io_ring_ctx *ctx)
+{
+	struct io_zcrx_ifq *ifq;
+
+	lockdep_assert_held(&ctx->uring_lock);
+
+	while (1) {
+		scoped_guard(mutex, &ctx->mmap_lock) {
+			unsigned long id = 0;
+
+			ifq = xa_find(&ctx->zcrx_ctxs, &id, ULONG_MAX, XA_PRESENT);
+			if (ifq)
+				xa_erase(&ctx->zcrx_ctxs, id);
+		}
+		if (!ifq)
+			break;
+		io_zcrx_ifq_free(ifq);
+	}
+
+	xa_destroy(&ctx->zcrx_ctxs);
+}
+
 static inline u32 io_zcrx_rqring_entries(struct io_zcrx_ifq *ifq)
 {
 	u32 entries;
-- 
2.47.3

Add two refcounts to struct io_zcrx_ifq to reverse the refcounting
relationship, i.e. rings now reference ifqs instead. As a result of this,
remove the ctx->refs that an ifq holds on a ring via the page pool memory
provider.

The first ref is ifq->refs, held by internal users of an ifq, namely rings
and the page pool memory provider associated with an ifq. This is needed
to keep the ifq around until the page pool is destroyed.

The second ref is ifq->user_refs, held by userspace-facing users like
rings. For now, only the ring that created the ifq will have a ref, but
with ifq sharing added, this will include multiple rings.

ifq->refs will be 1 larger than ifq->user_refs, with the extra ref held by
the page pool. Once ifq->user_refs falls to 0, the ifq is cleaned up,
including destroying the page pool. Once the page pool is destroyed,
ifq->refs will fall to 0 and free the ifq.
Since ifqs no longer hold refs to the ring ctx, there is no need to split
ifq cleanup into two steps: io_shutdown_zcrx_ifqs() in io_ring_exit_work()
while waiting for ctx->refs to drop to 0, and io_unregister_zcrx_ifqs()
after. Remove io_shutdown_zcrx_ifqs().

Signed-off-by: David Wei
Co-developed-by: Pavel Begunkov
Signed-off-by: Pavel Begunkov
---
 io_uring/io_uring.c |  5 -----
 io_uring/zcrx.c     | 36 +++++++++++++++++-------------------
 io_uring/zcrx.h     |  8 +++-----
 3 files changed, 20 insertions(+), 29 deletions(-)

diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
index 7d42748774f8..8af5efda9c11 100644
--- a/io_uring/io_uring.c
+++ b/io_uring/io_uring.c
@@ -3042,11 +3042,6 @@ static __cold void io_ring_exit_work(struct work_struct *work)
 			io_cqring_overflow_kill(ctx);
 			mutex_unlock(&ctx->uring_lock);
 		}
-		if (!xa_empty(&ctx->zcrx_ctxs)) {
-			mutex_lock(&ctx->uring_lock);
-			io_shutdown_zcrx_ifqs(ctx);
-			mutex_unlock(&ctx->uring_lock);
-		}
 
 		if (ctx->flags & IORING_SETUP_DEFER_TASKRUN)
 			io_move_task_work_from_local(ctx);
diff --git a/io_uring/zcrx.c b/io_uring/zcrx.c
index bb5cc6ec5b9b..00498e3dcbd3 100644
--- a/io_uring/zcrx.c
+++ b/io_uring/zcrx.c
@@ -479,9 +479,10 @@ static struct io_zcrx_ifq *io_zcrx_ifq_alloc(struct io_ring_ctx *ctx)
 		return NULL;
 
 	ifq->if_rxq = -1;
-	ifq->ctx = ctx;
 	spin_lock_init(&ifq->rq_lock);
 	mutex_init(&ifq->pp_lock);
+	refcount_set(&ifq->refs, 1);
+	refcount_set(&ifq->user_refs, 1);
 	return ifq;
 }
 
@@ -537,6 +538,12 @@ static void io_zcrx_ifq_free(struct io_zcrx_ifq *ifq)
 	kfree(ifq);
 }
 
+static void io_put_zcrx_ifq(struct io_zcrx_ifq *ifq)
+{
+	if (refcount_dec_and_test(&ifq->refs))
+		io_zcrx_ifq_free(ifq);
+}
+
 struct io_mapped_region *io_zcrx_get_region(struct io_ring_ctx *ctx,
 					    unsigned int id)
 {
@@ -611,6 +618,7 @@ int io_register_zcrx_ifq(struct io_ring_ctx *ctx,
 	ifq = io_zcrx_ifq_alloc(ctx);
 	if (!ifq)
 		return -ENOMEM;
+
 	if (ctx->user) {
 		get_uid(ctx->user);
 		ifq->user = ctx->user;
@@ -733,19 +741,6 @@ static void io_zcrx_scrub(struct io_zcrx_ifq *ifq)
 	}
 }
 
-void io_shutdown_zcrx_ifqs(struct io_ring_ctx *ctx)
-{
-	struct io_zcrx_ifq *ifq;
-	unsigned long index;
-
-	lockdep_assert_held(&ctx->uring_lock);
-
-	xa_for_each(&ctx->zcrx_ctxs, index, ifq) {
-		io_zcrx_scrub(ifq);
-		io_close_queue(ifq);
-	}
-}
-
 void io_unregister_zcrx_ifqs(struct io_ring_ctx *ctx)
 {
 	struct io_zcrx_ifq *ifq;
@@ -762,7 +757,12 @@ void io_unregister_zcrx_ifqs(struct io_ring_ctx *ctx)
 		}
 		if (!ifq)
 			break;
-		io_zcrx_ifq_free(ifq);
+
+		if (refcount_dec_and_test(&ifq->user_refs)) {
+			io_close_queue(ifq);
+			io_zcrx_scrub(ifq);
+		}
+		io_put_zcrx_ifq(ifq);
 	}
 
 	xa_destroy(&ctx->zcrx_ctxs);
@@ -913,15 +913,13 @@ static int io_pp_zc_init(struct page_pool *pp)
 	if (ret)
 		return ret;
 
-	percpu_ref_get(&ifq->ctx->refs);
+	refcount_inc(&ifq->refs);
 	return 0;
 }
 
 static void io_pp_zc_destroy(struct page_pool *pp)
 {
-	struct io_zcrx_ifq *ifq = io_pp_to_ifq(pp);
-
-	percpu_ref_put(&ifq->ctx->refs);
+	io_put_zcrx_ifq(io_pp_to_ifq(pp));
 }
 
 static int io_pp_nl_fill(void *mp_priv, struct sk_buff *rsp,
diff --git a/io_uring/zcrx.h b/io_uring/zcrx.h
index 2396436643e5..9014a1fd0f61 100644
--- a/io_uring/zcrx.h
+++ b/io_uring/zcrx.h
@@ -39,7 +39,6 @@ struct io_zcrx_area {
 };
 
 struct io_zcrx_ifq {
-	struct io_ring_ctx	*ctx;
 	struct io_zcrx_area	*area;
 	unsigned		niov_shift;
 	struct user_struct	*user;
@@ -55,6 +54,9 @@ struct io_zcrx_ifq {
 	struct device		*dev;
 	struct net_device	*netdev;
 	netdevice_tracker	netdev_tracker;
+	refcount_t		refs;
+	/* counts userspace facing users like io_uring */
+	refcount_t		user_refs;
 
 	/*
 	 * Page pool and net configuration lock, can be taken deeper in the
@@ -69,7 +71,6 @@ int io_zcrx_ctrl(struct io_ring_ctx *ctx, void __user *arg, unsigned nr_arg);
 int io_register_zcrx_ifq(struct io_ring_ctx *ctx,
 			 struct io_uring_zcrx_ifq_reg __user *arg);
 void io_unregister_zcrx_ifqs(struct io_ring_ctx *ctx);
-void io_shutdown_zcrx_ifqs(struct io_ring_ctx *ctx);
 int io_zcrx_recv(struct io_kiocb *req, struct io_zcrx_ifq *ifq,
 		 struct socket *sock, unsigned int flags,
 		 unsigned issue_flags, unsigned int *len);
@@ -84,9 +85,6 @@ static inline int io_register_zcrx_ifq(struct io_ring_ctx *ctx,
 static inline void io_unregister_zcrx_ifqs(struct io_ring_ctx *ctx)
 {
 }
-static inline void io_shutdown_zcrx_ifqs(struct io_ring_ctx *ctx)
-{
-}
 static inline int io_zcrx_recv(struct io_kiocb *req, struct io_zcrx_ifq *ifq,
 				struct socket *sock, unsigned int flags,
 				unsigned issue_flags, unsigned int *len)
-- 
2.47.3

In preparation for adding zcrx ifq exporting and importing, move
io_zcrx_scrub() and its dependencies up the file to be closer to
io_close_queue().

Signed-off-by: David Wei
---
 io_uring/zcrx.c | 84 ++++++++++++++++++++++++-------------------------
 1 file changed, 42 insertions(+), 42 deletions(-)

diff --git a/io_uring/zcrx.c b/io_uring/zcrx.c
index 00498e3dcbd3..e9981478bcf6 100644
--- a/io_uring/zcrx.c
+++ b/io_uring/zcrx.c
@@ -544,6 +544,48 @@ static void io_put_zcrx_ifq(struct io_zcrx_ifq *ifq)
 		io_zcrx_ifq_free(ifq);
 }
 
+static void io_zcrx_return_niov_freelist(struct net_iov *niov)
+{
+	struct io_zcrx_area *area = io_zcrx_iov_to_area(niov);
+
+	spin_lock_bh(&area->freelist_lock);
+	area->freelist[area->free_count++] = net_iov_idx(niov);
+	spin_unlock_bh(&area->freelist_lock);
+}
+
+static void io_zcrx_return_niov(struct net_iov *niov)
+{
+	netmem_ref netmem = net_iov_to_netmem(niov);
+
+	if (!niov->pp) {
+		/* copy fallback allocated niovs */
+		io_zcrx_return_niov_freelist(niov);
+		return;
+	}
+	page_pool_put_unrefed_netmem(niov->pp, netmem, -1, false);
+}
+
+static void io_zcrx_scrub(struct io_zcrx_ifq *ifq)
+{
+	struct io_zcrx_area *area = ifq->area;
+	int i;
+
+	if (!area)
+		return;
+
+	/* Reclaim back all buffers given to the user space. */
+	for (i = 0; i < area->nia.num_niovs; i++) {
+		struct net_iov *niov = &area->nia.niovs[i];
+		int nr;
+
+		if (!atomic_read(io_get_user_counter(niov)))
+			continue;
+		nr = atomic_xchg(io_get_user_counter(niov), 0);
+		if (nr && !page_pool_unref_netmem(net_iov_to_netmem(niov), nr))
+			io_zcrx_return_niov(niov);
+	}
+}
+
 struct io_mapped_region *io_zcrx_get_region(struct io_ring_ctx *ctx,
 					    unsigned int id)
 {
@@ -699,48 +741,6 @@ static struct net_iov *__io_zcrx_get_free_niov(struct io_zcrx_area *area)
 	return &area->nia.niovs[niov_idx];
 }
 
-static void io_zcrx_return_niov_freelist(struct net_iov *niov)
-{
-	struct io_zcrx_area *area = io_zcrx_iov_to_area(niov);
-
-	spin_lock_bh(&area->freelist_lock);
-	area->freelist[area->free_count++] = net_iov_idx(niov);
-	spin_unlock_bh(&area->freelist_lock);
-}
-
-static void io_zcrx_return_niov(struct net_iov *niov)
-{
-	netmem_ref netmem = net_iov_to_netmem(niov);
-
-	if (!niov->pp) {
-		/* copy fallback allocated niovs */
-		io_zcrx_return_niov_freelist(niov);
-		return;
-	}
-	page_pool_put_unrefed_netmem(niov->pp, netmem, -1, false);
-}
-
-static void io_zcrx_scrub(struct io_zcrx_ifq *ifq)
-{
-	struct io_zcrx_area *area = ifq->area;
-	int i;
-
-	if (!area)
-		return;
-
-	/* Reclaim back all buffers given to the user space. */
-	for (i = 0; i < area->nia.num_niovs; i++) {
-		struct net_iov *niov = &area->nia.niovs[i];
-		int nr;
-
-		if (!atomic_read(io_get_user_counter(niov)))
-			continue;
-		nr = atomic_xchg(io_get_user_counter(niov), 0);
-		if (nr && !page_pool_unref_netmem(net_iov_to_netmem(niov), nr))
-			io_zcrx_return_niov(niov);
-	}
-}
-
 void io_unregister_zcrx_ifqs(struct io_ring_ctx *ctx)
 {
 	struct io_zcrx_ifq *ifq;
-- 
2.47.3

From: Pavel Begunkov

Add an option to wrap a zcrx instance into a file and expose it to user
space. Currently, users can't do anything meaningful with the file, but
it'll be used in a following patch to import it into another io_uring
instance. It's implemented as a new op called ZCRX_CTRL_EXPORT for the
IORING_REGISTER_ZCRX_CTRL registration opcode.

Signed-off-by: Pavel Begunkov
Signed-off-by: David Wei
---
 include/uapi/linux/io_uring.h |  2 ++
 io_uring/zcrx.c               | 62 +++++++++++++++++++++++++++++++----
 2 files changed, 58 insertions(+), 6 deletions(-)

diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h
index 8b4935b983e7..34bd32402902 100644
--- a/include/uapi/linux/io_uring.h
+++ b/include/uapi/linux/io_uring.h
@@ -1082,6 +1082,8 @@ struct io_uring_zcrx_ifq_reg {
 };
 
 enum zcrx_ctrl_op {
+	ZCRX_CTRL_EXPORT,
+
 	__ZCRX_CTRL_LAST,
 };
 
diff --git a/io_uring/zcrx.c b/io_uring/zcrx.c
index e9981478bcf6..17ce49536f41 100644
--- a/io_uring/zcrx.c
+++ b/io_uring/zcrx.c
@@ -8,6 +8,7 @@
 #include
 #include
 #include
+#include
 #include
 #include
 
@@ -586,6 +587,15 @@ static void io_zcrx_scrub(struct io_zcrx_ifq *ifq)
 	}
 }
 
+static void zcrx_unregister(struct io_zcrx_ifq *ifq)
+{
+	if (refcount_dec_and_test(&ifq->user_refs)) {
+		io_close_queue(ifq);
+		io_zcrx_scrub(ifq);
+	}
+	io_put_zcrx_ifq(ifq);
+}
+
 struct io_mapped_region *io_zcrx_get_region(struct io_ring_ctx *ctx,
 					    unsigned int id)
 {
@@ -596,6 +606,46 @@ struct io_mapped_region *io_zcrx_get_region(struct io_ring_ctx *ctx,
 	return ifq ? &ifq->region : NULL;
 }
 
+static int zcrx_box_release(struct inode *inode, struct file *file)
+{
+	struct io_zcrx_ifq *ifq = file->private_data;
+
+	zcrx_unregister(ifq);
+	return 0;
+}
+
+static const struct file_operations zcrx_box_fops = {
+	.owner		= THIS_MODULE,
+	.release	= zcrx_box_release,
+};
+
+static int export_zcrx(struct io_ring_ctx *ctx, struct io_zcrx_ifq *ifq,
+			struct zcrx_ctrl *ctrl)
+{
+	struct file *file;
+	int fd = -1;
+
+	if (!mem_is_zero(&ctrl->resv, sizeof(ctrl->resv)))
+		return -EINVAL;
+
+	fd = get_unused_fd_flags(O_CLOEXEC);
+	if (fd < 0)
+		return fd;
+
+	refcount_inc(&ifq->refs);
+	refcount_inc(&ifq->user_refs);
+
+	file = anon_inode_create_getfile("[zcrx]", &zcrx_box_fops,
+					 ifq, O_CLOEXEC, NULL);
+	if (IS_ERR(file)) {
+		put_unused_fd(fd);
+		zcrx_unregister(ifq);
+		return PTR_ERR(file);
+	}
+
+	fd_install(fd, file);
+	return fd;
+}
+
 int io_zcrx_ctrl(struct io_ring_ctx *ctx, void __user *arg, unsigned nr_args)
 {
 	struct zcrx_ctrl ctrl;
@@ -612,6 +662,11 @@ int io_zcrx_ctrl(struct io_ring_ctx *ctx, void __user *arg, unsigned nr_args)
 	if (!ifq)
 		return -ENXIO;
 
+	switch (ctrl.op) {
+	case ZCRX_CTRL_EXPORT:
+		return export_zcrx(ctx, ifq, &ctrl);
+	}
+
 	return -EINVAL;
 }
 
@@ -757,12 +812,7 @@ void io_unregister_zcrx_ifqs(struct io_ring_ctx *ctx)
 		}
 		if (!ifq)
 			break;
-
-		if (refcount_dec_and_test(&ifq->user_refs)) {
-			io_close_queue(ifq);
-			io_zcrx_scrub(ifq);
-		}
-		io_put_zcrx_ifq(ifq);
+		zcrx_unregister(ifq);
 	}
 
 	xa_destroy(&ctx->zcrx_ctxs);
-- 
2.47.3

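For illustration only: a hedged userspace sketch of what exporting might look like on top of the ZCRX_CTRL plumbing from the earlier patch, assuming a uapi header that carries ZCRX_CTRL_EXPORT and struct zcrx_ctrl; the helper name export_zcrx_fd() is invented for the example and error handling is elided.

/* hypothetical userspace sketch, not part of the series */
static int export_zcrx_fd(int ring_fd, __u32 zcrx_id)
{
	struct zcrx_ctrl ctrl;

	memset(&ctrl, 0, sizeof(ctrl));	/* resv[] must be zero */
	ctrl.zcrx_id = zcrx_id;
	ctrl.op = ZCRX_CTRL_EXPORT;

	/* on success the return value is a new O_CLOEXEC fd wrapping the ifq */
	return syscall(__NR_io_uring_register, ring_fd,
		       IORING_REGISTER_ZCRX_CTRL, &ctrl, 0);
}

Closing the returned fd drops the user_refs reference taken at export time via zcrx_box_release().
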
Add a way to share an ifq from a src ring that is real (i.e. bound to a HW
RX queue) with other rings. This is done by passing a new flag
IORING_ZCRX_IFQ_REG_IMPORT in the registration struct
io_uring_zcrx_ifq_reg, alongside the fd of an exported zcrx ifq.

Signed-off-by: David Wei
---
 include/uapi/linux/io_uring.h |  4 +++
 io_uring/zcrx.c               | 63 +++++++++++++++++++++++++++++++++--
 2 files changed, 65 insertions(+), 2 deletions(-)

diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h
index 34bd32402902..0ead7f6b2094 100644
--- a/include/uapi/linux/io_uring.h
+++ b/include/uapi/linux/io_uring.h
@@ -1063,6 +1063,10 @@ struct io_uring_zcrx_area_reg {
 	__u64	__resv2[2];
 };
 
+enum io_uring_zcrx_ifq_reg_flags {
+	IORING_ZCRX_IFQ_REG_IMPORT	= 1,
+};
+
 /*
  * Argument for IORING_REGISTER_ZCRX_IFQ
  */
diff --git a/io_uring/zcrx.c b/io_uring/zcrx.c
index 17ce49536f41..5a0af9dd6a8e 100644
--- a/io_uring/zcrx.c
+++ b/io_uring/zcrx.c
@@ -625,6 +625,11 @@ static int export_zcrx(struct io_ring_ctx *ctx, struct io_zcrx_ifq *ifq,
 	struct file *file;
 	int fd = -1;
 
+	if (!(ctx->flags & IORING_SETUP_DEFER_TASKRUN))
+		return -EINVAL;
+	if (!(ctx->flags & (IORING_SETUP_CQE32|IORING_SETUP_CQE_MIXED)))
+		return -EINVAL;
+
 	if (!mem_is_zero(&ctrl->resv, sizeof(ctrl->resv)))
 		return -EINVAL;
 
 	fd = get_unused_fd_flags(O_CLOEXEC);
@@ -646,6 +651,58 @@ static int export_zcrx(struct io_ring_ctx *ctx, struct io_zcrx_ifq *ifq,
 	return fd;
 }
 
+static int import_zcrx(struct io_ring_ctx *ctx,
+		       struct io_uring_zcrx_ifq_reg __user *arg,
+		       struct io_uring_zcrx_ifq_reg *reg)
+{
+	struct io_zcrx_ifq *ifq;
+	struct file *file;
+	int fd, ret;
+	u32 id;
+
+	if (reg->if_rxq || reg->rq_entries || reg->area_ptr || reg->region_ptr)
+		return -EINVAL;
+
+	fd = reg->if_idx;
+	CLASS(fd, f)(fd);
+	if (fd_empty(f))
+		return -EBADF;
+
+	file = fd_file(f);
+	if (file->f_op != &zcrx_box_fops || !file->private_data)
+		return -EBADF;
+
+	ifq = file->private_data;
+	refcount_inc(&ifq->refs);
+	refcount_inc(&ifq->user_refs);
+
+	scoped_guard(mutex, &ctx->mmap_lock) {
+		ret = xa_alloc(&ctx->zcrx_ctxs, &id, NULL, xa_limit_31b, GFP_KERNEL);
+		if (ret)
+			goto err;
+	}
+
+	reg->zcrx_id = id;
+	if (copy_to_user(arg, reg, sizeof(*reg))) {
+		ret = -EFAULT;
+		goto err_xa_erase;
+	}
+
+	scoped_guard(mutex, &ctx->mmap_lock) {
+		ret = -ENOMEM;
+		if (xa_store(&ctx->zcrx_ctxs, id, ifq, GFP_KERNEL))
+			goto err_xa_erase;
+	}
+
+	return 0;
+err_xa_erase:
+	scoped_guard(mutex, &ctx->mmap_lock)
+		xa_erase(&ctx->zcrx_ctxs, id);
+err:
+	zcrx_unregister(ifq);
+	return ret;
+}
+
 int io_zcrx_ctrl(struct io_ring_ctx *ctx, void __user *arg, unsigned nr_args)
 {
 	struct zcrx_ctrl ctrl;
@@ -695,11 +752,13 @@ int io_register_zcrx_ifq(struct io_ring_ctx *ctx,
 		return -EINVAL;
 	if (copy_from_user(&reg, arg, sizeof(reg)))
 		return -EFAULT;
-	if (copy_from_user(&rd, u64_to_user_ptr(reg.region_ptr), sizeof(rd)))
-		return -EFAULT;
 	if (!mem_is_zero(&reg.__resv, sizeof(reg.__resv)) ||
 	    reg.__resv2 || reg.zcrx_id)
 		return -EINVAL;
+	if (reg.flags & IORING_ZCRX_IFQ_REG_IMPORT)
+		return import_zcrx(ctx, arg, &reg);
+	if (copy_from_user(&rd, u64_to_user_ptr(reg.region_ptr), sizeof(rd)))
+		return -EFAULT;
 	if (reg.if_rxq == -1 || !reg.rq_entries || reg.flags)
 		return -EINVAL;
 	if (reg.rq_entries > IO_RQ_MAX_ENTRIES) {
-- 
2.47.3
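For illustration only: a hedged sketch of the import side as this patch defines it. The importing ring passes the exported fd in if_idx and sets IORING_ZCRX_IFQ_REG_IMPORT, with every field that describes a real ifq left at zero; the helper name import_zcrx_fd() is invented, and nr_args == 1 is assumed to match the existing IORING_REGISTER_ZCRX_IFQ convention.

/* hypothetical userspace sketch, not part of the series */
static int import_zcrx_fd(int ring_fd, int zcrx_fd, __u32 *zcrx_id)
{
	struct io_uring_zcrx_ifq_reg reg;
	int ret;

	/* if_rxq, rq_entries, area_ptr and region_ptr must all stay 0 */
	memset(&reg, 0, sizeof(reg));
	reg.flags = IORING_ZCRX_IFQ_REG_IMPORT;
	reg.if_idx = zcrx_fd;		/* reused to carry the exported fd */

	ret = syscall(__NR_io_uring_register, ring_fd,
		      IORING_REGISTER_ZCRX_IFQ, &reg, 1);
	if (!ret)
		*zcrx_id = reg.zcrx_id;	/* id the kernel copied back for this ring */
	return ret;
}

On success the imported ifq shows up under the returned zcrx_id in the importing ring, with both ifq->refs and ifq->user_refs bumped; unregistering the ring or closing the exported fd later drops them through zcrx_unregister().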