Refactor the logic in io_register_pbuf_ring() into generic helpers: - io_copy_and_validate_buf_reg(): Copy out user arg and validate user arg and buffer registration parameters - io_alloc_new_buffer_list(): Allocate and initialize a new buffer list for the given buffer group ID - io_setup_pbuf_ring(): Sets up the physical buffer ring region and handles memory mapping for provided buffer rings This is a preparatory change for upcoming kernel-managed buffer ring support which will need to reuse some of these helpers. Signed-off-by: Joanne Koong --- io_uring/kbuf.c | 129 +++++++++++++++++++++++++++++++----------------- 1 file changed, 85 insertions(+), 44 deletions(-) diff --git a/io_uring/kbuf.c b/io_uring/kbuf.c index 67d4fe576473..850b836f32ee 100644 --- a/io_uring/kbuf.c +++ b/io_uring/kbuf.c @@ -596,55 +596,73 @@ int io_manage_buffers_legacy(struct io_kiocb *req, unsigned int issue_flags) return IOU_COMPLETE; } -int io_register_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg) +static int io_copy_and_validate_buf_reg(const void __user *arg, + struct io_uring_buf_reg *reg, + unsigned int permitted_flags) { - struct io_uring_buf_reg reg; - struct io_buffer_list *bl; - struct io_uring_region_desc rd; - struct io_uring_buf_ring *br; - unsigned long mmap_offset; - unsigned long ring_size; - int ret; - - lockdep_assert_held(&ctx->uring_lock); - - if (copy_from_user(&reg, arg, sizeof(reg))) + if (copy_from_user(reg, arg, sizeof(*reg))) return -EFAULT; - if (!mem_is_zero(reg.resv, sizeof(reg.resv))) + + if (!mem_is_zero(reg->resv, sizeof(reg->resv))) return -EINVAL; - if (reg.flags & ~(IOU_PBUF_RING_MMAP | IOU_PBUF_RING_INC)) + if (reg->flags & ~permitted_flags) return -EINVAL; - if (!is_power_of_2(reg.ring_entries)) + if (!is_power_of_2(reg->ring_entries)) return -EINVAL; /* cannot disambiguate full vs empty due to head/tail size */ - if (reg.ring_entries >= 65536) + if (reg->ring_entries >= 65536) return -EINVAL; + return 0; +} - bl = io_buffer_get_list(ctx, reg.bgid); 
- if (bl) { +static struct io_buffer_list * +io_alloc_new_buffer_list(struct io_ring_ctx *ctx, + const struct io_uring_buf_reg *reg) +{ + struct io_buffer_list *list; + + list = io_buffer_get_list(ctx, reg->bgid); + if (list) { /* if mapped buffer ring OR classic exists, don't allow */ - if (bl->flags & IOBL_BUF_RING || !list_empty(&bl->buf_list)) - return -EEXIST; - io_destroy_bl(ctx, bl); + if (list->flags & IOBL_BUF_RING || !list_empty(&list->buf_list)) + return ERR_PTR(-EEXIST); + io_destroy_bl(ctx, list); } - bl = kzalloc(sizeof(*bl), GFP_KERNEL_ACCOUNT); - if (!bl) - return -ENOMEM; + list = kzalloc(sizeof(*list), GFP_KERNEL_ACCOUNT); + if (!list) + return ERR_PTR(-ENOMEM); + + list->nr_entries = reg->ring_entries; + list->mask = reg->ring_entries - 1; + list->flags = IOBL_BUF_RING; + + return list; +} + +static int io_setup_pbuf_ring(struct io_ring_ctx *ctx, + const struct io_uring_buf_reg *reg, + struct io_buffer_list *bl) +{ + struct io_uring_region_desc rd; + unsigned long mmap_offset; + unsigned long ring_size; + int ret; - mmap_offset = (unsigned long)reg.bgid << IORING_OFF_PBUF_SHIFT; - ring_size = flex_array_size(br, bufs, reg.ring_entries); + mmap_offset = (unsigned long)reg->bgid << IORING_OFF_PBUF_SHIFT; + ring_size = flex_array_size(bl->buf_ring, bufs, reg->ring_entries); memset(&rd, 0, sizeof(rd)); rd.size = PAGE_ALIGN(ring_size); - if (!(reg.flags & IOU_PBUF_RING_MMAP)) { - rd.user_addr = reg.ring_addr; + if (!(reg->flags & IOU_PBUF_RING_MMAP)) { + rd.user_addr = reg->ring_addr; rd.flags |= IORING_MEM_REGION_TYPE_USER; } + ret = io_create_region(ctx, &bl->region, &rd, mmap_offset); if (ret) - goto fail; - br = io_region_get_ptr(&bl->region); + return ret; + bl->buf_ring = io_region_get_ptr(&bl->region); #ifdef SHM_COLOUR /* @@ -656,25 +674,48 @@ int io_register_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg) * should use IOU_PBUF_RING_MMAP instead, and liburing will handle * this transparently. 
*/ - if (!(reg.flags & IOU_PBUF_RING_MMAP) && - ((reg.ring_addr | (unsigned long)br) & (SHM_COLOUR - 1))) { - ret = -EINVAL; - goto fail; + if (!(reg->flags & IOU_PBUF_RING_MMAP) && + ((reg->ring_addr | (unsigned long)bl->buf_ring) & + (SHM_COLOUR - 1))) { + io_free_region(ctx->user, &bl->region); + return -EINVAL; } #endif - bl->nr_entries = reg.ring_entries; - bl->mask = reg.ring_entries - 1; - bl->flags |= IOBL_BUF_RING; - bl->buf_ring = br; - if (reg.flags & IOU_PBUF_RING_INC) + if (reg->flags & IOU_PBUF_RING_INC) bl->flags |= IOBL_INC; + + return 0; +} + +int io_register_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg) +{ + unsigned int permitted_flags; + struct io_uring_buf_reg reg; + struct io_buffer_list *bl; + int ret; + + lockdep_assert_held(&ctx->uring_lock); + + permitted_flags = IOU_PBUF_RING_MMAP | IOU_PBUF_RING_INC; + ret = io_copy_and_validate_buf_reg(arg, &reg, permitted_flags); + if (ret) + return ret; + + bl = io_alloc_new_buffer_list(ctx, &reg); + if (IS_ERR(bl)) + return PTR_ERR(bl); + + ret = io_setup_pbuf_ring(ctx, &reg, bl); + if (ret) { + kfree(bl); + return ret; + } + ret = io_buffer_add_list(ctx, bl, reg.bgid); - if (!ret) - return 0; -fail: - io_free_region(ctx->user, &bl->region); - kfree(bl); + if (ret) + io_put_bl(ctx, bl); + return ret; } -- 2.47.3 Use the more generic name io_unregister_buf_ring() as this function will be used for unregistering both provided buffer rings and kernel-managed buffer rings. This is a preparatory change for upcoming kernel-managed buffer ring support. 
Signed-off-by: Joanne Koong --- io_uring/kbuf.c | 2 +- io_uring/kbuf.h | 2 +- io_uring/register.c | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/io_uring/kbuf.c b/io_uring/kbuf.c index 850b836f32ee..aa9b70b72db4 100644 --- a/io_uring/kbuf.c +++ b/io_uring/kbuf.c @@ -719,7 +719,7 @@ int io_register_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg) return ret; } -int io_unregister_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg) +int io_unregister_buf_ring(struct io_ring_ctx *ctx, void __user *arg) { struct io_uring_buf_reg reg; struct io_buffer_list *bl; diff --git a/io_uring/kbuf.h b/io_uring/kbuf.h index bf15e26520d3..40b44f4fdb15 100644 --- a/io_uring/kbuf.h +++ b/io_uring/kbuf.h @@ -74,7 +74,7 @@ int io_provide_buffers_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe int io_manage_buffers_legacy(struct io_kiocb *req, unsigned int issue_flags); int io_register_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg); -int io_unregister_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg); +int io_unregister_buf_ring(struct io_ring_ctx *ctx, void __user *arg); int io_register_pbuf_status(struct io_ring_ctx *ctx, void __user *arg); bool io_kbuf_recycle_legacy(struct io_kiocb *req, unsigned issue_flags); diff --git a/io_uring/register.c b/io_uring/register.c index 594b1f2ce875..0882cb34f851 100644 --- a/io_uring/register.c +++ b/io_uring/register.c @@ -841,7 +841,7 @@ static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode, ret = -EINVAL; if (!arg || nr_args != 1) break; - ret = io_unregister_pbuf_ring(ctx, arg); + ret = io_unregister_buf_ring(ctx, arg); break; case IORING_REGISTER_SYNC_CANCEL: ret = -EINVAL; -- 2.47.3 Add support for kernel-managed buffer rings (kmbuf rings), which allow the kernel to allocate and manage the backing buffers for a buffer ring, rather than requiring the application to provide and manage them. 
This introduces two new registration opcodes: - IORING_REGISTER_KMBUF_RING: Register a kernel-managed buffer ring - IORING_UNREGISTER_KMBUF_RING: Unregister a kernel-managed buffer ring The existing io_uring_buf_reg structure is extended with a union to support both application-provided buffer rings (pbuf) and kernel-managed buffer rings (kmbuf): - For pbuf rings: ring_addr specifies the user-provided ring address - For kmbuf rings: buf_size specifies the size of each buffer. buf_size must be non-zero and page-aligned. The implementation follows the same pattern as pbuf ring registration, reusing the validation and buffer list allocation helpers introduced in earlier refactoring. The IOBL_KERNEL_MANAGED flag marks buffer lists as kernel-managed for appropriate handling in the I/O path. Signed-off-by: Joanne Koong --- include/uapi/linux/io_uring.h | 15 ++++- io_uring/kbuf.c | 81 ++++++++++++++++++++++++- io_uring/kbuf.h | 7 ++- io_uring/memmap.c | 111 ++++++++++++++++++++++++++++++++++ io_uring/memmap.h | 4 ++ io_uring/register.c | 7 +++ 6 files changed, 219 insertions(+), 6 deletions(-) diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h index fc473af6feb4..a0889c1744bd 100644 --- a/include/uapi/linux/io_uring.h +++ b/include/uapi/linux/io_uring.h @@ -715,6 +715,10 @@ enum io_uring_register_op { /* register bpf filtering programs */ IORING_REGISTER_BPF_FILTER = 37, + /* register/unregister kernel-managed ring buffer group */ + IORING_REGISTER_KMBUF_RING = 38, + IORING_UNREGISTER_KMBUF_RING = 39, + /* this goes last */ IORING_REGISTER_LAST, @@ -891,9 +895,16 @@ enum io_uring_register_pbuf_ring_flags { IOU_PBUF_RING_INC = 2, }; -/* argument for IORING_(UN)REGISTER_PBUF_RING */ +/* argument for IORING_(UN)REGISTER_PBUF_RING and + * IORING_(UN)REGISTER_KMBUF_RING + */ struct io_uring_buf_reg { - __u64 ring_addr; + union { + /* used for pbuf rings */ + __u64 ring_addr; + /* used for kmbuf rings */ + __u32 buf_size; + }; __u32 ring_entries; __u16 
bgid; __u16 flags; diff --git a/io_uring/kbuf.c b/io_uring/kbuf.c index aa9b70b72db4..9bc36451d083 100644 --- a/io_uring/kbuf.c +++ b/io_uring/kbuf.c @@ -427,10 +427,13 @@ static int io_remove_buffers_legacy(struct io_ring_ctx *ctx, static void io_put_bl(struct io_ring_ctx *ctx, struct io_buffer_list *bl) { - if (bl->flags & IOBL_BUF_RING) + if (bl->flags & IOBL_BUF_RING) { io_free_region(ctx->user, &bl->region); - else + if (bl->flags & IOBL_KERNEL_MANAGED) + kfree(bl->buf_ring); + } else { io_remove_buffers_legacy(ctx, bl, -1U); + } kfree(bl); } @@ -779,3 +782,77 @@ struct io_mapped_region *io_pbuf_get_region(struct io_ring_ctx *ctx, return NULL; return &bl->region; } + +static int io_setup_kmbuf_ring(struct io_ring_ctx *ctx, + struct io_buffer_list *bl, + struct io_uring_buf_reg *reg) +{ + struct io_uring_buf_ring *ring; + unsigned long ring_size; + void *buf_region; + unsigned int i; + int ret; + + /* allocate pages for the ring structure */ + ring_size = flex_array_size(ring, bufs, bl->nr_entries); + ring = kzalloc(ring_size, GFP_KERNEL_ACCOUNT); + if (!ring) + return -ENOMEM; + + ret = io_create_region_multi_buf(ctx, &bl->region, bl->nr_entries, + reg->buf_size); + if (ret) { + kfree(ring); + return ret; + } + + /* initialize ring buf entries to point to the buffers */ + buf_region = bl->region.ptr; + for (i = 0; i < bl->nr_entries; i++) { + struct io_uring_buf *buf = &ring->bufs[i]; + + buf->addr = (u64)(uintptr_t)buf_region; + buf->len = reg->buf_size; + buf->bid = i; + + buf_region += reg->buf_size; + } + ring->tail = bl->nr_entries; + + bl->buf_ring = ring; + bl->flags |= IOBL_KERNEL_MANAGED; + + return 0; +} + +int io_register_kmbuf_ring(struct io_ring_ctx *ctx, void __user *arg) +{ + struct io_uring_buf_reg reg; + struct io_buffer_list *bl; + int ret; + + lockdep_assert_held(&ctx->uring_lock); + + ret = io_copy_and_validate_buf_reg(arg, &reg, 0); + if (ret) + return ret; + + if (!reg.buf_size || !PAGE_ALIGNED(reg.buf_size)) + return -EINVAL; + + bl = 
io_alloc_new_buffer_list(ctx, &reg); + if (IS_ERR(bl)) + return PTR_ERR(bl); + + ret = io_setup_kmbuf_ring(ctx, bl, &reg); + if (ret) { + kfree(bl); + return ret; + } + + ret = io_buffer_add_list(ctx, bl, reg.bgid); + if (ret) + io_put_bl(ctx, bl); + + return ret; +} diff --git a/io_uring/kbuf.h b/io_uring/kbuf.h index 40b44f4fdb15..62c80a1ebf03 100644 --- a/io_uring/kbuf.h +++ b/io_uring/kbuf.h @@ -7,9 +7,11 @@ enum { /* ring mapped provided buffers */ - IOBL_BUF_RING = 1, + IOBL_BUF_RING = 1, /* buffers are consumed incrementally rather than always fully */ - IOBL_INC = 2, + IOBL_INC = 2, + /* buffers are kernel managed */ + IOBL_KERNEL_MANAGED = 4, }; struct io_buffer_list { @@ -74,6 +76,7 @@ int io_provide_buffers_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe int io_manage_buffers_legacy(struct io_kiocb *req, unsigned int issue_flags); int io_register_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg); +int io_register_kmbuf_ring(struct io_ring_ctx *ctx, void __user *arg); int io_unregister_buf_ring(struct io_ring_ctx *ctx, void __user *arg); int io_register_pbuf_status(struct io_ring_ctx *ctx, void __user *arg); diff --git a/io_uring/memmap.c b/io_uring/memmap.c index 89f56609e50a..8d37e93c0433 100644 --- a/io_uring/memmap.c +++ b/io_uring/memmap.c @@ -15,6 +15,28 @@ #include "rsrc.h" #include "zcrx.h" +static void release_multi_buf_pages(struct page **pages, unsigned long nr_pages) +{ + struct page *page; + unsigned int nr, i = 0; + + while (nr_pages) { + page = pages[i]; + + if (!page || WARN_ON_ONCE(page != compound_head(page))) + return; + + nr = compound_nr(page); + put_page(page); + + if (WARN_ON_ONCE(nr > nr_pages)) + return; + + i += nr; + nr_pages -= nr; + } +} + static bool io_mem_alloc_compound(struct page **pages, int nr_pages, size_t size, gfp_t gfp) { @@ -86,6 +108,8 @@ enum { IO_REGION_F_USER_PROVIDED = 2, /* only the first page in the array is ref'ed */ IO_REGION_F_SINGLE_REF = 4, + /* pages in the array belong to multiple discrete 
allocations */ + IO_REGION_F_MULTI_BUF = 8, }; void io_free_region(struct user_struct *user, struct io_mapped_region *mr) @@ -98,6 +122,8 @@ void io_free_region(struct user_struct *user, struct io_mapped_region *mr) if (mr->flags & IO_REGION_F_USER_PROVIDED) unpin_user_pages(mr->pages, nr_refs); + else if (mr->flags & IO_REGION_F_MULTI_BUF) + release_multi_buf_pages(mr->pages, nr_refs); else release_pages(mr->pages, nr_refs); @@ -149,6 +175,54 @@ static int io_region_pin_pages(struct io_mapped_region *mr, return 0; } +static int io_region_allocate_pages_multi_buf(struct io_mapped_region *mr, + unsigned int nr_bufs, + unsigned int buf_size) +{ + gfp_t gfp = GFP_USER | __GFP_ACCOUNT | __GFP_ZERO | __GFP_NOWARN; + struct page **pages, **cur_pages; + unsigned int nr_allocated; + unsigned int buf_pages; + unsigned int i; + + if (!PAGE_ALIGNED(buf_size)) + return -EINVAL; + + buf_pages = buf_size >> PAGE_SHIFT; + + pages = kvmalloc_array(mr->nr_pages, sizeof(*pages), gfp); + if (!pages) + return -ENOMEM; + + cur_pages = pages; + + for (i = 0; i < nr_bufs; i++) { + if (io_mem_alloc_compound(cur_pages, buf_pages, buf_size, + gfp)) { + cur_pages += buf_pages; + continue; + } + + nr_allocated = alloc_pages_bulk_node(gfp, NUMA_NO_NODE, + buf_pages, cur_pages); + if (nr_allocated != buf_pages) { + unsigned int total = + (cur_pages - pages) + nr_allocated; + + release_multi_buf_pages(pages, total); + kvfree(pages); + return -ENOMEM; + } + + cur_pages += buf_pages; + } + + mr->flags |= IO_REGION_F_MULTI_BUF; + mr->pages = pages; + + return 0; +} + static int io_region_allocate_pages(struct io_mapped_region *mr, struct io_uring_region_desc *reg, unsigned long mmap_offset) @@ -181,6 +255,43 @@ static int io_region_allocate_pages(struct io_mapped_region *mr, return 0; } +int io_create_region_multi_buf(struct io_ring_ctx *ctx, + struct io_mapped_region *mr, + unsigned int nr_bufs, unsigned int buf_size) +{ + unsigned int nr_pages; + int ret; + + if (WARN_ON_ONCE(mr->pages || mr->ptr 
|| mr->nr_pages)) + return -EFAULT; + + if (WARN_ON_ONCE(!nr_bufs || !buf_size || !PAGE_ALIGNED(buf_size))) + return -EINVAL; + + if (check_mul_overflow(buf_size >> PAGE_SHIFT, nr_bufs, &nr_pages)) + return -EINVAL; + + if (ctx->user) { + ret = __io_account_mem(ctx->user, nr_pages); + if (ret) + return ret; + } + mr->nr_pages = nr_pages; + + ret = io_region_allocate_pages_multi_buf(mr, nr_bufs, buf_size); + if (ret) + goto out_free; + + ret = io_region_init_ptr(mr); + if (ret) + goto out_free; + + return 0; +out_free: + io_free_region(ctx->user, mr); + return ret; +} + int io_create_region(struct io_ring_ctx *ctx, struct io_mapped_region *mr, struct io_uring_region_desc *reg, unsigned long mmap_offset) diff --git a/io_uring/memmap.h b/io_uring/memmap.h index f4cfbb6b9a1f..3aa1167462ae 100644 --- a/io_uring/memmap.h +++ b/io_uring/memmap.h @@ -22,6 +22,10 @@ int io_create_region(struct io_ring_ctx *ctx, struct io_mapped_region *mr, struct io_uring_region_desc *reg, unsigned long mmap_offset); +int io_create_region_multi_buf(struct io_ring_ctx *ctx, + struct io_mapped_region *mr, + unsigned int nr_bufs, unsigned int buf_size); + static inline void *io_region_get_ptr(struct io_mapped_region *mr) { return mr->ptr; diff --git a/io_uring/register.c b/io_uring/register.c index 0882cb34f851..2db8daaf8fde 100644 --- a/io_uring/register.c +++ b/io_uring/register.c @@ -837,7 +837,14 @@ static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode, break; ret = io_register_pbuf_ring(ctx, arg); break; + case IORING_REGISTER_KMBUF_RING: + ret = -EINVAL; + if (!arg || nr_args != 1) + break; + ret = io_register_kmbuf_ring(ctx, arg); + break; case IORING_UNREGISTER_PBUF_RING: + case IORING_UNREGISTER_KMBUF_RING: ret = -EINVAL; if (!arg || nr_args != 1) break; -- 2.47.3 Add support for mmapping kernel-managed buffer rings (kmbuf) to userspace, allowing applications to access the kernel-allocated buffers. 
Similar to application-provided buffer rings (pbuf), kmbuf rings use the buffer group ID encoded in the mmap offset to identify which buffer ring to map. The implementation follows the same pattern as pbuf rings. New mmap offset constants are introduced: - IORING_OFF_KMBUF_RING (0x88000000): Base offset for kmbuf mappings - IORING_OFF_KMBUF_SHIFT (16): Shift value to encode buffer group ID The mmap offset encodes the bgid shifted by IORING_OFF_KMBUF_SHIFT. The io_buf_get_region() helper retrieves the appropriate region. This allows userspace to mmap the kernel-allocated buffer region and access the buffers directly. Signed-off-by: Joanne Koong --- include/uapi/linux/io_uring.h | 2 ++ io_uring/kbuf.c | 11 +++++++++-- io_uring/kbuf.h | 5 +++-- io_uring/memmap.c | 5 ++++- 4 files changed, 18 insertions(+), 5 deletions(-) diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h index a0889c1744bd..42a2812c9922 100644 --- a/include/uapi/linux/io_uring.h +++ b/include/uapi/linux/io_uring.h @@ -545,6 +545,8 @@ struct io_uring_cqe { #define IORING_OFF_SQES 0x10000000ULL #define IORING_OFF_PBUF_RING 0x80000000ULL #define IORING_OFF_PBUF_SHIFT 16 +#define IORING_OFF_KMBUF_RING 0x88000000ULL +#define IORING_OFF_KMBUF_SHIFT 16 #define IORING_OFF_MMAP_MASK 0xf8000000ULL /* diff --git a/io_uring/kbuf.c b/io_uring/kbuf.c index 9bc36451d083..ccf5b213087b 100644 --- a/io_uring/kbuf.c +++ b/io_uring/kbuf.c @@ -770,16 +770,23 @@ int io_register_pbuf_status(struct io_ring_ctx *ctx, void __user *arg) return 0; } -struct io_mapped_region *io_pbuf_get_region(struct io_ring_ctx *ctx, - unsigned int bgid) +struct io_mapped_region *io_buf_get_region(struct io_ring_ctx *ctx, + unsigned int bgid, + bool kernel_managed) { struct io_buffer_list *bl; + bool is_kernel_managed; lockdep_assert_held(&ctx->mmap_lock); bl = xa_load(&ctx->io_bl_xa, bgid); if (!bl || !(bl->flags & IOBL_BUF_RING)) return NULL; + + is_kernel_managed = !!(bl->flags & IOBL_KERNEL_MANAGED); + if 
(is_kernel_managed != kernel_managed) + return NULL; + return &bl->region; } diff --git a/io_uring/kbuf.h b/io_uring/kbuf.h index 62c80a1ebf03..11d165888b8e 100644 --- a/io_uring/kbuf.h +++ b/io_uring/kbuf.h @@ -88,8 +88,9 @@ unsigned int __io_put_kbufs(struct io_kiocb *req, struct io_buffer_list *bl, bool io_kbuf_commit(struct io_kiocb *req, struct io_buffer_list *bl, int len, int nr); -struct io_mapped_region *io_pbuf_get_region(struct io_ring_ctx *ctx, - unsigned int bgid); +struct io_mapped_region *io_buf_get_region(struct io_ring_ctx *ctx, + unsigned int bgid, + bool kernel_managed); static inline bool io_kbuf_recycle_ring(struct io_kiocb *req, struct io_buffer_list *bl) diff --git a/io_uring/memmap.c b/io_uring/memmap.c index 8d37e93c0433..916315122323 100644 --- a/io_uring/memmap.c +++ b/io_uring/memmap.c @@ -356,7 +356,10 @@ static struct io_mapped_region *io_mmap_get_region(struct io_ring_ctx *ctx, return &ctx->sq_region; case IORING_OFF_PBUF_RING: id = (offset & ~IORING_OFF_MMAP_MASK) >> IORING_OFF_PBUF_SHIFT; - return io_pbuf_get_region(ctx, id); + return io_buf_get_region(ctx, id, false); + case IORING_OFF_KMBUF_RING: + id = (offset & ~IORING_OFF_MMAP_MASK) >> IORING_OFF_KMBUF_SHIFT; + return io_buf_get_region(ctx, id, true); case IORING_MAP_OFF_PARAM_REGION: return &ctx->param_region; case IORING_MAP_OFF_ZCRX_REGION: -- 2.47.3 Allow kernel-managed buffers to be selected. This requires modifying the io_br_sel struct to separate the fields for address and val, since a kernel address cannot be distinguished from a negative val when error checking. Auto-commit any selected kernel-managed buffer. 
Signed-off-by: Joanne Koong --- include/linux/io_uring_types.h | 8 ++++---- io_uring/kbuf.c | 16 ++++++++++++---- 2 files changed, 16 insertions(+), 8 deletions(-) diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h index 3e4a82a6f817..36cc2e0346d9 100644 --- a/include/linux/io_uring_types.h +++ b/include/linux/io_uring_types.h @@ -93,13 +93,13 @@ struct io_mapped_region { */ struct io_br_sel { struct io_buffer_list *buf_list; - /* - * Some selection parts return the user address, others return an error. - */ union { + /* for classic/ring provided buffers */ void __user *addr; - ssize_t val; + /* for kernel-managed buffers */ + void *kaddr; }; + ssize_t val; }; diff --git a/io_uring/kbuf.c b/io_uring/kbuf.c index ccf5b213087b..1e8395270227 100644 --- a/io_uring/kbuf.c +++ b/io_uring/kbuf.c @@ -155,7 +155,8 @@ static int io_provided_buffers_select(struct io_kiocb *req, size_t *len, return 1; } -static bool io_should_commit(struct io_kiocb *req, unsigned int issue_flags) +static bool io_should_commit(struct io_kiocb *req, struct io_buffer_list *bl, + unsigned int issue_flags) { /* * If we came in unlocked, we have no choice but to consume the @@ -170,7 +171,11 @@ static bool io_should_commit(struct io_kiocb *req, unsigned int issue_flags) if (issue_flags & IO_URING_F_UNLOCKED) return true; - /* uring_cmd commits kbuf upfront, no need to auto-commit */ + /* kernel-managed buffers are auto-committed */ + if (bl->flags & IOBL_KERNEL_MANAGED) + return true; + + /* multishot uring_cmd commits kbuf upfront, no need to auto-commit */ if (!io_file_can_poll(req) && req->opcode != IORING_OP_URING_CMD) return true; return false; @@ -200,9 +205,12 @@ static struct io_br_sel io_ring_buffer_select(struct io_kiocb *req, size_t *len, req->flags |= REQ_F_BUFFER_RING | REQ_F_BUFFERS_COMMIT; req->buf_index = READ_ONCE(buf->bid); sel.buf_list = bl; - sel.addr = u64_to_user_ptr(READ_ONCE(buf->addr)); + if (bl->flags & IOBL_KERNEL_MANAGED) + sel.kaddr = (void 
*)(uintptr_t)READ_ONCE(buf->addr); + else + sel.addr = u64_to_user_ptr(READ_ONCE(buf->addr)); - if (io_should_commit(req, issue_flags)) { + if (io_should_commit(req, bl, issue_flags)) { io_kbuf_commit(req, sel.buf_list, *len, 1); sel.buf_list = NULL; } -- 2.47.3 Add kernel APIs to pin and unpin buffer rings, preventing userspace from unregistering a buffer ring while it is pinned by the kernel. This provides a mechanism for kernel subsystems to safely access buffer ring contents while ensuring the buffer ring remains valid. A pinned buffer ring cannot be unregistered until explicitly unpinned. On the userspace side, trying to unregister a pinned buffer ring returns -EBUSY. This is a preparatory change for upcoming fuse usage of kernel-managed buffer rings. It is necessary for fuse to pin the buffer ring because fuse may need to select a buffer in atomic contexts, which it can only do by using the underlying buffer list pointer. Signed-off-by: Joanne Koong --- include/linux/io_uring/cmd.h | 17 +++++++++++++ io_uring/kbuf.c | 48 ++++++++++++++++++++++++++++++++++++ io_uring/kbuf.h | 5 ++++ 3 files changed, 70 insertions(+) diff --git a/include/linux/io_uring/cmd.h b/include/linux/io_uring/cmd.h index 375fd048c4cb..702b1903e6ee 100644 --- a/include/linux/io_uring/cmd.h +++ b/include/linux/io_uring/cmd.h @@ -84,6 +84,10 @@ struct io_br_sel io_uring_cmd_buffer_select(struct io_uring_cmd *ioucmd, bool io_uring_mshot_cmd_post_cqe(struct io_uring_cmd *ioucmd, struct io_br_sel *sel, unsigned int issue_flags); +int io_uring_buf_ring_pin(struct io_uring_cmd *cmd, unsigned buf_group, + unsigned issue_flags, struct io_buffer_list **bl); +int io_uring_buf_ring_unpin(struct io_uring_cmd *cmd, unsigned buf_group, + unsigned issue_flags); #else static inline int io_uring_cmd_import_fixed(u64 ubuf, unsigned long len, int rw, @@ -126,6 +130,19 @@ static inline bool io_uring_mshot_cmd_post_cqe(struct io_uring_cmd *ioucmd, { return true; } +static inline int 
io_uring_buf_ring_pin(struct io_uring_cmd *cmd, + unsigned buf_group, + unsigned issue_flags, + struct io_buffer_list **bl) +{ + return -EOPNOTSUPP; +} +static inline int io_uring_buf_ring_unpin(struct io_uring_cmd *cmd, + unsigned buf_group, + unsigned issue_flags) +{ + return -EOPNOTSUPP; +} #endif static inline struct io_uring_cmd *io_uring_cmd_from_tw(struct io_tw_req tw_req) diff --git a/io_uring/kbuf.c b/io_uring/kbuf.c index 1e8395270227..dee1764ed19f 100644 --- a/io_uring/kbuf.c +++ b/io_uring/kbuf.c @@ -9,6 +9,7 @@ #include #include #include +#include #include @@ -237,6 +238,51 @@ struct io_br_sel io_buffer_select(struct io_kiocb *req, size_t *len, return sel; } +int io_uring_buf_ring_pin(struct io_uring_cmd *cmd, unsigned buf_group, + unsigned issue_flags, struct io_buffer_list **bl) +{ + struct io_ring_ctx *ctx = cmd_to_io_kiocb(cmd)->ctx; + struct io_buffer_list *buffer_list; + int ret = -EINVAL; + + io_ring_submit_lock(ctx, issue_flags); + + buffer_list = io_buffer_get_list(ctx, buf_group); + if (buffer_list && (buffer_list->flags & IOBL_BUF_RING)) { + if (unlikely(buffer_list->flags & IOBL_PINNED)) { + ret = -EALREADY; + } else { + buffer_list->flags |= IOBL_PINNED; + ret = 0; + *bl = buffer_list; + } + } + + io_ring_submit_unlock(ctx, issue_flags); + return ret; +} +EXPORT_SYMBOL_GPL(io_uring_buf_ring_pin); + +int io_uring_buf_ring_unpin(struct io_uring_cmd *cmd, unsigned buf_group, + unsigned issue_flags) +{ + struct io_ring_ctx *ctx = cmd_to_io_kiocb(cmd)->ctx; + struct io_buffer_list *bl; + int ret = -EINVAL; + + io_ring_submit_lock(ctx, issue_flags); + + bl = io_buffer_get_list(ctx, buf_group); + if (bl && (bl->flags & IOBL_BUF_RING) && (bl->flags & IOBL_PINNED)) { + bl->flags &= ~IOBL_PINNED; + ret = 0; + } + + io_ring_submit_unlock(ctx, issue_flags); + return ret; +} +EXPORT_SYMBOL_GPL(io_uring_buf_ring_unpin); + /* cap it at a reasonable 256, will be one page even for 4K */ #define PEEK_MAX_IMPORT 256 @@ -747,6 +793,8 @@ int 
io_unregister_buf_ring(struct io_ring_ctx *ctx, void __user *arg) return -ENOENT; if (!(bl->flags & IOBL_BUF_RING)) return -EINVAL; + if (bl->flags & IOBL_PINNED) + return -EBUSY; scoped_guard(mutex, &ctx->mmap_lock) xa_erase(&ctx->io_bl_xa, bl->bgid); diff --git a/io_uring/kbuf.h b/io_uring/kbuf.h index 11d165888b8e..781630c2cc10 100644 --- a/io_uring/kbuf.h +++ b/io_uring/kbuf.h @@ -12,6 +12,11 @@ enum { IOBL_INC = 2, /* buffers are kernel managed */ IOBL_KERNEL_MANAGED = 4, + /* + * buffer ring is pinned and cannot be unregistered by userspace until + * it has been unpinned + */ + IOBL_PINNED = 8, }; struct io_buffer_list { -- 2.47.3 Add an interface for buffers to be recycled back into a kernel-managed buffer ring. This is a preparatory patch for fuse over io-uring. Signed-off-by: Joanne Koong --- include/linux/io_uring/cmd.h | 11 +++++++++ io_uring/kbuf.c | 44 ++++++++++++++++++++++++++++++++++++ 2 files changed, 55 insertions(+) diff --git a/include/linux/io_uring/cmd.h b/include/linux/io_uring/cmd.h index 702b1903e6ee..a488e945f883 100644 --- a/include/linux/io_uring/cmd.h +++ b/include/linux/io_uring/cmd.h @@ -88,6 +88,10 @@ int io_uring_buf_ring_pin(struct io_uring_cmd *cmd, unsigned buf_group, unsigned issue_flags, struct io_buffer_list **bl); int io_uring_buf_ring_unpin(struct io_uring_cmd *cmd, unsigned buf_group, unsigned issue_flags); + +int io_uring_kmbuf_recycle(struct io_uring_cmd *cmd, unsigned int buf_group, + u64 addr, unsigned int len, unsigned int bid, + unsigned int issue_flags); #else static inline int io_uring_cmd_import_fixed(u64 ubuf, unsigned long len, int rw, @@ -143,6 +147,13 @@ static inline int io_uring_buf_ring_unpin(struct io_uring_cmd *cmd, { return -EOPNOTSUPP; } +static inline int io_uring_kmbuf_recycle(struct io_uring_cmd *cmd, + unsigned int buf_group, u64 addr, + unsigned int len, unsigned int bid, + unsigned int issue_flags) +{ + return -EOPNOTSUPP; +} #endif static inline struct io_uring_cmd *io_uring_cmd_from_tw(struct 
io_tw_req tw_req) diff --git a/io_uring/kbuf.c b/io_uring/kbuf.c index dee1764ed19f..17b6178be4ce 100644 --- a/io_uring/kbuf.c +++ b/io_uring/kbuf.c @@ -102,6 +102,50 @@ void io_kbuf_drop_legacy(struct io_kiocb *req) req->kbuf = NULL; } +int io_uring_kmbuf_recycle(struct io_uring_cmd *cmd, unsigned int buf_group, + u64 addr, unsigned int len, unsigned int bid, + unsigned int issue_flags) +{ + struct io_kiocb *req = cmd_to_io_kiocb(cmd); + struct io_ring_ctx *ctx = req->ctx; + struct io_uring_buf_ring *br; + struct io_uring_buf *buf; + struct io_buffer_list *bl; + int ret = -EINVAL; + + if (WARN_ON_ONCE(req->flags & REQ_F_BUFFERS_COMMIT)) + return ret; + + io_ring_submit_lock(ctx, issue_flags); + + bl = io_buffer_get_list(ctx, buf_group); + + if (!bl || WARN_ON_ONCE(!(bl->flags & IOBL_BUF_RING)) || + WARN_ON_ONCE(!(bl->flags & IOBL_KERNEL_MANAGED))) + goto done; + + br = bl->buf_ring; + + if (WARN_ON_ONCE((br->tail - bl->head) >= bl->nr_entries)) + goto done; + + buf = &br->bufs[(br->tail) & bl->mask]; + + buf->addr = addr; + buf->len = len; + buf->bid = bid; + + req->flags &= ~REQ_F_BUFFER_RING; + + br->tail++; + ret = 0; + +done: + io_ring_submit_unlock(ctx, issue_flags); + return ret; +} +EXPORT_SYMBOL_GPL(io_uring_kmbuf_recycle); + bool io_kbuf_recycle_legacy(struct io_kiocb *req, unsigned issue_flags) { struct io_ring_ctx *ctx = req->ctx; -- 2.47.3 io_uring_is_kmbuf_ring() returns true if there is a kernel-managed buffer ring at the specified buffer group. This is a preparatory patch for upcoming fuse kernel-managed buffer support, which needs to ensure the buffer ring registered by the server is a kernel-managed buffer ring. 
Signed-off-by: Joanne Koong --- include/linux/io_uring/cmd.h | 9 +++++++++ io_uring/kbuf.c | 20 ++++++++++++++++++++ 2 files changed, 29 insertions(+) diff --git a/include/linux/io_uring/cmd.h b/include/linux/io_uring/cmd.h index a488e945f883..04a937f6f4d3 100644 --- a/include/linux/io_uring/cmd.h +++ b/include/linux/io_uring/cmd.h @@ -92,6 +92,9 @@ int io_uring_buf_ring_unpin(struct io_uring_cmd *cmd, unsigned buf_group, int io_uring_kmbuf_recycle(struct io_uring_cmd *cmd, unsigned int buf_group, u64 addr, unsigned int len, unsigned int bid, unsigned int issue_flags); + +bool io_uring_is_kmbuf_ring(struct io_uring_cmd *cmd, unsigned int buf_group, + unsigned int issue_flags); #else static inline int io_uring_cmd_import_fixed(u64 ubuf, unsigned long len, int rw, @@ -154,6 +157,12 @@ static inline int io_uring_kmbuf_recycle(struct io_uring_cmd *cmd, { return -EOPNOTSUPP; } +static inline bool io_uring_is_kmbuf_ring(struct io_uring_cmd *cmd, + unsigned int buf_group, + unsigned int issue_flags) +{ + return false; +} #endif static inline struct io_uring_cmd *io_uring_cmd_from_tw(struct io_tw_req tw_req) diff --git a/io_uring/kbuf.c b/io_uring/kbuf.c index 17b6178be4ce..797cc2f0a5e9 100644 --- a/io_uring/kbuf.c +++ b/io_uring/kbuf.c @@ -963,3 +963,23 @@ int io_register_kmbuf_ring(struct io_ring_ctx *ctx, void __user *arg) return ret; } + +bool io_uring_is_kmbuf_ring(struct io_uring_cmd *cmd, unsigned int buf_group, + unsigned int issue_flags) +{ + struct io_ring_ctx *ctx = cmd_to_io_kiocb(cmd)->ctx; + struct io_buffer_list *bl; + bool is_kmbuf_ring = false; + + io_ring_submit_lock(ctx, issue_flags); + + bl = io_buffer_get_list(ctx, buf_group); + if (likely(bl) && (bl->flags & IOBL_KERNEL_MANAGED)) { + WARN_ON_ONCE(!(bl->flags & IOBL_BUF_RING)); + is_kmbuf_ring = true; + } + + io_ring_submit_unlock(ctx, issue_flags); + return is_kmbuf_ring; +} +EXPORT_SYMBOL_GPL(io_uring_is_kmbuf_ring); -- 2.47.3 Export io_ring_buffer_select() so that it may be used by callers who pass 
in a pinned bufring without needing to grab the io_uring mutex. This is a preparatory patch that will be needed by fuse io-uring, which will need to select a buffer from a kernel-managed bufring while the uring mutex may already be held by in-progress commits, and may need to select a buffer in atomic contexts. Signed-off-by: Joanne Koong --- include/linux/io_uring/cmd.h | 14 ++++++++++++++ io_uring/kbuf.c | 7 ++++--- 2 files changed, 18 insertions(+), 3 deletions(-) diff --git a/include/linux/io_uring/cmd.h b/include/linux/io_uring/cmd.h index 04a937f6f4d3..d4b5943bdeb1 100644 --- a/include/linux/io_uring/cmd.h +++ b/include/linux/io_uring/cmd.h @@ -95,6 +95,10 @@ int io_uring_kmbuf_recycle(struct io_uring_cmd *cmd, unsigned int buf_group, bool io_uring_is_kmbuf_ring(struct io_uring_cmd *cmd, unsigned int buf_group, unsigned int issue_flags); + +struct io_br_sel io_ring_buffer_select(struct io_kiocb *req, size_t *len, + struct io_buffer_list *bl, + unsigned int issue_flags); #else static inline int io_uring_cmd_import_fixed(u64 ubuf, unsigned long len, int rw, @@ -163,6 +167,16 @@ static inline bool io_uring_is_kmbuf_ring(struct io_uring_cmd *cmd, { return false; } +static inline struct io_br_sel io_ring_buffer_select(struct io_kiocb *req, + size_t *len, + struct io_buffer_list *bl, + unsigned int issue_flags) +{ + struct io_br_sel sel = { + .val = -EOPNOTSUPP, + }; + return sel; +} #endif static inline struct io_uring_cmd *io_uring_cmd_from_tw(struct io_tw_req tw_req) diff --git a/io_uring/kbuf.c b/io_uring/kbuf.c index 797cc2f0a5e9..9a93f10d3214 100644 --- a/io_uring/kbuf.c +++ b/io_uring/kbuf.c @@ -226,9 +226,9 @@ static bool io_should_commit(struct io_kiocb *req, struct io_buffer_list *bl, return false; } -static struct io_br_sel io_ring_buffer_select(struct io_kiocb *req, size_t *len, - struct io_buffer_list *bl, - unsigned int issue_flags) +struct io_br_sel io_ring_buffer_select(struct io_kiocb *req, size_t *len, + struct io_buffer_list *bl, + unsigned int 
issue_flags) { struct io_uring_buf_ring *br = bl->buf_ring; __u16 tail, head = bl->head; @@ -261,6 +261,7 @@ static struct io_br_sel io_ring_buffer_select(struct io_kiocb *req, size_t *len, } return sel; } +EXPORT_SYMBOL_GPL(io_ring_buffer_select); struct io_br_sel io_buffer_select(struct io_kiocb *req, size_t *len, unsigned buf_group, unsigned int issue_flags) -- 2.47.3 Return the id of the selected buffer in io_buffer_select(). This is needed for kernel-managed buffer rings to later recycle the selected buffer. Signed-off-by: Joanne Koong --- include/linux/io_uring/cmd.h | 2 +- include/linux/io_uring_types.h | 2 ++ io_uring/kbuf.c | 7 +++++-- 3 files changed, 8 insertions(+), 3 deletions(-) diff --git a/include/linux/io_uring/cmd.h b/include/linux/io_uring/cmd.h index d4b5943bdeb1..94df2bdebe77 100644 --- a/include/linux/io_uring/cmd.h +++ b/include/linux/io_uring/cmd.h @@ -71,7 +71,7 @@ void io_uring_cmd_issue_blocking(struct io_uring_cmd *ioucmd); /* * Select a buffer from the provided buffer group for multishot uring_cmd. - * Returns the selected buffer address and size. + * Returns the selected buffer address, size, and id. 
*/ struct io_br_sel io_uring_cmd_buffer_select(struct io_uring_cmd *ioucmd, unsigned buf_group, size_t *len, diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h index 36cc2e0346d9..5a56bb341337 100644 --- a/include/linux/io_uring_types.h +++ b/include/linux/io_uring_types.h @@ -100,6 +100,8 @@ struct io_br_sel { void *kaddr; }; ssize_t val; + /* id of the selected buffer */ + unsigned buf_id; }; diff --git a/io_uring/kbuf.c b/io_uring/kbuf.c index 9a93f10d3214..24c1e34ea23e 100644 --- a/io_uring/kbuf.c +++ b/io_uring/kbuf.c @@ -250,6 +250,7 @@ struct io_br_sel io_ring_buffer_select(struct io_kiocb *req, size_t *len, req->flags |= REQ_F_BUFFER_RING | REQ_F_BUFFERS_COMMIT; req->buf_index = READ_ONCE(buf->bid); sel.buf_list = bl; + sel.buf_id = req->buf_index; if (bl->flags & IOBL_KERNEL_MANAGED) sel.kaddr = (void *)(uintptr_t)READ_ONCE(buf->addr); else @@ -274,10 +275,12 @@ struct io_br_sel io_buffer_select(struct io_kiocb *req, size_t *len, bl = io_buffer_get_list(ctx, buf_group); if (likely(bl)) { - if (bl->flags & IOBL_BUF_RING) + if (bl->flags & IOBL_BUF_RING) { sel = io_ring_buffer_select(req, len, bl, issue_flags); - else + } else { sel.addr = io_provided_buffer_select(req, len, bl); + sel.buf_id = req->buf_index; + } } io_ring_submit_unlock(req->ctx, issue_flags); return sel; -- 2.47.3 When uring_cmd operations select a buffer, the completion queue entry should indicate which buffer was selected. Set IORING_CQE_F_BUFFER on the completed entry and encode the buffer index if a buffer was selected. This will be needed for fuse, which needs to relay to userspace which selected buffer contains the data. 
Signed-off-by: Joanne Koong --- io_uring/uring_cmd.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/io_uring/uring_cmd.c b/io_uring/uring_cmd.c index ee7b49f47cb5..6d38df1a812d 100644 --- a/io_uring/uring_cmd.c +++ b/io_uring/uring_cmd.c @@ -151,6 +151,7 @@ void __io_uring_cmd_done(struct io_uring_cmd *ioucmd, s32 ret, u64 res2, unsigned issue_flags, bool is_cqe32) { struct io_kiocb *req = cmd_to_io_kiocb(ioucmd); + u32 cflags = 0; if (WARN_ON_ONCE(req->flags & REQ_F_APOLL_MULTISHOT)) return; @@ -160,7 +161,10 @@ void __io_uring_cmd_done(struct io_uring_cmd *ioucmd, s32 ret, u64 res2, if (ret < 0) req_set_fail(req); - io_req_set_res(req, ret, 0); + if (req->flags & (REQ_F_BUFFER_SELECTED | REQ_F_BUFFER_RING)) + cflags |= IORING_CQE_F_BUFFER | + (req->buf_index << IORING_CQE_BUFFER_SHIFT); + io_req_set_res(req, ret, cflags); if (is_cqe32) { if (req->ctx->flags & IORING_SETUP_CQE_MIXED) req->cqe.flags |= IORING_CQE_F_32; -- 2.47.3