Currently, most userfaultfd features are implemented directly in the core mm, which invokes VMA-specific functions whenever necessary. So far this has been fine, because userfaultfd interacts almost exclusively with shmem and hugetlbfs.

Introduce a generic userfaultfd API extension for vm_operations_struct, so that any code that implements vm_operations_struct (including kernel modules that can be compiled separately from the kernel core) can support userfaults without modifying core mm files. With this API applied, a module that wants to support userfaultfd only needs to properly define vm_uffd_ops and hook it up to vm_operations_struct, instead of changing anything in core mm.

This API will not work for anonymous memory. Handling of userfault operations for anonymous memory remains unchanged in core mm.

Due to a security concern raised while reviewing older versions of this series [1], uffd_copy() is temporarily removed from the API. IOW, MISSING-capable memory types can only be hard-coded and implemented in mm/; this affects both UFFDIO_COPY and UFFDIO_ZEROPAGE. The other hooks can still be provided via vm_uffd_ops.

This patch only introduces the API, so that existing userfaultfd users can be moved over without breaking them.

[1] https://lore.kernel.org/all/20250627154655.2085903-1-peterx@redhat.com/

Signed-off-by: Peter Xu
---
 include/linux/mm.h            |  9 +++++++++
 include/linux/userfaultfd_k.h | 29 +++++++++++++++++++++++++++++
 2 files changed, 38 insertions(+)

diff --git a/include/linux/mm.h b/include/linux/mm.h
index 5c01c4b59ca67..011962130c148 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -620,6 +620,8 @@ struct vm_fault {
 	 */
 };
 
+struct vm_uffd_ops;
+
 /*
  * These are the virtual MM functions - opening of an area, closing and
  * unmapping it (needed to keep files on disk up-to-date etc), pointer
@@ -705,6 +707,13 @@ struct vm_operations_struct {
 	struct page *(*find_normal_page)(struct vm_area_struct *vma,
 					 unsigned long addr);
 #endif /* CONFIG_FIND_NORMAL_PAGE */
+#ifdef CONFIG_USERFAULTFD
+	/*
+	 * Userfaultfd related ops.  Modules need to define this to support
+	 * userfaultfd.
+	 */
+	const struct vm_uffd_ops *userfaultfd_ops;
+#endif
 };
 
 #ifdef CONFIG_NUMA_BALANCING
diff --git a/include/linux/userfaultfd_k.h b/include/linux/userfaultfd_k.h
index c0e716aec26aa..b5b4f3f174b32 100644
--- a/include/linux/userfaultfd_k.h
+++ b/include/linux/userfaultfd_k.h
@@ -92,6 +92,35 @@ enum mfill_atomic_mode {
 	NR_MFILL_ATOMIC_MODES,
 };
 
+/* VMA userfaultfd operations */
+struct vm_uffd_ops {
+	/**
+	 * @supported_ioctls: bitmask of supported userfaultfd ioctls.
+	 *
+	 * Userfaultfd ioctls supported by the module.  Below will always
+	 * be supported by default whenever a module provides vm_uffd_ops:
+	 *
+	 *   _UFFDIO_API, _UFFDIO_REGISTER, _UFFDIO_UNREGISTER, _UFFDIO_WAKE
+	 *
+	 * The module needs to provide all other optionally supported
+	 * ioctls as a bitmask.  For example, a module needs to set the bit
+	 * BIT(_UFFDIO_CONTINUE) to support userfaultfd minor faults.
+	 */
+	unsigned long supported_ioctls;
+	/**
+	 * @minor_get_folio: Handler to resolve an UFFDIO_CONTINUE request.
+	 * Must be specified if _UFFDIO_CONTINUE is set in @supported_ioctls.
+	 *
+	 * @inode: the inode for folio lookup
+	 * @pgoff: the pgoff of the folio
+	 * @folio: returned folio pointer
+	 *
+	 * Return: zero if succeeded, negative for errors.
+	 */
+	int (*minor_get_folio)(struct inode *inode, pgoff_t pgoff,
+			       struct folio **folio);
+};
+
 #define MFILL_ATOMIC_MODE_BITS (const_ilog2(NR_MFILL_ATOMIC_MODES - 1) + 1)
 #define MFILL_ATOMIC_BIT(nr) BIT(MFILL_ATOMIC_MODE_BITS + (nr))
 #define MFILL_ATOMIC_FLAG(nr) ((__force uffd_flags_t) MFILL_ATOMIC_BIT(nr))
-- 
2.50.1
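For illustration, a minimal sketch of how an out-of-tree module could hook into the new API, assuming it only wants minor fault support (all the mydrv_* names, including the mydrv_lookup_folio() helper and mydrv_fault(), are made up for this example):

	static int mydrv_uffd_get_folio(struct inode *inode, pgoff_t pgoff,
					struct folio **folio)
	{
		/* Look up the folio backing @pgoff; should not allocate on miss */
		return mydrv_lookup_folio(inode, pgoff, folio);
	}

	static const struct vm_uffd_ops mydrv_uffd_ops = {
		/* Minor faults only; MISSING support stays hard-coded in mm/ */
		.supported_ioctls = BIT(_UFFDIO_CONTINUE),
		.minor_get_folio = mydrv_uffd_get_folio,
	};

	static const struct vm_operations_struct mydrv_vm_ops = {
		.fault = mydrv_fault,
	#ifdef CONFIG_USERFAULTFD
		.userfaultfd_ops = &mydrv_uffd_ops,
	#endif
	};

No changes to core mm would be needed beyond what this patch introduces.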
Add support for the new vm_uffd_ops API to shmem. Note that this only introduces the support; the API is not yet used by core mm. Shmem only needs a separate minor_get_folio() definition, and that is a one-liner.

Due to the limitation of the current vm_uffd_ops on MISSING mode support, the shmem UFFDIO_COPY/ZEROPAGE paths are still hard-coded in mm/.

Cc: Hugh Dickins
Acked-by: Mike Rapoport
Signed-off-by: Peter Xu
---
 mm/shmem.c | 24 ++++++++++++++++++++++++
 1 file changed, 24 insertions(+)

diff --git a/mm/shmem.c b/mm/shmem.c
index b50ce7dbc84a0..0be112689f9e5 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -3146,6 +3146,13 @@ static inline struct inode *shmem_get_inode(struct mnt_idmap *idmap,
 #endif /* CONFIG_TMPFS_QUOTA */
 
 #ifdef CONFIG_USERFAULTFD
+
+static int shmem_uffd_get_folio(struct inode *inode, pgoff_t pgoff,
+				struct folio **folio)
+{
+	return shmem_get_folio(inode, pgoff, 0, folio, SGP_NOALLOC);
+}
+
 int shmem_mfill_atomic_pte(pmd_t *dst_pmd,
 			   struct vm_area_struct *dst_vma,
 			   unsigned long dst_addr,
@@ -5189,6 +5196,17 @@ static int shmem_error_remove_folio(struct address_space *mapping,
 	return 0;
 }
 
+#ifdef CONFIG_USERFAULTFD
+static const struct vm_uffd_ops shmem_uffd_ops = {
+	.supported_ioctls = BIT(_UFFDIO_COPY) |
+			    BIT(_UFFDIO_ZEROPAGE) |
+			    BIT(_UFFDIO_WRITEPROTECT) |
+			    BIT(_UFFDIO_CONTINUE) |
+			    BIT(_UFFDIO_POISON),
+	.minor_get_folio = shmem_uffd_get_folio,
+};
+#endif
+
 static const struct address_space_operations shmem_aops = {
 	.dirty_folio = noop_dirty_folio,
 #ifdef CONFIG_TMPFS
@@ -5291,6 +5309,9 @@ static const struct vm_operations_struct shmem_vm_ops = {
 	.set_policy = shmem_set_policy,
 	.get_policy = shmem_get_policy,
 #endif
+#ifdef CONFIG_USERFAULTFD
+	.userfaultfd_ops = &shmem_uffd_ops,
+#endif
 };
 
 static const struct vm_operations_struct shmem_anon_vm_ops = {
@@ -5300,6 +5321,9 @@ static const struct vm_operations_struct shmem_anon_vm_ops = {
 	.set_policy = shmem_set_policy,
 	.get_policy = shmem_get_policy,
 #endif
+#ifdef CONFIG_USERFAULTFD
+	.userfaultfd_ops = &shmem_uffd_ops,
+#endif
 };
 
 int shmem_init_fs_context(struct fs_context *fc)
-- 
2.50.1
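As a usage sketch (not part of this patch), once shmem publishes vm_uffd_ops, registering a shmem mapping in minor mode should report _UFFDIO_CONTINUE back to userspace. Here uffd is assumed to be a userfaultfd(2) descriptor that already negotiated UFFD_API with UFFD_FEATURE_MINOR_SHMEM, and addr/len describe an existing shmem (e.g. memfd) mapping:

	#include <assert.h>
	#include <err.h>
	#include <stddef.h>
	#include <sys/ioctl.h>
	#include <linux/userfaultfd.h>

	static void register_minor(int uffd, void *addr, size_t len)
	{
		struct uffdio_register reg = {
			.range = { .start = (unsigned long)addr, .len = len },
			.mode = UFFDIO_REGISTER_MODE_MINOR,
		};

		if (ioctl(uffd, UFFDIO_REGISTER, &reg))
			err(1, "UFFDIO_REGISTER");
		/* BIT(_UFFDIO_CONTINUE) in shmem_uffd_ops ends up here */
		assert(reg.ioctls & (1ULL << _UFFDIO_CONTINUE));
	}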
Add support for the new vm_uffd_ops API to hugetlb. Note that this only introduces the support; the API is not yet used by core mm.

Due to legacy reasons, it is still not trivial to move hugetlb completely over to the API. However, hugetlb will still fill in supported_ioctls properly, with all the implementations remaining hard-coded in mm/.

Cc: Muchun Song
Cc: Oscar Salvador
Acked-by: Mike Rapoport
Signed-off-by: Peter Xu
---
 mm/hugetlb.c | 18 ++++++++++++++++++
 1 file changed, 18 insertions(+)

diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 657b9facba280..ad98876d338e5 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -5503,6 +5503,21 @@ static vm_fault_t hugetlb_vm_op_fault(struct vm_fault *vmf)
 	return 0;
 }
 
+#ifdef CONFIG_USERFAULTFD
+static const struct vm_uffd_ops hugetlb_uffd_ops = {
+	/* _UFFDIO_ZEROPAGE not supported */
+	.supported_ioctls = BIT(_UFFDIO_COPY) |
+			    BIT(_UFFDIO_WRITEPROTECT) |
+			    BIT(_UFFDIO_CONTINUE) |
+			    BIT(_UFFDIO_POISON),
+	/*
+	 * Hugetlbfs still has its own hard-coded handler in userfaultfd,
+	 * due to limitations similar to vm_operations_struct.fault().
+	 * TODO: generalize it to use the API functions.
+	 */
+};
+#endif
+
 /*
  * When a new function is introduced to vm_operations_struct and added
  * to hugetlb_vm_ops, please consider adding the function to shm_vm_ops.
@@ -5516,6 +5531,9 @@ const struct vm_operations_struct hugetlb_vm_ops = {
 	.close = hugetlb_vm_op_close,
 	.may_split = hugetlb_vm_op_split,
 	.pagesize = hugetlb_vm_op_pagesize,
+#ifdef CONFIG_USERFAULTFD
+	.userfaultfd_ops = &hugetlb_uffd_ops,
+#endif
 };
 
 static pte_t make_huge_pte(struct vm_area_struct *vma, struct folio *folio,
-- 
2.50.1
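To make the translation from supported_ioctls to registration modes concrete (the helper doing this, uffd_get_supported_modes(), is added in the next patch), hugetlb's table works out as:

	/*
	 * supported_ioctls = BIT(_UFFDIO_COPY) | BIT(_UFFDIO_WRITEPROTECT) |
	 *                    BIT(_UFFDIO_CONTINUE) | BIT(_UFFDIO_POISON)
	 *
	 *   _UFFDIO_COPY | _UFFDIO_ZEROPAGE -> VM_UFFD_MISSING (COPY is set)
	 *   _UFFDIO_WRITEPROTECT            -> VM_UFFD_WP
	 *   _UFFDIO_CONTINUE                -> VM_UFFD_MINOR
	 *
	 * so hugetlb keeps registering in MISSING|WP|MINOR modes, while
	 * UFFDIO_ZEROPAGE itself keeps failing with -EINVAL as before.
	 */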
Move the userfaultfd core over to the new vm_uffd_ops API. After this change, file systems that implement vm_operations_struct can start using the new API for userfaultfd operations.

While at it, move vma_can_userfault() into mm/userfaultfd.c, because it is getting too big. It is only used in slow paths, so that should not be an issue. Also move the pte marker check before the wp_async one, which might be more intuitive because wp_async depends on pte markers. That should not cause any functional change, though, because only one of the two checks takes effect, depending on whether pte markers were selected in the config.

This also removes quite a few hard-coded checks for shmem or hugetlbfs. All the old checks should still work, but now via vm_uffd_ops. Note that anonymous memory still needs to be handled separately, because it has no vm_ops at all.

Reviewed-by: James Houghton
Acked-by: Mike Rapoport
Signed-off-by: Peter Xu
---
 include/linux/userfaultfd_k.h |  46 ++++---------
 mm/userfaultfd.c              | 120 +++++++++++++++++++++++++++-------
 2 files changed, 109 insertions(+), 57 deletions(-)

diff --git a/include/linux/userfaultfd_k.h b/include/linux/userfaultfd_k.h
index b5b4f3f174b32..983efaecfdcb0 100644
--- a/include/linux/userfaultfd_k.h
+++ b/include/linux/userfaultfd_k.h
@@ -126,9 +126,14 @@ struct vm_uffd_ops {
 #define MFILL_ATOMIC_FLAG(nr) ((__force uffd_flags_t) MFILL_ATOMIC_BIT(nr))
 #define MFILL_ATOMIC_MODE_MASK ((__force uffd_flags_t) (MFILL_ATOMIC_BIT(0) - 1))
 
+static inline enum mfill_atomic_mode uffd_flags_get_mode(uffd_flags_t flags)
+{
+	return (__force enum mfill_atomic_mode)(flags & MFILL_ATOMIC_MODE_MASK);
+}
+
 static inline bool uffd_flags_mode_is(uffd_flags_t flags, enum mfill_atomic_mode expected)
 {
-	return (flags & MFILL_ATOMIC_MODE_MASK) == ((__force uffd_flags_t) expected);
+	return uffd_flags_get_mode(flags) == expected;
 }
 
 static inline uffd_flags_t uffd_flags_set_mode(uffd_flags_t flags, enum mfill_atomic_mode mode)
@@ -237,41 +242,16 @@ static inline bool userfaultfd_armed(struct vm_area_struct *vma)
 	return vma->vm_flags & __VM_UFFD_FLAGS;
 }
 
-static inline bool vma_can_userfault(struct vm_area_struct *vma,
-				     vm_flags_t vm_flags,
-				     bool wp_async)
+static inline const struct vm_uffd_ops *vma_get_uffd_ops(struct vm_area_struct *vma)
 {
-	vm_flags &= __VM_UFFD_FLAGS;
-
-	if (vma->vm_flags & VM_DROPPABLE)
-		return false;
-
-	if ((vm_flags & VM_UFFD_MINOR) &&
-	    (!is_vm_hugetlb_page(vma) && !vma_is_shmem(vma)))
-		return false;
-
-	/*
-	 * If wp async enabled, and WP is the only mode enabled, allow any
-	 * memory type.
-	 */
-	if (wp_async && (vm_flags == VM_UFFD_WP))
-		return true;
-
-#ifndef CONFIG_PTE_MARKER_UFFD_WP
-	/*
-	 * If user requested uffd-wp but not enabled pte markers for
-	 * uffd-wp, then shmem & hugetlbfs are not supported but only
-	 * anonymous.
-	 */
-	if ((vm_flags & VM_UFFD_WP) && !vma_is_anonymous(vma))
-		return false;
-#endif
-
-	/* By default, allow any of anon|shmem|hugetlb */
-	return vma_is_anonymous(vma) || is_vm_hugetlb_page(vma) ||
-	    vma_is_shmem(vma);
+	if (vma->vm_ops && vma->vm_ops->userfaultfd_ops)
+		return vma->vm_ops->userfaultfd_ops;
+	return NULL;
 }
 
+bool vma_can_userfault(struct vm_area_struct *vma,
+		       vm_flags_t vm_flags, bool wp_async);
+
 static inline bool vma_has_uffd_without_event_remap(struct vm_area_struct *vma)
 {
 	struct userfaultfd_ctx *uffd_ctx = vma->vm_userfaultfd_ctx.ctx;
diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c
index 00122f42718cc..b50db51e75d5b 100644
--- a/mm/userfaultfd.c
+++ b/mm/userfaultfd.c
@@ -20,6 +20,61 @@
 #include "internal.h"
 #include "swap.h"
 
+static unsigned long uffd_get_supported_modes(struct vm_area_struct *vma)
+{
+	const struct vm_uffd_ops *uffd_ops = vma_get_uffd_ops(vma);
+	unsigned long ioctls, supported = 0;
+
+	if (!uffd_ops)
+		return 0;
+
+	ioctls = uffd_ops->supported_ioctls;
+
+	if (ioctls & (BIT(_UFFDIO_COPY) | BIT(_UFFDIO_ZEROPAGE)))
+		supported |= VM_UFFD_MISSING;
+	if (ioctls & BIT(_UFFDIO_CONTINUE))
+		supported |= VM_UFFD_MINOR;
+	if (ioctls & BIT(_UFFDIO_WRITEPROTECT))
+		supported |= VM_UFFD_WP;
+
+	return supported;
+}
+
+bool vma_can_userfault(struct vm_area_struct *vma, vm_flags_t vm_flags,
+		       bool wp_async)
+{
+	unsigned long supported = 0;
+
+	if (vma->vm_flags & VM_DROPPABLE)
+		return false;
+
+	vm_flags &= __VM_UFFD_FLAGS;
+
+	/*
+	 * If the user requested uffd-wp but pte markers for uffd-wp are
+	 * not enabled, then no file system (like shmem or hugetlbfs) is
+	 * supported, only anonymous memory.
+	 */
+	if (!IS_ENABLED(CONFIG_PTE_MARKER_UFFD_WP) &&
+	    ((vm_flags & VM_UFFD_WP) && !vma_is_anonymous(vma)))
+		return false;
+
+	/*
+	 * If wp async enabled, and WP is the only mode enabled, allow any
+	 * memory type.
+	 */
+	if (wp_async && (vm_flags == VM_UFFD_WP))
+		return true;
+
+	if (vma_is_anonymous(vma))
+		/* Anonymous has no page cache, MINOR not supported */
+		supported = VM_UFFD_MISSING | VM_UFFD_WP;
+	else if (vma_get_uffd_ops(vma))
+		supported = uffd_get_supported_modes(vma);
+
+	return !(vm_flags & ~supported);
+}
+
 static __always_inline bool validate_dst_vma(struct vm_area_struct *dst_vma,
 					     unsigned long dst_end)
 {
@@ -382,13 +437,17 @@ static int mfill_atomic_pte_continue(pmd_t *dst_pmd,
 				     unsigned long dst_addr,
 				     uffd_flags_t flags)
 {
+	const struct vm_uffd_ops *uffd_ops = vma_get_uffd_ops(dst_vma);
 	struct inode *inode = file_inode(dst_vma->vm_file);
 	pgoff_t pgoff = linear_page_index(dst_vma, dst_addr);
 	struct folio *folio;
 	struct page *page;
 	int ret;
 
-	ret = shmem_get_folio(inode, pgoff, 0, &folio, SGP_NOALLOC);
+	if (WARN_ON_ONCE(!uffd_ops || !uffd_ops->minor_get_folio))
+		return -EINVAL;
+
+	ret = uffd_ops->minor_get_folio(inode, pgoff, &folio);
 	/* Our caller expects us to return -EFAULT if we failed to find folio */
 	if (ret == -ENOENT)
 		ret = -EFAULT;
@@ -504,18 +563,6 @@ static __always_inline ssize_t mfill_atomic_hugetlb(
 	u32 hash;
 	struct address_space *mapping;
 
-	/*
-	 * There is no default zero huge page for all huge page sizes as
-	 * supported by hugetlb.  A PMD_SIZE huge pages may exist as used
-	 * by THP.  Since we can not reliably insert a zero page, this
-	 * feature is not supported.
-	 */
-	if (uffd_flags_mode_is(flags, MFILL_ATOMIC_ZEROPAGE)) {
-		up_read(&ctx->map_changing_lock);
-		uffd_mfill_unlock(dst_vma);
-		return -EINVAL;
-	}
-
 	src_addr = src_start;
 	dst_addr = dst_start;
 	copied = 0;
@@ -694,6 +741,41 @@ static __always_inline ssize_t mfill_atomic_pte(pmd_t *dst_pmd,
 	return err;
 }
 
+static inline bool
+vma_uffd_ops_supported(struct vm_area_struct *vma, uffd_flags_t flags)
+{
+	enum mfill_atomic_mode mode = uffd_flags_get_mode(flags);
+	const struct vm_uffd_ops *uffd_ops;
+	unsigned long supported_ioctls;
+
+	if ((flags & MFILL_ATOMIC_WP) && !(vma->vm_flags & VM_UFFD_WP))
+		return false;
+
+	/* Anonymous supports everything except CONTINUE */
+	if (vma_is_anonymous(vma))
+		return mode != MFILL_ATOMIC_CONTINUE;
+
+	uffd_ops = vma_get_uffd_ops(vma);
+	if (!uffd_ops)
+		return false;
+
+	supported_ioctls = uffd_ops->supported_ioctls;
+	switch (mode) {
+	case MFILL_ATOMIC_COPY:
+		return supported_ioctls & BIT(_UFFDIO_COPY);
+	case MFILL_ATOMIC_ZEROPAGE:
+		return supported_ioctls & BIT(_UFFDIO_ZEROPAGE);
+	case MFILL_ATOMIC_CONTINUE:
+		if (!(vma->vm_flags & VM_SHARED))
+			return false;
+		return supported_ioctls & BIT(_UFFDIO_CONTINUE);
+	case MFILL_ATOMIC_POISON:
+		return supported_ioctls & BIT(_UFFDIO_POISON);
+	default:
+		return false;
+	}
+}
+
 static __always_inline ssize_t mfill_atomic(struct userfaultfd_ctx *ctx,
 					    unsigned long dst_start,
 					    unsigned long src_start,
@@ -752,11 +834,7 @@ static __always_inline ssize_t mfill_atomic(struct userfaultfd_ctx *ctx,
 		    dst_vma->vm_flags & VM_SHARED))
 		goto out_unlock;
 
-	/*
-	 * validate 'mode' now that we know the dst_vma: don't allow
-	 * a wrprotect copy if the userfaultfd didn't register as WP.
-	 */
-	if ((flags & MFILL_ATOMIC_WP) && !(dst_vma->vm_flags & VM_UFFD_WP))
+	if (!vma_uffd_ops_supported(dst_vma, flags))
 		goto out_unlock;
 
 	/*
@@ -766,12 +844,6 @@ static __always_inline ssize_t mfill_atomic(struct userfaultfd_ctx *ctx,
 		return mfill_atomic_hugetlb(ctx, dst_vma, dst_start,
 					    src_start, len, flags);
 
-	if (!vma_is_anonymous(dst_vma) && !vma_is_shmem(dst_vma))
-		goto out_unlock;
-	if (!vma_is_shmem(dst_vma) &&
-	    uffd_flags_mode_is(flags, MFILL_ATOMIC_CONTINUE))
-		goto out_unlock;
-
 	while (src_addr < src_start + len) {
 		pmd_t dst_pmdval;
 
-- 
2.50.1
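For completeness, the userspace counterpart that the minor_get_folio() path serves looks roughly like the sketch below (error handling trimmed; page_size is an assumption of the example, and the page cache must already be populated, e.g. through a second mapping of the same file):

	#include <assert.h>
	#include <err.h>
	#include <sys/ioctl.h>
	#include <linux/userfaultfd.h>

	static void handle_minor_fault(int uffd, const struct uffd_msg *msg,
				       unsigned long page_size)
	{
		struct uffdio_continue cont = { .mode = 0 };

		assert(msg->event == UFFD_EVENT_PAGEFAULT);
		assert(msg->arg.pagefault.flags & UFFD_PAGEFAULT_FLAG_MINOR);

		cont.range.start = msg->arg.pagefault.address & ~(page_size - 1);
		cont.range.len = page_size;
		/* Kernel-side this resolves the folio via minor_get_folio() */
		if (ioctl(uffd, UFFDIO_CONTINUE, &cont))
			err(1, "UFFDIO_CONTINUE");
	}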