From: "Kiryl Shutsemau (Meta)"

Wire the fault side of read-write protection tracking and turn the
userspace interface on.

An RWP-protected PTE is PAGE_NONE with the uffd bit set. PROT_NONE
triggers a fault on any access; the uffd bit distinguishes it from
plain mprotect(PROT_NONE) or NUMA hinting.

Fault dispatch, per level:

  PTE      handle_pte_fault()   -> do_uffd_rwp()
  PMD      __handle_mm_fault()  -> do_huge_pmd_uffd_rwp()
  hugetlb  hugetlb_fault()      -> hugetlb_handle_userfault()

The PTE and PMD branches gate on userfaultfd_pte_rwp() /
userfaultfd_huge_pmd_rwp() (VM_UFFD_RWP plus the uffd bit) and fall
through to do_numa_page() / do_huge_pmd_numa_page() otherwise. The
hugetlb branch checks userfaultfd_rwp() and huge_pte_uffd() directly
and otherwise continues with normal hugetlb fault handling. Each
delivers a UFFD_PAGEFAULT_FLAG_RWP message through handle_userfault();
the handler resolves it with UFFDIO_RWPROTECT, clearing MODE_RWP.

userfaultfd_must_wait() and userfaultfd_huge_must_wait() gain matching
protnone+uffd checks so sync-mode fault handlers block correctly.

Expose the UAPI:

  UFFDIO_REGISTER_MODE_RWP -> UFFD_API_REGISTER_MODES
  UFFD_FEATURE_RWP         -> UFFD_API_FEATURES
  _UFFDIO_RWPROTECT        -> UFFD_API_RANGE_IOCTLS
                              UFFD_API_RANGE_IOCTLS_BASIC

UFFD_FEATURE_RWP is masked out at UFFDIO_API time when RWP is
unavailable at compile time (PROT_NONE is not available, or
VM_UFFD_RWP aliases VM_NONE on 32-bit) or when pgtable_supports_uffd()
reports at runtime that the uffd PTE bit is not usable, so userspace
never sees an advertised-but-broken feature.

Works on anonymous, shmem, and hugetlb memory.

Signed-off-by: Kiryl Shutsemau Assisted-by: Claude:claude-opus-4-6 Reviewed-by: Mike Rapoport (Microsoft) --- fs/userfaultfd.c | 32 ++++++++++++++++++++++++++++++-- include/linux/huge_mm.h | 7 +++++++ include/linux/userfaultfd_k.h | 24 ++++++++++++++++++++++++ include/uapi/linux/userfaultfd.h | 12 ++++++++---- mm/huge_memory.c | 5 +++++ mm/hugetlb.c | 11 +++++++++++ mm/memory.c | 21 +++++++++++++++++++-- 7 files changed, 104 insertions(+), 8 deletions(-) diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c index f2097c558165..f8f1619f5183 100644 --- a/fs/userfaultfd.c +++ b/fs/userfaultfd.c @@ -261,6 +261,12 @@ static inline bool userfaultfd_huge_must_wait(struct userfaultfd_ctx *ctx, */ if (!huge_pte_write(pte) && (reason & VM_UFFD_WP)) return true; + /* + * PTE is still RW-protected (protnone with uffd bit), wait for + * resolution. Plain PROT_NONE without the marker is not an RWP fault. + */ + if (pte_protnone(pte) && huge_pte_uffd(pte) && (reason & VM_UFFD_RWP)) + return true; return false; } @@ -321,8 +327,14 @@ static inline bool userfaultfd_must_wait(struct userfaultfd_ctx *ctx, if (!pmd_present(_pmd)) return false; - if (pmd_trans_huge(_pmd)) - return !pmd_write(_pmd) && (reason & VM_UFFD_WP); + if (pmd_trans_huge(_pmd)) { + if (!pmd_write(_pmd) && (reason & VM_UFFD_WP)) + return true; + if (pmd_protnone(_pmd) && pmd_uffd(_pmd) && + (reason & VM_UFFD_RWP)) + return true; + return false; + } pte = pte_offset_map(pmd, address); if (!pte) @@ -347,6 +359,13 @@ static inline bool userfaultfd_must_wait(struct userfaultfd_ctx *ctx, */ if (!pte_write(ptent) && (reason & VM_UFFD_WP)) goto out; + /* + * PTE is still RW-protected (protnone with uffd bit), wait for + * userspace to resolve. Plain PROT_NONE without the marker is not + * an RWP fault. 
+ */ + if (pte_protnone(ptent) && pte_uffd(ptent) && (reason & VM_UFFD_RWP)) + goto out; ret = false; out: @@ -2086,6 +2105,15 @@ static int userfaultfd_api(struct userfaultfd_ctx *ctx, uffdio_api.features &= ~UFFD_FEATURE_WP_UNPOPULATED; uffdio_api.features &= ~UFFD_FEATURE_WP_ASYNC; } + /* + * RWP needs both PROT_NONE support and the uffd-wp PTE bit. The + * VM_UFFD_RWP check covers compile-time unavailability; the + * pgtable_supports_uffd() check covers runtime (e.g. riscv + * without the SVRSW60T59B extension) where the PTE bit is declared + * but not actually usable. + */ + if (VM_UFFD_RWP == VM_NONE || !pgtable_supports_uffd()) + uffdio_api.features &= ~UFFD_FEATURE_RWP; ret = -EINVAL; if (features & ~uffdio_api.features) diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index 2949e5acff35..e980909ee49e 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -520,6 +520,8 @@ static inline bool folio_test_pmd_mappable(struct folio *folio) vm_fault_t do_huge_pmd_numa_page(struct vm_fault *vmf); +vm_fault_t do_huge_pmd_uffd_rwp(struct vm_fault *vmf); + vm_fault_t do_huge_pmd_device_private(struct vm_fault *vmf); extern struct folio *huge_zero_folio; @@ -702,6 +704,11 @@ static inline spinlock_t *pud_trans_huge_lock(pud_t *pud, return NULL; } +static inline vm_fault_t do_huge_pmd_uffd_rwp(struct vm_fault *vmf) +{ + return 0; +} + static inline vm_fault_t do_huge_pmd_numa_page(struct vm_fault *vmf) { return 0; diff --git a/include/linux/userfaultfd_k.h b/include/linux/userfaultfd_k.h index d46974be864e..1beae4f2f479 100644 --- a/include/linux/userfaultfd_k.h +++ b/include/linux/userfaultfd_k.h @@ -247,6 +247,18 @@ static inline bool userfaultfd_huge_pmd_wp(struct vm_area_struct *vma, return userfaultfd_wp(vma) && pmd_uffd(pmd); } +static inline bool userfaultfd_pte_rwp(struct vm_area_struct *vma, + pte_t pte) +{ + return userfaultfd_rwp(vma) && pte_uffd(pte); +} + +static inline bool userfaultfd_huge_pmd_rwp(struct vm_area_struct *vma, 
+ pmd_t pmd) +{ + return userfaultfd_rwp(vma) && pmd_uffd(pmd); +} + static inline bool userfaultfd_armed(struct vm_area_struct *vma) { return vma->vm_flags & __VM_UFFD_FLAGS; @@ -399,6 +411,18 @@ static inline bool userfaultfd_huge_pmd_wp(struct vm_area_struct *vma, return false; } +static inline bool userfaultfd_pte_rwp(struct vm_area_struct *vma, + pte_t pte) +{ + return false; +} + +static inline bool userfaultfd_huge_pmd_rwp(struct vm_area_struct *vma, + pmd_t pmd) +{ + return false; +} + static inline bool userfaultfd_armed(struct vm_area_struct *vma) { return false; diff --git a/include/uapi/linux/userfaultfd.h b/include/uapi/linux/userfaultfd.h index 7b78aa3b5318..d803e76d47ad 100644 --- a/include/uapi/linux/userfaultfd.h +++ b/include/uapi/linux/userfaultfd.h @@ -25,7 +25,8 @@ #define UFFD_API ((__u64)0xAA) #define UFFD_API_REGISTER_MODES (UFFDIO_REGISTER_MODE_MISSING | \ UFFDIO_REGISTER_MODE_WP | \ - UFFDIO_REGISTER_MODE_MINOR) + UFFDIO_REGISTER_MODE_MINOR | \ + UFFDIO_REGISTER_MODE_RWP) #define UFFD_API_FEATURES (UFFD_FEATURE_PAGEFAULT_FLAG_WP | \ UFFD_FEATURE_EVENT_FORK | \ UFFD_FEATURE_EVENT_REMAP | \ @@ -42,7 +43,8 @@ UFFD_FEATURE_WP_UNPOPULATED | \ UFFD_FEATURE_POISON | \ UFFD_FEATURE_WP_ASYNC | \ - UFFD_FEATURE_MOVE) + UFFD_FEATURE_MOVE | \ + UFFD_FEATURE_RWP) #define UFFD_API_IOCTLS \ ((__u64)1 << _UFFDIO_REGISTER | \ (__u64)1 << _UFFDIO_UNREGISTER | \ @@ -54,13 +56,15 @@ (__u64)1 << _UFFDIO_MOVE | \ (__u64)1 << _UFFDIO_WRITEPROTECT | \ (__u64)1 << _UFFDIO_CONTINUE | \ - (__u64)1 << _UFFDIO_POISON) + (__u64)1 << _UFFDIO_POISON | \ + (__u64)1 << _UFFDIO_RWPROTECT) #define UFFD_API_RANGE_IOCTLS_BASIC \ ((__u64)1 << _UFFDIO_WAKE | \ (__u64)1 << _UFFDIO_COPY | \ (__u64)1 << _UFFDIO_WRITEPROTECT | \ (__u64)1 << _UFFDIO_CONTINUE | \ - (__u64)1 << _UFFDIO_POISON) + (__u64)1 << _UFFDIO_POISON | \ + (__u64)1 << _UFFDIO_RWPROTECT) /* * Valid ioctl command number range with this API is from 0x00 to diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 
189192ea45cf..76ca0fbaa802 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -2264,6 +2264,11 @@ static inline bool can_change_pmd_writable(struct vm_area_struct *vma, return pmd_dirty(pmd); } +vm_fault_t do_huge_pmd_uffd_rwp(struct vm_fault *vmf) +{ + return handle_userfault(vmf, VM_UFFD_RWP); +} + /* NUMA hinting page fault entry point for trans huge pmds */ vm_fault_t do_huge_pmd_numa_page(struct vm_fault *vmf) { diff --git a/mm/hugetlb.c b/mm/hugetlb.c index eee32a325481..9fc31cbcba4b 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -6067,6 +6067,17 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, goto out_mutex; } + /* + * Protnone hugetlb PTEs with the uffd bit are used by + * userfaultfd RWP for access tracking. Plain PROT_NONE (without the + * marker) is not an RWP fault and is not expected on hugetlb (no + * NUMA hinting), so let normal hugetlb fault handling proceed. + */ + if (pte_protnone(vmf.orig_pte) && vma_is_accessible(vma) && + userfaultfd_rwp(vma) && huge_pte_uffd(vmf.orig_pte)) { + return hugetlb_handle_userfault(&vmf, mapping, VM_UFFD_RWP); + } + /* * If we are going to COW/unshare the mapping later, we examine the * pending reservations for this page now. 
This will ensure that any diff --git a/mm/memory.c b/mm/memory.c index ea9616e3dbaf..e0dcf2c28d9d 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -6172,6 +6172,12 @@ static void numa_rebuild_large_mapping(struct vm_fault *vmf, struct vm_area_stru } } +static vm_fault_t do_uffd_rwp(struct vm_fault *vmf) +{ + pte_unmap(vmf->pte); + return handle_userfault(vmf, VM_UFFD_RWP); +} + static vm_fault_t do_numa_page(struct vm_fault *vmf) { struct vm_area_struct *vma = vmf->vma; @@ -6446,8 +6452,16 @@ static vm_fault_t handle_pte_fault(struct vm_fault *vmf) if (!pte_present(vmf->orig_pte)) return do_swap_page(vmf); - if (pte_protnone(vmf->orig_pte) && vma_is_accessible(vmf->vma)) + if (pte_protnone(vmf->orig_pte) && vma_is_accessible(vmf->vma)) { + /* + * RWP-protected PTEs are protnone plus the uffd bit. On a + * VM_UFFD_RWP VMA, a protnone PTE without the uffd bit is + * NUMA hinting and must still fall through to do_numa_page(). + */ + if (userfaultfd_pte_rwp(vmf->vma, vmf->orig_pte)) + return do_uffd_rwp(vmf); return do_numa_page(vmf); + } spin_lock(vmf->ptl); entry = vmf->orig_pte; @@ -6561,8 +6575,11 @@ static vm_fault_t __handle_mm_fault(struct vm_area_struct *vma, return 0; } if (pmd_trans_huge(vmf.orig_pmd)) { - if (pmd_protnone(vmf.orig_pmd) && vma_is_accessible(vma)) + if (pmd_protnone(vmf.orig_pmd) && vma_is_accessible(vma)) { + if (userfaultfd_huge_pmd_rwp(vma, vmf.orig_pmd)) + return do_huge_pmd_uffd_rwp(&vmf); return do_huge_pmd_numa_page(&vmf); + } if ((flags & (FAULT_FLAG_WRITE|FAULT_FLAG_UNSHARE)) && !pmd_write(vmf.orig_pmd)) { -- 2.51.2