hmm_range_fault() currently triggers page faults from inside the
page-table walk callbacks: hmm_vma_walk_pmd(), hmm_vma_walk_pud(),
hmm_vma_walk_hugetlb_entry() and the pte-level helper all call
hmm_vma_fault(), which in turn calls handle_mm_fault() while the walker
still holds nested locks. The pte spinlock is dropped explicitly by each
caller, and the hugetlb path manually drops and retakes
hugetlb_vma_lock_read around the fault to dodge a deadlock against the
walk framework's unconditional unlock.

This layering does not extend cleanly to fault handlers that may release
mmap_lock (VM_FAULT_RETRY, VM_FAULT_COMPLETED). If the lock is dropped
while walk_page_range() is mid-traversal, the VMA can be freed before
the walk framework's matching hugetlb_vma_unlock_read(), turning that
unlock into a use-after-free.

Split the responsibilities the way get_user_pages() does. Walk callbacks
become inspect-only: when they detect a range that needs to be faulted
in, they record it in struct hmm_vma_walk and return a private sentinel
(HMM_FAULT_PENDING). The outer loop in hmm_range_fault() then drops out
of walk_page_range(), invokes a new helper hmm_do_fault() that calls
handle_mm_fault() with only mmap_lock held, and restarts the walk so the
now-present entries are collected into hmm_pfns.

No functional change for existing callers. As a side effect the hugetlb
callback no longer needs the hugetlb_vma_{un}lock_read dance, and every
fault-path exit from the callbacks now releases the pte spinlock on a
single, common path.

This refactor is also a precursor for adding an unlockable variant of
hmm_range_fault() in a follow-up patch.

Signed-off-by: Stanislav Kinsburskii
---
 mm/hmm.c | 118 +++++++++++++++++++++++++++++++++++++++-----------------------
 1 file changed, 75 insertions(+), 43 deletions(-)

diff --git a/mm/hmm.c b/mm/hmm.c
index 5955f2f0c83db..2b157fcbc2928 100644
--- a/mm/hmm.c
+++ b/mm/hmm.c
@@ -33,8 +33,17 @@
 struct hmm_vma_walk {
 	struct hmm_range	*range;
 	unsigned long		last;
+	unsigned long		end;
+	unsigned int		required_fault;
 };
 
+/*
+ * Internal sentinel returned by walk callbacks when they need a page fault.
+ * The callback stores end/required_fault in hmm_vma_walk; the outer loop
+ * consumes the sentinel and never propagates it to the caller.
+ */
+#define HMM_FAULT_PENDING	-EAGAIN
+
 enum {
 	HMM_NEED_FAULT = 1 << 0,
 	HMM_NEED_WRITE_FAULT = 1 << 1,
@@ -60,37 +69,25 @@ static int hmm_pfns_fill(unsigned long addr, unsigned long end,
 }
 
 /*
- * hmm_vma_fault() - fault in a range lacking valid pmd or pte(s)
- * @addr: range virtual start address (inclusive)
- * @end: range virtual end address (exclusive)
- * @required_fault: HMM_NEED_* flags
- * @walk: mm_walk structure
- * Return: -EBUSY after page fault, or page fault error
+ * hmm_record_fault() - record a range that needs to be faulted in
  *
- * This function will be called whenever pmd_none() or pte_none() returns true,
- * or whenever there is no page directory covering the virtual address range.
+ * Called by the walk callbacks when they discover that part of the range
+ * needs a page fault. The callback records what to fault and returns
+ * HMM_FAULT_PENDING; the outer loop in hmm_range_fault() drops back out of
+ * walk_page_range() and invokes handle_mm_fault() from a context where no
+ * page-table or hugetlb_vma_lock is held.
  */
-static int hmm_vma_fault(unsigned long addr, unsigned long end,
-			 unsigned int required_fault, struct mm_walk *walk)
+static int hmm_record_fault(unsigned long addr, unsigned long end,
+			    unsigned int required_fault,
+			    struct mm_walk *walk)
 {
 	struct hmm_vma_walk *hmm_vma_walk = walk->private;
-	struct vm_area_struct *vma = walk->vma;
-	unsigned int fault_flags = FAULT_FLAG_REMOTE;
 
 	WARN_ON_ONCE(!required_fault);
 	hmm_vma_walk->last = addr;
-
-	if (required_fault & HMM_NEED_WRITE_FAULT) {
-		if (!(vma->vm_flags & VM_WRITE))
-			return -EPERM;
-		fault_flags |= FAULT_FLAG_WRITE;
-	}
-
-	for (; addr < end; addr += PAGE_SIZE)
-		if (handle_mm_fault(vma, addr, fault_flags, NULL) &
-		    VM_FAULT_ERROR)
-			return -EFAULT;
-	return -EBUSY;
+	hmm_vma_walk->end = end;
+	hmm_vma_walk->required_fault = required_fault;
+	return HMM_FAULT_PENDING;
 }
 
 static unsigned int hmm_pte_need_fault(const struct hmm_vma_walk *hmm_vma_walk,
@@ -174,7 +171,7 @@ static int hmm_vma_walk_hole(unsigned long addr, unsigned long end,
 		return hmm_pfns_fill(addr, end, range, HMM_PFN_ERROR);
 	}
 	if (required_fault)
-		return hmm_vma_fault(addr, end, required_fault, walk);
+		return hmm_record_fault(addr, end, required_fault, walk);
 	return hmm_pfns_fill(addr, end, range, 0);
 }
 
@@ -209,7 +206,7 @@ static int hmm_vma_handle_pmd(struct mm_walk *walk, unsigned long addr,
 	required_fault = hmm_range_need_fault(hmm_vma_walk, hmm_pfns,
					      npages, cpu_flags);
 	if (required_fault)
-		return hmm_vma_fault(addr, end, required_fault, walk);
+		return hmm_record_fault(addr, end, required_fault, walk);
 
 	pfn = pmd_pfn(pmd) + ((addr & ~PMD_MASK) >> PAGE_SHIFT);
 	for (i = 0; addr < end; addr += PAGE_SIZE, i++, pfn++) {
@@ -328,7 +325,7 @@ static int hmm_vma_handle_pte(struct mm_walk *walk, unsigned long addr,
 fault:
 	pte_unmap(ptep);
 	/* Fault any virtual address we were asked to fault */
-	return hmm_vma_fault(addr, end, required_fault, walk);
+	return hmm_record_fault(addr, end, required_fault, walk);
 }
 
 #ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION
@@ -371,7 +368,7 @@ static int hmm_vma_handle_absent_pmd(struct mm_walk *walk, unsigned long start,
						 npages, 0);
 	if (required_fault) {
 		if (softleaf_is_device_private(entry))
-			return hmm_vma_fault(addr, end, required_fault, walk);
+			return hmm_record_fault(addr, end, required_fault, walk);
 		else
 			return -EFAULT;
 	}
@@ -517,7 +514,7 @@ static int hmm_vma_walk_pud(pud_t *pudp, unsigned long start, unsigned long end,
						      npages, cpu_flags);
 	if (required_fault) {
 		spin_unlock(ptl);
-		return hmm_vma_fault(addr, end, required_fault, walk);
+		return hmm_record_fault(addr, end, required_fault, walk);
 	}
 
 	pfn = pud_pfn(pud) + ((addr & ~PUD_MASK) >> PAGE_SHIFT);
@@ -564,21 +561,8 @@ static int hmm_vma_walk_hugetlb_entry(pte_t *pte, unsigned long hmask,
 	required_fault = hmm_pte_need_fault(hmm_vma_walk, pfn_req_flags,
					    cpu_flags);
 	if (required_fault) {
-		int ret;
-
 		spin_unlock(ptl);
-		hugetlb_vma_unlock_read(vma);
-		/*
-		 * Avoid deadlock: drop the vma lock before calling
-		 * hmm_vma_fault(), which will itself potentially take and
-		 * drop the vma lock. This is also correct from a
-		 * protection point of view, because there is no further
-		 * use here of either pte or ptl after dropping the vma
-		 * lock.
-		 */
-		ret = hmm_vma_fault(addr, end, required_fault, walk);
-		hugetlb_vma_lock_read(vma);
-		return ret;
+		return hmm_record_fault(addr, end, required_fault, walk);
 	}
 
 	pfn = pte_pfn(entry) + ((start & ~hmask) >> PAGE_SHIFT);
@@ -637,6 +621,44 @@ static const struct mm_walk_ops hmm_walk_ops = {
 	.walk_lock	= PGWALK_RDLOCK,
 };
 
+/*
+ * hmm_do_fault - fault in a range recorded by a walk callback
+ *
+ * Called from the outer loop in hmm_range_fault() after a callback
+ * returned HMM_FAULT_PENDING. At this point we hold only mmap_lock;
+ * the page-table spinlock and any hugetlb_vma_lock acquired by the walk
+ * framework have already been released by the unwind.
+ *
+ * Returns -EBUSY on success (all pages faulted, caller should re-walk).
+ * Returns a negative errno on failure.
+ */
+static int hmm_do_fault(struct mm_struct *mm,
+			struct hmm_vma_walk *hmm_vma_walk)
+{
+	unsigned long addr = hmm_vma_walk->last;
+	unsigned long end = hmm_vma_walk->end;
+	unsigned int required_fault = hmm_vma_walk->required_fault;
+	unsigned int fault_flags = FAULT_FLAG_REMOTE;
+	struct vm_area_struct *vma;
+
+	vma = vma_lookup(mm, addr);
+	if (!vma)
+		return -EFAULT;
+
+	if (required_fault & HMM_NEED_WRITE_FAULT) {
+		if (!(vma->vm_flags & VM_WRITE))
+			return -EPERM;
+		fault_flags |= FAULT_FLAG_WRITE;
+	}
+
+	for (; addr < end; addr += PAGE_SIZE)
+		if (handle_mm_fault(vma, addr, fault_flags, NULL) &
+		    VM_FAULT_ERROR)
+			return -EFAULT;
+
+	return -EBUSY;
+}
+
 /**
  * hmm_range_fault - try to fault some address in a virtual address range
  * @range: argument structure
@@ -674,6 +696,16 @@ int hmm_range_fault(struct hmm_range *range)
 			return -EBUSY;
 		ret = walk_page_range(mm, hmm_vma_walk.last, range->end,
				      &hmm_walk_ops, &hmm_vma_walk);
+		/*
+		 * When HMM_FAULT_PENDING is returned a walk callback
+		 * recorded a range that needs handle_mm_fault();
+		 * hmm_do_fault() runs the fault outside walk_page_range()
+		 * (so no page-table or hugetlb_vma_lock is held) and
+		 * returns -EBUSY so the loop re-walks and picks up the
+		 * now-present entries.
+		 */
+		if (ret == HMM_FAULT_PENDING)
+			ret = hmm_do_fault(mm, &hmm_vma_walk);
 		/*
 		 * When -EBUSY is returned the loop restarts with
 		 * hmm_vma_walk.last set to an address that has not been stored

hmm_range_fault() holds the mmap read lock for the duration of the call.
This is incompatible with mappings whose fault handler may release the
mmap lock - notably userfaultfd-managed regions, where handle_mm_fault()
returns VM_FAULT_RETRY or VM_FAULT_COMPLETED after dropping the lock.
Drivers that need to populate device page tables for such mappings have
no way to do so today.

Add hmm_range_fault_unlockable(), modelled on the int *locked pattern
from get_user_pages_remote() in mm/gup.c. Callers set *locked = 1 and
pass &locked; the function may set *locked = 0 to report that
handle_mm_fault() dropped the mmap lock during a page fault, in which
case the caller must reacquire it and restart the walk with a fresh
mmu_interval_read_begin() sequence.

The implementation is local to hmm_do_fault() and the outer loop in
hmm_range_fault_unlockable(). hmm_do_fault() conditionally sets
FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE when locked is non-NULL and
translates VM_FAULT_RETRY / VM_FAULT_COMPLETED into *locked = 0 plus a
private return code consumed by the outer loop, which in turn returns 0
(or -EINTR on fatal signal) to the caller.

The previous refactor that moved page fault handling out of the
page-table walk callbacks is what makes this change small.
Faults now run after walk_page_range() has unwound, with only the mmap
lock held, so dropping it does not interact with the walker's pte
spinlock or hugetlb_vma_lock. Hugetlb regions therefore participate in
the unlockable path uniformly with PTE- and PMD-level mappings; no
special case is required.

hmm_range_fault() becomes a thin wrapper, preserving exact behaviour for
all existing callers. No EXPORT_SYMBOL behaviour change for
hmm_range_fault.

Documentation/mm/hmm.rst is updated with a description of the new API
and the recommended caller pattern.

Signed-off-by: Stanislav Kinsburskii
---
 Documentation/mm/hmm.rst | 62 +++++++++++++++++++++++++++++++++++++
 include/linux/hmm.h      |  1 +
 mm/hmm.c                 | 77 +++++++++++++++++++++++++++++++++++++++++++---
 3 files changed, 135 insertions(+), 5 deletions(-)

diff --git a/Documentation/mm/hmm.rst b/Documentation/mm/hmm.rst
index 7d61b7a8b65b7..a9309023ec232 100644
--- a/Documentation/mm/hmm.rst
+++ b/Documentation/mm/hmm.rst
@@ -208,6 +208,68 @@ invalidate() callback. That lock must be held before calling
 mmu_interval_read_retry() to avoid any race with a concurrent CPU page table
 update.
 
+Dropping the mmap lock during page faults
+=========================================
+
+Some VMAs have fault handlers that need to release the mmap lock while
+servicing a fault (for example, regions managed by ``userfaultfd``).
+``hmm_range_fault()`` cannot be used on such mappings because it must hold the
+mmap lock for the duration of the call. Drivers that need to support them
+should call::
+
+  int hmm_range_fault_unlockable(struct hmm_range *range, int *locked);
+
+The caller sets ``*locked = 1`` and holds ``mmap_read_lock`` before the call.
+If the mmap lock is dropped inside ``handle_mm_fault()``, the function sets
+``*locked = 0`` and returns ``0``; the caller is responsible for reacquiring
+the lock and restarting the walk from ``range->start`` with a fresh notifier
+sequence. When ``locked`` is ``NULL`` the function keeps the lock held for the
+duration of the call, identical to ``hmm_range_fault()``.
+
+A typical caller looks like this::
+
+ int driver_populate_range_unlockable(...)
+ {
+      struct hmm_range range;
+      int locked;
+      ...
+
+      range.notifier = &interval_sub;
+      range.start = ...;
+      range.end = ...;
+      range.hmm_pfns = ...;
+
+      if (!mmget_not_zero(interval_sub.mm))
+          return -EFAULT;
+
+ again:
+      range.notifier_seq = mmu_interval_read_begin(&interval_sub);
+      locked = 1;
+      mmap_read_lock(mm);
+      ret = hmm_range_fault_unlockable(&range, &locked);
+      if (locked)
+          mmap_read_unlock(mm);
+      if (ret) {
+          if (ret == -EBUSY)
+              goto again;
+          return ret;
+      }
+      if (!locked)
+          goto again;
+
+      take_lock(driver->update);
+      if (mmu_interval_read_retry(&interval_sub, range.notifier_seq)) {
+          release_lock(driver->update);
+          goto again;
+      }
+
+      /* Use pfns array content to update device page table,
+       * under the update lock */
+
+      release_lock(driver->update);
+      return 0;
+ }
+
 Leverage default_flags and pfn_flags_mask
 =========================================
diff --git a/include/linux/hmm.h b/include/linux/hmm.h
index db75ffc949a7a..46e581865c48a 100644
--- a/include/linux/hmm.h
+++ b/include/linux/hmm.h
@@ -123,6 +123,7 @@ struct hmm_range {
  * Please see Documentation/mm/hmm.rst for how to use the range API.
  */
 int hmm_range_fault(struct hmm_range *range);
+int hmm_range_fault_unlockable(struct hmm_range *range, int *locked);
 
 /*
  * HMM_RANGE_DEFAULT_TIMEOUT - default timeout (ms) when waiting for a range
diff --git a/mm/hmm.c b/mm/hmm.c
index 2b157fcbc2928..be13894e67bb8 100644
--- a/mm/hmm.c
+++ b/mm/hmm.c
@@ -32,6 +32,7 @@
 struct hmm_vma_walk {
 	struct hmm_range	*range;
+	int			*locked;
 	unsigned long		last;
 	unsigned long		end;
 	unsigned int		required_fault;
 };
@@ -44,6 +45,13 @@ struct hmm_vma_walk {
  */
 #define HMM_FAULT_PENDING	-EAGAIN
 
+/*
+ * Internal sentinel returned by hmm_do_fault() when handle_mm_fault() drops
+ * the mmap lock during a page fault. hmm_do_fault() sets *locked = 0; the
+ * outer loop consumes the sentinel and never propagates it to the caller.
+ */
+#define HMM_FAULT_UNLOCKED	-ENOLCK
+
 enum {
 	HMM_NEED_FAULT = 1 << 0,
 	HMM_NEED_WRITE_FAULT = 1 << 1,
@@ -639,6 +647,7 @@ static int hmm_do_fault(struct mm_struct *mm,
 	unsigned long end = hmm_vma_walk->end;
 	unsigned int required_fault = hmm_vma_walk->required_fault;
 	unsigned int fault_flags = FAULT_FLAG_REMOTE;
+	int *locked = hmm_vma_walk->locked;
 	struct vm_area_struct *vma;
 
 	vma = vma_lookup(mm, addr);
@@ -651,10 +660,20 @@ static int hmm_do_fault(struct mm_struct *mm,
 		fault_flags |= FAULT_FLAG_WRITE;
 	}
 
-	for (; addr < end; addr += PAGE_SIZE)
-		if (handle_mm_fault(vma, addr, fault_flags, NULL) &
-		    VM_FAULT_ERROR)
+	if (locked)
+		fault_flags |= FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE;
+
+	for (; addr < end; addr += PAGE_SIZE) {
+		vm_fault_t ret;
+
+		ret = handle_mm_fault(vma, addr, fault_flags, NULL);
+		if (ret & (VM_FAULT_RETRY | VM_FAULT_COMPLETED)) {
+			*locked = 0;
+			return HMM_FAULT_UNLOCKED;
+		}
+		if (ret & VM_FAULT_ERROR)
 			return -EFAULT;
+	}
 
 	return -EBUSY;
 }
@@ -677,11 +696,53 @@ static int hmm_do_fault(struct mm_struct *mm,
  *
  * This is similar to get_user_pages(), except that it can read the page tables
  * without mutating them (ie causing faults).
+ *
+ * The mmap lock must be held by the caller and will remain held on return.
+ * For a variant that allows the mmap lock to be dropped during faults (e.g.,
+ * for userfaultfd support), see hmm_range_fault_unlockable().
  */
 int hmm_range_fault(struct hmm_range *range)
+{
+	return hmm_range_fault_unlockable(range, NULL);
+}
+EXPORT_SYMBOL(hmm_range_fault);
+
+/**
+ * hmm_range_fault_unlockable - fault in a range, possibly dropping the mmap lock
+ * @range: argument structure
+ * @locked: pointer to caller's lock state, or %NULL
+ *
+ * Behaves like hmm_range_fault(), but allows handle_mm_fault() to drop the
+ * mmap read lock during a fault. This makes the function usable on mappings
+ * whose fault path may release the lock (for example, userfaultfd-managed
+ * regions).
+ *
+ * If @locked is %NULL the mmap lock is never released and the function
+ * behaves exactly like hmm_range_fault().
+ *
+ * If @locked is non-%NULL the caller must hold mmap_read_lock and set
+ * *@locked = 1 before the call. On return:
+ *
+ * *@locked == 1: the mmap lock is still held. The return value has the
+ *                same meaning as hmm_range_fault() (0 on success, or one
+ *                of the error codes documented there).
+ *
+ * *@locked == 0: the mmap lock was dropped during a page fault. No PFNs
+ *                collected so far are guaranteed to be valid because the
+ *                address space may have changed under us. The return
+ *                value is either 0 (caller must reacquire the lock and
+ *                restart with a fresh mmu_interval_read_begin()) or
+ *                -EINTR (a fatal signal is pending; abort).
+ *
+ * The caller is responsible for reacquiring mmap_read_lock and restarting
+ * the operation from range->start. See Documentation/mm/hmm.rst for the
+ * full usage pattern.
+ */
+int hmm_range_fault_unlockable(struct hmm_range *range, int *locked)
 {
 	struct hmm_vma_walk hmm_vma_walk = {
 		.range = range,
+		.locked = locked,
 		.last = range->start,
 	};
 	struct mm_struct *mm = range->notifier->mm;
@@ -704,8 +765,14 @@ int hmm_range_fault(struct hmm_range *range)
 		 * returns -EBUSY so the loop re-walks and picks up the
 		 * now-present entries.
 		 */
-		if (ret == HMM_FAULT_PENDING)
+		if (ret == HMM_FAULT_PENDING) {
 			ret = hmm_do_fault(mm, &hmm_vma_walk);
+			if (ret == HMM_FAULT_UNLOCKED) {
+				if (fatal_signal_pending(current))
+					return -EINTR;
+				return 0; /* caller must restart */
+			}
+		}
 		/*
 		 * When -EBUSY is returned the loop restarts with
 		 * hmm_vma_walk.last set to an address that has not been stored
@@ -715,7 +782,7 @@ int hmm_range_fault(struct hmm_range *range)
 	} while (ret == -EBUSY);
 	return ret;
 }
-EXPORT_SYMBOL(hmm_range_fault);
+EXPORT_SYMBOL(hmm_range_fault_unlockable);
 
 /**
  * hmm_dma_map_alloc - Allocate HMM map structure

Add a selftest that exercises hmm_range_fault_unlockable() with a
userfaultfd-backed mapping. The test:

 1. Creates an anonymous mmap region
 2. Registers it with userfaultfd (UFFDIO_REGISTER_MODE_MISSING)
 3. Spawns a handler thread that responds to page faults by filling
    pages with a known pattern (0xAB) via UFFDIO_COPY
 4. Issues HMM_DMIRROR_READ_UNLOCKABLE to the test_hmm driver, which
    calls hmm_range_fault_unlockable() internally
 5. Verifies the device read back the data provided by the userfaultfd
    handler

This requires changes to the test_hmm kernel module:

 - New dmirror_range_fault_unlockable() that uses the new HMM API
 - New dmirror_fault_unlockable() and dmirror_read_unlockable() wrappers
 - New HMM_DMIRROR_READ_UNLOCKABLE ioctl (0x09)

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
---
 lib/test_hmm.c                         | 122 ++++++++++++++++++++++++++++++
 lib/test_hmm_uapi.h                    |   1 +
 tools/testing/selftests/mm/hmm-tests.c | 132 ++++++++++++++++++++++++++++++++
 3 files changed, 255 insertions(+)

diff --git a/lib/test_hmm.c b/lib/test_hmm.c
index 0964d53365e61..20b14e279a8bd 100644
--- a/lib/test_hmm.c
+++ b/lib/test_hmm.c
@@ -327,6 +327,84 @@ static int dmirror_range_fault(struct dmirror *dmirror, struct hmm_range *range)
 	return ret;
 }
 
+static int dmirror_range_fault_unlockable(struct dmirror *dmirror,
+					  struct hmm_range *range)
+{
+	struct mm_struct *mm = dmirror->notifier.mm;
+	unsigned long timeout =
+		jiffies + msecs_to_jiffies(HMM_RANGE_DEFAULT_TIMEOUT);
+	int locked;
+	int ret;
+
+	while (true) {
+		if (time_after(jiffies, timeout)) {
+			ret = -EBUSY;
+			goto out;
+		}
+
+		range->notifier_seq = mmu_interval_read_begin(range->notifier);
+		locked = 1;
+		mmap_read_lock(mm);
+		ret = hmm_range_fault_unlockable(range, &locked);
+		if (locked)
+			mmap_read_unlock(mm);
+		if (ret) {
+			if (ret == -EBUSY)
+				continue;
+			goto out;
+		}
+		if (!locked)
+			continue;
+
+		mutex_lock(&dmirror->mutex);
+		if (mmu_interval_read_retry(range->notifier,
+					    range->notifier_seq)) {
+			mutex_unlock(&dmirror->mutex);
+			continue;
+		}
+		break;
+	}
+
+	ret = dmirror_do_fault(dmirror, range);
+
+	mutex_unlock(&dmirror->mutex);
+out:
+	return ret;
+}
+
+static int dmirror_fault_unlockable(struct dmirror *dmirror,
+				    unsigned long start,
+				    unsigned long end, bool write)
+{
+	struct mm_struct *mm = dmirror->notifier.mm;
+	unsigned long addr;
+	unsigned long pfns[32];
+	struct hmm_range range = {
+		.notifier = &dmirror->notifier,
+		.hmm_pfns = pfns,
+		.pfn_flags_mask = 0,
+		.default_flags =
+			HMM_PFN_REQ_FAULT | (write ? HMM_PFN_REQ_WRITE : 0),
+		.dev_private_owner = dmirror->mdevice,
+	};
+	int ret = 0;
+
+	if (!mmget_not_zero(mm))
+		return 0;
+
+	for (addr = start; addr < end; addr = range.end) {
+		range.start = addr;
+		range.end = min(addr + (ARRAY_SIZE(pfns) << PAGE_SHIFT), end);
+
+		ret = dmirror_range_fault_unlockable(dmirror, &range);
+		if (ret)
+			break;
+	}
+
+	mmput(mm);
+	return ret;
+}
+
 static int dmirror_fault(struct dmirror *dmirror, unsigned long start,
 			 unsigned long end, bool write)
 {
@@ -426,6 +504,47 @@ static int dmirror_read(struct dmirror *dmirror, struct hmm_dmirror_cmd *cmd)
 	return ret;
 }
 
+static int dmirror_read_unlockable(struct dmirror *dmirror,
+				   struct hmm_dmirror_cmd *cmd)
+{
+	struct dmirror_bounce bounce;
+	unsigned long start, end;
+	unsigned long size = cmd->npages << PAGE_SHIFT;
+	int ret;
+
+	start = cmd->addr;
+	end = start + size;
+	if (end < start)
+		return -EINVAL;
+
+	ret = dmirror_bounce_init(&bounce, start, size);
+	if (ret)
+		return ret;
+
+	while (1) {
+		mutex_lock(&dmirror->mutex);
+		ret = dmirror_do_read(dmirror, start, end, &bounce);
+		mutex_unlock(&dmirror->mutex);
+		if (ret != -ENOENT)
+			break;
+
+		start = cmd->addr + (bounce.cpages << PAGE_SHIFT);
+		ret = dmirror_fault_unlockable(dmirror, start, end, false);
+		if (ret)
+			break;
+		cmd->faults++;
+	}
+
+	if (ret == 0) {
+		if (copy_to_user(u64_to_user_ptr(cmd->ptr), bounce.ptr,
+				 bounce.size))
+			ret = -EFAULT;
+	}
+	cmd->cpages = bounce.cpages;
+	dmirror_bounce_fini(&bounce);
+	return ret;
+}
+
 static int dmirror_do_write(struct dmirror *dmirror, unsigned long start,
 			    unsigned long end, struct dmirror_bounce *bounce)
 {
@@ -1537,6 +1656,9 @@ static long dmirror_fops_unlocked_ioctl(struct file *filp,
 		dmirror->flags = cmd.npages;
 		ret = 0;
 		break;
+	case HMM_DMIRROR_READ_UNLOCKABLE:
+		ret = dmirror_read_unlockable(dmirror, &cmd);
+		break;
 
 	default:
 		return -EINVAL;
diff --git a/lib/test_hmm_uapi.h b/lib/test_hmm_uapi.h
index f94c6d4573382..076df6df92275 100644
--- a/lib/test_hmm_uapi.h
+++ b/lib/test_hmm_uapi.h
@@ -38,6 +38,7 @@ struct hmm_dmirror_cmd {
 #define HMM_DMIRROR_CHECK_EXCLUSIVE	_IOWR('H', 0x06, struct hmm_dmirror_cmd)
 #define HMM_DMIRROR_RELEASE		_IOWR('H', 0x07, struct hmm_dmirror_cmd)
 #define HMM_DMIRROR_FLAGS		_IOWR('H', 0x08, struct hmm_dmirror_cmd)
+#define HMM_DMIRROR_READ_UNLOCKABLE	_IOWR('H', 0x09, struct hmm_dmirror_cmd)
 
 #define HMM_DMIRROR_FLAG_FAIL_ALLOC	(1ULL << 0)
 
diff --git a/tools/testing/selftests/mm/hmm-tests.c b/tools/testing/selftests/mm/hmm-tests.c
index e8328c89d855e..12e988b96c158 100644
--- a/tools/testing/selftests/mm/hmm-tests.c
+++ b/tools/testing/selftests/mm/hmm-tests.c
@@ -26,6 +26,9 @@
 #include
 #include
 #include
+#include
+#include
+#include
 
 /*
@@ -2852,4 +2855,133 @@ TEST_F_TIMEOUT(hmm, benchmark_thp_migration, 120)
 					  &thp_results, &regular_results);
 	}
 }
+/*
+ * Test that HMM can fault in pages backed by userfaultfd using the
+ * hmm_range_fault_unlockable() path. This exercises the lock-drop retry
+ * logic in the HMM framework.
+ */
+struct uffd_thread_args {
+	int uffd;
+	void *page_buffer;
+	unsigned long page_size;
+};
+
+static void *uffd_handler_thread(void *arg)
+{
+	struct uffd_thread_args *args = arg;
+	struct uffd_msg msg;
+	struct uffdio_copy copy;
+	struct pollfd pollfd;
+	int ret;
+
+	pollfd.fd = args->uffd;
+	pollfd.events = POLLIN;
+
+	while (1) {
+		ret = poll(&pollfd, 1, 5000);
+		if (ret <= 0)
+			break;
+
+		ret = read(args->uffd, &msg, sizeof(msg));
+		if (ret != sizeof(msg))
+			break;
+
+		if (msg.event != UFFD_EVENT_PAGEFAULT)
+			break;
+
+		/* Fill the page with a known pattern */
+		memset(args->page_buffer, 0xAB, args->page_size);
+
+		copy.dst = msg.arg.pagefault.address & ~(args->page_size - 1);
+		copy.src = (unsigned long)args->page_buffer;
+		copy.len = args->page_size;
+		copy.mode = 0;
+		copy.copy = 0;
+
+		ret = ioctl(args->uffd, UFFDIO_COPY, &copy);
+		if (ret < 0)
+			break;
+	}
+
+	return NULL;
+}
+
+TEST_F(hmm, userfaultfd_read)
+{
+	struct hmm_buffer *buffer;
+	struct uffd_thread_args uffd_args;
+	unsigned long npages;
+	unsigned long size;
+	unsigned long i;
+	unsigned char *ptr;
+	pthread_t thread;
+	int uffd;
+	int ret;
+	struct uffdio_api api;
+	struct uffdio_register reg;
+
+	npages = 4;
+	size = npages << self->page_shift;
+
+	/* Create userfaultfd */
+	uffd = syscall(__NR_userfaultfd, O_CLOEXEC | O_NONBLOCK);
+	if (uffd < 0)
+		SKIP(return, "userfaultfd not available");
+
+	api.api = UFFD_API;
+	api.features = 0;
+	ret = ioctl(uffd, UFFDIO_API, &api);
+	ASSERT_EQ(ret, 0);
+
+	buffer = malloc(sizeof(*buffer));
+	ASSERT_NE(buffer, NULL);
+
+	buffer->fd = -1;
+	buffer->size = size;
+	buffer->mirror = malloc(size);
+	ASSERT_NE(buffer->mirror, NULL);
+
+	/* Create anonymous mapping */
+	buffer->ptr = mmap(NULL, size,
+			   PROT_READ | PROT_WRITE,
+			   MAP_PRIVATE | MAP_ANONYMOUS,
+			   -1, 0);
+	ASSERT_NE(buffer->ptr, MAP_FAILED);
+
+	/* Register the region with userfaultfd */
+	reg.range.start = (unsigned long)buffer->ptr;
+	reg.range.len = size;
+	reg.mode = UFFDIO_REGISTER_MODE_MISSING;
+	ret = ioctl(uffd, UFFDIO_REGISTER, &reg);
+	ASSERT_EQ(ret, 0);
+
+	/* Set up the handler thread */
+	uffd_args.uffd = uffd;
+	uffd_args.page_buffer = malloc(self->page_size);
+	ASSERT_NE(uffd_args.page_buffer, NULL);
+	uffd_args.page_size = self->page_size;
+
+	ret = pthread_create(&thread, NULL, uffd_handler_thread, &uffd_args);
+	ASSERT_EQ(ret, 0);
+
+	/*
+	 * Use the unlockable read path which allows the mmap lock to be
+	 * dropped during the fault, enabling userfaultfd resolution.
+	 */
+	ret = hmm_dmirror_cmd(self->fd, HMM_DMIRROR_READ_UNLOCKABLE,
+			      buffer, npages);
+	ASSERT_EQ(ret, 0);
+	ASSERT_EQ(buffer->cpages, npages);
+
+	/* Verify the device read the data filled by the uffd handler */
+	ptr = buffer->mirror;
+	for (i = 0; i < size; ++i)
+		ASSERT_EQ(ptr[i], (unsigned char)0xAB);
+
+	pthread_join(thread, NULL);
+	free(uffd_args.page_buffer);
+	close(uffd);
+	hmm_buffer_free(buffer);
+}
+
 TEST_HARNESS_MAIN