The is_vma_writer_only() function is misnamed - this isn't determining if there is only a write lock, as it checks for the presence of the VM_REFCNT_EXCLUDE_READERS_FLAG. Really, it is checking to see whether readers are excluded, with a possibility of a false positive in the case of a detachment (there we expect the vma->vm_refcnt to eventually be set to VM_REFCNT_EXCLUDE_READERS_FLAG, whereas for an attached VMA we expect it to eventually be set to VM_REFCNT_EXCLUDE_READERS_FLAG + 1). Rename the function accordingly. Relatedly, we use a finnicky __refcount_dec_and_test() primitive directly in vma_refcount_put(), using the old value to determine what the reference count ought to be after the operation is complete (ignoring racing reference count adjustments). Wrap this into a __vma_refcount_put() function, which we can then utilise in vma_mark_detached() and thus keep the refcount primitive usage abstracted. Also adjust comments, removing duplicative comments covered elsewhere and adding more to aid understanding. No functional change intended. Signed-off-by: Lorenzo Stoakes --- include/linux/mmap_lock.h | 62 +++++++++++++++++++++++++++++++-------- mm/mmap_lock.c | 18 +++++------- 2 files changed, 57 insertions(+), 23 deletions(-) diff --git a/include/linux/mmap_lock.h b/include/linux/mmap_lock.h index a764439d0276..0b3614aadbb4 100644 --- a/include/linux/mmap_lock.h +++ b/include/linux/mmap_lock.h @@ -122,15 +122,27 @@ static inline void vma_lock_init(struct vm_area_struct *vma, bool reset_refcnt) vma->vm_lock_seq = UINT_MAX; } -static inline bool is_vma_writer_only(int refcnt) +/** + * are_readers_excluded() - Determine whether @refcnt describes a VMA which has + * excluded all VMA read locks. + * @refcnt: The VMA reference count obtained from vm_area_struct->vm_refcnt. + * + * We may be raced by other readers temporarily incrementing the reference + * count, though the race window is very small, this might cause spurious + * wakeups. + * + * In the case of a detached VMA, we may incorrectly indicate that readers are + * excluded when one remains, because in that scenario we target a refcount of + * VM_REFCNT_EXCLUDE_READERS_FLAG, rather than the attached target of + * VM_REFCNT_EXCLUDE_READERS_FLAG + 1. + * + * However, the race window for that is very small so it is unlikely. + * + * Returns: true if readers are excluded, false otherwise. + */ +static inline bool are_readers_excluded(int refcnt) { /* - * With a writer and no readers, refcnt is VM_REFCNT_EXCLUDE_READERS_FLAG - * if the vma is detached and (VM_REFCNT_EXCLUDE_READERS_FLAG + 1) if it is - * attached. Waiting on a detached vma happens only in - * vma_mark_detached() and is a rare case, therefore most of the time - * there will be no unnecessary wakeup. - * * See the comment describing the vm_area_struct->vm_refcnt field for * details of possible refcnt values. */ @@ -138,18 +150,42 @@ static inline bool is_vma_writer_only(int refcnt) refcnt <= VM_REFCNT_EXCLUDE_READERS_FLAG + 1; } +static inline bool __vma_refcount_put(struct vm_area_struct *vma, int *refcnt) +{ + int oldcnt; + bool detached; + + detached = __refcount_dec_and_test(&vma->vm_refcnt, &oldcnt); + if (refcnt) + *refcnt = oldcnt - 1; + return detached; +} + +/** + * vma_refcount_put() - Drop reference count in VMA vm_refcnt field due to a + * read-lock being dropped. + * @vma: The VMA whose reference count we wish to decrement. + * + * If we were the last reader, wake up threads waiting to obtain an exclusive + * lock. + */ static inline void vma_refcount_put(struct vm_area_struct *vma) { - /* Use a copy of vm_mm in case vma is freed after we drop vm_refcnt */ + /* Use a copy of vm_mm in case vma is freed after we drop vm_refcnt. */ struct mm_struct *mm = vma->vm_mm; - int oldcnt; + int refcnt; + bool detached; rwsem_release(&vma->vmlock_dep_map, _RET_IP_); - if (!__refcount_dec_and_test(&vma->vm_refcnt, &oldcnt)) { - if (is_vma_writer_only(oldcnt - 1)) - rcuwait_wake_up(&mm->vma_writer_wait); - } + detached = __vma_refcount_put(vma, &refcnt); + /* + * __vma_enter_locked() may be sleeping waiting for readers to drop + * their reference count, so wake it up if we were the last reader + * blocking it from being acquired. + */ + if (!detached && are_readers_excluded(refcnt)) + rcuwait_wake_up(&mm->vma_writer_wait); } /* diff --git a/mm/mmap_lock.c b/mm/mmap_lock.c index 75dc098aea14..ebacb57e5f16 100644 --- a/mm/mmap_lock.c +++ b/mm/mmap_lock.c @@ -130,25 +130,23 @@ EXPORT_SYMBOL_GPL(__vma_start_write); void vma_mark_detached(struct vm_area_struct *vma) { + bool detached; + vma_assert_write_locked(vma); vma_assert_attached(vma); /* - * We are the only writer, so no need to use vma_refcount_put(). - * The condition below is unlikely because the vma has been already - * write-locked and readers can increment vm_refcnt only temporarily - * before they check vm_lock_seq, realize the vma is locked and drop - * back the vm_refcnt. That is a narrow window for observing a raised - * vm_refcnt. - * * See the comment describing the vm_area_struct->vm_refcnt field for * details of possible refcnt values. */ - if (unlikely(!refcount_dec_and_test(&vma->vm_refcnt))) { + detached = __vma_refcount_put(vma, NULL); + if (unlikely(!detached)) { /* Wait until vma is detached with no readers. */ if (__vma_enter_locked(vma, true, TASK_UNINTERRUPTIBLE)) { - bool detached; - + /* + * Once this is complete, no readers can increment the + * reference count, and the VMA is marked detached. + */ __vma_exit_locked(vma, &detached); WARN_ON_ONCE(!detached); } -- 2.52.0