It is confusing to have __vma_start_exclude_readers() return 0, 1 or an
error (the latter only when waiting for readers in TASK_KILLABLE state),
and having the return value stored in a stack variable called 'locked' is
a further source of confusion.

More generally, we do a lot of rather finicky things when acquiring the
state in which readers are excluded and when moving out of it again,
including tracking whether the VMA is detached and whether an error
occurred.

We implement logic in __vma_start_exclude_readers() that effectively
amounts to 'if one caller invokes us, do X; if another, do Y', which is
very confusing from a control flow perspective.

Introducing a shared helper state object, struct
vma_exclude_readers_state, helps us avoid this, as we can now handle the
'an error arose but we're detached' condition correctly in both callers -
a warning if we are not detaching, and treating the situation as if no
error arose if we are.

This also helps document what's going on and allows us to add some more
logical debug asserts.

Also update vma_mark_detached() to add a guard clause for the likely
'already detached' state (given we hold the mmap write lock), and add a
comment about ephemeral VMA read lock reference count increments to
clarify why we enter/exit an exclusive locked state here.

Finally, separate vma_mark_detached() into its fast-path component, made
inline, and place the slow path for excluding readers in mmap_lock.c.

No functional change intended.
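As an illustration of the resulting pattern, the detach slow path now has
roughly the following shape (a simplified sketch of the mmap_lock.c
changes below, not code in addition to them):

	struct vma_exclude_readers_state ves = {
		.vma = vma,
		.state = TASK_UNINTERRUPTIBLE,
		.detaching = true,
	};
	int err;

	/* Outcome is reported via ves.detached/ves.exclusive, not 0/1/-EINTR. */
	err = __vma_start_exclude_readers(&ves);
	if (!err && ves.exclusive)
		__vma_end_exclude_readers(&ves);

	/* Both callers can now assert directly against the shared state. */
	WARN_ON_ONCE(!ves.detached);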
Signed-off-by: Lorenzo Stoakes
---
 include/linux/mm_types.h  |  14 ++--
 include/linux/mmap_lock.h |  23 +++++-
 mm/mmap_lock.c            | 152 +++++++++++++++++++++-----------------
 3 files changed, 112 insertions(+), 77 deletions(-)

diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index 12281a1128c9..ca47a5d3d71e 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -1011,15 +1011,15 @@ struct vm_area_struct {
  * decrementing it again.
  *
  * VM_REFCNT_EXCLUDE_READERS_FLAG - Detached, pending
- * __vma_exit_locked() completion which will decrement the reference
- * count to zero. IMPORTANT - at this stage no further readers can
- * increment the reference count. It can only be reduced.
+ * __vma_end_exclude_readers() completion which will decrement the
+ * reference count to zero. IMPORTANT - at this stage no further readers
+ * can increment the reference count. It can only be reduced.
  *
  * VM_REFCNT_EXCLUDE_READERS_FLAG + 1 - A thread is either write-locking
- * an attached VMA and has yet to invoke __vma_exit_locked(), OR a
- * thread is detaching a VMA and is waiting on a single spurious reader
- * in order to decrement the reference count. IMPORTANT - as above, no
- * further readers can increment the reference count.
+ * an attached VMA and has yet to invoke __vma_end_exclude_readers(),
+ * OR a thread is detaching a VMA and is waiting on a single spurious
+ * reader in order to decrement the reference count. IMPORTANT - as
+ * above, no further readers can increment the reference count.
  *
  * > VM_REFCNT_EXCLUDE_READERS_FLAG + 1 - A thread is either
  * write-locking or detaching a VMA is waiting on readers to
diff --git a/include/linux/mmap_lock.h b/include/linux/mmap_lock.h
index d6df6aad3e24..678f90080fa6 100644
--- a/include/linux/mmap_lock.h
+++ b/include/linux/mmap_lock.h
@@ -358,7 +358,28 @@ static inline void vma_mark_attached(struct vm_area_struct *vma)
 	refcount_set_release(&vma->vm_refcnt, 1);
 }
 
-void vma_mark_detached(struct vm_area_struct *vma);
+void __vma_exclude_readers_for_detach(struct vm_area_struct *vma);
+
+static inline void vma_mark_detached(struct vm_area_struct *vma)
+{
+	vma_assert_write_locked(vma);
+	vma_assert_attached(vma);
+
+	/*
+	 * The VMA still being attached (refcnt > 0) is unlikely, because the
+	 * vma has been already write-locked and readers can increment vm_refcnt
+	 * only temporarily before they check vm_lock_seq, realize the vma is
+	 * locked and drop back the vm_refcnt. That is a narrow window for
+	 * observing a raised vm_refcnt.
+	 *
+	 * See the comment describing the vm_area_struct->vm_refcnt field for
+	 * details of possible refcnt values.
+	 */
+	if (likely(!__vma_refcount_put_return(vma)))
+		return;
+
+	__vma_exclude_readers_for_detach(vma);
+}
 
 struct vm_area_struct *lock_vma_under_rcu(struct mm_struct *mm,
 					  unsigned long address);
diff --git a/mm/mmap_lock.c b/mm/mmap_lock.c
index 72f15f606093..b523a3fe110c 100644
--- a/mm/mmap_lock.c
+++ b/mm/mmap_lock.c
@@ -46,20 +46,38 @@ EXPORT_SYMBOL(__mmap_lock_do_trace_released);
 #ifdef CONFIG_MMU
 #ifdef CONFIG_PER_VMA_LOCK
 
+/* State shared across __vma_[start, end]_exclude_readers(). */
+struct vma_exclude_readers_state {
+	/* Input parameters. */
+	struct vm_area_struct *vma;
+	int state;	/* TASK_KILLABLE or TASK_UNINTERRUPTIBLE. */
+	bool detaching;
+
+	/* Output parameters. */
+	bool detached;
+	bool exclusive;	/* Are we exclusively locked? */
+};
+
 /*
  * Now that all readers have been evicted, mark the VMA as being out of the
  * 'exclude readers' state.
- *
- * Returns true if the VMA is now detached, otherwise false.
  */
-static bool __must_check __vma_end_exclude_readers(struct vm_area_struct *vma)
+static void __vma_end_exclude_readers(struct vma_exclude_readers_state *ves)
 {
-	bool detached;
+	struct vm_area_struct *vma = ves->vma;
 
-	detached = refcount_sub_and_test(VM_REFCNT_EXCLUDE_READERS_FLAG,
-					 &vma->vm_refcnt);
+	VM_WARN_ON_ONCE(ves->detached);
+
+	ves->detached = refcount_sub_and_test(VM_REFCNT_EXCLUDE_READERS_FLAG,
+					      &vma->vm_refcnt);
 	__vma_lockdep_release_exclusive(vma);
-	return detached;
+}
+
+static unsigned int get_target_refcnt(struct vma_exclude_readers_state *ves)
+{
+	const unsigned int tgt = ves->detaching ? 0 : 1;
+
+	return tgt | VM_REFCNT_EXCLUDE_READERS_FLAG;
 }
 
 /*
@@ -69,32 +87,29 @@ static bool __must_check __vma_end_exclude_readers(struct vm_area_struct *vma)
  * Note that this function pairs with vma_refcount_put() which will wake up this
  * thread when it detects that the last reader has released its lock.
  *
- * The state parameter ought to be set to TASK_UNINTERRUPTIBLE in cases where we
- * wish the thread to sleep uninterruptibly or TASK_KILLABLE if a fatal signal
- * is permitted to kill it.
+ * The ves->state parameter ought to be set to TASK_UNINTERRUPTIBLE in cases
+ * where we wish the thread to sleep uninterruptibly or TASK_KILLABLE if a fatal
+ * signal is permitted to kill it.
  *
- * The function will return 0 immediately if the VMA is detached, or wait for
- * readers and return 1 once they have all exited, leaving the VMA exclusively
- * locked.
+ * The function sets the ves->exclusive parameter to true if readers were
+ * excluded, or false if the VMA was detached or an error arose on wait.
  *
- * If the function returns 1, the caller is required to invoke
- * __vma_end_exclude_readers() once the exclusive state is no longer required.
+ * If the function indicates an exclusive lock was acquired via ves->exclusive
+ * the caller is required to invoke __vma_end_exclude_readers() once the
+ * exclusive state is no longer required.
  *
- * If state is set to something other than TASK_UNINTERRUPTIBLE, the function
- * may also return -EINTR to indicate a fatal signal was received while waiting.
+ * If ves->state is set to something other than TASK_UNINTERRUPTIBLE, the
+ * function may also return -EINTR to indicate a fatal signal was received while
+ * waiting.
  */
-static int __vma_start_exclude_readers(struct vm_area_struct *vma,
-		bool detaching, int state)
+static int __vma_start_exclude_readers(struct vma_exclude_readers_state *ves)
 {
-	int err;
-	unsigned int tgt_refcnt = VM_REFCNT_EXCLUDE_READERS_FLAG;
+	struct vm_area_struct *vma = ves->vma;
+	unsigned int tgt_refcnt = get_target_refcnt(ves);
+	int err = 0;
 
 	mmap_assert_write_locked(vma->vm_mm);
 
-	/* Additional refcnt if the vma is attached. */
-	if (!detaching)
-		tgt_refcnt++;
-
 	/*
 	 * If vma is detached then only vma_mark_attached() can raise the
 	 * vm_refcnt. mmap_write_lock prevents racing with vma_mark_attached().
@@ -102,37 +117,39 @@ static int __vma_start_exclude_readers(struct vm_area_struct *vma,
 	 * See the comment describing the vm_area_struct->vm_refcnt field for
 	 * details of possible refcnt values.
 	 */
-	if (!refcount_add_not_zero(VM_REFCNT_EXCLUDE_READERS_FLAG, &vma->vm_refcnt))
+	if (!refcount_add_not_zero(VM_REFCNT_EXCLUDE_READERS_FLAG, &vma->vm_refcnt)) {
+		ves->detached = true;
 		return 0;
+	}
 
 	__vma_lockdep_acquire_exclusive(vma);
 
 	err = rcuwait_wait_event(&vma->vm_mm->vma_writer_wait,
		       refcount_read(&vma->vm_refcnt) == tgt_refcnt,
-		       state);
+		       ves->state);
 	if (err) {
-		if (__vma_end_exclude_readers(vma)) {
-			/*
-			 * The wait failed, but the last reader went away
-			 * as well. Tell the caller the VMA is detached.
-			 */
-			WARN_ON_ONCE(!detaching);
-			err = 0;
-		}
+		__vma_end_exclude_readers(ves);
 		return err;
 	}
-	__vma_lockdep_stat_mark_acquired(vma);
-	return 1;
+	__vma_lockdep_stat_mark_acquired(vma);
+	ves->exclusive = true;
+	return 0;
 }
 
 int __vma_start_write(struct vm_area_struct *vma, unsigned int mm_lock_seq,
 		      int state)
 {
-	int locked;
+	int err;
+	struct vma_exclude_readers_state ves = {
+		.vma = vma,
+		.state = state,
+	};
 
-	locked = __vma_start_exclude_readers(vma, false, state);
-	if (locked < 0)
-		return locked;
+	err = __vma_start_exclude_readers(&ves);
+	if (err) {
+		WARN_ON_ONCE(ves.detached);
+		return err;
+	}
 
 	/*
 	 * We should use WRITE_ONCE() here because we can have concurrent reads
@@ -142,45 +159,42 @@ int __vma_start_write(struct vm_area_struct *vma, unsigned int mm_lock_seq,
 	 */
 	WRITE_ONCE(vma->vm_lock_seq, mm_lock_seq);
 
-	if (locked) {
-		bool detached = __vma_end_exclude_readers(vma);
-
-		/* The VMA should remain attached. */
-		WARN_ON_ONCE(detached);
+	if (ves.exclusive) {
+		__vma_end_exclude_readers(&ves);
+		/* VMA should remain attached. */
+		WARN_ON_ONCE(ves.detached);
 	}
 
 	return 0;
 }
 EXPORT_SYMBOL_GPL(__vma_start_write);
 
-void vma_mark_detached(struct vm_area_struct *vma)
+void __vma_exclude_readers_for_detach(struct vm_area_struct *vma)
 {
-	vma_assert_write_locked(vma);
-	vma_assert_attached(vma);
+	struct vma_exclude_readers_state ves = {
+		.vma = vma,
+		.state = TASK_UNINTERRUPTIBLE,
+		.detaching = true,
+	};
+	int err;
 
 	/*
-	 * This condition - that the VMA is still attached (refcnt > 0) - is
-	 * unlikely, because the vma has been already write-locked and readers
-	 * can increment vm_refcnt only temporarily before they check
-	 * vm_lock_seq, realize the vma is locked and drop back the
-	 * vm_refcnt. That is a narrow window for observing a raised vm_refcnt.
-	 *
-	 * See the comment describing the vm_area_struct->vm_refcnt field for
-	 * details of possible refcnt values.
+	 * Wait until the VMA is detached with no readers. Since we hold the VMA
+	 * write lock, the only read locks that might be present are those from
+	 * threads trying to acquire the read lock and incrementing the
+	 * reference count before realising the write lock is held and
+	 * decrementing it.
 	 */
-	if (unlikely(__vma_refcount_put_return(vma))) {
-		/* Wait until vma is detached with no readers. */
-		if (__vma_start_exclude_readers(vma, true, TASK_UNINTERRUPTIBLE)) {
-			bool detached;
-
-			/*
-			 * Once this is complete, no readers can increment the
-			 * reference count, and the VMA is marked detached.
-			 */
-			detached = __vma_end_exclude_readers(vma);
-			WARN_ON_ONCE(!detached);
-		}
+	err = __vma_start_exclude_readers(&ves);
+	if (!err && ves.exclusive) {
+		/*
+		 * Once this is complete, no readers can increment the
+		 * reference count, and the VMA is marked detached.
+		 */
+		__vma_end_exclude_readers(&ves);
 	}
+	/* If an error arose but we were detached anyway, we don't care. */
+	WARN_ON_ONCE(!ves.detached);
 }
 
 /*
-- 
2.52.0