It is confusing to have __vma_enter_exclusive_locked() return 0, 1 or an
error (the latter only when waiting for readers in TASK_KILLABLE state),
and storing the return value in a stack variable called 'locked' adds
further confusion.

More generally, we do a lot of rather finicky things while entering the
state in which readers are excluded and while moving out of it again,
including tracking whether the VMA is detached and whether an error
occurred. The logic in __vma_enter_exclusive_locked() effectively
amounts to 'if one caller invokes us do X, if another do Y', which is
very confusing from a control flow perspective.

Introduce a shared helper state object, struct
vma_exclude_readers_state, to avoid this. We can now handle the 'an
error arose but we are detached' condition correctly in both callers -
warning if we are not detaching, and treating the situation as if no
error arose when we are. The object also helps document what is going on
and allows us to add some more logical debug asserts.

Also update vma_mark_detached() to add a guard clause for the likely
'already detached' case (given we hold the mmap write lock), and add a
comment about ephemeral VMA read lock reference count increments to
clarify why we enter and exit an exclusive locked state there.

No functional change intended.
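In outline, the resulting calling convention looks like this (a
condensed sketch derived from the diff below - error handling,
assertions and the callers' surrounding logic are trimmed):

	struct vma_exclude_readers_state ves = {
		.vma = vma,
		.state = TASK_UNINTERRUPTIBLE,	/* Or TASK_KILLABLE. */
		.detaching = false,	/* True only when detaching the VMA. */
	};
	int err;

	err = __vma_enter_exclusive_locked(&ves);
	if (!err && ves.exclusive) {
		/* All readers evicted - we are exclusively locked. */

		/* ... work requiring readers to be excluded goes here ... */

		__vma_exit_exclusive_locked(&ves);
	}
	/* ves.detached now reports whether the VMA ended up detached. */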
Signed-off-by: Lorenzo Stoakes
---
 mm/mmap_lock.c | 144 +++++++++++++++++++++++++++++++------------------
 1 file changed, 91 insertions(+), 53 deletions(-)

diff --git a/mm/mmap_lock.c b/mm/mmap_lock.c
index f73221174a8b..75166a43ffa4 100644
--- a/mm/mmap_lock.c
+++ b/mm/mmap_lock.c
@@ -46,20 +46,40 @@ EXPORT_SYMBOL(__mmap_lock_do_trace_released);
 #ifdef CONFIG_MMU
 #ifdef CONFIG_PER_VMA_LOCK
 
+/* State shared across __vma_[enter, exit]_exclusive_locked(). */
+struct vma_exclude_readers_state {
+	/* Input parameters. */
+	struct vm_area_struct *vma;
+	int state;	/* TASK_KILLABLE or TASK_UNINTERRUPTIBLE. */
+	bool detaching;
+
+	bool detached;
+	bool exclusive;	/* Are we exclusively locked? */
+};
+
 /*
  * Now that all readers have been evicted, mark the VMA as being out of the
  * 'exclude readers' state.
  *
- * Returns true if the VMA is now detached, otherwise false.
+ * Sets ves->detached to true if the VMA is now detached, otherwise false.
  */
-static bool __must_check __vma_exit_exclusive_locked(struct vm_area_struct *vma)
+static void __vma_exit_exclusive_locked(struct vma_exclude_readers_state *ves)
 {
-	bool detached;
+	struct vm_area_struct *vma = ves->vma;
+
+	VM_WARN_ON_ONCE(ves->detached);
+	VM_WARN_ON_ONCE(!ves->exclusive);
 
-	detached = refcount_sub_and_test(VM_REFCNT_EXCLUDE_READERS_FLAG,
-					 &vma->vm_refcnt);
+	ves->detached = refcount_sub_and_test(VM_REFCNT_EXCLUDE_READERS_FLAG,
+					      &vma->vm_refcnt);
 	__vma_lockdep_release_exclusive(vma);
-	return detached;
+}
+
+static unsigned int get_target_refcnt(struct vma_exclude_readers_state *ves)
+{
+	const unsigned int tgt = ves->detaching ? 0 : 1;
+
+	return tgt | VM_REFCNT_EXCLUDE_READERS_FLAG;
 }
 
 /*
@@ -69,30 +89,31 @@ static bool __must_check __vma_exit_exclusive_locked(struct vm_area_struct *vma)
  * Note that this function pairs with vma_refcount_put() which will wake up this
  * thread when it detects that the last reader has released its lock.
  *
- * The state parameter ought to be set to TASK_UNINTERRUPTIBLE in cases where we
- * wish the thread to sleep uninterruptibly or TASK_KILLABLE if a fatal signal
- * is permitted to kill it.
+ * The ves->state parameter ought to be set to TASK_UNINTERRUPTIBLE in cases
+ * where we wish the thread to sleep uninterruptibly or TASK_KILLABLE if a fatal
+ * signal is permitted to kill it.
  *
- * The function will return 0 immediately if the VMA is detached, and 1 once the
- * VMA has evicted all readers, leaving the VMA exclusively locked.
+ * The function sets ves->exclusive to true if an exclusive lock was acquired,
+ * or false if the VMA was detached or an error arose on wait.
  *
- * If the function returns 1, the caller is required to invoke
- * __vma_exit_exclusive_locked() once the exclusive state is no longer required.
+ * If the function indicates an exclusive lock was acquired via ves->exclusive
+ * (or equivalently, returning 0 with !ves->detached), the caller is required to
+ * invoke __vma_exit_exclusive_locked() once the exclusive state is no longer
+ * required.
  *
- * If state is set to something other than TASK_UNINTERRUPTIBLE, the function
- * may also return -EINTR to indicate a fatal signal was received while waiting.
+ * If ves->state is set to something other than TASK_UNINTERRUPTIBLE, the
+ * function may also return -EINTR to indicate a fatal signal was received while
+ * waiting.
  */
-static int __vma_enter_exclusive_locked(struct vm_area_struct *vma,
-					bool detaching, int state)
+static int __vma_enter_exclusive_locked(struct vma_exclude_readers_state *ves)
 {
-	int err;
-	unsigned int tgt_refcnt = VM_REFCNT_EXCLUDE_READERS_FLAG;
+	struct vm_area_struct *vma = ves->vma;
+	unsigned int tgt_refcnt = get_target_refcnt(ves);
+	int err = 0;
 
 	mmap_assert_write_locked(vma->vm_mm);
-
-	/* Additional refcnt if the vma is attached. */
-	if (!detaching)
-		tgt_refcnt++;
+	VM_WARN_ON_ONCE(ves->detached);
+	VM_WARN_ON_ONCE(ves->exclusive);
 
 	/*
 	 * If vma is detached then only vma_mark_attached() can raise the
@@ -101,37 +122,39 @@ static int __vma_enter_exclusive_locked(struct vm_area_struct *vma,
 	 * See the comment describing the vm_area_struct->vm_refcnt field for
 	 * details of possible refcnt values.
 	 */
-	if (!refcount_add_not_zero(VM_REFCNT_EXCLUDE_READERS_FLAG, &vma->vm_refcnt))
+	if (!refcount_add_not_zero(VM_REFCNT_EXCLUDE_READERS_FLAG, &vma->vm_refcnt)) {
+		ves->detached = true;
 		return 0;
+	}
 
 	__vma_lockdep_acquire_exclusive(vma);
 
 	err = rcuwait_wait_event(&vma->vm_mm->vma_writer_wait,
 		   refcount_read(&vma->vm_refcnt) == tgt_refcnt,
-		   state);
+		   ves->state);
 	if (err) {
-		if (__vma_exit_exclusive_locked(vma)) {
-			/*
-			 * The wait failed, but the last reader went away
-			 * as well. Tell the caller the VMA is detached.
-			 */
-			WARN_ON_ONCE(!detaching);
-			err = 0;
-		}
+		__vma_exit_exclusive_locked(ves);
 		return err;
 	}
-	__vma_lockdep_stat_mark_acquired(vma);
-	return 1;
+	__vma_lockdep_stat_mark_acquired(vma);
+	ves->exclusive = true;
+	return 0;
 }
 
 int __vma_start_write(struct vm_area_struct *vma, unsigned int mm_lock_seq,
 		      int state)
 {
-	int locked;
+	int err;
+	struct vma_exclude_readers_state ves = {
+		.vma = vma,
+		.state = state,
+	};
 
-	locked = __vma_enter_exclusive_locked(vma, false, state);
-	if (locked < 0)
-		return locked;
+	err = __vma_enter_exclusive_locked(&ves);
+	if (err) {
+		WARN_ON_ONCE(ves.detached);
+		return err;
+	}
 
 	/*
 	 * We should use WRITE_ONCE() here because we can have concurrent reads
@@ -141,9 +164,11 @@ int __vma_start_write(struct vm_area_struct *vma, unsigned int mm_lock_seq,
 	 */
 	WRITE_ONCE(vma->vm_lock_seq, mm_lock_seq);
 
-	/* vma should remain attached. */
-	if (locked)
-		WARN_ON_ONCE(__vma_exit_exclusive_locked(vma));
+	if (!ves.detached) {
+		__vma_exit_exclusive_locked(&ves);
+		/* VMA should remain attached. */
+		WARN_ON_ONCE(ves.detached);
+	}
 
 	return 0;
 }
@@ -151,7 +176,12 @@ EXPORT_SYMBOL_GPL(__vma_start_write);
 
 void vma_mark_detached(struct vm_area_struct *vma)
 {
-	bool detached;
+	struct vma_exclude_readers_state ves = {
+		.vma = vma,
+		.state = TASK_UNINTERRUPTIBLE,
+		.detaching = true,
+	};
+	int err;
 
 	vma_assert_write_locked(vma);
 	vma_assert_attached(vma);
@@ -160,18 +190,26 @@ void vma_mark_detached(struct vm_area_struct *vma)
 	 * See the comment describing the vm_area_struct->vm_refcnt field for
 	 * details of possible refcnt values.
 	 */
-	detached = __vma_refcount_put(vma, NULL);
-	if (unlikely(!detached)) {
-		/* Wait until vma is detached with no readers. */
-		if (__vma_enter_exclusive_locked(vma, true, TASK_UNINTERRUPTIBLE)) {
-			/*
-			 * Once this is complete, no readers can increment the
-			 * reference count, and the VMA is marked detached.
-			 */
-			detached = __vma_exit_exclusive_locked(vma);
-			WARN_ON_ONCE(!detached);
-		}
+	if (likely(__vma_refcount_put(vma, NULL)))
+		return;
+
+	/*
+	 * Wait until the VMA is detached with no readers. Since we hold the VMA
+	 * write lock, the only read locks that might be present are those from
+	 * threads trying to acquire the read lock and incrementing the
+	 * reference count before realising the write lock is held and
+	 * decrementing it.
+	 */
+	err = __vma_enter_exclusive_locked(&ves);
+	if (!err && !ves.detached) {
+		/*
+		 * Once this is complete, no readers can increment the
+		 * reference count, and the VMA is marked detached.
+		 */
+		__vma_exit_exclusive_locked(&ves);
 	}
+	/* If an error arose but we were detached anyway, we don't care. */
+	WARN_ON_ONCE(!ves.detached);
 }
 
 /*
-- 
2.52.0