The VMA_LOCK_OFFSET value encodes a flag which is set in vma->vm_refcnt in order to indicate that a VMA is in the process of having VMA read-locks excluded in __vma_enter_locked() (that is, first checking if there are any VMA read locks held, and if there are, waiting on them to be released). This happens when a VMA write lock is being established, or when a VMA is being marked detached and its reference count is found to be elevated because read-lockers temporarily raised it before noticing that a VMA write lock is in place. The naming does not convey any of this, so rename VMA_LOCK_OFFSET to VM_REFCNT_EXCLUDE_READERS_FLAG (with a sensible new prefix to differentiate it from the newly introduced VMA_*_BIT flags). Also rename VMA_REF_LIMIT to VM_REFCNT_LIMIT for consistency. Update comments to reflect this. No functional change intended. Reviewed-by: Suren Baghdasaryan Signed-off-by: Lorenzo Stoakes --- include/linux/mm_types.h | 17 +++++++++++++---- include/linux/mmap_lock.h | 14 ++++++++------ mm/mmap_lock.c | 17 ++++++++++------- 3 files changed, 31 insertions(+), 17 deletions(-) diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 78950eb8926d..bdbf17c4f26b 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -752,8 +752,17 @@ static inline struct anon_vma_name *anon_vma_name_alloc(const char *name) } #endif -#define VMA_LOCK_OFFSET 0x40000000 -#define VMA_REF_LIMIT (VMA_LOCK_OFFSET - 1) +/* + * While __vma_enter_locked() is working to ensure there are no read-locks held + * on a VMA (either while acquiring a VMA write lock or marking a VMA detached) + * we set the VM_REFCNT_EXCLUDE_READERS_FLAG in vma->vm_refcnt to indicate to + * vma_start_read() that the reference count should be left alone. + * + * Once the operation is complete, this value is subtracted from vma->vm_refcnt. + */ +#define VM_REFCNT_EXCLUDE_READERS_BIT (30) +#define VM_REFCNT_EXCLUDE_READERS_FLAG (1U << VM_REFCNT_EXCLUDE_READERS_BIT) +#define VM_REFCNT_LIMIT (VM_REFCNT_EXCLUDE_READERS_FLAG - 1) struct vma_numab_state { /* @@ -935,10 +944,10 @@ struct vm_area_struct { /* * Can only be written (using WRITE_ONCE()) while holding both: * - mmap_lock (in write mode) - * - vm_refcnt bit at VMA_LOCK_OFFSET is set + * - vm_refcnt bit at VM_REFCNT_EXCLUDE_READERS_BIT is set * Can be read reliably while holding one of: * - mmap_lock (in read or write mode) - * - vm_refcnt bit at VMA_LOCK_OFFSET is set or vm_refcnt > 1 + * - vm_refcnt bit at VM_REFCNT_EXCLUDE_READERS_BIT is set or vm_refcnt > 1 * Can be read unreliably (using READ_ONCE()) for pessimistic bailout * while holding nothing (except RCU to keep the VMA struct allocated). * diff --git a/include/linux/mmap_lock.h b/include/linux/mmap_lock.h index b50416fbba20..5acbd4ba1b52 100644 --- a/include/linux/mmap_lock.h +++ b/include/linux/mmap_lock.h @@ -125,12 +125,14 @@ static inline void vma_lock_init(struct vm_area_struct *vma, bool reset_refcnt) static inline bool is_vma_writer_only(int refcnt) { /* - * With a writer and no readers, refcnt is VMA_LOCK_OFFSET if the vma - * is detached and (VMA_LOCK_OFFSET + 1) if it is attached. Waiting on - * a detached vma happens only in vma_mark_detached() and is a rare - * case, therefore most of the time there will be no unnecessary wakeup. + * With a writer and no readers, refcnt is VM_REFCNT_EXCLUDE_READERS_FLAG + * if the vma is detached and (VM_REFCNT_EXCLUDE_READERS_FLAG + 1) if it is + * attached.
Waiting on a detached vma happens only in + * vma_mark_detached() and is a rare case, therefore most of the time + * there will be no unnecessary wakeup. */ - return (refcnt & VMA_LOCK_OFFSET) && refcnt <= VMA_LOCK_OFFSET + 1; + return (refcnt & VM_REFCNT_EXCLUDE_READERS_FLAG) && + refcnt <= VM_REFCNT_EXCLUDE_READERS_FLAG + 1; } static inline void vma_refcount_put(struct vm_area_struct *vma) @@ -159,7 +161,7 @@ static inline bool vma_start_read_locked_nested(struct vm_area_struct *vma, int mmap_assert_locked(vma->vm_mm); if (unlikely(!__refcount_inc_not_zero_limited_acquire(&vma->vm_refcnt, &oldcnt, - VMA_REF_LIMIT))) + VM_REFCNT_LIMIT))) return false; rwsem_acquire_read(&vma->vmlock_dep_map, 0, 1, _RET_IP_); diff --git a/mm/mmap_lock.c b/mm/mmap_lock.c index 7421b7ea8001..1d23b48552e9 100644 --- a/mm/mmap_lock.c +++ b/mm/mmap_lock.c @@ -54,7 +54,7 @@ static inline int __vma_enter_locked(struct vm_area_struct *vma, bool detaching, int state) { int err; - unsigned int tgt_refcnt = VMA_LOCK_OFFSET; + unsigned int tgt_refcnt = VM_REFCNT_EXCLUDE_READERS_FLAG; mmap_assert_write_locked(vma->vm_mm); @@ -66,7 +66,7 @@ static inline int __vma_enter_locked(struct vm_area_struct *vma, * If vma is detached then only vma_mark_attached() can raise the * vm_refcnt. mmap_write_lock prevents racing with vma_mark_attached(). */ - if (!refcount_add_not_zero(VMA_LOCK_OFFSET, &vma->vm_refcnt)) + if (!refcount_add_not_zero(VM_REFCNT_EXCLUDE_READERS_FLAG, &vma->vm_refcnt)) return 0; rwsem_acquire(&vma->vmlock_dep_map, 0, 0, _RET_IP_); @@ -74,7 +74,7 @@ static inline int __vma_enter_locked(struct vm_area_struct *vma, refcount_read(&vma->vm_refcnt) == tgt_refcnt, state); if (err) { - if (refcount_sub_and_test(VMA_LOCK_OFFSET, &vma->vm_refcnt)) { + if (refcount_sub_and_test(VM_REFCNT_EXCLUDE_READERS_FLAG, &vma->vm_refcnt)) { /* * The wait failed, but the last reader went away * as well. Tell the caller the VMA is detached. @@ -92,7 +92,8 @@ static inline int __vma_enter_locked(struct vm_area_struct *vma, static inline void __vma_exit_locked(struct vm_area_struct *vma, bool *detached) { - *detached = refcount_sub_and_test(VMA_LOCK_OFFSET, &vma->vm_refcnt); + *detached = refcount_sub_and_test(VM_REFCNT_EXCLUDE_READERS_FLAG, + &vma->vm_refcnt); rwsem_release(&vma->vmlock_dep_map, _RET_IP_); } @@ -180,13 +181,15 @@ static inline struct vm_area_struct *vma_start_read(struct mm_struct *mm, } /* - * If VMA_LOCK_OFFSET is set, __refcount_inc_not_zero_limited_acquire() - * will fail because VMA_REF_LIMIT is less than VMA_LOCK_OFFSET. + * If VM_REFCNT_EXCLUDE_READERS_FLAG is set, + * __refcount_inc_not_zero_limited_acquire() will fail because + * VM_REFCNT_LIMIT is less than VM_REFCNT_EXCLUDE_READERS_FLAG. + * * Acquire fence is required here to avoid reordering against later * vm_lock_seq check and checks inside lock_vma_under_rcu(). */ if (unlikely(!__refcount_inc_not_zero_limited_acquire(&vma->vm_refcnt, &oldcnt, - VMA_REF_LIMIT))) { + VM_REFCNT_LIMIT))) { /* return EAGAIN if vma got detached from under us */ vma = oldcnt ? NULL : ERR_PTR(-EAGAIN); goto err; -- 2.52.0 The possible vma->vm_refcnt values are confusing and vague, explain in detail what these can be in a comment describing the vma->vm_refcnt field and reference this comment in various places that read/write this field. No functional change intended. 
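Purely as an illustration of the states the new comment enumerates (this snippet is not part of the patch - the enum and helper names are hypothetical), the classification can be sketched roughly as:

    enum vma_refcnt_state {
        VMA_REFCNT_DETACHED,    /* == 0: readers cannot increment */
        VMA_REFCNT_NO_READERS,  /* == 1: attached, unlocked or write-locked */
        VMA_REFCNT_READERS,     /* > 1, < VM_REFCNT_EXCLUDE_READERS_FLAG */
        VMA_REFCNT_EXCLUDING,   /* >= VM_REFCNT_EXCLUDE_READERS_FLAG */
    };

    static enum vma_refcnt_state vma_refcnt_classify(unsigned int refcnt)
    {
        if (refcnt == 0)
            return VMA_REFCNT_DETACHED;
        if (refcnt == 1)
            return VMA_REFCNT_NO_READERS;
        if (refcnt < VM_REFCNT_EXCLUDE_READERS_FLAG)
            return VMA_REFCNT_READERS;
        /* Readers are being excluded; the count can now only drop. */
        return VMA_REFCNT_EXCLUDING;
    }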
Signed-off-by: Lorenzo Stoakes --- include/linux/mm_types.h | 42 +++++++++++++++++++++++++++++++++++++-- include/linux/mmap_lock.h | 7 +++++++ mm/mmap_lock.c | 6 ++++++ 3 files changed, 53 insertions(+), 2 deletions(-) diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index bdbf17c4f26b..12281a1128c9 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -758,7 +758,8 @@ static inline struct anon_vma_name *anon_vma_name_alloc(const char *name) * set the VM_REFCNT_EXCLUDE_READERS_FLAG in vma->vm_refcnt to indicate to * vma_start_read() that the reference count should be left alone. * - * Once the operation is complete, this value is subtracted from vma->vm_refcnt. + * See the comment describing vm_refcnt in vm_area_struct for details as to + * which values the VMA reference count can take. */ #define VM_REFCNT_EXCLUDE_READERS_BIT (30) #define VM_REFCNT_EXCLUDE_READERS_FLAG (1U << VM_REFCNT_EXCLUDE_READERS_BIT) @@ -989,7 +990,44 @@ struct vm_area_struct { struct vma_numab_state *numab_state; /* NUMA Balancing state */ #endif #ifdef CONFIG_PER_VMA_LOCK - /* Unstable RCU readers are allowed to read this. */ + /* + * Used to keep track of, firstly, whether the VMA is attached; secondly, + * if attached, how many read locks are taken; and thirdly, if the + * VM_REFCNT_EXCLUDE_READERS_FLAG is set, whether any read locks held + * are currently in the process of being excluded. + * + * This value can be equal to: + * + * 0 - Detached. IMPORTANT: when the refcnt is zero, readers cannot + * increment it. + * + * 1 - Attached and either unlocked or write-locked. Write locks are + * identified via __is_vma_write_locked() which checks for equality of + * vma->vm_lock_seq and mm->mm_lock_seq. + * + * >1, < VM_REFCNT_EXCLUDE_READERS_FLAG - Read-locked or (unlikely) + * write-locked with other threads having temporarily incremented the + * reference count prior to determining it is write-locked and + * decrementing it again. + * + * VM_REFCNT_EXCLUDE_READERS_FLAG - Detached, pending + * __vma_exit_locked() completion which will decrement the reference + * count to zero. IMPORTANT - at this stage no further readers can + * increment the reference count. It can only be reduced. + * + * VM_REFCNT_EXCLUDE_READERS_FLAG + 1 - A thread is either write-locking + * an attached VMA and has yet to invoke __vma_exit_locked(), OR a + * thread is detaching a VMA and is waiting on a single spurious reader + * in order to decrement the reference count. IMPORTANT - as above, no + * further readers can increment the reference count. + * + * > VM_REFCNT_EXCLUDE_READERS_FLAG + 1 - A thread is either + * write-locking or detaching a VMA and is waiting on readers to + * exit. IMPORTANT - as above, no further readers can increment the + * reference count. + * + * NOTE: Unstable RCU readers are allowed to read this. + */ refcount_t vm_refcnt ____cacheline_aligned_in_smp; #ifdef CONFIG_DEBUG_LOCK_ALLOC struct lockdep_map vmlock_dep_map; diff --git a/include/linux/mmap_lock.h b/include/linux/mmap_lock.h index 5acbd4ba1b52..a764439d0276 100644 --- a/include/linux/mmap_lock.h +++ b/include/linux/mmap_lock.h @@ -130,6 +130,9 @@ static inline bool is_vma_writer_only(int refcnt) * attached. Waiting on a detached vma happens only in * vma_mark_detached() and is a rare case, therefore most of the time * there will be no unnecessary wakeup. + * + * See the comment describing the vm_area_struct->vm_refcnt field for + * details of possible refcnt values.
*/ return (refcnt & VM_REFCNT_EXCLUDE_READERS_FLAG) && refcnt <= VM_REFCNT_EXCLUDE_READERS_FLAG + 1; @@ -249,6 +252,10 @@ static inline void vma_assert_locked(struct vm_area_struct *vma) { unsigned int mm_lock_seq; + /* + * See the comment describing the vm_area_struct->vm_refcnt field for + * details of possible refcnt values. + */ VM_BUG_ON_VMA(refcount_read(&vma->vm_refcnt) <= 1 && !__is_vma_write_locked(vma, &mm_lock_seq), vma); } diff --git a/mm/mmap_lock.c b/mm/mmap_lock.c index 1d23b48552e9..75dc098aea14 100644 --- a/mm/mmap_lock.c +++ b/mm/mmap_lock.c @@ -65,6 +65,9 @@ static inline int __vma_enter_locked(struct vm_area_struct *vma, /* * If vma is detached then only vma_mark_attached() can raise the * vm_refcnt. mmap_write_lock prevents racing with vma_mark_attached(). + * + * See the comment describing the vm_area_struct->vm_refcnt field for + * details of possible refcnt values. */ if (!refcount_add_not_zero(VM_REFCNT_EXCLUDE_READERS_FLAG, &vma->vm_refcnt)) return 0; @@ -137,6 +140,9 @@ void vma_mark_detached(struct vm_area_struct *vma) * before they check vm_lock_seq, realize the vma is locked and drop * back the vm_refcnt. That is a narrow window for observing a raised * vm_refcnt. + * + * See the comment describing the vm_area_struct->vm_refcnt field for + * details of possible refcnt values. */ if (unlikely(!refcount_dec_and_test(&vma->vm_refcnt))) { /* Wait until vma is detached with no readers. */ -- 2.52.0 The is_vma_writer_only() function is misnamed - this isn't determining if there is only a write lock, as it checks for the presence of the VM_REFCNT_EXCLUDE_READERS_FLAG. Really, it is checking to see whether readers are excluded, with a possibility of a false positive in the case of a detachment (there we expect the vma->vm_refcnt to eventually be set to VM_REFCNT_EXCLUDE_READERS_FLAG, whereas for an attached VMA we expect it to eventually be set to VM_REFCNT_EXCLUDE_READERS_FLAG + 1). Rename the function accordingly. Relatedly, we use a __refcount_dec_and_test() primitive directly in vma_refcount_put(), using the old value to determine what the reference count ought to be after the operation is complete (ignoring racing reference count adjustments). Wrap this into a __vma_refcount_put_return() function, which we can then utilise in vma_mark_detached() and thus keep the refcount primitive usage abstracted. This function, as the name implies, returns the value after the reference count has been updated. This reduces duplication in the two invocations of this function. Also adjust comments, removing duplicative comments covered elsewhere and adding more to aid understanding. No functional change intended. Signed-off-by: Lorenzo Stoakes --- include/linux/mmap_lock.h | 66 +++++++++++++++++++++++++++++++-------- mm/mmap_lock.c | 17 +++++----- 2 files changed, 63 insertions(+), 20 deletions(-) diff --git a/include/linux/mmap_lock.h b/include/linux/mmap_lock.h index a764439d0276..294fb282052d 100644 --- a/include/linux/mmap_lock.h +++ b/include/linux/mmap_lock.h @@ -122,15 +122,22 @@ static inline void vma_lock_init(struct vm_area_struct *vma, bool reset_refcnt) vma->vm_lock_seq = UINT_MAX; } -static inline bool is_vma_writer_only(int refcnt) +/* + * This function determines whether the input VMA reference count describes a + * VMA which has excluded all VMA read locks. 
+ * + * In the case of a detached VMA, we may incorrectly indicate that readers are + * excluded when one remains, because in that scenario we target a refcount of + * VM_REFCNT_EXCLUDE_READERS_FLAG, rather than the attached target of + * VM_REFCNT_EXCLUDE_READERS_FLAG + 1. + * + * However, the race window for that is very small so it is unlikely. + * + * Returns: true if readers are excluded, false otherwise. + */ +static inline bool __vma_are_readers_excluded(int refcnt) { /* - * With a writer and no readers, refcnt is VM_REFCNT_EXCLUDE_READERS_FLAG - * if the vma is detached and (VM_REFCNT_EXCLUDE_READERS_FLAG + 1) if it is - * attached. Waiting on a detached vma happens only in - * vma_mark_detached() and is a rare case, therefore most of the time - * there will be no unnecessary wakeup. - * * See the comment describing the vm_area_struct->vm_refcnt field for * details of possible refcnt values. */ @@ -138,18 +145,51 @@ static inline bool is_vma_writer_only(int refcnt) refcnt <= VM_REFCNT_EXCLUDE_READERS_FLAG + 1; } +/* + * Actually decrement the VMA reference count. + * + * The function returns the reference count as it was immediately after the + * decrement took place. If it returns zero, the VMA is now detached. + */ +static inline __must_check unsigned int +__vma_refcount_put_return(struct vm_area_struct *vma) +{ + int oldcnt; + + if (__refcount_dec_and_test(&vma->vm_refcnt, &oldcnt)) + return 0; + + return oldcnt - 1; +} + +/** + * vma_refcount_put() - Drop reference count in VMA vm_refcnt field due to a + * read-lock being dropped. + * @vma: The VMA whose reference count we wish to decrement. + * + * If we were the last reader, wake up threads waiting to obtain an exclusive + * lock. + */ static inline void vma_refcount_put(struct vm_area_struct *vma) { - /* Use a copy of vm_mm in case vma is freed after we drop vm_refcnt */ + /* Use a copy of vm_mm in case vma is freed after we drop vm_refcnt. */ struct mm_struct *mm = vma->vm_mm; - int oldcnt; + int newcnt; rwsem_release(&vma->vmlock_dep_map, _RET_IP_); - if (!__refcount_dec_and_test(&vma->vm_refcnt, &oldcnt)) { - if (is_vma_writer_only(oldcnt - 1)) - rcuwait_wake_up(&mm->vma_writer_wait); - } + newcnt = __vma_refcount_put_return(vma); + /* + * __vma_enter_locked() may be sleeping waiting for readers to drop + * their reference count, so wake it up if we were the last reader + * blocking it from being acquired. + * + * We may be raced by other readers temporarily incrementing the + * reference count, though the race window is very small, this might + * cause spurious wakeups. + */ + if (newcnt && __vma_are_readers_excluded(newcnt)) + rcuwait_wake_up(&mm->vma_writer_wait); } /* diff --git a/mm/mmap_lock.c b/mm/mmap_lock.c index 75dc098aea14..6be1bbcde09e 100644 --- a/mm/mmap_lock.c +++ b/mm/mmap_lock.c @@ -134,21 +134,24 @@ void vma_mark_detached(struct vm_area_struct *vma) vma_assert_attached(vma); /* - * We are the only writer, so no need to use vma_refcount_put(). - * The condition below is unlikely because the vma has been already - * write-locked and readers can increment vm_refcnt only temporarily - * before they check vm_lock_seq, realize the vma is locked and drop - * back the vm_refcnt. That is a narrow window for observing a raised - * vm_refcnt. 
+ * This condition - that the VMA is still attached (refcnt > 0) - is + * unlikely, because the vma has been already write-locked and readers + * can increment vm_refcnt only temporarily before they check + * vm_lock_seq, realize the vma is locked and drop back the + * vm_refcnt. That is a narrow window for observing a raised vm_refcnt. * * See the comment describing the vm_area_struct->vm_refcnt field for * details of possible refcnt values. */ - if (unlikely(!refcount_dec_and_test(&vma->vm_refcnt))) { + if (unlikely(__vma_refcount_put_return(vma))) { /* Wait until vma is detached with no readers. */ if (__vma_enter_locked(vma, true, TASK_UNINTERRUPTIBLE)) { bool detached; + /* + * Once this is complete, no readers can increment the + * reference count, and the VMA is marked detached. + */ __vma_exit_locked(vma, &detached); WARN_ON_ONCE(!detached); } -- 2.52.0 The code is littered with inscrutable and duplicative lockdep incantations, replace these with defines which explain what is going on and add commentary to explain what we're doing. If lockdep is disabled these become no-ops. We must use defines so _RET_IP_ remains meaningful. These are self-documenting and aid readability of the code. Additionally, instead of using the confusing rwsem_*() form for something that is emphatically not an rwsem, we instead explicitly use lock_[acquired, release]_shared/exclusive() lockdep invocations since we are doing something rather custom here and these make more sense to use. No functional change intended. Reviewed-by: Suren Baghdasaryan Reviewed-by: Vlastimil Babka Signed-off-by: Lorenzo Stoakes --- include/linux/mmap_lock.h | 37 ++++++++++++++++++++++++++++++++++--- mm/mmap_lock.c | 10 +++++----- 2 files changed, 39 insertions(+), 8 deletions(-) diff --git a/include/linux/mmap_lock.h b/include/linux/mmap_lock.h index 294fb282052d..1887ca55ead7 100644 --- a/include/linux/mmap_lock.h +++ b/include/linux/mmap_lock.h @@ -78,6 +78,37 @@ static inline void mmap_assert_write_locked(const struct mm_struct *mm) #ifdef CONFIG_PER_VMA_LOCK +/* + * VMA locks do not behave like most ordinary locks found in the kernel, so we + * cannot quite have full lockdep tracking in the way we would ideally prefer. + * + * Read locks act as shared locks which exclude an exclusive lock being + * taken. We therefore mark these accordingly on read lock acquire/release. + * + * Write locks are acquired exclusively per-VMA, but released in a shared + * fashion, that is upon vma_end_write_all(), we update the mmap's seqcount such + * that write lock is released. + * + * We therefore cannot track write locks per-VMA, nor do we try. Mitigating this + * is the fact that, of course, we do lockdep-track the mmap lock rwsem which + * must be held when taking a VMA write lock. + * + * We do, however, want to indicate that during either acquisition of a VMA + * write lock or detachment of a VMA that we require the lock held be exclusive, + * so we utilise lockdep to do so. + */ +#define __vma_lockdep_acquire_read(vma) \ + lock_acquire_shared(&vma->vmlock_dep_map, 0, 1, NULL, _RET_IP_) +#define __vma_lockdep_release_read(vma) \ + lock_release(&vma->vmlock_dep_map, _RET_IP_) +#define __vma_lockdep_acquire_exclusive(vma) \ + lock_acquire_exclusive(&vma->vmlock_dep_map, 0, 0, NULL, _RET_IP_) +#define __vma_lockdep_release_exclusive(vma) \ + lock_release(&vma->vmlock_dep_map, _RET_IP_) +/* Only meaningful if CONFIG_LOCK_STAT is defined. 
*/ +#define __vma_lockdep_stat_mark_acquired(vma) \ + lock_acquired(&vma->vmlock_dep_map, _RET_IP_) + static inline void mm_lock_seqcount_init(struct mm_struct *mm) { seqcount_init(&mm->mm_lock_seq); @@ -176,9 +207,9 @@ static inline void vma_refcount_put(struct vm_area_struct *vma) struct mm_struct *mm = vma->vm_mm; int newcnt; - rwsem_release(&vma->vmlock_dep_map, _RET_IP_); - + __vma_lockdep_release_read(vma); newcnt = __vma_refcount_put_return(vma); + /* * __vma_enter_locked() may be sleeping waiting for readers to drop * their reference count, so wake it up if we were the last reader @@ -207,7 +238,7 @@ static inline bool vma_start_read_locked_nested(struct vm_area_struct *vma, int VM_REFCNT_LIMIT))) return false; - rwsem_acquire_read(&vma->vmlock_dep_map, 0, 1, _RET_IP_); + __vma_lockdep_acquire_read(vma); return true; } diff --git a/mm/mmap_lock.c b/mm/mmap_lock.c index 6be1bbcde09e..85b2ae1d9720 100644 --- a/mm/mmap_lock.c +++ b/mm/mmap_lock.c @@ -72,7 +72,7 @@ static inline int __vma_enter_locked(struct vm_area_struct *vma, if (!refcount_add_not_zero(VM_REFCNT_EXCLUDE_READERS_FLAG, &vma->vm_refcnt)) return 0; - rwsem_acquire(&vma->vmlock_dep_map, 0, 0, _RET_IP_); + __vma_lockdep_acquire_exclusive(vma); err = rcuwait_wait_event(&vma->vm_mm->vma_writer_wait, refcount_read(&vma->vm_refcnt) == tgt_refcnt, state); @@ -85,10 +85,10 @@ static inline int __vma_enter_locked(struct vm_area_struct *vma, WARN_ON_ONCE(!detaching); err = 0; } - rwsem_release(&vma->vmlock_dep_map, _RET_IP_); + __vma_lockdep_release_exclusive(vma); return err; } - lock_acquired(&vma->vmlock_dep_map, _RET_IP_); + __vma_lockdep_stat_mark_acquired(vma); return 1; } @@ -97,7 +97,7 @@ static inline void __vma_exit_locked(struct vm_area_struct *vma, bool *detached) { *detached = refcount_sub_and_test(VM_REFCNT_EXCLUDE_READERS_FLAG, &vma->vm_refcnt); - rwsem_release(&vma->vmlock_dep_map, _RET_IP_); + __vma_lockdep_release_exclusive(vma); } int __vma_start_write(struct vm_area_struct *vma, unsigned int mm_lock_seq, @@ -204,7 +204,7 @@ static inline struct vm_area_struct *vma_start_read(struct mm_struct *mm, goto err; } - rwsem_acquire_read(&vma->vmlock_dep_map, 0, 1, _RET_IP_); + __vma_lockdep_acquire_read(vma); if (unlikely(vma->vm_mm != mm)) goto err_unstable; -- 2.52.0 We're doing precisely the same thing that __vma_exit_locked() does, so de-duplicate this code and keep the refcount primitive in one place. No functional change intended. 
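For context, a rough sketch of how both callers pair these helpers after this change (illustrative only, error handling and the TASK_KILLABLE case elided):

    /* Sketch of the shared caller pattern, not literal patch code. */
    if (__vma_enter_locked(vma, detaching, state) > 0) {
        bool detached;

        /* Readers are now excluded; do the exclusive work here. */

        __vma_exit_locked(vma, &detached);
        /* detached reports whether the last reference went away. */
    }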
Reviewed-by: Suren Baghdasaryan Reviewed-by: Vlastimil Babka Signed-off-by: Lorenzo Stoakes --- mm/mmap_lock.c | 21 ++++++++++++--------- 1 file changed, 12 insertions(+), 9 deletions(-) diff --git a/mm/mmap_lock.c b/mm/mmap_lock.c index 85b2ae1d9720..1fabda07c922 100644 --- a/mm/mmap_lock.c +++ b/mm/mmap_lock.c @@ -45,6 +45,14 @@ EXPORT_SYMBOL(__mmap_lock_do_trace_released); #ifdef CONFIG_MMU #ifdef CONFIG_PER_VMA_LOCK + +static inline void __vma_exit_locked(struct vm_area_struct *vma, bool *detached) +{ + *detached = refcount_sub_and_test(VM_REFCNT_EXCLUDE_READERS_FLAG, + &vma->vm_refcnt); + __vma_lockdep_release_exclusive(vma); +} + /* * __vma_enter_locked() returns 0 immediately if the vma is not * attached, otherwise it waits for any current readers to finish and @@ -77,7 +85,10 @@ static inline int __vma_enter_locked(struct vm_area_struct *vma, refcount_read(&vma->vm_refcnt) == tgt_refcnt, state); if (err) { - if (refcount_sub_and_test(VM_REFCNT_EXCLUDE_READERS_FLAG, &vma->vm_refcnt)) { + bool detached; + + __vma_exit_locked(vma, &detached); + if (detached) { /* * The wait failed, but the last reader went away * as well. Tell the caller the VMA is detached. @@ -85,7 +96,6 @@ static inline int __vma_enter_locked(struct vm_area_struct *vma, WARN_ON_ONCE(!detaching); err = 0; } - __vma_lockdep_release_exclusive(vma); return err; } __vma_lockdep_stat_mark_acquired(vma); @@ -93,13 +103,6 @@ static inline int __vma_enter_locked(struct vm_area_struct *vma, return 1; } -static inline void __vma_exit_locked(struct vm_area_struct *vma, bool *detached) -{ - *detached = refcount_sub_and_test(VM_REFCNT_EXCLUDE_READERS_FLAG, - &vma->vm_refcnt); - __vma_lockdep_release_exclusive(vma); -} - int __vma_start_write(struct vm_area_struct *vma, unsigned int mm_lock_seq, int state) { -- 2.52.0 These functions are very confusing indeed. 'Entering' a lock could be interpreted as acquiring it, but this is not what these functions are interacting with. Equally they don't indicate at all what kind of lock we are 'entering' or 'exiting'. Finally they are misleading as we invoke these functions when we already hold a write lock to detach a VMA. These functions are explicitly simply 'entering' and 'exiting' a state in which we hold the EXCLUSIVE lock in order that we can either mark the VMA as being write-locked, or mark the VMA detached. Rename the functions accordingly, and also update __vma_end_exclude_readers() to return detached state with a __must_check directive, as it is simply clumsy to pass an output pointer here to detached state and inconsistent vs. __vma_start_exclude_readers(). Finally, remove the unnecessary 'inline' directives. No functional change intended. Signed-off-by: Lorenzo Stoakes --- include/linux/mmap_lock.h | 4 +-- mm/mmap_lock.c | 58 +++++++++++++++++++++++++++------------ 2 files changed, 42 insertions(+), 20 deletions(-) diff --git a/include/linux/mmap_lock.h b/include/linux/mmap_lock.h index 1887ca55ead7..d6df6aad3e24 100644 --- a/include/linux/mmap_lock.h +++ b/include/linux/mmap_lock.h @@ -211,8 +211,8 @@ static inline void vma_refcount_put(struct vm_area_struct *vma) newcnt = __vma_refcount_put_return(vma); /* - * __vma_enter_locked() may be sleeping waiting for readers to drop - * their reference count, so wake it up if we were the last reader + * __vma_start_exclude_readers() may be sleeping waiting for readers to + * drop their reference count, so wake it up if we were the last reader * blocking it from being acquired. 
* * We may be raced by other readers temporarily incrementing the diff --git a/mm/mmap_lock.c b/mm/mmap_lock.c index 1fabda07c922..72f15f606093 100644 --- a/mm/mmap_lock.c +++ b/mm/mmap_lock.c @@ -46,19 +46,44 @@ EXPORT_SYMBOL(__mmap_lock_do_trace_released); #ifdef CONFIG_MMU #ifdef CONFIG_PER_VMA_LOCK -static inline void __vma_exit_locked(struct vm_area_struct *vma, bool *detached) +/* + * Now that all readers have been evicted, mark the VMA as being out of the + * 'exclude readers' state. + * + * Returns true if the VMA is now detached, otherwise false. + */ +static bool __must_check __vma_end_exclude_readers(struct vm_area_struct *vma) { - *detached = refcount_sub_and_test(VM_REFCNT_EXCLUDE_READERS_FLAG, - &vma->vm_refcnt); + bool detached; + + detached = refcount_sub_and_test(VM_REFCNT_EXCLUDE_READERS_FLAG, + &vma->vm_refcnt); __vma_lockdep_release_exclusive(vma); + return detached; } /* - * __vma_enter_locked() returns 0 immediately if the vma is not - * attached, otherwise it waits for any current readers to finish and - * returns 1. Returns -EINTR if a signal is received while waiting. + * Mark the VMA as being in a state of excluding readers, check to see if any + * VMA read locks are indeed held, and if so wait for them to be released. + * + * Note that this function pairs with vma_refcount_put() which will wake up this + * thread when it detects that the last reader has released its lock. + * + * The state parameter ought to be set to TASK_UNINTERRUPTIBLE in cases where we + * wish the thread to sleep uninterruptibly or TASK_KILLABLE if a fatal signal + * is permitted to kill it. + * + * The function will return 0 immediately if the VMA is detached, or wait for + * readers and return 1 once they have all exited, leaving the VMA exclusively + * locked. + * + * If the function returns 1, the caller is required to invoke + * __vma_end_exclude_readers() once the exclusive state is no longer required. + * + * If state is set to something other than TASK_UNINTERRUPTIBLE, the function + * may also return -EINTR to indicate a fatal signal was received while waiting. */ -static inline int __vma_enter_locked(struct vm_area_struct *vma, +static int __vma_start_exclude_readers(struct vm_area_struct *vma, bool detaching, int state) { int err; @@ -85,13 +110,10 @@ static inline int __vma_enter_locked(struct vm_area_struct *vma, refcount_read(&vma->vm_refcnt) == tgt_refcnt, state); if (err) { - bool detached; - - __vma_exit_locked(vma, &detached); - if (detached) { + if (__vma_end_exclude_readers(vma)) { /* * The wait failed, but the last reader went away - * as well. Tell the caller the VMA is detached. + * as well. Tell the caller the VMA is detached. */ WARN_ON_ONCE(!detaching); err = 0; @@ -108,7 +130,7 @@ int __vma_start_write(struct vm_area_struct *vma, unsigned int mm_lock_seq, { int locked; - locked = __vma_enter_locked(vma, false, state); + locked = __vma_start_exclude_readers(vma, false, state); if (locked < 0) return locked; @@ -121,10 +143,10 @@ int __vma_start_write(struct vm_area_struct *vma, unsigned int mm_lock_seq, WRITE_ONCE(vma->vm_lock_seq, mm_lock_seq); if (locked) { - bool detached; + bool detached = __vma_end_exclude_readers(vma); - __vma_exit_locked(vma, &detached); - WARN_ON_ONCE(detached); /* vma should remain attached */ + /* The VMA should remain attached. */ + WARN_ON_ONCE(detached); } return 0; @@ -148,14 +170,14 @@ void vma_mark_detached(struct vm_area_struct *vma) */ if (unlikely(__vma_refcount_put_return(vma))) { /* Wait until vma is detached with no readers. 
*/ - if (__vma_enter_locked(vma, true, TASK_UNINTERRUPTIBLE)) { + if (__vma_start_exclude_readers(vma, true, TASK_UNINTERRUPTIBLE)) { bool detached; + /* + * Once this is complete, no readers can increment the + * reference count, and the VMA is marked detached. + */ - __vma_exit_locked(vma, &detached); + detached = __vma_end_exclude_readers(vma); WARN_ON_ONCE(!detached); } } -- 2.52.0 It is confusing to have __vma_start_exclude_readers() return 0, 1 or an error (but only when waiting for readers in TASK_KILLABLE state), and having the return value be stored in a stack variable called 'locked' is further confusion. More generally, we are doing a lot of rather finicky things during the acquisition of a state in which readers are excluded and when moving out of this state, including tracking whether we are detached or not or whether an error occurred. We are implementing logic in __vma_start_exclude_readers() that effectively acts as if 'if one caller calls us do X, if another then do Y', which is very confusing from a control flow perspective. Introducing a shared helper state object helps us avoid this, as we can now handle the 'an error arose but we're detached' condition correctly in both callers - a warning if not detaching, and treating the situation as if no error arose in the case of a VMA detaching. This also acts to help document what's going on and allows us to add some more logical debug asserts. Also update vma_mark_detached() to add a guard clause for the likely 'already detached' state (given we hold the mmap write lock), and add a comment about ephemeral VMA read lock reference count increments to clarify why we are entering/exiting an exclusive locked state here. Finally, separate vma_mark_detached() into its fast-path component and make it inline, then place the slow path for excluding readers in mmap_lock.c. No functional change intended. Signed-off-by: Lorenzo Stoakes --- include/linux/mm_types.h | 14 ++-- include/linux/mmap_lock.h | 23 +++++- mm/mmap_lock.c | 152 +++++++++++++++++++++----------------- 3 files changed, 112 insertions(+), 77 deletions(-) diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 12281a1128c9..ca47a5d3d71e 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -1011,15 +1011,15 @@ struct vm_area_struct { * decrementing it again. * * VM_REFCNT_EXCLUDE_READERS_FLAG - Detached, pending - * __vma_exit_locked() completion which will decrement the reference - * count to zero. IMPORTANT - at this stage no further readers can - * increment the reference count. It can only be reduced. + * __vma_end_exclude_readers() completion which will decrement the + * reference count to zero. IMPORTANT - at this stage no further readers + * can increment the reference count. It can only be reduced. * * VM_REFCNT_EXCLUDE_READERS_FLAG + 1 - A thread is either write-locking - * an attached VMA and has yet to invoke __vma_exit_locked(), OR a - * thread is detaching a VMA and is waiting on a single spurious reader - * in order to decrement the reference count. IMPORTANT - as - * above, no further readers can increment the reference count. + * an attached VMA and has yet to invoke __vma_end_exclude_readers(), + * OR a thread is detaching a VMA and is waiting on a single spurious + * reader in order to decrement the reference count. IMPORTANT - as + * above, no further readers can increment the reference count.
* * > VM_REFCNT_EXCLUDE_READERS_FLAG + 1 - A thread is either * write-locking or detaching a VMA is waiting on readers to diff --git a/include/linux/mmap_lock.h b/include/linux/mmap_lock.h index d6df6aad3e24..678f90080fa6 100644 --- a/include/linux/mmap_lock.h +++ b/include/linux/mmap_lock.h @@ -358,7 +358,28 @@ static inline void vma_mark_attached(struct vm_area_struct *vma) refcount_set_release(&vma->vm_refcnt, 1); } -void vma_mark_detached(struct vm_area_struct *vma); +void __vma_exclude_readers_for_detach(struct vm_area_struct *vma); + +static inline void vma_mark_detached(struct vm_area_struct *vma) +{ + vma_assert_write_locked(vma); + vma_assert_attached(vma); + + /* + * The VMA still being attached (refcnt > 0) - is unlikely, because the + * vma has been already write-locked and readers can increment vm_refcnt + * only temporarily before they check vm_lock_seq, realize the vma is + * locked and drop back the vm_refcnt. That is a narrow window for + * observing a raised vm_refcnt. + * + * See the comment describing the vm_area_struct->vm_refcnt field for + * details of possible refcnt values. + */ + if (likely(!__vma_refcount_put_return(vma))) + return; + + __vma_exclude_readers_for_detach(vma); +} struct vm_area_struct *lock_vma_under_rcu(struct mm_struct *mm, unsigned long address); diff --git a/mm/mmap_lock.c b/mm/mmap_lock.c index 72f15f606093..b523a3fe110c 100644 --- a/mm/mmap_lock.c +++ b/mm/mmap_lock.c @@ -46,20 +46,38 @@ EXPORT_SYMBOL(__mmap_lock_do_trace_released); #ifdef CONFIG_MMU #ifdef CONFIG_PER_VMA_LOCK +/* State shared across __vma_[enter, exit]_exclusive_locked(). */ +struct vma_exclude_readers_state { + /* Input parameters. */ + struct vm_area_struct *vma; + int state; /* TASK_KILLABLE or TASK_UNINTERRUPTIBLE. */ + bool detaching; + + /* Output parameters. */ + bool detached; + bool exclusive; /* Are we exclusively locked? */ +}; + /* * Now that all readers have been evicted, mark the VMA as being out of the * 'exclude readers' state. - * - * Returns true if the VMA is now detached, otherwise false. */ -static bool __must_check __vma_end_exclude_readers(struct vm_area_struct *vma) +static void __vma_end_exclude_readers(struct vma_exclude_readers_state *ves) { - bool detached; + struct vm_area_struct *vma = ves->vma; - detached = refcount_sub_and_test(VM_REFCNT_EXCLUDE_READERS_FLAG, - &vma->vm_refcnt); + VM_WARN_ON_ONCE(ves->detached); + + ves->detached = refcount_sub_and_test(VM_REFCNT_EXCLUDE_READERS_FLAG, + &vma->vm_refcnt); __vma_lockdep_release_exclusive(vma); - return detached; +} + +static unsigned int get_target_refcnt(struct vma_exclude_readers_state *ves) +{ + const unsigned int tgt = ves->detaching ? 0 : 1; + + return tgt | VM_REFCNT_EXCLUDE_READERS_FLAG; } /* @@ -69,32 +87,29 @@ static bool __must_check __vma_end_exclude_readers(struct vm_area_struct *vma) * Note that this function pairs with vma_refcount_put() which will wake up this * thread when it detects that the last reader has released its lock. * - * The state parameter ought to be set to TASK_UNINTERRUPTIBLE in cases where we - * wish the thread to sleep uninterruptibly or TASK_KILLABLE if a fatal signal - * is permitted to kill it. + * The ves->state parameter ought to be set to TASK_UNINTERRUPTIBLE in cases + * where we wish the thread to sleep uninterruptibly or TASK_KILLABLE if a fatal + * signal is permitted to kill it. * - * The function will return 0 immediately if the VMA is detached, or wait for - * readers and return 1 once they have all exited, leaving the VMA exclusively - * locked. 
+ * The function sets the ves->exclusive parameter to true if readers were + * excluded, or false if the VMA was detached or an error arose on wait. * - * If the function returns 1, the caller is required to invoke - * __vma_end_exclude_readers() once the exclusive state is no longer required. + * If the function indicates an exclusive lock was acquired via ves->exclusive + * the caller is required to invoke __vma_end_exclude_readers() once the + * exclusive state is no longer required. * - * If state is set to something other than TASK_UNINTERRUPTIBLE, the function - * may also return -EINTR to indicate a fatal signal was received while waiting. + * If ves->state is set to something other than TASK_UNINTERRUPTIBLE, the + * function may also return -EINTR to indicate a fatal signal was received while + * waiting. */ -static int __vma_start_exclude_readers(struct vm_area_struct *vma, - bool detaching, int state) +static int __vma_start_exclude_readers(struct vma_exclude_readers_state *ves) { - int err; - unsigned int tgt_refcnt = VM_REFCNT_EXCLUDE_READERS_FLAG; + struct vm_area_struct *vma = ves->vma; + unsigned int tgt_refcnt = get_target_refcnt(ves); + int err = 0; mmap_assert_write_locked(vma->vm_mm); - /* Additional refcnt if the vma is attached. */ - if (!detaching) - tgt_refcnt++; - /* * If vma is detached then only vma_mark_attached() can raise the * vm_refcnt. mmap_write_lock prevents racing with vma_mark_attached(). @@ -102,37 +117,39 @@ static int __vma_start_exclude_readers(struct vm_area_struct *vma, * See the comment describing the vm_area_struct->vm_refcnt field for * details of possible refcnt values. */ - if (!refcount_add_not_zero(VM_REFCNT_EXCLUDE_READERS_FLAG, &vma->vm_refcnt)) + if (!refcount_add_not_zero(VM_REFCNT_EXCLUDE_READERS_FLAG, &vma->vm_refcnt)) { + ves->detached = true; return 0; + } __vma_lockdep_acquire_exclusive(vma); err = rcuwait_wait_event(&vma->vm_mm->vma_writer_wait, refcount_read(&vma->vm_refcnt) == tgt_refcnt, - state); + ves->state); if (err) { - if (__vma_end_exclude_readers(vma)) { - /* - * The wait failed, but the last reader went away - * as well. Tell the caller the VMA is detached. - */ - WARN_ON_ONCE(!detaching); - err = 0; - } + __vma_end_exclude_readers(ves); return err; } - __vma_lockdep_stat_mark_acquired(vma); - return 1; + __vma_lockdep_stat_mark_acquired(vma); + ves->exclusive = true; + return 0; } int __vma_start_write(struct vm_area_struct *vma, unsigned int mm_lock_seq, int state) { - int locked; + int err; + struct vma_exclude_readers_state ves = { + .vma = vma, + .state = state, + }; - locked = __vma_start_exclude_readers(vma, false, state); - if (locked < 0) - return locked; + err = __vma_start_exclude_readers(&ves); + if (err) { + WARN_ON_ONCE(ves.detached); + return err; + } /* * We should use WRITE_ONCE() here because we can have concurrent reads @@ -142,45 +159,42 @@ int __vma_start_write(struct vm_area_struct *vma, unsigned int mm_lock_seq, */ WRITE_ONCE(vma->vm_lock_seq, mm_lock_seq); - if (locked) { - bool detached = __vma_end_exclude_readers(vma); - - /* The VMA should remain attached. */ - WARN_ON_ONCE(detached); + if (ves.exclusive) { + __vma_end_exclude_readers(&ves); + /* VMA should remain attached. 
*/ + WARN_ON_ONCE(ves.detached); } return 0; } EXPORT_SYMBOL_GPL(__vma_start_write); -void vma_mark_detached(struct vm_area_struct *vma) +void __vma_exclude_readers_for_detach(struct vm_area_struct *vma) { - vma_assert_write_locked(vma); - vma_assert_attached(vma); + struct vma_exclude_readers_state ves = { + .vma = vma, + .state = TASK_UNINTERRUPTIBLE, + .detaching = true, + }; + int err; /* - * This condition - that the VMA is still attached (refcnt > 0) - is - * unlikely, because the vma has been already write-locked and readers - * can increment vm_refcnt only temporarily before they check - * vm_lock_seq, realize the vma is locked and drop back the - * vm_refcnt. That is a narrow window for observing a raised vm_refcnt. - * - * See the comment describing the vm_area_struct->vm_refcnt field for - * details of possible refcnt values. + * Wait until the VMA is detached with no readers. Since we hold the VMA + * write lock, the only read locks that might be present are those from + * threads trying to acquire the read lock and incrementing the + * reference count before realising the write lock is held and + * decrementing it. */ - if (unlikely(__vma_refcount_put_return(vma))) { - /* Wait until vma is detached with no readers. */ - if (__vma_start_exclude_readers(vma, true, TASK_UNINTERRUPTIBLE)) { - bool detached; - - /* - * Once this is complete, no readers can increment the - * reference count, and the VMA is marked detached. - */ - detached = __vma_end_exclude_readers(vma); - WARN_ON_ONCE(!detached); - } + err = __vma_start_exclude_readers(&ves); + if (!err && ves.exclusive) { + /* + * Once this is complete, no readers can increment the + * reference count, and the VMA is marked detached. + */ + __vma_end_exclude_readers(&ves); } + /* If an error arose but we were detached anyway, we don't care. */ + WARN_ON_ONCE(!ves.detached); } /* -- 2.52.0 We don't actually need to return an output parameter providing mm sequence number, rather we can separate that out into another function - __vma_raw_mm_seqnum() - and have any callers which need to obtain that invoke that instead. The access to the raw sequence number requires that we hold the exclusive mmap lock such that we know we can't race vma_end_write_all(), so move the assert to __vma_raw_mm_seqnum() to make this requirement clear. Also while we're here, convert all of the VM_BUG_ON_VMA()'s to VM_WARN_ON_ONCE_VMA()'s in line with the convention that we do not invoke oopses when we can avoid it. Signed-off-by: Lorenzo Stoakes --- include/linux/mmap_lock.h | 44 ++++++++++++++++++++++----------------- 1 file changed, 25 insertions(+), 19 deletions(-) diff --git a/include/linux/mmap_lock.h b/include/linux/mmap_lock.h index 678f90080fa6..23bde4bd5a85 100644 --- a/include/linux/mmap_lock.h +++ b/include/linux/mmap_lock.h @@ -258,17 +258,30 @@ static inline void vma_end_read(struct vm_area_struct *vma) vma_refcount_put(vma); } -/* WARNING! Can only be used if mmap_lock is expected to be write-locked */ -static inline bool __is_vma_write_locked(struct vm_area_struct *vma, unsigned int *mm_lock_seq) +static inline unsigned int __vma_raw_mm_seqnum(struct vm_area_struct *vma) { + const struct mm_struct *mm = vma->vm_mm; + + /* We must hold an exclusive write lock for this access to be valid. */ mmap_assert_write_locked(vma->vm_mm); + return mm->mm_lock_seq.sequence; +} +/* + * Determine whether a VMA is write-locked. Must be invoked ONLY if the mmap + * write lock is held. + * + * Returns true if write-locked, otherwise false. 
+ * + * Note that mm_lock_seq is updated only if the VMA is NOT write-locked. + */ +static inline bool __is_vma_write_locked(struct vm_area_struct *vma) +{ /* * current task is holding mmap_write_lock, both vma->vm_lock_seq and * mm->mm_lock_seq can't be concurrently modified. */ - *mm_lock_seq = vma->vm_mm->mm_lock_seq.sequence; - return (vma->vm_lock_seq == *mm_lock_seq); + return vma->vm_lock_seq == __vma_raw_mm_seqnum(vma); } int __vma_start_write(struct vm_area_struct *vma, unsigned int mm_lock_seq, @@ -281,12 +294,10 @@ int __vma_start_write(struct vm_area_struct *vma, unsigned int mm_lock_seq, */ static inline void vma_start_write(struct vm_area_struct *vma) { - unsigned int mm_lock_seq; - - if (__is_vma_write_locked(vma, &mm_lock_seq)) + if (__is_vma_write_locked(vma)) return; - __vma_start_write(vma, mm_lock_seq, TASK_UNINTERRUPTIBLE); + __vma_start_write(vma, __vma_raw_mm_seqnum(vma), TASK_UNINTERRUPTIBLE); } /** @@ -305,30 +316,25 @@ static inline void vma_start_write(struct vm_area_struct *vma) static inline __must_check int vma_start_write_killable(struct vm_area_struct *vma) { - unsigned int mm_lock_seq; - - if (__is_vma_write_locked(vma, &mm_lock_seq)) + if (__is_vma_write_locked(vma)) return 0; - return __vma_start_write(vma, mm_lock_seq, TASK_KILLABLE); + + return __vma_start_write(vma, __vma_raw_mm_seqnum(vma), TASK_KILLABLE); } static inline void vma_assert_write_locked(struct vm_area_struct *vma) { - unsigned int mm_lock_seq; - - VM_BUG_ON_VMA(!__is_vma_write_locked(vma, &mm_lock_seq), vma); + VM_WARN_ON_ONCE_VMA(!__is_vma_write_locked(vma), vma); } static inline void vma_assert_locked(struct vm_area_struct *vma) { - unsigned int mm_lock_seq; - /* * See the comment describing the vm_area_struct->vm_refcnt field for * details of possible refcnt values. */ - VM_BUG_ON_VMA(refcount_read(&vma->vm_refcnt) <= 1 && - !__is_vma_write_locked(vma, &mm_lock_seq), vma); + VM_WARN_ON_ONCE_VMA(refcount_read(&vma->vm_refcnt) <= 1 && + !__is_vma_write_locked(vma), vma); } static inline bool vma_is_attached(struct vm_area_struct *vma) -- 2.52.0 We can use lockdep to avoid unnecessary work here, otherwise update the code to logically evaluate all pertinent cases and share code with vma_assert_write_locked(). Make it clear here that we treat the VMA being detached at this point as a bug, this was only implicit before. Reviewed-by: Suren Baghdasaryan Signed-off-by: Lorenzo Stoakes --- include/linux/mmap_lock.h | 41 +++++++++++++++++++++++++++++++++++++-- 1 file changed, 39 insertions(+), 2 deletions(-) diff --git a/include/linux/mmap_lock.h b/include/linux/mmap_lock.h index 23bde4bd5a85..4a0aafc66c5d 100644 --- a/include/linux/mmap_lock.h +++ b/include/linux/mmap_lock.h @@ -322,19 +322,56 @@ int vma_start_write_killable(struct vm_area_struct *vma) return __vma_start_write(vma, __vma_raw_mm_seqnum(vma), TASK_KILLABLE); } +/** + * vma_assert_write_locked() - assert that @vma holds a VMA write lock. + * @vma: The VMA to assert. + */ static inline void vma_assert_write_locked(struct vm_area_struct *vma) { VM_WARN_ON_ONCE_VMA(!__is_vma_write_locked(vma), vma); } +/** + * vma_assert_locked() - assert that @vma holds either a VMA read or a VMA write + * lock and is not detached. + * @vma: The VMA to assert. + */ static inline void vma_assert_locked(struct vm_area_struct *vma) { + unsigned int refcnt; + + /* + * If read-locked or currently excluding readers, then the VMA is + * locked. 
*/ +#ifdef CONFIG_LOCKDEP + if (lock_is_held(&vma->vmlock_dep_map)) + return; +#endif + /* * See the comment describing the vm_area_struct->vm_refcnt field for * details of possible refcnt values. */ - VM_WARN_ON_ONCE_VMA(refcount_read(&vma->vm_refcnt) <= 1 && - !__is_vma_write_locked(vma), vma); + refcnt = refcount_read(&vma->vm_refcnt); + + /* + * In this case we're either read-locked, write-locked with temporary + * readers, or in the midst of excluding readers, all of which means + * we're locked. + */ + if (refcnt > 1) + return; + + /* It is a bug for the VMA to be detached here. */ + VM_WARN_ON_ONCE_VMA(!refcnt, vma); + + /* + * OK, the VMA has a reference count of 1 which means it is either + * unlocked and attached or write-locked, so assert that it is + * write-locked. + */ + vma_assert_write_locked(vma); } static inline bool vma_is_attached(struct vm_area_struct *vma) -- 2.52.0 Sometimes we wish to assert that a VMA is stable, that is - the VMA cannot be changed underneath us. This will be the case if EITHER the VMA lock or the mmap lock is held. In order to do so, we introduce a new assert vma_assert_stabilised() - this will make a lockdep assert if lockdep is enabled AND the VMA is read-locked. Currently lockdep tracking for VMA write locks is not implemented, so it suffices to check in this case that we have either an mmap read or write semaphore held. Note that because the VMA lock uses the non-standard vmlock_dep_map naming convention, we cannot use lockdep_assert_is_write_held() so have to open code this ourselves via lockdep-asserting that lock_is_held_type(&vma->vmlock_dep_map, 0). We have to be careful here - for instance when merging a VMA, we use the mmap write lock to stabilise the examination of adjacent VMAs which might be simultaneously VMA read-locked whilst being faulted in. If we were to assert VMA read lock using lockdep we would encounter an incorrect lockdep assert. Also, we have to be careful about asserting mmap locks are held - if we try to address the above issue by first checking whether the mmap lock is held and if so asserting it via lockdep, we may find that we were raced by another thread simultaneously acquiring an mmap read lock that either we don't own (and thus can be released at any time - so we are not stable) or that was indeed released since we last checked. So to deal with these complexities we end up with either a precise (if lockdep is enabled) or imprecise (if not) approach - in the first instance we assert the lock is held using lockdep and thus whether we own it. If we do own it, then the check is complete, otherwise we must check for the VMA read lock being held (VMA write lock implies mmap write lock so the mmap lock suffices for this). If lockdep is not enabled we simply check if the mmap lock is held and risk a false negative (i.e. not asserting when we should). There are a couple of places in the kernel where we already do this stabilisation check - the anon_vma_name() helper in mm/madvise.c and vma_flag_set_atomic() in include/linux/mm.h. This change abstracts these into vma_assert_stabilised(), uses lockdep if possible, and avoids a duplicate check of whether the mmap lock is held. This is also self-documenting and lays the foundations for further VMA stability checks in the code. The only functional change here is adding the lockdep check.
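As a rough illustration of the intended usage (a hypothetical caller, mirroring the anon_vma_name() conversion below - not part of the patch itself):

    /*
     * Hypothetical example: a field that may only change under the mmap write
     * lock or a VMA write lock can be read safely once stability is asserted,
     * whichever of the mmap or VMA locks the caller happens to hold.
     */
    static struct anon_vma_name *example_get_anon_name(struct vm_area_struct *vma)
    {
        vma_assert_stabilised(vma);
        return vma->anon_name;
    }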
Reviewed-by: Vlastimil Babka Reviewed-by: Suren Baghdasaryan Signed-off-by: Lorenzo Stoakes --- include/linux/mm.h | 5 +--- include/linux/mmap_lock.h | 52 +++++++++++++++++++++++++++++++++++++++ mm/madvise.c | 4 +-- 3 files changed, 54 insertions(+), 7 deletions(-) diff --git a/include/linux/mm.h b/include/linux/mm.h index 6029a71a6908..d7ca837dd8a5 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1008,10 +1008,7 @@ static inline void vma_flag_set_atomic(struct vm_area_struct *vma, { unsigned long *bitmap = ACCESS_PRIVATE(&vma->flags, __vma_flags); - /* mmap read lock/VMA read lock must be held. */ - if (!rwsem_is_locked(&vma->vm_mm->mmap_lock)) - vma_assert_locked(vma); - + vma_assert_stabilised(vma); if (__vma_flag_atomic_valid(vma, bit)) set_bit((__force int)bit, bitmap); } diff --git a/include/linux/mmap_lock.h b/include/linux/mmap_lock.h index 4a0aafc66c5d..8e31c5dd0adb 100644 --- a/include/linux/mmap_lock.h +++ b/include/linux/mmap_lock.h @@ -374,6 +374,52 @@ static inline void vma_assert_locked(struct vm_area_struct *vma) vma_assert_write_locked(vma); } +/** + * vma_assert_stabilised() - assert that this VMA cannot be changed from + * underneath us either by having a VMA or mmap lock held. + * @vma: The VMA whose stability we wish to assess. + * + * If lockdep is enabled we can precisely ensure stability via either an mmap + * lock owned by us or a specific VMA lock. + * + * With lockdep disabled we may sometimes race with other threads acquiring the + * mmap read lock simultaneously with our VMA read lock. + */ +static inline void vma_assert_stabilised(struct vm_area_struct *vma) +{ + /* + * If another thread owns an mmap lock, it may go away at any time, and + * thus is no guarantee of stability. + * + * If lockdep is enabled we can accurately determine if an mmap lock is + * held and owned by us. Otherwise we must approximate. + * + * That doesn't necessarily mean we are not stabilised however, as we may + * hold a VMA read lock (not a write lock as this would require an owned + * mmap lock). + * + * If (assuming lockdep is not enabled) we were to assert a VMA read + * lock first we may also run into issues, as other threads can hold VMA + * read locks simultaneously with us. + * + * Therefore if lockdep is not enabled we risk a false negative (i.e. no + * assert fired). If accurate checking is required, enable lockdep. + */ + if (IS_ENABLED(CONFIG_LOCKDEP)) { + if (lockdep_is_held(&vma->vm_mm->mmap_lock)) + return; + } else { + if (rwsem_is_locked(&vma->vm_mm->mmap_lock)) + return; + } + + /* + * We're not stabilised by the mmap lock, so assert that we're + * stabilised by a VMA lock. + */ + vma_assert_locked(vma); +} + static inline bool vma_is_attached(struct vm_area_struct *vma) { return refcount_read(&vma->vm_refcnt); @@ -476,6 +522,12 @@ static inline void vma_assert_locked(struct vm_area_struct *vma) mmap_assert_locked(vma->vm_mm); } +static inline void vma_assert_stabilised(struct vm_area_struct *vma) +{ + /* If no VMA locks, then either mmap lock suffices to stabilise.
*/ + mmap_assert_locked(vma->vm_mm); +} + #endif /* CONFIG_PER_VMA_LOCK */ static inline void mmap_write_lock(struct mm_struct *mm) diff --git a/mm/madvise.c b/mm/madvise.c index 4bf4c8c38fd3..1f3040688f04 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -109,9 +109,7 @@ void anon_vma_name_free(struct kref *kref) struct anon_vma_name *anon_vma_name(struct vm_area_struct *vma) { - if (!rwsem_is_locked(&vma->vm_mm->mmap_lock)) - vma_assert_locked(vma); - + vma_assert_stabilised(vma); return vma->anon_name; } -- 2.52.0
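For completeness, the reader/excluder handshake that this series documents can be summarised by the following simplified sketch. It is not actual kernel code - it assumes a single excluding thread, ignores error handling, and elides the care the real code takes over VMA lifetime:

    /* Reader side - vma_start_read()/vma_refcount_put(), simplified: */
    if (!__refcount_inc_not_zero_limited_acquire(&vma->vm_refcnt, &oldcnt,
                                                 VM_REFCNT_LIMIT))
        return NULL;    /* detached, or readers are being excluded */
    /* ... lockless read-side work ... */
    if (__vma_are_readers_excluded(__vma_refcount_put_return(vma)))
        rcuwait_wake_up(&vma->vm_mm->vma_writer_wait); /* last reader: wake excluder */

    /* Excluding side - __vma_start_exclude_readers(), simplified: */
    /* tgt_refcnt is FLAG when detaching, FLAG + 1 when write-locking. */
    if (refcount_add_not_zero(VM_REFCNT_EXCLUDE_READERS_FLAG, &vma->vm_refcnt))
        rcuwait_wait_event(&vma->vm_mm->vma_writer_wait,
                           refcount_read(&vma->vm_refcnt) == tgt_refcnt,
                           TASK_UNINTERRUPTIBLE);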