Add smp_cond_load_relaxed_timeout(), which extends
smp_cond_load_relaxed() to allow waiting only for a bounded duration.

The additional parameter supplies the timeout check expression.

The waiting is done via the usual cpu_relax() spin-wait around the
condition variable, with periodic evaluation of the time check.

The number of times we spin is defined by SMP_TIMEOUT_SPIN_COUNT
(chosen to be 200 by default), which, assuming each cpu_relax()
iteration takes around 20-30 cycles (measured on a variety of x86
platforms), amounts to around 4000-6000 cycles.

Cc: Arnd Bergmann
Cc: Will Deacon
Cc: Catalin Marinas
Cc: Peter Zijlstra
Cc: linux-arch@vger.kernel.org
Reviewed-by: Catalin Marinas
Reviewed-by: Haris Okanovic
Tested-by: Haris Okanovic
Signed-off-by: Ankur Arora
---
 include/asm-generic/barrier.h | 35 +++++++++++++++++++++++++++++++++++
 1 file changed, 35 insertions(+)

diff --git a/include/asm-generic/barrier.h b/include/asm-generic/barrier.h
index d4f581c1e21d..8483e139954f 100644
--- a/include/asm-generic/barrier.h
+++ b/include/asm-generic/barrier.h
@@ -273,6 +273,41 @@ do {                                                    \
 })
 #endif
 
+#ifndef SMP_TIMEOUT_SPIN_COUNT
+#define SMP_TIMEOUT_SPIN_COUNT          200
+#endif
+
+/**
+ * smp_cond_load_relaxed_timeout() - (Spin) wait for cond with no ordering
+ * guarantees until a timeout expires.
+ * @ptr: pointer to the variable to wait on
+ * @cond: boolean expression to wait for
+ * @time_check_expr: expression to decide when to bail out
+ *
+ * Equivalent to using READ_ONCE() on the condition variable.
+ */
+#ifndef smp_cond_load_relaxed_timeout
+#define smp_cond_load_relaxed_timeout(ptr, cond_expr, time_check_expr) \
+({                                                                      \
+       typeof(ptr) __PTR = (ptr);                                       \
+       __unqual_scalar_typeof(*ptr) VAL;                                \
+       u32 __n = 0, __spin = SMP_TIMEOUT_SPIN_COUNT;                    \
+                                                                        \
+       for (;;) {                                                       \
+               VAL = READ_ONCE(*__PTR);                                 \
+               if (cond_expr)                                           \
+                       break;                                           \
+               cpu_relax();                                             \
+               if (++__n < __spin)                                      \
+                       continue;                                        \
+               if (time_check_expr)                                     \
+                       break;                                           \
+               __n = 0;                                                 \
+       }                                                                \
+       (typeof(*ptr))VAL;                                               \
+})
+#endif
+
 /*
  * pmem_wmb() ensures that all stores for which the modification
  * are written to persistent storage by preceding instructions have
--
2.43.5
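[Editor's note: a usage sketch, not part of the series. The poll_flag() helper, the flag
being polled, and the 10us budget are invented for illustration; ktime_get_ns() is the
regular monotonic clock. It shows how a caller might bound a relaxed busy-wait with the
new interface.]

    #include <linux/ktime.h>
    #include <asm/barrier.h>

    /* Spin until *flag becomes non-zero or roughly 10us have elapsed. */
    static u32 poll_flag(u32 *flag)
    {
            u64 deadline = ktime_get_ns() + 10 * NSEC_PER_USEC;

            /*
             * VAL is the most recently loaded value of *flag; the time
             * check only runs once every SMP_TIMEOUT_SPIN_COUNT spins.
             */
            return smp_cond_load_relaxed_timeout(flag, VAL != 0,
                                                 ktime_get_ns() >= deadline);
    }

On return the caller checks the loaded value itself: a zero result means the wait timed
out rather than the condition becoming true.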
Add an arm64 implementation of smp_cond_load_relaxed_timeout(), a timed
variant of smp_cond_load_relaxed().

This uses __cmpwait_relaxed() to do the actual waiting, with the
event-stream guaranteeing that we wake up from WFE periodically and do
not block forever in case there are no stores to the cacheline.

For cases when the event-stream is unavailable, fall back to
spin-waiting.

Cc: Will Deacon
Cc: linux-arm-kernel@lists.infradead.org
Suggested-by: Catalin Marinas
Reviewed-by: Catalin Marinas
Reviewed-by: Haris Okanovic
Tested-by: Haris Okanovic
Signed-off-by: Ankur Arora
---
 arch/arm64/include/asm/barrier.h | 23 +++++++++++++++++++++++
 1 file changed, 23 insertions(+)

diff --git a/arch/arm64/include/asm/barrier.h b/arch/arm64/include/asm/barrier.h
index f5801b0ba9e9..4f0d9ed7a072 100644
--- a/arch/arm64/include/asm/barrier.h
+++ b/arch/arm64/include/asm/barrier.h
@@ -219,6 +219,29 @@ do {                                                    \
 	(typeof(*ptr))VAL;                                               \
 })
 
+/* Re-declared here to avoid include dependency. */
+extern bool arch_timer_evtstrm_available(void);
+
+#define smp_cond_load_relaxed_timeout(ptr, cond_expr, time_check_expr) \
+({                                                                      \
+       typeof(ptr) __PTR = (ptr);                                       \
+       __unqual_scalar_typeof(*ptr) VAL;                                \
+       bool __wfe = arch_timer_evtstrm_available();                     \
+                                                                        \
+       for (;;) {                                                       \
+               VAL = READ_ONCE(*__PTR);                                 \
+               if (cond_expr)                                           \
+                       break;                                           \
+               if (time_check_expr)                                     \
+                       break;                                           \
+               if (likely(__wfe))                                       \
+                       __cmpwait_relaxed(__PTR, VAL);                   \
+               else                                                     \
+                       cpu_relax();                                     \
+       }                                                                \
+       (typeof(*ptr)) VAL;                                              \
+})
+
 #include <asm-generic/barrier.h>
 
 #endif	/* __ASSEMBLY__ */
--
2.43.5

In preparation for defining smp_cond_load_acquire_timeout(), remove
the private copy in the arm64 rqspinlock code. Lacking this, the
rqspinlock code falls back to using smp_cond_load_acquire().

Cc: Kumar Kartikeya Dwivedi
Cc: Alexei Starovoitov
Reviewed-by: Catalin Marinas
Reviewed-by: Haris Okanovic
Signed-off-by: Ankur Arora
---
 arch/arm64/include/asm/rqspinlock.h | 85 -----------------------------
 1 file changed, 85 deletions(-)

diff --git a/arch/arm64/include/asm/rqspinlock.h b/arch/arm64/include/asm/rqspinlock.h
index 9ea0a74e5892..a385603436e9 100644
--- a/arch/arm64/include/asm/rqspinlock.h
+++ b/arch/arm64/include/asm/rqspinlock.h
@@ -3,91 +3,6 @@
 #define _ASM_RQSPINLOCK_H
 
 #include <asm/barrier.h>
-
-/*
- * Hardcode res_smp_cond_load_acquire implementations for arm64 to a custom
- * version based on [0]. In rqspinlock code, our conditional expression involves
- * checking the value _and_ additionally a timeout. However, on arm64, the
- * WFE-based implementation may never spin again if no stores occur to the
- * locked byte in the lock word. As such, we may be stuck forever if
- * event-stream based unblocking is not available on the platform for WFE spin
- * loops (arch_timer_evtstrm_available).
- *
- * Once support for smp_cond_load_acquire_timewait [0] lands, we can drop this
- * copy-paste.
- *
- * While we rely on the implementation to amortize the cost of sampling
- * cond_expr for us, it will not happen when event stream support is
- * unavailable, time_expr check is amortized. This is not the common case, and
- * it would be difficult to fit our logic in the time_expr_ns >= time_limit_ns
- * comparison, hence just let it be. In case of event-stream, the loop is woken
- * up at microsecond granularity.
- *
- * [0]: https://lore.kernel.org/lkml/20250203214911.898276-1-ankur.a.arora@oracle.com
- */
-
-#ifndef smp_cond_load_acquire_timewait
-
-#define smp_cond_time_check_count	200
-
-#define __smp_cond_load_relaxed_spinwait(ptr, cond_expr, time_expr_ns, \
-					 time_limit_ns) ({               \
-	typeof(ptr) __PTR = (ptr);                                       \
-	__unqual_scalar_typeof(*ptr) VAL;                                \
-	unsigned int __count = 0;                                        \
-	for (;;) {                                                       \
-		VAL = READ_ONCE(*__PTR);                                 \
-		if (cond_expr)                                           \
-			break;                                           \
-		cpu_relax();                                             \
-		if (__count++ < smp_cond_time_check_count)               \
-			continue;                                        \
-		if ((time_expr_ns) >= (time_limit_ns))                   \
-			break;                                           \
-		__count = 0;                                             \
-	}                                                                \
-	(typeof(*ptr))VAL;                                               \
-})
-
-#define __smp_cond_load_acquire_timewait(ptr, cond_expr,                \
-					 time_expr_ns, time_limit_ns)    \
-({                                                                      \
-	typeof(ptr) __PTR = (ptr);                                       \
-	__unqual_scalar_typeof(*ptr) VAL;                                \
-	for (;;) {                                                       \
-		VAL = smp_load_acquire(__PTR);                           \
-		if (cond_expr)                                           \
-			break;                                           \
-		__cmpwait_relaxed(__PTR, VAL);                           \
-		if ((time_expr_ns) >= (time_limit_ns))                   \
-			break;                                           \
-	}                                                                \
-	(typeof(*ptr))VAL;                                               \
-})
-
-#define smp_cond_load_acquire_timewait(ptr, cond_expr,                  \
-				       time_expr_ns, time_limit_ns)      \
-({                                                                      \
-	__unqual_scalar_typeof(*ptr) _val;                               \
-	int __wfe = arch_timer_evtstrm_available();                      \
-                                                                        \
-	if (likely(__wfe)) {                                             \
-		_val = __smp_cond_load_acquire_timewait(ptr, cond_expr,  \
-							time_expr_ns,    \
-							time_limit_ns);  \
-	} else {                                                         \
-		_val = __smp_cond_load_relaxed_spinwait(ptr, cond_expr,  \
-							time_expr_ns,    \
-							time_limit_ns);  \
-		smp_acquire__after_ctrl_dep();                           \
-	}                                                                \
-	(typeof(*ptr))_val;                                              \
-})
-
-#endif
-
-#define res_smp_cond_load_acquire(v, c) smp_cond_load_acquire_timewait(v, c, 0, 1)
-
 #include <asm-generic/rqspinlock.h>
 
 #endif /* _ASM_RQSPINLOCK_H */
--
2.43.5

Add the acquire variant of smp_cond_load_relaxed_timeout(). This
reuses the relaxed variant, with an additional LOAD->LOAD ordering.

Cc: Arnd Bergmann
Cc: Will Deacon
Cc: Catalin Marinas
Cc: Peter Zijlstra
Cc: linux-arch@vger.kernel.org
Reviewed-by: Catalin Marinas
Reviewed-by: Haris Okanovic
Tested-by: Haris Okanovic
Signed-off-by: Ankur Arora
---
 include/asm-generic/barrier.h | 22 ++++++++++++++++++++++
 1 file changed, 22 insertions(+)

diff --git a/include/asm-generic/barrier.h b/include/asm-generic/barrier.h
index 8483e139954f..f2e90c993ead 100644
--- a/include/asm-generic/barrier.h
+++ b/include/asm-generic/barrier.h
@@ -308,6 +308,28 @@ do {                                                    \
 })
 #endif
 
+/**
+ * smp_cond_load_acquire_timeout() - (Spin) wait for cond with ACQUIRE ordering
+ * until a timeout expires.
+ *
+ * Arguments: same as smp_cond_load_relaxed_timeout().
+ *
+ * Equivalent to using smp_cond_load_acquire() on the condition variable with
+ * a timeout.
+ */
+#ifndef smp_cond_load_acquire_timeout
+#define smp_cond_load_acquire_timeout(ptr, cond_expr, time_check_expr) \
+({                                                                      \
+       __unqual_scalar_typeof(*ptr) _val;                               \
+       _val = smp_cond_load_relaxed_timeout(ptr, cond_expr,             \
+                                            time_check_expr);           \
+                                                                        \
+       /* Depends on the control dependency of the wait above. */       \
+       smp_acquire__after_ctrl_dep();                                   \
+       (typeof(*ptr))_val;                                              \
+})
+#endif
+
 /*
  * pmem_wmb() ensures that all stores for which the modification
  * are written to persistent storage by preceding instructions have
--
2.43.5
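[Editor's note: another sketch, again not part of the series. The example_lock structure,
its locked field, and wait_for_unlock() are invented to show where the acquire variant
fits, i.e. when accesses after the wait must not be reordered before the observed store.]

    #include <linux/ktime.h>
    #include <asm/barrier.h>

    struct example_lock {
            u32 locked;     /* 0 = free, non-zero = held */
    };

    /* Returns true if the lock was seen free before the budget expired. */
    static bool wait_for_unlock(struct example_lock *lock, u64 budget_ns)
    {
            u64 deadline = ktime_get_ns() + budget_ns;

            /*
             * The ACQUIRE ordering comes from smp_acquire__after_ctrl_dep()
             * in the generic macro, so later loads and stores cannot be
             * hoisted above the load that observed the unlock.
             */
            return smp_cond_load_acquire_timeout(&lock->locked, VAL == 0,
                                                 ktime_get_ns() >= deadline) == 0;
    }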
Switch out the conditional load interfaces used by rqspinlock to
smp_cond_load_acquire_timeout().

This interface handles the timeout check explicitly and does any
necessary amortization, so call check_timeout() directly.

Cc: Kumar Kartikeya Dwivedi
Cc: Alexei Starovoitov
Signed-off-by: Ankur Arora
---
 kernel/bpf/rqspinlock.c | 23 +++++++----------------
 1 file changed, 7 insertions(+), 16 deletions(-)

diff --git a/kernel/bpf/rqspinlock.c b/kernel/bpf/rqspinlock.c
index 5ab354d55d82..4d2c12d131ae 100644
--- a/kernel/bpf/rqspinlock.c
+++ b/kernel/bpf/rqspinlock.c
@@ -241,20 +241,14 @@ static noinline int check_timeout(rqspinlock_t *lock, u32 mask,
 }
 
 /*
- * Do not amortize with spins when res_smp_cond_load_acquire is defined,
- * as the macro does internal amortization for us.
+ * Amortize timeout check for busy-wait loops.
  */
-#ifndef res_smp_cond_load_acquire
 #define RES_CHECK_TIMEOUT(ts, ret, mask)                      \
 	({                                                     \
 		if (!(ts).spin++)                              \
 			(ret) = check_timeout((lock), (mask), &(ts)); \
 		(ret);                                         \
 	})
-#else
-#define RES_CHECK_TIMEOUT(ts, ret, mask)                      \
-	({ (ret) = check_timeout((lock), (mask), &(ts)); })
-#endif
 
 /*
  * Initialize the 'spin' member.
@@ -313,11 +307,8 @@ EXPORT_SYMBOL_GPL(resilient_tas_spin_lock);
  */
 static DEFINE_PER_CPU_ALIGNED(struct qnode, rqnodes[_Q_MAX_NODES]);
 
-#ifndef res_smp_cond_load_acquire
-#define res_smp_cond_load_acquire(v, c) smp_cond_load_acquire(v, c)
-#endif
-
-#define res_atomic_cond_read_acquire(v, c) res_smp_cond_load_acquire(&(v)->counter, (c))
+#define res_atomic_cond_read_acquire_timeout(v, c, t)          \
+	smp_cond_load_acquire_timeout(&(v)->counter, (c), (t))
 
 /**
  * resilient_queued_spin_lock_slowpath - acquire the queued spinlock
@@ -418,7 +409,8 @@ int __lockfunc resilient_queued_spin_lock_slowpath(rqspinlock_t *lock, u32 val)
	 */
	if (val & _Q_LOCKED_MASK) {
		RES_RESET_TIMEOUT(ts, RES_DEF_TIMEOUT);
-		res_smp_cond_load_acquire(&lock->locked, !VAL || RES_CHECK_TIMEOUT(ts, ret, _Q_LOCKED_MASK));
+		smp_cond_load_acquire_timeout(&lock->locked, !VAL,
+					      (ret = check_timeout(lock, _Q_LOCKED_MASK, &ts)));
	}
 
	if (ret) {
@@ -572,9 +564,8 @@ int __lockfunc resilient_queued_spin_lock_slowpath(rqspinlock_t *lock, u32 val)
	 * us.
	 */
	RES_RESET_TIMEOUT(ts, RES_DEF_TIMEOUT * 2);
-	val = res_atomic_cond_read_acquire(&lock->val, !(VAL & _Q_LOCKED_PENDING_MASK) ||
-					   RES_CHECK_TIMEOUT(ts, ret, _Q_LOCKED_PENDING_MASK));
-
+	val = res_atomic_cond_read_acquire_timeout(&lock->val, !(VAL & _Q_LOCKED_PENDING_MASK),
+						   (ret = check_timeout(lock, _Q_LOCKED_PENDING_MASK, &ts)));
 waitq_timeout:
	if (ret) {
		/*
--
2.43.5
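[Editor's note: to summarize the conversion above, with the two call sites lifted from the
diff and trimmed of their surrounding context. The old interface folded the timeout into
the condition expression,

    res_smp_cond_load_acquire(&lock->locked,
                              !VAL || RES_CHECK_TIMEOUT(ts, ret, _Q_LOCKED_MASK));

whereas the new one keeps the condition pure and passes the timeout as a separate
expression whose side effect records the status in ret,

    smp_cond_load_acquire_timeout(&lock->locked, !VAL,
                                  (ret = check_timeout(lock, _Q_LOCKED_MASK, &ts)));

with the spin amortization that RES_CHECK_TIMEOUT used to provide now handled inside the
generic macro, once every SMP_TIMEOUT_SPIN_COUNT iterations when spin-waiting.]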