Opening and closing an inode dirties the ->i_readcount field. Depending on the alignment of the inode, it may happen to false-share with other fields loaded both for both operations to various extent. This notably concerns the ->i_flctx field. Since most inodes don't have the field populated, this bit can be managed with a flag in ->i_opflags instead which bypasses the problem. Here are results I obtained while opening a file read-only in a loop with 24 cores doing the work on Sapphire Rapids. Utilizing the flag as opposed to reading ->i_flctx field was toggled at runtime as the benchmark was running, to make sure both results come from the same alignment. before: 3233740 after: 3373346 (+4%) before: 3284313 after: 3518711 (+7%) before: 3505545 after: 4092806 (+16%) Or to put it differently, this varies wildly depending on how (un)lucky you get. The primary bottleneck before and after is the avoidable lockref trip in do_dentry_open(). Reviewed-by: Jeff Layton Signed-off-by: Mateusz Guzik --- - no changes, this is a resend of v3, which is already rebased on everything fs/locks.c | 14 ++++++++++++-- include/linux/filelock.h | 15 +++++++++++---- include/linux/fs.h | 1 + 3 files changed, 24 insertions(+), 6 deletions(-) diff --git a/fs/locks.c b/fs/locks.c index 9f565802a88c..7a63fa3ca9b4 100644 --- a/fs/locks.c +++ b/fs/locks.c @@ -178,7 +178,6 @@ locks_get_lock_context(struct inode *inode, int type) { struct file_lock_context *ctx; - /* paired with cmpxchg() below */ ctx = locks_inode_context(inode); if (likely(ctx) || type == F_UNLCK) goto out; @@ -196,7 +195,18 @@ locks_get_lock_context(struct inode *inode, int type) * Assign the pointer if it's not already assigned. If it is, then * free the context we just allocated. */ - if (cmpxchg(&inode->i_flctx, NULL, ctx)) { + spin_lock(&inode->i_lock); + if (!(inode->i_opflags & IOP_FLCTX)) { + VFS_BUG_ON_INODE(inode->i_flctx, inode); + WRITE_ONCE(inode->i_flctx, ctx); + /* + * Paired with locks_inode_context(). + */ + smp_store_release(&inode->i_opflags, inode->i_opflags | IOP_FLCTX); + spin_unlock(&inode->i_lock); + } else { + VFS_BUG_ON_INODE(!inode->i_flctx, inode); + spin_unlock(&inode->i_lock); kmem_cache_free(flctx_cache, ctx); ctx = locks_inode_context(inode); } diff --git a/include/linux/filelock.h b/include/linux/filelock.h index dc15f5427680..4a8912b9653e 100644 --- a/include/linux/filelock.h +++ b/include/linux/filelock.h @@ -242,8 +242,12 @@ static inline struct file_lock_context * locks_inode_context(const struct inode *inode) { /* - * Paired with the fence in locks_get_lock_context(). + * Paired with smp_store_release in locks_get_lock_context(). + * + * Ensures ->i_flctx will be visible if we spotted the flag. */ + if (likely(!(smp_load_acquire(&inode->i_opflags) & IOP_FLCTX))) + return NULL; return READ_ONCE(inode->i_flctx); } @@ -471,7 +475,7 @@ static inline int break_lease(struct inode *inode, unsigned int mode) * could end up racing with tasks trying to set a new lease on this * file. */ - flctx = READ_ONCE(inode->i_flctx); + flctx = locks_inode_context(inode); if (!flctx) return 0; smp_mb(); @@ -490,7 +494,7 @@ static inline int break_deleg(struct inode *inode, unsigned int flags) * could end up racing with tasks trying to set a new lease on this * file. */ - flctx = READ_ONCE(inode->i_flctx); + flctx = locks_inode_context(inode); if (!flctx) return 0; smp_mb(); @@ -535,8 +539,11 @@ static inline int break_deleg_wait(struct delegated_inode *di) static inline int break_layout(struct inode *inode, bool wait) { + struct file_lock_context *flctx; + smp_mb(); - if (inode->i_flctx && !list_empty_careful(&inode->i_flctx->flc_lease)) { + flctx = locks_inode_context(inode); + if (flctx && !list_empty_careful(&flctx->flc_lease)) { unsigned int flags = LEASE_BREAK_LAYOUT; if (!wait) diff --git a/include/linux/fs.h b/include/linux/fs.h index 04ceeca12a0d..094b0adcb035 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -631,6 +631,7 @@ is_uncached_acl(struct posix_acl *acl) #define IOP_MGTIME 0x0020 #define IOP_CACHED_LINK 0x0040 #define IOP_FASTPERM_MAY_EXEC 0x0080 +#define IOP_FLCTX 0x0100 /* * Inode state bits. Protected by inode->i_lock -- 2.48.1