Similarly to inode_state_read_once(), it makes the caller spell out they acknowledge instability of the returned value. Signed-off-by: Mateusz Guzik --- arch/powerpc/platforms/cell/spufs/file.c | 2 +- fs/btrfs/inode.c | 2 +- fs/ceph/mds_client.c | 2 +- fs/ext4/ialloc.c | 4 ++-- fs/hpfs/inode.c | 2 +- fs/inode.c | 12 ++++++------ fs/nfs/inode.c | 4 ++-- fs/smb/client/inode.c | 2 +- fs/ubifs/super.c | 2 +- fs/xfs/xfs_inode.c | 2 +- fs/xfs/xfs_trace.h | 2 +- include/linux/fs.h | 13 +++++++++++++ include/trace/events/filelock.h | 2 +- security/landlock/fs.c | 2 +- 14 files changed, 33 insertions(+), 20 deletions(-) diff --git a/arch/powerpc/platforms/cell/spufs/file.c b/arch/powerpc/platforms/cell/spufs/file.c index 10fa9b844fcc..f6de8c1169d5 100644 --- a/arch/powerpc/platforms/cell/spufs/file.c +++ b/arch/powerpc/platforms/cell/spufs/file.c @@ -1430,7 +1430,7 @@ static int spufs_mfc_open(struct inode *inode, struct file *file) if (ctx->owner != current->mm) return -EINVAL; - if (icount_read(inode) != 1) + if (icount_read_once(inode) != 1) return -EBUSY; mutex_lock(&ctx->mapping_lock); diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 8d97a8ad3858..f36c49e83c04 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -4741,7 +4741,7 @@ static void btrfs_prune_dentries(struct btrfs_root *root) inode = btrfs_find_first_inode(root, min_ino); while (inode) { - if (icount_read(&inode->vfs_inode) > 1) + if (icount_read_once(&inode->vfs_inode) > 1) d_prune_aliases(&inode->vfs_inode); min_ino = btrfs_ino(inode) + 1; diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index b1746273f186..2cb3c919d40d 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -2223,7 +2223,7 @@ static int trim_caps_cb(struct inode *inode, int mds, void *arg) int count; dput(dentry); d_prune_aliases(inode); - count = icount_read(inode); + count = icount_read_once(inode); if (count == 1) (*remaining)--; doutc(cl, "%p %llx.%llx cap %p pruned, count now %d\n", diff --git 
a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c index 3fd8f0099852..8c80d5087516 100644 --- a/fs/ext4/ialloc.c +++ b/fs/ext4/ialloc.c @@ -252,10 +252,10 @@ void ext4_free_inode(handle_t *handle, struct inode *inode) "nonexistent device\n", __func__, __LINE__); return; } - if (icount_read(inode) > 1) { + if (icount_read_once(inode) > 1) { ext4_msg(sb, KERN_ERR, "%s:%d: inode #%llu: count=%d", __func__, __LINE__, inode->i_ino, - icount_read(inode)); + icount_read_once(inode)); return; } if (inode->i_nlink) { diff --git a/fs/hpfs/inode.c b/fs/hpfs/inode.c index 0e932cc8be1b..1b4fcf760aad 100644 --- a/fs/hpfs/inode.c +++ b/fs/hpfs/inode.c @@ -184,7 +184,7 @@ void hpfs_write_inode(struct inode *i) struct hpfs_inode_info *hpfs_inode = hpfs_i(i); struct inode *parent; if (i->i_ino == hpfs_sb(i->i_sb)->sb_root) return; - if (hpfs_inode->i_rddir_off && !icount_read(i)) { + if (hpfs_inode->i_rddir_off && !icount_read_once(i)) { if (*hpfs_inode->i_rddir_off) pr_err("write_inode: some position still there\n"); kfree(hpfs_inode->i_rddir_off); diff --git a/fs/inode.c b/fs/inode.c index 5ad169d51728..1f5a383ccf27 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -907,7 +907,7 @@ void evict_inodes(struct super_block *sb) again: spin_lock(&sb->s_inode_list_lock); list_for_each_entry(inode, &sb->s_inodes, i_sb_list) { - if (icount_read(inode)) + if (icount_read_once(inode)) continue; spin_lock(&inode->i_lock); @@ -1926,7 +1926,7 @@ static void iput_final(struct inode *inode) int drop; WARN_ON(inode_state_read(inode) & I_NEW); - VFS_BUG_ON_INODE(atomic_read(&inode->i_count) != 0, inode); + VFS_BUG_ON_INODE(icount_read(inode) != 0, inode); if (op->drop_inode) drop = op->drop_inode(inode); @@ -1945,7 +1945,7 @@ static void iput_final(struct inode *inode) * Re-check ->i_count in case the ->drop_inode() hooks played games. * Note we only execute this if the verdict was to drop the inode. 
*/ - VFS_BUG_ON_INODE(atomic_read(&inode->i_count) != 0, inode); + VFS_BUG_ON_INODE(icount_read(inode) != 0, inode); if (drop) { inode_state_set(inode, I_FREEING); @@ -1989,7 +1989,7 @@ void iput(struct inode *inode) * equal to one, then two CPUs racing to further drop it can both * conclude it's fine. */ - VFS_BUG_ON_INODE(atomic_read(&inode->i_count) < 1, inode); + VFS_BUG_ON_INODE(icount_read_once(inode) < 1, inode); if (atomic_add_unless(&inode->i_count, -1, 1)) return; @@ -2023,7 +2023,7 @@ EXPORT_SYMBOL(iput); void iput_not_last(struct inode *inode) { VFS_BUG_ON_INODE(inode_state_read_once(inode) & (I_FREEING | I_CLEAR), inode); - VFS_BUG_ON_INODE(atomic_read(&inode->i_count) < 2, inode); + VFS_BUG_ON_INODE(icount_read_once(inode) < 2, inode); WARN_ON(atomic_sub_return(1, &inode->i_count) == 0); } @@ -3046,7 +3046,7 @@ void dump_inode(struct inode *inode, const char *reason) } state = inode_state_read_once(inode); - count = atomic_read(&inode->i_count); + count = icount_read_once(inode); if (!sb || get_kernel_nofault(s_type, &sb->s_type) || !s_type || diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index 98a8f0de1199..22834eddd5b1 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -608,7 +608,7 @@ nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr) inode->i_sb->s_id, (unsigned long long)NFS_FILEID(inode), nfs_display_fhandle_hash(fh), - icount_read(inode)); + icount_read_once(inode)); out: return inode; @@ -2261,7 +2261,7 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr) dfprintk(VFS, "NFS: %s(%s/%llu fh_crc=0x%08x ct=%d info=0x%llx)\n", __func__, inode->i_sb->s_id, inode->i_ino, nfs_display_fhandle_hash(NFS_FH(inode)), - icount_read(inode), fattr->valid); + icount_read_once(inode), fattr->valid); if (!(fattr->valid & NFS_ATTR_FATTR_FILEID)) { /* Only a mounted-on-fileid? 
Just exit */ diff --git a/fs/smb/client/inode.c b/fs/smb/client/inode.c index 888f9e35f14b..ab35e35b16d7 100644 --- a/fs/smb/client/inode.c +++ b/fs/smb/client/inode.c @@ -2842,7 +2842,7 @@ int cifs_revalidate_dentry_attr(struct dentry *dentry) } cifs_dbg(FYI, "Update attributes: %s inode 0x%p count %d dentry: 0x%p d_time %ld jiffies %ld\n", - full_path, inode, icount_read(inode), + full_path, inode, icount_read_once(inode), dentry, cifs_get_time(dentry), jiffies); again: diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c index 9a77d8b64ffa..38972786817e 100644 --- a/fs/ubifs/super.c +++ b/fs/ubifs/super.c @@ -358,7 +358,7 @@ static void ubifs_evict_inode(struct inode *inode) goto out; dbg_gen("inode %llu, mode %#x", inode->i_ino, (int)inode->i_mode); - ubifs_assert(c, !icount_read(inode)); + ubifs_assert(c, !icount_read_once(inode)); truncate_inode_pages_final(&inode->i_data); diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index beaa26ec62da..4f659eba6ae5 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -1046,7 +1046,7 @@ xfs_itruncate_extents_flags( int error = 0; xfs_assert_ilocked(ip, XFS_ILOCK_EXCL); - if (icount_read(VFS_I(ip))) + if (icount_read_once(VFS_I(ip))) xfs_assert_ilocked(ip, XFS_IOLOCK_EXCL); if (whichfork == XFS_DATA_FORK) ASSERT(new_size <= XFS_ISIZE(ip)); diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h index 5e8190fe2be9..cbdec40826b3 100644 --- a/fs/xfs/xfs_trace.h +++ b/fs/xfs/xfs_trace.h @@ -1156,7 +1156,7 @@ DECLARE_EVENT_CLASS(xfs_iref_class, TP_fast_assign( __entry->dev = VFS_I(ip)->i_sb->s_dev; __entry->ino = ip->i_ino; - __entry->count = icount_read(VFS_I(ip)); + __entry->count = icount_read_once(VFS_I(ip)); __entry->pincount = atomic_read(&ip->i_pincount); __entry->iflags = ip->i_flags; __entry->caller_ip = caller_ip; diff --git a/include/linux/fs.h b/include/linux/fs.h index 8afbe2ef2686..07363fce4406 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2225,8 +2225,21 @@ static inline void 
mark_inode_dirty_sync(struct inode *inode) __mark_inode_dirty(inode, I_DIRTY_SYNC); } +/* + * returns the refcount on the inode. it can change arbitrarily. + */ +static inline int icount_read_once(const struct inode *inode) +{ + return atomic_read(&inode->i_count); +} + +/* + * returns the refcount on the inode. The lock guarantees no new references + * are added, but references can be dropped as long as the result is > 0. + */ static inline int icount_read(const struct inode *inode) { + lockdep_assert_held(&inode->i_lock); return atomic_read(&inode->i_count); } diff --git a/include/trace/events/filelock.h b/include/trace/events/filelock.h index 116774886244..c8c8847bb6f6 100644 --- a/include/trace/events/filelock.h +++ b/include/trace/events/filelock.h @@ -190,7 +190,7 @@ TRACE_EVENT(generic_add_lease, __entry->i_ino = inode->i_ino; __entry->wcount = atomic_read(&inode->i_writecount); __entry->rcount = atomic_read(&inode->i_readcount); - __entry->icount = icount_read(inode); + __entry->icount = icount_read_once(inode); __entry->owner = fl->c.flc_owner; __entry->flags = fl->c.flc_flags; __entry->type = fl->c.flc_type; diff --git a/security/landlock/fs.c b/security/landlock/fs.c index c1ecfe239032..32d560f12dbd 100644 --- a/security/landlock/fs.c +++ b/security/landlock/fs.c @@ -1278,7 +1278,7 @@ static void hook_sb_delete(struct super_block *const sb) struct landlock_object *object; /* Only handles referenced inodes. */ - if (!icount_read(inode)) + if (!icount_read_once(inode)) continue; /* -- 2.48.1 The placement was illogical, move it next to igrab(). Take this opportunity to add docs and an assert on the refcount. While its modification remains gated with a WARN_ON, the new assert will also dump the inode state which might aid debugging. No functional changes. 
Signed-off-by: Mateusz Guzik --- fs/inode.c | 20 +++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) diff --git a/fs/inode.c b/fs/inode.c index 1f5a383ccf27..e397a4b56671 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -522,15 +522,6 @@ static void init_once(void *foo) inode_init_once(inode); } -/* - * get additional reference to inode; caller must already hold one. - */ -void ihold(struct inode *inode) -{ - WARN_ON(atomic_inc_return(&inode->i_count) < 2); -} -EXPORT_SYMBOL(ihold); - struct wait_queue_head *inode_bit_waitqueue(struct wait_bit_queue_entry *wqe, struct inode *inode, u32 bit) { @@ -1578,6 +1569,17 @@ ino_t iunique(struct super_block *sb, ino_t max_reserved) } EXPORT_SYMBOL(iunique); +/** + * ihold - get a reference on the inode, provided you already have one + * @inode: inode to operate on + */ +void ihold(struct inode *inode) +{ + VFS_BUG_ON_INODE(icount_read_once(inode) < 1, inode); + WARN_ON(atomic_inc_return(&inode->i_count) < 2); +} +EXPORT_SYMBOL(ihold); + struct inode *igrab(struct inode *inode) { spin_lock(&inode->i_lock); -- 2.48.1 f2fs and ntfs play games where they transition the refcount 0->1 and release the inode spinlock, allowing other threads to grab a ref of their own. They also return 0 in that case, making this problem harmless. However, should they start using the I_DONTCACHE machinery down the road while retaining the above, iput_final() will get a race where it can proceed to tear down an inode with references. The easiest way out for the time being is to future-proof it by predicating caching on the count. Developing better ->drop_inode semantics and sanitizing all users is left as an exercise for the reader. 
Signed-off-by: Mateusz Guzik --- fs/inode.c | 27 ++++++++++++++++++--------- 1 file changed, 18 insertions(+), 9 deletions(-) diff --git a/fs/inode.c b/fs/inode.c index e397a4b56671..a06edd557e84 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -1935,20 +1935,29 @@ static void iput_final(struct inode *inode) else drop = inode_generic_drop(inode); - if (!drop && - !(inode_state_read(inode) & I_DONTCACHE) && - (sb->s_flags & SB_ACTIVE)) { + /* + * XXXCRAP: there are ->drop_inode hooks playing nasty games releasing the + * spinlock and temporarily grabbing refs. This opens a possibility someone + * else will sneak in and grab a ref while it happens. + * + * If such a hook returns 0 (== don't drop) this happens to be harmless as long + * as the inode is not marked with I_DONTCACHE. Otherwise we are proceeding with + * teardown despite references being present. + * + * Damage-control the problem by including the count in the decision. However, + * assert no refs showed up if the hook decided to drop the inode. + */ + if (drop) + VFS_BUG_ON_INODE(icount_read(inode) != 0, inode); + + if (icount_read(inode) > 0 || + (!drop && !(inode_state_read(inode) & I_DONTCACHE) && + (sb->s_flags & SB_ACTIVE))) { __inode_lru_list_add(inode, true); spin_unlock(&inode->i_lock); return; } - /* - * Re-check ->i_count in case the ->drop_inode() hooks played games. - * Note we only execute this if the verdict was to drop the inode. - */ - VFS_BUG_ON_INODE(icount_read(inode) != 0, inode); - if (drop) { inode_state_set(inode, I_FREEING); } else { -- 2.48.1 With this change only 0->1 and 1->0 transitions need the lock. I verified all places which look at the refcount either only care about it staying 0 (and have the lock enforce it) or don't hold the inode lock to begin with (making the above change irrelevant to their correctness or lack thereof). I also confirmed nfs and btrfs like to call into these a lot and now avoid the lock in the common case, shaving off some atomics. 
Signed-off-by: Mateusz Guzik --- fs/dcache.c | 4 +++ fs/inode.c | 65 ++++++++++++++++++++++++++++++++++++++++++++++ include/linux/fs.h | 4 +-- 3 files changed, 71 insertions(+), 2 deletions(-) diff --git a/fs/dcache.c b/fs/dcache.c index 9ceab142896f..b63450ebb85c 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -2033,6 +2033,10 @@ void d_instantiate_new(struct dentry *entry, struct inode *inode) __d_instantiate(entry, inode); spin_unlock(&entry->d_lock); WARN_ON(!(inode_state_read(inode) & I_NEW)); + /* + * Paired with igrab_try_lockless() + */ + smp_wmb(); inode_state_clear(inode, I_NEW | I_CREATING); inode_wake_up_bit(inode, __I_NEW); spin_unlock(&inode->i_lock); diff --git a/fs/inode.c b/fs/inode.c index a06edd557e84..5e5cddd76a9d 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -1029,6 +1029,7 @@ long prune_icache_sb(struct super_block *sb, struct shrink_control *sc) } static void __wait_on_freeing_inode(struct inode *inode, bool hash_locked, bool rcu_locked); +static bool igrab_try_lockless(struct inode *inode); /* * Called with the inode lock held. 
@@ -1053,6 +1054,11 @@ static struct inode *find_inode(struct super_block *sb, continue; if (!test(inode, data)) continue; + if (igrab_try_lockless(inode)) { + rcu_read_unlock(); + *isnew = false; + return inode; + } spin_lock(&inode->i_lock); if (inode_state_read(inode) & (I_FREEING | I_WILL_FREE)) { __wait_on_freeing_inode(inode, hash_locked, true); @@ -1095,6 +1101,11 @@ static struct inode *find_inode_fast(struct super_block *sb, continue; if (inode->i_sb != sb) continue; + if (igrab_try_lockless(inode)) { + rcu_read_unlock(); + *isnew = false; + return inode; + } spin_lock(&inode->i_lock); if (inode_state_read(inode) & (I_FREEING | I_WILL_FREE)) { __wait_on_freeing_inode(inode, hash_locked, true); @@ -1212,6 +1223,10 @@ void unlock_new_inode(struct inode *inode) lockdep_annotate_inode_mutex_key(inode); spin_lock(&inode->i_lock); WARN_ON(!(inode_state_read(inode) & I_NEW)); + /* + * Paired with igrab_try_lockless() + */ + smp_wmb(); inode_state_clear(inode, I_NEW | I_CREATING); inode_wake_up_bit(inode, __I_NEW); spin_unlock(&inode->i_lock); @@ -1223,6 +1238,10 @@ void discard_new_inode(struct inode *inode) lockdep_annotate_inode_mutex_key(inode); spin_lock(&inode->i_lock); WARN_ON(!(inode_state_read(inode) & I_NEW)); + /* + * Paired with igrab_try_lockless() + */ + smp_wmb(); inode_state_clear(inode, I_NEW); inode_wake_up_bit(inode, __I_NEW); spin_unlock(&inode->i_lock); @@ -1582,6 +1601,14 @@ EXPORT_SYMBOL(ihold); struct inode *igrab(struct inode *inode) { + /* + * Read commentary above igrab_try_lockless() for an explanation why this works. 
+ */ + if (atomic_add_unless(&inode->i_count, 1, 0)) { + VFS_BUG_ON_INODE(inode_state_read_once(inode) & (I_FREEING | I_WILL_FREE), inode); + return inode; + } + spin_lock(&inode->i_lock); if (!(inode_state_read(inode) & (I_FREEING | I_WILL_FREE))) { __iget(inode); @@ -1599,6 +1626,44 @@ struct inode *igrab(struct inode *inode) } EXPORT_SYMBOL(igrab); +/* + * igrab_try_lockless - special inode refcount acquire primitive for the inode hash + * (don't use elsewhere!) + * + * It provides lockless refcount acquire in the common case of no problematic + * flags being set and the count being > 0. + * + * There are 4 state flags to worry about and the routine makes sure to not bump the + * ref if any of them is present. + * + * I_NEW and I_CREATING can only legally get set *before* the inode becomes visible + * during lookup. Thus if the flags are not spotted, they are guaranteed to not be + * a factor. However, we need an acquire fence before returning the inode just + * in case we raced against clearing the state to make sure our consumer picks up + * any other changes made prior. atomic_add_unless provides a full fence, which + * takes care of it. + * + * I_FREEING and I_WILL_FREE can only legally get set if ->i_count == 0 and it is + * illegal to bump the ref if either is present. Consequently if atomic_add_unless + * managed to replace a non-0 value with a bigger one, we have a guarantee neither + * of these flags is set. Note this means explicitly checking these flags below + * is not necessary; it is only done because it does not cost anything on top of the + * load which already needs to be done to handle the other flags. 
+ */ +static bool igrab_try_lockless(struct inode *inode) +{ + if (inode_state_read_once(inode) & (I_NEW | I_CREATING | I_FREEING | I_WILL_FREE)) + return false; + /* + * Paired with routines clearing I_NEW + */ + if (atomic_add_unless(&inode->i_count, 1, 0)) { + VFS_BUG_ON_INODE(inode_state_read_once(inode) & (I_FREEING | I_WILL_FREE), inode); + return true; + } + return false; +} + /** * ilookup5_nowait - search for an inode in the inode cache * @sb: super block of file system to search diff --git a/include/linux/fs.h b/include/linux/fs.h index 07363fce4406..119e0a3d2f42 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2234,8 +2234,8 @@ static inline int icount_read_once(const struct inode *inode) } /* - * returns the refcount on the inode. The lock guarantees no new references - * are added, but references can be dropped as long as the result is > 0. + * returns the refcount on the inode. The lock guarantees no 0->1 or 1->0 transitions + * of the count are going to take place, otherwise it changes arbitrarily. */ static inline int icount_read(const struct inode *inode) { -- 2.48.1