Introduce a linked list tracking all inode connectors for a superblock. We will use this list when the superblock is being shut down to properly clean up all the inode marks instead of relying on scanning all inodes in the superblock, which can get rather slow. Suggested-by: Amir Goldstein Signed-off-by: Jan Kara --- fs/notify/fsnotify.c | 8 ++- fs/notify/fsnotify.h | 5 +- fs/notify/mark.c | 97 +++++++++++++++++++++++++++++--- include/linux/fsnotify_backend.h | 6 +- 4 files changed, 102 insertions(+), 14 deletions(-) diff --git a/fs/notify/fsnotify.c b/fs/notify/fsnotify.c index 71bd44e5ab6d..706484fb3bf3 100644 --- a/fs/notify/fsnotify.c +++ b/fs/notify/fsnotify.c @@ -112,7 +112,10 @@ void fsnotify_sb_delete(struct super_block *sb) void fsnotify_sb_free(struct super_block *sb) { - kfree(sb->s_fsnotify_info); + if (sb->s_fsnotify_info) { + WARN_ON_ONCE(!list_empty(&sb->s_fsnotify_info->inode_conn_list)); + kfree(sb->s_fsnotify_info); + } } /* @@ -777,8 +780,7 @@ static __init int fsnotify_init(void) if (ret) panic("initializing fsnotify_mark_srcu"); - fsnotify_mark_connector_cachep = KMEM_CACHE(fsnotify_mark_connector, - SLAB_PANIC); + fsnotify_init_connector_caches(); return 0; } diff --git a/fs/notify/fsnotify.h b/fs/notify/fsnotify.h index 5950c7a67f41..4e271875dcad 100644 --- a/fs/notify/fsnotify.h +++ b/fs/notify/fsnotify.h @@ -67,6 +67,9 @@ static inline fsnotify_connp_t *fsnotify_sb_marks(struct super_block *sb) return sbinfo ? 
&sbinfo->sb_marks : NULL; } +struct fsnotify_mark_connector *fsnotify_inode_connector_from_list( + struct list_head *head); + /* destroy all events sitting in this groups notification queue */ extern void fsnotify_flush_notify(struct fsnotify_group *group); @@ -106,6 +109,6 @@ static inline void fsnotify_clear_marks_by_mntns(struct mnt_namespace *mntns) */ extern void fsnotify_set_children_dentry_flags(struct inode *inode); -extern struct kmem_cache *fsnotify_mark_connector_cachep; +void fsnotify_init_connector_caches(void); #endif /* __FS_NOTIFY_FSNOTIFY_H_ */ diff --git a/fs/notify/mark.c b/fs/notify/mark.c index 55a03bb05aa1..eb26bb8c5c63 100644 --- a/fs/notify/mark.c +++ b/fs/notify/mark.c @@ -79,7 +79,8 @@ #define FSNOTIFY_REAPER_DELAY (1) /* 1 jiffy */ struct srcu_struct fsnotify_mark_srcu; -struct kmem_cache *fsnotify_mark_connector_cachep; +static struct kmem_cache *fsnotify_mark_connector_cachep; +static struct kmem_cache *fsnotify_inode_mark_connector_cachep; static DEFINE_SPINLOCK(destroy_lock); static LIST_HEAD(destroy_list); @@ -323,10 +324,12 @@ static void fsnotify_connector_destroy_workfn(struct work_struct *work) while (conn) { free = conn; conn = conn->destroy_next; - kmem_cache_free(fsnotify_mark_connector_cachep, free); + kfree(free); } } +static void fsnotify_untrack_connector(struct fsnotify_mark_connector *conn); + static void *fsnotify_detach_connector_from_object( struct fsnotify_mark_connector *conn, unsigned int *type) @@ -342,6 +345,7 @@ static void *fsnotify_detach_connector_from_object( if (conn->type == FSNOTIFY_OBJ_TYPE_INODE) { inode = fsnotify_conn_inode(conn); inode->i_fsnotify_mask = 0; + fsnotify_untrack_connector(conn); /* Unpin inode when detaching from connector */ if (!(conn->flags & FSNOTIFY_CONN_FLAG_HAS_IREF)) @@ -644,6 +648,8 @@ static int fsnotify_attach_info_to_sb(struct super_block *sb) if (!sbinfo) return -ENOMEM; + INIT_LIST_HEAD(&sbinfo->inode_conn_list); + spin_lock_init(&sbinfo->list_lock); /* * cmpxchg() 
provides the barrier so that callers of fsnotify_sb_info() * will observe an initialized structure @@ -655,20 +661,83 @@ static int fsnotify_attach_info_to_sb(struct super_block *sb) return 0; } -static int fsnotify_attach_connector_to_object(fsnotify_connp_t *connp, - void *obj, unsigned int obj_type) +struct fsnotify_inode_mark_connector { + struct fsnotify_mark_connector common; + struct list_head conns_list; +}; + +struct fsnotify_mark_connector *fsnotify_inode_connector_from_list( + struct list_head *head) { - struct fsnotify_mark_connector *conn; + return &list_entry(head, struct fsnotify_inode_mark_connector, + conns_list)->common; +} - conn = kmem_cache_alloc(fsnotify_mark_connector_cachep, GFP_KERNEL); - if (!conn) - return -ENOMEM; +static void fsnotify_init_connector(struct fsnotify_mark_connector *conn, + void *obj, unsigned int obj_type) +{ spin_lock_init(&conn->lock); INIT_HLIST_HEAD(&conn->list); conn->flags = 0; conn->prio = 0; conn->type = obj_type; conn->obj = obj; +} + +static struct fsnotify_mark_connector * +fsnotify_alloc_inode_connector(struct inode *inode) +{ + struct fsnotify_inode_mark_connector *iconn; + struct fsnotify_sb_info *sbinfo = fsnotify_sb_info(inode->i_sb); + + iconn = kmem_cache_alloc(fsnotify_inode_mark_connector_cachep, + GFP_KERNEL); + if (!iconn) + return NULL; + + fsnotify_init_connector(&iconn->common, inode, FSNOTIFY_OBJ_TYPE_INODE); + spin_lock(&sbinfo->list_lock); + list_add(&iconn->conns_list, &sbinfo->inode_conn_list); + spin_unlock(&sbinfo->list_lock); + iconn->common.flags |= FSNOTIFY_CONN_FLAG_TRACKED; + + return &iconn->common; +} + +static void fsnotify_untrack_connector(struct fsnotify_mark_connector *conn) +{ + struct fsnotify_inode_mark_connector *iconn; + struct fsnotify_sb_info *sbinfo; + + if (!(conn->flags & FSNOTIFY_CONN_FLAG_TRACKED)) + return; + + WARN_ON_ONCE(conn->type != FSNOTIFY_OBJ_TYPE_INODE); + iconn = container_of(conn, struct fsnotify_inode_mark_connector, common); + sbinfo = 
fsnotify_sb_info(fsnotify_conn_inode(conn)->i_sb); + spin_lock(&sbinfo->list_lock); + list_del(&iconn->conns_list); + spin_unlock(&sbinfo->list_lock); + conn->flags &= ~FSNOTIFY_CONN_FLAG_TRACKED; +} + +static int fsnotify_attach_connector_to_object(fsnotify_connp_t *connp, + void *obj, unsigned int obj_type) +{ + struct fsnotify_mark_connector *conn; + + if (obj_type == FSNOTIFY_OBJ_TYPE_INODE) { + struct inode *inode = obj; + + conn = fsnotify_alloc_inode_connector(inode); + } else { + conn = kmem_cache_alloc(fsnotify_mark_connector_cachep, + GFP_KERNEL); + if (conn) + fsnotify_init_connector(conn, obj, obj_type); + } + if (!conn) + return -ENOMEM; /* * cmpxchg() provides the barrier so that readers of *connp can see @@ -676,7 +745,8 @@ static int fsnotify_attach_connector_to_object(fsnotify_connp_t *connp, */ if (cmpxchg(connp, NULL, conn)) { /* Someone else created list structure for us */ - kmem_cache_free(fsnotify_mark_connector_cachep, conn); + fsnotify_untrack_connector(conn); + kfree(conn); } return 0; } @@ -1007,3 +1077,12 @@ void fsnotify_wait_marks_destroyed(void) flush_delayed_work(&reaper_work); } EXPORT_SYMBOL_GPL(fsnotify_wait_marks_destroyed); + +__init void fsnotify_init_connector_caches(void) +{ + fsnotify_mark_connector_cachep = KMEM_CACHE(fsnotify_mark_connector, + SLAB_PANIC); + fsnotify_inode_mark_connector_cachep = KMEM_CACHE( + fsnotify_inode_mark_connector, + SLAB_PANIC); +} diff --git a/include/linux/fsnotify_backend.h b/include/linux/fsnotify_backend.h index 0d954ea7b179..7f2157d06043 100644 --- a/include/linux/fsnotify_backend.h +++ b/include/linux/fsnotify_backend.h @@ -546,6 +546,7 @@ struct fsnotify_mark_connector { unsigned char prio; /* Highest priority group */ #define FSNOTIFY_CONN_FLAG_IS_WATCHED 0x01 #define FSNOTIFY_CONN_FLAG_HAS_IREF 0x02 +#define FSNOTIFY_CONN_FLAG_TRACKED 0x04 unsigned short flags; /* flags [lock] */ union { /* Object pointer [lock] */ @@ -553,7 +554,7 @@ struct fsnotify_mark_connector { /* Used listing 
heads to free after srcu period expires */ struct fsnotify_mark_connector *destroy_next; }; - struct hlist_head list; + struct hlist_head list; /* List of marks */ }; /* @@ -562,6 +563,9 @@ struct fsnotify_mark_connector { */ struct fsnotify_sb_info { struct fsnotify_mark_connector __rcu *sb_marks; + /* List of connectors for inode marks */ + struct list_head inode_conn_list; + spinlock_t list_lock; /* Lock protecting inode_conn_list */ /* * Number of inode/mount/sb objects that are being watched in this sb. * Note that inodes objects are currently double-accounted. -- 2.51.0 Instead of iterating all inodes belonging to a superblock to find inode marks and remove them on umount, iterate all inode connectors for the superblock. This may be substantially faster since there are generally far fewer inodes with fsnotify marks than inodes in total. It also removes one use of the sb->s_inodes list, which we strive to ultimately remove. Signed-off-by: Jan Kara --- fs/notify/fsnotify.c | 74 +++++++++++++++----------------------------- 1 file changed, 25 insertions(+), 49 deletions(-) diff --git a/fs/notify/fsnotify.c b/fs/notify/fsnotify.c index 706484fb3bf3..16a4a537d8c3 100644 --- a/fs/notify/fsnotify.c +++ b/fs/notify/fsnotify.c @@ -34,62 +34,38 @@ void __fsnotify_mntns_delete(struct mnt_namespace *mntns) } /** - * fsnotify_unmount_inodes - an sb is unmounting. handle any watched inodes. - * @sb: superblock being unmounted. + * fsnotify_unmount_inodes - an sb is unmounting. Handle any watched inodes. + * @sbinfo: fsnotify info for superblock being unmounted. * - * Called during unmount with no locks held, so needs to be safe against - * concurrent modifiers. We temporarily drop sb->s_inode_list_lock and CAN block. + * Walk all inode connectors for the superblock and free all associated marks. 
*/ -static void fsnotify_unmount_inodes(struct super_block *sb) +static void fsnotify_unmount_inodes(struct fsnotify_sb_info *sbinfo) { - struct inode *inode, *iput_inode = NULL; - - spin_lock(&sb->s_inode_list_lock); - list_for_each_entry(inode, &sb->s_inodes, i_sb_list) { - /* - * We cannot __iget() an inode in state I_FREEING, - * I_WILL_FREE, or I_NEW which is fine because by that point - * the inode cannot have any associated watches. - */ - spin_lock(&inode->i_lock); - if (inode_state_read(inode) & (I_FREEING | I_WILL_FREE | I_NEW)) { - spin_unlock(&inode->i_lock); - continue; - } - - /* - * If i_count is zero, the inode cannot have any watches and - * doing an __iget/iput with SB_ACTIVE clear would actually - * evict all inodes with zero i_count from icache which is - * unnecessarily violent and may in fact be illegal to do. - * However, we should have been called /after/ evict_inodes - * removed all zero refcount inodes, in any case. Test to - * be sure. - */ - if (!icount_read(inode)) { - spin_unlock(&inode->i_lock); - continue; - } + int idx; + struct fsnotify_mark_connector *conn; + struct inode *inode; + /* + * We hold srcu over the iteration so that returned connectors stay + * allocated until we can grab them in fsnotify_destroy_conn_marks() + */ + idx = srcu_read_lock(&fsnotify_mark_srcu); + spin_lock(&sbinfo->list_lock); + while (!list_empty(&sbinfo->inode_conn_list)) { + conn = fsnotify_inode_connector_from_list( + sbinfo->inode_conn_list.next); + /* All connectors on the list are still attached to an inode */ + inode = conn->obj; __iget(inode); - spin_unlock(&inode->i_lock); - spin_unlock(&sb->s_inode_list_lock); - - iput(iput_inode); - - /* for each watch, send FS_UNMOUNT and then remove it */ + spin_unlock(&sbinfo->list_lock); fsnotify_inode(inode, FS_UNMOUNT); - - fsnotify_inode_delete(inode); - - iput_inode = inode; - + fsnotify_destroy_marks(&inode->i_fsnotify_marks); + iput(inode); cond_resched(); - spin_lock(&sb->s_inode_list_lock); + 
spin_lock(&sbinfo->list_lock); } - spin_unlock(&sb->s_inode_list_lock); - - iput(iput_inode); + spin_unlock(&sbinfo->list_lock); + srcu_read_unlock(&fsnotify_mark_srcu, idx); } void fsnotify_sb_delete(struct super_block *sb) @@ -100,7 +76,7 @@ void fsnotify_sb_delete(struct super_block *sb) if (!sbinfo) return; - fsnotify_unmount_inodes(sb); + fsnotify_unmount_inodes(sbinfo); fsnotify_clear_marks_by_sb(sb); /* Wait for outstanding object references from connectors */ wait_var_event(fsnotify_sb_watched_objects(sb), -- 2.51.0 Currently fsnotify_sb_delete() is called after we have evicted the superblock's dcache and inode cache. This was done mainly so that we iterate as few inodes as possible when removing inode marks. However, as Jakub reported, this is problematic because for some filesystems encoding of file handles uses sb->s_root, which gets cleared as part of dcache eviction. And either delayed fsnotify events or reading fdinfo for an fsnotify group with marks on the fs being unmounted may trigger encoding of file handles during unmount. So move the shutdown of the fsnotify subsystem before the shrinking of the dcache. Reported-by: Jakub Acs Signed-off-by: Jan Kara --- fs/super.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/super.c b/fs/super.c index 3d85265d1400..9c13e68277dd 100644 --- a/fs/super.c +++ b/fs/super.c @@ -618,6 +618,7 @@ void generic_shutdown_super(struct super_block *sb) const struct super_operations *sop = sb->s_op; if (sb->s_root) { + fsnotify_sb_delete(sb); shrink_dcache_for_umount(sb); sync_filesystem(sb); sb->s_flags &= ~SB_ACTIVE; @@ -629,9 +630,8 @@ void generic_shutdown_super(struct super_block *sb) /* * Clean up and evict any inodes that still have references due - * to fsnotify or the security policy. + * to the security policy. */ - fsnotify_sb_delete(sb); security_sb_delete(sb); if (sb->s_dio_done_wq) { -- 2.51.0