If shrink_dcache_tree() runs into a potential victim that is already dying, it must wait for that dentry to go away. To avoid busy-waiting we need some object to wait on and a way for dentry_unlist() to see that we need to be notified. The obvious place for the object to wait on would be on our stack frame. We will store a pointer to that object (struct completion_list) in victim dentry; if there's more than one thread wanting to wait for the same dentry to finish dying, we'll have their instances linked into a list, with reference in dentry pointing to the head of that list. * new object - struct completion_list. A pair of struct completion and pointer to the next instance. That's what shrink_dcache_tree() will wait on if needed. * add a new member (->waiters, opaque pointer to struct completion_list) to struct dentry. It is defined for negative live dentries that are not in-lookup ones and it will remain NULL for almost all of them. It does not conflict with ->d_rcu (defined for killed dentries), ->d_alias (defined for positive dentries, all live) or ->d_in_lookup_hash (defined for in-lookup dentries, all live negative). That allows to colocate all four members. * make sure that all places where dentry enters the state where ->waiters is defined (live, negative, not-in-lookup) initialize ->waiters to NULL. * if select_collect2() runs into a dentry that is already dying, have its caller insert a local instance of struct completion_list into the head of the list hanging off dentry->waiters and wait for completion. * if dentry_unlist() sees non-NULL ->waiters, have it carefully walk through the completion_list instances in that list, calling complete() for each. For now struct completion_list is local to fs/dcache.c; it's obviously dentry-agnostic, and it can be trivially lifted into linux/completion.h if somebody finds a reason to do so... Signed-off-by: Al Viro --- fs/dcache.c | 79 ++++++++++++++++++++++++++++++++++++++---- include/linux/dcache.h | 18 ++++++++-- 2 files changed, 88 insertions(+), 9 deletions(-) diff --git a/fs/dcache.c b/fs/dcache.c index 616a445ec720..0c8faeee02e2 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -456,6 +456,15 @@ static void dentry_unlink_inode(struct dentry * dentry) raw_write_seqcount_begin(&dentry->d_seq); __d_clear_type_and_inode(dentry); hlist_del_init(&dentry->d_alias); + /* + * dentry becomes negative, so the space occupied by ->d_alias + * belongs to ->waiters now; we could use __hlist_del() instead + * of hlist_del_init(), if not for the stunt pulled by nfs + * dummy root dentries - positive dentry *not* included into + * the alias list of its inode. Open-coding hlist_del_init() + * and removing zeroing would be too clumsy... + */ + dentry->waiters = NULL; raw_write_seqcount_end(&dentry->d_seq); spin_unlock(&dentry->d_lock); spin_unlock(&inode->i_lock); @@ -605,6 +614,44 @@ void d_drop(struct dentry *dentry) } EXPORT_SYMBOL(d_drop); +struct completion_list { + struct completion_list *next; + struct completion completion; +}; + +/* + * shrink_dcache_tree() needs to be notified when dentry in process of + * being evicted finally gets unlisted. Such dentries are + * already with negative ->d_count + * already negative + * already not in in-lookup hash + * reachable only via ->d_sib. + * + * Use ->waiters for a single-linked list of struct completion_list of + * waiters. + */ +static inline void d_add_waiter(struct dentry *dentry, struct completion_list *p) +{ + struct completion_list *v = dentry->waiters; + init_completion(&p->completion); + p->next = v; + dentry->waiters = p; +} + +static inline void d_complete_waiters(struct dentry *dentry) +{ + struct completion_list *v = dentry->waiters; + if (unlikely(v)) { + /* some shrink_dcache_tree() instances are waiting */ + dentry->waiters = NULL; + while (v) { + struct completion *r = &v->completion; + v = v->next; + complete(r); + } + } +} + static inline void dentry_unlist(struct dentry *dentry) { struct dentry *next; @@ -613,6 +660,7 @@ static inline void dentry_unlist(struct dentry *dentry) * attached to the dentry tree */ dentry->d_flags |= DCACHE_DENTRY_KILLED; + d_complete_waiters(dentry); if (unlikely(hlist_unhashed(&dentry->d_sib))) return; __hlist_del(&dentry->d_sib); @@ -1569,6 +1617,10 @@ static enum d_walk_ret select_collect2(void *_data, struct dentry *dentry) return D_WALK_QUIT; } to_shrink_list(dentry, &data->dispose); + } else if (dentry->d_lockref.count < 0) { + rcu_read_lock(); + data->victim = dentry; + return D_WALK_QUIT; } /* * We can return to the caller if we have found some (this @@ -1608,12 +1660,27 @@ static void shrink_dcache_tree(struct dentry *parent, bool for_umount) data.victim = NULL; d_walk(parent, &data, select_collect2); if (data.victim) { - spin_lock(&data.victim->d_lock); - if (!lock_for_kill(data.victim)) { - spin_unlock(&data.victim->d_lock); + struct dentry *v = data.victim; + + spin_lock(&v->d_lock); + if (v->d_lockref.count < 0 && + !(v->d_flags & DCACHE_DENTRY_KILLED)) { + struct completion_list wait; + // It's busy dying; have it notify us once + // it becomes invisible to d_walk(). + d_add_waiter(v, &wait); + spin_unlock(&v->d_lock); + rcu_read_unlock(); + if (!list_empty(&data.dispose)) + shrink_dentry_list(&data.dispose); + wait_for_completion(&wait.completion); + continue; + } + if (!lock_for_kill(v)) { + spin_unlock(&v->d_lock); rcu_read_unlock(); } else { - shrink_kill(data.victim); + shrink_kill(v); } } if (!list_empty(&data.dispose)) @@ -1787,7 +1854,7 @@ static struct dentry *__d_alloc(struct super_block *sb, const struct qstr *name) INIT_HLIST_BL_NODE(&dentry->d_hash); INIT_LIST_HEAD(&dentry->d_lru); INIT_HLIST_HEAD(&dentry->d_children); - INIT_HLIST_NODE(&dentry->d_alias); + dentry->waiters = NULL; INIT_HLIST_NODE(&dentry->d_sib); if (dentry->d_op && dentry->d_op->d_init) { @@ -2729,7 +2796,7 @@ static wait_queue_head_t *__d_lookup_unhash(struct dentry *dentry) d_wait = dentry->d_wait; dentry->d_wait = NULL; hlist_bl_unlock(b); - INIT_HLIST_NODE(&dentry->d_alias); + dentry->waiters = NULL; INIT_LIST_HEAD(&dentry->d_lru); return d_wait; } diff --git a/include/linux/dcache.h b/include/linux/dcache.h index f939d2ed10a3..19098253f2dd 100644 --- a/include/linux/dcache.h +++ b/include/linux/dcache.h @@ -88,6 +88,7 @@ union shortname_store { #define d_lock d_lockref.lock #define d_iname d_shortname.string +struct completion_list; struct dentry { /* RCU lookup touched fields */ @@ -122,12 +123,23 @@ struct dentry { struct hlist_node d_sib; /* child of parent list */ struct hlist_head d_children; /* our children */ /* - * d_alias and d_rcu can share memory + * the following members can share memory - their uses are + * mutually exclusive. */ union { - struct hlist_node d_alias; /* inode alias list */ - struct hlist_bl_node d_in_lookup_hash; /* only for in-lookup ones */ + /* positives: inode alias list */ + struct hlist_node d_alias; + /* in-lookup ones (all negative, live): hash chain */ + struct hlist_bl_node d_in_lookup_hash; + /* killed ones: (already negative) used to schedule freeing */ struct rcu_head d_rcu; + /* + * live non-in-lookup negatives: used if shrink_dcache_tree() + * races with eviction by another thread and needs to wait for + * this dentry to get killed . Remains NULL for almost all + * negative dentries. + */ + struct completion_list *waiters; }; }; -- 2.47.3