There's a case in which shrink_dcache_tree() ends up busy-waiting: some
dentry in the subtree in question is found to be in the process of being
evicted by another thread.  We need to wait for that eviction to finish,
so that the parent is no longer pinned; otherwise we could end up in a
situation where nothing in the tree is busy, yet shrink_dcache_tree()
fails to evict some directory only because memory pressure had initiated
eviction of some of its children before we got to evicting those
ourselves.  That would be bogus both for shrink_dcache_parent() and for
shrink_dcache_for_umount().

Unfortunately, we have nothing to wait on.  That has led to the
possibility of busy-waiting - getting through an iteration of the
shrink_dcache_tree() main loop without having made any progress.
That's Not Nice(tm), and it has been discussed quite a few times since
at least 2018.  Recently it became obvious that this goes beyond "not
nice" - on a sufficiently contrived setup it's possible to get a
livelock there, with both threads involved tied to the same CPU: the
shrink_dcache_tree() one running with higher priority than the thread
that gave the CPU up on might_sleep() at the very beginning of iput(),
during eviction of the dentry in the tree that shrink_dcache_tree() is
busy-waiting for.

Let's get rid of that busy-waiting.  Constraints are
	* don't grow struct dentry
	* don't slow the normal case of __dentry_kill() down
and it turns out to be doable.

Dentries in question are
	* already marked dead (negative ->d_count)
	* already negative
	* already unhashed
	* already not in the in-lookup hash
	* yet to get removed from ->d_sib and get DCACHE_DENTRY_KILLED
	  in flags.

Neither ->d_alias nor the fields overlapping it (->d_rcu and
->d_in_lookup_hash) are going to be accessed for these dentries until
after dentry_unlist().  What's more, ->d_alias.next is guaranteed to be
NULL.  So we can embed a struct completion into struct select_data and
(ab)use ->d_alias.next for a linked list of struct select_data
instances.  If dentry_unlist() finds ->d_alias.next non-NULL, it
carefully goes over that list and calls complete() for each of those.

That way select_collect2() can treat negative ->d_count the same way it
deals with dentries on another thread's shrink list - grab
rcu_read_lock(), stash the dentry into data.victim and tell d_walk() to
stop.  If shrink_dcache_parent() runs into that case, it should attach
its select_data to the victim dentry, evict whatever normal eviction
candidates it has gathered and wait for the completion.  Voila...

Signed-off-by: Al Viro
---
diff --git a/fs/dcache.c b/fs/dcache.c
index dc2fff4811d1..6db72a684d8d 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -605,6 +605,54 @@ void d_drop(struct dentry *dentry)
 }
 EXPORT_SYMBOL(d_drop);
 
+struct select_data {
+	struct dentry *start;
+	union {
+		long found;
+		struct {
+			struct dentry *victim;
+			struct select_data *next;
+			struct completion completion;
+		};
+	};
+	struct list_head dispose;
+};
+
+/*
+ * shrink_dcache_parent() needs to be notified when dentry in process of
+ * being evicted finally gets unlisted.  Such dentries are
+ *	already with negative ->d_count
+ *	already negative
+ *	already not in in-lookup hash
+ *	reachable only via ->d_sib.
+ *
+ * Neither ->d_alias, nor ->d_rcu, nor ->d_in_lookup_hash are going to be
+ * accessed for those, so we can (ab)use ->d_alias.next for list of
+ * select_data of waiters.  Initially it's going to be NULL and as long
+ * as dentry_unlist() returns it to that state we are fine.
+ */
+static inline void d_add_waiter(struct dentry *dentry, struct select_data *p)
+{
+	struct select_data *v = (void *)dentry->d_u.d_alias.next;
+	init_completion(&p->completion);
+	p->next = v;
+	dentry->d_u.d_alias.next = (void *)p;
+}
+
+static inline void d_complete_waiters(struct dentry *dentry)
+{
+	struct select_data *v = (void *)dentry->d_u.d_alias.next;
+	if (unlikely(v)) {
+		/* some shrink_dcache_tree() instances are waiting */
+		dentry->d_u.d_alias.next = NULL;
+		while (v) {
+			struct completion *r = &v->completion;
+			v = v->next;
+			complete(r);
+		}
+	}
+}
+
 static inline void dentry_unlist(struct dentry *dentry)
 {
 	struct dentry *next;
@@ -613,6 +661,7 @@ static inline void dentry_unlist(struct dentry *dentry)
 	 * attached to the dentry tree
 	 */
 	dentry->d_flags |= DCACHE_DENTRY_KILLED;
+	d_complete_waiters(dentry);
 	if (unlikely(hlist_unhashed(&dentry->d_sib)))
 		return;
 	__hlist_del(&dentry->d_sib);
@@ -1499,15 +1548,6 @@ int d_set_mounted(struct dentry *dentry)
  * constraints.
  */
 
-struct select_data {
-	struct dentry *start;
-	union {
-		long found;
-		struct dentry *victim;
-	};
-	struct list_head dispose;
-};
-
 static enum d_walk_ret select_collect(void *_data, struct dentry *dentry)
 {
 	struct select_data *data = _data;
@@ -1559,6 +1599,10 @@ static enum d_walk_ret select_collect2(void *_data, struct dentry *dentry)
 			return D_WALK_QUIT;
 		}
 		to_shrink_list(dentry, &data->dispose);
+	} else if (dentry->d_lockref.count < 0) {
+		rcu_read_lock();
+		data->victim = dentry;
+		return D_WALK_QUIT;
 	}
 	/*
 	 * We can return to the caller if we have found some (this
@@ -1598,12 +1642,26 @@ static void shrink_dcache_tree(struct dentry *parent, bool for_umount)
 		data.victim = NULL;
 		d_walk(parent, &data, select_collect2);
 		if (data.victim) {
-			spin_lock(&data.victim->d_lock);
-			if (!lock_for_kill(data.victim)) {
-				spin_unlock(&data.victim->d_lock);
+			struct dentry *v = data.victim;
+
+			spin_lock(&v->d_lock);
+			if (v->d_lockref.count < 0 &&
+			    !(v->d_flags & DCACHE_DENTRY_KILLED)) {
+				// It's busy dying; have it notify us once
+				// it becomes invisible to d_walk().
+				d_add_waiter(v, &data);
+				spin_unlock(&v->d_lock);
+				rcu_read_unlock();
+				if (!list_empty(&data.dispose))
+					shrink_dentry_list(&data.dispose);
+				wait_for_completion(&data.completion);
+				continue;
+			}
+			if (!lock_for_kill(v)) {
+				spin_unlock(&v->d_lock);
 				rcu_read_unlock();
 			} else {
-				shrink_kill(data.victim);
+				shrink_kill(v);
 			}
 		}
 		if (!list_empty(&data.dispose))
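
For illustration only (not part of the patch): a minimal userspace model
of the handoff described above.  All names here (fake_dentry, waiter,
add_waiter(), complete_waiters()) are hypothetical stand-ins, struct
completion is modeled with pthreads, and the DCACHE_DENTRY_KILLED check
the real code does under ->d_lock before attaching a waiter is reduced
to thread ordering:

/*
 * Hypothetical userspace sketch of the waiter handoff - not kernel code.
 */
#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

/* pthread-based stand-in for the kernel's struct completion */
struct completion {
	pthread_mutex_t lock;
	pthread_cond_t cond;
	bool done;
};

static void init_completion(struct completion *c)
{
	pthread_mutex_init(&c->lock, NULL);
	pthread_cond_init(&c->cond, NULL);
	c->done = false;
}

static void wait_for_completion(struct completion *c)
{
	pthread_mutex_lock(&c->lock);
	while (!c->done)
		pthread_cond_wait(&c->cond, &c->lock);
	pthread_mutex_unlock(&c->lock);
}

static void complete(struct completion *c)
{
	pthread_mutex_lock(&c->lock);
	c->done = true;
	pthread_cond_signal(&c->cond);
	pthread_mutex_unlock(&c->lock);
}

/* waiter chained through the slot standing in for ->d_alias.next */
struct waiter {
	struct waiter *next;
	struct completion completion;
};

struct fake_dentry {
	pthread_mutex_t d_lock;	/* ->d_lock */
	struct waiter *waiters;	/* ->d_u.d_alias.next; normally NULL */
	bool killed;		/* DCACHE_DENTRY_KILLED */
};

/* analogue of d_add_waiter(); caller holds ->d_lock */
static void add_waiter(struct fake_dentry *d, struct waiter *w)
{
	init_completion(&w->completion);
	w->next = d->waiters;
	d->waiters = w;
}

/* analogue of d_complete_waiters(), called from the "unlist" path */
static void complete_waiters(struct fake_dentry *d)
{
	struct waiter *v = d->waiters;
	d->waiters = NULL;
	while (v) {
		struct completion *r = &v->completion;
		v = v->next;	/* fetch ->next before complete() - the
				 * waiter lives on another thread's stack
				 * and may be gone once it's woken up */
		complete(r);
	}
}

static struct fake_dentry victim = { .d_lock = PTHREAD_MUTEX_INITIALIZER };

static void *evictor(void *arg)
{
	/* ... the slow part of eviction (iput() etc.) happens here ... */
	pthread_mutex_lock(&victim.d_lock);
	victim.killed = true;		/* DCACHE_DENTRY_KILLED */
	complete_waiters(&victim);	/* as in dentry_unlist() */
	pthread_mutex_unlock(&victim.d_lock);
	return arg;
}

int main(void)
{
	pthread_t t;
	struct waiter w;

	/* shrinker side: chain a waiter instead of spinning */
	pthread_mutex_lock(&victim.d_lock);
	add_waiter(&victim, &w);
	pthread_mutex_unlock(&victim.d_lock);

	pthread_create(&t, NULL, evictor, NULL);
	wait_for_completion(&w.completion);	/* sleep, don't spin */
	printf("victim unlisted, killed=%d\n", victim.killed);
	pthread_join(&t, NULL);
	return 0;
}

Build with cc -pthread; the main thread sleeps in wait_for_completion()
at exactly the point where shrink_dcache_tree() would otherwise have
spun through another fruitless iteration.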