From: Chuck Lever Async COPY operations hold copy stateids that represent NFSv4 state. Thus, when the NFS server administrator revokes all NFSv4 state for a filesystem via the unlock_fs interface, ongoing async COPY operations referencing that filesystem must also be canceled. Each cancelled copy triggers a CB_OFFLOAD callback carrying the NFS4ERR_ADMIN_REVOKED status to notify the client that the server terminated the operation. The static drop_client() function is renamed to nfsd4_put_client() and exported. The function must be exported because both the new nfsd4_cancel_copy_by_sb() and the CB_OFFLOAD release callback in nfs4proc.c need to release client references. Reviewed-by: NeilBrown Reviewed-by: Jeff Layton Signed-off-by: Chuck Lever --- fs/nfsd/nfs4proc.c | 124 ++++++++++++++++++++++++++++++++++++++++---- fs/nfsd/nfs4state.c | 20 ++++--- fs/nfsd/nfsctl.c | 1 + fs/nfsd/state.h | 2 + fs/nfsd/xdr4.h | 1 + 5 files changed, 130 insertions(+), 18 deletions(-) diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c index 2b805fc51262..e7ec87b6c331 100644 --- a/fs/nfsd/nfs4proc.c +++ b/fs/nfsd/nfs4proc.c @@ -1427,14 +1427,26 @@ static void nfs4_put_copy(struct nfsd4_copy *copy) kfree(copy); } +static void release_copy_files(struct nfsd4_copy *copy); + static void nfsd4_stop_copy(struct nfsd4_copy *copy) { trace_nfsd_copy_async_cancel(copy); if (!test_and_set_bit(NFSD4_COPY_F_STOPPED, &copy->cp_flags)) { kthread_stop(copy->copy_task); - copy->nfserr = nfs_ok; + if (!test_bit(NFSD4_COPY_F_CB_ERROR, &copy->cp_flags)) + copy->nfserr = nfs_ok; set_bit(NFSD4_COPY_F_COMPLETED, &copy->cp_flags); } + + /* + * The copy was removed from async_copies before this function + * was called, so the reaper cannot clean it up. Release files + * here regardless of who won the STOPPED race. If the thread + * set STOPPED, it has finished using the files. If STOPPED + * was set here, kthread_stop() waited for the thread to exit. 
+ */ + release_copy_files(copy); nfs4_put_copy(copy); } @@ -1462,6 +1474,72 @@ void nfsd4_shutdown_copy(struct nfs4_client *clp) while ((copy = nfsd4_unhash_copy(clp)) != NULL) nfsd4_stop_copy(copy); } + +static bool nfsd4_copy_on_sb(const struct nfsd4_copy *copy, + const struct super_block *sb) +{ + if (copy->nf_src && + file_inode(copy->nf_src->nf_file)->i_sb == sb) + return true; + if (copy->nf_dst && + file_inode(copy->nf_dst->nf_file)->i_sb == sb) + return true; + return false; +} + +/** + * nfsd4_cancel_copy_by_sb - cancel async copy operations on @sb + * @net: net namespace containing the copy operations + * @sb: targeted superblock + */ +void nfsd4_cancel_copy_by_sb(struct net *net, struct super_block *sb) +{ + struct nfsd_net *nn = net_generic(net, nfsd_net_id); + struct nfsd4_copy *copy, *tmp; + struct nfs4_client *clp; + unsigned int idhashval; + LIST_HEAD(to_cancel); + + spin_lock(&nn->client_lock); + for (idhashval = 0; idhashval < CLIENT_HASH_SIZE; idhashval++) { + struct list_head *head = &nn->conf_id_hashtbl[idhashval]; + + list_for_each_entry(clp, head, cl_idhash) { + spin_lock(&clp->async_lock); + list_for_each_entry_safe(copy, tmp, + &clp->async_copies, copies) { + if (nfsd4_copy_on_sb(copy, sb)) { + refcount_inc(&copy->refcount); + /* + * Hold a reference on the client while + * nfsd4_stop_copy() runs. Unlike + * nfsd4_unhash_copy(), cp_clp is not + * NULLed here because nfsd4_send_cb_offload() + * needs a valid client to send CB_OFFLOAD. + * That function takes its own reference to + * survive callback flight. 
+ */ + kref_get(&clp->cl_nfsdfs.cl_ref); + copy->nfserr = nfserr_admin_revoked; + set_bit(NFSD4_COPY_F_CB_ERROR, + &copy->cp_flags); + list_move(&copy->copies, &to_cancel); + } + } + spin_unlock(&clp->async_lock); + } + } + spin_unlock(&nn->client_lock); + + list_for_each_entry_safe(copy, tmp, &to_cancel, copies) { + struct nfs4_client *clp = copy->cp_clp; + + list_del_init(&copy->copies); + nfsd4_stop_copy(copy); + nfsd4_put_client(clp); + } +} + #ifdef CONFIG_NFSD_V4_2_INTER_SSC extern struct file *nfs42_ssc_open(struct vfsmount *ss_mnt, @@ -1751,6 +1829,7 @@ static void nfsd4_cb_offload_release(struct nfsd4_callback *cb) container_of(cbo, struct nfsd4_copy, cp_cb_offload); set_bit(NFSD4_COPY_F_OFFLOAD_DONE, &copy->cp_flags); + nfsd4_put_client(cb->cb_clp); } static int nfsd4_cb_offload_done(struct nfsd4_callback *cb, @@ -1870,10 +1949,14 @@ static void dup_copy_fields(struct nfsd4_copy *src, struct nfsd4_copy *dst) static void release_copy_files(struct nfsd4_copy *copy) { - if (copy->nf_src) + if (copy->nf_src) { nfsd_file_put(copy->nf_src); - if (copy->nf_dst) + copy->nf_src = NULL; + } + if (copy->nf_dst) { nfsd_file_put(copy->nf_dst); + copy->nf_dst = NULL; + } } static void cleanup_async_copy(struct nfsd4_copy *copy) @@ -1892,18 +1975,34 @@ static void cleanup_async_copy(struct nfsd4_copy *copy) static void nfsd4_send_cb_offload(struct nfsd4_copy *copy) { struct nfsd4_cb_offload *cbo = &copy->cp_cb_offload; + struct nfs4_client *clp = copy->cp_clp; + + /* + * cp_clp is NULL when called via nfsd4_shutdown_copy() during + * client destruction. Skip the callback; the client is gone. + */ + if (!clp) { + set_bit(NFSD4_COPY_F_OFFLOAD_DONE, &copy->cp_flags); + return; + } memcpy(&cbo->co_res, &copy->cp_res, sizeof(copy->cp_res)); memcpy(&cbo->co_fh, &copy->fh, sizeof(copy->fh)); cbo->co_nfserr = copy->nfserr; cbo->co_retries = 5; - nfsd4_init_cb(&cbo->co_cb, copy->cp_clp, &nfsd4_cb_offload_ops, + /* + * Hold a reference on the client while the callback is in flight. 
+ * Released in nfsd4_cb_offload_release(). + */ + kref_get(&clp->cl_nfsdfs.cl_ref); + + nfsd4_init_cb(&cbo->co_cb, clp, &nfsd4_cb_offload_ops, NFSPROC4_CLNT_CB_OFFLOAD); nfsd41_cb_referring_call(&cbo->co_cb, &cbo->co_referring_sessionid, cbo->co_referring_slotid, cbo->co_referring_seqno); - trace_nfsd_cb_offload(copy->cp_clp, &cbo->co_res.cb_stateid, + trace_nfsd_cb_offload(clp, &cbo->co_res.cb_stateid, &cbo->co_fh, copy->cp_count, copy->nfserr); nfsd4_try_run_cb(&cbo->co_cb); } @@ -1918,6 +2017,7 @@ static void nfsd4_send_cb_offload(struct nfsd4_copy *copy) static int nfsd4_do_async_copy(void *data) { struct nfsd4_copy *copy = (struct nfsd4_copy *)data; + __be32 nfserr = nfs_ok; trace_nfsd_copy_async(copy); if (nfsd4_ssc_is_inter(copy)) { @@ -1928,23 +2028,25 @@ static int nfsd4_do_async_copy(void *data) if (IS_ERR(filp)) { switch (PTR_ERR(filp)) { case -EBADF: - copy->nfserr = nfserr_wrong_type; + nfserr = nfserr_wrong_type; break; default: - copy->nfserr = nfserr_offload_denied; + nfserr = nfserr_offload_denied; } /* ss_mnt will be unmounted by the laundromat */ goto do_callback; } - copy->nfserr = nfsd4_do_copy(copy, filp, copy->nf_dst->nf_file, - false); + nfserr = nfsd4_do_copy(copy, filp, copy->nf_dst->nf_file, + false); nfsd4_cleanup_inter_ssc(copy->ss_nsui, filp, copy->nf_dst); } else { - copy->nfserr = nfsd4_do_copy(copy, copy->nf_src->nf_file, - copy->nf_dst->nf_file, false); + nfserr = nfsd4_do_copy(copy, copy->nf_src->nf_file, + copy->nf_dst->nf_file, false); } do_callback: + if (!test_bit(NFSD4_COPY_F_CB_ERROR, &copy->cp_flags)) + copy->nfserr = nfserr; /* The kthread exits forthwith. Ensure that a subsequent * OFFLOAD_CANCEL won't try to kill it again. 
*/ set_bit(NFSD4_COPY_F_STOPPED, &copy->cp_flags); diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index 4fc94f2de7ba..1efab85c647d 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -2416,7 +2416,13 @@ static void __free_client(struct kref *k) kmem_cache_free(client_slab, clp); } -static void drop_client(struct nfs4_client *clp) +/** + * nfsd4_put_client - release a reference on an nfs4_client + * @clp: the client to be released + * + * When the last reference is released, the client is freed. + */ +void nfsd4_put_client(struct nfs4_client *clp) { kref_put(&clp->cl_nfsdfs.cl_ref, __free_client); } @@ -2438,7 +2444,7 @@ free_client(struct nfs4_client *clp) clp->cl_nfsd_dentry = NULL; wake_up_all(&expiry_wq); } - drop_client(clp); + nfsd4_put_client(clp); } /* must be called under the client_lock */ @@ -2839,7 +2845,7 @@ static int client_info_show(struct seq_file *m, void *v) spin_unlock(&clp->cl_lock); seq_puts(m, "\n"); - drop_client(clp); + nfsd4_put_client(clp); return 0; } @@ -3105,7 +3111,7 @@ static int client_states_open(struct inode *inode, struct file *file) ret = seq_open(file, &states_seq_ops); if (ret) { - drop_client(clp); + nfsd4_put_client(clp); return ret; } s = file->private_data; @@ -3119,7 +3125,7 @@ static int client_opens_release(struct inode *inode, struct file *file) struct nfs4_client *clp = m->private; /* XXX: alternatively, we could get/drop in seq start/stop */ - drop_client(clp); + nfsd4_put_client(clp); return seq_release(inode, file); } @@ -3175,7 +3181,7 @@ static ssize_t client_ctl_write(struct file *file, const char __user *buf, if (!clp) return -ENXIO; force_expire_client(clp); - drop_client(clp); + nfsd4_put_client(clp); return 7; } @@ -3210,7 +3216,7 @@ nfsd4_cb_recall_any_release(struct nfsd4_callback *cb) { struct nfs4_client *clp = cb->cb_clp; - drop_client(clp); + nfsd4_put_client(clp); } static int diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c index 64da7e731fe7..30caefb2522f 100644 --- a/fs/nfsd/nfsctl.c 
+++ b/fs/nfsd/nfsctl.c @@ -288,6 +288,7 @@ static ssize_t write_unlock_fs(struct file *file, char *buf, size_t size) * 2. Is that directory a mount point, or * 3. Is that directory the root of an exported file system? */ + nfsd4_cancel_copy_by_sb(netns(file), path.dentry->d_sb); error = nlmsvc_unlock_all_by_sb(path.dentry->d_sb); mutex_lock(&nfsd_mutex); nn = net_generic(netns(file), nfsd_net_id); diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h index 037f4ccd2e87..e415b8200fff 100644 --- a/fs/nfsd/state.h +++ b/fs/nfsd/state.h @@ -825,6 +825,8 @@ static inline void nfsd4_try_run_cb(struct nfsd4_callback *cb) extern void nfsd4_shutdown_callback(struct nfs4_client *); extern void nfsd4_shutdown_copy(struct nfs4_client *clp); +void nfsd4_put_client(struct nfs4_client *clp); +void nfsd4_cancel_copy_by_sb(struct net *net, struct super_block *sb); void nfsd4_async_copy_reaper(struct nfsd_net *nn); bool nfsd4_has_active_async_copies(struct nfs4_client *clp); extern struct nfs4_client_reclaim *nfs4_client_to_reclaim(struct xdr_netobj name, diff --git a/fs/nfsd/xdr4.h b/fs/nfsd/xdr4.h index ae75846b3cd7..1be2814b5288 100644 --- a/fs/nfsd/xdr4.h +++ b/fs/nfsd/xdr4.h @@ -732,6 +732,7 @@ struct nfsd4_copy { #define NFSD4_COPY_F_COMMITTED (3) #define NFSD4_COPY_F_COMPLETED (4) #define NFSD4_COPY_F_OFFLOAD_DONE (5) +#define NFSD4_COPY_F_CB_ERROR (6) /* response */ __be32 nfserr; -- 2.52.0 From: Chuck Lever Modular filesystems currently have no notification mechanism for mount teardown. When an NFS export is unexported then unmounted, NFSD cannot detect this event to revoke associated state, state which holds open file references that pin the mount. The existing fs_pin infrastructure provides unmount callbacks, but pin_insert() and pin_remove() lack EXPORT_SYMBOL_GPL(), restricting this facility to built-in subsystems. 
This restriction appears historical rather than intentional; fs_pin.h is already a public header, and the mechanism's purpose (coordinating mount lifetimes with filesystem state) applies equally to modular subsystems. Export both symbols with EXPORT_SYMBOL_GPL() to permit modular filesystems to register fs_pin callbacks. NFSD requires this to revoke NFSv4 delegations, layouts, and open state when the underlying filesystem is unmounted, preventing use-after-free conditions in the state tracking layer. Reviewed-by: NeilBrown Reviewed-by: Jeff Layton Signed-off-by: Chuck Lever --- fs/fs_pin.c | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/fs/fs_pin.c b/fs/fs_pin.c index 47ef3c71ce90..972f34558b97 100644 --- a/fs/fs_pin.c +++ b/fs/fs_pin.c @@ -1,4 +1,5 @@ // SPDX-License-Identifier: GPL-2.0 +#include #include #include #include @@ -7,6 +8,15 @@ static DEFINE_SPINLOCK(pin_lock); +/** + * pin_remove - detach an fs_pin from its mount and superblock + * @pin: the pin to remove + * + * Removes @pin from the mount and superblock pin lists and marks it + * done. Must be called from the pin's kill callback before returning. + * The caller must keep @pin valid until this function returns; after + * that, VFS will not reference @pin again. + */ void pin_remove(struct fs_pin *pin) { spin_lock(&pin_lock); @@ -18,7 +28,17 @@ void pin_remove(struct fs_pin *pin) wake_up_locked(&pin->wait); spin_unlock_irq(&pin->wait.lock); } +EXPORT_SYMBOL_GPL(pin_remove); +/** + * pin_insert - register an fs_pin for unmount notification + * @pin: the pin to register (must be initialized with init_fs_pin()) + * @m: the vfsmount to monitor + * + * Registers @pin to receive notification when @m is unmounted. When + * unmount occurs, the pin's kill callback is invoked with the RCU + * read lock held. The callback must call pin_remove() before returning. 
+ */ void pin_insert(struct fs_pin *pin, struct vfsmount *m) { spin_lock(&pin_lock); @@ -26,6 +46,7 @@ void pin_insert(struct fs_pin *pin, struct vfsmount *m) hlist_add_head(&pin->m_list, &real_mount(m)->mnt_pins); spin_unlock(&pin_lock); } +EXPORT_SYMBOL_GPL(pin_insert); void pin_kill(struct fs_pin *p) { -- 2.52.0 From: Chuck Lever The fs_pin mechanism notifies interested subsystems when a filesystem is remounted read-only or unmounted. Currently, BSD process accounting uses this to halt accounting when the target filesystem goes away. Registered pins receive callbacks from both group_pin_kill() (during remount read-only) and mnt_pin_kill() (during mount teardown). NFSD maintains NFSv4 client state associated with the superblocks of exported filesystems. Revoking this state during unmount requires lock ordering that conflicts with mnt_pin_kill() context: mnt_pin_kill() runs during cleanup_mnt() with namespace locks held, but NFSD's state revocation path acquires these same locks for mount table lookups, creating AB-BA deadlock potential. Add pin_insert_sb() to register pins on the superblock's s_pins list only. Pins registered this way do not receive mnt_pin_kill() callbacks during mount teardown. After pin insertion, checking SB_ACTIVE detects racing unmounts. When the superblock remains active, normal unmount cleanup occurs through the subsystem's own shutdown path (outside the problematic locking context) without pin callbacks. 
Signed-off-by: Chuck Lever --- fs/fs_pin.c | 29 +++++++++++++++++++++++++++++ include/linux/fs_pin.h | 1 + 2 files changed, 30 insertions(+) diff --git a/fs/fs_pin.c b/fs/fs_pin.c index 972f34558b97..7204b4a5891f 100644 --- a/fs/fs_pin.c +++ b/fs/fs_pin.c @@ -48,6 +48,35 @@ void pin_insert(struct fs_pin *pin, struct vfsmount *m) } EXPORT_SYMBOL_GPL(pin_insert); +/** + * pin_insert_sb - register an fs_pin on the superblock only + * @pin: the pin to register (must be initialized with init_fs_pin()) + * @m: the vfsmount whose superblock to monitor + * + * Registers @pin on the superblock's s_pins list only. Callbacks arrive + * only from group_pin_kill() (invoked during remount read-only), not + * from mnt_pin_kill() (invoked during mount namespace teardown). + * + * Use this instead of pin_insert() when mnt_pin_kill() callbacks would + * execute in problematic locking contexts. Because mnt_pin_kill() runs + * during cleanup_mnt(), callbacks cannot acquire locks also taken during + * mount table operations without risking AB-BA deadlock. + * + * After insertion, check SB_ACTIVE to detect racing unmounts. If clear, + * call pin_remove() and abort. Normal unmount cleanup then occurs through + * subsystem-specific shutdown paths without pin callback involvement. + * + * The callback must call pin_remove() before returning. Callbacks execute + * with the RCU read lock held. 
+ */ +void pin_insert_sb(struct fs_pin *pin, struct vfsmount *m) +{ + spin_lock(&pin_lock); + hlist_add_head(&pin->s_list, &m->mnt_sb->s_pins); + spin_unlock(&pin_lock); +} +EXPORT_SYMBOL_GPL(pin_insert_sb); + void pin_kill(struct fs_pin *p) { wait_queue_entry_t wait; diff --git a/include/linux/fs_pin.h b/include/linux/fs_pin.h index bdd09fd2520c..24c55329b15f 100644 --- a/include/linux/fs_pin.h +++ b/include/linux/fs_pin.h @@ -21,4 +21,5 @@ static inline void init_fs_pin(struct fs_pin *p, void (*kill)(struct fs_pin *)) void pin_remove(struct fs_pin *); void pin_insert(struct fs_pin *, struct vfsmount *); +void pin_insert_sb(struct fs_pin *, struct vfsmount *); void pin_kill(struct fs_pin *); -- 2.52.0 From: Chuck Lever The group_pin_kill() function iterates the superblock's s_pins list and invokes each pin's kill callback. Previously, this function was called only during remount read-only (in reconfigure_super). Add a group_pin_kill() call in cleanup_mnt() so that pins registered via pin_insert_sb() receive callbacks during mount teardown as well. This call runs after mnt_pin_kill() processes the per-mount m_list, ensuring: - Pins registered via pin_insert() receive their callback from mnt_pin_kill() (which also removes them from s_list via pin_remove()), so group_pin_kill() skips them. - Pins registered via pin_insert_sb() are only on s_list, so mnt_pin_kill() skips them and group_pin_kill() invokes their callback. This enables subsystems to use pin_insert_sb() for receiving unmount notifications while avoiding any problematic locking context that mnt_pin_kill() callbacks must handle. Because group_pin_kill() operates on the superblock's s_pins list, unmounting any mount of a filesystem--including bind mounts--triggers callbacks for all pins registered on that superblock. For NFSD, this means unmounting an exported bind mount revokes NFSv4 state for the entire filesystem, even if other mounts remain. 
Signed-off-by: Chuck Lever --- fs/fs_pin.c | 14 +++++++------- fs/namespace.c | 2 ++ 2 files changed, 9 insertions(+), 7 deletions(-) diff --git a/fs/fs_pin.c b/fs/fs_pin.c index 7204b4a5891f..54c1163a9cde 100644 --- a/fs/fs_pin.c +++ b/fs/fs_pin.c @@ -54,17 +54,17 @@ EXPORT_SYMBOL_GPL(pin_insert); * @m: the vfsmount whose superblock to monitor * * Registers @pin on the superblock's s_pins list only. Callbacks arrive - * only from group_pin_kill() (invoked during remount read-only), not - * from mnt_pin_kill() (invoked during mount namespace teardown). + * from group_pin_kill(), invoked during both remount read-only and mount + * teardown. Unlike pin_insert(), the pin is not added to the per-mount + * mnt_pins list, so mnt_pin_kill() does not invoke the callback. * * Use this instead of pin_insert() when mnt_pin_kill() callbacks would - * execute in problematic locking contexts. Because mnt_pin_kill() runs - * during cleanup_mnt(), callbacks cannot acquire locks also taken during - * mount table operations without risking AB-BA deadlock. + * execute in problematic locking contexts. Callbacks registered via this + * function run from group_pin_kill() instead, which may execute under + * different locking conditions. * * After insertion, check SB_ACTIVE to detect racing unmounts. If clear, - * call pin_remove() and abort. Normal unmount cleanup then occurs through - * subsystem-specific shutdown paths without pin callback involvement. + * call pin_remove() and abort. * * The callback must call pin_remove() before returning. Callbacks execute * with the RCU read lock held. 
diff --git a/fs/namespace.c b/fs/namespace.c index c58674a20cad..a887d45636f5 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -1309,6 +1309,8 @@ static void cleanup_mnt(struct mount *mnt) WARN_ON(mnt_get_writers(mnt)); if (unlikely(mnt->mnt_pins.first)) mnt_pin_kill(mnt); + if (unlikely(!hlist_empty(&mnt->mnt.mnt_sb->s_pins))) + group_pin_kill(&mnt->mnt.mnt_sb->s_pins); hlist_for_each_entry_safe(m, p, &mnt->mnt_stuck_children, mnt_umount) { hlist_del(&m->mnt_umount); mntput(&m->mnt); -- 2.52.0 From: Chuck Lever When an NFS server's local filesystem is unmounted while NFS clients are still accessing it, NFSv4 state holds files open which pins the filesystem, preventing unmount. Currently, administrators have to manually revoke that state via /proc/fs/nfsd/unlock_fs before a formerly exported filesystem can be unmounted. Use the kernel's fs_pin mechanism to detect filesystem unmounts and revoke NFSv4 state and NLM locks associated with that filesystem. An xarray in nfsd_net tracks per-superblock pins. When any NFS state is created, a pin is registered (idempotently) for that superblock. When the filesystem is unmounted, VFS invokes the kill callback which queues work to: - Cancel ongoing async COPY operations (nfsd4_cancel_copy_by_sb) - Release NLM locks (nlmsvc_unlock_all_by_sb) - Revoke NFSv4 state (nfsd4_revoke_states) The code uses pin_insert_sb() to register superblock-only pins rather than pin_insert() which registers both mount and superblock pins. This is necessary because the VFS unmount sequence calls mnt_pin_kill() before clearing SB_ACTIVE, but group_pin_kill() after. Callers of nfsd_pin_sb() hold open file references, so SB_ACTIVE cannot be cleared during pin registration; a WARN_ON_ONCE guards against unexpected violations. The revocation work runs on a dedicated workqueue (nfsd_pin_wq) to avoid deadlocks since the VFS kill callback runs with locks held. 
Synchronization between VFS unmount and NFSD shutdown uses xa_erase() atomicity: the path that successfully erases the xarray entry triggers work. If state revocation takes an unexpectedly long time (e.g., when re-exporting an NFS mount whose backend server is unresponsive), periodic warnings are emitted every 30 seconds. The wait is interruptible: if interrupted before work starts, cancel_work() removes the queued work and revocation runs directly in the unmount context; if work is already running, the kill callback returns and revocation continues in the background. Open files keep the superblock alive until revocation closes them. Note that NFSD remains pinned until revocation completes. The pin infrastructure is placed in a new file (pin.c) because it is agnostic to NFS protocol version. This will become more apparent with the next patch. Signed-off-by: Chuck Lever --- fs/nfsd/Makefile | 2 +- fs/nfsd/netns.h | 4 + fs/nfsd/nfs4state.c | 26 +++++ fs/nfsd/nfsctl.c | 10 +- fs/nfsd/pin.c | 272 ++++++++++++++++++++++++++++++++++++++++++++ fs/nfsd/state.h | 7 ++ 6 files changed, 318 insertions(+), 3 deletions(-) create mode 100644 fs/nfsd/pin.c diff --git a/fs/nfsd/Makefile b/fs/nfsd/Makefile index f0da4d69dc74..b9ef1fe13164 100644 --- a/fs/nfsd/Makefile +++ b/fs/nfsd/Makefile @@ -13,7 +13,7 @@ nfsd-y += trace.o nfsd-y += nfssvc.o nfsctl.o nfsfh.o vfs.o \ export.o auth.o lockd.o nfscache.o \ stats.o filecache.o nfs3proc.o nfs3xdr.o \ - netlink.o + netlink.o pin.o nfsd-$(CONFIG_NFSD_V2) += nfsproc.o nfsxdr.o nfsd-$(CONFIG_NFSD_V2_ACL) += nfs2acl.o nfsd-$(CONFIG_NFSD_V3_ACL) += nfs3acl.o diff --git a/fs/nfsd/netns.h b/fs/nfsd/netns.h index 9fa600602658..d9cf8e4f8ae9 100644 --- a/fs/nfsd/netns.h +++ b/fs/nfsd/netns.h @@ -13,6 +13,7 @@ #include #include #include +#include #include #include #include @@ -219,6 +220,9 @@ struct nfsd_net { /* last time an admin-revoke happened for NFSv4.0 */ time64_t nfs40_last_revoke; + /* fs_pin tracking for automatic state revocation on 
unmount */ + struct xarray nfsd_sb_pins; + #if IS_ENABLED(CONFIG_NFS_LOCALIO) /* Local clients to be invalidated when net is shut down */ spinlock_t local_clients_lock; diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index 1efab85c647d..dc4ff2035bf0 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -6451,6 +6451,16 @@ nfsd4_process_open2(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nf status = nfserr_bad_stateid; if (nfsd4_is_deleg_cur(open)) goto out; + /* + * Pin the superblock so unmount can trigger revocation + * of NFSv4 state (opens, locks, delegations) held by + * clients on this filesystem. nfsd_pin_sb() returns + * immediately if a pin already exists for this sb. + */ + status = nfsd_pin_sb(SVC_NET(rqstp), + current_fh->fh_export->ex_path.mnt); + if (status) + goto out; } if (!stp) { @@ -8987,6 +8997,8 @@ static int nfs4_state_create_net(struct net *net) spin_lock_init(&nn->blocked_locks_lock); INIT_LIST_HEAD(&nn->blocked_locks_lru); + nfsd_sb_pins_init(nn); + INIT_DELAYED_WORK(&nn->laundromat_work, laundromat_main); /* Make sure this cannot run until client tracking is initialised */ disable_delayed_work(&nn->laundromat_work); @@ -9104,6 +9116,8 @@ nfs4_state_shutdown_net(struct net *net) struct list_head *pos, *next, reaplist; struct nfsd_net *nn = net_generic(net, nfsd_net_id); + nfsd_sb_pins_shutdown(nn); + shrinker_free(nn->nfsd_client_shrinker); cancel_work_sync(&nn->nfsd_shrinker_work); disable_delayed_work_sync(&nn->laundromat_work); @@ -9458,6 +9472,18 @@ nfsd_get_dir_deleg(struct nfsd4_compound_state *cstate, if (rfp != fp) { put_nfs4_file(fp); fp = rfp; + } else { + /* + * Pin the superblock so unmount can trigger revocation + * of directory delegations held by clients on this + * filesystem. nfsd_pin_sb() returns immediately if a + * pin already exists for this sb. 
+ */ + if (nfsd_pin_sb(clp->net, + cstate->current_fh.fh_export->ex_path.mnt)) { + put_nfs4_file(fp); + return ERR_PTR(-EAGAIN); + } } /* if this client already has one, return that it's unavailable */ diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c index 30caefb2522f..5fccc88ece76 100644 --- a/fs/nfsd/nfsctl.c +++ b/fs/nfsd/nfsctl.c @@ -2325,9 +2325,12 @@ static int __init init_nfsd(void) retval = nfsd4_create_laundry_wq(); if (retval) goto out_free_cld; + retval = nfsd_pin_init(); + if (retval) + goto out_free_laundry; retval = register_filesystem(&nfsd_fs_type); if (retval) - goto out_free_nfsd4; + goto out_free_pin; retval = genl_register_family(&nfsd_nl_family); if (retval) goto out_free_filesystem; @@ -2341,7 +2344,9 @@ static int __init init_nfsd(void) genl_unregister_family(&nfsd_nl_family); out_free_filesystem: unregister_filesystem(&nfsd_fs_type); -out_free_nfsd4: +out_free_pin: + nfsd_pin_exit(); +out_free_laundry: nfsd4_destroy_laundry_wq(); out_free_cld: unregister_cld_notifier(); @@ -2364,6 +2369,7 @@ static void __exit exit_nfsd(void) remove_proc_entry("fs/nfs", NULL); genl_unregister_family(&nfsd_nl_family); unregister_filesystem(&nfsd_fs_type); + nfsd_pin_exit(); nfsd4_destroy_laundry_wq(); unregister_cld_notifier(); unregister_pernet_subsys(&nfsd_net_ops); diff --git a/fs/nfsd/pin.c b/fs/nfsd/pin.c new file mode 100644 index 000000000000..eefa4baff82c --- /dev/null +++ b/fs/nfsd/pin.c @@ -0,0 +1,272 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Filesystem pin management for NFSD. + * + * When a local filesystem is unmounted while NFS clients hold state, + * this code automatically revokes that state so the unmount can proceed. + * + * Copyright (C) 2025 Oracle. All rights reserved. 
+ * + * Author: Chuck Lever + */ + +#include +#include +#include +#include +#include + +#include "nfsd.h" +#include "netns.h" +#include "state.h" + +#define NFSDDBG_FACILITY NFSDDBG_PROC + +static struct workqueue_struct *nfsd_pin_wq; + +/* + * Structure to track fs_pin per superblock for automatic state revocation + * when a filesystem is unmounted. + */ +struct nfsd_fs_pin { + struct fs_pin pin; + struct super_block *sb; + struct net *net; + struct work_struct work; + struct completion done; + struct rcu_head rcu; +}; + +static void nfsd_fs_pin_kill(struct fs_pin *pin); + +static void nfsd_fs_pin_free_rcu(struct rcu_head *rcu) +{ + struct nfsd_fs_pin *p = container_of(rcu, struct nfsd_fs_pin, rcu); + + put_net(p->net); + kfree(p); +} + +/* + * Work function for nfsd_fs_pin - runs in process context. + * Cancels async COPYs, releases NLM locks, and revokes NFSv4 state for + * the superblock. + */ +static void nfsd_fs_pin_work(struct work_struct *work) +{ + struct nfsd_fs_pin *p = container_of(work, struct nfsd_fs_pin, work); + struct nfsd_net *nn = net_generic(p->net, nfsd_net_id); + + pr_info("nfsd: unmount of %s, revoking NFS state\n", p->sb->s_id); + + nfsd4_cancel_copy_by_sb(p->net, p->sb); + /* Errors are logged by lockd; no recovery is possible. */ + (void)nlmsvc_unlock_all_by_sb(p->sb); + nfsd4_revoke_states(nn, p->sb); + + pr_info("nfsd: state revocation for %s complete\n", p->sb->s_id); + + pin_remove(&p->pin); + complete(&p->done); + call_rcu(&p->rcu, nfsd_fs_pin_free_rcu); +} + +/* Interval for progress warnings during unmount (in seconds) */ +#define NFSD_STATE_REVOKE_INTERVAL 30 + +/** + * nfsd_fs_pin_kill - Kill callback for nfsd_fs_pin + * @pin: fs_pin representing filesystem to be unmounted + * + * Queues state revocation and waits for completion. If interrupted, + * returns early; the work function handles cleanup. Open files keep + * the superblock alive until revocation closes them. 
+ * + * Synchronization with nfsd_sb_pins_destroy(): xa_erase() is atomic, + * so exactly one of the two paths erases the entry and triggers work. + */ +static void nfsd_fs_pin_kill(struct fs_pin *pin) +{ + struct nfsd_fs_pin *p = container_of(pin, struct nfsd_fs_pin, pin); + struct nfsd_net *nn = net_generic(p->net, nfsd_net_id); + unsigned int elapsed = 0; + long ret; + + if (!xa_erase(&nn->nfsd_sb_pins, (unsigned long)p->sb)) + return; + + queue_work(nfsd_pin_wq, &p->work); + + /* + * Block until state revocation completes. Periodic warnings help + * diagnose stuck operations (e.g., re-exports of an NFS mount + * whose backend server is unresponsive). + * + * The work function handles pin_remove() and freeing, so this + * callback can return early on interrupt. Open files keep the + * superblock alive until revocation closes them. Note that NFSD + * remains pinned until revocation completes. + */ + for (;;) { + ret = wait_for_completion_interruptible_timeout(&p->done, + NFSD_STATE_REVOKE_INTERVAL * HZ); + if (ret > 0) + return; + + if (ret == -ERESTARTSYS) { + /* + * Interrupted by signal. If the work has not yet + * started, cancel it and run in this context: a + * successful cancel_work() means no other context + * will execute the work function, so it must run + * here to ensure state revocation occurs. + * + * If already running, return and let work complete + * in background; open files keep superblock alive. 
+ */ + if (cancel_work(&p->work)) { + pr_warn("nfsd: unmount of %s interrupted, revoking state in unmount context\n", + p->sb->s_id); + nfsd_fs_pin_work(&p->work); + return; + } + pr_warn("nfsd: unmount of %s interrupted; mount remains pinned until state revocation completes\n", + p->sb->s_id); + return; + } + + /* Timed out - print warning and continue waiting */ + elapsed += NFSD_STATE_REVOKE_INTERVAL; + pr_warn("nfsd: unmount of %s blocked for %u seconds waiting for NFS state revocation\n", + p->sb->s_id, elapsed); + } +} + +/** + * nfsd_pin_sb - register a superblock to enable state revocation + * @net: network namespace + * @mnt: vfsmount for the filesystem + * + * If NFS state is created for a file on this filesystem, pin the + * superblock so the kill callback can revoke that state on unmount. + * Returns nfs_ok on success, or an NFS error on failure. + * + * This function is idempotent - if a pin already exists for the + * superblock, no new pin is created. + */ +__be32 nfsd_pin_sb(struct net *net, struct vfsmount *mnt) +{ + struct nfsd_net *nn = net_generic(net, nfsd_net_id); + struct super_block *sb = mnt->mnt_sb; + struct nfsd_fs_pin *new, *old; + + old = xa_load(&nn->nfsd_sb_pins, (unsigned long)sb); + if (old) + return nfs_ok; + + new = kzalloc(sizeof(*new), GFP_KERNEL); + if (!new) + return nfserr_jukebox; + + new->sb = sb; + new->net = get_net(net); + init_fs_pin(&new->pin, nfsd_fs_pin_kill); + INIT_WORK(&new->work, nfsd_fs_pin_work); + init_completion(&new->done); + + old = xa_cmpxchg(&nn->nfsd_sb_pins, (unsigned long)sb, NULL, new, + GFP_KERNEL); + if (old) { + /* + * Another task beat us to it. Even if the winner has not + * yet called pin_insert_sb(), returning here is safe: the + * caller holds an open file reference that prevents + * unmount from completing until state creation finishes. 
+ */ + put_net(new->net); + kfree(new); + return nfs_ok; + } + + pin_insert_sb(&new->pin, mnt); + + /* + * Callers hold an open file reference, so unmount cannot clear + * SB_ACTIVE while this function executes. Warn if this assumption + * is violated, but handle it gracefully by cleaning up and + * returning an error. + */ + if (WARN_ON_ONCE(!(READ_ONCE(sb->s_flags) & SB_ACTIVE))) { + new = xa_erase(&nn->nfsd_sb_pins, (unsigned long)sb); + if (new) { + pin_remove(&new->pin); + call_rcu(&new->rcu, nfsd_fs_pin_free_rcu); + } + return nfserr_stale; + } + + return nfs_ok; +} + +/** + * nfsd_sb_pins_init - initialize the superblock pins xarray + * @nn: nfsd_net for this network namespace + */ +void nfsd_sb_pins_init(struct nfsd_net *nn) +{ + xa_init(&nn->nfsd_sb_pins); +} + +/* + * Clean up all fs_pins during NFSD shutdown. + * + * xa_erase() synchronizes with nfsd_fs_pin_kill(): the path that + * successfully erases an xarray entry performs cleanup for that pin. + * A NULL return indicates the VFS unmount path is performing cleanup. + */ +static void nfsd_sb_pins_destroy(struct nfsd_net *nn) +{ + struct nfsd_fs_pin *p; + unsigned long index; + + xa_for_each(&nn->nfsd_sb_pins, index, p) { + p = xa_erase(&nn->nfsd_sb_pins, index); + if (!p) + continue; /* VFS unmount path handling this pin */ + pin_remove(&p->pin); + call_rcu(&p->rcu, nfsd_fs_pin_free_rcu); + } + xa_destroy(&nn->nfsd_sb_pins); +} + +/** + * nfsd_sb_pins_shutdown - shutdown superblock pins for a network namespace + * @nn: nfsd_net for this network namespace + * + * Must be called during nfsd shutdown before tearing down client state. + * Flushes any pending work and waits for RCU callbacks to complete. + */ +void nfsd_sb_pins_shutdown(struct nfsd_net *nn) +{ + nfsd_sb_pins_destroy(nn); + flush_workqueue(nfsd_pin_wq); + /* + * Wait for RCU callbacks from nfsd_sb_pins_destroy() to complete. 
+ * These callbacks release network namespace references via put_net() + * which must happen before the namespace teardown continues. + */ + rcu_barrier(); +} + +int nfsd_pin_init(void) +{ + nfsd_pin_wq = alloc_workqueue("nfsd_pin", WQ_UNBOUND, 0); + if (!nfsd_pin_wq) + return -ENOMEM; + return 0; +} + +void nfsd_pin_exit(void) +{ + destroy_workqueue(nfsd_pin_wq); +} diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h index e415b8200fff..1494dd34759f 100644 --- a/fs/nfsd/state.h +++ b/fs/nfsd/state.h @@ -853,6 +853,13 @@ static inline void nfsd4_revoke_states(struct nfsd_net *nn, struct super_block * } #endif +/* superblock pin management (pin.c) */ +int nfsd_pin_init(void); +void nfsd_pin_exit(void); +__be32 nfsd_pin_sb(struct net *net, struct vfsmount *mnt); +void nfsd_sb_pins_init(struct nfsd_net *nn); +void nfsd_sb_pins_shutdown(struct nfsd_net *nn); + /* grace period management */ bool nfsd4_force_end_grace(struct nfsd_net *nn); -- 2.52.0 From: Chuck Lever When a filesystem is unmounted while NFS is exporting it, the unmount can fail with EBUSY even after NFSv4 state has been revoked. This is because the nfsd_file cache can hold open NFSv2/3 file handles that pin the filesystem, preventing the unmount from completing. Extend the mechanism that revokes NFSv4 state on unmount to also close cached file handles. nfsd_file_close_sb() walks the nfsd_file cache and disposes of entries belonging to the target superblock. It runs after NFSv4 state revocation, so it handles only NFSv2/3 file handles that remain in the cache. Entries still under construction (with nf_file not yet set) are skipped; these have no open file to close. 
Signed-off-by: Chuck Lever --- fs/nfsd/filecache.c | 44 ++++++++++++++++++++++++++++++++++++++++++++ fs/nfsd/filecache.h | 1 + fs/nfsd/pin.c | 6 ++++-- 3 files changed, 49 insertions(+), 2 deletions(-) diff --git a/fs/nfsd/filecache.c b/fs/nfsd/filecache.c index 93798575b807..b921a9553f36 100644 --- a/fs/nfsd/filecache.c +++ b/fs/nfsd/filecache.c @@ -894,6 +894,50 @@ __nfsd_file_cache_purge(struct net *net) nfsd_file_dispose_list(&dispose); } +/** + * nfsd_file_close_sb - close GC-managed cached files for a superblock + * @sb: target superblock + * + * Walk the nfsd_file cache and close out GC-managed entries (those + * acquired via nfsd_file_acquire_gc) that belong to @sb. Called during + * filesystem unmount after NFSv4 state revocation to release remaining + * cached file handles that may be pinning the filesystem. + */ +void nfsd_file_close_sb(struct super_block *sb) +{ + struct rhashtable_iter iter; + struct nfsd_file *nf; + LIST_HEAD(dispose); + + mutex_lock(&nfsd_mutex); + if (test_bit(NFSD_FILE_CACHE_UP, &nfsd_file_flags) == 0) { + mutex_unlock(&nfsd_mutex); + return; + } + + rhltable_walk_enter(&nfsd_file_rhltable, &iter); + do { + rhashtable_walk_start(&iter); + + nf = rhashtable_walk_next(&iter); + while (!IS_ERR_OR_NULL(nf)) { + if (test_bit(NFSD_FILE_GC, &nf->nf_flags) == 0) + goto next; + /* Skip entries under construction (nf_file not yet set) */ + if (nf->nf_file && file_inode(nf->nf_file)->i_sb == sb) + nfsd_file_cond_queue(nf, &dispose); +next: + nf = rhashtable_walk_next(&iter); + } + + rhashtable_walk_stop(&iter); + } while (nf == ERR_PTR(-EAGAIN)); + rhashtable_walk_exit(&iter); + mutex_unlock(&nfsd_mutex); + + nfsd_file_dispose_list(&dispose); +} + static struct nfsd_fcache_disposal * nfsd_alloc_fcache_disposal(void) { diff --git a/fs/nfsd/filecache.h b/fs/nfsd/filecache.h index b383dbc5b921..66ca7fc6189b 100644 --- a/fs/nfsd/filecache.h +++ b/fs/nfsd/filecache.h @@ -70,6 +70,7 @@ struct net *nfsd_file_put_local(struct nfsd_file __rcu **nf); 
struct nfsd_file *nfsd_file_get(struct nfsd_file *nf); struct file *nfsd_file_file(struct nfsd_file *nf); void nfsd_file_close_inode_sync(struct inode *inode); +void nfsd_file_close_sb(struct super_block *sb); void nfsd_file_net_dispose(struct nfsd_net *nn); bool nfsd_file_is_cached(struct inode *inode); __be32 nfsd_file_acquire_gc(struct svc_rqst *rqstp, struct svc_fh *fhp, diff --git a/fs/nfsd/pin.c b/fs/nfsd/pin.c index eefa4baff82c..a404611c20a0 100644 --- a/fs/nfsd/pin.c +++ b/fs/nfsd/pin.c @@ -19,6 +19,7 @@ #include "nfsd.h" #include "netns.h" #include "state.h" +#include "filecache.h" #define NFSDDBG_FACILITY NFSDDBG_PROC @@ -49,8 +50,8 @@ static void nfsd_fs_pin_free_rcu(struct rcu_head *rcu) /* * Work function for nfsd_fs_pin - runs in process context. - * Cancels async COPYs, releases NLM locks, and revokes NFSv4 state for - * the superblock. + * Cancels async COPYs, releases NLM locks, revokes NFSv4 state, and closes + * cached NFSv2/3 files for the superblock. */ static void nfsd_fs_pin_work(struct work_struct *work) { @@ -63,6 +64,7 @@ static void nfsd_fs_pin_work(struct work_struct *work) /* Errors are logged by lockd; no recovery is possible. */ (void)nlmsvc_unlock_all_by_sb(p->sb); nfsd4_revoke_states(nn, p->sb); + nfsd_file_close_sb(p->sb); pr_info("nfsd: state revocation for %s complete\n", p->sb->s_id); -- 2.52.0