When a layout conflict triggers a recall, enforcing a timeout is necessary to prevent excessive nfsd threads from being blocked in __break_lease ensuring the server continues servicing incoming requests efficiently. This patch introduces a new function to lease_manager_operations: lm_breaker_timedout: Invoked when a lease recall times out and is about to be disposed of. This function enables the lease manager to inform the caller whether the file_lease should remain on the flc_list or be disposed of. For the NFSD lease manager, this function now handles layout recall timeouts. If the layout type supports fencing and the client has not been fenced, a fence operation is triggered to prevent the client from accessing the block device. Fencing operation is done asynchronously using a system worker. This is to allow lease_modify to trigger the fencing opeation when layout recall timed out. To ensure layout stateid remains valid while the fencing operation is in progress, a reference count is added to layout stateid before schedule the system worker to do the fencing operation. The reference count is released after the fencing operation is complete. While the fencing operation is in progress, the conflicting file_lease remains on the flc_list until fencing is complete. This guarantees that no other clients can access the file, and the client with exclusive access is properly blocked before disposal. Signed-off-by: Dai Ngo --- Documentation/filesystems/locking.rst | 2 + fs/locks.c | 29 ++++++++++- fs/nfsd/blocklayout.c | 19 ++++++- fs/nfsd/nfs4layouts.c | 74 ++++++++++++++++++++++++--- fs/nfsd/nfs4state.c | 1 + fs/nfsd/state.h | 3 ++ include/linux/filelock.h | 2 + 7 files changed, 122 insertions(+), 8 deletions(-) v2: . Update Subject line to include fencing operation. . Allow conflicting lease to remain on flc_list until fencing is complete. . Use system worker to perform fencing operation asynchronously. . Use nfs4_stid.sc_count to ensure layout stateid remains valid before starting the fencing operation, nfs4_stid.sc_count is released after fencing operation is complete. . Rework nfsd4_scsi_fence_client to: . wait until fencing to complete before exiting. . wait until fencing in progress to complete before checking the NFSD_MDS_PR_FENCED flag. . Remove lm_need_to_retry from lease_manager_operations. diff --git a/Documentation/filesystems/locking.rst b/Documentation/filesystems/locking.rst index 04c7691e50e0..f7fe2c1ee32b 100644 --- a/Documentation/filesystems/locking.rst +++ b/Documentation/filesystems/locking.rst @@ -403,6 +403,7 @@ prototypes:: bool (*lm_breaker_owns_lease)(struct file_lock *); bool (*lm_lock_expirable)(struct file_lock *); void (*lm_expire_lock)(void); + void (*lm_breaker_timedout)(struct file_lease *); locking rules: @@ -417,6 +418,7 @@ lm_breaker_owns_lease: yes no no lm_lock_expirable yes no no lm_expire_lock no no yes lm_open_conflict yes no no +lm_breaker_timedout no no yes ====================== ============= ================= ========= buffer_head diff --git a/fs/locks.c b/fs/locks.c index 46f229f740c8..28e63aa87f74 100644 --- a/fs/locks.c +++ b/fs/locks.c @@ -1487,6 +1487,25 @@ static void lease_clear_pending(struct file_lease *fl, int arg) } } +/* + * A layout lease is about to be disposed. If the disposal is + * due to a layout recall timeout, consult the lease manager + * to see whether the operation should continue. + * + * Return true if the lease should be disposed else return + * false. + */ +static bool lease_want_dispose(struct file_lease *fl) +{ + if (!(fl->c.flc_flags & FL_BREAKER_TIMEDOUT)) + return true; + + if (fl->fl_lmops && fl->fl_lmops->lm_breaker_timedout && + (!fl->fl_lmops->lm_breaker_timedout(fl))) + return false; + return true; +} + /* We already had a lease on this file; just change its type */ int lease_modify(struct file_lease *fl, int arg, struct list_head *dispose) { @@ -1494,6 +1513,11 @@ int lease_modify(struct file_lease *fl, int arg, struct list_head *dispose) if (error) return error; + + if ((fl->c.flc_flags & FL_LAYOUT) && (arg == F_UNLCK) && + (!lease_want_dispose(fl))) + return 0; + lease_clear_pending(fl, arg); locks_wake_up_blocks(&fl->c); if (arg == F_UNLCK) { @@ -1531,8 +1555,11 @@ static void time_out_leases(struct inode *inode, struct list_head *dispose) trace_time_out_leases(inode, fl); if (past_time(fl->fl_downgrade_time)) lease_modify(fl, F_RDLCK, dispose); - if (past_time(fl->fl_break_time)) + + if (past_time(fl->fl_break_time)) { + fl->c.flc_flags |= FL_BREAKER_TIMEDOUT; lease_modify(fl, F_UNLCK, dispose); + } } } diff --git a/fs/nfsd/blocklayout.c b/fs/nfsd/blocklayout.c index 7ba9e2dd0875..05ddff5a4005 100644 --- a/fs/nfsd/blocklayout.c +++ b/fs/nfsd/blocklayout.c @@ -443,6 +443,14 @@ nfsd4_scsi_proc_layoutcommit(struct inode *inode, struct svc_rqst *rqstp, return nfsd4_block_commit_blocks(inode, lcp, iomaps, nr_iomaps); } +/* + * Perform the fence operation to prevent the client from accessing the + * block device. If a fence operation is already in progress, wait for + * it to complete before checking the NFSD_MDS_PR_FENCED flag. Once the + * operation is complete, check the flag. If NFSD_MDS_PR_FENCED is set, + * update the layout stateid by setting the ls_fenced flag to indicate + * that the client has been fenced. + */ static void nfsd4_scsi_fence_client(struct nfs4_layout_stateid *ls, struct nfsd_file *file) { @@ -450,8 +458,13 @@ nfsd4_scsi_fence_client(struct nfs4_layout_stateid *ls, struct nfsd_file *file) struct block_device *bdev = file->nf_file->f_path.mnt->mnt_sb->s_bdev; int status; - if (nfsd4_scsi_fence_set(clp, bdev->bd_dev)) + mutex_lock(&clp->cl_fence_mutex); + if (nfsd4_scsi_fence_set(clp, bdev->bd_dev)) { + ls->ls_fenced = true; + mutex_unlock(&clp->cl_fence_mutex); + nfs4_put_stid(&ls->ls_stid); return; + } status = bdev->bd_disk->fops->pr_ops->pr_preempt(bdev, NFSD_MDS_PR_KEY, nfsd4_scsi_pr_key(clp), @@ -475,6 +488,10 @@ nfsd4_scsi_fence_client(struct nfs4_layout_stateid *ls, struct nfsd_file *file) status == PR_STS_PATH_FAST_FAILED || status == PR_STS_RETRY_PATH_FAILURE) nfsd4_scsi_fence_clear(clp, bdev->bd_dev); + else + ls->ls_fenced = true; + mutex_unlock(&clp->cl_fence_mutex); + nfs4_put_stid(&ls->ls_stid); trace_nfsd_pnfs_fence(clp, bdev->bd_disk->disk_name, status); } diff --git a/fs/nfsd/nfs4layouts.c b/fs/nfsd/nfs4layouts.c index ad7af8cfcf1f..4a11ccd5b0a5 100644 --- a/fs/nfsd/nfs4layouts.c +++ b/fs/nfsd/nfs4layouts.c @@ -222,6 +222,27 @@ nfsd4_layout_setlease(struct nfs4_layout_stateid *ls) return 0; } +static void +nfsd4_layout_fence_worker(struct work_struct *work) +{ + struct nfsd_file *nf; + struct delayed_work *dwork = to_delayed_work(work); + struct nfs4_layout_stateid *ls = container_of(dwork, + struct nfs4_layout_stateid, ls_fence_work); + u32 type; + + rcu_read_lock(); + nf = nfsd_file_get(ls->ls_file); + rcu_read_unlock(); + if (!nf) + return; + + type = ls->ls_layout_type; + if (nfsd4_layout_ops[type]->fence_client) + nfsd4_layout_ops[type]->fence_client(ls, nf); + nfsd_file_put(nf); +} + static struct nfs4_layout_stateid * nfsd4_alloc_layout_stateid(struct nfsd4_compound_state *cstate, struct nfs4_stid *parent, u32 layout_type) @@ -271,6 +292,9 @@ nfsd4_alloc_layout_stateid(struct nfsd4_compound_state *cstate, list_add(&ls->ls_perfile, &fp->fi_lo_states); spin_unlock(&fp->fi_lock); + INIT_DELAYED_WORK(&ls->ls_fence_work, nfsd4_layout_fence_worker); + ls->ls_fenced = false; + trace_nfsd_layoutstate_alloc(&ls->ls_stid.sc_stateid); return ls; } @@ -708,9 +732,10 @@ nfsd4_cb_layout_done(struct nfsd4_callback *cb, struct rpc_task *task) rcu_read_unlock(); if (fl) { ops = nfsd4_layout_ops[ls->ls_layout_type]; - if (ops->fence_client) + if (ops->fence_client) { + refcount_inc(&ls->ls_stid.sc_count); ops->fence_client(ls, fl); - else + } else nfsd4_cb_layout_fail(ls, fl); nfsd_file_put(fl); } @@ -747,11 +772,9 @@ static bool nfsd4_layout_lm_break(struct file_lease *fl) { /* - * We don't want the locks code to timeout the lease for us; - * we'll remove it ourself if a layout isn't returned - * in time: + * Enforce break lease timeout to prevent NFSD + * thread from hanging in __break_lease. */ - fl->fl_break_time = 0; nfsd4_recall_file_layout(fl->c.flc_owner); return false; } @@ -782,10 +805,49 @@ nfsd4_layout_lm_open_conflict(struct file *filp, int arg) return 0; } +/** + * nfsd4_layout_lm_breaker_timedout - The layout recall has timed out. + * If the layout type supports a fence operation, schedule a worker to + * fence the client from accessing the block device. + * + * @fl: file to check + * + * Return true if the file lease should be disposed of by the caller; + * otherwise, return false. + */ +static bool +nfsd4_layout_lm_breaker_timedout(struct file_lease *fl) +{ + struct nfs4_layout_stateid *ls = fl->c.flc_owner; + bool ret; + + if (!nfsd4_layout_ops[ls->ls_layout_type]->fence_client) + return true; + if (ls->ls_fenced) + return true; + + if (work_busy(&ls->ls_fence_work.work)) + return false; + /* Schedule work to do the fence operation */ + ret = mod_delayed_work(system_dfl_wq, &ls->ls_fence_work, 0); + if (!ret) { + /* + * If there is no pending work, mod_delayed_work queues + * new task. While fencing is in progress, a reference + * count is added to the layout stateid to ensure its + * validity. This reference count is released once fencing + * has been completed. + */ + refcount_inc(&ls->ls_stid.sc_count); + } + return false; +} + static const struct lease_manager_operations nfsd4_layouts_lm_ops = { .lm_break = nfsd4_layout_lm_break, .lm_change = nfsd4_layout_lm_change, .lm_open_conflict = nfsd4_layout_lm_open_conflict, + .lm_breaker_timedout = nfsd4_layout_lm_breaker_timedout, }; int diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index 583c13b5aaf3..a57fa3318362 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -2385,6 +2385,7 @@ static struct nfs4_client *alloc_client(struct xdr_netobj name, #endif #ifdef CONFIG_NFSD_SCSILAYOUT xa_init(&clp->cl_dev_fences); + mutex_init(&clp->cl_fence_mutex); #endif INIT_LIST_HEAD(&clp->async_copies); spin_lock_init(&clp->async_lock); diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h index 713f55ef6554..d9a3c966a35f 100644 --- a/fs/nfsd/state.h +++ b/fs/nfsd/state.h @@ -529,6 +529,7 @@ struct nfs4_client { time64_t cl_ra_time; #ifdef CONFIG_NFSD_SCSILAYOUT struct xarray cl_dev_fences; + struct mutex cl_fence_mutex; #endif }; @@ -738,6 +739,8 @@ struct nfs4_layout_stateid { stateid_t ls_recall_sid; bool ls_recalled; struct mutex ls_mutex; + struct delayed_work ls_fence_work; + bool ls_fenced; }; static inline struct nfs4_layout_stateid *layoutstateid(struct nfs4_stid *s) diff --git a/include/linux/filelock.h b/include/linux/filelock.h index 2f5e5588ee07..6939952f2088 100644 --- a/include/linux/filelock.h +++ b/include/linux/filelock.h @@ -17,6 +17,7 @@ #define FL_OFDLCK 1024 /* lock is "owned" by struct file */ #define FL_LAYOUT 2048 /* outstanding pNFS layout */ #define FL_RECLAIM 4096 /* reclaiming from a reboot server */ +#define FL_BREAKER_TIMEDOUT 8192 /* lease breaker timed out */ #define FL_CLOSE_POSIX (FL_POSIX | FL_CLOSE) @@ -50,6 +51,7 @@ struct lease_manager_operations { void (*lm_setup)(struct file_lease *, void **); bool (*lm_breaker_owns_lease)(struct file_lease *); int (*lm_open_conflict)(struct file *, int); + bool (*lm_breaker_timedout)(struct file_lease *fl); }; struct lock_manager { -- 2.47.3