Convert wait_caps_flush() from a single unbounded wait_event() into a loop of wait_event_timeout() calls, each with a 60-second timeout. The overall wait is still unbounded: the loop repeats until the flush completes. Each time a timeout expires with the flush still incomplete, dump the pending cap_flush list (for at most the first 5 timeouts) to aid debugging of hung flush scenarios. Add a ci back-pointer to struct ceph_cap_flush so the diagnostic dump can identify which inodes have outstanding flushes. Add i_last_cap_flush_ack to ceph_inode_info to record the tid of the most recently received cap flush ack for each inode. Signed-off-by: Alex Markuze --- fs/ceph/caps.c | 7 +++++++ fs/ceph/mds_client.c | 41 +++++++++++++++++++++++++++++++++++++++-- fs/ceph/super.h | 2 ++ 3 files changed, 48 insertions(+), 2 deletions(-) diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c index d51454e995a8..be030fb8e864 100644 --- a/fs/ceph/caps.c +++ b/fs/ceph/caps.c @@ -1648,6 +1648,7 @@ static void __ceph_flush_snaps(struct ceph_inode_info *ci, spin_lock(&mdsc->cap_dirty_lock); capsnap->cap_flush.tid = ++mdsc->last_cap_flush_tid; + capsnap->cap_flush.ci = ci; list_add_tail(&capsnap->cap_flush.g_list, &mdsc->cap_flush_list); if (oldest_flush_tid == 0) @@ -1846,6 +1847,9 @@ struct ceph_cap_flush *ceph_alloc_cap_flush(void) return NULL; cf->is_capsnap = false; + cf->ci = NULL; + cf->tid = 0; + cf->wake = false; return cf; } @@ -1931,6 +1935,7 @@ static u64 __mark_caps_flushing(struct inode *inode, doutc(cl, "%p %llx.%llx now !dirty\n", inode, ceph_vinop(inode)); swap(cf, ci->i_prealloc_cap_flush); + cf->ci = ci; cf->caps = flushing; cf->wake = wake; @@ -3826,6 +3831,8 @@ static void handle_cap_flush_ack(struct inode *inode, u64 flush_tid, bool wake_ci = false; bool wake_mdsc = false; + ci->i_last_cap_flush_ack = flush_tid; + list_for_each_entry_safe(cf, tmp_cf, &ci->i_cap_flush_list, i_list) { /* Is this the one that was flushed?
*/ if (cf->tid == flush_tid) diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index 28bb27b09b40..e27f2f148dea 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -27,6 +27,8 @@ #include #define RECONNECT_MAX_SIZE (INT_MAX - PAGE_SIZE) +#define CEPH_CAP_FLUSH_WAIT_TIMEOUT_SEC 60 +#define CEPH_CAP_FLUSH_MAX_DUMP_COUNT 5 /* * A cluster of MDS (metadata server) daemons is responsible for @@ -2285,6 +2287,34 @@ static int check_caps_flush(struct ceph_mds_client *mdsc, return ret; } +static void dump_cap_flushes(struct ceph_mds_client *mdsc, u64 want_tid) +{ + struct ceph_client *cl = mdsc->fsc->client; + struct ceph_cap_flush *cf; + + pr_info_client(cl, "still waiting for cap flushes through %llu:\n", + want_tid); + spin_lock(&mdsc->cap_dirty_lock); + list_for_each_entry(cf, &mdsc->cap_flush_list, g_list) { + if (cf->tid > want_tid) + break; + if (!cf->ci) { + pr_info_client(cl, + "(null ci) %s tid=%llu wake=%d%s\n", + ceph_cap_string(cf->caps), cf->tid, + cf->wake, + cf->is_capsnap ? " is_capsnap" : ""); + continue; + } + pr_info_client(cl, "%llx:%llx %s %llu %llu %d%s\n", + ceph_vinop(&cf->ci->netfs.inode), + ceph_cap_string(cf->caps), cf->tid, + cf->ci->i_last_cap_flush_ack, cf->wake, + cf->is_capsnap ? " is_capsnap" : ""); + } + spin_unlock(&mdsc->cap_dirty_lock); +} + /* * flush all dirty inode data to disk. 
* @@ -2294,11 +2324,18 @@ static void wait_caps_flush(struct ceph_mds_client *mdsc, u64 want_flush_tid) { struct ceph_client *cl = mdsc->fsc->client; + int i = 0; + long ret; doutc(cl, "want %llu\n", want_flush_tid); - wait_event(mdsc->cap_flushing_wq, - check_caps_flush(mdsc, want_flush_tid)); + do { + ret = wait_event_timeout(mdsc->cap_flushing_wq, + check_caps_flush(mdsc, want_flush_tid), + CEPH_CAP_FLUSH_WAIT_TIMEOUT_SEC * HZ); + if (ret == 0 && i++ < CEPH_CAP_FLUSH_MAX_DUMP_COUNT) + dump_cap_flushes(mdsc, want_flush_tid); + } while (ret == 0); doutc(cl, "ok, flushed thru %llu\n", want_flush_tid); } diff --git a/fs/ceph/super.h b/fs/ceph/super.h index 69a71848240f..9e80c816aa7a 100644 --- a/fs/ceph/super.h +++ b/fs/ceph/super.h @@ -238,6 +238,7 @@ struct ceph_cap_flush { bool is_capsnap; /* true means capsnap */ struct list_head g_list; // global struct list_head i_list; // per inode + struct ceph_inode_info *ci; }; /* @@ -443,6 +444,7 @@ struct ceph_inode_info { struct ceph_snap_context *i_head_snapc; /* set if wr_buffer_head > 0 or dirty|flushing caps */ unsigned i_snap_caps; /* cap bits for snapped files */ + u64 i_last_cap_flush_ack; /* latest cap flush_ack tid for this inode */ unsigned long i_last_rd; unsigned long i_last_wr; -- 2.34.1