Commit 670d21c6e17f ("fuse: remove reliance on bdi congestion")
introduced a FUSE-specific solution for limiting the number of
outstanding background requests. Unlike the previous bdi congestion
mechanism, this algorithm actually works and limits the number of
outstanding background requests based on the congestion threshold. As
a result, some workloads, such as buffered sequential reads over FUSE,
got slower (from ~1.3 GB/s to ~1.05 GB/s). The fio command to
reproduce is:

  fio --filename=//file.250g --rw=read --bs=4K \
      --numjobs=32 --ioengine=libaio --iodepth=4 \
      --offset_increment=1G --size=1G

This happens because FUSE sends requests up to the congestion
threshold and then throttles any further async readahead until the
number of background requests drops below the threshold. By the time
that happens, the congestion has eased and the disk is idle.

To fix this problem and make FUSE react faster to eased congestion,
block waiting for congestion to resolve instead of aborting async
readahead. This improves the buffered sequential read throughput back
to 1.3 GB/s.

This approach is inspired by the fix made for NFS writeback in commit
2f1f31042ef0 ("nfs: Block on write congestion").

Signed-off-by: Abhishek Angale
---
 fs/fuse/dev.c    |  2 ++
 fs/fuse/file.c   | 14 ++++++++++----
 fs/fuse/fuse_i.h |  3 +++
 fs/fuse/inode.c  |  1 +
 4 files changed, 16 insertions(+), 4 deletions(-)

diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c
index 6d59cbc877c6..b4befe21165f 100644
--- a/fs/fuse/dev.c
+++ b/fs/fuse/dev.c
@@ -496,6 +496,8 @@ void fuse_request_end(struct fuse_req *req)
 
 		fc->num_background--;
 		fc->active_background--;
+		if (fc->num_background < fc->congestion_threshold)
+			wake_up_all(&fc->bg_congestion_wait);
 		flush_bg_queue(fc);
 		spin_unlock(&fc->bg_lock);
 	} else {
diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index 6014d588845c..0cfcd27e7991 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -975,12 +975,18 @@ static void fuse_readahead(struct readahead_control *rac)
 		unsigned int pages = 0;
 
 		if (fc->num_background >= fc->congestion_threshold &&
-		    rac->ra->async_size >= readahead_count(rac))
+		    rac->ra->async_size >= readahead_count(rac)) {
 			/*
-			 * Congested and only async pages left, so skip the
-			 * rest.
+			 * Congested and only async pages left, wait
+			 * until congestion eases.
 			 */
-			break;
+			int err;
+
+			err = wait_event_killable(fc->bg_congestion_wait,
+					fc->num_background < fc->congestion_threshold);
+			if (err)
+				break;
+		}
 
 		ia = fuse_io_alloc(NULL, cur_pages);
 		if (!ia)
diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h
index 026c6c0de3f4..008ac2fa6a76 100644
--- a/fs/fuse/fuse_i.h
+++ b/fs/fuse/fuse_i.h
@@ -722,6 +722,9 @@ struct fuse_conn {
 	/** waitq for blocked connection */
 	wait_queue_head_t blocked_waitq;
 
+	/** waitq for async readaheads until congestion eases */
+	wait_queue_head_t bg_congestion_wait;
+
 	/** Connection established, cleared on umount, connection
 	    abort and device release */
 	unsigned connected;
diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
index 21e04c394a80..973f70064e89 100644
--- a/fs/fuse/inode.c
+++ b/fs/fuse/inode.c
@@ -979,6 +979,7 @@ void fuse_conn_init(struct fuse_conn *fc, struct fuse_mount *fm,
 	atomic_set(&fc->epoch, 1);
 	INIT_WORK(&fc->epoch_work, fuse_epoch_work);
 	init_waitqueue_head(&fc->blocked_waitq);
+	init_waitqueue_head(&fc->bg_congestion_wait);
 	fuse_iqueue_init(&fc->iq, fiq_ops, fiq_priv);
 	INIT_LIST_HEAD(&fc->bg_queue);
 	INIT_LIST_HEAD(&fc->entry);
-- 
2.34.1