For block devices, sequential write performance is significantly better
than random write. Currently, zram's write-back function only supports
single-page operations, which fails to leverage the sequential write
advantage and leads to suboptimal performance. This patch implements
multi-page batch write-back for zram to leverage the sequential write
performance of block devices.

After applying this patch, the following test commands show a large
number of pages being merged into batched write operations, which is
where the write-back performance improvement comes from.

mount -t debugfs none /sys/kernel/debug/
echo "block:block_bio_frontmerge" >> /sys/kernel/debug/tracing/set_event
echo "block:block_bio_backmerge" >> /sys/kernel/debug/tracing/set_event
cat /sys/kernel/debug/tracing/trace_pipe &
echo "page_indexes=1-10000" > /sys/block/zram0/writeback

Signed-off-by: Yuwen Chen
Reviewed-by: Fengyu Lian
---
 drivers/block/zram/zram_drv.c | 214 +++++++++++++++++++++++++---------
 1 file changed, 161 insertions(+), 53 deletions(-)

diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c
index a43074657531..85ef07455eda 100644
--- a/drivers/block/zram/zram_drv.c
+++ b/drivers/block/zram/zram_drv.c
@@ -734,21 +734,125 @@ static void read_from_bdev_async(struct zram *zram, struct page *page,
 	submit_bio(bio);
 }
 
-static int zram_writeback_slots(struct zram *zram, struct zram_pp_ctl *ctl)
-{
-	unsigned long blk_idx = 0;
-	struct page *page = NULL;
+enum {
+	ZRAM_WB_WORK_ALLOCATED = 0,
+	ZRAM_WB_WORK_COMPLETED,
+};
+
+struct zram_writeback_work {
+	struct completion *done;
+	unsigned long blk_idx;
+	struct page *page;
 	struct zram_pp_slot *pps;
 	struct bio_vec bio_vec;
 	struct bio bio;
-	int ret = 0, err;
-	u32 index;
+	unsigned long flags;
+};
 
-	page = alloc_page(GFP_KERNEL);
-	if (!page)
+static int zram_writeback_complete(struct zram *zram, struct zram_writeback_work *work)
+{
+	u32 index = 0;
+	int err;
+
+	if (!test_and_clear_bit(ZRAM_WB_WORK_COMPLETED, &work->flags))
+		return 0;
+
+	err = blk_status_to_errno(work->bio.bi_status);
+	if (err)
+		return err;
+
+	index = work->pps->index;
+	atomic64_inc(&zram->stats.bd_writes);
+	zram_slot_lock(zram, index);
+	/*
+	 * Same as above, we release slot lock during writeback so
+	 * slot can change under us: slot_free() or slot_free() and
+	 * reallocation (zram_write_page()). In both cases slot loses
+	 * ZRAM_PP_SLOT flag. No concurrent post-processing can set
+	 * ZRAM_PP_SLOT on such slots until current post-processing
+	 * finishes.
+	 */
+	if (!zram_test_flag(zram, index, ZRAM_PP_SLOT))
+		goto next;
+
+	zram_free_page(zram, index);
+	zram_set_flag(zram, index, ZRAM_WB);
+	zram_set_handle(zram, index, work->blk_idx);
+	work->blk_idx = 0;
+	atomic64_inc(&zram->stats.pages_stored);
+	spin_lock(&zram->wb_limit_lock);
+	if (zram->wb_limit_enable && zram->bd_wb_limit > 0)
+		zram->bd_wb_limit -= 1UL << (PAGE_SHIFT - 12);
+	spin_unlock(&zram->wb_limit_lock);
+
+next:
+	zram_slot_unlock(zram, index);
+	release_pp_slot(zram, work->pps);
+	work->pps = NULL;
+	return 0;
+}
+
+static void zram_writeback_endio(struct bio *bio)
+{
+	struct zram_writeback_work *work = bio->bi_private;
+
+	set_bit(ZRAM_WB_WORK_COMPLETED, &work->flags);
+	clear_bit(ZRAM_WB_WORK_ALLOCATED, &work->flags);
+	complete(work->done);
+}
+
+static struct zram_writeback_work *zram_writeback_next_work(struct zram_writeback_work **pool,
+							     int size, int *off)
+{
+	struct zram_writeback_work *work = NULL;
+	int i = 0;
+
+	for (i = *off; i < size + *off; i++) {
+		work = pool[i % size];
+		if (!work->page)
+			continue;
+
+		if (!test_and_set_bit(ZRAM_WB_WORK_ALLOCATED, &work->flags)) {
+			*off = (i + 1) % size;
+			return work;
+		}
+	}
+	return NULL;
+}
+
+#define ZRAM_WRITEBACK_BIO_SIZE (32)
+static int zram_writeback_slots(struct zram *zram, struct zram_pp_ctl *ctl)
+{
+	int ret = 0, err, i = 0, off = 0;
+	int work_pool_size = 0;
+	struct zram_writeback_work work_prealloc[2] = {0};
+	struct zram_writeback_work *work_pool[ZRAM_WRITEBACK_BIO_SIZE] = {NULL};
+	struct zram_writeback_work *work = NULL;
+	DECLARE_COMPLETION_ONSTACK(done);
+	u32 index = 0;
+	struct blk_plug plug;
+
+	for (i = 0; i < ARRAY_SIZE(work_pool); i++) {
+		if (i < ARRAY_SIZE(work_prealloc)) {
+			work_pool[i] = &work_prealloc[i];
+		} else {
+			work_pool[i] = kzalloc(sizeof(*work), GFP_KERNEL);
+			if (!work_pool[i])
+				break;
+		}
+		work_pool[i]->done = &done;
+		work_pool[i]->flags = 0;
+		work_pool[i]->page = alloc_page(GFP_KERNEL);
+		if (!work_pool[i]->page)
+			break;
+		work = work_pool[i];
+	}
+	if (!work)
 		return -ENOMEM;
+	work_pool_size = i;
+	set_bit(ZRAM_WB_WORK_ALLOCATED, &work->flags);
 
-	while ((pps = select_pp_slot(ctl))) {
+	blk_start_plug(&plug);
+	while ((work->pps = select_pp_slot(ctl))) {
 		spin_lock(&zram->wb_limit_lock);
 		if (zram->wb_limit_enable && !zram->bd_wb_limit) {
 			spin_unlock(&zram->wb_limit_lock);
@@ -757,15 +861,15 @@ static int zram_writeback_slots(struct zram *zram, struct zram_pp_ctl *ctl)
 		}
 		spin_unlock(&zram->wb_limit_lock);
 
-		if (!blk_idx) {
-			blk_idx = alloc_block_bdev(zram);
-			if (!blk_idx) {
+		if (!work->blk_idx) {
+			work->blk_idx = alloc_block_bdev(zram);
+			if (!work->blk_idx) {
 				ret = -ENOSPC;
 				break;
 			}
 		}
 
-		index = pps->index;
+		index = work->pps->index;
 		zram_slot_lock(zram, index);
 		/*
 		 * scan_slots() sets ZRAM_PP_SLOT and relases slot lock, so
@@ -775,22 +879,32 @@
 		 */
 		if (!zram_test_flag(zram, index, ZRAM_PP_SLOT))
 			goto next;
-		if (zram_read_from_zspool(zram, page, index))
+		if (zram_read_from_zspool(zram, work->page, index))
 			goto next;
 		zram_slot_unlock(zram, index);
 
-		bio_init(&bio, zram->bdev, &bio_vec, 1,
+		bio_init(&work->bio, zram->bdev, &work->bio_vec, 1,
 			 REQ_OP_WRITE | REQ_SYNC);
-		bio.bi_iter.bi_sector = blk_idx * (PAGE_SIZE >> 9);
-		__bio_add_page(&bio, page, PAGE_SIZE, 0);
-
-		/*
-		 * XXX: A single page IO would be inefficient for write
-		 * but it would be not bad as starter.
-		 */
-		err = submit_bio_wait(&bio);
+		work->bio.bi_iter.bi_sector = work->blk_idx * (PAGE_SIZE >> 9);
+		work->bio.bi_end_io = zram_writeback_endio;
+		work->bio.bi_private = work;
+		__bio_add_page(&work->bio, work->page, PAGE_SIZE, 0);
+
+		list_del_init(&work->pps->entry);
+		submit_bio(&work->bio);
+
+		do {
+			work = zram_writeback_next_work(work_pool, work_pool_size, &off);
+			if (!work) {
+				blk_finish_plug(&plug);
+				wait_for_completion_io(&done);
+				blk_start_plug(&plug);
+			}
+		} while (!work);
+		err = zram_writeback_complete(zram, work);
 		if (err) {
-			release_pp_slot(zram, pps);
+			release_pp_slot(zram, work->pps);
+			work->pps = NULL;
 			/*
 			 * BIO errors are not fatal, we continue and simply
 			 * attempt to writeback the remaining objects (pages).
@@ -800,43 +914,37 @@ static int zram_writeback_slots(struct zram *zram, struct zram_pp_ctl *ctl)
 			 * the most recent BIO error.
 			 */
 			ret = err;
-			continue;
 		}
+		cond_resched();
+		continue;
 
-		atomic64_inc(&zram->stats.bd_writes);
-		zram_slot_lock(zram, index);
-		/*
-		 * Same as above, we release slot lock during writeback so
-		 * slot can change under us: slot_free() or slot_free() and
-		 * reallocation (zram_write_page()). In both cases slot loses
-		 * ZRAM_PP_SLOT flag. No concurrent post-processing can set
-		 * ZRAM_PP_SLOT on such slots until current post-processing
-		 * finishes.
-		 */
-		if (!zram_test_flag(zram, index, ZRAM_PP_SLOT))
-			goto next;
-
-		zram_free_page(zram, index);
-		zram_set_flag(zram, index, ZRAM_WB);
-		zram_set_handle(zram, index, blk_idx);
-		blk_idx = 0;
-		atomic64_inc(&zram->stats.pages_stored);
-		spin_lock(&zram->wb_limit_lock);
-		if (zram->wb_limit_enable && zram->bd_wb_limit > 0)
-			zram->bd_wb_limit -= 1UL << (PAGE_SHIFT - 12);
-		spin_unlock(&zram->wb_limit_lock);
 next:
 		zram_slot_unlock(zram, index);
-		release_pp_slot(zram, pps);
-
+		release_pp_slot(zram, work->pps);
+		work->pps = NULL;
 		cond_resched();
 	}
+	blk_finish_plug(&plug);
 
-	if (blk_idx)
-		free_block_bdev(zram, blk_idx);
-	if (page)
-		__free_page(page);
+	if (work)
+		clear_bit(ZRAM_WB_WORK_ALLOCATED, &work->flags);
+	for (i = 0; i < work_pool_size; i++) {
+		while (test_bit(ZRAM_WB_WORK_ALLOCATED, &work_pool[i]->flags))
+			wait_for_completion_io(&done);
+		err = zram_writeback_complete(zram, work_pool[i]);
+		if (err) {
+			release_pp_slot(zram, work_pool[i]->pps);
+			work_pool[i]->pps = NULL;
+			ret = err;
+		}
+		if (work_pool[i]->blk_idx)
+			free_block_bdev(zram, work_pool[i]->blk_idx);
+		if (work_pool[i]->page)
+			__free_page(work_pool[i]->page);
+		if (i >= ARRAY_SIZE(work_prealloc))
+			kfree(work_pool[i]);
+	}
 	return ret;
 }
-- 
2.34.1
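
For reviewers who want to exercise the batched write-back path end to end rather than via an explicit index range, here is a minimal setup sketch using only the documented zram sysfs interface. It assumes CONFIG_ZRAM_WRITEBACK=y; the /dev/loop0 backing device and the 1G disksize are placeholders, not part of this patch:

modprobe zram num_devices=1
echo /dev/loop0 > /sys/block/zram0/backing_dev
echo 1G > /sys/block/zram0/disksize
mkswap /dev/zram0
swapon /dev/zram0
# generate some swap traffic, then mark resident pages idle
echo all > /sys/block/zram0/idle
echo idle > /sys/block/zram0/writeback

With the block trace events from the test above still enabled, the idle write-back pass should likewise show bio front/back merges as consecutive pages are written to the backing device.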