From: Jihan LIN <linjh22s@gmail.com>

Current per-CPU streams limit write concurrency to the number of online
CPUs. For zcomp backends that support zcomp-managed streams, we can
utilize them for async write requests to get better parallelism.

Modify zram_write_page() to accept a flag indicating whether the request
is asynchronous. If the bio request is not synchronous and the backend
supports zcomp-managed streams, attempt to acquire a zcomp-managed
stream.

Although zram_write_page() currently waits for compression to complete,
making the operation appear synchronous, using zcomp-managed streams
has the potential to improve parallelism for async write requests,
provided the backend uses efficient stream management or a
specialized implementation.

zcomp_stream_get() handles the fallback to per-CPU streams.
zram_read_page() continues to use the generic per-CPU streams, since
reads are always treated as synchronous by op_is_sync().

Support for multiple pages within a single bio request is deferred to
keep this patch simple and focused.

Signed-off-by: Jihan LIN <linjh22s@gmail.com>
---
 drivers/block/zram/zram_drv.c | 24 ++++++++++++++++++++----
 1 file changed, 20 insertions(+), 4 deletions(-)

diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c
index 2e5a1415e9034674e14e619f486052cd21098f50..655d0e141c621ca38ca1059780c1a8a00258c868 100644
--- a/drivers/block/zram/zram_drv.c
+++ b/drivers/block/zram/zram_drv.c
@@ -1966,6 +1966,7 @@ static int read_compressed_page(struct zram *zram, struct page *page, u32 index)
 	size = zram_get_obj_size(zram, index);
 	prio = zram_get_priority(zram, index);
 
+	/* Reads are treated as synchronous, see op_is_sync(). */
 	zstrm = zcomp_stream_get(zram->comps[prio], ZSTRM_DEFAULT);
 	src = zs_obj_read_begin(zram->mem_pool, handle, zstrm->local_copy);
 	dst = kmap_local_page(page);
@@ -2105,7 +2106,8 @@ static int write_incompressible_page(struct zram *zram, struct page *page,
 	return 0;
 }
 
-static int zram_write_page(struct zram *zram, struct page *page, u32 index)
+static int zram_write_page(struct zram *zram, struct page *page, u32 index,
+			   bool is_async)
 {
 	int ret = 0;
 	unsigned long handle;
@@ -2121,7 +2123,19 @@ static int zram_write_page(struct zram *zram, struct page *page, u32 index)
 	if (same_filled)
 		return write_same_filled_page(zram, element, index);
 
-	zstrm = zcomp_stream_get(zram->comps[ZRAM_PRIMARY_COMP], ZSTRM_DEFAULT);
+	/*
+	 * Using a zcomp-managed stream and waiting for compression to
+	 * complete makes this appear synchronous.
+	 *
+	 * At this time, zram_bio_write handles pages one by one.
+	 * However, zcomp-managed streams allow threads to submit jobs
+	 * to zcomp without the lock contention in per-cpu streams.
+	 * This might give us better parallelism than the generic per-cpu
+	 * streams could, assuming zcomp uses efficient streams management
+	 * or utilizes the specialized implementations.
+	 */
+	zstrm = zcomp_stream_get(zram->comps[ZRAM_PRIMARY_COMP],
+				 is_async ? ZSTRM_PREFER_MGMT : ZSTRM_DEFAULT);
 	mem = kmap_local_page(page);
 	ret = zcomp_compress(zram->comps[ZRAM_PRIMARY_COMP], zstrm,
 			     mem, &comp_len);
@@ -2183,7 +2197,8 @@ static int zram_bvec_write_partial(struct zram *zram, struct bio_vec *bvec,
 	ret = zram_read_page(zram, page, index, bio);
 	if (!ret) {
 		memcpy_from_bvec(page_address(page) + offset, bvec);
-		ret = zram_write_page(zram, page, index);
+		ret = zram_write_page(zram, page, index,
+				      !op_is_sync(bio->bi_opf));
 	}
 	__free_page(page);
 	return ret;
@@ -2194,7 +2209,8 @@ static int zram_bvec_write(struct zram *zram, struct bio_vec *bvec,
 {
 	if (is_partial_io(bvec))
 		return zram_bvec_write_partial(zram, bvec, index, offset, bio);
-	return zram_write_page(zram, bvec->bv_page, index);
+	return zram_write_page(zram, bvec->bv_page, index,
+			       !op_is_sync(bio->bi_opf));
 }
 
 #ifdef CONFIG_ZRAM_MULTI_COMP

-- 
2.51.0