From: Yuwen Chen

Currently, zram writeback supports only a single bio writeback
operation, waiting for bio completion before post-processing the
next pp-slot. This works, in general, but has certain throughput
limitations. Implement batched (multiple) bio writeback support to
take advantage of parallel request processing and better request
scheduling.

For the time being the writeback batch size (maximum number of
in-flight bio requests) is set to 1, so the behavior is the same as
the previous single-bio writeback. This is addressed in a follow-up
patch, which adds a writeback_batch_size device attribute.

Please refer to [1] and [2] for benchmarks.

[1] https://lore.kernel.org/linux-block/tencent_B2DC37E3A2AED0E7F179365FCB5D82455B08@qq.com
[2] https://lore.kernel.org/linux-block/tencent_0FBBFC8AE0B97BC63B5D47CE1FF2BABFDA09@qq.com

[senozhatsky: significantly reworked the initial patch so that the
 approach and implementation resemble current zram post-processing
 code]

Signed-off-by: Yuwen Chen
Co-developed-by: Richard Chang
Co-developed-by: Sergey Senozhatsky
---
 drivers/block/zram/zram_drv.c | 323 +++++++++++++++++++++++++++-------
 1 file changed, 255 insertions(+), 68 deletions(-)

diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c
index a43074657531..92af848d81f5 100644
--- a/drivers/block/zram/zram_drv.c
+++ b/drivers/block/zram/zram_drv.c
@@ -734,20 +734,206 @@ static void read_from_bdev_async(struct zram *zram, struct page *page,
 	submit_bio(bio);
 }
 
-static int zram_writeback_slots(struct zram *zram, struct zram_pp_ctl *ctl)
-{
-	unsigned long blk_idx = 0;
-	struct page *page = NULL;
+struct zram_wb_ctl {
+	struct list_head idle_reqs;
+	struct list_head inflight_reqs;
+
+	atomic_t num_inflight;
+	struct completion done;
+	struct blk_plug plug;
+};
+
+struct zram_wb_req {
+	unsigned long blk_idx;
+	struct page *page;
 	struct zram_pp_slot *pps;
 	struct bio_vec bio_vec;
 	struct bio bio;
-	int ret = 0, err;
+
+	struct list_head entry;
+};
+
+static void release_wb_req(struct zram_wb_req *req)
+{
+	__free_page(req->page);
+	kfree(req);
+}
+
+static void release_wb_ctl(struct zram_wb_ctl *wb_ctl)
+{
+	/* We should never have inflight requests at this point */
+	WARN_ON(!list_empty(&wb_ctl->inflight_reqs));
+
+	while (!list_empty(&wb_ctl->idle_reqs)) {
+		struct zram_wb_req *req;
+
+		req = list_first_entry(&wb_ctl->idle_reqs,
+				       struct zram_wb_req, entry);
+		list_del(&req->entry);
+		release_wb_req(req);
+	}
+
+	kfree(wb_ctl);
+}
+
+/* XXX: should be a per-device sysfs attr */
+#define ZRAM_WB_REQ_CNT 1
+
+static struct zram_wb_ctl *init_wb_ctl(void)
+{
+	struct zram_wb_ctl *wb_ctl;
+	int i;
+
+	wb_ctl = kmalloc(sizeof(*wb_ctl), GFP_KERNEL);
+	if (!wb_ctl)
+		return NULL;
+
+	INIT_LIST_HEAD(&wb_ctl->idle_reqs);
+	INIT_LIST_HEAD(&wb_ctl->inflight_reqs);
+	atomic_set(&wb_ctl->num_inflight, 0);
+	init_completion(&wb_ctl->done);
+
+	for (i = 0; i < ZRAM_WB_REQ_CNT; i++) {
+		struct zram_wb_req *req;
+
+		/*
+		 * This is a fatal condition only if we couldn't allocate
+		 * any requests at all. Otherwise we just work with the
+		 * requests that we have successfully allocated, so that
+		 * writeback can still proceed, even if there is only one
+		 * request on the idle list.
+		 */
+		req = kzalloc(sizeof(*req), GFP_NOIO | __GFP_NOWARN);
+		if (!req)
+			break;
+
+		req->page = alloc_page(GFP_NOIO | __GFP_NOWARN);
+		if (!req->page) {
+			kfree(req);
+			break;
+		}
+
+		INIT_LIST_HEAD(&req->entry);
+		list_add(&req->entry, &wb_ctl->idle_reqs);
+	}
+
+	/* We couldn't allocate any requests, so writeback is not possible */
+	if (list_empty(&wb_ctl->idle_reqs))
+		goto release_wb_ctl;
+
+	return wb_ctl;
+
+release_wb_ctl:
+	release_wb_ctl(wb_ctl);
+	return NULL;
+}
+
+static int zram_writeback_complete(struct zram *zram, struct zram_wb_req *req)
+{
 	u32 index;
+	int err;
 
-	page = alloc_page(GFP_KERNEL);
-	if (!page)
-		return -ENOMEM;
+	index = req->pps->index;
+	release_pp_slot(zram, req->pps);
+	req->pps = NULL;
+
+	err = blk_status_to_errno(req->bio.bi_status);
+	if (err)
+		return err;
+
+	atomic64_inc(&zram->stats.bd_writes);
+	zram_slot_lock(zram, index);
+	/*
+	 * We release slot lock during writeback so slot can change under us:
+	 * slot_free() or slot_free() and zram_write_page(). In both cases
+	 * slot loses ZRAM_PP_SLOT flag. No concurrent post-processing can
+	 * set ZRAM_PP_SLOT on such slots until current post-processing
+	 * finishes.
+	 */
+	if (!zram_test_flag(zram, index, ZRAM_PP_SLOT))
+		goto out;
+
+	zram_free_page(zram, index);
+	zram_set_flag(zram, index, ZRAM_WB);
+	zram_set_handle(zram, index, req->blk_idx);
+	atomic64_inc(&zram->stats.pages_stored);
+	spin_lock(&zram->wb_limit_lock);
+	if (zram->wb_limit_enable && zram->bd_wb_limit > 0)
+		zram->bd_wb_limit -= 1UL << (PAGE_SHIFT - 12);
+	spin_unlock(&zram->wb_limit_lock);
+
+out:
+	zram_slot_unlock(zram, index);
+	return 0;
+}
+
+static void zram_writeback_endio(struct bio *bio)
+{
+	struct zram_wb_ctl *wb_ctl = bio->bi_private;
+
+	if (atomic_dec_return(&wb_ctl->num_inflight) == 0)
+		complete(&wb_ctl->done);
+}
+
+static void zram_submit_wb_request(struct zram_wb_ctl *wb_ctl,
+				   struct zram_wb_req *req)
+{
+	atomic_inc(&wb_ctl->num_inflight);
+	list_add_tail(&req->entry, &wb_ctl->inflight_reqs);
+	submit_bio(&req->bio);
+}
+
+static struct zram_wb_req *select_idle_req(struct zram_wb_ctl *wb_ctl)
+{
+	struct zram_wb_req *req = NULL;
+
+	if (!list_empty(&wb_ctl->idle_reqs)) {
+		req = list_first_entry(&wb_ctl->idle_reqs,
+				       struct zram_wb_req, entry);
+		list_del(&req->entry);
+	}
+
+	return req;
+}
+
+static int zram_wb_wait_for_completion(struct zram *zram,
+				       struct zram_wb_ctl *wb_ctl)
+{
+	int ret = 0;
+
+	if (atomic_read(&wb_ctl->num_inflight) == 0)
+		return 0;
+
+	wait_for_completion_io(&wb_ctl->done);
+	reinit_completion(&wb_ctl->done);
+
+	while (!list_empty(&wb_ctl->inflight_reqs)) {
+		struct zram_wb_req *req;
+		int err;
+
+		req = list_first_entry(&wb_ctl->inflight_reqs,
+				       struct zram_wb_req, entry);
+		list_move(&req->entry, &wb_ctl->idle_reqs);
+
+		err = zram_writeback_complete(zram, req);
+		if (err)
+			ret = err;
+	}
+
+	return ret;
+}
+
+static int zram_writeback_slots(struct zram *zram,
+				struct zram_pp_ctl *ctl,
+				struct zram_wb_ctl *wb_ctl)
+{
+	struct zram_wb_req *req = NULL;
+	unsigned long blk_idx = 0;
+	struct zram_pp_slot *pps;
+	int ret = 0, err;
+	u32 index = 0;
 
+	blk_start_plug(&wb_ctl->plug);
 	while ((pps = select_pp_slot(ctl))) {
 		spin_lock(&zram->wb_limit_lock);
 		if (zram->wb_limit_enable && !zram->bd_wb_limit) {
@@ -757,15 +943,34 @@ static int zram_writeback_slots(struct zram *zram, struct zram_pp_ctl *ctl)
 		}
 		spin_unlock(&zram->wb_limit_lock);
 
+		while (!req) {
+			req = select_idle_req(wb_ctl);
+			if (req)
+				break;
+
+			blk_finish_plug(&wb_ctl->plug);
+			err = zram_wb_wait_for_completion(zram, wb_ctl);
+			blk_start_plug(&wb_ctl->plug);
+			/*
+			 * BIO errors are not fatal, we continue and simply
+			 * attempt to writeback the remaining objects (pages).
+			 * At the same time we need to signal user-space that
+			 * some writes (at least one, but also could be all of
+			 * them) were not successful and we do so by returning
+			 * the most recent BIO error.
+			 */
+			if (err)
+				ret = err;
+		}
+
 		if (!blk_idx) {
 			blk_idx = alloc_block_bdev(zram);
 			if (!blk_idx) {
 				ret = -ENOSPC;
 				break;
 			}
 		}
 
 		index = pps->index;
 		zram_slot_lock(zram, index);
 		/*
 		 * scan_slots() sets ZRAM_PP_SLOT and relases slot lock, so
@@ -775,67 +980,41 @@ static int zram_writeback_slots(struct zram *zram, struct zram_pp_ctl *ctl)
 		 */
 		if (!zram_test_flag(zram, index, ZRAM_PP_SLOT))
 			goto next;
-		if (zram_read_from_zspool(zram, page, index))
+		if (zram_read_from_zspool(zram, req->page, index))
 			goto next;
 		zram_slot_unlock(zram, index);
 
-		bio_init(&bio, zram->bdev, &bio_vec, 1,
+		req->blk_idx = blk_idx;
+		req->pps = pps;
+		bio_init(&req->bio, zram->bdev, &req->bio_vec, 1,
 			 REQ_OP_WRITE | REQ_SYNC);
-		bio.bi_iter.bi_sector = blk_idx * (PAGE_SIZE >> 9);
-		__bio_add_page(&bio, page, PAGE_SIZE, 0);
+		req->bio.bi_iter.bi_sector = req->blk_idx * (PAGE_SIZE >> 9);
+		req->bio.bi_end_io = zram_writeback_endio;
+		req->bio.bi_private = wb_ctl;
+		__bio_add_page(&req->bio, req->page, PAGE_SIZE, 0);
 
-		/*
-		 * XXX: A single page IO would be inefficient for write
-		 * but it would be not bad as starter.
-		 */
-		err = submit_bio_wait(&bio);
-		if (err) {
-			release_pp_slot(zram, pps);
-			/*
-			 * BIO errors are not fatal, we continue and simply
-			 * attempt to writeback the remaining objects (pages).
-			 * At the same time we need to signal user-space that
-			 * some writes (at least one, but also could be all of
-			 * them) were not successful and we do so by returning
-			 * the most recent BIO error.
-			 */
-			ret = err;
-			continue;
-		}
-
-		atomic64_inc(&zram->stats.bd_writes);
-		zram_slot_lock(zram, index);
-		/*
-		 * Same as above, we release slot lock during writeback so
-		 * slot can change under us: slot_free() or slot_free() and
-		 * reallocation (zram_write_page()). In both cases slot loses
-		 * ZRAM_PP_SLOT flag. No concurrent post-processing can set
-		 * ZRAM_PP_SLOT on such slots until current post-processing
-		 * finishes.
-		 */
-		if (!zram_test_flag(zram, index, ZRAM_PP_SLOT))
-			goto next;
-
-		zram_free_page(zram, index);
-		zram_set_flag(zram, index, ZRAM_WB);
-		zram_set_handle(zram, index, blk_idx);
+		zram_submit_wb_request(wb_ctl, req);
 		blk_idx = 0;
-		atomic64_inc(&zram->stats.pages_stored);
-		spin_lock(&zram->wb_limit_lock);
-		if (zram->wb_limit_enable && zram->bd_wb_limit > 0)
-			zram->bd_wb_limit -= 1UL << (PAGE_SHIFT - 12);
-		spin_unlock(&zram->wb_limit_lock);
+		req = NULL;
+		continue;
+
 next:
 		zram_slot_unlock(zram, index);
 		release_pp_slot(zram, pps);
-
 		cond_resched();
 	}
 
-	if (blk_idx)
-		free_block_bdev(zram, blk_idx);
-	if (page)
-		__free_page(page);
+	/*
+	 * Selected idle req, but never submitted it due to some error or
+	 * wb limit.
+	 */
+	if (req)
+		release_wb_req(req);
+
+	blk_finish_plug(&wb_ctl->plug);
+	err = zram_wb_wait_for_completion(zram, wb_ctl);
+	if (err)
+		ret = err;
 	return ret;
 }
 
@@ -948,7 +1127,8 @@ static ssize_t writeback_store(struct device *dev,
 	struct zram *zram = dev_to_zram(dev);
 	u64 nr_pages = zram->disksize >> PAGE_SHIFT;
 	unsigned long lo = 0, hi = nr_pages;
-	struct zram_pp_ctl *ctl = NULL;
+	struct zram_pp_ctl *pp_ctl = NULL;
+	struct zram_wb_ctl *wb_ctl = NULL;
 	char *args, *param, *val;
 	ssize_t ret = len;
 	int err, mode = 0;
@@ -970,8 +1150,14 @@ static ssize_t writeback_store(struct device *dev,
 		goto release_init_lock;
 	}
 
-	ctl = init_pp_ctl();
-	if (!ctl) {
+	pp_ctl = init_pp_ctl();
+	if (!pp_ctl) {
+		ret = -ENOMEM;
+		goto release_init_lock;
+	}
+
+	wb_ctl = init_wb_ctl();
+	if (!wb_ctl) {
 		ret = -ENOMEM;
 		goto release_init_lock;
 	}
@@ -1000,7 +1186,7 @@ static ssize_t writeback_store(struct device *dev,
 			goto release_init_lock;
 		}
 
-		scan_slots_for_writeback(zram, mode, lo, hi, ctl);
+		scan_slots_for_writeback(zram, mode, lo, hi, pp_ctl);
 		break;
 	}
 
@@ -1011,7 +1197,7 @@ static ssize_t writeback_store(struct device *dev,
 			goto release_init_lock;
 		}
 
-		scan_slots_for_writeback(zram, mode, lo, hi, ctl);
+		scan_slots_for_writeback(zram, mode, lo, hi, pp_ctl);
 		break;
 	}
 
@@ -1022,7 +1208,7 @@ static ssize_t writeback_store(struct device *dev,
 			goto release_init_lock;
 		}
 
-		scan_slots_for_writeback(zram, mode, lo, hi, ctl);
+		scan_slots_for_writeback(zram, mode, lo, hi, pp_ctl);
 		continue;
 	}
 
@@ -1033,17 +1219,18 @@ static ssize_t writeback_store(struct device *dev,
 			goto release_init_lock;
 		}
 
-		scan_slots_for_writeback(zram, mode, lo, hi, ctl);
+		scan_slots_for_writeback(zram, mode, lo, hi, pp_ctl);
 		continue;
 	}
 	}
 
-	err = zram_writeback_slots(zram, ctl);
+	err = zram_writeback_slots(zram, pp_ctl, wb_ctl);
 	if (err)
 		ret = err;
 
 release_init_lock:
-	release_pp_ctl(zram, ctl);
+	release_pp_ctl(zram, pp_ctl);
+	release_wb_ctl(wb_ctl);
 	atomic_set(&zram->pp_in_progress, 0);
 	up_read(&zram->init_lock);
 
-- 
2.51.2.1041.gc1ab5b90ca-goog


Introduce a writeback_batch_size device attribute so that the maximum
number of in-flight writeback bio requests can be configured at
run-time per device. This essentially enables batched bio writeback.
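
For example (the device name and batch size below are illustrative,
assuming a zram0 device with a backing device already configured),
the batch size can be raised before triggering writeback:

  echo 32 > /sys/block/zram0/writeback_batch_size
  echo idle > /sys/block/zram0/writeback

Writing 0 is treated as 1, which keeps the previous single-bio
writeback behavior.
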
Signed-off-by: Sergey Senozhatsky
---
 drivers/block/zram/zram_drv.c | 50 ++++++++++++++++++++++++++++++-----
 drivers/block/zram/zram_drv.h |  1 +
 2 files changed, 45 insertions(+), 6 deletions(-)

diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c
index 92af848d81f5..d5afe5956a1f 100644
--- a/drivers/block/zram/zram_drv.c
+++ b/drivers/block/zram/zram_drv.c
@@ -570,6 +570,44 @@ static ssize_t writeback_limit_show(struct device *dev,
 	return sysfs_emit(buf, "%llu\n", val);
 }
 
+static ssize_t writeback_batch_size_store(struct device *dev,
+					  struct device_attribute *attr,
+					  const char *buf, size_t len)
+{
+	struct zram *zram = dev_to_zram(dev);
+	u32 val;
+	ssize_t ret = -EINVAL;
+
+	if (kstrtouint(buf, 10, &val))
+		return ret;
+
+	if (!val)
+		val = 1;
+
+	down_read(&zram->init_lock);
+	zram->wb_batch_size = val;
+	up_read(&zram->init_lock);
+	ret = len;
+
+	return ret;
+}
+
+static ssize_t writeback_batch_size_show(struct device *dev,
+					 struct device_attribute *attr,
+					 char *buf)
+{
+	u32 val;
+	struct zram *zram = dev_to_zram(dev);
+
+	down_read(&zram->init_lock);
+	spin_lock(&zram->wb_limit_lock);
+	val = zram->wb_batch_size;
+	spin_unlock(&zram->wb_limit_lock);
+	up_read(&zram->init_lock);
+
+	return sysfs_emit(buf, "%u\n", val);
+}
+
 static void reset_bdev(struct zram *zram)
 {
 	if (!zram->backing_dev)
@@ -776,10 +814,7 @@ static void release_wb_ctl(struct zram_wb_ctl *wb_ctl)
 	kfree(wb_ctl);
 }
 
-/* XXX: should be a per-device sysfs attr */
-#define ZRAM_WB_REQ_CNT 1
-
-static struct zram_wb_ctl *init_wb_ctl(void)
+static struct zram_wb_ctl *init_wb_ctl(struct zram *zram)
 {
 	struct zram_wb_ctl *wb_ctl;
 	int i;
@@ -793,7 +828,7 @@ static struct zram_wb_ctl *init_wb_ctl(void)
 	atomic_set(&wb_ctl->num_inflight, 0);
 	init_completion(&wb_ctl->done);
 
-	for (i = 0; i < ZRAM_WB_REQ_CNT; i++) {
+	for (i = 0; i < zram->wb_batch_size; i++) {
 		struct zram_wb_req *req;
 
 		/*
@@ -1156,7 +1191,7 @@ static ssize_t writeback_store(struct device *dev,
 		goto release_init_lock;
 	}
 
-	wb_ctl = init_wb_ctl();
+	wb_ctl = init_wb_ctl(zram);
 	if (!wb_ctl) {
 		ret = -ENOMEM;
 		goto release_init_lock;
@@ -2797,6 +2832,7 @@ static DEVICE_ATTR_RW(backing_dev);
 static DEVICE_ATTR_WO(writeback);
 static DEVICE_ATTR_RW(writeback_limit);
 static DEVICE_ATTR_RW(writeback_limit_enable);
+static DEVICE_ATTR_RW(writeback_batch_size);
 #endif
 #ifdef CONFIG_ZRAM_MULTI_COMP
 static DEVICE_ATTR_RW(recomp_algorithm);
@@ -2818,6 +2854,7 @@ static struct attribute *zram_disk_attrs[] = {
 	&dev_attr_writeback.attr,
 	&dev_attr_writeback_limit.attr,
 	&dev_attr_writeback_limit_enable.attr,
+	&dev_attr_writeback_batch_size.attr,
 #endif
 	&dev_attr_io_stat.attr,
 	&dev_attr_mm_stat.attr,
@@ -2879,6 +2916,7 @@ static int zram_add(void)
 	init_rwsem(&zram->init_lock);
 #ifdef CONFIG_ZRAM_WRITEBACK
+	zram->wb_batch_size = 1;
 	spin_lock_init(&zram->wb_limit_lock);
 #endif
 
diff --git a/drivers/block/zram/zram_drv.h b/drivers/block/zram/zram_drv.h
index 6cee93f9c0d0..1a647f42c1a4 100644
--- a/drivers/block/zram/zram_drv.h
+++ b/drivers/block/zram/zram_drv.h
@@ -129,6 +129,7 @@ struct zram {
 	struct file *backing_dev;
 	spinlock_t wb_limit_lock;
 	bool wb_limit_enable;
+	u32 wb_batch_size;
 	u64 bd_wb_limit;
 	struct block_device *bdev;
 	unsigned long *bitmap;
-- 
2.51.2.1041.gc1ab5b90ca-goog