Some bio completion handlers need to run in task context but bio_endio()
can be called from IRQ context (e.g. buffer_head writeback). Add a
BIO_COMPLETE_IN_TASK flag that bio submitters can set to request
task-context completion of their bi_end_io callback.

When bio_endio() sees this flag and is running in non-task context, it
queues the bio to a per-cpu lockless list and schedules a delayed work
item to call bi_end_io() from task context. The delayed work uses a
1-jiffie delay to allow batches of completions to accumulate before
processing. A CPU hotplug dead callback drains any remaining bios from
the departing CPU's batch.

This will be used to enable RWF_DONTCACHE for block devices, and could
be used for other subsystems like fscrypt that need task-context bio
completion.

Suggested-by: Matthew Wilcox
Signed-off-by: Tal Zussman
---
 block/bio.c               | 83 ++++++++++++++++++++++++++++++++++++++++++++++-
 include/linux/blk_types.h |  7 +++-
 2 files changed, 88 insertions(+), 2 deletions(-)

diff --git a/block/bio.c b/block/bio.c
index 8203bb7455a9..21b403eb1c04 100644
--- a/block/bio.c
+++ b/block/bio.c
@@ -18,6 +18,7 @@
 #include <...>
 #include <...>
 #include <...>
+#include <linux/llist.h>
 #include <...>

 #include "blk.h"
@@ -1714,6 +1715,51 @@ void bio_check_pages_dirty(struct bio *bio)
 }
 EXPORT_SYMBOL_GPL(bio_check_pages_dirty);

+struct bio_complete_batch {
+	struct llist_head list;
+	struct delayed_work work;
+	int cpu;
+};
+
+static DEFINE_PER_CPU(struct bio_complete_batch, bio_complete_batch);
+static struct workqueue_struct *bio_complete_wq;
+
+static void bio_complete_work_fn(struct work_struct *w)
+{
+	struct delayed_work *dw = to_delayed_work(w);
+	struct bio_complete_batch *batch =
+		container_of(dw, struct bio_complete_batch, work);
+	struct llist_node *node;
+	struct bio *bio, *next;
+
+	do {
+		node = llist_del_all(&batch->list);
+		if (!node)
+			break;
+
+		node = llist_reverse_order(node);
+		llist_for_each_entry_safe(bio, next, node, bi_llist)
+			bio->bi_end_io(bio);
+
+		if (need_resched()) {
+			if (!llist_empty(&batch->list))
+				mod_delayed_work_on(batch->cpu,
+						    bio_complete_wq,
+						    &batch->work, 0);
+			break;
+		}
+	} while (1);
+}
+
+static void bio_queue_completion(struct bio *bio)
+{
+	struct bio_complete_batch *batch = this_cpu_ptr(&bio_complete_batch);
+
+	if (llist_add(&bio->bi_llist, &batch->list))
+		mod_delayed_work_on(batch->cpu, bio_complete_wq,
+				    &batch->work, 1);
+}
+
 static inline bool bio_remaining_done(struct bio *bio)
 {
 	/*
@@ -1788,7 +1834,9 @@ void bio_endio(struct bio *bio)
 	}
 #endif

-	if (bio->bi_end_io)
+	if (!in_task() && bio_flagged(bio, BIO_COMPLETE_IN_TASK))
+		bio_queue_completion(bio);
+	else if (bio->bi_end_io)
 		bio->bi_end_io(bio);
 }
 EXPORT_SYMBOL(bio_endio);
@@ -1974,6 +2022,24 @@ int bioset_init(struct bio_set *bs,
 }
 EXPORT_SYMBOL(bioset_init);

+/*
+ * Drain a dead CPU's deferred bio completions.
+ */
+static int bio_complete_batch_cpu_dead(unsigned int cpu)
+{
+	struct bio_complete_batch *batch =
+		per_cpu_ptr(&bio_complete_batch, cpu);
+	struct llist_node *node;
+	struct bio *bio, *next;
+
+	node = llist_del_all(&batch->list);
+	node = llist_reverse_order(node);
+	llist_for_each_entry_safe(bio, next, node, bi_llist)
+		bio->bi_end_io(bio);
+
+	return 0;
+}
+
 static int __init init_bio(void)
 {
 	int i;
@@ -1988,6 +2054,21 @@ static int __init init_bio(void)
 				SLAB_HWCACHE_ALIGN | SLAB_PANIC, NULL);
 	}

+	for_each_possible_cpu(i) {
+		struct bio_complete_batch *batch =
+			per_cpu_ptr(&bio_complete_batch, i);
+
+		init_llist_head(&batch->list);
+		INIT_DELAYED_WORK(&batch->work, bio_complete_work_fn);
+		batch->cpu = i;
+	}
+
+	bio_complete_wq = alloc_workqueue("bio_complete", WQ_MEM_RECLAIM, 0);
+	if (!bio_complete_wq)
+		panic("bio: can't allocate bio_complete workqueue\n");
+
+	cpuhp_setup_state(CPUHP_BP_PREPARE_DYN, "block/bio:complete:dead",
+			  NULL, bio_complete_batch_cpu_dead);
 	cpuhp_setup_state_multi(CPUHP_BIO_DEAD, "block/bio:dead", NULL,
 					bio_cpu_dead);
diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h
index 8808ee76e73c..0b55159d110d 100644
--- a/include/linux/blk_types.h
+++ b/include/linux/blk_types.h
@@ -11,6 +11,7 @@
 #include <...>
 #include <...>
 #include <...>
+#include <linux/llist.h>

 struct bio_set;
 struct bio;
@@ -208,7 +209,10 @@ typedef unsigned int blk_qc_t;
  * stacking drivers)
  */
 struct bio {
-	struct bio		*bi_next;	/* request queue link */
+	union {
+		struct bio	*bi_next;	/* request queue link */
+		struct llist_node bi_llist;	/* deferred completion */
+	};
 	struct block_device	*bi_bdev;
 	blk_opf_t		bi_opf;		/* bottom bits REQ_OP, top bits
 						 * req_flags.
@@ -322,6 +326,7 @@ enum {
 	BIO_REMAPPED,
 	BIO_ZONE_WRITE_PLUGGING, /* bio handled through zone write plugging */
 	BIO_EMULATES_ZONE_APPEND, /* bio emulates a zone append operation */
+	BIO_COMPLETE_IN_TASK,	/* complete bi_end_io() in task context */
 	BIO_FLAG_LAST
 };

-- 
2.39.5