GPU use-cases for mmu_interval_notifiers with HMM often involve
starting a GPU operation and then waiting for it to complete.
These operations are typically context preemption or TLB flushing.

With per-GPU single-pass notifiers this doesn't scale in multi-GPU
scenarios. In those scenarios we'd want to first start preemption
or TLB flushing on all GPUs and then, as a second pass, wait for
them to complete.

One can do this on a per-driver basis by multiplexing per-driver
notifiers, but that would mean sharing the notifier "user" lock
across all GPUs, which doesn't scale well either, so adding support
for multi-pass in the core appears to be the right choice.

Implement a two-pass capability in the mmu_interval_notifier. Use a
linked list for the final passes to minimize the impact for
use-cases that don't need the multi-pass functionality by avoiding
a second interval tree walk, and to be able to easily pass data
between the two passes.

v1:
- Restrict to two passes (Jason Gunthorpe)
- Improve on documentation (Jason Gunthorpe)
- Improve on function naming (Alistair Popple)

Cc: Jason Gunthorpe
Cc: Andrew Morton
Cc: Simona Vetter
Cc: Dave Airlie
Cc: Alistair Popple
Cc:
Cc:
Cc:
Signed-off-by: Thomas Hellström
---
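A rough driver-side sketch of how the two-pass callbacks might be used
for multi-GPU TLB invalidation follows. struct my_notifier and the
my_gpu_tlb_inval_*() helpers are made-up placeholders; only the
mmu_interval_notifier_* / mmu_notifier_* calls are existing kernel API
or added by this patch:

struct my_finish {
	struct mmu_interval_notifier_finish f;
	struct my_gpu *gpu;
};

static void my_notifier_finish(struct mmu_interval_notifier_finish *final,
			       const struct mmu_notifier_range *range,
			       unsigned long cur_seq)
{
	struct my_finish *mf = container_of(final, struct my_finish, f);

	/* Second pass: wait for the invalidation started in the first pass. */
	my_gpu_tlb_inval_wait(mf->gpu);
	kfree(mf);
}

static bool
my_notifier_invalidate_start(struct mmu_interval_notifier *interval_sub,
			     const struct mmu_notifier_range *range,
			     unsigned long cur_seq,
			     struct mmu_interval_notifier_finish **final)
{
	struct my_notifier *mn =
		container_of(interval_sub, struct my_notifier, notifier);
	struct my_finish *mf;

	/* Waiting for the GPU may sleep. */
	if (!mmu_notifier_range_blockable(range))
		return false;

	mmu_interval_set_seq(interval_sub, cur_seq);

	/* First pass: kick off the GPU TLB invalidation without waiting. */
	my_gpu_tlb_inval_begin(mn->gpu);

	mf = kmalloc(sizeof(*mf), GFP_NOWAIT);
	if (!mf) {
		/* Fall back to single-pass operation under memory pressure. */
		my_gpu_tlb_inval_wait(mn->gpu);
		return true;
	}

	mf->f.finish = my_notifier_finish;
	mf->gpu = mn->gpu;
	*final = mf;
	return true;
}

static const struct mmu_interval_notifier_ops my_notifier_ops = {
	.invalidate_start = my_notifier_invalidate_start,
};

With several GPUs subscribed, the core first calls invalidate_start()
for each notifier, starting all invalidations, and only then runs the
queued finish() callbacks, so the waits overlap instead of being
serialized.
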
 include/linux/mmu_notifier.h | 42 ++++++++++++++++++++++++
 mm/mmu_notifier.c            | 63 ++++++++++++++++++++++++++++++------
 2 files changed, 96 insertions(+), 9 deletions(-)

diff --git a/include/linux/mmu_notifier.h b/include/linux/mmu_notifier.h
index d1094c2d5fb6..14cfb3735699 100644
--- a/include/linux/mmu_notifier.h
+++ b/include/linux/mmu_notifier.h
@@ -233,16 +233,58 @@ struct mmu_notifier {
 	unsigned int users;
 };
 
+/**
+ * struct mmu_interval_notifier_finish - mmu_interval_notifier two-pass abstraction
+ * @link: List link for the notifier's pending pass list
+ *
+ * Allocate this structure in the interval notifier's first pass, typically
+ * using GFP_NOWAIT. If allocation fails (which is not unlikely under memory
+ * pressure), fall back to single-pass operation. Note that with a large
+ * number of notifiers implementing two passes, allocation with GFP_NOWAIT
+ * becomes increasingly likely to fail, so consider implementing a small pool
+ * instead of using kmalloc() allocations.
+ *
+ * If the implementation needs to pass data between the two passes,
+ * the recommended way is to embed struct mmu_interval_notifier_finish into a
+ * larger structure that also contains the data to be shared. Keep in mind
+ * that notifier callbacks can be invoked in parallel, and each invocation
+ * needs its own struct mmu_interval_notifier_finish.
+ */
+struct mmu_interval_notifier_finish {
+	struct list_head link;
+	/**
+	 * @finish: Driver callback for the finish pass.
+	 * @final: Pointer to the mmu_interval_notifier_finish structure.
+	 * @range: The mmu_notifier_range.
+	 * @cur_seq: The current sequence set by the first pass.
+	 *
+	 * Note that there is no error reporting for additional passes.
+	 */
+	void (*finish)(struct mmu_interval_notifier_finish *final,
+		       const struct mmu_notifier_range *range,
+		       unsigned long cur_seq);
+};
+
 /**
  * struct mmu_interval_notifier_ops
  * @invalidate: Upon return the caller must stop using any SPTEs within this
  *              range. This function can sleep. Return false only if sleeping
  *              was required but mmu_notifier_range_blockable(range) is false.
+ * @invalidate_start: Similar to @invalidate, but intended for two-pass notifier
+ *                    callbacks where the call to @invalidate_start is the first
+ *                    pass and any struct mmu_interval_notifier_finish pointer
+ *                    returned in the @final parameter describes the final pass.
+ *                    If @final is %NULL on return, then no final pass will be
+ *                    called.
  */
 struct mmu_interval_notifier_ops {
 	bool (*invalidate)(struct mmu_interval_notifier *interval_sub,
 			   const struct mmu_notifier_range *range,
 			   unsigned long cur_seq);
+	bool (*invalidate_start)(struct mmu_interval_notifier *interval_sub,
+				 const struct mmu_notifier_range *range,
+				 unsigned long cur_seq,
+				 struct mmu_interval_notifier_finish **final);
 };
 
 struct mmu_interval_notifier {
diff --git a/mm/mmu_notifier.c b/mm/mmu_notifier.c
index 8e0125dc0522..fceadcd8ca24 100644
--- a/mm/mmu_notifier.c
+++ b/mm/mmu_notifier.c
@@ -260,6 +260,18 @@ mmu_interval_read_begin(struct mmu_interval_notifier *interval_sub)
 }
 EXPORT_SYMBOL_GPL(mmu_interval_read_begin);
 
+static void mn_itree_final_pass(struct list_head *final_passes,
+				const struct mmu_notifier_range *range,
+				unsigned long cur_seq)
+{
+	struct mmu_interval_notifier_finish *f, *next;
+
+	list_for_each_entry_safe(f, next, final_passes, link) {
+		list_del(&f->link);
+		f->finish(f, range, cur_seq);
+	}
+}
+
 static void mn_itree_release(struct mmu_notifier_subscriptions *subscriptions,
 			     struct mm_struct *mm)
 {
@@ -271,6 +283,7 @@ static void mn_itree_release(struct mmu_notifier_subscriptions *subscriptions,
 		.end = ULONG_MAX,
 	};
 	struct mmu_interval_notifier *interval_sub;
+	LIST_HEAD(final_passes);
 	unsigned long cur_seq;
 	bool ret;
 
@@ -278,11 +291,25 @@ static void mn_itree_release(struct mmu_notifier_subscriptions *subscriptions,
 		     mn_itree_inv_start_range(subscriptions, &range, &cur_seq);
 	     interval_sub;
 	     interval_sub = mn_itree_inv_next(interval_sub, &range)) {
-		ret = interval_sub->ops->invalidate(interval_sub, &range,
-						    cur_seq);
+		if (interval_sub->ops->invalidate_start) {
+			struct mmu_interval_notifier_finish *final = NULL;
+
+			ret = interval_sub->ops->invalidate_start(interval_sub,
+								  &range,
+								  cur_seq,
+								  &final);
+			if (ret && final)
+				list_add_tail(&final->link, &final_passes);
+
+		} else {
+			ret = interval_sub->ops->invalidate(interval_sub,
+							    &range,
+							    cur_seq);
+		}
 		WARN_ON(!ret);
 	}
 
+	mn_itree_final_pass(&final_passes, &range, cur_seq);
 	mn_itree_inv_end(subscriptions);
 }
 
@@ -430,7 +457,9 @@ static int mn_itree_invalidate(struct mmu_notifier_subscriptions *subscriptions,
 			       const struct mmu_notifier_range *range)
 {
 	struct mmu_interval_notifier *interval_sub;
+	LIST_HEAD(final_passes);
 	unsigned long cur_seq;
+	int err = 0;
 
 	for (interval_sub =
 		     mn_itree_inv_start_range(subscriptions, range, &cur_seq);
@@ -438,23 +467,39 @@ static int mn_itree_invalidate(struct mmu_notifier_subscriptions *subscriptions,
 	     interval_sub = mn_itree_inv_next(interval_sub, range)) {
 		bool ret;
 
-		ret = interval_sub->ops->invalidate(interval_sub, range,
-						    cur_seq);
+		if (interval_sub->ops->invalidate_start) {
+			struct mmu_interval_notifier_finish *final = NULL;
+
+			ret = interval_sub->ops->invalidate_start(interval_sub,
+								  range,
+								  cur_seq,
+								  &final);
+			if (ret && final)
+				list_add_tail(&final->link, &final_passes);
+
+		} else {
+			ret = interval_sub->ops->invalidate(interval_sub,
+							    range,
+							    cur_seq);
+		}
 		if (!ret) {
 			if (WARN_ON(mmu_notifier_range_blockable(range)))
 				continue;
-			goto out_would_block;
+			err = -EAGAIN;
+			break;
 		}
 	}
-	return 0;
 
-out_would_block:
+	mn_itree_final_pass(&final_passes, range, cur_seq);
+
 	/*
 	 * On -EAGAIN the non-blocking caller is not allowed to call
 	 * invalidate_range_end()
 	 */
-	mn_itree_inv_end(subscriptions);
-	return -EAGAIN;
+	if (err)
+		mn_itree_inv_end(subscriptions);
+
+	return err;
 }
 
 static int mn_hlist_invalidate_range_start(
-- 
2.50.1