This commit integrates IAA compress/decompress batching with the
crypto_acomp API, as per the discussions in [1]. Further, IAA sets
crypto_alg flags to indicate support for segmentation.

To provide context from the perspective of a kernel user such as zswap:
zswap will interface with this batching API by setting up the acomp_req
through the following crypto API calls, which designate multiple src/dst
SG lists representing the batch being sent to iaa_crypto:

  acomp_request_set_src_folio()
  acomp_request_set_dst_sg()
  acomp_request_set_unit_size()

before invoking batch compression through the existing
crypto_acomp_compress() interface.

Within crypto_acomp_compress(), an acomp_req whose tfm supports
segmentation is further tested for an "slen" greater than the request's
unit_size. If so, we invoke acomp_do_req_batch_parallel(), similar to the
acomp_do_req_chain() case.

acomp_do_req_batch_parallel() creates a wait_queue_head
"batch_parallel_wq", stores it in the acomp_req's "__ctx", then calls
tfm->compress()/tfm->decompress(). The iaa_crypto driver alg's compress()
implementation submits the batch's requests and immediately returns to
acomp_do_req_batch_parallel(), which then waits for the
"batch_parallel_wq" to be notified by a tfm->batch_completed() event.

To support this, a batch_completed() API is added to "struct
crypto_acomp" and "struct acomp_alg". The iaa_crypto driver alg's
batch_completed() implementation waits for each batch sub-request to
complete and notifies the batch_parallel_wq. If any sub-request has an
error, -EINVAL is returned to the acomp_req's callback; otherwise 0 is
returned.

[1]: https://lore.kernel.org/all/aRqSqQxR4eHzvb2g@gondor.apana.org.au/

Suggested-by: Herbert Xu
Signed-off-by: Kanchana P Sridhar
---
 crypto/acompress.c                         |  63 ++++++++++
 drivers/crypto/intel/iaa/iaa_crypto.h      |   3 +
 drivers/crypto/intel/iaa/iaa_crypto_main.c | 137 +++++++++++++++++++--
 include/crypto/acompress.h                 |   7 ++
 include/crypto/internal/acompress.h        |   7 ++
 5 files changed, 210 insertions(+), 7 deletions(-)

diff --git a/crypto/acompress.c b/crypto/acompress.c
index cfb8ede02cf4..c48a1a20e21f 100644
--- a/crypto/acompress.c
+++ b/crypto/acompress.c
@@ -105,6 +105,7 @@ static int crypto_acomp_init_tfm(struct crypto_tfm *tfm)
 
 	acomp->compress = alg->compress;
 	acomp->decompress = alg->decompress;
+	acomp->batch_completed = alg->batch_completed;
 	acomp->reqsize = alg->base.cra_reqsize;
 	acomp->base.exit = crypto_acomp_exit_tfm;
 
@@ -291,6 +292,65 @@ static __always_inline int acomp_do_req_chain(struct acomp_req *req, bool comp)
 	return acomp_reqchain_finish(req, err);
 }
 
+static int acomp_do_req_batch_parallel(struct acomp_req *req, bool comp)
+{
+	struct crypto_acomp *tfm = crypto_acomp_reqtfm(req);
+	unsigned long *bpwq_addr = acomp_request_ctx(req);
+	wait_queue_head_t batch_parallel_wq;
+	int ret;
+
+	init_waitqueue_head(&batch_parallel_wq);
+	*bpwq_addr = (unsigned long)&batch_parallel_wq;
+
+	ret = comp ? tfm->compress(req) : tfm->decompress(req);
+
+	wait_event(batch_parallel_wq, tfm->batch_completed(req, comp));
+
+	if (req->slen < 0)
+		ret |= -EINVAL;
+
+	return ret;
+}
+
+/**
+ * Please note:
+ * ============
+ *
+ * 1) If @req->unit_size is 0, there is no impact on existing acomp users.
+ *
+ * 2) If @req->unit_size is non-0 (e.g. zswap compress batching) and
+ * @req->src and @req->dst are scatterlists:
+ *
+ * a) Algorithms that do not support segmentation:
+ *
+ * We call acomp_do_req_chain(), which handles the trivial case when
+ * the caller has passed exactly one segment. The dst SG list's length is
+ * set to the compression error/compressed length for that segment.
+ *
+ * b) Algorithms that support segmentation:
+ *
+ * If the source length is more than @req->unit_size,
+ * acomp_do_req_batch_parallel() is invoked: this calls the tfm's
+ * compress() API, which uses @req->slen being greater than
+ * @req->unit_size to ascertain that it needs to do batching. The algorithm's
+ * compress() implementation submits the batch's sub-requests for
+ * compression and returns.
+ *
+ * Algorithms that support batching must provide a batch_completed() API.
+ * When the batch's compression sub-requests have completed, they must
+ * notify a wait_queue using the batch_completed() API. The batching tfm
+ * implementation must set the dst SG lists to contain the individual
+ * sub-requests' error/compressed lengths.
+ *
+ * If the source length == @req->unit_size, the tfm's compress() API is
+ * invoked. The assumption is that segmentation algorithms will internally
+ * set the dst SG list's length to indicate error/compressed length in
+ * this case, similar to the batching case.
+ *
+ * 3) To prevent functional/performance regressions, we preserve existing
+ * behavior in all other cases, such as when @req->unit_size is non-0 and
+ * @req->src and/or @req->dst is virtual, instead of returning an error.
+ */
 int crypto_acomp_compress(struct acomp_req *req)
 {
 	struct crypto_acomp *tfm = crypto_acomp_reqtfm(req);
@@ -302,6 +362,9 @@ int crypto_acomp_compress(struct acomp_req *req)
 	if (!crypto_acomp_req_seg(tfm))
 		return acomp_do_req_chain(req, true);
 
+	if (likely((req->slen > req->unit_size) && tfm->batch_completed))
+		return acomp_do_req_batch_parallel(req, true);
+
 	return tfm->compress(req);
 }
 
diff --git a/drivers/crypto/intel/iaa/iaa_crypto.h b/drivers/crypto/intel/iaa/iaa_crypto.h
index db83c21e92f1..d85a8f1cbb93 100644
--- a/drivers/crypto/intel/iaa/iaa_crypto.h
+++ b/drivers/crypto/intel/iaa/iaa_crypto.h
@@ -69,10 +69,13 @@
  * IAA. In other words, don't make any assumptions, and protect
  * compression/decompression data.
  *
+ * @data: Driver internal data to interface with crypto_acomp.
+ *
  */
 struct iaa_batch_ctx {
 	struct iaa_req **reqs;
 	struct mutex mutex;
+	void *data;
 };
 
 #define IAA_COMP_MODES_MAX IAA_MODE_NONE

diff --git a/drivers/crypto/intel/iaa/iaa_crypto_main.c b/drivers/crypto/intel/iaa/iaa_crypto_main.c
index 8d83a1ea15d7..915bf9b17b39 100644
--- a/drivers/crypto/intel/iaa/iaa_crypto_main.c
+++ b/drivers/crypto/intel/iaa/iaa_crypto_main.c
@@ -2524,6 +2524,71 @@ static void compression_ctx_init(struct iaa_compression_ctx *ctx, enum iaa_mode
  * Interfaces to crypto_alg and crypto_acomp.
  *********************************************/
 
+static __always_inline int iaa_crypto_acomp_acompress_batch(
+				struct iaa_compression_ctx *ctx,
+				struct iaa_req *parent_req,
+				struct iaa_req **reqs,
+				unsigned int unit_size)
+{
+	int nr_reqs = parent_req->slen / unit_size;
+
+	return iaa_comp_submit_acompress_batch(ctx, parent_req, reqs, nr_reqs, unit_size);
+}
+
+static __always_inline int iaa_crypto_acomp_adecompress_batch(
+				struct iaa_compression_ctx *ctx,
+				struct iaa_req *parent_req,
+				struct iaa_req **reqs,
+				unsigned int unit_size)
+{
+	int nr_reqs = parent_req->dlen / unit_size;
+
+	return iaa_comp_submit_adecompress_batch(ctx, parent_req, reqs, nr_reqs);
+}
+
+static bool iaa_crypto_acomp_batch_completed(struct acomp_req *areq, bool comp)
+{
+	unsigned long *cpu_ctx_addr = acomp_request_ctx(areq);
+	struct iaa_batch_ctx *cpu_ctx = (struct iaa_batch_ctx *)*cpu_ctx_addr;
+	wait_queue_head_t *batch_parallel_wq = (wait_queue_head_t *)cpu_ctx->data;
+	struct iaa_req **reqs = cpu_ctx->reqs;
+	int nr_reqs = (comp ? areq->slen : areq->dlen) / areq->unit_size;
+
+	/*
+	 * Since both compress and decompress require the eventual
+	 * caller (zswap) to verify @areq->dlen, we use @areq->slen to
+	 * flag the batch's success/error to crypto_acomp, which will
+	 * return this as the @err status to the crypto_acomp callback
+	 * function.
+	 */
+	if (iaa_comp_batch_completed(NULL, reqs, nr_reqs))
+		areq->slen = -EINVAL;
+
+	/*
+	 * Set the acomp_req's dlen to be the first SG list's
+	 * compressed/decompressed length/error value to enable zswap code
+	 * equivalence for non-batching and batching acomp_algs.
+	 */
+	areq->dlen = areq->dst->length;
+
+	/* All sub-requests have finished. Notify the @batch_parallel_wq. */
+	if (waitqueue_active(batch_parallel_wq))
+		wake_up(batch_parallel_wq);
+
+	mutex_unlock(&cpu_ctx->mutex);
+
+	return true;
+}
+
+/*
+ * Main compression API for kernel users of crypto_acomp, such as zswap.
+ *
+ * crypto_acomp_compress() calls into this procedure for:
+ * - Sequential compression of a single page,
+ * - Parallel batch compression of multiple pages.
+ *
+ * @areq: asynchronous compress request
+ */
 static int iaa_crypto_acomp_acompress_main(struct acomp_req *areq)
 {
 	struct crypto_tfm *tfm = areq->base.tfm;
@@ -2534,14 +2599,47 @@ static int iaa_crypto_acomp_acompress_main(struct acomp_req *areq)
 	if (iaa_alg_is_registered(crypto_tfm_alg_driver_name(tfm), &idx)) {
 		ctx = iaa_ctx[idx];
 
-		acomp_to_iaa(areq, &parent_req, ctx);
-		ret = iaa_comp_acompress(ctx, &parent_req);
-		iaa_to_acomp(unlikely(ret) ? ret : parent_req.dlen, areq);
+		if (likely(areq->slen == areq->unit_size) || !areq->unit_size) {
+			acomp_to_iaa(areq, &parent_req, ctx);
+			ret = iaa_comp_acompress(ctx, &parent_req);
+			iaa_to_acomp(unlikely(ret) ? ret : parent_req.dlen, areq);
+		} else {
+			struct iaa_batch_ctx *cpu_ctx = raw_cpu_ptr(iaa_batch_ctx);
+			struct iaa_req **reqs;
+			unsigned long *cpu_ctx_addr, *bpwq_addr;
+
+			acomp_to_iaa(areq, &parent_req, ctx);
+
+			mutex_lock(&cpu_ctx->mutex);
+
+			bpwq_addr = acomp_request_ctx(areq);
+			/* Save the wait_queue_head. */
+			cpu_ctx->data = (wait_queue_head_t *)*bpwq_addr;
+
+			reqs = cpu_ctx->reqs;
+
+			ret = iaa_crypto_acomp_acompress_batch(ctx,
+							       &parent_req,
+							       reqs,
+							       areq->unit_size);
+
+			cpu_ctx_addr = acomp_request_ctx(areq);
+			*cpu_ctx_addr = (unsigned long)cpu_ctx;
+		}
 	}
 
 	return ret;
 }
 
+/*
+ * Main decompression API for kernel users of crypto_acomp, such as zswap.
+ *
+ * crypto_acomp_decompress() calls into this procedure for:
+ * - Sequential decompression of a single buffer,
+ * - Parallel batch decompression of multiple buffers.
+ *
+ * @areq: asynchronous decompress request
+ */
 static int iaa_crypto_acomp_adecompress_main(struct acomp_req *areq)
 {
 	struct crypto_tfm *tfm = areq->base.tfm;
@@ -2552,9 +2650,33 @@ static int iaa_crypto_acomp_adecompress_main(struct acomp_req *areq)
 	if (iaa_alg_is_registered(crypto_tfm_alg_driver_name(tfm), &idx)) {
 		ctx = iaa_ctx[idx];
 
-		acomp_to_iaa(areq, &parent_req, ctx);
-		ret = iaa_comp_adecompress(ctx, &parent_req);
-		iaa_to_acomp(parent_req.dlen, areq);
+		if (likely(areq->dlen == areq->unit_size) || !areq->unit_size) {
+			acomp_to_iaa(areq, &parent_req, ctx);
+			ret = iaa_comp_adecompress(ctx, &parent_req);
+			iaa_to_acomp(parent_req.dlen, areq);
+		} else {
+			struct iaa_batch_ctx *cpu_ctx = raw_cpu_ptr(iaa_batch_ctx);
+			struct iaa_req **reqs;
+			unsigned long *cpu_ctx_addr, *bpwq_addr;
+
+			acomp_to_iaa(areq, &parent_req, ctx);
+
+			mutex_lock(&cpu_ctx->mutex);
+
+			bpwq_addr = acomp_request_ctx(areq);
+			/* Save the wait_queue_head. */
+			cpu_ctx->data = (wait_queue_head_t *)*bpwq_addr;
+
+			reqs = cpu_ctx->reqs;
+
+			ret = iaa_crypto_acomp_adecompress_batch(ctx,
+								 &parent_req,
+								 reqs,
+								 areq->unit_size);
+
+			cpu_ctx_addr = acomp_request_ctx(areq);
+			*cpu_ctx_addr = (unsigned long)cpu_ctx;
+		}
 	}
 
 	return ret;
@@ -2574,10 +2696,11 @@ static struct acomp_alg iaa_acomp_fixed_deflate = {
 	.init = iaa_crypto_acomp_init_fixed,
 	.compress = iaa_crypto_acomp_acompress_main,
 	.decompress = iaa_crypto_acomp_adecompress_main,
+	.batch_completed = iaa_crypto_acomp_batch_completed,
 	.base = {
 		.cra_name = "deflate",
 		.cra_driver_name = "deflate-iaa",
-		.cra_flags = CRYPTO_ALG_ASYNC,
+		.cra_flags = CRYPTO_ALG_ASYNC | CRYPTO_ALG_REQ_SEG,
 		.cra_ctxsize = sizeof(struct iaa_compression_ctx),
 		.cra_reqsize = sizeof(u32),
 		.cra_module = THIS_MODULE,

diff --git a/include/crypto/acompress.h b/include/crypto/acompress.h
index 86e4932cd112..752110a7719c 100644
--- a/include/crypto/acompress.h
+++ b/include/crypto/acompress.h
@@ -109,6 +109,12 @@ struct acomp_req {
  *
  * @compress: Function performs a compress operation
  * @decompress: Function performs a de-compress operation
+ * @batch_completed: Waits for batch completion of parallel
+ *                   compress/decompress requests submitted via
+ *                   @compress/@decompress. Returns true once all
+ *                   batch sub-requests have completed. Returns an
+ *                   error code in @req->slen if any of the
+ *                   sub-requests completed with an error.
  * @reqsize: Context size for (de)compression requests
  * @fb: Synchronous fallback tfm
  * @base: Common crypto API algorithm data structure
@@ -116,6 +122,7 @@ struct acomp_req {
 struct crypto_acomp {
 	int (*compress)(struct acomp_req *req);
 	int (*decompress)(struct acomp_req *req);
+	bool (*batch_completed)(struct acomp_req *req, bool comp);
 	unsigned int reqsize;
 	struct crypto_tfm base;
 };

diff --git a/include/crypto/internal/acompress.h b/include/crypto/internal/acompress.h
index 366dbdb987e8..7c4e14491d59 100644
--- a/include/crypto/internal/acompress.h
+++ b/include/crypto/internal/acompress.h
@@ -28,6 +28,12 @@
  *
  * @compress: Function performs a compress operation
  * @decompress: Function performs a de-compress operation
+ * @batch_completed: Waits for batch completion of parallel
+ *                   compress/decompress requests submitted via
+ *                   @compress/@decompress. Returns true once all
+ *                   batch sub-requests have completed. Returns an
+ *                   error code in @req->slen if any of the
+ *                   sub-requests completed with an error.
  * @init: Initialize the cryptographic transformation object.
  *        This function is used to initialize the cryptographic
  *        transformation object. This function is called only once at
@@ -46,6 +52,7 @@ struct acomp_alg {
 
 	int (*compress)(struct acomp_req *req);
 	int (*decompress)(struct acomp_req *req);
+	bool (*batch_completed)(struct acomp_req *req, bool comp);
 	int (*init)(struct crypto_acomp *tfm);
 	void (*exit)(struct crypto_acomp *tfm);
 
-- 
2.27.0
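
For illustration only, and not part of the patch: a minimal sketch of how a
crypto_acomp user such as zswap might drive the batching interface described
in the commit message, using the setter calls named there. The exact
signatures of acomp_request_set_src_folio(), acomp_request_set_dst_sg() and
the newly introduced acomp_request_set_unit_size() are assumed here, and
example_batch_compress() is a hypothetical caller.

#include <crypto/acompress.h>
#include <linux/mm.h>
#include <linux/scatterlist.h>

/*
 * Hypothetical sketch: compress one folio of nr_pages pages as a batch.
 * The whole folio is the source, dst_sgs provides the per-page destination
 * SG lists, and slen (nr_pages * PAGE_SIZE) being greater than unit_size
 * (PAGE_SIZE) tells a segmentation-capable tfm such as deflate-iaa to batch.
 * Per-unit results are then reported back in the dst SG lists' lengths.
 */
static int example_batch_compress(struct crypto_acomp *tfm, struct folio *folio,
				  struct scatterlist *dst_sgs, unsigned int nr_pages)
{
	DECLARE_CRYPTO_WAIT(wait);
	struct acomp_req *req;
	int err;

	req = acomp_request_alloc(tfm);
	if (!req)
		return -ENOMEM;

	acomp_request_set_callback(req, CRYPTO_TFM_REQ_MAY_BACKLOG,
				   crypto_req_done, &wait);

	/* Source folio and per-page destination SG lists for the batch. */
	acomp_request_set_src_folio(req, folio, 0, nr_pages * PAGE_SIZE);
	acomp_request_set_dst_sg(req, dst_sgs, nr_pages * PAGE_SIZE);

	/* Assumed signature: one compression unit per PAGE_SIZE of source. */
	acomp_request_set_unit_size(req, PAGE_SIZE);

	err = crypto_wait_req(crypto_acomp_compress(req), &wait);

	acomp_request_free(req);
	return err;
}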