Prepare for introducing a second caller of __blk_mq_tagset_iter(). No functionality has been changed. Cc: Jens Axboe Cc: Christoph Hellwig Cc: Ming Lei Cc: John Garry Cc: Hannes Reinecke Signed-off-by: Bart Van Assche --- block/blk-mq-tag.c | 32 +++++++++++++++++++------------- 1 file changed, 19 insertions(+), 13 deletions(-) diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c index 8a61c481015e..f169beeded64 100644 --- a/block/blk-mq-tag.c +++ b/block/blk-mq-tag.c @@ -419,6 +419,24 @@ void blk_mq_all_tag_iter(struct blk_mq_tags *tags, blk_mq_rq_iter_fn *fn, __blk_mq_all_tag_iter(tags, fn, priv, BT_TAG_ITER_STATIC_RQS); } +static void __blk_mq_tagset_iter(struct blk_mq_tag_set *tagset, + blk_mq_rq_iter_fn *fn, void *priv, unsigned long flags) +{ + int i, nr_tags, srcu_idx; + + srcu_idx = srcu_read_lock(&tagset->tags_srcu); + + nr_tags = blk_mq_is_shared_tags(tagset->flags) ? 1 : + tagset->nr_hw_queues; + + for (i = 0; i < nr_tags; i++) { + if (tagset->tags && tagset->tags[i]) + __blk_mq_all_tag_iter(tagset->tags[i], fn, priv, + flags); + } + srcu_read_unlock(&tagset->tags_srcu, srcu_idx); +} + /** * blk_mq_tagset_busy_iter - iterate over all started requests in a tag set * @tagset: Tag set to iterate over. @@ -434,19 +452,7 @@ void blk_mq_all_tag_iter(struct blk_mq_tags *tags, blk_mq_rq_iter_fn *fn, void blk_mq_tagset_busy_iter(struct blk_mq_tag_set *tagset, blk_mq_rq_iter_fn *fn, void *priv) { - unsigned int flags = tagset->flags; - int i, nr_tags, srcu_idx; - - srcu_idx = srcu_read_lock(&tagset->tags_srcu); - - nr_tags = blk_mq_is_shared_tags(flags) ? 
1 : tagset->nr_hw_queues; - - for (i = 0; i < nr_tags; i++) { - if (tagset->tags && tagset->tags[i]) - __blk_mq_all_tag_iter(tagset->tags[i], fn, priv, - BT_TAG_ITER_STARTED); - } - srcu_read_unlock(&tagset->tags_srcu, srcu_idx); + __blk_mq_tagset_iter(tagset, fn, priv, BT_TAG_ITER_STARTED); } EXPORT_SYMBOL(blk_mq_tagset_busy_iter); Support iterating over all requests in a tag set, including requests that have not yet been started. A later patch will call this function from scsi_device_busy(). Cc: Jens Axboe Cc: Christoph Hellwig Cc: Ming Lei Cc: John Garry Cc: Hannes Reinecke Signed-off-by: Bart Van Assche --- block/blk-mq-tag.c | 19 +++++++++++++++++++ include/linux/blk-mq.h | 2 ++ 2 files changed, 21 insertions(+) diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c index f169beeded64..f277ed7e7743 100644 --- a/block/blk-mq-tag.c +++ b/block/blk-mq-tag.c @@ -456,6 +456,25 @@ void blk_mq_tagset_busy_iter(struct blk_mq_tag_set *tagset, } EXPORT_SYMBOL(blk_mq_tagset_busy_iter); +/** + * blk_mq_tagset_iter - iterate over all requests in a tag set + * @tagset: Tag set to iterate over. + * @fn: Pointer to the function that will be called for each request. + * @fn will be called as follows: @fn(rq, @priv) where rq is a + * pointer to a request. Return true to continue iterating tags, + * false to stop. + * @priv: Will be passed as second argument to @fn. + * + * We grab one request reference before calling @fn and release it after + * @fn returns. 
+ */ +void blk_mq_tagset_iter(struct blk_mq_tag_set *tagset, blk_mq_rq_iter_fn *fn, + void *priv) +{ + __blk_mq_tagset_iter(tagset, fn, priv, 0); +} +EXPORT_SYMBOL(blk_mq_tagset_iter); + static bool blk_mq_tagset_count_completed_rqs(struct request *rq, void *data) { unsigned *count = data; diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index 3467cacb281c..20a22c1cd067 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -927,6 +927,8 @@ void blk_mq_run_hw_queues(struct request_queue *q, bool async); void blk_mq_delay_run_hw_queues(struct request_queue *q, unsigned long msecs); void blk_mq_tagset_busy_iter(struct blk_mq_tag_set *tagset, blk_mq_rq_iter_fn *fn, void *priv); +void blk_mq_tagset_iter(struct blk_mq_tag_set *tagset, blk_mq_rq_iter_fn *fn, + void *priv); void blk_mq_tagset_wait_completed_request(struct blk_mq_tag_set *tagset); void blk_mq_freeze_queue_nomemsave(struct request_queue *q); void blk_mq_unfreeze_queue_nomemrestore(struct request_queue *q); Since a single hardware queue is used by ATA drivers, the request tag uniquely identifies in-flight commands. Stop using the SCSI budget token to prepare for no longer allocating a budget token if possible. The modified code was introduced by commit 4f1a22ee7b57 ("libata: Improve ATA queued command allocation"). 
Cc: Damien Le Moal Cc: Niklas Cassel Cc: John Garry Cc: Christoph Hellwig Signed-off-by: Bart Van Assche --- drivers/ata/libata-scsi.c | 18 ++++++------------ 1 file changed, 6 insertions(+), 12 deletions(-) diff --git a/drivers/ata/libata-scsi.c b/drivers/ata/libata-scsi.c index 026122bb6f2f..90f5422fa08b 100644 --- a/drivers/ata/libata-scsi.c +++ b/drivers/ata/libata-scsi.c @@ -744,22 +744,16 @@ static struct ata_queued_cmd *ata_scsi_qc_new(struct ata_device *dev, { struct ata_port *ap = dev->link->ap; struct ata_queued_cmd *qc; - int tag; + int tag = scsi_cmd_to_rq(cmd)->tag; if (unlikely(ata_port_is_frozen(ap))) goto fail; - if (ap->flags & ATA_FLAG_SAS_HOST) { - /* - * SAS hosts may queue > ATA_MAX_QUEUE commands so use - * unique per-device budget token as a tag. - */ - if (WARN_ON_ONCE(cmd->budget_token >= ATA_MAX_QUEUE)) - goto fail; - tag = cmd->budget_token; - } else { - tag = scsi_cmd_to_rq(cmd)->tag; - } + WARN_ON_ONCE(cmd->device->host->tag_set.nr_hw_queues > 1); + + /* SAS hosts may queue > ATA_MAX_QUEUE commands. */ + if (ap->flags & ATA_FLAG_SAS_HOST && WARN_ON_ONCE(tag >= ATA_MAX_QUEUE)) + goto fail; qc = __ata_qc_from_tag(ap, tag); qc->tag = qc->hw_tag = tag; Instead of only handling dev->budget_map.map != NULL, also handle dev->budget_map.map == NULL. This patch prepares for supporting logical units without budget map (sdev->budget_map.map == NULL). 
Cc: Jens Axboe Cc: Christoph Hellwig Cc: Ming Lei Cc: John Garry Cc: Hannes Reinecke Signed-off-by: Bart Van Assche --- drivers/scsi/scsi_lib.c | 38 ++++++++++++++++++++++++++++++++++++++ include/scsi/scsi_device.h | 5 +---- 2 files changed, 39 insertions(+), 4 deletions(-) diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c index 51ad2ad07e43..ddc51472b5eb 100644 --- a/drivers/scsi/scsi_lib.c +++ b/drivers/scsi/scsi_lib.c @@ -446,6 +446,44 @@ static void scsi_single_lun_run(struct scsi_device *current_sdev) spin_unlock_irqrestore(shost->host_lock, flags); } +struct sdev_cmds_allocated_data { + const struct scsi_device *sdev; + int count; +}; + +static bool scsi_device_check_allocated(struct request *rq, void *data) +{ + struct scsi_cmnd *cmd = blk_mq_rq_to_pdu(rq); + struct sdev_cmds_allocated_data *sifd = data; + + if (cmd->device == sifd->sdev) + sifd->count++; + + return true; +} + +/** + * scsi_device_busy() - Number of commands allocated for a SCSI device + * @sdev: SCSI device. + * + * Note: There is a subtle difference between this function and + * scsi_host_busy(). scsi_host_busy() counts the number of commands that have + * been started. This function counts the number of commands that have been + * allocated. At least the UFS driver depends on this function counting commands + * that have already been allocated but that have not yet been started. 
+ */ +int scsi_device_busy(const struct scsi_device *sdev) +{ + struct sdev_cmds_allocated_data sifd = { .sdev = sdev }; + struct blk_mq_tag_set *set = &sdev->host->tag_set; + + if (sdev->budget_map.map) + return sbitmap_weight(&sdev->budget_map); + blk_mq_tagset_iter(set, scsi_device_check_allocated, &sifd); + return sifd.count; +} +EXPORT_SYMBOL(scsi_device_busy); + static inline bool scsi_device_is_busy(struct scsi_device *sdev) { if (scsi_device_busy(sdev) >= sdev->queue_depth) diff --git a/include/scsi/scsi_device.h b/include/scsi/scsi_device.h index d32f5841f4f8..0dd078ac9b89 100644 --- a/include/scsi/scsi_device.h +++ b/include/scsi/scsi_device.h @@ -713,10 +713,7 @@ static inline int scsi_device_supports_vpd(struct scsi_device *sdev) return 0; } -static inline int scsi_device_busy(struct scsi_device *sdev) -{ - return sbitmap_weight(&sdev->budget_map); -} +int scsi_device_busy(const struct scsi_device *sdev); /* Macros to access the UNIT ATTENTION counters */ #define scsi_get_ua_new_media_ctr(sdev) atomic_read(&sdev->ua_new_media_ctr) The SCSI core uses the budget map to restrict the number of commands that are in flight per logical unit. That limit check can be left out if host->cmd_per_lun >= host->can_queue and if the host tag set is shared across all hardware queues or if there is only one hardware queue. Since scsi_mq_get_budget() shows up in all CPU profiles for fast SCSI devices, do not allocate a budget map if cmd_per_lun >= can_queue and if the host tag set is shared across all hardware queues. 
For the following test this patch increases IOPS by 5%: modprobe scsi_debug delay=0 no_rwlock=1 host_max_queue=192 submit_queues=$(nproc) fio --bs=4096 --disable_clat=1 --disable_slat=1 --group_reporting=1 \ --gtod_reduce=1 --invalidate=1 --ioengine=io_uring --ioscheduler=none \ --norandommap --runtime=60 --rw=randread --thread --time_based=1 \ --buffered=0 --numjobs=1 --iodepth=192 --iodepth_batch=24 --name=/dev/sda \ --filename=/dev/sda Cc: Jens Axboe Cc: Christoph Hellwig Cc: Ming Lei Cc: John Garry Cc: Hannes Reinecke Signed-off-by: Bart Van Assche --- drivers/scsi/scsi.c | 6 ++---- drivers/scsi/scsi_scan.c | 18 +++++++++++++++++- 2 files changed, 19 insertions(+), 5 deletions(-) diff --git a/drivers/scsi/scsi.c b/drivers/scsi/scsi.c index 76cdad063f7b..3dc93dd9fda2 100644 --- a/drivers/scsi/scsi.c +++ b/drivers/scsi/scsi.c @@ -216,9 +216,6 @@ int scsi_device_max_queue_depth(struct scsi_device *sdev) */ int scsi_change_queue_depth(struct scsi_device *sdev, int depth) { - if (!sdev->budget_map.map) - return -EINVAL; - depth = min_t(int, depth, scsi_device_max_queue_depth(sdev)); if (depth > 0) { @@ -229,7 +226,8 @@ int scsi_change_queue_depth(struct scsi_device *sdev, int depth) if (sdev->request_queue) blk_set_queue_depth(sdev->request_queue, depth); - sbitmap_resize(&sdev->budget_map, sdev->queue_depth); + if (sdev->budget_map.map) + sbitmap_resize(&sdev->budget_map, sdev->queue_depth); return sdev->queue_depth; } diff --git a/drivers/scsi/scsi_scan.c b/drivers/scsi/scsi_scan.c index 7acbfcfc2172..99b82e28f292 100644 --- a/drivers/scsi/scsi_scan.c +++ b/drivers/scsi/scsi_scan.c @@ -215,9 +215,17 @@ static void scsi_unlock_floptical(struct scsi_device *sdev, SCSI_TIMEOUT, 3, NULL); } +static bool scsi_needs_budget_map(struct Scsi_Host *shost, unsigned int depth) +{ + if (shost->host_tagset || shost->tag_set.nr_hw_queues == 1) + return depth < shost->can_queue; + return true; +} + static int scsi_realloc_sdev_budget_map(struct scsi_device *sdev, unsigned int 
depth) { + struct Scsi_Host *shost = sdev->host; int new_shift = sbitmap_calculate_shift(depth); bool need_alloc = !sdev->budget_map.map; bool need_free = false; @@ -225,6 +233,13 @@ static int scsi_realloc_sdev_budget_map(struct scsi_device *sdev, int ret; struct sbitmap sb_backup; + if (!scsi_needs_budget_map(shost, depth)) { + memflags = blk_mq_freeze_queue(sdev->request_queue); + sbitmap_free(&sdev->budget_map); + blk_mq_unfreeze_queue(sdev->request_queue, memflags); + return 0; + } + depth = min_t(unsigned int, depth, scsi_device_max_queue_depth(sdev)); /* @@ -1120,7 +1135,8 @@ static int scsi_add_lun(struct scsi_device *sdev, unsigned char *inq_result, scsi_cdl_check(sdev); sdev->max_queue_depth = sdev->queue_depth; - WARN_ON_ONCE(sdev->max_queue_depth > sdev->budget_map.depth); + WARN_ON_ONCE(sdev->budget_map.map && + sdev->max_queue_depth > sdev->budget_map.depth); /* * Ok, the device is now all set up, we can