Prepare for introducing a second caller of __blk_mq_tagset_iter(). No functionality has been changed. Cc: Jens Axboe Cc: Christoph Hellwig Cc: Ming Lei Cc: John Garry Cc: Hannes Reinecke Signed-off-by: Bart Van Assche --- block/blk-mq-tag.c | 32 +++++++++++++++++++------------- 1 file changed, 19 insertions(+), 13 deletions(-) diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c index 8a61c481015e..f169beeded64 100644 --- a/block/blk-mq-tag.c +++ b/block/blk-mq-tag.c @@ -419,6 +419,24 @@ void blk_mq_all_tag_iter(struct blk_mq_tags *tags, blk_mq_rq_iter_fn *fn, __blk_mq_all_tag_iter(tags, fn, priv, BT_TAG_ITER_STATIC_RQS); } +static void __blk_mq_tagset_iter(struct blk_mq_tag_set *tagset, + blk_mq_rq_iter_fn *fn, void *priv, unsigned long flags) +{ + int i, nr_tags, srcu_idx; + + srcu_idx = srcu_read_lock(&tagset->tags_srcu); + + nr_tags = blk_mq_is_shared_tags(tagset->flags) ? 1 : + tagset->nr_hw_queues; + + for (i = 0; i < nr_tags; i++) { + if (tagset->tags && tagset->tags[i]) + __blk_mq_all_tag_iter(tagset->tags[i], fn, priv, + flags); + } + srcu_read_unlock(&tagset->tags_srcu, srcu_idx); +} + /** * blk_mq_tagset_busy_iter - iterate over all started requests in a tag set * @tagset: Tag set to iterate over. @@ -434,19 +452,7 @@ void blk_mq_all_tag_iter(struct blk_mq_tags *tags, blk_mq_rq_iter_fn *fn, void blk_mq_tagset_busy_iter(struct blk_mq_tag_set *tagset, blk_mq_rq_iter_fn *fn, void *priv) { - unsigned int flags = tagset->flags; - int i, nr_tags, srcu_idx; - - srcu_idx = srcu_read_lock(&tagset->tags_srcu); - - nr_tags = blk_mq_is_shared_tags(flags) ? 
1 : tagset->nr_hw_queues; - - for (i = 0; i < nr_tags; i++) { - if (tagset->tags && tagset->tags[i]) - __blk_mq_all_tag_iter(tagset->tags[i], fn, priv, - BT_TAG_ITER_STARTED); - } - srcu_read_unlock(&tagset->tags_srcu, srcu_idx); + __blk_mq_tagset_iter(tagset, fn, priv, BT_TAG_ITER_STARTED); } EXPORT_SYMBOL(blk_mq_tagset_busy_iter); Support iterating over all requests in a tag set, including requests that have not yet been started. A later patch will call this function from scsi_device_busy(). Cc: Jens Axboe Cc: Christoph Hellwig Cc: Ming Lei Cc: John Garry Cc: Hannes Reinecke Signed-off-by: Bart Van Assche --- block/blk-mq-tag.c | 19 +++++++++++++++++++ include/linux/blk-mq.h | 2 ++ 2 files changed, 21 insertions(+) diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c index f169beeded64..f277ed7e7743 100644 --- a/block/blk-mq-tag.c +++ b/block/blk-mq-tag.c @@ -456,6 +456,25 @@ void blk_mq_tagset_busy_iter(struct blk_mq_tag_set *tagset, } EXPORT_SYMBOL(blk_mq_tagset_busy_iter); +/** + * blk_mq_tagset_iter - iterate over all requests in a tag set + * @tagset: Tag set to iterate over. + * @fn: Pointer to the function that will be called for each request. + * @fn will be called as follows: @fn(rq, @priv) where rq is a + * pointer to a request. Return true to continue iterating tags, + * false to stop. + * @priv: Will be passed as second argument to @fn. + * + * We grab one request reference before calling @fn and release it after + * @fn returns. 
+ */ +void blk_mq_tagset_iter(struct blk_mq_tag_set *tagset, blk_mq_rq_iter_fn *fn, + void *priv) +{ + __blk_mq_tagset_iter(tagset, fn, priv, 0); +} +EXPORT_SYMBOL(blk_mq_tagset_iter); + static bool blk_mq_tagset_count_completed_rqs(struct request *rq, void *data) { unsigned *count = data; diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index 3467cacb281c..20a22c1cd067 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -927,6 +927,8 @@ void blk_mq_run_hw_queues(struct request_queue *q, bool async); void blk_mq_delay_run_hw_queues(struct request_queue *q, unsigned long msecs); void blk_mq_tagset_busy_iter(struct blk_mq_tag_set *tagset, blk_mq_rq_iter_fn *fn, void *priv); +void blk_mq_tagset_iter(struct blk_mq_tag_set *tagset, blk_mq_rq_iter_fn *fn, + void *priv); void blk_mq_tagset_wait_completed_request(struct blk_mq_tag_set *tagset); void blk_mq_freeze_queue_nomemsave(struct request_queue *q); void blk_mq_unfreeze_queue_nomemrestore(struct request_queue *q); Since a single hardware queue is used by ATA drivers, the request tag uniquely identifies in-flight commands. Stop using the SCSI budget token to prepare for no longer allocating a budget token if possible. The modified code was introduced by commit 4f1a22ee7b57 ("libata: Improve ATA queued command allocation"). 
Cc: Damien Le Moal Cc: Niklas Cassel Cc: John Garry Cc: Christoph Hellwig Signed-off-by: Bart Van Assche --- drivers/ata/libata-scsi.c | 18 ++++++------------ 1 file changed, 6 insertions(+), 12 deletions(-) diff --git a/drivers/ata/libata-scsi.c b/drivers/ata/libata-scsi.c index 026122bb6f2f..90f5422fa08b 100644 --- a/drivers/ata/libata-scsi.c +++ b/drivers/ata/libata-scsi.c @@ -744,22 +744,16 @@ static struct ata_queued_cmd *ata_scsi_qc_new(struct ata_device *dev, { struct ata_port *ap = dev->link->ap; struct ata_queued_cmd *qc; - int tag; + int tag = scsi_cmd_to_rq(cmd)->tag; if (unlikely(ata_port_is_frozen(ap))) goto fail; - if (ap->flags & ATA_FLAG_SAS_HOST) { - /* - * SAS hosts may queue > ATA_MAX_QUEUE commands so use - * unique per-device budget token as a tag. - */ - if (WARN_ON_ONCE(cmd->budget_token >= ATA_MAX_QUEUE)) - goto fail; - tag = cmd->budget_token; - } else { - tag = scsi_cmd_to_rq(cmd)->tag; - } + WARN_ON_ONCE(cmd->device->host->tag_set.nr_hw_queues > 1); + + /* SAS hosts may queue > ATA_MAX_QUEUE commands. */ + if (ap->flags & ATA_FLAG_SAS_HOST && WARN_ON_ONCE(tag >= ATA_MAX_QUEUE)) + goto fail; qc = __ata_qc_from_tag(ap, tag); qc->tag = qc->hw_tag = tag; Instead of only handling dev->budget_map.map != NULL, also handle dev->budget_map.map == NULL. This patch prepares for supporting logical units without budget map (sdev->budget_map.map == NULL). 
Cc: Jens Axboe Cc: Christoph Hellwig Cc: Ming Lei Cc: John Garry Cc: Hannes Reinecke Signed-off-by: Bart Van Assche --- drivers/scsi/scsi_lib.c | 38 ++++++++++++++++++++++++++++++++++++++ include/scsi/scsi_device.h | 5 +---- 2 files changed, 39 insertions(+), 4 deletions(-) diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c index 51ad2ad07e43..ddc51472b5eb 100644 --- a/drivers/scsi/scsi_lib.c +++ b/drivers/scsi/scsi_lib.c @@ -446,6 +446,44 @@ static void scsi_single_lun_run(struct scsi_device *current_sdev) spin_unlock_irqrestore(shost->host_lock, flags); } +struct sdev_cmds_allocated_data { + const struct scsi_device *sdev; + int count; +}; + +static bool scsi_device_check_allocated(struct request *rq, void *data) +{ + struct scsi_cmnd *cmd = blk_mq_rq_to_pdu(rq); + struct sdev_cmds_allocated_data *sifd = data; + + if (cmd->device == sifd->sdev) + sifd->count++; + + return true; +} + +/** + * scsi_device_busy() - Number of commands allocated for a SCSI device + * @sdev: SCSI device. + * + * Note: There is a subtle difference between this function and + * scsi_host_busy(). scsi_host_busy() counts the number of commands that have + * been started. This function counts the number of commands that have been + * allocated. At least the UFS driver depends on this function counting commands + * that have already been allocated but that have not yet been started. 
+ */ +int scsi_device_busy(const struct scsi_device *sdev) +{ + struct sdev_cmds_allocated_data sifd = { .sdev = sdev }; + struct blk_mq_tag_set *set = &sdev->host->tag_set; + + if (sdev->budget_map.map) + return sbitmap_weight(&sdev->budget_map); + blk_mq_tagset_iter(set, scsi_device_check_allocated, &sifd); + return sifd.count; +} +EXPORT_SYMBOL(scsi_device_busy); + static inline bool scsi_device_is_busy(struct scsi_device *sdev) { if (scsi_device_busy(sdev) >= sdev->queue_depth) diff --git a/include/scsi/scsi_device.h b/include/scsi/scsi_device.h index d32f5841f4f8..0dd078ac9b89 100644 --- a/include/scsi/scsi_device.h +++ b/include/scsi/scsi_device.h @@ -713,10 +713,7 @@ static inline int scsi_device_supports_vpd(struct scsi_device *sdev) return 0; } -static inline int scsi_device_busy(struct scsi_device *sdev) -{ - return sbitmap_weight(&sdev->budget_map); -} +int scsi_device_busy(const struct scsi_device *sdev); /* Macros to access the UNIT ATTENTION counters */ #define scsi_get_ua_new_media_ctr(sdev) atomic_read(&sdev->ua_new_media_ctr) The SCSI core uses the budget map to restrict the number of commands that are in flight per logical unit. That limit check can be left out if host->cmd_per_lun >= host->can_queue and if the host tag set is shared across all hardware queues or if there is only one hardware queue. Since scsi_mq_get_budget() shows up in all CPU profiles for fast SCSI devices, do not allocate a budget map if cmd_per_lun >= can_queue and if the host tag set is shared across all hardware queues. 
For the following test this patch increases IOPS by 5%: modprobe scsi_debug delay=0 no_rwlock=1 host_max_queue=192 submit_queues=$(nproc) fio --bs=4096 --disable_clat=1 --disable_slat=1 --group_reporting=1 \ --gtod_reduce=1 --invalidate=1 --ioengine=io_uring --ioscheduler=none \ --norandommap --runtime=60 --rw=randread --thread --time_based=1 \ --buffered=0 --numjobs=1 --iodepth=192 --iodepth_batch=24 --name=/dev/sda \ --filename=/dev/sda Cc: Jens Axboe Cc: Christoph Hellwig Cc: Ming Lei Cc: John Garry Cc: Hannes Reinecke Signed-off-by: Bart Van Assche --- drivers/scsi/scsi.c | 6 ++---- drivers/scsi/scsi_scan.c | 18 +++++++++++++++++- 2 files changed, 19 insertions(+), 5 deletions(-) diff --git a/drivers/scsi/scsi.c b/drivers/scsi/scsi.c index 76cdad063f7b..3dc93dd9fda2 100644 --- a/drivers/scsi/scsi.c +++ b/drivers/scsi/scsi.c @@ -216,9 +216,6 @@ int scsi_device_max_queue_depth(struct scsi_device *sdev) */ int scsi_change_queue_depth(struct scsi_device *sdev, int depth) { - if (!sdev->budget_map.map) - return -EINVAL; - depth = min_t(int, depth, scsi_device_max_queue_depth(sdev)); if (depth > 0) { @@ -229,7 +226,8 @@ int scsi_change_queue_depth(struct scsi_device *sdev, int depth) if (sdev->request_queue) blk_set_queue_depth(sdev->request_queue, depth); - sbitmap_resize(&sdev->budget_map, sdev->queue_depth); + if (sdev->budget_map.map) + sbitmap_resize(&sdev->budget_map, sdev->queue_depth); return sdev->queue_depth; } diff --git a/drivers/scsi/scsi_scan.c b/drivers/scsi/scsi_scan.c index 7acbfcfc2172..99b82e28f292 100644 --- a/drivers/scsi/scsi_scan.c +++ b/drivers/scsi/scsi_scan.c @@ -215,9 +215,17 @@ static void scsi_unlock_floptical(struct scsi_device *sdev, SCSI_TIMEOUT, 3, NULL); } +static bool scsi_needs_budget_map(struct Scsi_Host *shost, unsigned int depth) +{ + if (shost->host_tagset || shost->tag_set.nr_hw_queues == 1) + return depth < shost->can_queue; + return true; +} + static int scsi_realloc_sdev_budget_map(struct scsi_device *sdev, unsigned int 
depth) { + struct Scsi_Host *shost = sdev->host; int new_shift = sbitmap_calculate_shift(depth); bool need_alloc = !sdev->budget_map.map; bool need_free = false; @@ -225,6 +233,13 @@ static int scsi_realloc_sdev_budget_map(struct scsi_device *sdev, int ret; struct sbitmap sb_backup; + if (!scsi_needs_budget_map(shost, depth)) { + memflags = blk_mq_freeze_queue(sdev->request_queue); + sbitmap_free(&sdev->budget_map); + blk_mq_unfreeze_queue(sdev->request_queue, memflags); + return 0; + } + depth = min_t(unsigned int, depth, scsi_device_max_queue_depth(sdev)); /* @@ -1120,7 +1135,8 @@ static int scsi_add_lun(struct scsi_device *sdev, unsigned char *inq_result, scsi_cdl_check(sdev); sdev->max_queue_depth = sdev->queue_depth; - WARN_ON_ONCE(sdev->max_queue_depth > sdev->budget_map.depth); + WARN_ON_ONCE(sdev->budget_map.map && + sdev->max_queue_depth > sdev->budget_map.depth); /* * Ok, the device is now all set up, we can