Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions block/blk-mq-debugfs.c
Original file line number Diff line number Diff line change
Expand Up @@ -162,6 +162,7 @@ static const char *const hctx_state_name[] = {
HCTX_STATE_NAME(TAG_ACTIVE),
HCTX_STATE_NAME(SCHED_RESTART),
HCTX_STATE_NAME(INACTIVE),
HCTX_STATE_NAME(IDLE),
};
#undef HCTX_STATE_NAME

Expand Down
36 changes: 36 additions & 0 deletions block/blk-mq.c
Original file line number Diff line number Diff line change
Expand Up @@ -112,6 +112,31 @@ void blk_mq_in_driver_rw(struct block_device *part, unsigned int inflight[2])
inflight[WRITE] = mi.inflight[WRITE];
}

/*
 * __blk_update_hw_queue_idle - set or clear BLK_MQ_S_IDLE on all hw queues
 * @q:    request queue whose hardware contexts are updated
 * @idle: true to set the idle bit, false to clear it
 */
static void __blk_update_hw_queue_idle(struct request_queue *q, bool idle)
{
	struct blk_mq_hw_ctx *hctx;
	unsigned long idx;

	/* Hoist the loop-invariant branch; same atomic bit ops either way. */
	if (idle) {
		queue_for_each_hw_ctx(q, hctx, idx)
			set_bit(BLK_MQ_S_IDLE, &hctx->state);
	} else {
		queue_for_each_hw_ctx(q, hctx, idx)
			clear_bit(BLK_MQ_S_IDLE, &hctx->state);
	}
}

/**
 * blk_mq_set_hw_queues_idle - mark every hardware queue of @q as idle
 * @q: request queue
 *
 * Sets BLK_MQ_S_IDLE on each hardware context of @q.
 */
void blk_mq_set_hw_queues_idle(struct request_queue *q)
{
	__blk_update_hw_queue_idle(q, true);
}
EXPORT_SYMBOL_GPL(blk_mq_set_hw_queues_idle);

/**
 * blk_mq_clear_hw_queues_idle - clear the idle mark on every hardware queue
 * @q: request queue
 *
 * Clears BLK_MQ_S_IDLE on each hardware context of @q.
 */
void blk_mq_clear_hw_queues_idle(struct request_queue *q)
{
	__blk_update_hw_queue_idle(q, false);
}
EXPORT_SYMBOL_GPL(blk_mq_clear_hw_queues_idle);

#ifdef CONFIG_LOCKDEP
static bool blk_freeze_set_owner(struct request_queue *q,
struct task_struct *owner)
Expand Down Expand Up @@ -3679,6 +3704,17 @@ static bool blk_mq_has_request(struct request *rq, void *data)

if (rq->mq_hctx != iter_data->hctx)
return true;

/*
* The driver ensures that all hardware queue resources are freed, even
* if a request has a tag allocated to a CPU that is going offline. This
* applies to requests not yet handed to the hardware. Essentially those
* 'in-flight' between the block layer and the hardware (e.g., a request
* blocked because the queue is quiesced).
*/
if (test_bit(BLK_MQ_S_IDLE, &iter_data->hctx->state))
return false;

iter_data->has_rq = true;
return false;
}
Expand Down
83 changes: 82 additions & 1 deletion drivers/nvme/host/core.c
Original file line number Diff line number Diff line change
Expand Up @@ -454,6 +454,51 @@ void nvme_end_req(struct request *req)
blk_mq_end_request(req, status);
}

/*
 * nvme_failover_req - retry a failed request on another path.
 *
 * Steals the bios off @req onto the ns_head requeue list, completes @req
 * with a cleared status, and kicks the requeue worker so the bios get
 * resubmitted (through the multipath node when the head is multipathed).
 */
static void nvme_failover_req(struct request *req)
{
	struct nvme_ns *ns = req->q->queuedata;
	u16 status = nvme_req(req)->status & NVME_SCT_SC_MASK;
	unsigned long flags;
	struct bio *bio;

	/* Only a multipath head has a "current path" worth invalidating. */
	if (nvme_ns_head_multipath(ns->head))
		nvme_mpath_clear_current_path(ns);

	/*
	 * If we got back an ANA error, we know the controller is alive but not
	 * ready to serve this namespace. Kick off a re-read of the ANA
	 * information page, and just try any other available path for now.
	 */
	if (nvme_is_ana_error(status) && ns->ctrl->ana_log_buf) {
		set_bit(NVME_NS_ANA_PENDING, &ns->flags);
		queue_work(nvme_wq, &ns->ctrl->ana_work);
	}

	spin_lock_irqsave(&ns->head->requeue_lock, flags);
	for (bio = req->bio; bio; bio = bio->bi_next) {
		/*
		 * Retarget the bio at the multipath node so resubmission may
		 * pick a different path.
		 */
		if (nvme_ns_head_multipath(ns->head))
			bio_set_dev(bio, ns->head->disk->part0);
		/* The requeued submission is not polled; drop polled state. */
		if (bio->bi_opf & REQ_POLLED) {
			bio->bi_opf &= ~REQ_POLLED;
			bio->bi_cookie = BLK_QC_T_NONE;
		}
		/*
		 * The alternate request queue that we may end up submitting
		 * the bio to may be frozen temporarily, in this case REQ_NOWAIT
		 * will fail the I/O immediately with EAGAIN to the issuer.
		 * We are not in the issuer context which cannot block. Clear
		 * the flag to avoid spurious EAGAIN I/O failures.
		 */
		bio->bi_opf &= ~REQ_NOWAIT;
	}
	blk_steal_bios(&ns->head->requeue_list, req);
	spin_unlock_irqrestore(&ns->head->requeue_lock, flags);

	/* Complete @req as a success; its bios now live on the requeue list. */
	nvme_req(req)->status = 0;
	nvme_end_req(req);
	kblockd_schedule_work(&ns->head->requeue_work);
}

void nvme_complete_rq(struct request *req)
{
struct nvme_ctrl *ctrl = nvme_req(req)->ctrl;
Expand Down Expand Up @@ -762,8 +807,13 @@ blk_status_t nvme_fail_nonready_command(struct nvme_ctrl *ctrl,
state != NVME_CTRL_DELETING &&
state != NVME_CTRL_DEAD &&
!test_bit(NVME_CTRL_FAILFAST_EXPIRED, &ctrl->flags) &&
!blk_noretry_request(rq) && !(rq->cmd_flags & REQ_NVME_MPATH))
!blk_noretry_request(rq) && !(rq->cmd_flags & REQ_NVME_MPATH)) {
if (test_bit(BLK_MQ_S_INACTIVE, &rq->mq_hctx->state)) {
nvme_failover_req(rq);
return BLK_STS_OK;
}
return BLK_STS_RESOURCE;
}

if (!(rq->rq_flags & RQF_DONTPREP))
nvme_clear_nvme_request(rq);
Expand Down Expand Up @@ -809,6 +859,9 @@ bool __nvme_check_ready(struct nvme_ctrl *ctrl, struct request *rq,
}
}

if (test_bit(BLK_MQ_S_INACTIVE, &rq->mq_hctx->state))
return false;

return queue_live;
}
EXPORT_SYMBOL_GPL(__nvme_check_ready);
Expand Down Expand Up @@ -5297,6 +5350,34 @@ void nvme_unquiesce_admin_queue(struct nvme_ctrl *ctrl)
}
EXPORT_SYMBOL_GPL(nvme_unquiesce_admin_queue);

/*
 * Walk all namespaces of @ctrl under SRCU protection and set or clear the
 * hardware-queue idle state on each namespace's request queue.
 */
static void __nvme_set_hw_queues_idle(struct nvme_ctrl *ctrl, bool idle)
{
	struct nvme_ns *ns;
	int srcu_idx;

	/* SRCU read lock keeps the namespace list stable while iterating. */
	srcu_idx = srcu_read_lock(&ctrl->srcu);
	list_for_each_entry_srcu(ns, &ctrl->namespaces, list,
				srcu_read_lock_held(&ctrl->srcu)) {
		if (idle)
			blk_mq_set_hw_queues_idle(ns->queue);
		else
			blk_mq_clear_hw_queues_idle(ns->queue);
	}
	srcu_read_unlock(&ctrl->srcu, srcu_idx);
}

/**
 * nvme_set_hw_queues_idle - mark the hw queues of all @ctrl namespaces idle
 * @ctrl: controller whose namespace request queues are updated
 */
void nvme_set_hw_queues_idle(struct nvme_ctrl *ctrl)
{
	__nvme_set_hw_queues_idle(ctrl, true);
}
EXPORT_SYMBOL_GPL(nvme_set_hw_queues_idle);

/**
 * nvme_clear_hw_queues_idle - clear the idle mark on all @ctrl namespaces
 * @ctrl: controller whose namespace request queues are updated
 */
void nvme_clear_hw_queues_idle(struct nvme_ctrl *ctrl)
{
	__nvme_set_hw_queues_idle(ctrl, false);
}
EXPORT_SYMBOL_GPL(nvme_clear_hw_queues_idle);

void nvme_sync_io_queues(struct nvme_ctrl *ctrl)
{
struct nvme_ns *ns;
Expand Down
43 changes: 0 additions & 43 deletions drivers/nvme/host/multipath.c
Original file line number Diff line number Diff line change
Expand Up @@ -134,49 +134,6 @@ void nvme_mpath_start_freeze(struct nvme_subsystem *subsys)
blk_freeze_queue_start(h->disk->queue);
}

/*
 * nvme_failover_req - retry a failed multipath request on another path.
 *
 * NOTE(review): this is the multipath.c variant being removed by this diff;
 * the replacement lives in core.c and additionally guards the multipath-only
 * steps with nvme_ns_head_multipath().
 */
void nvme_failover_req(struct request *req)
{
	struct nvme_ns *ns = req->q->queuedata;
	u16 status = nvme_req(req)->status & NVME_SCT_SC_MASK;
	unsigned long flags;
	struct bio *bio;

	nvme_mpath_clear_current_path(ns);

	/*
	 * If we got back an ANA error, we know the controller is alive but not
	 * ready to serve this namespace. Kick off a re-read of the ANA
	 * information page, and just try any other available path for now.
	 */
	if (nvme_is_ana_error(status) && ns->ctrl->ana_log_buf) {
		set_bit(NVME_NS_ANA_PENDING, &ns->flags);
		queue_work(nvme_wq, &ns->ctrl->ana_work);
	}

	spin_lock_irqsave(&ns->head->requeue_lock, flags);
	for (bio = req->bio; bio; bio = bio->bi_next) {
		/* Retarget at the multipath node for resubmission. */
		bio_set_dev(bio, ns->head->disk->part0);
		/* The requeued submission is not polled; drop polled state. */
		if (bio->bi_opf & REQ_POLLED) {
			bio->bi_opf &= ~REQ_POLLED;
			bio->bi_cookie = BLK_QC_T_NONE;
		}
		/*
		 * The alternate request queue that we may end up submitting
		 * the bio to may be frozen temporarily, in this case REQ_NOWAIT
		 * will fail the I/O immediately with EAGAIN to the issuer.
		 * We are not in the issuer context which cannot block. Clear
		 * the flag to avoid spurious EAGAIN I/O failures.
		 */
		bio->bi_opf &= ~REQ_NOWAIT;
	}
	blk_steal_bios(&ns->head->requeue_list, req);
	spin_unlock_irqrestore(&ns->head->requeue_lock, flags);

	/* Complete @req as a success; its bios now live on the requeue list. */
	nvme_req(req)->status = 0;
	nvme_end_req(req);
	kblockd_schedule_work(&ns->head->requeue_work);
}

void nvme_mpath_start_request(struct request *rq)
{
struct nvme_ns *ns = rq->q->queuedata;
Expand Down
3 changes: 2 additions & 1 deletion drivers/nvme/host/nvme.h
Original file line number Diff line number Diff line change
Expand Up @@ -889,6 +889,8 @@ void nvme_quiesce_io_queues(struct nvme_ctrl *ctrl);
void nvme_unquiesce_io_queues(struct nvme_ctrl *ctrl);
void nvme_quiesce_admin_queue(struct nvme_ctrl *ctrl);
void nvme_unquiesce_admin_queue(struct nvme_ctrl *ctrl);
void nvme_set_hw_queues_idle(struct nvme_ctrl *ctrl);
void nvme_clear_hw_queues_idle(struct nvme_ctrl *ctrl);
void nvme_mark_namespaces_dead(struct nvme_ctrl *ctrl);
void nvme_sync_queues(struct nvme_ctrl *ctrl);
void nvme_sync_io_queues(struct nvme_ctrl *ctrl);
Expand Down Expand Up @@ -1022,7 +1024,6 @@ void nvme_mpath_unfreeze(struct nvme_subsystem *subsys);
void nvme_mpath_wait_freeze(struct nvme_subsystem *subsys);
void nvme_mpath_start_freeze(struct nvme_subsystem *subsys);
void nvme_mpath_default_iopolicy(struct nvme_subsystem *subsys);
void nvme_failover_req(struct request *req);
void nvme_kick_requeue_lists(struct nvme_ctrl *ctrl);
int nvme_mpath_alloc_disk(struct nvme_ctrl *ctrl,struct nvme_ns_head *head);
void nvme_mpath_add_sysfs_link(struct nvme_ns_head *ns);
Expand Down
3 changes: 3 additions & 0 deletions drivers/nvme/host/pci.c
Original file line number Diff line number Diff line change
Expand Up @@ -3374,6 +3374,8 @@ static void nvme_reset_work(struct work_struct *work)
nvme_unquiesce_admin_queue(&dev->ctrl);
mutex_unlock(&dev->shutdown_lock);

nvme_set_hw_queues_idle(&dev->ctrl);

/*
* Introduce CONNECTING state from nvme-fc/rdma transports to mark the
* initializing procedure here.
Expand Down Expand Up @@ -3406,6 +3408,7 @@ static void nvme_reset_work(struct work_struct *work)
if (result)
goto out;

nvme_clear_hw_queues_idle(&dev->ctrl);
/*
* Freeze and update the number of I/O queues as those might have
* changed. If there are no I/O queues left after this reset, keep the
Expand Down
3 changes: 3 additions & 0 deletions include/linux/blk-mq.h
Original file line number Diff line number Diff line change
Expand Up @@ -721,6 +721,7 @@ enum {
BLK_MQ_S_SCHED_RESTART,
/* hw queue is inactive after all its CPUs become offline */
BLK_MQ_S_INACTIVE,
BLK_MQ_S_IDLE,
BLK_MQ_S_MAX
};

Expand Down Expand Up @@ -934,6 +935,8 @@ void blk_mq_stop_hw_queues(struct request_queue *q);
void blk_mq_start_hw_queues(struct request_queue *q);
void blk_mq_start_stopped_hw_queue(struct blk_mq_hw_ctx *hctx, bool async);
void blk_mq_start_stopped_hw_queues(struct request_queue *q, bool async);
void blk_mq_set_hw_queues_idle(struct request_queue *q);
void blk_mq_clear_hw_queues_idle(struct request_queue *q);
void blk_mq_quiesce_queue(struct request_queue *q);
void blk_mq_wait_quiesce_done(struct blk_mq_tag_set *set);
void blk_mq_quiesce_tagset(struct blk_mq_tag_set *set);
Expand Down
21 changes: 5 additions & 16 deletions lib/group_cpus.c
Original file line number Diff line number Diff line change
Expand Up @@ -510,25 +510,13 @@ struct cpumask *group_cpus_evenly(unsigned int numgrps, unsigned int *nummasks)
if (!masks)
goto fail_node_to_cpumask;

/* Stabilize the cpumasks */
cpus_read_lock();
build_node_to_cpumask(node_to_cpumask);

/*
* Make a local cache of 'cpu_present_mask', so the two stages
* spread can observe consistent 'cpu_present_mask' without holding
* cpu hotplug lock, then we can reduce deadlock risk with cpu
* hotplug code.
*
* Here CPU hotplug may happen when reading `cpu_present_mask`, and
* we can live with the case because it only affects that hotplug
* CPU is handled in the 1st or 2nd stage, and either way is correct
* from API user viewpoint since 2-stage spread is sort of
* optimization.
*/
cpumask_copy(npresmsk, data_race(cpu_present_mask));

/* grouping present CPUs first */
ret = __group_cpus_evenly(curgrp, numgrps, node_to_cpumask,
npresmsk, nmsk, masks);
cpu_present_mask, nmsk, masks);
if (ret < 0)
goto fail_node_to_cpumask;
nr_present = ret;
Expand All @@ -543,13 +531,14 @@ struct cpumask *group_cpus_evenly(unsigned int numgrps, unsigned int *nummasks)
curgrp = 0;
else
curgrp = nr_present;
cpumask_andnot(npresmsk, cpu_possible_mask, npresmsk);
cpumask_andnot(npresmsk, cpu_possible_mask, cpu_present_mask);
ret = __group_cpus_evenly(curgrp, numgrps, node_to_cpumask,
npresmsk, nmsk, masks);
if (ret >= 0)
nr_others = ret;

fail_node_to_cpumask:
cpus_read_unlock();
free_node_to_cpumask(node_to_cpumask);

fail_npresmsk:
Expand Down