From ca77ce5db7ebf1d495084346a820dbbb66769cab Mon Sep 17 00:00:00 2001 From: Daniel Wagner Date: Thu, 26 Feb 2026 14:40:35 +0100 Subject: [PATCH 1/3] nvme: failover requests for inactive hctx When the ctrl is not in LIVE state, a hardware queue can be in the INACTIVE state due to CPU hotplug offlining operations. In this case, the driver will freeze and quiesce the request queue and doesn't expect new requests entering via queue_rq. Though a request will fail eventually, shortcut it and fail it earlier. Check if a request is targeted for an inactive hardware queue and use nvme_failover_req to hand it back to the block layer. Signed-off-by: Daniel Wagner --- drivers/nvme/host/core.c | 55 ++++++++++++++++++++++++++++++++++- drivers/nvme/host/multipath.c | 43 --------------------------- drivers/nvme/host/nvme.h | 1 - 3 files changed, 54 insertions(+), 45 deletions(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 58bf432ec5e6..1326acd9b253 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -454,6 +454,51 @@ void nvme_end_req(struct request *req) blk_mq_end_request(req, status); } +static void nvme_failover_req(struct request *req) +{ + struct nvme_ns *ns = req->q->queuedata; + u16 status = nvme_req(req)->status & NVME_SCT_SC_MASK; + unsigned long flags; + struct bio *bio; + + if (nvme_ns_head_multipath(ns->head)) + nvme_mpath_clear_current_path(ns); + + /* + * If we got back an ANA error, we know the controller is alive but not + * ready to serve this namespace. Kick off a re-read of the ANA + * information page, and just try any other available path for now. 
+ */ + if (nvme_is_ana_error(status) && ns->ctrl->ana_log_buf) { + set_bit(NVME_NS_ANA_PENDING, &ns->flags); + queue_work(nvme_wq, &ns->ctrl->ana_work); + } + + spin_lock_irqsave(&ns->head->requeue_lock, flags); + for (bio = req->bio; bio; bio = bio->bi_next) { + if (nvme_ns_head_multipath(ns->head)) + bio_set_dev(bio, ns->head->disk->part0); + if (bio->bi_opf & REQ_POLLED) { + bio->bi_opf &= ~REQ_POLLED; + bio->bi_cookie = BLK_QC_T_NONE; + } + /* + * The alternate request queue that we may end up submitting + * the bio to may be frozen temporarily, in this case REQ_NOWAIT + * will fail the I/O immediately with EAGAIN to the issuer. + * We are not in the issuer context which cannot block. Clear + * the flag to avoid spurious EAGAIN I/O failures. + */ + bio->bi_opf &= ~REQ_NOWAIT; + } + blk_steal_bios(&ns->head->requeue_list, req); + spin_unlock_irqrestore(&ns->head->requeue_lock, flags); + + nvme_req(req)->status = 0; + nvme_end_req(req); + kblockd_schedule_work(&ns->head->requeue_work); +} + void nvme_complete_rq(struct request *req) { struct nvme_ctrl *ctrl = nvme_req(req)->ctrl; @@ -762,8 +807,13 @@ blk_status_t nvme_fail_nonready_command(struct nvme_ctrl *ctrl, state != NVME_CTRL_DELETING && state != NVME_CTRL_DEAD && !test_bit(NVME_CTRL_FAILFAST_EXPIRED, &ctrl->flags) && - !blk_noretry_request(rq) && !(rq->cmd_flags & REQ_NVME_MPATH)) + !blk_noretry_request(rq) && !(rq->cmd_flags & REQ_NVME_MPATH)) { + if (test_bit(BLK_MQ_S_INACTIVE, &rq->mq_hctx->state)) { + nvme_failover_req(rq); + return BLK_STS_OK; + } return BLK_STS_RESOURCE; + } if (!(rq->rq_flags & RQF_DONTPREP)) nvme_clear_nvme_request(rq); @@ -809,6 +859,9 @@ bool __nvme_check_ready(struct nvme_ctrl *ctrl, struct request *rq, } } + if (test_bit(BLK_MQ_S_INACTIVE, &rq->mq_hctx->state)) + return false; + return queue_live; } EXPORT_SYMBOL_GPL(__nvme_check_ready); diff --git a/drivers/nvme/host/multipath.c b/drivers/nvme/host/multipath.c index fc6800a9f7f9..8a5737d9d6a9 100644 --- 
a/drivers/nvme/host/multipath.c +++ b/drivers/nvme/host/multipath.c @@ -134,49 +134,6 @@ void nvme_mpath_start_freeze(struct nvme_subsystem *subsys) blk_freeze_queue_start(h->disk->queue); } -void nvme_failover_req(struct request *req) -{ - struct nvme_ns *ns = req->q->queuedata; - u16 status = nvme_req(req)->status & NVME_SCT_SC_MASK; - unsigned long flags; - struct bio *bio; - - nvme_mpath_clear_current_path(ns); - - /* - * If we got back an ANA error, we know the controller is alive but not - * ready to serve this namespace. Kick of a re-read of the ANA - * information page, and just try any other available path for now. - */ - if (nvme_is_ana_error(status) && ns->ctrl->ana_log_buf) { - set_bit(NVME_NS_ANA_PENDING, &ns->flags); - queue_work(nvme_wq, &ns->ctrl->ana_work); - } - - spin_lock_irqsave(&ns->head->requeue_lock, flags); - for (bio = req->bio; bio; bio = bio->bi_next) { - bio_set_dev(bio, ns->head->disk->part0); - if (bio->bi_opf & REQ_POLLED) { - bio->bi_opf &= ~REQ_POLLED; - bio->bi_cookie = BLK_QC_T_NONE; - } - /* - * The alternate request queue that we may end up submitting - * the bio to may be frozen temporarily, in this case REQ_NOWAIT - * will fail the I/O immediately with EAGAIN to the issuer. - * We are not in the issuer context which cannot block. Clear - * the flag to avoid spurious EAGAIN I/O failures. 
- */ - bio->bi_opf &= ~REQ_NOWAIT; - } - blk_steal_bios(&ns->head->requeue_list, req); - spin_unlock_irqrestore(&ns->head->requeue_lock, flags); - - nvme_req(req)->status = 0; - nvme_end_req(req); - kblockd_schedule_work(&ns->head->requeue_work); -} - void nvme_mpath_start_request(struct request *rq) { struct nvme_ns *ns = rq->q->queuedata; diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h index 9971045dbc05..4d92bf47a7ac 100644 --- a/drivers/nvme/host/nvme.h +++ b/drivers/nvme/host/nvme.h @@ -1022,7 +1022,6 @@ void nvme_mpath_unfreeze(struct nvme_subsystem *subsys); void nvme_mpath_wait_freeze(struct nvme_subsystem *subsys); void nvme_mpath_start_freeze(struct nvme_subsystem *subsys); void nvme_mpath_default_iopolicy(struct nvme_subsystem *subsys); -void nvme_failover_req(struct request *req); void nvme_kick_requeue_lists(struct nvme_ctrl *ctrl); int nvme_mpath_alloc_disk(struct nvme_ctrl *ctrl,struct nvme_ns_head *head); void nvme_mpath_add_sysfs_link(struct nvme_ns *ns); From bfa7d775b7f79684acb20a1427f0a6e74233c6f0 Mon Sep 17 00:00:00 2001 From: Daniel Wagner Date: Thu, 26 Feb 2026 14:40:36 +0100 Subject: [PATCH 2/3] blk-mq: add handshake for offlining hw queues The CPU hotplug offline handler in the block layer checks for any in-flight requests on a CPU going offline. It prevents the CPU hotplug state engine from progressing as long as there are pending requests. This is done by checking for any allocated requests on the hardware context that is going offline. The driver is responsible for completing all in-flight requests. However, the driver might be performing error recovery simultaneously. Therefore, the request queue might be in a frozen or quiesced state. In this case, requests may not make progress (see blk_mq_sched_dispatch_requests for an example). Introduce an explicit handshake protocol between the driver and the block layer. This allows the driver to signal when it is safe to ignore any remaining pending requests. 
Signed-off-by: Daniel Wagner --- block/blk-mq-debugfs.c | 1 + block/blk-mq.c | 36 ++++++++++++++++++++++++++++++++++++ drivers/nvme/host/core.c | 28 ++++++++++++++++++++++++++++ drivers/nvme/host/nvme.h | 2 ++ drivers/nvme/host/pci.c | 3 +++ include/linux/blk-mq.h | 3 +++ 6 files changed, 73 insertions(+) diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c index 28167c9baa55..a312cb6b6127 100644 --- a/block/blk-mq-debugfs.c +++ b/block/blk-mq-debugfs.c @@ -162,6 +162,7 @@ static const char *const hctx_state_name[] = { HCTX_STATE_NAME(TAG_ACTIVE), HCTX_STATE_NAME(SCHED_RESTART), HCTX_STATE_NAME(INACTIVE), + HCTX_STATE_NAME(IDLE), }; #undef HCTX_STATE_NAME diff --git a/block/blk-mq.c b/block/blk-mq.c index 3da2215b2912..b022d6a2bb88 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -112,6 +112,31 @@ void blk_mq_in_driver_rw(struct block_device *part, unsigned int inflight[2]) inflight[WRITE] = mi.inflight[WRITE]; } +static void __blk_update_hw_queue_idle(struct request_queue *q, bool idle) +{ + struct blk_mq_hw_ctx *hctx; + unsigned long i; + + queue_for_each_hw_ctx(q, hctx, i) { + if (idle) + set_bit(BLK_MQ_S_IDLE, &hctx->state); + else + clear_bit(BLK_MQ_S_IDLE, &hctx->state); + } +} + +void blk_mq_set_hw_queues_idle(struct request_queue *q) +{ + __blk_update_hw_queue_idle(q, true); +} +EXPORT_SYMBOL_GPL(blk_mq_set_hw_queues_idle); + +void blk_mq_clear_hw_queues_idle(struct request_queue *q) +{ + __blk_update_hw_queue_idle(q, false); +} +EXPORT_SYMBOL_GPL(blk_mq_clear_hw_queues_idle); + #ifdef CONFIG_LOCKDEP static bool blk_freeze_set_owner(struct request_queue *q, struct task_struct *owner) @@ -3679,6 +3704,17 @@ static bool blk_mq_has_request(struct request *rq, void *data) if (rq->mq_hctx != iter_data->hctx) return true; + + /* + * The driver ensures that all hardware queue resources are freed, even + * if a request has a tag allocated to a CPU that is going offline. This + * applies to requests not yet handed to the hardware. 
Essentially those + * 'in-flight' between the block layer and the hardware (e.g., a request + * blocked because the queue is quiesced). + */ + if (test_bit(BLK_MQ_S_IDLE, &iter_data->hctx->state)) + return false; + iter_data->has_rq = true; return false; } diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 1326acd9b253..224852b3807f 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -5350,6 +5350,34 @@ void nvme_unquiesce_admin_queue(struct nvme_ctrl *ctrl) } EXPORT_SYMBOL_GPL(nvme_unquiesce_admin_queue); +static void __nvme_set_hw_queues_idle(struct nvme_ctrl *ctrl, bool idle) +{ + struct nvme_ns *ns; + int srcu_idx; + + srcu_idx = srcu_read_lock(&ctrl->srcu); + list_for_each_entry_srcu(ns, &ctrl->namespaces, list, + srcu_read_lock_held(&ctrl->srcu)) { + if (idle) + blk_mq_set_hw_queues_idle(ns->queue); + else + blk_mq_clear_hw_queues_idle(ns->queue); + } + srcu_read_unlock(&ctrl->srcu, srcu_idx); +} + +void nvme_set_hw_queues_idle(struct nvme_ctrl *ctrl) +{ + __nvme_set_hw_queues_idle(ctrl, true); +} +EXPORT_SYMBOL_GPL(nvme_set_hw_queues_idle); + +void nvme_clear_hw_queues_idle(struct nvme_ctrl *ctrl) +{ + __nvme_set_hw_queues_idle(ctrl, false); +} +EXPORT_SYMBOL_GPL(nvme_clear_hw_queues_idle); + void nvme_sync_io_queues(struct nvme_ctrl *ctrl) { struct nvme_ns *ns; diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h index 4d92bf47a7ac..ae45b7fc30a8 100644 --- a/drivers/nvme/host/nvme.h +++ b/drivers/nvme/host/nvme.h @@ -889,6 +889,8 @@ void nvme_quiesce_io_queues(struct nvme_ctrl *ctrl); void nvme_unquiesce_io_queues(struct nvme_ctrl *ctrl); void nvme_quiesce_admin_queue(struct nvme_ctrl *ctrl); void nvme_unquiesce_admin_queue(struct nvme_ctrl *ctrl); +void nvme_set_hw_queues_idle(struct nvme_ctrl *ctrl); +void nvme_clear_hw_queues_idle(struct nvme_ctrl *ctrl); void nvme_mark_namespaces_dead(struct nvme_ctrl *ctrl); void nvme_sync_queues(struct nvme_ctrl *ctrl); void nvme_sync_io_queues(struct nvme_ctrl 
*ctrl); diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index 8d09fa7d7ff9..6da0dd785b2c 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -3374,6 +3374,8 @@ static void nvme_reset_work(struct work_struct *work) nvme_unquiesce_admin_queue(&dev->ctrl); mutex_unlock(&dev->shutdown_lock); + nvme_set_hw_queues_idle(&dev->ctrl); + /* * Introduce CONNECTING state from nvme-fc/rdma transports to mark the * initializing procedure here. @@ -3406,6 +3408,7 @@ static void nvme_reset_work(struct work_struct *work) if (result) goto out; + nvme_clear_hw_queues_idle(&dev->ctrl); /* * Freeze and update the number of I/O queues as those might have * changed. If there are no I/O queues left after this reset, keep the diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index 18a2388ba581..8885e84a7889 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -721,6 +721,7 @@ enum { BLK_MQ_S_SCHED_RESTART, /* hw queue is inactive after all its CPUs become offline */ BLK_MQ_S_INACTIVE, + BLK_MQ_S_IDLE, BLK_MQ_S_MAX }; @@ -934,6 +935,8 @@ void blk_mq_stop_hw_queues(struct request_queue *q); void blk_mq_start_hw_queues(struct request_queue *q); void blk_mq_start_stopped_hw_queue(struct blk_mq_hw_ctx *hctx, bool async); void blk_mq_start_stopped_hw_queues(struct request_queue *q, bool async); +void blk_mq_set_hw_queues_idle(struct request_queue *q); +void blk_mq_clear_hw_queues_idle(struct request_queue *q); void blk_mq_quiesce_queue(struct request_queue *q); void blk_mq_wait_quiesce_done(struct blk_mq_tag_set *set); void blk_mq_quiesce_tagset(struct blk_mq_tag_set *set); From bd19b7291fd0b561ed9e82ab091c3b9fc3a615ae Mon Sep 17 00:00:00 2001 From: Daniel Wagner Date: Thu, 26 Feb 2026 14:40:37 +0100 Subject: [PATCH 3/3] Revert "lib/group_cpus.c: avoid acquiring cpu hotplug lock in group_cpus_evenly" This reverts commit 0263f92fadbb9d294d5971ac57743f882c93b2b3. 
The reason the lock was removed was that the nvme-pci driver reset handler attempted to acquire the CPU read lock during CPU hotplug offlining (which holds the CPU write lock). Consequently, the block layer offline notifier callback could not progress because in-flight requests were detected. Since then, in-flight detection has been improved, and the nvme-pci driver now explicitly updates the hctx state when it is safe to ignore detected in-flight requests. As a result, it's possible to reintroduce the CPU read lock in group_cpus_evenly. Signed-off-by: Daniel Wagner --- lib/group_cpus.c | 21 +++++---------------- 1 file changed, 5 insertions(+), 16 deletions(-) diff --git a/lib/group_cpus.c b/lib/group_cpus.c index e6e18d7a49bb..533c722b5c2c 100644 --- a/lib/group_cpus.c +++ b/lib/group_cpus.c @@ -510,25 +510,13 @@ struct cpumask *group_cpus_evenly(unsigned int numgrps, unsigned int *nummasks) if (!masks) goto fail_node_to_cpumask; + /* Stabilize the cpumasks */ + cpus_read_lock(); build_node_to_cpumask(node_to_cpumask); - /* - * Make a local cache of 'cpu_present_mask', so the two stages - * spread can observe consistent 'cpu_present_mask' without holding - * cpu hotplug lock, then we can reduce deadlock risk with cpu - * hotplug code. - * - * Here CPU hotplug may happen when reading `cpu_present_mask`, and - * we can live with the case because it only affects that hotplug - * CPU is handled in the 1st or 2nd stage, and either way is correct - * from API user viewpoint since 2-stage spread is sort of - * optimization. 
- */ - cpumask_copy(npresmsk, data_race(cpu_present_mask)); - /* grouping present CPUs first */ ret = __group_cpus_evenly(curgrp, numgrps, node_to_cpumask, - npresmsk, nmsk, masks); + cpu_present_mask, nmsk, masks); if (ret < 0) goto fail_node_to_cpumask; nr_present = ret; @@ -543,13 +531,14 @@ struct cpumask *group_cpus_evenly(unsigned int numgrps, unsigned int *nummasks) curgrp = 0; else curgrp = nr_present; - cpumask_andnot(npresmsk, cpu_possible_mask, npresmsk); + cpumask_andnot(npresmsk, cpu_possible_mask, cpu_present_mask); ret = __group_cpus_evenly(curgrp, numgrps, node_to_cpumask, npresmsk, nmsk, masks); if (ret >= 0) nr_others = ret; fail_node_to_cpumask: + cpus_read_unlock(); free_node_to_cpumask(node_to_cpumask); fail_npresmsk: