diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c index 28167c9baa55..a312cb6b6127 100644 --- a/block/blk-mq-debugfs.c +++ b/block/blk-mq-debugfs.c @@ -162,6 +162,7 @@ static const char *const hctx_state_name[] = { HCTX_STATE_NAME(TAG_ACTIVE), HCTX_STATE_NAME(SCHED_RESTART), HCTX_STATE_NAME(INACTIVE), + HCTX_STATE_NAME(IDLE), }; #undef HCTX_STATE_NAME diff --git a/block/blk-mq.c b/block/blk-mq.c index 3da2215b2912..b022d6a2bb88 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -112,6 +112,31 @@ void blk_mq_in_driver_rw(struct block_device *part, unsigned int inflight[2]) inflight[WRITE] = mi.inflight[WRITE]; } +static void __blk_update_hw_queue_idle(struct request_queue *q, bool idle) +{ + struct blk_mq_hw_ctx *hctx; + unsigned long i; + + queue_for_each_hw_ctx(q, hctx, i) { + if (idle) + set_bit(BLK_MQ_S_IDLE, &hctx->state); + else + clear_bit(BLK_MQ_S_IDLE, &hctx->state); + } +} + +void blk_mq_set_hw_queues_idle(struct request_queue *q) +{ + __blk_update_hw_queue_idle(q, true); +} +EXPORT_SYMBOL_GPL(blk_mq_set_hw_queues_idle); + +void blk_mq_clear_hw_queues_idle(struct request_queue *q) +{ + __blk_update_hw_queue_idle(q, false); +} +EXPORT_SYMBOL_GPL(blk_mq_clear_hw_queues_idle); + #ifdef CONFIG_LOCKDEP static bool blk_freeze_set_owner(struct request_queue *q, struct task_struct *owner) @@ -3679,6 +3704,17 @@ static bool blk_mq_has_request(struct request *rq, void *data) if (rq->mq_hctx != iter_data->hctx) return true; + + /* + * The driver ensures that all hardware queue resources are freed, even + * if a request has a tag allocated to a CPU that is going offline. This + * applies to requests not yet handed to the hardware. Essentially those + * 'in-flight' between the block layer and the hardware (e.g., a request + * blocked because the queue is quiesced). 
+ */ + if (test_bit(BLK_MQ_S_IDLE, &iter_data->hctx->state)) + return false; + iter_data->has_rq = true; return false; } diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 58bf432ec5e6..224852b3807f 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -454,6 +454,51 @@ void nvme_end_req(struct request *req) blk_mq_end_request(req, status); } +static void nvme_failover_req(struct request *req) +{ + struct nvme_ns *ns = req->q->queuedata; + u16 status = nvme_req(req)->status & NVME_SCT_SC_MASK; + unsigned long flags; + struct bio *bio; + + if (nvme_ns_head_multipath(ns->head)) + nvme_mpath_clear_current_path(ns); + + /* + * If we got back an ANA error, we know the controller is alive but not + * ready to serve this namespace. Kick off a re-read of the ANA + * information page, and just try any other available path for now. + */ + if (nvme_is_ana_error(status) && ns->ctrl->ana_log_buf) { + set_bit(NVME_NS_ANA_PENDING, &ns->flags); + queue_work(nvme_wq, &ns->ctrl->ana_work); + } + + spin_lock_irqsave(&ns->head->requeue_lock, flags); + for (bio = req->bio; bio; bio = bio->bi_next) { + if (nvme_ns_head_multipath(ns->head)) + bio_set_dev(bio, ns->head->disk->part0); + if (bio->bi_opf & REQ_POLLED) { + bio->bi_opf &= ~REQ_POLLED; + bio->bi_cookie = BLK_QC_T_NONE; + } + /* + * The alternate request queue that we may end up submitting + * the bio to may be frozen temporarily, in this case REQ_NOWAIT + * will fail the I/O immediately with EAGAIN to the issuer. + * We are not in the issuer context which cannot block. Clear + * the flag to avoid spurious EAGAIN I/O failures. 
+ */ + bio->bi_opf &= ~REQ_NOWAIT; + } + blk_steal_bios(&ns->head->requeue_list, req); + spin_unlock_irqrestore(&ns->head->requeue_lock, flags); + + nvme_req(req)->status = 0; + nvme_end_req(req); + kblockd_schedule_work(&ns->head->requeue_work); +} + void nvme_complete_rq(struct request *req) { struct nvme_ctrl *ctrl = nvme_req(req)->ctrl; @@ -762,8 +807,13 @@ blk_status_t nvme_fail_nonready_command(struct nvme_ctrl *ctrl, state != NVME_CTRL_DELETING && state != NVME_CTRL_DEAD && !test_bit(NVME_CTRL_FAILFAST_EXPIRED, &ctrl->flags) && - !blk_noretry_request(rq) && !(rq->cmd_flags & REQ_NVME_MPATH)) + !blk_noretry_request(rq) && !(rq->cmd_flags & REQ_NVME_MPATH)) { + if (test_bit(BLK_MQ_S_INACTIVE, &rq->mq_hctx->state)) { + nvme_failover_req(rq); + return BLK_STS_OK; + } return BLK_STS_RESOURCE; + } if (!(rq->rq_flags & RQF_DONTPREP)) nvme_clear_nvme_request(rq); @@ -809,6 +859,9 @@ bool __nvme_check_ready(struct nvme_ctrl *ctrl, struct request *rq, } } + if (test_bit(BLK_MQ_S_INACTIVE, &rq->mq_hctx->state)) + return false; + return queue_live; } EXPORT_SYMBOL_GPL(__nvme_check_ready); @@ -5297,6 +5350,34 @@ void nvme_unquiesce_admin_queue(struct nvme_ctrl *ctrl) } EXPORT_SYMBOL_GPL(nvme_unquiesce_admin_queue); +static void __nvme_set_hw_queues_idle(struct nvme_ctrl *ctrl, bool idle) +{ + struct nvme_ns *ns; + int srcu_idx; + + srcu_idx = srcu_read_lock(&ctrl->srcu); + list_for_each_entry_srcu(ns, &ctrl->namespaces, list, + srcu_read_lock_held(&ctrl->srcu)) { + if (idle) + blk_mq_set_hw_queues_idle(ns->queue); + else + blk_mq_clear_hw_queues_idle(ns->queue); + } + srcu_read_unlock(&ctrl->srcu, srcu_idx); +} + +void nvme_set_hw_queues_idle(struct nvme_ctrl *ctrl) +{ + __nvme_set_hw_queues_idle(ctrl, true); +} +EXPORT_SYMBOL_GPL(nvme_set_hw_queues_idle); + +void nvme_clear_hw_queues_idle(struct nvme_ctrl *ctrl) +{ + __nvme_set_hw_queues_idle(ctrl, false); +} +EXPORT_SYMBOL_GPL(nvme_clear_hw_queues_idle); + void nvme_sync_io_queues(struct nvme_ctrl *ctrl) { struct 
nvme_ns *ns; diff --git a/drivers/nvme/host/multipath.c b/drivers/nvme/host/multipath.c index fc6800a9f7f9..8a5737d9d6a9 100644 --- a/drivers/nvme/host/multipath.c +++ b/drivers/nvme/host/multipath.c @@ -134,49 +134,6 @@ void nvme_mpath_start_freeze(struct nvme_subsystem *subsys) blk_freeze_queue_start(h->disk->queue); } -void nvme_failover_req(struct request *req) -{ - struct nvme_ns *ns = req->q->queuedata; - u16 status = nvme_req(req)->status & NVME_SCT_SC_MASK; - unsigned long flags; - struct bio *bio; - - nvme_mpath_clear_current_path(ns); - - /* - * If we got back an ANA error, we know the controller is alive but not - * ready to serve this namespace. Kick of a re-read of the ANA - * information page, and just try any other available path for now. - */ - if (nvme_is_ana_error(status) && ns->ctrl->ana_log_buf) { - set_bit(NVME_NS_ANA_PENDING, &ns->flags); - queue_work(nvme_wq, &ns->ctrl->ana_work); - } - - spin_lock_irqsave(&ns->head->requeue_lock, flags); - for (bio = req->bio; bio; bio = bio->bi_next) { - bio_set_dev(bio, ns->head->disk->part0); - if (bio->bi_opf & REQ_POLLED) { - bio->bi_opf &= ~REQ_POLLED; - bio->bi_cookie = BLK_QC_T_NONE; - } - /* - * The alternate request queue that we may end up submitting - * the bio to may be frozen temporarily, in this case REQ_NOWAIT - * will fail the I/O immediately with EAGAIN to the issuer. - * We are not in the issuer context which cannot block. Clear - * the flag to avoid spurious EAGAIN I/O failures. 
- */ - bio->bi_opf &= ~REQ_NOWAIT; - } - blk_steal_bios(&ns->head->requeue_list, req); - spin_unlock_irqrestore(&ns->head->requeue_lock, flags); - - nvme_req(req)->status = 0; - nvme_end_req(req); - kblockd_schedule_work(&ns->head->requeue_work); -} - void nvme_mpath_start_request(struct request *rq) { struct nvme_ns *ns = rq->q->queuedata; diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h index 9971045dbc05..ae45b7fc30a8 100644 --- a/drivers/nvme/host/nvme.h +++ b/drivers/nvme/host/nvme.h @@ -889,6 +889,8 @@ void nvme_quiesce_io_queues(struct nvme_ctrl *ctrl); void nvme_unquiesce_io_queues(struct nvme_ctrl *ctrl); void nvme_quiesce_admin_queue(struct nvme_ctrl *ctrl); void nvme_unquiesce_admin_queue(struct nvme_ctrl *ctrl); +void nvme_set_hw_queues_idle(struct nvme_ctrl *ctrl); +void nvme_clear_hw_queues_idle(struct nvme_ctrl *ctrl); void nvme_mark_namespaces_dead(struct nvme_ctrl *ctrl); void nvme_sync_queues(struct nvme_ctrl *ctrl); void nvme_sync_io_queues(struct nvme_ctrl *ctrl); @@ -1022,7 +1024,6 @@ void nvme_mpath_unfreeze(struct nvme_subsystem *subsys); void nvme_mpath_wait_freeze(struct nvme_subsystem *subsys); void nvme_mpath_start_freeze(struct nvme_subsystem *subsys); void nvme_mpath_default_iopolicy(struct nvme_subsystem *subsys); -void nvme_failover_req(struct request *req); void nvme_kick_requeue_lists(struct nvme_ctrl *ctrl); int nvme_mpath_alloc_disk(struct nvme_ctrl *ctrl,struct nvme_ns_head *head); void nvme_mpath_add_sysfs_link(struct nvme_ns_head *ns); diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index 8d09fa7d7ff9..6da0dd785b2c 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -3374,6 +3374,8 @@ static void nvme_reset_work(struct work_struct *work) nvme_unquiesce_admin_queue(&dev->ctrl); mutex_unlock(&dev->shutdown_lock); + nvme_set_hw_queues_idle(&dev->ctrl); + /* * Introduce CONNECTING state from nvme-fc/rdma transports to mark the * initializing procedure here. 
@@ -3406,6 +3408,7 @@ static void nvme_reset_work(struct work_struct *work) if (result) goto out; + nvme_clear_hw_queues_idle(&dev->ctrl); /* * Freeze and update the number of I/O queues as those might have * changed. If there are no I/O queues left after this reset, keep the diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index 18a2388ba581..8885e84a7889 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -721,6 +721,7 @@ enum { BLK_MQ_S_SCHED_RESTART, /* hw queue is inactive after all its CPUs become offline */ BLK_MQ_S_INACTIVE, + BLK_MQ_S_IDLE, BLK_MQ_S_MAX }; @@ -934,6 +935,8 @@ void blk_mq_stop_hw_queues(struct request_queue *q); void blk_mq_start_hw_queues(struct request_queue *q); void blk_mq_start_stopped_hw_queue(struct blk_mq_hw_ctx *hctx, bool async); void blk_mq_start_stopped_hw_queues(struct request_queue *q, bool async); +void blk_mq_set_hw_queues_idle(struct request_queue *q); +void blk_mq_clear_hw_queues_idle(struct request_queue *q); void blk_mq_quiesce_queue(struct request_queue *q); void blk_mq_wait_quiesce_done(struct blk_mq_tag_set *set); void blk_mq_quiesce_tagset(struct blk_mq_tag_set *set); diff --git a/lib/group_cpus.c b/lib/group_cpus.c index e6e18d7a49bb..533c722b5c2c 100644 --- a/lib/group_cpus.c +++ b/lib/group_cpus.c @@ -510,25 +510,13 @@ struct cpumask *group_cpus_evenly(unsigned int numgrps, unsigned int *nummasks) if (!masks) goto fail_node_to_cpumask; + /* Stabilize the cpumasks */ + cpus_read_lock(); build_node_to_cpumask(node_to_cpumask); - /* - * Make a local cache of 'cpu_present_mask', so the two stages - * spread can observe consistent 'cpu_present_mask' without holding - * cpu hotplug lock, then we can reduce deadlock risk with cpu - * hotplug code. 
- * - * Here CPU hotplug may happen when reading `cpu_present_mask`, and - * we can live with the case because it only affects that hotplug - * CPU is handled in the 1st or 2nd stage, and either way is correct - * from API user viewpoint since 2-stage spread is sort of - * optimization. - */ - cpumask_copy(npresmsk, data_race(cpu_present_mask)); - /* grouping present CPUs first */ ret = __group_cpus_evenly(curgrp, numgrps, node_to_cpumask, - npresmsk, nmsk, masks); + cpu_present_mask, nmsk, masks); if (ret < 0) goto fail_node_to_cpumask; nr_present = ret; @@ -543,13 +531,14 @@ struct cpumask *group_cpus_evenly(unsigned int numgrps, unsigned int *nummasks) curgrp = 0; else curgrp = nr_present; - cpumask_andnot(npresmsk, cpu_possible_mask, npresmsk); + cpumask_andnot(npresmsk, cpu_possible_mask, cpu_present_mask); ret = __group_cpus_evenly(curgrp, numgrps, node_to_cpumask, npresmsk, nmsk, masks); if (ret >= 0) nr_others = ret; fail_node_to_cpumask: + cpus_read_unlock(); free_node_to_cpumask(node_to_cpumask); fail_npresmsk: