Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions block/blk-mq-debugfs.c
Original file line number Diff line number Diff line change
Expand Up @@ -162,6 +162,7 @@ static const char *const hctx_state_name[] = {
HCTX_STATE_NAME(TAG_ACTIVE),
HCTX_STATE_NAME(SCHED_RESTART),
HCTX_STATE_NAME(INACTIVE),
HCTX_STATE_NAME(IDLE),
};
#undef HCTX_STATE_NAME

Expand Down
36 changes: 36 additions & 0 deletions block/blk-mq.c
Original file line number Diff line number Diff line change
Expand Up @@ -112,6 +112,31 @@ void blk_mq_in_driver_rw(struct block_device *part, unsigned int inflight[2])
inflight[WRITE] = mi.inflight[WRITE];
}

/*
 * __blk_update_hw_queue_idle - set or clear BLK_MQ_S_IDLE on all hw queues
 * @q:    request queue whose hardware contexts are updated
 * @idle: true to set the idle bit, false to clear it
 */
static void __blk_update_hw_queue_idle(struct request_queue *q, bool idle)
{
	struct blk_mq_hw_ctx *hctx;
	unsigned long idx;

	/* Hoist the loop-invariant branch; same atomic bit ops either way. */
	if (idle) {
		queue_for_each_hw_ctx(q, hctx, idx)
			set_bit(BLK_MQ_S_IDLE, &hctx->state);
	} else {
		queue_for_each_hw_ctx(q, hctx, idx)
			clear_bit(BLK_MQ_S_IDLE, &hctx->state);
	}
}

/**
 * blk_mq_set_hw_queues_idle - mark every hardware queue of @q as idle
 * @q: request queue
 *
 * Sets BLK_MQ_S_IDLE on each hardware context of @q.
 */
void blk_mq_set_hw_queues_idle(struct request_queue *q)
{
	__blk_update_hw_queue_idle(q, true);
}
EXPORT_SYMBOL_GPL(blk_mq_set_hw_queues_idle);

/**
 * blk_mq_clear_hw_queues_idle - clear the idle mark on every hardware queue
 * @q: request queue
 *
 * Clears BLK_MQ_S_IDLE on each hardware context of @q.
 */
void blk_mq_clear_hw_queues_idle(struct request_queue *q)
{
	__blk_update_hw_queue_idle(q, false);
}
EXPORT_SYMBOL_GPL(blk_mq_clear_hw_queues_idle);

#ifdef CONFIG_LOCKDEP
static bool blk_freeze_set_owner(struct request_queue *q,
struct task_struct *owner)
Expand Down Expand Up @@ -3679,6 +3704,17 @@ static bool blk_mq_has_request(struct request *rq, void *data)

if (rq->mq_hctx != iter_data->hctx)
return true;

/*
* The driver ensures that all hardware queue resources are freed, even
* if a request has a tag allocated to a CPU that is going offline. This
* applies to requests not yet handed to the hardware. Essentially those
* 'in-flight' between the block layer and the hardware (e.g., a request
* blocked because the queue is quiesced).
*/
if (test_bit(BLK_MQ_S_IDLE, &iter_data->hctx->state))
return false;

iter_data->has_rq = true;
return false;
}
Expand Down
83 changes: 82 additions & 1 deletion drivers/nvme/host/core.c
Original file line number Diff line number Diff line change
Expand Up @@ -454,6 +454,51 @@ void nvme_end_req(struct request *req)
blk_mq_end_request(req, status);
}

/*
 * nvme_failover_req - retry a failed request on another path.
 *
 * Steals the bios off @req onto the ns_head requeue list, completes @req
 * with a cleared status, and kicks the requeue worker so the bios get
 * resubmitted (through the multipath node when the head is multipathed).
 */
static void nvme_failover_req(struct request *req)
{
	struct nvme_ns *ns = req->q->queuedata;
	u16 status = nvme_req(req)->status & NVME_SCT_SC_MASK;
	unsigned long flags;
	struct bio *bio;

	/* Only a multipath head has a "current path" worth invalidating. */
	if (nvme_ns_head_multipath(ns->head))
		nvme_mpath_clear_current_path(ns);

	/*
	 * If we got back an ANA error, we know the controller is alive but not
	 * ready to serve this namespace. Kick off a re-read of the ANA
	 * information page, and just try any other available path for now.
	 */
	if (nvme_is_ana_error(status) && ns->ctrl->ana_log_buf) {
		set_bit(NVME_NS_ANA_PENDING, &ns->flags);
		queue_work(nvme_wq, &ns->ctrl->ana_work);
	}

	spin_lock_irqsave(&ns->head->requeue_lock, flags);
	for (bio = req->bio; bio; bio = bio->bi_next) {
		/*
		 * Retarget the bio at the multipath node so resubmission may
		 * pick a different path.
		 */
		if (nvme_ns_head_multipath(ns->head))
			bio_set_dev(bio, ns->head->disk->part0);
		/* The requeued submission is not polled; drop polled state. */
		if (bio->bi_opf & REQ_POLLED) {
			bio->bi_opf &= ~REQ_POLLED;
			bio->bi_cookie = BLK_QC_T_NONE;
		}
		/*
		 * The alternate request queue that we may end up submitting
		 * the bio to may be frozen temporarily, in this case REQ_NOWAIT
		 * will fail the I/O immediately with EAGAIN to the issuer.
		 * We are not in the issuer context which cannot block. Clear
		 * the flag to avoid spurious EAGAIN I/O failures.
		 */
		bio->bi_opf &= ~REQ_NOWAIT;
	}
	blk_steal_bios(&ns->head->requeue_list, req);
	spin_unlock_irqrestore(&ns->head->requeue_lock, flags);

	/* Complete @req as a success; its bios now live on the requeue list. */
	nvme_req(req)->status = 0;
	nvme_end_req(req);
	kblockd_schedule_work(&ns->head->requeue_work);
}

void nvme_complete_rq(struct request *req)
{
struct nvme_ctrl *ctrl = nvme_req(req)->ctrl;
Expand Down Expand Up @@ -762,8 +807,13 @@ blk_status_t nvme_fail_nonready_command(struct nvme_ctrl *ctrl,
state != NVME_CTRL_DELETING &&
state != NVME_CTRL_DEAD &&
!test_bit(NVME_CTRL_FAILFAST_EXPIRED, &ctrl->flags) &&
!blk_noretry_request(rq) && !(rq->cmd_flags & REQ_NVME_MPATH))
!blk_noretry_request(rq) && !(rq->cmd_flags & REQ_NVME_MPATH)) {
if (test_bit(BLK_MQ_S_INACTIVE, &rq->mq_hctx->state)) {
nvme_failover_req(rq);
return BLK_STS_OK;
}
return BLK_STS_RESOURCE;
}

if (!(rq->rq_flags & RQF_DONTPREP))
nvme_clear_nvme_request(rq);
Expand Down Expand Up @@ -809,6 +859,9 @@ bool __nvme_check_ready(struct nvme_ctrl *ctrl, struct request *rq,
}
}

if (test_bit(BLK_MQ_S_INACTIVE, &rq->mq_hctx->state))
return false;

return queue_live;
}
EXPORT_SYMBOL_GPL(__nvme_check_ready);
Expand Down Expand Up @@ -5297,6 +5350,34 @@ void nvme_unquiesce_admin_queue(struct nvme_ctrl *ctrl)
}
EXPORT_SYMBOL_GPL(nvme_unquiesce_admin_queue);

/*
 * Walk all namespaces of @ctrl under SRCU protection and set or clear the
 * hardware-queue idle state on each namespace's request queue.
 */
static void __nvme_set_hw_queues_idle(struct nvme_ctrl *ctrl, bool idle)
{
	struct nvme_ns *ns;
	int srcu_idx;

	/* SRCU read lock keeps the namespace list stable while iterating. */
	srcu_idx = srcu_read_lock(&ctrl->srcu);
	list_for_each_entry_srcu(ns, &ctrl->namespaces, list,
				srcu_read_lock_held(&ctrl->srcu)) {
		if (idle)
			blk_mq_set_hw_queues_idle(ns->queue);
		else
			blk_mq_clear_hw_queues_idle(ns->queue);
	}
	srcu_read_unlock(&ctrl->srcu, srcu_idx);
}

/**
 * nvme_set_hw_queues_idle - mark the hw queues of all @ctrl namespaces idle
 * @ctrl: controller whose namespace request queues are updated
 */
void nvme_set_hw_queues_idle(struct nvme_ctrl *ctrl)
{
	__nvme_set_hw_queues_idle(ctrl, true);
}
EXPORT_SYMBOL_GPL(nvme_set_hw_queues_idle);

/**
 * nvme_clear_hw_queues_idle - clear the idle mark on all @ctrl namespaces
 * @ctrl: controller whose namespace request queues are updated
 */
void nvme_clear_hw_queues_idle(struct nvme_ctrl *ctrl)
{
	__nvme_set_hw_queues_idle(ctrl, false);
}
EXPORT_SYMBOL_GPL(nvme_clear_hw_queues_idle);

void nvme_sync_io_queues(struct nvme_ctrl *ctrl)
{
struct nvme_ns *ns;
Expand Down
43 changes: 0 additions & 43 deletions drivers/nvme/host/multipath.c
Original file line number Diff line number Diff line change
Expand Up @@ -134,49 +134,6 @@ void nvme_mpath_start_freeze(struct nvme_subsystem *subsys)
blk_freeze_queue_start(h->disk->queue);
}

/*
 * nvme_failover_req - retry a failed multipath request on another path.
 *
 * NOTE(review): this is the multipath.c variant being removed by this diff;
 * the replacement lives in core.c and additionally guards the multipath-only
 * steps with nvme_ns_head_multipath().
 */
void nvme_failover_req(struct request *req)
{
	struct nvme_ns *ns = req->q->queuedata;
	u16 status = nvme_req(req)->status & NVME_SCT_SC_MASK;
	unsigned long flags;
	struct bio *bio;

	nvme_mpath_clear_current_path(ns);

	/*
	 * If we got back an ANA error, we know the controller is alive but not
	 * ready to serve this namespace. Kick off a re-read of the ANA
	 * information page, and just try any other available path for now.
	 */
	if (nvme_is_ana_error(status) && ns->ctrl->ana_log_buf) {
		set_bit(NVME_NS_ANA_PENDING, &ns->flags);
		queue_work(nvme_wq, &ns->ctrl->ana_work);
	}

	spin_lock_irqsave(&ns->head->requeue_lock, flags);
	for (bio = req->bio; bio; bio = bio->bi_next) {
		/* Retarget at the multipath node for resubmission. */
		bio_set_dev(bio, ns->head->disk->part0);
		/* The requeued submission is not polled; drop polled state. */
		if (bio->bi_opf & REQ_POLLED) {
			bio->bi_opf &= ~REQ_POLLED;
			bio->bi_cookie = BLK_QC_T_NONE;
		}
		/*
		 * The alternate request queue that we may end up submitting
		 * the bio to may be frozen temporarily, in this case REQ_NOWAIT
		 * will fail the I/O immediately with EAGAIN to the issuer.
		 * We are not in the issuer context which cannot block. Clear
		 * the flag to avoid spurious EAGAIN I/O failures.
		 */
		bio->bi_opf &= ~REQ_NOWAIT;
	}
	blk_steal_bios(&ns->head->requeue_list, req);
	spin_unlock_irqrestore(&ns->head->requeue_lock, flags);

	/* Complete @req as a success; its bios now live on the requeue list. */
	nvme_req(req)->status = 0;
	nvme_end_req(req);
	kblockd_schedule_work(&ns->head->requeue_work);
}

void nvme_mpath_start_request(struct request *rq)
{
struct nvme_ns *ns = rq->q->queuedata;
Expand Down
3 changes: 2 additions & 1 deletion drivers/nvme/host/nvme.h
Original file line number Diff line number Diff line change
Expand Up @@ -889,6 +889,8 @@ void nvme_quiesce_io_queues(struct nvme_ctrl *ctrl);
void nvme_unquiesce_io_queues(struct nvme_ctrl *ctrl);
void nvme_quiesce_admin_queue(struct nvme_ctrl *ctrl);
void nvme_unquiesce_admin_queue(struct nvme_ctrl *ctrl);
void nvme_set_hw_queues_idle(struct nvme_ctrl *ctrl);
void nvme_clear_hw_queues_idle(struct nvme_ctrl *ctrl);
void nvme_mark_namespaces_dead(struct nvme_ctrl *ctrl);
void nvme_sync_queues(struct nvme_ctrl *ctrl);
void nvme_sync_io_queues(struct nvme_ctrl *ctrl);
Expand Down Expand Up @@ -1022,7 +1024,6 @@ void nvme_mpath_unfreeze(struct nvme_subsystem *subsys);
void nvme_mpath_wait_freeze(struct nvme_subsystem *subsys);
void nvme_mpath_start_freeze(struct nvme_subsystem *subsys);
void nvme_mpath_default_iopolicy(struct nvme_subsystem *subsys);
void nvme_failover_req(struct request *req);
void nvme_kick_requeue_lists(struct nvme_ctrl *ctrl);
int nvme_mpath_alloc_disk(struct nvme_ctrl *ctrl,struct nvme_ns_head *head);
void nvme_mpath_add_sysfs_link(struct nvme_ns_head *ns);
Expand Down
3 changes: 3 additions & 0 deletions drivers/nvme/host/pci.c
Original file line number Diff line number Diff line change
Expand Up @@ -3374,6 +3374,8 @@ static void nvme_reset_work(struct work_struct *work)
nvme_unquiesce_admin_queue(&dev->ctrl);
mutex_unlock(&dev->shutdown_lock);

nvme_set_hw_queues_idle(&dev->ctrl);

/*
* Introduce CONNECTING state from nvme-fc/rdma transports to mark the
* initializing procedure here.
Expand Down Expand Up @@ -3406,6 +3408,7 @@ static void nvme_reset_work(struct work_struct *work)
if (result)
goto out;

nvme_clear_hw_queues_idle(&dev->ctrl);
/*
* Freeze and update the number of I/O queues as those might have
* changed. If there are no I/O queues left after this reset, keep the
Expand Down
3 changes: 3 additions & 0 deletions include/linux/blk-mq.h
Original file line number Diff line number Diff line change
Expand Up @@ -721,6 +721,7 @@ enum {
BLK_MQ_S_SCHED_RESTART,
/* hw queue is inactive after all its CPUs become offline */
BLK_MQ_S_INACTIVE,
BLK_MQ_S_IDLE,
BLK_MQ_S_MAX
};

Expand Down Expand Up @@ -934,6 +935,8 @@ void blk_mq_stop_hw_queues(struct request_queue *q);
void blk_mq_start_hw_queues(struct request_queue *q);
void blk_mq_start_stopped_hw_queue(struct blk_mq_hw_ctx *hctx, bool async);
void blk_mq_start_stopped_hw_queues(struct request_queue *q, bool async);
void blk_mq_set_hw_queues_idle(struct request_queue *q);
void blk_mq_clear_hw_queues_idle(struct request_queue *q);
void blk_mq_quiesce_queue(struct request_queue *q);
void blk_mq_wait_quiesce_done(struct blk_mq_tag_set *set);
void blk_mq_quiesce_tagset(struct blk_mq_tag_set *set);
Expand Down
21 changes: 5 additions & 16 deletions lib/group_cpus.c
Original file line number Diff line number Diff line change
Expand Up @@ -510,25 +510,13 @@ struct cpumask *group_cpus_evenly(unsigned int numgrps, unsigned int *nummasks)
if (!masks)
goto fail_node_to_cpumask;

/* Stabilize the cpumasks */
cpus_read_lock();
build_node_to_cpumask(node_to_cpumask);

/*
* Make a local cache of 'cpu_present_mask', so the two stages
* spread can observe consistent 'cpu_present_mask' without holding
* cpu hotplug lock, then we can reduce deadlock risk with cpu
* hotplug code.
*
* Here CPU hotplug may happen when reading `cpu_present_mask`, and
* we can live with the case because it only affects that hotplug
* CPU is handled in the 1st or 2nd stage, and either way is correct
* from API user viewpoint since 2-stage spread is sort of
* optimization.
*/
cpumask_copy(npresmsk, data_race(cpu_present_mask));

/* grouping present CPUs first */
ret = __group_cpus_evenly(curgrp, numgrps, node_to_cpumask,
npresmsk, nmsk, masks);
cpu_present_mask, nmsk, masks);
if (ret < 0)
goto fail_node_to_cpumask;
nr_present = ret;
Expand All @@ -543,13 +531,14 @@ struct cpumask *group_cpus_evenly(unsigned int numgrps, unsigned int *nummasks)
curgrp = 0;
else
curgrp = nr_present;
cpumask_andnot(npresmsk, cpu_possible_mask, npresmsk);
cpumask_andnot(npresmsk, cpu_possible_mask, cpu_present_mask);
ret = __group_cpus_evenly(curgrp, numgrps, node_to_cpumask,
npresmsk, nmsk, masks);
if (ret >= 0)
nr_others = ret;

fail_node_to_cpumask:
cpus_read_unlock();
free_node_to_cpumask(node_to_cpumask);

fail_npresmsk:
Expand Down