diff --git a/Documentation/ABI/stable/sysfs-block b/Documentation/ABI/stable/sysfs-block index 09a9d4aca0fd..900b3fc4c72d 100644 --- a/Documentation/ABI/stable/sysfs-block +++ b/Documentation/ABI/stable/sysfs-block @@ -886,6 +886,21 @@ Description: zone commands, they will be treated as regular block devices and zoned will report "none". +What: /sys/block//queue/zoned_qd1_writes +Date: January 2026 +Contact: Damien Le Moal +Description: + [RW] zoned_qd1_writes indicates if write operations to a zoned + block device are being handled using a single issuer context (a + kernel thread) operating at a maximum queue depth of 1. This + attribute is visible only for zoned block devices. The default + value for zoned block devices that are not rotational devices + (e.g. ZNS SSDs or zoned UFS devices) is 0. For rotational zoned + block devices (e.g. SMR HDDs) the default value is 1. Since + this default may not be appropriate for some devices, e.g. + remotely connected devices over high latency networks, the user + can disable this feature by setting this attribute to 0. + What: /sys/block//hidden Date: March 2023 diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c index 28167c9baa55..047ec887456b 100644 --- a/block/blk-mq-debugfs.c +++ b/block/blk-mq-debugfs.c @@ -97,6 +97,7 @@ static const char *const blk_queue_flag_name[] = { QUEUE_FLAG_NAME(NO_ELV_SWITCH), QUEUE_FLAG_NAME(QOS_ENABLED), QUEUE_FLAG_NAME(BIO_ISSUE_TIME), + QUEUE_FLAG_NAME(ZONED_QD1_WRITES), }; #undef QUEUE_FLAG_NAME diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index 55a1bbfef7d4..878b8a4b55bb 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c @@ -390,6 +390,36 @@ static ssize_t queue_nr_zones_show(struct gendisk *disk, char *page) return queue_var_show(disk_nr_zones(disk), page); } +static ssize_t queue_zoned_qd1_writes_show(struct gendisk *disk, char *page) +{ + return queue_var_show(!!blk_queue_zoned_qd1_writes(disk->queue), + page); +} + +static ssize_t queue_zoned_qd1_writes_store(struct gendisk *disk, + const char *page, size_t count) +{ + struct request_queue *q = disk->queue; + unsigned long qd1_writes; + unsigned int memflags; + ssize_t ret; + + ret = queue_var_store(&qd1_writes, page, count); + if (ret < 0) + return ret; + + memflags = blk_mq_freeze_queue(q); + blk_mq_quiesce_queue(q); + if (qd1_writes) + blk_queue_flag_set(QUEUE_FLAG_ZONED_QD1_WRITES, q); + else + blk_queue_flag_clear(QUEUE_FLAG_ZONED_QD1_WRITES, q); + blk_mq_unquiesce_queue(q); + blk_mq_unfreeze_queue(q, memflags); + + return count; +} + static ssize_t queue_iostats_passthrough_show(struct gendisk *disk, char *page) { return queue_var_show(!!blk_queue_passthrough_stat(disk->queue), page); @@ -617,6 +647,7 @@ QUEUE_LIM_RO_ENTRY(queue_max_zone_append_sectors, "zone_append_max_bytes"); QUEUE_LIM_RO_ENTRY(queue_zone_write_granularity, "zone_write_granularity"); QUEUE_LIM_RO_ENTRY(queue_zoned, "zoned"); +QUEUE_RW_ENTRY(queue_zoned_qd1_writes, "zoned_qd1_writes"); QUEUE_RO_ENTRY(queue_nr_zones, "nr_zones"); QUEUE_LIM_RO_ENTRY(queue_max_open_zones, "max_open_zones"); QUEUE_LIM_RO_ENTRY(queue_max_active_zones, "max_active_zones"); @@ -754,6 +785,7 @@ static struct attribute *queue_attrs[] = { &queue_nomerges_entry.attr, &queue_poll_entry.attr, &queue_poll_delay_entry.attr, + &queue_zoned_qd1_writes_entry.attr, NULL, }; @@ -786,7 +818,8 @@ static umode_t queue_attr_visible(struct kobject *kobj, struct attribute *attr, struct request_queue *q = disk->queue; if ((attr == &queue_max_open_zones_entry.attr || - attr == &queue_max_active_zones_entry.attr) && + attr == &queue_max_active_zones_entry.attr || + attr == &queue_zoned_qd1_writes_entry.attr) && !blk_queue_is_zoned(q)) return 0; @@ -934,6 +967,14 @@ int blk_register_queue(struct gendisk *disk) blk_mq_debugfs_register(q); blk_debugfs_unlock(q, memflags); + /* + * For blk-mq rotational zoned devices, default to using QD=1 + * writes. For non-mq rotational zoned devices, the device driver can + * set an appropriate default. + */ + if (queue_is_mq(q) && blk_queue_rot(q) && blk_queue_is_zoned(q)) + blk_queue_flag_set(QUEUE_FLAG_ZONED_QD1_WRITES, q); + ret = disk_register_independent_access_ranges(disk); if (ret) goto out_debugfs_remove; diff --git a/block/blk-zoned.c b/block/blk-zoned.c index 9d1dd6ccfad7..e1a23c8b676d 100644 --- a/block/blk-zoned.c +++ b/block/blk-zoned.c @@ -16,6 +16,8 @@ #include #include #include +#include +#include #include @@ -40,6 +42,8 @@ static const char *const zone_cond_name[] = { /* * Per-zone write plug. * @node: hlist_node structure for managing the plug using a hash table. + * @entry: list_head structure for listing the plug in the disk list of active + * zone write plugs. * @bio_list: The list of BIOs that are currently plugged. * @bio_work: Work struct to handle issuing of plugged BIOs * @rcu_head: RCU head to free zone write plugs with an RCU grace period. @@ -62,6 +66,7 @@ static const char *const zone_cond_name[] = { */ struct blk_zone_wplug { struct hlist_node node; + struct list_head entry; struct bio_list bio_list; struct work_struct bio_work; struct rcu_head rcu_head; @@ -99,17 +104,17 @@ static inline unsigned int disk_zone_wplugs_hash_size(struct gendisk *disk) * being executed or the zone write plug bio list is not empty. * - BLK_ZONE_WPLUG_NEED_WP_UPDATE: Indicates that we lost track of a zone * write pointer offset and need to update it. - * - BLK_ZONE_WPLUG_UNHASHED: Indicates that the zone write plug was removed - * from the disk hash table and that the initial reference to the zone - * write plug set when the plug was first added to the hash table has been - * dropped. This flag is set when a zone is reset, finished or become full, - * to prevent new references to the zone write plug to be taken for - * newly incoming BIOs. A zone write plug flagged with this flag will be - * freed once all remaining references from BIOs or functions are dropped. + * - BLK_ZONE_WPLUG_DEAD: Indicates that the zone write plug will be + * removed from the disk hash table of zone write plugs when the last + * reference on the zone write plug is dropped. If set, this flag also + * indicates that the initial extra reference on the zone write plug was + * dropped, meaning that the reference count indicates the current number of + * active users (code context or BIOs and requests in flight). This flag is + * set when a zone is reset, finished or becomes full. */ #define BLK_ZONE_WPLUG_PLUGGED (1U << 0) #define BLK_ZONE_WPLUG_NEED_WP_UPDATE (1U << 1) -#define BLK_ZONE_WPLUG_UNHASHED (1U << 2) +#define BLK_ZONE_WPLUG_DEAD (1U << 2) /** * blk_zone_cond_str - Return a zone condition name string @@ -492,18 +497,12 @@ static bool disk_zone_is_last(struct gendisk *disk, struct blk_zone *zone) return zone->start + zone->len >= get_capacity(disk); } -static bool disk_zone_is_full(struct gendisk *disk, - unsigned int zno, unsigned int offset_in_zone) -{ - if (zno < disk->nr_zones - 1) - return offset_in_zone >= disk->zone_capacity; - return offset_in_zone >= disk->last_zone_capacity; -} - static bool disk_zone_wplug_is_full(struct gendisk *disk, struct blk_zone_wplug *zwplug) { - return disk_zone_is_full(disk, zwplug->zone_no, zwplug->wp_offset); + if (zwplug->zone_no < disk->nr_zones - 1) + return zwplug->wp_offset >= disk->zone_capacity; + return zwplug->wp_offset >= disk->last_zone_capacity; } static bool disk_insert_zone_wplug(struct gendisk *disk, @@ -520,10 +519,11 @@ static bool disk_insert_zone_wplug(struct gendisk *disk, * are racing with other submission context, so we may already have a * zone write plug for the same zone. */ - spin_lock_irqsave(&disk->zone_wplugs_lock, flags); + spin_lock_irqsave(&disk->zone_wplugs_hash_lock, flags); hlist_for_each_entry_rcu(zwplg, &disk->zone_wplugs_hash[idx], node) { if (zwplg->zone_no == zwplug->zone_no) { - spin_unlock_irqrestore(&disk->zone_wplugs_lock, flags); + spin_unlock_irqrestore(&disk->zone_wplugs_hash_lock, + flags); return false; } } @@ -535,7 +535,7 @@ static bool disk_insert_zone_wplug(struct gendisk *disk, * necessarilly in the active condition. */ zones_cond = rcu_dereference_check(disk->zones_cond, - lockdep_is_held(&disk->zone_wplugs_lock)); + lockdep_is_held(&disk->zone_wplugs_hash_lock)); if (zones_cond) zwplug->cond = zones_cond[zwplug->zone_no]; else @@ -543,7 +543,7 @@ static bool disk_insert_zone_wplug(struct gendisk *disk, hlist_add_head_rcu(&zwplug->node, &disk->zone_wplugs_hash[idx]); atomic_inc(&disk->nr_zone_wplugs); - spin_unlock_irqrestore(&disk->zone_wplugs_lock, flags); + spin_unlock_irqrestore(&disk->zone_wplugs_hash_lock, flags); return true; } @@ -587,105 +587,76 @@ static void disk_free_zone_wplug_rcu(struct rcu_head *rcu_head) mempool_free(zwplug, zwplug->disk->zone_wplugs_pool); } -static inline void disk_put_zone_wplug(struct blk_zone_wplug *zwplug) +static void disk_free_zone_wplug(struct blk_zone_wplug *zwplug) { - if (refcount_dec_and_test(&zwplug->ref)) { - WARN_ON_ONCE(!bio_list_empty(&zwplug->bio_list)); - WARN_ON_ONCE(zwplug->flags & BLK_ZONE_WPLUG_PLUGGED); - WARN_ON_ONCE(!(zwplug->flags & BLK_ZONE_WPLUG_UNHASHED)); + struct gendisk *disk = zwplug->disk; + unsigned long flags; - call_rcu(&zwplug->rcu_head, disk_free_zone_wplug_rcu); - } -} + WARN_ON_ONCE(!(zwplug->flags & BLK_ZONE_WPLUG_DEAD)); + WARN_ON_ONCE(zwplug->flags & BLK_ZONE_WPLUG_PLUGGED); + WARN_ON_ONCE(!bio_list_empty(&zwplug->bio_list)); -static inline bool disk_should_remove_zone_wplug(struct gendisk *disk, - struct blk_zone_wplug *zwplug) -{ - lockdep_assert_held(&zwplug->lock); + spin_lock_irqsave(&disk->zone_wplugs_hash_lock, flags); + blk_zone_set_cond(rcu_dereference_check(disk->zones_cond, + lockdep_is_held(&disk->zone_wplugs_hash_lock)), + zwplug->zone_no, zwplug->cond); + hlist_del_init_rcu(&zwplug->node); + atomic_dec(&disk->nr_zone_wplugs); + spin_unlock_irqrestore(&disk->zone_wplugs_hash_lock, flags); - /* If the zone write plug was already removed, we are done. */ - if (zwplug->flags & BLK_ZONE_WPLUG_UNHASHED) - return false; + call_rcu(&zwplug->rcu_head, disk_free_zone_wplug_rcu); +} - /* If the zone write plug is still plugged, it cannot be removed. */ - if (zwplug->flags & BLK_ZONE_WPLUG_PLUGGED) - return false; +static inline void disk_put_zone_wplug(struct blk_zone_wplug *zwplug) +{ + if (refcount_dec_and_test(&zwplug->ref)) + disk_free_zone_wplug(zwplug); +} - /* - * Completions of BIOs with blk_zone_write_plug_bio_endio() may - * happen after handling a request completion with - * blk_zone_write_plug_finish_request() (e.g. with split BIOs - * that are chained). In such case, disk_zone_wplug_unplug_bio() - * should not attempt to remove the zone write plug until all BIO - * completions are seen. Check by looking at the zone write plug - * reference count, which is 2 when the plug is unused (one reference - * taken when the plug was allocated and another reference taken by the - * caller context). - */ - if (refcount_read(&zwplug->ref) > 2) - return false; +/* + * Flag the zone write plug as dead and drop the initial reference we got when + * the zone write plug was added to the hash table. The zone write plug will be + * unhashed when its last reference is dropped. + */ +static void disk_mark_zone_wplug_dead(struct blk_zone_wplug *zwplug) +{ + lockdep_assert_held(&zwplug->lock); - /* We can remove zone write plugs for zones that are empty or full. */ - return !zwplug->wp_offset || disk_zone_wplug_is_full(disk, zwplug); + if (!(zwplug->flags & BLK_ZONE_WPLUG_DEAD)) { + zwplug->flags |= BLK_ZONE_WPLUG_DEAD; + disk_put_zone_wplug(zwplug); + } } -static void disk_remove_zone_wplug(struct gendisk *disk, - struct blk_zone_wplug *zwplug) +static bool disk_zone_wplug_submit_bio(struct gendisk *disk, + struct blk_zone_wplug *zwplug); + +static void blk_zone_wplug_bio_work(struct work_struct *work) { - unsigned long flags; + struct blk_zone_wplug *zwplug = + container_of(work, struct blk_zone_wplug, bio_work); - /* If the zone write plug was already removed, we have nothing to do. */ - if (zwplug->flags & BLK_ZONE_WPLUG_UNHASHED) - return; + disk_zone_wplug_submit_bio(zwplug->disk, zwplug); - /* - * Mark the zone write plug as unhashed and drop the extra reference we - * took when the plug was inserted in the hash table. Also update the - * disk zone condition array with the current condition of the zone - * write plug. - */ - zwplug->flags |= BLK_ZONE_WPLUG_UNHASHED; - spin_lock_irqsave(&disk->zone_wplugs_lock, flags); - blk_zone_set_cond(rcu_dereference_check(disk->zones_cond, - lockdep_is_held(&disk->zone_wplugs_lock)), - zwplug->zone_no, zwplug->cond); - hlist_del_init_rcu(&zwplug->node); - atomic_dec(&disk->nr_zone_wplugs); - spin_unlock_irqrestore(&disk->zone_wplugs_lock, flags); + /* Drop the reference we took in disk_zone_wplug_schedule_work(). */ disk_put_zone_wplug(zwplug); } -static void blk_zone_wplug_bio_work(struct work_struct *work); - /* - * Get a reference on the write plug for the zone containing @sector. - * If the plug does not exist, it is allocated and hashed. - * Return a pointer to the zone write plug with the plug spinlock held. + * Get a zone write plug for the zone containing @sector. + * If the plug does not exist, it is allocated and inserted in the disk hash + * table. */ -static struct blk_zone_wplug *disk_get_and_lock_zone_wplug(struct gendisk *disk, - sector_t sector, gfp_t gfp_mask, - unsigned long *flags) +static struct blk_zone_wplug *disk_get_or_alloc_zone_wplug(struct gendisk *disk, + sector_t sector, gfp_t gfp_mask) { unsigned int zno = disk_zone_no(disk, sector); struct blk_zone_wplug *zwplug; again: zwplug = disk_get_zone_wplug(disk, sector); - if (zwplug) { - /* - * Check that a BIO completion or a zone reset or finish - * operation has not already removed the zone write plug from - * the hash table and dropped its reference count. In such case, - * we need to get a new plug so start over from the beginning. - */ - spin_lock_irqsave(&zwplug->lock, *flags); - if (zwplug->flags & BLK_ZONE_WPLUG_UNHASHED) { - spin_unlock_irqrestore(&zwplug->lock, *flags); - disk_put_zone_wplug(zwplug); - goto again; - } + if (zwplug) return zwplug; - } /* * Allocate and initialize a zone write plug with an extra reference @@ -704,17 +675,15 @@ static struct blk_zone_wplug *disk_get_and_lock_zone_wplug(struct gendisk *disk, zwplug->wp_offset = bdev_offset_from_zone_start(disk->part0, sector); bio_list_init(&zwplug->bio_list); INIT_WORK(&zwplug->bio_work, blk_zone_wplug_bio_work); + INIT_LIST_HEAD(&zwplug->entry); zwplug->disk = disk; - spin_lock_irqsave(&zwplug->lock, *flags); - /* * Insert the new zone write plug in the hash table. This can fail only * if another context already inserted a plug. Retry from the beginning * in such case. */ if (!disk_insert_zone_wplug(disk, zwplug)) { - spin_unlock_irqrestore(&zwplug->lock, *flags); mempool_free(zwplug, disk->zone_wplugs_pool); goto again; } @@ -739,6 +708,7 @@ static inline void blk_zone_wplug_bio_io_error(struct blk_zone_wplug *zwplug, */ static void disk_zone_wplug_abort(struct blk_zone_wplug *zwplug) { + struct gendisk *disk = zwplug->disk; struct bio *bio; lockdep_assert_held(&zwplug->lock); @@ -752,6 +722,20 @@ static void disk_zone_wplug_abort(struct blk_zone_wplug *zwplug) blk_zone_wplug_bio_io_error(zwplug, bio); zwplug->flags &= ~BLK_ZONE_WPLUG_PLUGGED; + + /* + * If we are using the per disk zone write plugs worker thread, remove + * the zone write plug from the work list and drop the reference we + * took when the zone write plug was added to that list. + */ + if (blk_queue_zoned_qd1_writes(disk->queue)) { + spin_lock(&disk->zone_wplugs_list_lock); + if (!list_empty(&zwplug->entry)) { + list_del_init(&zwplug->entry); + disk_put_zone_wplug(zwplug); + } + spin_unlock(&disk->zone_wplugs_list_lock); + } } /* @@ -788,14 +772,8 @@ static void disk_zone_wplug_set_wp_offset(struct gendisk *disk, disk_zone_wplug_update_cond(disk, zwplug); disk_zone_wplug_abort(zwplug); - - /* - * The zone write plug now has no BIO plugged: remove it from the - * hash table so that it cannot be seen. The plug will be freed - * when the last reference is dropped. - */ - if (disk_should_remove_zone_wplug(disk, zwplug)) - disk_remove_zone_wplug(disk, zwplug); + if (!zwplug->wp_offset || disk_zone_wplug_is_full(disk, zwplug)) + disk_mark_zone_wplug_dead(zwplug); } static unsigned int blk_zone_wp_offset(struct blk_zone *zone) @@ -1192,19 +1170,24 @@ void blk_zone_mgmt_bio_endio(struct bio *bio) } } -static void disk_zone_wplug_schedule_bio_work(struct gendisk *disk, - struct blk_zone_wplug *zwplug) +static void disk_zone_wplug_schedule_work(struct gendisk *disk, + struct blk_zone_wplug *zwplug) { lockdep_assert_held(&zwplug->lock); /* - * Take a reference on the zone write plug and schedule the submission - * of the next plugged BIO. blk_zone_wplug_bio_work() will release the - * reference we take here. + * Schedule the submission of the next plugged BIO. Taking a reference + * to the zone write plug is required as the bio_work belongs to the + * plug, and thus we must ensure that the write plug does not go away + * while the work is being scheduled but has not run yet. + * blk_zone_wplug_bio_work() will release the reference we take here, + * and we also drop this reference if the work is already scheduled. */ WARN_ON_ONCE(!(zwplug->flags & BLK_ZONE_WPLUG_PLUGGED)); + WARN_ON_ONCE(blk_queue_zoned_qd1_writes(disk->queue)); refcount_inc(&zwplug->ref); - queue_work(disk->zone_wplugs_wq, &zwplug->bio_work); + if (!queue_work(disk->zone_wplugs_wq, &zwplug->bio_work)) + disk_put_zone_wplug(zwplug); } static inline void disk_zone_wplug_add_bio(struct gendisk *disk, @@ -1241,6 +1224,22 @@ static inline void disk_zone_wplug_add_bio(struct gendisk *disk, bio_list_add(&zwplug->bio_list, bio); trace_disk_zone_wplug_add_bio(zwplug->disk->queue, zwplug->zone_no, bio->bi_iter.bi_sector, bio_sectors(bio)); + + /* + * If we are using the disk zone write plugs worker instead of the per + * zone write plug BIO work, add the zone write plug to the work list + * if it is not already there. Make sure to also get an extra reference + * on the zone write plug so that it does not go away until it is + * removed from the work list. + */ + if (blk_queue_zoned_qd1_writes(disk->queue)) { + spin_lock(&disk->zone_wplugs_list_lock); + if (list_empty(&zwplug->entry)) { + list_add_tail(&zwplug->entry, &disk->zone_wplugs_list); + refcount_inc(&zwplug->ref); + } + spin_unlock(&disk->zone_wplugs_list_lock); + } } /* @@ -1438,7 +1437,7 @@ static bool blk_zone_wplug_handle_write(struct bio *bio, unsigned int nr_segs) if (bio->bi_opf & REQ_NOWAIT) gfp_mask = GFP_NOWAIT; - zwplug = disk_get_and_lock_zone_wplug(disk, sector, gfp_mask, &flags); + zwplug = disk_get_or_alloc_zone_wplug(disk, sector, gfp_mask); if (!zwplug) { if (bio->bi_opf & REQ_NOWAIT) bio_wouldblock_error(bio); @@ -1447,6 +1446,21 @@ static bool blk_zone_wplug_handle_write(struct bio *bio, unsigned int nr_segs) return true; } + spin_lock_irqsave(&zwplug->lock, flags); + + /* + * If we got a zone write plug marked as dead, then the user is issuing + * writes to a full zone, or without synchronizing with zone reset or + * zone finish operations. In such case, fail the BIO to signal this + * invalid usage. + */ + if (zwplug->flags & BLK_ZONE_WPLUG_DEAD) { + spin_unlock_irqrestore(&zwplug->lock, flags); + disk_put_zone_wplug(zwplug); + bio_io_error(bio); + return true; + } + /* Indicate that this BIO is being handled using zone write plugging. */ bio_set_flag(bio, BIO_ZONE_WRITE_PLUGGING); @@ -1459,6 +1473,13 @@ static bool blk_zone_wplug_handle_write(struct bio *bio, unsigned int nr_segs) goto queue_bio; } + /* + * For rotational devices, we will use the gendisk zone write plugs + * work instead of the per zone write plug BIO work, so queue the BIO. + */ + if (blk_queue_zoned_qd1_writes(disk->queue)) + goto queue_bio; + /* If the zone is already plugged, add the BIO to the BIO plug list. */ if (zwplug->flags & BLK_ZONE_WPLUG_PLUGGED) goto queue_bio; @@ -1481,7 +1502,10 @@ static bool blk_zone_wplug_handle_write(struct bio *bio, unsigned int nr_segs) if (!(zwplug->flags & BLK_ZONE_WPLUG_PLUGGED)) { zwplug->flags |= BLK_ZONE_WPLUG_PLUGGED; - disk_zone_wplug_schedule_bio_work(disk, zwplug); + if (blk_queue_zoned_qd1_writes(disk->queue)) + wake_up_process(disk->zone_wplugs_worker); + else + disk_zone_wplug_schedule_work(disk, zwplug); } spin_unlock_irqrestore(&zwplug->lock, flags); @@ -1527,7 +1551,7 @@ static void blk_zone_wplug_handle_native_zone_append(struct bio *bio) disk->disk_name, zwplug->zone_no); disk_zone_wplug_abort(zwplug); } - disk_remove_zone_wplug(disk, zwplug); + disk_mark_zone_wplug_dead(zwplug); spin_unlock_irqrestore(&zwplug->lock, flags); disk_put_zone_wplug(zwplug); @@ -1622,21 +1646,21 @@ static void disk_zone_wplug_unplug_bio(struct gendisk *disk, spin_lock_irqsave(&zwplug->lock, flags); - /* Schedule submission of the next plugged BIO if we have one. */ - if (!bio_list_empty(&zwplug->bio_list)) { - disk_zone_wplug_schedule_bio_work(disk, zwplug); - spin_unlock_irqrestore(&zwplug->lock, flags); - return; - } - - zwplug->flags &= ~BLK_ZONE_WPLUG_PLUGGED; - /* - * If the zone is full (it was fully written or finished, or empty - * (it was reset), remove its zone write plug from the hash table. + * For rotational devices, signal the BIO completion to the zone write + * plug work. Otherwise, schedule submission of the next plugged BIO + * if we have one. */ - if (disk_should_remove_zone_wplug(disk, zwplug)) - disk_remove_zone_wplug(disk, zwplug); + if (bio_list_empty(&zwplug->bio_list)) + zwplug->flags &= ~BLK_ZONE_WPLUG_PLUGGED; + + if (blk_queue_zoned_qd1_writes(disk->queue)) + complete(&disk->zone_wplugs_worker_bio_done); + else if (!bio_list_empty(&zwplug->bio_list)) + disk_zone_wplug_schedule_work(disk, zwplug); + + if (!zwplug->wp_offset || disk_zone_wplug_is_full(disk, zwplug)) + disk_mark_zone_wplug_dead(zwplug); spin_unlock_irqrestore(&zwplug->lock, flags); } @@ -1727,10 +1751,9 @@ void blk_zone_write_plug_finish_request(struct request *req) disk_put_zone_wplug(zwplug); } -static void blk_zone_wplug_bio_work(struct work_struct *work) +static bool disk_zone_wplug_submit_bio(struct gendisk *disk, + struct blk_zone_wplug *zwplug) { - struct blk_zone_wplug *zwplug = - container_of(work, struct blk_zone_wplug, bio_work); struct block_device *bdev; unsigned long flags; struct bio *bio; @@ -1746,7 +1769,7 @@ static void blk_zone_wplug_bio_work(struct work_struct *work) if (!bio) { zwplug->flags &= ~BLK_ZONE_WPLUG_PLUGGED; spin_unlock_irqrestore(&zwplug->lock, flags); - goto put_zwplug; + return false; } trace_blk_zone_wplug_bio(zwplug->disk->queue, zwplug->zone_no, @@ -1760,14 +1783,15 @@ static void blk_zone_wplug_bio_work(struct work_struct *work) goto again; } - bdev = bio->bi_bdev; - /* * blk-mq devices will reuse the extra reference on the request queue * usage counter we took when the BIO was plugged, but the submission * path for BIO-based devices will not do that. So drop this extra * reference here. */ + if (blk_queue_zoned_qd1_writes(disk->queue)) + reinit_completion(&disk->zone_wplugs_worker_bio_done); + bdev = bio->bi_bdev; if (bdev_test_flag(bdev, BD_HAS_SUBMIT_BIO)) { bdev->bd_disk->fops->submit_bio(bio); blk_queue_exit(bdev->bd_disk->queue); @@ -1775,14 +1799,78 @@ static void blk_zone_wplug_bio_work(struct work_struct *work) blk_mq_submit_bio(bio); } -put_zwplug: - /* Drop the reference we took in disk_zone_wplug_schedule_bio_work(). */ - disk_put_zone_wplug(zwplug); + return true; +} + +static struct blk_zone_wplug *disk_get_zone_wplugs_work(struct gendisk *disk) +{ + struct blk_zone_wplug *zwplug; + + spin_lock_irq(&disk->zone_wplugs_list_lock); + zwplug = list_first_entry_or_null(&disk->zone_wplugs_list, + struct blk_zone_wplug, entry); + if (zwplug) + list_del_init(&zwplug->entry); + spin_unlock_irq(&disk->zone_wplugs_list_lock); + + return zwplug; +} + +static int disk_zone_wplugs_worker(void *data) +{ + struct gendisk *disk = data; + struct blk_zone_wplug *zwplug; + unsigned int noio_flag; + + noio_flag = memalloc_noio_save(); + set_user_nice(current, MIN_NICE); + set_freezable(); + + for (;;) { + set_current_state(TASK_INTERRUPTIBLE | TASK_FREEZABLE); + + zwplug = disk_get_zone_wplugs_work(disk); + if (zwplug) { + /* + * Process all BIOs of this zone write plug and then + * drop the reference we took when adding the zone write + * plug to the active list. + */ + set_current_state(TASK_RUNNING); + while (disk_zone_wplug_submit_bio(disk, zwplug)) + blk_wait_io(&disk->zone_wplugs_worker_bio_done); + disk_put_zone_wplug(zwplug); + continue; + } + + /* + * Only sleep if nothing sets the state to running. Else check + * for zone write plugs work again as a newly submitted BIO + * might have added a zone write plug to the work list. + */ + if (get_current_state() == TASK_RUNNING) { + try_to_freeze(); + } else { + if (kthread_should_stop()) { + set_current_state(TASK_RUNNING); + break; + } + schedule(); + } + } + + WARN_ON_ONCE(!list_empty(&disk->zone_wplugs_list)); + memalloc_noio_restore(noio_flag); + + return 0; } void disk_init_zone_resources(struct gendisk *disk) { - spin_lock_init(&disk->zone_wplugs_lock); + spin_lock_init(&disk->zone_wplugs_hash_lock); + spin_lock_init(&disk->zone_wplugs_list_lock); + INIT_LIST_HEAD(&disk->zone_wplugs_list); + init_completion(&disk->zone_wplugs_worker_bio_done); } /* @@ -1798,6 +1886,7 @@ static int disk_alloc_zone_resources(struct gendisk *disk, unsigned int pool_size) { unsigned int i; + int ret = -ENOMEM; atomic_set(&disk->nr_zone_wplugs, 0); disk->zone_wplugs_hash_bits = @@ -1823,8 +1912,21 @@ static int disk_alloc_zone_resources(struct gendisk *disk, if (!disk->zone_wplugs_wq) goto destroy_pool; + disk->zone_wplugs_worker = + kthread_create(disk_zone_wplugs_worker, disk, + "%s_zwplugs_worker", disk->disk_name); + if (IS_ERR(disk->zone_wplugs_worker)) { + ret = PTR_ERR(disk->zone_wplugs_worker); + disk->zone_wplugs_worker = NULL; + goto destroy_wq; + } + wake_up_process(disk->zone_wplugs_worker); + return 0; +destroy_wq: + destroy_workqueue(disk->zone_wplugs_wq); + disk->zone_wplugs_wq = NULL; destroy_pool: mempool_destroy(disk->zone_wplugs_pool); disk->zone_wplugs_pool = NULL; @@ -1832,7 +1934,7 @@ static int disk_alloc_zone_resources(struct gendisk *disk, kfree(disk->zone_wplugs_hash); disk->zone_wplugs_hash = NULL; disk->zone_wplugs_hash_bits = 0; - return -ENOMEM; + return ret; } static void disk_destroy_zone_wplugs_hash_table(struct gendisk *disk) @@ -1848,9 +1950,9 @@ static void disk_destroy_zone_wplugs_hash_table(struct gendisk *disk) while (!hlist_empty(&disk->zone_wplugs_hash[i])) { zwplug = hlist_entry(disk->zone_wplugs_hash[i].first, struct blk_zone_wplug, node); - refcount_inc(&zwplug->ref); - disk_remove_zone_wplug(disk, zwplug); - disk_put_zone_wplug(zwplug); + spin_lock_irq(&zwplug->lock); + disk_mark_zone_wplug_dead(zwplug); + spin_unlock_irq(&zwplug->lock); } } @@ -1872,16 +1974,20 @@ static void disk_set_zones_cond_array(struct gendisk *disk, u8 *zones_cond) { unsigned long flags; - spin_lock_irqsave(&disk->zone_wplugs_lock, flags); + spin_lock_irqsave(&disk->zone_wplugs_hash_lock, flags); zones_cond = rcu_replace_pointer(disk->zones_cond, zones_cond, - lockdep_is_held(&disk->zone_wplugs_lock)); - spin_unlock_irqrestore(&disk->zone_wplugs_lock, flags); + lockdep_is_held(&disk->zone_wplugs_hash_lock)); + spin_unlock_irqrestore(&disk->zone_wplugs_hash_lock, flags); kfree_rcu_mightsleep(zones_cond); } void disk_free_zone_resources(struct gendisk *disk) { + if (disk->zone_wplugs_worker) + kthread_stop(disk->zone_wplugs_worker); + WARN_ON_ONCE(!list_empty(&disk->zone_wplugs_list)); + if (disk->zone_wplugs_wq) { destroy_workqueue(disk->zone_wplugs_wq); disk->zone_wplugs_wq = NULL; @@ -2078,7 +2184,6 @@ static int blk_revalidate_seq_zone(struct blk_zone *zone, unsigned int idx, struct gendisk *disk = args->disk; struct blk_zone_wplug *zwplug; unsigned int wp_offset; - unsigned long flags; /* * Remember the capacity of the first sequential zone and check @@ -2108,10 +2213,9 @@ static int blk_revalidate_seq_zone(struct blk_zone *zone, unsigned int idx, if (!wp_offset || wp_offset >= zone->capacity) return 0; - zwplug = disk_get_and_lock_zone_wplug(disk, zone->wp, GFP_NOIO, &flags); + zwplug = disk_get_or_alloc_zone_wplug(disk, zone->wp, GFP_NOIO); if (!zwplug) return -ENOMEM; - spin_unlock_irqrestore(&zwplug->lock, flags); disk_put_zone_wplug(zwplug); return 0; diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index d463b9b5a0a5..ac899cd0cd70 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -13,6 +13,7 @@ #include #include #include +#include #include #include #include @@ -200,10 +201,14 @@ struct gendisk { u8 __rcu *zones_cond; unsigned int zone_wplugs_hash_bits; atomic_t nr_zone_wplugs; - spinlock_t zone_wplugs_lock; + spinlock_t zone_wplugs_hash_lock; struct mempool *zone_wplugs_pool; struct hlist_head *zone_wplugs_hash; struct workqueue_struct *zone_wplugs_wq; + spinlock_t zone_wplugs_list_lock; + struct list_head zone_wplugs_list; + struct task_struct *zone_wplugs_worker; + struct completion zone_wplugs_worker_bio_done; #endif /* CONFIG_BLK_DEV_ZONED */ #if IS_ENABLED(CONFIG_CDROM) @@ -668,6 +673,7 @@ enum { QUEUE_FLAG_NO_ELV_SWITCH, /* can't switch elevator any more */ QUEUE_FLAG_QOS_ENABLED, /* qos is enabled */ QUEUE_FLAG_BIO_ISSUE_TIME, /* record bio->issue_time_ns */ + QUEUE_FLAG_ZONED_QD1_WRITES, /* Limit zoned devices writes to QD=1 */ QUEUE_FLAG_MAX }; @@ -707,6 +713,8 @@ void blk_queue_flag_clear(unsigned int flag, struct request_queue *q); test_bit(QUEUE_FLAG_DISABLE_WBT_DEF, &(q)->queue_flags) #define blk_queue_no_elv_switch(q) \ test_bit(QUEUE_FLAG_NO_ELV_SWITCH, &(q)->queue_flags) +#define blk_queue_zoned_qd1_writes(q) \ + test_bit(QUEUE_FLAG_ZONED_QD1_WRITES, &(q)->queue_flags) extern void blk_set_pm_only(struct request_queue *q); extern void blk_clear_pm_only(struct request_queue *q);