Diffstat (limited to 'block')
-rw-r--r--  block/bio-integrity-auto.c |  26
-rw-r--r--  block/bio-integrity.c      |  48
-rw-r--r--  block/bio.c                |   1
-rw-r--r--  block/blk-core.c           |  12
-rw-r--r--  block/blk-iocost.c         |   6
-rw-r--r--  block/blk-lib.c            |   6
-rw-r--r--  block/blk-map.c            |   3
-rw-r--r--  block/blk-merge.c          |  44
-rw-r--r--  block/blk-mq-dma.c         |  29
-rw-r--r--  block/blk-mq-sched.c       | 120
-rw-r--r--  block/blk-mq-sched.h       |  40
-rw-r--r--  block/blk-mq-tag.c         |   2
-rw-r--r--  block/blk-mq.c             | 152
-rw-r--r--  block/blk-mq.h             |   2
-rw-r--r--  block/blk-settings.c       |  27
-rw-r--r--  block/blk-sysfs.c          |  26
-rw-r--r--  block/blk-throttle.c       |  45
-rw-r--r--  block/blk-zoned.c          | 928
-rw-r--r--  block/blk.h                |  23
-rw-r--r--  block/elevator.c           |  80
-rw-r--r--  block/elevator.h           |  27
-rw-r--r--  block/genhd.c              |   8
-rw-r--r--  block/ioctl.c              |   8
-rw-r--r--  block/kyber-iosched.c      |  30
-rw-r--r--  block/mq-deadline.c        | 129
-rw-r--r--  block/partitions/efi.c     |   3
26 files changed, 1274 insertions, 551 deletions
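
Before the diff itself, a short illustration of the first change in the series. The bio-integrity hunks below drop the unconditional kmalloc() of the protection-data buffer (whose failure path set BLK_STS_RESOURCE and ended the bio) in favour of an opportunistic, non-blocking kmalloc() with a guaranteed page-mempool fallback, so bio_integrity_prep() can no longer fail on allocation. The following is a minimal sketch of that pattern only; the names demo_buf_pool, demo_alloc_buf and demo_free_buf are illustrative, not the kernel's.

/*
 * Sketch only (not the kernel's code): try a cheap, non-blocking
 * kmalloc() first; if that fails, fall back to a pre-sized page
 * mempool, which cannot fail for a blocking gfp mask.  The pool is
 * assumed to have been set up with mempool_init_page_pool() at init
 * time, as the real patch does for its integrity buffer pool.
 */
#include <linux/mempool.h>
#include <linux/slab.h>
#include <linux/mm.h>

static mempool_t demo_buf_pool;		/* page-backed pool, sized at init */

static void *demo_alloc_buf(unsigned int len, bool *from_pool)
{
	void *buf;

	/* Opportunistic attempt: no direct reclaim, no retries, no warning. */
	buf = kmalloc(len, (GFP_NOIO & ~__GFP_DIRECT_RECLAIM) |
			   __GFP_NOMEMALLOC | __GFP_NORETRY | __GFP_NOWARN);
	if (buf) {
		*from_pool = false;
		return buf;
	}

	/* Guaranteed fallback: mempool_alloc() with a blocking mask never fails. */
	*from_pool = true;
	return page_address(mempool_alloc(&demo_buf_pool, GFP_NOFS));
}

static void demo_free_buf(void *buf, bool from_pool)
{
	if (from_pool)
		mempool_free(virt_to_page(buf), &demo_buf_pool);
	else
		kfree(buf);
}

In the patch itself, the fallback is recorded with a new BIP_MEMPOOL flag in bip->bip_flags so that bio_integrity_free_buf() knows whether to return the page to integrity_buf_pool or kfree() the buffer, and the pool is created at subsys_initcall time with BIO_POOL_SIZE entries of get_order(BLK_INTEGRITY_MAX_SIZE) pages.
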
diff --git a/block/bio-integrity-auto.c b/block/bio-integrity-auto.c index 687952f63bbb..9850c338548d 100644 --- a/block/bio-integrity-auto.c +++ b/block/bio-integrity-auto.c @@ -29,7 +29,7 @@ static void bio_integrity_finish(struct bio_integrity_data *bid) { bid->bio->bi_integrity = NULL; bid->bio->bi_opf &= ~REQ_INTEGRITY; - kfree(bvec_virt(bid->bip.bip_vec)); + bio_integrity_free_buf(&bid->bip); mempool_free(bid, &bid_pool); } @@ -110,8 +110,6 @@ bool bio_integrity_prep(struct bio *bio) struct bio_integrity_data *bid; bool set_flags = true; gfp_t gfp = GFP_NOIO; - unsigned int len; - void *buf; if (!bi) return true; @@ -152,19 +150,12 @@ bool bio_integrity_prep(struct bio *bio) if (WARN_ON_ONCE(bio_has_crypt_ctx(bio))) return true; - /* Allocate kernel buffer for protection data */ - len = bio_integrity_bytes(bi, bio_sectors(bio)); - buf = kmalloc(len, gfp); - if (!buf) - goto err_end_io; bid = mempool_alloc(&bid_pool, GFP_NOIO); - if (!bid) - goto err_free_buf; bio_integrity_init(bio, &bid->bip, &bid->bvec, 1); - bid->bio = bio; - bid->bip.bip_flags |= BIP_BLOCK_INTEGRITY; + bio_integrity_alloc_buf(bio, gfp & __GFP_ZERO); + bip_set_seed(&bid->bip, bio->bi_iter.bi_sector); if (set_flags) { @@ -176,23 +167,12 @@ bool bio_integrity_prep(struct bio *bio) bid->bip.bip_flags |= BIP_CHECK_REFTAG; } - if (bio_integrity_add_page(bio, virt_to_page(buf), len, - offset_in_page(buf)) < len) - goto err_end_io; - /* Auto-generate integrity metadata if this is a write */ if (bio_data_dir(bio) == WRITE && bip_should_check(&bid->bip)) blk_integrity_generate(bio); else bid->saved_bio_iter = bio->bi_iter; return true; - -err_free_buf: - kfree(buf); -err_end_io: - bio->bi_status = BLK_STS_RESOURCE; - bio_endio(bio); - return false; } EXPORT_SYMBOL(bio_integrity_prep); diff --git a/block/bio-integrity.c b/block/bio-integrity.c index bed26f1ec869..09eeaf6e74b8 100644 --- a/block/bio-integrity.c +++ b/block/bio-integrity.c @@ -14,6 +14,45 @@ struct bio_integrity_alloc { struct bio_vec bvecs[]; }; +static mempool_t integrity_buf_pool; + +void bio_integrity_alloc_buf(struct bio *bio, bool zero_buffer) +{ + struct blk_integrity *bi = blk_get_integrity(bio->bi_bdev->bd_disk); + struct bio_integrity_payload *bip = bio_integrity(bio); + unsigned int len = bio_integrity_bytes(bi, bio_sectors(bio)); + gfp_t gfp = GFP_NOIO | (zero_buffer ? 
__GFP_ZERO : 0); + void *buf; + + buf = kmalloc(len, (gfp & ~__GFP_DIRECT_RECLAIM) | + __GFP_NOMEMALLOC | __GFP_NORETRY | __GFP_NOWARN); + if (unlikely(!buf)) { + struct page *page; + + page = mempool_alloc(&integrity_buf_pool, GFP_NOFS); + if (zero_buffer) + memset(page_address(page), 0, len); + bvec_set_page(&bip->bip_vec[0], page, len, 0); + bip->bip_flags |= BIP_MEMPOOL; + } else { + bvec_set_page(&bip->bip_vec[0], virt_to_page(buf), len, + offset_in_page(buf)); + } + + bip->bip_vcnt = 1; + bip->bip_iter.bi_size = len; +} + +void bio_integrity_free_buf(struct bio_integrity_payload *bip) +{ + struct bio_vec *bv = &bip->bip_vec[0]; + + if (bip->bip_flags & BIP_MEMPOOL) + mempool_free(bv->bv_page, &integrity_buf_pool); + else + kfree(bvec_virt(bv)); +} + /** * bio_integrity_free - Free bio integrity payload * @bio: bio containing bip to be freed @@ -438,3 +477,12 @@ int bio_integrity_clone(struct bio *bio, struct bio *bio_src, return 0; } + +static int __init bio_integrity_initfn(void) +{ + if (mempool_init_page_pool(&integrity_buf_pool, BIO_POOL_SIZE, + get_order(BLK_INTEGRITY_MAX_SIZE))) + panic("bio: can't create integrity buf pool\n"); + return 0; +} +subsys_initcall(bio_integrity_initfn); diff --git a/block/bio.c b/block/bio.c index b3a79285c278..7b13bdf72de0 100644 --- a/block/bio.c +++ b/block/bio.c @@ -253,6 +253,7 @@ void bio_init(struct bio *bio, struct block_device *bdev, struct bio_vec *table, bio->bi_write_hint = 0; bio->bi_write_stream = 0; bio->bi_status = 0; + bio->bi_bvec_gap_bit = 0; bio->bi_iter.bi_sector = 0; bio->bi_iter.bi_size = 0; bio->bi_iter.bi_idx = 0; diff --git a/block/blk-core.c b/block/blk-core.c index 14ae73eebe0d..8387fe50ea15 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -662,13 +662,13 @@ static void __submit_bio(struct bio *bio) * bio_list of new bios to be added. ->submit_bio() may indeed add some more * bios through a recursive call to submit_bio_noacct. If it did, we find a * non-NULL value in bio_list and re-enter the loop from the top. - * - In this case we really did just take the bio of the top of the list (no + * - In this case we really did just take the bio off the top of the list (no * pretending) and so remove it from bio_list, and call into ->submit_bio() * again. * * bio_list_on_stack[0] contains bios submitted by the current ->submit_bio. * bio_list_on_stack[1] contains bios that were submitted before the current - * ->submit_bio, but that haven't been processed yet. + * ->submit_bio(), but that haven't been processed yet. */ static void __submit_bio_noacct(struct bio *bio) { @@ -743,8 +743,8 @@ void submit_bio_noacct_nocheck(struct bio *bio, bool split) /* * We only want one ->submit_bio to be active at a time, else stack * usage with stacked devices could be a problem. Use current->bio_list - * to collect a list of requests submited by a ->submit_bio method while - * it is active, and then process them after it returned. + * to collect a list of requests submitted by a ->submit_bio method + * while it is active, and then process them after it returned. */ if (current->bio_list) { if (split) @@ -901,7 +901,7 @@ static void bio_set_ioprio(struct bio *bio) * * submit_bio() is used to submit I/O requests to block devices. It is passed a * fully set up &struct bio that describes the I/O that needs to be done. The - * bio will be send to the device described by the bi_bdev field. + * bio will be sent to the device described by the bi_bdev field. 
* * The success/failure status of the request, along with notification of * completion, is delivered asynchronously through the ->bi_end_io() callback @@ -991,7 +991,7 @@ int iocb_bio_iopoll(struct kiocb *kiocb, struct io_comp_batch *iob, * point to a freshly allocated bio at this point. If that happens * we have a few cases to consider: * - * 1) the bio is beeing initialized and bi_bdev is NULL. We can just + * 1) the bio is being initialized and bi_bdev is NULL. We can just * simply nothing in this case * 2) the bio points to a not poll enabled device. bio_poll will catch * this and return 0 diff --git a/block/blk-iocost.c b/block/blk-iocost.c index 5bfd70311359..a0416927d33d 100644 --- a/block/blk-iocost.c +++ b/block/blk-iocost.c @@ -2334,10 +2334,8 @@ static void ioc_timer_fn(struct timer_list *timer) else usage_dur = max_t(u64, now.now - ioc->period_at, 1); - usage = clamp_t(u32, - DIV64_U64_ROUND_UP(usage_us * WEIGHT_ONE, - usage_dur), - 1, WEIGHT_ONE); + usage = clamp(DIV64_U64_ROUND_UP(usage_us * WEIGHT_ONE, usage_dur), + 1, WEIGHT_ONE); /* * Already donating or accumulated enough to start. diff --git a/block/blk-lib.c b/block/blk-lib.c index 3030a772d3aa..19e0203cc18a 100644 --- a/block/blk-lib.c +++ b/block/blk-lib.c @@ -87,11 +87,11 @@ int blkdev_issue_discard(struct block_device *bdev, sector_t sector, { struct bio *bio = NULL; struct blk_plug plug; - int ret; + int ret = 0; blk_start_plug(&plug); - ret = __blkdev_issue_discard(bdev, sector, nr_sects, gfp_mask, &bio); - if (!ret && bio) { + __blkdev_issue_discard(bdev, sector, nr_sects, gfp_mask, &bio); + if (bio) { ret = submit_bio_wait(bio); if (ret == -EOPNOTSUPP) ret = 0; diff --git a/block/blk-map.c b/block/blk-map.c index 60faf036fb6e..17a1dc288678 100644 --- a/block/blk-map.c +++ b/block/blk-map.c @@ -459,6 +459,8 @@ int blk_rq_append_bio(struct request *rq, struct bio *bio) if (rq->bio) { if (!ll_back_merge_fn(rq, bio, nr_segs)) return -EINVAL; + rq->phys_gap_bit = bio_seg_gap(rq->q, rq->biotail, bio, + rq->phys_gap_bit); rq->biotail->bi_next = bio; rq->biotail = bio; rq->__data_len += bio->bi_iter.bi_size; @@ -469,6 +471,7 @@ int blk_rq_append_bio(struct request *rq, struct bio *bio) rq->nr_phys_segments = nr_segs; rq->bio = rq->biotail = bio; rq->__data_len = bio->bi_iter.bi_size; + rq->phys_gap_bit = bio->bi_bvec_gap_bit; return 0; } EXPORT_SYMBOL(blk_rq_append_bio); diff --git a/block/blk-merge.c b/block/blk-merge.c index 37864c5d287e..d3115d7469df 100644 --- a/block/blk-merge.c +++ b/block/blk-merge.c @@ -302,6 +302,12 @@ static unsigned int bio_split_alignment(struct bio *bio, return lim->logical_block_size; } +static inline unsigned int bvec_seg_gap(struct bio_vec *bvprv, + struct bio_vec *bv) +{ + return bv->bv_offset | (bvprv->bv_offset + bvprv->bv_len); +} + /** * bio_split_io_at - check if and where to split a bio * @bio: [in] bio to be split @@ -319,8 +325,8 @@ int bio_split_io_at(struct bio *bio, const struct queue_limits *lim, unsigned *segs, unsigned max_bytes, unsigned len_align_mask) { struct bio_vec bv, bvprv, *bvprvp = NULL; + unsigned nsegs = 0, bytes = 0, gaps = 0; struct bvec_iter iter; - unsigned nsegs = 0, bytes = 0; bio_for_each_bvec(bv, bio, iter) { if (bv.bv_offset & lim->dma_alignment || @@ -331,12 +337,15 @@ int bio_split_io_at(struct bio *bio, const struct queue_limits *lim, * If the queue doesn't support SG gaps and adding this * offset would create a gap, disallow it. 
*/ - if (bvprvp && bvec_gap_to_prev(lim, bvprvp, bv.bv_offset)) - goto split; + if (bvprvp) { + if (bvec_gap_to_prev(lim, bvprvp, bv.bv_offset)) + goto split; + gaps |= bvec_seg_gap(bvprvp, &bv); + } if (nsegs < lim->max_segments && bytes + bv.bv_len <= max_bytes && - bv.bv_offset + bv.bv_len <= lim->min_segment_size) { + bv.bv_offset + bv.bv_len <= lim->max_fast_segment_size) { nsegs++; bytes += bv.bv_len; } else { @@ -350,6 +359,7 @@ int bio_split_io_at(struct bio *bio, const struct queue_limits *lim, } *segs = nsegs; + bio->bi_bvec_gap_bit = ffs(gaps); return 0; split: if (bio->bi_opf & REQ_ATOMIC) @@ -385,6 +395,7 @@ split: * big IO can be trival, disable iopoll when split needed. */ bio_clear_polled(bio); + bio->bi_bvec_gap_bit = ffs(gaps); return bytes >> SECTOR_SHIFT; } EXPORT_SYMBOL_GPL(bio_split_io_at); @@ -721,6 +732,24 @@ static bool blk_atomic_write_mergeable_rqs(struct request *rq, return (rq->cmd_flags & REQ_ATOMIC) == (next->cmd_flags & REQ_ATOMIC); } +u8 bio_seg_gap(struct request_queue *q, struct bio *prev, struct bio *next, + u8 gaps_bit) +{ + struct bio_vec pb, nb; + + if (!bio_has_data(prev)) + return 0; + + gaps_bit = min_not_zero(gaps_bit, prev->bi_bvec_gap_bit); + gaps_bit = min_not_zero(gaps_bit, next->bi_bvec_gap_bit); + + bio_get_last_bvec(prev, &pb); + bio_get_first_bvec(next, &nb); + if (!biovec_phys_mergeable(q, &pb, &nb)) + gaps_bit = min_not_zero(gaps_bit, ffs(bvec_seg_gap(&pb, &nb))); + return gaps_bit; +} + /* * For non-mq, this has to be called with the request spinlock acquired. * For mq with scheduling, the appropriate queue wide lock should be held. @@ -785,6 +814,9 @@ static struct request *attempt_merge(struct request_queue *q, if (next->start_time_ns < req->start_time_ns) req->start_time_ns = next->start_time_ns; + req->phys_gap_bit = bio_seg_gap(req->q, req->biotail, next->bio, + min_not_zero(next->phys_gap_bit, + req->phys_gap_bit)); req->biotail->bi_next = next->bio; req->biotail = next->biotail; @@ -908,6 +940,8 @@ enum bio_merge_status bio_attempt_back_merge(struct request *req, if (req->rq_flags & RQF_ZONE_WRITE_PLUGGING) blk_zone_write_plug_bio_merged(bio); + req->phys_gap_bit = bio_seg_gap(req->q, req->biotail, bio, + req->phys_gap_bit); req->biotail->bi_next = bio; req->biotail = bio; req->__data_len += bio->bi_iter.bi_size; @@ -942,6 +976,8 @@ static enum bio_merge_status bio_attempt_front_merge(struct request *req, blk_update_mixed_merge(req, bio, true); + req->phys_gap_bit = bio_seg_gap(req->q, bio, req->bio, + req->phys_gap_bit); bio->bi_next = req->bio; req->bio = bio; diff --git a/block/blk-mq-dma.c b/block/blk-mq-dma.c index 449950029872..e9108ccaf4b0 100644 --- a/block/blk-mq-dma.c +++ b/block/blk-mq-dma.c @@ -79,8 +79,7 @@ static bool blk_map_iter_next(struct request *req, struct blk_map_iter *iter, static inline bool blk_can_dma_map_iova(struct request *req, struct device *dma_dev) { - return !((queue_virt_boundary(req->q) + 1) & - dma_get_merge_boundary(dma_dev)); + return !(req_phys_gap_mask(req) & dma_get_merge_boundary(dma_dev)); } static bool blk_dma_map_bus(struct blk_dma_iter *iter, struct phys_vec *vec) @@ -93,8 +92,13 @@ static bool blk_dma_map_bus(struct blk_dma_iter *iter, struct phys_vec *vec) static bool blk_dma_map_direct(struct request *req, struct device *dma_dev, struct blk_dma_iter *iter, struct phys_vec *vec) { - iter->addr = dma_map_page(dma_dev, phys_to_page(vec->paddr), - offset_in_page(vec->paddr), vec->len, rq_dma_dir(req)); + unsigned int attrs = 0; + + if (iter->p2pdma.map == 
PCI_P2PDMA_MAP_THRU_HOST_BRIDGE) + attrs |= DMA_ATTR_MMIO; + + iter->addr = dma_map_phys(dma_dev, vec->paddr, vec->len, + rq_dma_dir(req), attrs); if (dma_mapping_error(dma_dev, iter->addr)) { iter->status = BLK_STS_RESOURCE; return false; @@ -109,14 +113,18 @@ static bool blk_rq_dma_map_iova(struct request *req, struct device *dma_dev, { enum dma_data_direction dir = rq_dma_dir(req); unsigned int mapped = 0; + unsigned int attrs = 0; int error; iter->addr = state->addr; iter->len = dma_iova_size(state); + if (iter->p2pdma.map == PCI_P2PDMA_MAP_THRU_HOST_BRIDGE) + attrs |= DMA_ATTR_MMIO; + do { error = dma_iova_link(dma_dev, state, vec->paddr, mapped, - vec->len, dir, 0); + vec->len, dir, attrs); if (error) break; mapped += vec->len; @@ -143,7 +151,7 @@ static inline void blk_rq_map_iter_init(struct request *rq, .bi_size = rq->special_vec.bv_len, } }; - } else if (bio) { + } else if (bio) { *iter = (struct blk_map_iter) { .bio = bio, .bvecs = bio->bi_io_vec, @@ -151,7 +159,7 @@ static inline void blk_rq_map_iter_init(struct request *rq, }; } else { /* the internal flush request may not have bio attached */ - *iter = (struct blk_map_iter) {}; + *iter = (struct blk_map_iter) {}; } } @@ -163,6 +171,7 @@ static bool blk_dma_map_iter_start(struct request *req, struct device *dma_dev, memset(&iter->p2pdma, 0, sizeof(iter->p2pdma)); iter->status = BLK_STS_OK; + iter->p2pdma.map = PCI_P2PDMA_MAP_NONE; /* * Grab the first segment ASAP because we'll need it to check for P2P @@ -174,10 +183,6 @@ static bool blk_dma_map_iter_start(struct request *req, struct device *dma_dev, switch (pci_p2pdma_state(&iter->p2pdma, dma_dev, phys_to_page(vec.paddr))) { case PCI_P2PDMA_MAP_BUS_ADDR: - if (iter->iter.is_integrity) - bio_integrity(req->bio)->bip_flags |= BIP_P2P_DMA; - else - req->cmd_flags |= REQ_P2PDMA; return blk_dma_map_bus(iter, &vec); case PCI_P2PDMA_MAP_THRU_HOST_BRIDGE: /* @@ -352,7 +357,7 @@ bool blk_rq_integrity_dma_map_iter_start(struct request *req, EXPORT_SYMBOL_GPL(blk_rq_integrity_dma_map_iter_start); /** - * blk_rq_integrity_dma_map_iter_start - map the next integrity DMA segment for + * blk_rq_integrity_dma_map_iter_next - map the next integrity DMA segment for * a request * @req: request to map * @dma_dev: device to map to diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c index e0bed16485c3..e26898128a7e 100644 --- a/block/blk-mq-sched.c +++ b/block/blk-mq-sched.c @@ -427,11 +427,25 @@ void blk_mq_free_sched_tags(struct elevator_tags *et, kfree(et); } -void blk_mq_free_sched_tags_batch(struct xarray *et_table, +void blk_mq_free_sched_res(struct elevator_resources *res, + struct elevator_type *type, + struct blk_mq_tag_set *set) +{ + if (res->et) { + blk_mq_free_sched_tags(res->et, set); + res->et = NULL; + } + if (res->data) { + blk_mq_free_sched_data(type, res->data); + res->data = NULL; + } +} + +void blk_mq_free_sched_res_batch(struct xarray *elv_tbl, struct blk_mq_tag_set *set) { struct request_queue *q; - struct elevator_tags *et; + struct elv_change_ctx *ctx; lockdep_assert_held_write(&set->update_nr_hwq_lock); @@ -444,13 +458,46 @@ void blk_mq_free_sched_tags_batch(struct xarray *et_table, * concurrently. 
*/ if (q->elevator) { - et = xa_load(et_table, q->id); - if (unlikely(!et)) + ctx = xa_load(elv_tbl, q->id); + if (!ctx) { WARN_ON_ONCE(1); - else - blk_mq_free_sched_tags(et, set); + continue; + } + blk_mq_free_sched_res(&ctx->res, ctx->type, set); + } + } +} + +void blk_mq_free_sched_ctx_batch(struct xarray *elv_tbl) +{ + unsigned long i; + struct elv_change_ctx *ctx; + + xa_for_each(elv_tbl, i, ctx) { + xa_erase(elv_tbl, i); + kfree(ctx); + } +} + +int blk_mq_alloc_sched_ctx_batch(struct xarray *elv_tbl, + struct blk_mq_tag_set *set) +{ + struct request_queue *q; + struct elv_change_ctx *ctx; + + lockdep_assert_held_write(&set->update_nr_hwq_lock); + + list_for_each_entry(q, &set->tag_list, tag_set_list) { + ctx = kzalloc(sizeof(struct elv_change_ctx), GFP_KERNEL); + if (!ctx) + return -ENOMEM; + + if (xa_insert(elv_tbl, q->id, ctx, GFP_KERNEL)) { + kfree(ctx); + return -ENOMEM; } } + return 0; } struct elevator_tags *blk_mq_alloc_sched_tags(struct blk_mq_tag_set *set, @@ -466,8 +513,7 @@ struct elevator_tags *blk_mq_alloc_sched_tags(struct blk_mq_tag_set *set, else nr_tags = nr_hw_queues; - et = kmalloc(sizeof(struct elevator_tags) + - nr_tags * sizeof(struct blk_mq_tags *), gfp); + et = kmalloc(struct_size(et, tags, nr_tags), gfp); if (!et) return NULL; @@ -498,12 +544,33 @@ out: return NULL; } -int blk_mq_alloc_sched_tags_batch(struct xarray *et_table, +int blk_mq_alloc_sched_res(struct request_queue *q, + struct elevator_type *type, + struct elevator_resources *res, + unsigned int nr_hw_queues) +{ + struct blk_mq_tag_set *set = q->tag_set; + + res->et = blk_mq_alloc_sched_tags(set, nr_hw_queues, + blk_mq_default_nr_requests(set)); + if (!res->et) + return -ENOMEM; + + res->data = blk_mq_alloc_sched_data(q, type); + if (IS_ERR(res->data)) { + blk_mq_free_sched_tags(res->et, set); + return -ENOMEM; + } + + return 0; +} + +int blk_mq_alloc_sched_res_batch(struct xarray *elv_tbl, struct blk_mq_tag_set *set, unsigned int nr_hw_queues) { + struct elv_change_ctx *ctx; struct request_queue *q; - struct elevator_tags *et; - gfp_t gfp = GFP_NOIO | __GFP_ZERO | __GFP_NOWARN | __GFP_NORETRY; + int ret = -ENOMEM; lockdep_assert_held_write(&set->update_nr_hwq_lock); @@ -516,39 +583,44 @@ int blk_mq_alloc_sched_tags_batch(struct xarray *et_table, * concurrently. 
*/ if (q->elevator) { - et = blk_mq_alloc_sched_tags(set, nr_hw_queues, - blk_mq_default_nr_requests(set)); - if (!et) + ctx = xa_load(elv_tbl, q->id); + if (WARN_ON_ONCE(!ctx)) { + ret = -ENOENT; + goto out_unwind; + } + + ret = blk_mq_alloc_sched_res(q, q->elevator->type, + &ctx->res, nr_hw_queues); + if (ret) goto out_unwind; - if (xa_insert(et_table, q->id, et, gfp)) - goto out_free_tags; } } return 0; -out_free_tags: - blk_mq_free_sched_tags(et, set); + out_unwind: list_for_each_entry_continue_reverse(q, &set->tag_list, tag_set_list) { if (q->elevator) { - et = xa_load(et_table, q->id); - if (et) - blk_mq_free_sched_tags(et, set); + ctx = xa_load(elv_tbl, q->id); + if (ctx) + blk_mq_free_sched_res(&ctx->res, + ctx->type, set); } } - return -ENOMEM; + return ret; } /* caller must have a reference to @e, will grab another one if successful */ int blk_mq_init_sched(struct request_queue *q, struct elevator_type *e, - struct elevator_tags *et) + struct elevator_resources *res) { unsigned int flags = q->tag_set->flags; + struct elevator_tags *et = res->et; struct blk_mq_hw_ctx *hctx; struct elevator_queue *eq; unsigned long i; int ret; - eq = elevator_alloc(q, e, et); + eq = elevator_alloc(q, e, res); if (!eq) return -ENOMEM; diff --git a/block/blk-mq-sched.h b/block/blk-mq-sched.h index 8e21a6b1415d..02c40a72e959 100644 --- a/block/blk-mq-sched.h +++ b/block/blk-mq-sched.h @@ -19,18 +19,52 @@ void __blk_mq_sched_restart(struct blk_mq_hw_ctx *hctx); void blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx); int blk_mq_init_sched(struct request_queue *q, struct elevator_type *e, - struct elevator_tags *et); + struct elevator_resources *res); void blk_mq_exit_sched(struct request_queue *q, struct elevator_queue *e); void blk_mq_sched_free_rqs(struct request_queue *q); struct elevator_tags *blk_mq_alloc_sched_tags(struct blk_mq_tag_set *set, unsigned int nr_hw_queues, unsigned int nr_requests); -int blk_mq_alloc_sched_tags_batch(struct xarray *et_table, +int blk_mq_alloc_sched_res(struct request_queue *q, + struct elevator_type *type, + struct elevator_resources *res, + unsigned int nr_hw_queues); +int blk_mq_alloc_sched_res_batch(struct xarray *elv_tbl, struct blk_mq_tag_set *set, unsigned int nr_hw_queues); +int blk_mq_alloc_sched_ctx_batch(struct xarray *elv_tbl, + struct blk_mq_tag_set *set); +void blk_mq_free_sched_ctx_batch(struct xarray *elv_tbl); void blk_mq_free_sched_tags(struct elevator_tags *et, struct blk_mq_tag_set *set); -void blk_mq_free_sched_tags_batch(struct xarray *et_table, +void blk_mq_free_sched_res(struct elevator_resources *res, + struct elevator_type *type, + struct blk_mq_tag_set *set); +void blk_mq_free_sched_res_batch(struct xarray *et_table, struct blk_mq_tag_set *set); +/* + * blk_mq_alloc_sched_data() - Allocates scheduler specific data + * Returns: + * - Pointer to allocated data on success + * - NULL if no allocation needed + * - ERR_PTR(-ENOMEM) in case of failure + */ +static inline void *blk_mq_alloc_sched_data(struct request_queue *q, + struct elevator_type *e) +{ + void *sched_data; + + if (!e || !e->ops.alloc_sched_data) + return NULL; + + sched_data = e->ops.alloc_sched_data(q); + return (sched_data) ?: ERR_PTR(-ENOMEM); +} + +static inline void blk_mq_free_sched_data(struct elevator_type *e, void *data) +{ + if (e && e->ops.free_sched_data) + e->ops.free_sched_data(data); +} static inline void blk_mq_sched_restart(struct blk_mq_hw_ctx *hctx) { diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c index 5b664dbdf655..33946cdb5716 100644 --- 
a/block/blk-mq-tag.c +++ b/block/blk-mq-tag.c @@ -499,7 +499,7 @@ void blk_mq_queue_tag_busy_iter(struct request_queue *q, busy_tag_iter_fn *fn, int srcu_idx; /* - * __blk_mq_update_nr_hw_queues() updates nr_hw_queues and hctx_table + * __blk_mq_update_nr_hw_queues() updates nr_hw_queues and queue_hw_ctx * while the queue is frozen. So we can use q_usage_counter to avoid * racing with it. */ diff --git a/block/blk-mq.c b/block/blk-mq.c index d626d32f6e57..4e96bb246247 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -376,6 +376,7 @@ void blk_rq_init(struct request_queue *q, struct request *rq) INIT_LIST_HEAD(&rq->queuelist); rq->q = q; rq->__sector = (sector_t) -1; + rq->phys_gap_bit = 0; INIT_HLIST_NODE(&rq->hash); RB_CLEAR_NODE(&rq->rb_node); rq->tag = BLK_MQ_NO_TAG; @@ -467,21 +468,26 @@ __blk_mq_alloc_requests_batch(struct blk_mq_alloc_data *data) unsigned long tag_mask; int i, nr = 0; - tag_mask = blk_mq_get_tags(data, data->nr_tags, &tag_offset); - if (unlikely(!tag_mask)) - return NULL; + do { + tag_mask = blk_mq_get_tags(data, data->nr_tags - nr, &tag_offset); + if (unlikely(!tag_mask)) { + if (nr == 0) + return NULL; + break; + } + tags = blk_mq_tags_from_data(data); + for (i = 0; tag_mask; i++) { + if (!(tag_mask & (1UL << i))) + continue; + tag = tag_offset + i; + prefetch(tags->static_rqs[tag]); + tag_mask &= ~(1UL << i); + rq = blk_mq_rq_ctx_init(data, tags, tag); + rq_list_add_head(data->cached_rqs, rq); + nr++; + } + } while (data->nr_tags > nr); - tags = blk_mq_tags_from_data(data); - for (i = 0; tag_mask; i++) { - if (!(tag_mask & (1UL << i))) - continue; - tag = tag_offset + i; - prefetch(tags->static_rqs[tag]); - tag_mask &= ~(1UL << i); - rq = blk_mq_rq_ctx_init(data, tags, tag); - rq_list_add_head(data->cached_rqs, rq); - nr++; - } if (!(data->rq_flags & RQF_SCHED_TAGS)) blk_mq_add_active_requests(data->hctx, nr); /* caller already holds a reference, add for remainder */ @@ -668,6 +674,7 @@ struct request *blk_mq_alloc_request(struct request_queue *q, blk_opf_t opf, goto out_queue_exit; } rq->__data_len = 0; + rq->phys_gap_bit = 0; rq->__sector = (sector_t) -1; rq->bio = rq->biotail = NULL; return rq; @@ -723,7 +730,7 @@ struct request *blk_mq_alloc_request_hctx(struct request_queue *q, * If not tell the caller that it should skip this queue. 
*/ ret = -EXDEV; - data.hctx = xa_load(&q->hctx_table, hctx_idx); + data.hctx = q->queue_hw_ctx[hctx_idx]; if (!blk_mq_hw_queue_mapped(data.hctx)) goto out_queue_exit; cpu = cpumask_first_and(data.hctx->cpumask, cpu_online_mask); @@ -748,6 +755,7 @@ struct request *blk_mq_alloc_request_hctx(struct request_queue *q, rq = blk_mq_rq_ctx_init(&data, blk_mq_tags_from_data(&data), tag); blk_mq_rq_time_init(rq, alloc_time_ns); rq->__data_len = 0; + rq->phys_gap_bit = 0; rq->__sector = (sector_t) -1; rq->bio = rq->biotail = NULL; return rq; @@ -2674,6 +2682,8 @@ static void blk_mq_bio_to_request(struct request *rq, struct bio *bio, rq->bio = rq->biotail = bio; rq->__sector = bio->bi_iter.bi_sector; rq->__data_len = bio->bi_iter.bi_size; + rq->phys_gap_bit = bio->bi_bvec_gap_bit; + rq->nr_phys_segments = nr_segs; if (bio_integrity(bio)) rq->nr_integrity_segments = blk_rq_count_integrity_sg(rq->q, @@ -3380,6 +3390,7 @@ int blk_rq_prep_clone(struct request *rq, struct request *rq_src, } rq->nr_phys_segments = rq_src->nr_phys_segments; rq->nr_integrity_segments = rq_src->nr_integrity_segments; + rq->phys_gap_bit = rq_src->phys_gap_bit; if (rq->bio && blk_crypto_rq_bio_prep(rq, rq->bio, gfp_mask) < 0) goto free_and_out; @@ -3935,8 +3946,6 @@ static void blk_mq_exit_hctx(struct request_queue *q, blk_free_flush_queue_callback); hctx->fq = NULL; - xa_erase(&q->hctx_table, hctx_idx); - spin_lock(&q->unused_hctx_lock); list_add(&hctx->hctx_list, &q->unused_hctx_list); spin_unlock(&q->unused_hctx_lock); @@ -3978,14 +3987,8 @@ static int blk_mq_init_hctx(struct request_queue *q, hctx->numa_node)) goto exit_hctx; - if (xa_insert(&q->hctx_table, hctx_idx, hctx, GFP_KERNEL)) - goto exit_flush_rq; - return 0; - exit_flush_rq: - if (set->ops->exit_request) - set->ops->exit_request(set, hctx->fq->flush_rq, hctx_idx); exit_hctx: if (set->ops->exit_hctx) set->ops->exit_hctx(hctx, hctx_idx); @@ -4374,7 +4377,7 @@ void blk_mq_release(struct request_queue *q) kobject_put(&hctx->kobj); } - xa_destroy(&q->hctx_table); + kfree(q->queue_hw_ctx); /* * release .mq_kobj and sw queue's kobject now because @@ -4518,26 +4521,49 @@ static struct blk_mq_hw_ctx *blk_mq_alloc_and_init_hctx( static void __blk_mq_realloc_hw_ctxs(struct blk_mq_tag_set *set, struct request_queue *q) { - struct blk_mq_hw_ctx *hctx; - unsigned long i, j; + int i, j, end; + struct blk_mq_hw_ctx **hctxs = q->queue_hw_ctx; + + if (q->nr_hw_queues < set->nr_hw_queues) { + struct blk_mq_hw_ctx **new_hctxs; + + new_hctxs = kcalloc_node(set->nr_hw_queues, + sizeof(*new_hctxs), GFP_KERNEL, + set->numa_node); + if (!new_hctxs) + return; + if (hctxs) + memcpy(new_hctxs, hctxs, q->nr_hw_queues * + sizeof(*hctxs)); + rcu_assign_pointer(q->queue_hw_ctx, new_hctxs); + /* + * Make sure reading the old queue_hw_ctx from other + * context concurrently won't trigger uaf. 
+ */ + synchronize_rcu_expedited(); + kfree(hctxs); + hctxs = new_hctxs; + } for (i = 0; i < set->nr_hw_queues; i++) { int old_node; int node = blk_mq_get_hctx_node(set, i); - struct blk_mq_hw_ctx *old_hctx = xa_load(&q->hctx_table, i); + struct blk_mq_hw_ctx *old_hctx = hctxs[i]; if (old_hctx) { old_node = old_hctx->numa_node; blk_mq_exit_hctx(q, set, old_hctx, i); } - if (!blk_mq_alloc_and_init_hctx(set, q, i, node)) { + hctxs[i] = blk_mq_alloc_and_init_hctx(set, q, i, node); + if (!hctxs[i]) { if (!old_hctx) break; pr_warn("Allocate new hctx on node %d fails, fallback to previous one on node %d\n", node, old_node); - hctx = blk_mq_alloc_and_init_hctx(set, q, i, old_node); - WARN_ON_ONCE(!hctx); + hctxs[i] = blk_mq_alloc_and_init_hctx(set, q, i, + old_node); + WARN_ON_ONCE(!hctxs[i]); } } /* @@ -4546,13 +4572,21 @@ static void __blk_mq_realloc_hw_ctxs(struct blk_mq_tag_set *set, */ if (i != set->nr_hw_queues) { j = q->nr_hw_queues; + end = i; } else { j = i; + end = q->nr_hw_queues; q->nr_hw_queues = set->nr_hw_queues; } - xa_for_each_start(&q->hctx_table, j, hctx, j) - blk_mq_exit_hctx(q, set, hctx, j); + for (; j < end; j++) { + struct blk_mq_hw_ctx *hctx = hctxs[j]; + + if (hctx) { + blk_mq_exit_hctx(q, set, hctx, j); + hctxs[j] = NULL; + } + } } static void blk_mq_realloc_hw_ctxs(struct blk_mq_tag_set *set, @@ -4588,8 +4622,6 @@ int blk_mq_init_allocated_queue(struct blk_mq_tag_set *set, INIT_LIST_HEAD(&q->unused_hctx_list); spin_lock_init(&q->unused_hctx_lock); - xa_init(&q->hctx_table); - blk_mq_realloc_hw_ctxs(set, q); if (!q->nr_hw_queues) goto err_hctxs; @@ -4983,27 +5015,28 @@ struct elevator_tags *blk_mq_update_nr_requests(struct request_queue *q, * Switch back to the elevator type stored in the xarray. */ static void blk_mq_elv_switch_back(struct request_queue *q, - struct xarray *elv_tbl, struct xarray *et_tbl) + struct xarray *elv_tbl) { - struct elevator_type *e = xa_load(elv_tbl, q->id); - struct elevator_tags *t = xa_load(et_tbl, q->id); + struct elv_change_ctx *ctx = xa_load(elv_tbl, q->id); + + if (WARN_ON_ONCE(!ctx)) + return; /* The elv_update_nr_hw_queues unfreezes the queue. */ - elv_update_nr_hw_queues(q, e, t); + elv_update_nr_hw_queues(q, ctx); /* Drop the reference acquired in blk_mq_elv_switch_none. */ - if (e) - elevator_put(e); + if (ctx->type) + elevator_put(ctx->type); } /* - * Stores elevator type in xarray and set current elevator to none. It uses - * q->id as an index to store the elevator type into the xarray. + * Stores elevator name and type in ctx and set current elevator to none. */ static int blk_mq_elv_switch_none(struct request_queue *q, struct xarray *elv_tbl) { - int ret = 0; + struct elv_change_ctx *ctx; lockdep_assert_held_write(&q->tag_set->update_nr_hwq_lock); @@ -5015,10 +5048,11 @@ static int blk_mq_elv_switch_none(struct request_queue *q, * can't run concurrently. */ if (q->elevator) { + ctx = xa_load(elv_tbl, q->id); + if (WARN_ON_ONCE(!ctx)) + return -ENOENT; - ret = xa_insert(elv_tbl, q->id, q->elevator->type, GFP_KERNEL); - if (WARN_ON_ONCE(ret)) - return ret; + ctx->name = q->elevator->type->elevator_name; /* * Before we switch elevator to 'none', take a reference to @@ -5029,9 +5063,14 @@ static int blk_mq_elv_switch_none(struct request_queue *q, */ __elevator_get(q->elevator->type); + /* + * Store elevator type so that we can release the reference + * taken above later. 
+ */ + ctx->type = q->elevator->type; elevator_set_none(q); } - return ret; + return 0; } static void __blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set, @@ -5041,7 +5080,7 @@ static void __blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set, int prev_nr_hw_queues = set->nr_hw_queues; unsigned int memflags; int i; - struct xarray elv_tbl, et_tbl; + struct xarray elv_tbl; bool queues_frozen = false; lockdep_assert_held(&set->tag_list_lock); @@ -5055,11 +5094,12 @@ static void __blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set, memflags = memalloc_noio_save(); - xa_init(&et_tbl); - if (blk_mq_alloc_sched_tags_batch(&et_tbl, set, nr_hw_queues) < 0) - goto out_memalloc_restore; - xa_init(&elv_tbl); + if (blk_mq_alloc_sched_ctx_batch(&elv_tbl, set) < 0) + goto out_free_ctx; + + if (blk_mq_alloc_sched_res_batch(&elv_tbl, set, nr_hw_queues) < 0) + goto out_free_ctx; list_for_each_entry(q, &set->tag_list, tag_set_list) { blk_mq_debugfs_unregister_hctxs(q); @@ -5105,7 +5145,7 @@ switch_back: /* switch_back expects queue to be frozen */ if (!queues_frozen) blk_mq_freeze_queue_nomemsave(q); - blk_mq_elv_switch_back(q, &elv_tbl, &et_tbl); + blk_mq_elv_switch_back(q, &elv_tbl); } list_for_each_entry(q, &set->tag_list, tag_set_list) { @@ -5116,9 +5156,9 @@ switch_back: blk_mq_add_hw_queues_cpuhp(q); } +out_free_ctx: + blk_mq_free_sched_ctx_batch(&elv_tbl); xa_destroy(&elv_tbl); - xa_destroy(&et_tbl); -out_memalloc_restore: memalloc_noio_restore(memflags); /* Free the excess tags when nr_hw_queues shrink. */ @@ -5168,7 +5208,7 @@ int blk_mq_poll(struct request_queue *q, blk_qc_t cookie, { if (!blk_mq_can_poll(q)) return 0; - return blk_hctx_poll(q, xa_load(&q->hctx_table, cookie), iob, flags); + return blk_hctx_poll(q, q->queue_hw_ctx[cookie], iob, flags); } int blk_rq_poll(struct request *rq, struct io_comp_batch *iob, diff --git a/block/blk-mq.h b/block/blk-mq.h index c4fccdeb5441..aa15d31aaae9 100644 --- a/block/blk-mq.h +++ b/block/blk-mq.h @@ -84,7 +84,7 @@ static inline struct blk_mq_hw_ctx *blk_mq_map_queue_type(struct request_queue * enum hctx_type type, unsigned int cpu) { - return xa_load(&q->hctx_table, q->tag_set->map[type].mq_map[cpu]); + return queue_hctx((q), (q->tag_set->map[type].mq_map[cpu])); } static inline enum hctx_type blk_mq_get_hctx_type(blk_opf_t opf) diff --git a/block/blk-settings.c b/block/blk-settings.c index d74b13ec8e54..51401f08ce05 100644 --- a/block/blk-settings.c +++ b/block/blk-settings.c @@ -123,6 +123,19 @@ static int blk_validate_zoned_limits(struct queue_limits *lim) return 0; } +/* + * Maximum size of I/O that needs a block layer integrity buffer. Limited + * by the number of intervals for which we can fit the integrity buffer into + * the buffer size. Because the buffer is a single segment it is also limited + * by the maximum segment size. + */ +static inline unsigned int max_integrity_io_size(struct queue_limits *lim) +{ + return min_t(unsigned int, lim->max_segment_size, + (BLK_INTEGRITY_MAX_SIZE / lim->integrity.metadata_size) << + lim->integrity.interval_exp); +} + static int blk_validate_integrity_limits(struct queue_limits *lim) { struct blk_integrity *bi = &lim->integrity; @@ -194,6 +207,14 @@ static int blk_validate_integrity_limits(struct queue_limits *lim) (1U << bi->interval_exp) - 1); } + /* + * The block layer automatically adds integrity data for bios that don't + * already have it. Limit the I/O size so that a single maximum size + * metadata segment can cover the integrity data for the entire I/O. 
+ */ + lim->max_sectors = min(lim->max_sectors, + max_integrity_io_size(lim) >> SECTOR_SHIFT); + return 0; } @@ -467,12 +488,12 @@ int blk_validate_limits(struct queue_limits *lim) return -EINVAL; } - /* setup min segment size for building new segment in fast path */ + /* setup max segment size for building new segment in fast path */ if (lim->seg_boundary_mask > lim->max_segment_size - 1) seg_size = lim->max_segment_size; else seg_size = lim->seg_boundary_mask + 1; - lim->min_segment_size = min_t(unsigned int, seg_size, PAGE_SIZE); + lim->max_fast_segment_size = min_t(unsigned int, seg_size, PAGE_SIZE); /* * We require drivers to at least do logical block aligned I/O, but @@ -535,6 +556,8 @@ int queue_limits_commit_update(struct request_queue *q, { int error; + lockdep_assert_held(&q->limits_lock); + error = blk_validate_limits(lim); if (error) goto out_unlock; diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index 76c47fe9b8d6..8684c57498cc 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c @@ -143,21 +143,22 @@ queue_ra_store(struct gendisk *disk, const char *page, size_t count) { unsigned long ra_kb; ssize_t ret; - unsigned int memflags; struct request_queue *q = disk->queue; ret = queue_var_store(&ra_kb, page, count); if (ret < 0) return ret; /* - * ->ra_pages is protected by ->limits_lock because it is usually - * calculated from the queue limits by queue_limits_commit_update. + * The ->ra_pages change below is protected by ->limits_lock because it + * is usually calculated from the queue limits by + * queue_limits_commit_update(). + * + * bdi->ra_pages reads are not serialized against bdi->ra_pages writes. + * Use WRITE_ONCE() to write bdi->ra_pages once. */ mutex_lock(&q->limits_lock); - memflags = blk_mq_freeze_queue(q); - disk->bdi->ra_pages = ra_kb >> (PAGE_SHIFT - 10); + WRITE_ONCE(disk->bdi->ra_pages, ra_kb >> (PAGE_SHIFT - 10)); mutex_unlock(&q->limits_lock); - blk_mq_unfreeze_queue(q, memflags); return ret; } @@ -375,21 +376,18 @@ static ssize_t queue_nomerges_store(struct gendisk *disk, const char *page, size_t count) { unsigned long nm; - unsigned int memflags; struct request_queue *q = disk->queue; ssize_t ret = queue_var_store(&nm, page, count); if (ret < 0) return ret; - memflags = blk_mq_freeze_queue(q); blk_queue_flag_clear(QUEUE_FLAG_NOMERGES, q); blk_queue_flag_clear(QUEUE_FLAG_NOXMERGES, q); if (nm == 2) blk_queue_flag_set(QUEUE_FLAG_NOMERGES, q); else if (nm) blk_queue_flag_set(QUEUE_FLAG_NOXMERGES, q); - blk_mq_unfreeze_queue(q, memflags); return ret; } @@ -409,7 +407,6 @@ queue_rq_affinity_store(struct gendisk *disk, const char *page, size_t count) #ifdef CONFIG_SMP struct request_queue *q = disk->queue; unsigned long val; - unsigned int memflags; ret = queue_var_store(&val, page, count); if (ret < 0) @@ -421,7 +418,6 @@ queue_rq_affinity_store(struct gendisk *disk, const char *page, size_t count) * are accessed individually using atomic test_bit operation. So we * don't grab any lock while updating these flags. 
*/ - memflags = blk_mq_freeze_queue(q); if (val == 2) { blk_queue_flag_set(QUEUE_FLAG_SAME_COMP, q); blk_queue_flag_set(QUEUE_FLAG_SAME_FORCE, q); @@ -432,7 +428,6 @@ queue_rq_affinity_store(struct gendisk *disk, const char *page, size_t count) blk_queue_flag_clear(QUEUE_FLAG_SAME_COMP, q); blk_queue_flag_clear(QUEUE_FLAG_SAME_FORCE, q); } - blk_mq_unfreeze_queue(q, memflags); #endif return ret; } @@ -446,11 +441,9 @@ static ssize_t queue_poll_delay_store(struct gendisk *disk, const char *page, static ssize_t queue_poll_store(struct gendisk *disk, const char *page, size_t count) { - unsigned int memflags; ssize_t ret = count; struct request_queue *q = disk->queue; - memflags = blk_mq_freeze_queue(q); if (!(q->limits.features & BLK_FEAT_POLL)) { ret = -EINVAL; goto out; @@ -459,7 +452,6 @@ static ssize_t queue_poll_store(struct gendisk *disk, const char *page, pr_info_ratelimited("writes to the poll attribute are ignored.\n"); pr_info_ratelimited("please use driver specific parameters instead.\n"); out: - blk_mq_unfreeze_queue(q, memflags); return ret; } @@ -472,7 +464,7 @@ static ssize_t queue_io_timeout_show(struct gendisk *disk, char *page) static ssize_t queue_io_timeout_store(struct gendisk *disk, const char *page, size_t count) { - unsigned int val, memflags; + unsigned int val; int err; struct request_queue *q = disk->queue; @@ -480,9 +472,7 @@ static ssize_t queue_io_timeout_store(struct gendisk *disk, const char *page, if (err || val == 0) return -EINVAL; - memflags = blk_mq_freeze_queue(q); blk_queue_rq_timeout(q, msecs_to_jiffies(val)); - blk_mq_unfreeze_queue(q, memflags); return count; } diff --git a/block/blk-throttle.c b/block/blk-throttle.c index 2c5b64b1a724..97188a795848 100644 --- a/block/blk-throttle.c +++ b/block/blk-throttle.c @@ -12,7 +12,6 @@ #include <linux/blktrace_api.h> #include "blk.h" #include "blk-cgroup-rwstat.h" -#include "blk-stat.h" #include "blk-throttle.h" /* Max dispatch from a group in 1 round */ @@ -22,9 +21,7 @@ #define THROTL_QUANTUM 32 /* Throttling is performed over a slice and after that slice is renewed */ -#define DFL_THROTL_SLICE_HD (HZ / 10) -#define DFL_THROTL_SLICE_SSD (HZ / 50) -#define MAX_THROTL_SLICE (HZ) +#define DFL_THROTL_SLICE (HZ / 10) /* A workqueue to queue throttle related work */ static struct workqueue_struct *kthrotld_workqueue; @@ -41,12 +38,8 @@ struct throtl_data /* Total Number of queued bios on READ and WRITE lists */ unsigned int nr_queued[2]; - unsigned int throtl_slice; - /* Work for dispatching throttled bios */ struct work_struct dispatch_work; - - bool track_bio_latency; }; static void throtl_pending_timer_fn(struct timer_list *t); @@ -451,7 +444,7 @@ static void throtl_dequeue_tg(struct throtl_grp *tg) static void throtl_schedule_pending_timer(struct throtl_service_queue *sq, unsigned long expires) { - unsigned long max_expire = jiffies + 8 * sq_to_td(sq)->throtl_slice; + unsigned long max_expire = jiffies + 8 * DFL_THROTL_SLICE; /* * Since we are adjusting the throttle limit dynamically, the sleep @@ -519,7 +512,7 @@ static inline void throtl_start_new_slice_with_credit(struct throtl_grp *tg, if (time_after(start, tg->slice_start[rw])) tg->slice_start[rw] = start; - tg->slice_end[rw] = jiffies + tg->td->throtl_slice; + tg->slice_end[rw] = jiffies + DFL_THROTL_SLICE; throtl_log(&tg->service_queue, "[%c] new slice with credit start=%lu end=%lu jiffies=%lu", rw == READ ? 
'R' : 'W', tg->slice_start[rw], @@ -534,7 +527,7 @@ static inline void throtl_start_new_slice(struct throtl_grp *tg, bool rw, tg->io_disp[rw] = 0; } tg->slice_start[rw] = jiffies; - tg->slice_end[rw] = jiffies + tg->td->throtl_slice; + tg->slice_end[rw] = jiffies + DFL_THROTL_SLICE; throtl_log(&tg->service_queue, "[%c] new slice start=%lu end=%lu jiffies=%lu", @@ -545,7 +538,7 @@ static inline void throtl_start_new_slice(struct throtl_grp *tg, bool rw, static inline void throtl_set_slice_end(struct throtl_grp *tg, bool rw, unsigned long jiffy_end) { - tg->slice_end[rw] = roundup(jiffy_end, tg->td->throtl_slice); + tg->slice_end[rw] = roundup(jiffy_end, DFL_THROTL_SLICE); } static inline void throtl_extend_slice(struct throtl_grp *tg, bool rw, @@ -676,12 +669,12 @@ static inline void throtl_trim_slice(struct throtl_grp *tg, bool rw) * sooner, then we need to reduce slice_end. A high bogus slice_end * is bad because it does not allow new slice to start. */ - throtl_set_slice_end(tg, rw, jiffies + tg->td->throtl_slice); + throtl_set_slice_end(tg, rw, jiffies + DFL_THROTL_SLICE); time_elapsed = rounddown(jiffies - tg->slice_start[rw], - tg->td->throtl_slice); + DFL_THROTL_SLICE); /* Don't trim slice until at least 2 slices are used */ - if (time_elapsed < tg->td->throtl_slice * 2) + if (time_elapsed < DFL_THROTL_SLICE * 2) return; /* @@ -692,7 +685,7 @@ static inline void throtl_trim_slice(struct throtl_grp *tg, bool rw) * lower rate than expected. Therefore, other than the above rounddown, * one extra slice is preserved for deviation. */ - time_elapsed -= tg->td->throtl_slice; + time_elapsed -= DFL_THROTL_SLICE; bytes_trim = throtl_trim_bps(tg, rw, time_elapsed); io_trim = throtl_trim_iops(tg, rw, time_elapsed); if (!bytes_trim && !io_trim) @@ -702,7 +695,7 @@ static inline void throtl_trim_slice(struct throtl_grp *tg, bool rw) throtl_log(&tg->service_queue, "[%c] trim slice nr=%lu bytes=%lld io=%d start=%lu end=%lu jiffies=%lu", - rw == READ ? 'R' : 'W', time_elapsed / tg->td->throtl_slice, + rw == READ ? 'R' : 'W', time_elapsed / DFL_THROTL_SLICE, bytes_trim, io_trim, tg->slice_start[rw], tg->slice_end[rw], jiffies); } @@ -773,7 +766,7 @@ static unsigned long tg_within_iops_limit(struct throtl_grp *tg, struct bio *bio jiffy_elapsed = jiffies - tg->slice_start[rw]; /* Round up to the next throttle slice, wait time must be nonzero */ - jiffy_elapsed_rnd = roundup(jiffy_elapsed + 1, tg->td->throtl_slice); + jiffy_elapsed_rnd = roundup(jiffy_elapsed + 1, DFL_THROTL_SLICE); io_allowed = calculate_io_allowed(iops_limit, jiffy_elapsed_rnd); if (io_allowed > 0 && tg->io_disp[rw] + 1 <= io_allowed) return 0; @@ -799,9 +792,9 @@ static unsigned long tg_within_bps_limit(struct throtl_grp *tg, struct bio *bio, /* Slice has just started. Consider one slice interval */ if (!jiffy_elapsed) - jiffy_elapsed_rnd = tg->td->throtl_slice; + jiffy_elapsed_rnd = DFL_THROTL_SLICE; - jiffy_elapsed_rnd = roundup(jiffy_elapsed_rnd, tg->td->throtl_slice); + jiffy_elapsed_rnd = roundup(jiffy_elapsed_rnd, DFL_THROTL_SLICE); bytes_allowed = calculate_bytes_allowed(bps_limit, jiffy_elapsed_rnd); /* Need to consider the case of bytes_allowed overflow. 
*/ if ((bytes_allowed > 0 && tg->bytes_disp[rw] + bio_size <= bytes_allowed) @@ -853,7 +846,7 @@ static void tg_update_slice(struct throtl_grp *tg, bool rw) sq_queued(&tg->service_queue, rw) == 0) throtl_start_new_slice(tg, rw, true); else - throtl_extend_slice(tg, rw, jiffies + tg->td->throtl_slice); + throtl_extend_slice(tg, rw, jiffies + DFL_THROTL_SLICE); } static unsigned long tg_dispatch_bps_time(struct throtl_grp *tg, struct bio *bio) @@ -1338,18 +1331,8 @@ static int blk_throtl_init(struct gendisk *disk) if (ret) { q->td = NULL; kfree(td); - goto out; } - if (blk_queue_nonrot(q)) - td->throtl_slice = DFL_THROTL_SLICE_SSD; - else - td->throtl_slice = DFL_THROTL_SLICE_HD; - td->track_bio_latency = !queue_is_mq(q); - if (!td->track_bio_latency) - blk_stat_enable_accounting(q); - -out: blk_mq_unquiesce_queue(disk->queue); blk_mq_unfreeze_queue(disk->queue, memflags); diff --git a/block/blk-zoned.c b/block/blk-zoned.c index 5e2a5788dc3b..dcc295721c2c 100644 --- a/block/blk-zoned.c +++ b/block/blk-zoned.c @@ -33,12 +33,18 @@ static const char *const zone_cond_name[] = { ZONE_COND_NAME(READONLY), ZONE_COND_NAME(FULL), ZONE_COND_NAME(OFFLINE), + ZONE_COND_NAME(ACTIVE), }; #undef ZONE_COND_NAME /* * Per-zone write plug. * @node: hlist_node structure for managing the plug using a hash table. + * @bio_list: The list of BIOs that are currently plugged. + * @bio_work: Work struct to handle issuing of plugged BIOs + * @rcu_head: RCU head to free zone write plugs with an RCU grace period. + * @disk: The gendisk the plug belongs to. + * @lock: Spinlock to atomically manipulate the plug. * @ref: Zone write plug reference counter. A zone write plug reference is * always at least 1 when the plug is hashed in the disk plug hash table. * The reference is incremented whenever a new BIO needing plugging is @@ -48,29 +54,44 @@ static const char *const zone_cond_name[] = { * reference is dropped whenever the zone of the zone write plug is reset, * finished and when the zone becomes full (last write BIO to the zone * completes). - * @lock: Spinlock to atomically manipulate the plug. * @flags: Flags indicating the plug state. * @zone_no: The number of the zone the plug is managing. * @wp_offset: The zone write pointer location relative to the start of the zone * as a number of 512B sectors. - * @bio_list: The list of BIOs that are currently plugged. - * @bio_work: Work struct to handle issuing of plugged BIOs - * @rcu_head: RCU head to free zone write plugs with an RCU grace period. - * @disk: The gendisk the plug belongs to. + * @cond: Condition of the zone */ struct blk_zone_wplug { struct hlist_node node; - refcount_t ref; - spinlock_t lock; - unsigned int flags; - unsigned int zone_no; - unsigned int wp_offset; struct bio_list bio_list; struct work_struct bio_work; struct rcu_head rcu_head; struct gendisk *disk; + spinlock_t lock; + refcount_t ref; + unsigned int flags; + unsigned int zone_no; + unsigned int wp_offset; + enum blk_zone_cond cond; }; +static inline bool disk_need_zone_resources(struct gendisk *disk) +{ + /* + * All request-based zoned devices need zone resources so that the + * block layer can automatically handle write BIO plugging. BIO-based + * device drivers (e.g. DM devices) are normally responsible for + * handling zone write ordering and do not need zone resources, unless + * the driver requires zone append emulation. 
+ */ + return queue_is_mq(disk->queue) || + queue_emulates_zone_append(disk->queue); +} + +static inline unsigned int disk_zone_wplugs_hash_size(struct gendisk *disk) +{ + return 1U << disk->zone_wplugs_hash_bits; +} + /* * Zone write plug flags bits: * - BLK_ZONE_WPLUG_PLUGGED: Indicates that the zone write plug is plugged, @@ -109,28 +130,108 @@ const char *blk_zone_cond_str(enum blk_zone_cond zone_cond) } EXPORT_SYMBOL_GPL(blk_zone_cond_str); -struct disk_report_zones_cb_args { - struct gendisk *disk; - report_zones_cb user_cb; - void *user_data; -}; +static void blk_zone_set_cond(u8 *zones_cond, unsigned int zno, + enum blk_zone_cond cond) +{ + if (!zones_cond) + return; -static void disk_zone_wplug_sync_wp_offset(struct gendisk *disk, - struct blk_zone *zone); + switch (cond) { + case BLK_ZONE_COND_IMP_OPEN: + case BLK_ZONE_COND_EXP_OPEN: + case BLK_ZONE_COND_CLOSED: + zones_cond[zno] = BLK_ZONE_COND_ACTIVE; + return; + case BLK_ZONE_COND_NOT_WP: + case BLK_ZONE_COND_EMPTY: + case BLK_ZONE_COND_FULL: + case BLK_ZONE_COND_OFFLINE: + case BLK_ZONE_COND_READONLY: + default: + zones_cond[zno] = cond; + return; + } +} -static int disk_report_zones_cb(struct blk_zone *zone, unsigned int idx, - void *data) +static void disk_zone_set_cond(struct gendisk *disk, sector_t sector, + enum blk_zone_cond cond) { - struct disk_report_zones_cb_args *args = data; - struct gendisk *disk = args->disk; + u8 *zones_cond; - if (disk->zone_wplugs_hash) - disk_zone_wplug_sync_wp_offset(disk, zone); + rcu_read_lock(); + zones_cond = rcu_dereference(disk->zones_cond); + if (zones_cond) { + unsigned int zno = disk_zone_no(disk, sector); + + /* + * The condition of a conventional, readonly and offline zones + * never changes, so do nothing if the target zone is in one of + * these conditions. + */ + switch (zones_cond[zno]) { + case BLK_ZONE_COND_NOT_WP: + case BLK_ZONE_COND_READONLY: + case BLK_ZONE_COND_OFFLINE: + break; + default: + blk_zone_set_cond(zones_cond, zno, cond); + break; + } + } + rcu_read_unlock(); +} + +/** + * bdev_zone_is_seq - check if a sector belongs to a sequential write zone + * @bdev: block device to check + * @sector: sector number + * + * Check if @sector on @bdev is contained in a sequential write required zone. + */ +bool bdev_zone_is_seq(struct block_device *bdev, sector_t sector) +{ + struct gendisk *disk = bdev->bd_disk; + unsigned int zno = disk_zone_no(disk, sector); + bool is_seq = false; + u8 *zones_cond; + + if (!bdev_is_zoned(bdev)) + return false; + + rcu_read_lock(); + zones_cond = rcu_dereference(disk->zones_cond); + if (zones_cond && zno < disk->nr_zones) + is_seq = zones_cond[zno] != BLK_ZONE_COND_NOT_WP; + rcu_read_unlock(); - if (!args->user_cb) + return is_seq; +} +EXPORT_SYMBOL_GPL(bdev_zone_is_seq); + +/* + * Zone report arguments for block device drivers report_zones operation. + * @cb: report_zones_cb callback for each reported zone. + * @data: Private data passed to report_zones_cb. 
+ */ +struct blk_report_zones_args { + report_zones_cb cb; + void *data; + bool report_active; +}; + +static int blkdev_do_report_zones(struct block_device *bdev, sector_t sector, + unsigned int nr_zones, + struct blk_report_zones_args *args) +{ + struct gendisk *disk = bdev->bd_disk; + + if (!bdev_is_zoned(bdev) || WARN_ON_ONCE(!disk->fops->report_zones)) + return -EOPNOTSUPP; + + if (!nr_zones || sector >= get_capacity(disk)) return 0; - return args->user_cb(zone, idx, args->user_data); + return disk->fops->report_zones(disk, sector, nr_zones, args); } /** @@ -155,22 +256,12 @@ static int disk_report_zones_cb(struct blk_zone *zone, unsigned int idx, int blkdev_report_zones(struct block_device *bdev, sector_t sector, unsigned int nr_zones, report_zones_cb cb, void *data) { - struct gendisk *disk = bdev->bd_disk; - sector_t capacity = get_capacity(disk); - struct disk_report_zones_cb_args args = { - .disk = disk, - .user_cb = cb, - .user_data = data, + struct blk_report_zones_args args = { + .cb = cb, + .data = data, }; - if (!bdev_is_zoned(bdev) || WARN_ON_ONCE(!disk->fops->report_zones)) - return -EOPNOTSUPP; - - if (!nr_zones || sector >= capacity) - return 0; - - return disk->fops->report_zones(disk, sector, nr_zones, - disk_report_zones_cb, &args); + return blkdev_do_report_zones(bdev, sector, nr_zones, &args); } EXPORT_SYMBOL_GPL(blkdev_report_zones); @@ -266,7 +357,12 @@ static int blkdev_copy_zone_to_user(struct blk_zone *zone, unsigned int idx, } /* - * BLKREPORTZONE ioctl processing. + * Mask of valid input flags for BLKREPORTZONEV2 ioctl. + */ +#define BLK_ZONE_REPV2_INPUT_FLAGS BLK_ZONE_REP_CACHED + +/* + * BLKREPORTZONE and BLKREPORTZONEV2 ioctl processing. * Called from blkdev_ioctl. */ int blkdev_report_zones_ioctl(struct block_device *bdev, unsigned int cmd, @@ -290,8 +386,22 @@ int blkdev_report_zones_ioctl(struct block_device *bdev, unsigned int cmd, return -EINVAL; args.zones = argp + sizeof(struct blk_zone_report); - ret = blkdev_report_zones(bdev, rep.sector, rep.nr_zones, - blkdev_copy_zone_to_user, &args); + + switch (cmd) { + case BLKREPORTZONE: + ret = blkdev_report_zones(bdev, rep.sector, rep.nr_zones, + blkdev_copy_zone_to_user, &args); + break; + case BLKREPORTZONEV2: + if (rep.flags & ~BLK_ZONE_REPV2_INPUT_FLAGS) + return -EINVAL; + ret = blkdev_report_zones_cached(bdev, rep.sector, rep.nr_zones, + blkdev_copy_zone_to_user, &args); + break; + default: + return -EINVAL; + } + if (ret < 0) return ret; @@ -401,6 +511,7 @@ static bool disk_insert_zone_wplug(struct gendisk *disk, { struct blk_zone_wplug *zwplg; unsigned long flags; + u8 *zones_cond; unsigned int idx = hash_32(zwplug->zone_no, disk->zone_wplugs_hash_bits); @@ -416,6 +527,20 @@ static bool disk_insert_zone_wplug(struct gendisk *disk, return false; } } + + /* + * Set the zone condition: if we do not yet have a zones_cond array + * attached to the disk, then this is a zone write plug insert from the + * first call to blk_revalidate_disk_zones(), in which case the zone is + * necessarilly in the active condition. 
+ */ + zones_cond = rcu_dereference_check(disk->zones_cond, + lockdep_is_held(&disk->zone_wplugs_lock)); + if (zones_cond) + zwplug->cond = zones_cond[zwplug->zone_no]; + else + zwplug->cond = BLK_ZONE_COND_ACTIVE; + hlist_add_head_rcu(&zwplug->node, &disk->zone_wplugs_hash[idx]); atomic_inc(&disk->nr_zone_wplugs); spin_unlock_irqrestore(&disk->zone_wplugs_lock, flags); @@ -515,10 +640,15 @@ static void disk_remove_zone_wplug(struct gendisk *disk, /* * Mark the zone write plug as unhashed and drop the extra reference we - * took when the plug was inserted in the hash table. + * took when the plug was inserted in the hash table. Also update the + * disk zone condition array with the current condition of the zone + * write plug. */ zwplug->flags |= BLK_ZONE_WPLUG_UNHASHED; spin_lock_irqsave(&disk->zone_wplugs_lock, flags); + blk_zone_set_cond(rcu_dereference_check(disk->zones_cond, + lockdep_is_held(&disk->zone_wplugs_lock)), + zwplug->zone_no, zwplug->cond); hlist_del_init_rcu(&zwplug->node); atomic_dec(&disk->nr_zone_wplugs); spin_unlock_irqrestore(&disk->zone_wplugs_lock, flags); @@ -600,7 +730,7 @@ static inline void blk_zone_wplug_bio_io_error(struct blk_zone_wplug *zwplug, bio_clear_flag(bio, BIO_ZONE_WRITE_PLUGGING); bio_io_error(bio); disk_put_zone_wplug(zwplug); - /* Drop the reference taken by disk_zone_wplug_add_bio(() */ + /* Drop the reference taken by disk_zone_wplug_add_bio(). */ blk_queue_exit(q); } @@ -621,6 +751,22 @@ static void disk_zone_wplug_abort(struct blk_zone_wplug *zwplug) } /* + * Update a zone write plug condition based on the write pointer offset. + */ +static void disk_zone_wplug_update_cond(struct gendisk *disk, + struct blk_zone_wplug *zwplug) +{ + lockdep_assert_held(&zwplug->lock); + + if (disk_zone_wplug_is_full(disk, zwplug)) + zwplug->cond = BLK_ZONE_COND_FULL; + else if (!zwplug->wp_offset) + zwplug->cond = BLK_ZONE_COND_EMPTY; + else + zwplug->cond = BLK_ZONE_COND_ACTIVE; +} + +/* * Set a zone write plug write pointer offset to the specified value. * This aborts all plugged BIOs, which is fine as this function is called for * a zone reset operation, a zone finish operation or if the zone needs a wp @@ -635,6 +781,8 @@ static void disk_zone_wplug_set_wp_offset(struct gendisk *disk, /* Update the zone write pointer and abort all plugged BIOs. */ zwplug->flags &= ~BLK_ZONE_WPLUG_NEED_WP_UPDATE; zwplug->wp_offset = wp_offset; + disk_zone_wplug_update_cond(disk, zwplug); + disk_zone_wplug_abort(zwplug); /* @@ -652,122 +800,399 @@ static unsigned int blk_zone_wp_offset(struct blk_zone *zone) case BLK_ZONE_COND_IMP_OPEN: case BLK_ZONE_COND_EXP_OPEN: case BLK_ZONE_COND_CLOSED: + case BLK_ZONE_COND_ACTIVE: return zone->wp - zone->start; - case BLK_ZONE_COND_FULL: - return zone->len; case BLK_ZONE_COND_EMPTY: return 0; + case BLK_ZONE_COND_FULL: case BLK_ZONE_COND_NOT_WP: case BLK_ZONE_COND_OFFLINE: case BLK_ZONE_COND_READONLY: default: /* - * Conventional, offline and read-only zones do not have a valid - * write pointer. + * Conventional, full, offline and read-only zones do not have + * a valid write pointer. 
*/ return UINT_MAX; } } -static void disk_zone_wplug_sync_wp_offset(struct gendisk *disk, - struct blk_zone *zone) +static unsigned int disk_zone_wplug_sync_wp_offset(struct gendisk *disk, + struct blk_zone *zone) { struct blk_zone_wplug *zwplug; - unsigned long flags; + unsigned int wp_offset = blk_zone_wp_offset(zone); zwplug = disk_get_zone_wplug(disk, zone->start); - if (!zwplug) - return; + if (zwplug) { + unsigned long flags; - spin_lock_irqsave(&zwplug->lock, flags); - if (zwplug->flags & BLK_ZONE_WPLUG_NEED_WP_UPDATE) - disk_zone_wplug_set_wp_offset(disk, zwplug, - blk_zone_wp_offset(zone)); - spin_unlock_irqrestore(&zwplug->lock, flags); + spin_lock_irqsave(&zwplug->lock, flags); + if (zwplug->flags & BLK_ZONE_WPLUG_NEED_WP_UPDATE) + disk_zone_wplug_set_wp_offset(disk, zwplug, wp_offset); + spin_unlock_irqrestore(&zwplug->lock, flags); + disk_put_zone_wplug(zwplug); + } - disk_put_zone_wplug(zwplug); + return wp_offset; } -static int disk_zone_sync_wp_offset(struct gendisk *disk, sector_t sector) +/** + * disk_report_zone - Report one zone + * @disk: Target disk + * @zone: The zone to report + * @idx: The index of the zone in the overall zone report + * @args: report zones callback and data + * + * Description: + * Helper function for block device drivers to report one zone of a zone + * report initiated with blkdev_report_zones(). The zone being reported is + * specified by @zone and used to update, if necessary, the zone write plug + * information for the zone. If @args specifies a user callback function, + * this callback is executed. + */ +int disk_report_zone(struct gendisk *disk, struct blk_zone *zone, + unsigned int idx, struct blk_report_zones_args *args) { - struct disk_report_zones_cb_args args = { - .disk = disk, + if (args && args->report_active) { + /* + * If we come here, then this is a report zones as a fallback + * for a cached report. So collapse the implicit open, explicit + * open and closed conditions into the active zone condition. + */ + switch (zone->cond) { + case BLK_ZONE_COND_IMP_OPEN: + case BLK_ZONE_COND_EXP_OPEN: + case BLK_ZONE_COND_CLOSED: + zone->cond = BLK_ZONE_COND_ACTIVE; + break; + default: + break; + } + } + + if (disk->zone_wplugs_hash) + disk_zone_wplug_sync_wp_offset(disk, zone); + + if (args && args->cb) + return args->cb(zone, idx, args->data); + + return 0; +} +EXPORT_SYMBOL_GPL(disk_report_zone); + +static int blkdev_report_zone_cb(struct blk_zone *zone, unsigned int idx, + void *data) +{ + memcpy(data, zone, sizeof(struct blk_zone)); + return 0; +} + +static int blkdev_report_zone_fallback(struct block_device *bdev, + sector_t sector, struct blk_zone *zone) +{ + struct blk_report_zones_args args = { + .cb = blkdev_report_zone_cb, + .data = zone, + .report_active = true, }; + int error; + + error = blkdev_do_report_zones(bdev, sector, 1, &args); + if (error < 0) + return error; + if (error == 0) + return -EIO; + return 0; +} - return disk->fops->report_zones(disk, sector, 1, - disk_report_zones_cb, &args); +/* + * For devices that natively support zone append operations, we do not use zone + * write plugging for zone append writes, which makes the zone condition + * tracking invalid once zone append was used. In that case fall back to a + * regular report zones to get correct information. 
+ */ +static inline bool blkdev_has_cached_report_zones(struct block_device *bdev) +{ + return disk_need_zone_resources(bdev->bd_disk) && + (bdev_emulates_zone_append(bdev) || + !test_bit(GD_ZONE_APPEND_USED, &bdev->bd_disk->state)); } -static bool blk_zone_wplug_handle_reset_or_finish(struct bio *bio, - unsigned int wp_offset) +/** + * blkdev_get_zone_info - Get a single zone's information from cached data + * @bdev: Target block device + * @sector: Sector contained by the target zone + * @zone: zone structure to return the zone information + * + * Description: + * Get the zone information for the zone containing @sector using the zone + * write plug of the target zone, if one exists, or the disk zone condition + * array otherwise. The zone condition may be reported as being + * the BLK_ZONE_COND_ACTIVE condition for a zone that is in the implicit + * open, explicit open or closed condition. + * + * Returns 0 on success and a negative error code on failure. + */ +int blkdev_get_zone_info(struct block_device *bdev, sector_t sector, + struct blk_zone *zone) { - struct gendisk *disk = bio->bi_bdev->bd_disk; - sector_t sector = bio->bi_iter.bi_sector; + struct gendisk *disk = bdev->bd_disk; + sector_t zone_sectors = bdev_zone_sectors(bdev); struct blk_zone_wplug *zwplug; unsigned long flags; + u8 *zones_cond; - /* Conventional zones cannot be reset nor finished. */ - if (!bdev_zone_is_seq(bio->bi_bdev, sector)) { - bio_io_error(bio); - return true; + if (!bdev_is_zoned(bdev)) + return -EOPNOTSUPP; + + if (sector >= get_capacity(disk)) + return -EINVAL; + + memset(zone, 0, sizeof(*zone)); + sector = bdev_zone_start(bdev, sector); + + if (!blkdev_has_cached_report_zones(bdev)) + return blkdev_report_zone_fallback(bdev, sector, zone); + + rcu_read_lock(); + zones_cond = rcu_dereference(disk->zones_cond); + if (!disk->zone_wplugs_hash || !zones_cond) { + rcu_read_unlock(); + return blkdev_report_zone_fallback(bdev, sector, zone); } + zone->cond = zones_cond[disk_zone_no(disk, sector)]; + rcu_read_unlock(); + + zone->start = sector; + zone->len = zone_sectors; /* - * No-wait reset or finish BIOs do not make much sense as the callers - * issue these as blocking operations in most cases. To avoid issues - * the BIO execution potentially failing with BLK_STS_AGAIN, warn about - * REQ_NOWAIT being set and ignore that flag. + * If this is a conventional zone, we do not have a zone write plug and + * can report the zone immediately. */ - if (WARN_ON_ONCE(bio->bi_opf & REQ_NOWAIT)) - bio->bi_opf &= ~REQ_NOWAIT; + if (zone->cond == BLK_ZONE_COND_NOT_WP) { + zone->type = BLK_ZONE_TYPE_CONVENTIONAL; + zone->capacity = zone_sectors; + zone->wp = ULLONG_MAX; + return 0; + } + + /* + * This is a sequential write required zone. If the zone is read-only or + * offline, only set the zone write pointer to an invalid value and + * report the zone. + */ + zone->type = BLK_ZONE_TYPE_SEQWRITE_REQ; + if (disk_zone_is_last(disk, zone)) + zone->capacity = disk->last_zone_capacity; + else + zone->capacity = disk->zone_capacity; + + if (zone->cond == BLK_ZONE_COND_READONLY || + zone->cond == BLK_ZONE_COND_OFFLINE) { + zone->wp = ULLONG_MAX; + return 0; + } /* - * If we have a zone write plug, set its write pointer offset to 0 - * (reset case) or to the zone size (finish case). This will abort all - * BIOs plugged for the target zone.
It is fine as resetting or - * finishing zones while writes are still in-flight will result in the + * If the zone does not have a zone write plug, it is either full or + * empty, as we otherwise would have a zone write plug for it. In this + * case, set the write pointer accordingly and report the zone. + * Otherwise, if we have a zone write plug, use it. + */ + zwplug = disk_get_zone_wplug(disk, sector); + if (!zwplug) { + if (zone->cond == BLK_ZONE_COND_FULL) + zone->wp = ULLONG_MAX; + else + zone->wp = sector; + return 0; + } + + spin_lock_irqsave(&zwplug->lock, flags); + if (zwplug->flags & BLK_ZONE_WPLUG_NEED_WP_UPDATE) { + spin_unlock_irqrestore(&zwplug->lock, flags); + disk_put_zone_wplug(zwplug); + return blkdev_report_zone_fallback(bdev, sector, zone); + } + zone->cond = zwplug->cond; + zone->wp = sector + zwplug->wp_offset; + spin_unlock_irqrestore(&zwplug->lock, flags); + + disk_put_zone_wplug(zwplug); + + return 0; +} +EXPORT_SYMBOL_GPL(blkdev_get_zone_info); + +/** + * blkdev_report_zones_cached - Get cached zones information + * @bdev: Target block device + * @sector: Sector from which to report zones + * @nr_zones: Maximum number of zones to report + * @cb: Callback function called for each reported zone + * @data: Private data for the callback function + * + * Description: + * Similar to blkdev_report_zones() but instead of calling into the low level + * device driver to get the zone report from the device, use + * blkdev_get_zone_info() to generate the report from the disk zone write + * plugs and zones condition array. Since calling this function without a + * callback does not make sense, @cb must be specified. + */ +int blkdev_report_zones_cached(struct block_device *bdev, sector_t sector, + unsigned int nr_zones, report_zones_cb cb, void *data) +{ + struct gendisk *disk = bdev->bd_disk; + sector_t capacity = get_capacity(disk); + sector_t zone_sectors = bdev_zone_sectors(bdev); + unsigned int idx = 0; + struct blk_zone zone; + int ret; + + if (!cb || !bdev_is_zoned(bdev) || + WARN_ON_ONCE(!disk->fops->report_zones)) + return -EOPNOTSUPP; + + if (!nr_zones || sector >= capacity) + return 0; + + if (!blkdev_has_cached_report_zones(bdev)) { + struct blk_report_zones_args args = { + .cb = cb, + .data = data, + .report_active = true, + }; + + return blkdev_do_report_zones(bdev, sector, nr_zones, &args); + } + + for (sector = bdev_zone_start(bdev, sector); + sector < capacity && idx < nr_zones; + sector += zone_sectors, idx++) { + ret = blkdev_get_zone_info(bdev, sector, &zone); + if (ret) + return ret; + + ret = cb(&zone, idx, data); + if (ret) + return ret; + } + + return idx; +} +EXPORT_SYMBOL_GPL(blkdev_report_zones_cached); + +static void blk_zone_reset_bio_endio(struct bio *bio) +{ + struct gendisk *disk = bio->bi_bdev->bd_disk; + sector_t sector = bio->bi_iter.bi_sector; + struct blk_zone_wplug *zwplug; + + /* + * If we have a zone write plug, set its write pointer offset to 0. + * This will abort all BIOs plugged for the target zone. It is fine as + * resetting zones while writes are still in-flight will result in the * writes failing anyway. 
*/ zwplug = disk_get_zone_wplug(disk, sector); if (zwplug) { + unsigned long flags; + spin_lock_irqsave(&zwplug->lock, flags); - disk_zone_wplug_set_wp_offset(disk, zwplug, wp_offset); + disk_zone_wplug_set_wp_offset(disk, zwplug, 0); spin_unlock_irqrestore(&zwplug->lock, flags); disk_put_zone_wplug(zwplug); + } else { + disk_zone_set_cond(disk, sector, BLK_ZONE_COND_EMPTY); } - - return false; } -static bool blk_zone_wplug_handle_reset_all(struct bio *bio) +static void blk_zone_reset_all_bio_endio(struct bio *bio) { struct gendisk *disk = bio->bi_bdev->bd_disk; + sector_t capacity = get_capacity(disk); struct blk_zone_wplug *zwplug; unsigned long flags; sector_t sector; + unsigned int i; + + if (atomic_read(&disk->nr_zone_wplugs)) { + /* Update the condition of all zone write plugs. */ + rcu_read_lock(); + for (i = 0; i < disk_zone_wplugs_hash_size(disk); i++) { + hlist_for_each_entry_rcu(zwplug, + &disk->zone_wplugs_hash[i], + node) { + spin_lock_irqsave(&zwplug->lock, flags); + disk_zone_wplug_set_wp_offset(disk, zwplug, 0); + spin_unlock_irqrestore(&zwplug->lock, flags); + } + } + rcu_read_unlock(); + } + + /* Update the cached zone conditions. */ + for (sector = 0; sector < capacity; + sector += bdev_zone_sectors(bio->bi_bdev)) + disk_zone_set_cond(disk, sector, BLK_ZONE_COND_EMPTY); + clear_bit(GD_ZONE_APPEND_USED, &disk->state); +} + +static void blk_zone_finish_bio_endio(struct bio *bio) +{ + struct block_device *bdev = bio->bi_bdev; + struct gendisk *disk = bdev->bd_disk; + sector_t sector = bio->bi_iter.bi_sector; + struct blk_zone_wplug *zwplug; /* - * Set the write pointer offset of all zone write plugs to 0. This will - * abort all plugged BIOs. It is fine as resetting zones while writes - * are still in-flight will result in the writes failing anyway. + * If we have a zone write plug, set its write pointer offset to the + * zone size. This will abort all BIOs plugged for the target zone. It + * is fine as resetting zones while writes are still in-flight will + * result in the writes failing anyway. */ - for (sector = 0; sector < get_capacity(disk); - sector += disk->queue->limits.chunk_sectors) { - zwplug = disk_get_zone_wplug(disk, sector); - if (zwplug) { - spin_lock_irqsave(&zwplug->lock, flags); - disk_zone_wplug_set_wp_offset(disk, zwplug, 0); - spin_unlock_irqrestore(&zwplug->lock, flags); - disk_put_zone_wplug(zwplug); - } + zwplug = disk_get_zone_wplug(disk, sector); + if (zwplug) { + unsigned long flags; + + spin_lock_irqsave(&zwplug->lock, flags); + disk_zone_wplug_set_wp_offset(disk, zwplug, + bdev_zone_sectors(bdev)); + spin_unlock_irqrestore(&zwplug->lock, flags); + disk_put_zone_wplug(zwplug); + } else { + disk_zone_set_cond(disk, sector, BLK_ZONE_COND_FULL); } +} - return false; +void blk_zone_mgmt_bio_endio(struct bio *bio) +{ + /* If the BIO failed, we have nothing to do. */ + if (bio->bi_status != BLK_STS_OK) + return; + + switch (bio_op(bio)) { + case REQ_OP_ZONE_RESET: + blk_zone_reset_bio_endio(bio); + return; + case REQ_OP_ZONE_RESET_ALL: + blk_zone_reset_all_bio_endio(bio); + return; + case REQ_OP_ZONE_FINISH: + blk_zone_finish_bio_endio(bio); + return; + default: + return; + } } static void disk_zone_wplug_schedule_bio_work(struct gendisk *disk, struct blk_zone_wplug *zwplug) { + lockdep_assert_held(&zwplug->lock); + /* * Take a reference on the zone write plug and schedule the submission * of the next plugged BIO. 
blk_zone_wplug_bio_work() will release the @@ -782,8 +1207,6 @@ static inline void disk_zone_wplug_add_bio(struct gendisk *disk, struct blk_zone_wplug *zwplug, struct bio *bio, unsigned int nr_segs) { - bool schedule_bio_work = false; - /* * Grab an extra reference on the BIO request queue usage counter. * This reference will be reused to submit a request for the BIO for @@ -800,16 +1223,6 @@ static inline void disk_zone_wplug_add_bio(struct gendisk *disk, bio_clear_polled(bio); /* - * REQ_NOWAIT BIOs are always handled using the zone write plug BIO - * work, which can block. So clear the REQ_NOWAIT flag and schedule the - * work if this is the first BIO we are plugging. - */ - if (bio->bi_opf & REQ_NOWAIT) { - schedule_bio_work = !(zwplug->flags & BLK_ZONE_WPLUG_PLUGGED); - bio->bi_opf &= ~REQ_NOWAIT; - } - - /* * Reuse the poll cookie field to store the number of segments when * split to the hardware limits. */ @@ -824,11 +1237,6 @@ static inline void disk_zone_wplug_add_bio(struct gendisk *disk, bio_list_add(&zwplug->bio_list, bio); trace_disk_zone_wplug_add_bio(zwplug->disk->queue, zwplug->zone_no, bio->bi_iter.bi_sector, bio_sectors(bio)); - - zwplug->flags |= BLK_ZONE_WPLUG_PLUGGED; - - if (schedule_bio_work) - disk_zone_wplug_schedule_bio_work(disk, zwplug); } /* @@ -836,6 +1244,7 @@ static inline void disk_zone_wplug_add_bio(struct gendisk *disk, */ void blk_zone_write_plug_bio_merged(struct bio *bio) { + struct gendisk *disk = bio->bi_bdev->bd_disk; struct blk_zone_wplug *zwplug; unsigned long flags; @@ -857,13 +1266,13 @@ void blk_zone_write_plug_bio_merged(struct bio *bio) * have at least one request and one BIO referencing the zone write * plug. So this should not fail. */ - zwplug = disk_get_zone_wplug(bio->bi_bdev->bd_disk, - bio->bi_iter.bi_sector); + zwplug = disk_get_zone_wplug(disk, bio->bi_iter.bi_sector); if (WARN_ON_ONCE(!zwplug)) return; spin_lock_irqsave(&zwplug->lock, flags); zwplug->wp_offset += bio_sectors(bio); + disk_zone_wplug_update_cond(disk, zwplug); spin_unlock_irqrestore(&zwplug->lock, flags); } @@ -922,6 +1331,7 @@ void blk_zone_write_plug_init_request(struct request *req) /* Drop the reference taken by disk_zone_wplug_add_bio(). */ blk_queue_exit(q); zwplug->wp_offset += bio_sectors(bio); + disk_zone_wplug_update_cond(disk, zwplug); req_back_sector += bio_sectors(bio); } @@ -985,6 +1395,7 @@ static bool blk_zone_wplug_prepare_bio(struct blk_zone_wplug *zwplug, /* Advance the zone write pointer offset. */ zwplug->wp_offset += bio_sectors(bio); + disk_zone_wplug_update_cond(disk, zwplug); return true; } @@ -1036,14 +1447,17 @@ static bool blk_zone_wplug_handle_write(struct bio *bio, unsigned int nr_segs) bio_set_flag(bio, BIO_ZONE_WRITE_PLUGGING); /* - * If the zone is already plugged, add the BIO to the plug BIO list. - * Do the same for REQ_NOWAIT BIOs to ensure that we will not see a - * BLK_STS_AGAIN failure if we let the BIO execute. - * Otherwise, plug and let the BIO execute. + * Add REQ_NOWAIT BIOs to the plug list to ensure that we will not see a + * BLK_STS_AGAIN failure if we let the caller submit the BIO. */ - if ((zwplug->flags & BLK_ZONE_WPLUG_PLUGGED) || - (bio->bi_opf & REQ_NOWAIT)) - goto plug; + if (bio->bi_opf & REQ_NOWAIT) { + bio->bi_opf &= ~REQ_NOWAIT; + goto queue_bio; + } + + /* If the zone is already plugged, add the BIO to the BIO plug list. 
*/ + if (zwplug->flags & BLK_ZONE_WPLUG_PLUGGED) + goto queue_bio; if (!blk_zone_wplug_prepare_bio(zwplug, bio)) { spin_unlock_irqrestore(&zwplug->lock, flags); @@ -1051,15 +1465,21 @@ static bool blk_zone_wplug_handle_write(struct bio *bio, unsigned int nr_segs) return true; } + /* Otherwise, plug and let the caller submit the BIO. */ zwplug->flags |= BLK_ZONE_WPLUG_PLUGGED; spin_unlock_irqrestore(&zwplug->lock, flags); return false; -plug: +queue_bio: disk_zone_wplug_add_bio(disk, zwplug, bio, nr_segs); + if (!(zwplug->flags & BLK_ZONE_WPLUG_PLUGGED)) { + zwplug->flags |= BLK_ZONE_WPLUG_PLUGGED; + disk_zone_wplug_schedule_bio_work(disk, zwplug); + } + spin_unlock_irqrestore(&zwplug->lock, flags); return true; @@ -1071,6 +1491,9 @@ static void blk_zone_wplug_handle_native_zone_append(struct bio *bio) struct blk_zone_wplug *zwplug; unsigned long flags; + if (!test_bit(GD_ZONE_APPEND_USED, &disk->state)) + set_bit(GD_ZONE_APPEND_USED, &disk->state); + /* * We have native support for zone append operations, so we are not * going to handle @bio through plugging. However, we may already have a @@ -1106,6 +1529,30 @@ static void blk_zone_wplug_handle_native_zone_append(struct bio *bio) disk_put_zone_wplug(zwplug); } +static bool blk_zone_wplug_handle_zone_mgmt(struct bio *bio) +{ + if (bio_op(bio) != REQ_OP_ZONE_RESET_ALL && + !bdev_zone_is_seq(bio->bi_bdev, bio->bi_iter.bi_sector)) { + /* + * Zone reset and zone finish operations do not apply to + * conventional zones. + */ + bio_io_error(bio); + return true; + } + + /* + * No-wait zone management BIOs do not make much sense as the callers + * issue these as blocking operations in most cases. To avoid issues + * with the BIO execution potentially failing with BLK_STS_AGAIN, warn + * about REQ_NOWAIT being set and ignore that flag. + */ + if (WARN_ON_ONCE(bio->bi_opf & REQ_NOWAIT)) + bio->bi_opf &= ~REQ_NOWAIT; + + return false; +} + /** * blk_zone_plug_bio - Handle a zone write BIO with zone write plugging * @bio: The BIO being submitted @@ -1153,12 +1600,9 @@ bool blk_zone_plug_bio(struct bio *bio, unsigned int nr_segs) case REQ_OP_WRITE_ZEROES: return blk_zone_wplug_handle_write(bio, nr_segs); case REQ_OP_ZONE_RESET: - return blk_zone_wplug_handle_reset_or_finish(bio, 0); case REQ_OP_ZONE_FINISH: - return blk_zone_wplug_handle_reset_or_finish(bio, - bdev_zone_sectors(bdev)); case REQ_OP_ZONE_RESET_ALL: - return blk_zone_wplug_handle_reset_all(bio); + return blk_zone_wplug_handle_zone_mgmt(bio); default: return false; } @@ -1332,11 +1776,6 @@ put_zwplug: disk_put_zone_wplug(zwplug); } -static inline unsigned int disk_zone_wplugs_hash_size(struct gendisk *disk) -{ - return 1U << disk->zone_wplugs_hash_bits; -} - void disk_init_zone_resources(struct gendisk *disk) { spin_lock_init(&disk->zone_wplugs_lock); @@ -1415,31 +1854,30 @@ static void disk_destroy_zone_wplugs_hash_table(struct gendisk *disk) kfree(disk->zone_wplugs_hash); disk->zone_wplugs_hash = NULL; disk->zone_wplugs_hash_bits = 0; + + /* + * Wait for the zone write plugs to be RCU-freed before destroying the + * mempool. 
+ */ + rcu_barrier(); + mempool_destroy(disk->zone_wplugs_pool); + disk->zone_wplugs_pool = NULL; } -static unsigned int disk_set_conv_zones_bitmap(struct gendisk *disk, - unsigned long *bitmap) +static void disk_set_zones_cond_array(struct gendisk *disk, u8 *zones_cond) { - unsigned int nr_conv_zones = 0; unsigned long flags; spin_lock_irqsave(&disk->zone_wplugs_lock, flags); - if (bitmap) - nr_conv_zones = bitmap_weight(bitmap, disk->nr_zones); - bitmap = rcu_replace_pointer(disk->conv_zones_bitmap, bitmap, - lockdep_is_held(&disk->zone_wplugs_lock)); + zones_cond = rcu_replace_pointer(disk->zones_cond, zones_cond, + lockdep_is_held(&disk->zone_wplugs_lock)); spin_unlock_irqrestore(&disk->zone_wplugs_lock, flags); - kfree_rcu_mightsleep(bitmap); - - return nr_conv_zones; + kfree_rcu_mightsleep(zones_cond); } void disk_free_zone_resources(struct gendisk *disk) { - if (!disk->zone_wplugs_pool) - return; - if (disk->zone_wplugs_wq) { destroy_workqueue(disk->zone_wplugs_wq); disk->zone_wplugs_wq = NULL; @@ -1447,40 +1885,37 @@ void disk_free_zone_resources(struct gendisk *disk) disk_destroy_zone_wplugs_hash_table(disk); - /* - * Wait for the zone write plugs to be RCU-freed before - * destorying the mempool. - */ - rcu_barrier(); - - mempool_destroy(disk->zone_wplugs_pool); - disk->zone_wplugs_pool = NULL; - - disk_set_conv_zones_bitmap(disk, NULL); + disk_set_zones_cond_array(disk, NULL); disk->zone_capacity = 0; disk->last_zone_capacity = 0; disk->nr_zones = 0; } -static inline bool disk_need_zone_resources(struct gendisk *disk) -{ - /* - * All mq zoned devices need zone resources so that the block layer - * can automatically handle write BIO plugging. BIO-based device drivers - * (e.g. DM devices) are normally responsible for handling zone write - * ordering and do not need zone resources, unless the driver requires - * zone append emulation. - */ - return queue_is_mq(disk->queue) || - queue_emulates_zone_append(disk->queue); -} +struct blk_revalidate_zone_args { + struct gendisk *disk; + u8 *zones_cond; + unsigned int nr_zones; + unsigned int nr_conv_zones; + unsigned int zone_capacity; + unsigned int last_zone_capacity; + sector_t sector; +}; static int disk_revalidate_zone_resources(struct gendisk *disk, - unsigned int nr_zones) + struct blk_revalidate_zone_args *args) { struct queue_limits *lim = &disk->queue->limits; unsigned int pool_size; + args->disk = disk; + args->nr_zones = + DIV_ROUND_UP_ULL(get_capacity(disk), lim->chunk_sectors); + + /* Cached zone conditions: 1 byte per zone */ + args->zones_cond = kzalloc(args->nr_zones, GFP_NOIO); + if (!args->zones_cond) + return -ENOMEM; + if (!disk_need_zone_resources(disk)) return 0; @@ -1490,7 +1925,8 @@ static int disk_revalidate_zone_resources(struct gendisk *disk, */ pool_size = max(lim->max_open_zones, lim->max_active_zones); if (!pool_size) - pool_size = min(BLK_ZONE_WPLUG_DEFAULT_POOL_SIZE, nr_zones); + pool_size = + min(BLK_ZONE_WPLUG_DEFAULT_POOL_SIZE, args->nr_zones); if (!disk->zone_wplugs_hash) return disk_alloc_zone_resources(disk, pool_size); @@ -1498,15 +1934,6 @@ static int disk_revalidate_zone_resources(struct gendisk *disk, return 0; } -struct blk_revalidate_zone_args { - struct gendisk *disk; - unsigned long *conv_zones_bitmap; - unsigned int nr_zones; - unsigned int zone_capacity; - unsigned int last_zone_capacity; - sector_t sector; -}; - /* * Update the disk zone resources information and device queue limits. * The disk queue is frozen when this is executed. 
@@ -1515,30 +1942,34 @@ static int disk_update_zone_resources(struct gendisk *disk, struct blk_revalidate_zone_args *args) { struct request_queue *q = disk->queue; - unsigned int nr_seq_zones, nr_conv_zones; - unsigned int pool_size; + unsigned int nr_seq_zones; + unsigned int pool_size, memflags; struct queue_limits lim; + int ret = 0; + + lim = queue_limits_start_update(q); + + memflags = blk_mq_freeze_queue(q); disk->nr_zones = args->nr_zones; - disk->zone_capacity = args->zone_capacity; - disk->last_zone_capacity = args->last_zone_capacity; - nr_conv_zones = - disk_set_conv_zones_bitmap(disk, args->conv_zones_bitmap); - if (nr_conv_zones >= disk->nr_zones) { + if (args->nr_conv_zones >= disk->nr_zones) { pr_warn("%s: Invalid number of conventional zones %u / %u\n", - disk->disk_name, nr_conv_zones, disk->nr_zones); - return -ENODEV; + disk->disk_name, args->nr_conv_zones, disk->nr_zones); + ret = -ENODEV; + goto unfreeze; } - lim = queue_limits_start_update(q); + disk->zone_capacity = args->zone_capacity; + disk->last_zone_capacity = args->last_zone_capacity; + disk_set_zones_cond_array(disk, args->zones_cond); /* - * Some devices can advertize zone resource limits that are larger than + * Some devices can advertise zone resource limits that are larger than * the number of sequential zones of the zoned block device, e.g. a * small ZNS namespace. For such case, assume that the zoned device has * no zone resource limits. */ - nr_seq_zones = disk->nr_zones - nr_conv_zones; + nr_seq_zones = disk->nr_zones - args->nr_conv_zones; if (lim.max_open_zones >= nr_seq_zones) lim.max_open_zones = 0; if (lim.max_active_zones >= nr_seq_zones) @@ -1568,7 +1999,53 @@ static int disk_update_zone_resources(struct gendisk *disk, } commit: - return queue_limits_commit_update_frozen(q, &lim); + ret = queue_limits_commit_update(q, &lim); + +unfreeze: + if (ret) + disk_free_zone_resources(disk); + + blk_mq_unfreeze_queue(q, memflags); + + return ret; +} + +static int blk_revalidate_zone_cond(struct blk_zone *zone, unsigned int idx, + struct blk_revalidate_zone_args *args) +{ + enum blk_zone_cond cond = zone->cond; + + /* Check that the zone condition is consistent with the zone type. 
*/ + switch (cond) { + case BLK_ZONE_COND_NOT_WP: + if (zone->type != BLK_ZONE_TYPE_CONVENTIONAL) + goto invalid_condition; + break; + case BLK_ZONE_COND_IMP_OPEN: + case BLK_ZONE_COND_EXP_OPEN: + case BLK_ZONE_COND_CLOSED: + case BLK_ZONE_COND_EMPTY: + case BLK_ZONE_COND_FULL: + case BLK_ZONE_COND_OFFLINE: + case BLK_ZONE_COND_READONLY: + if (zone->type != BLK_ZONE_TYPE_SEQWRITE_REQ) + goto invalid_condition; + break; + default: + pr_warn("%s: Invalid zone condition 0x%X\n", + args->disk->disk_name, cond); + return -ENODEV; + } + + blk_zone_set_cond(args->zones_cond, idx, cond); + + return 0; + +invalid_condition: + pr_warn("%s: Invalid zone condition 0x%x for type 0x%x\n", + args->disk->disk_name, cond, zone->type); + + return -ENODEV; } static int blk_revalidate_conv_zone(struct blk_zone *zone, unsigned int idx, @@ -1585,17 +2062,7 @@ static int blk_revalidate_conv_zone(struct blk_zone *zone, unsigned int idx, if (disk_zone_is_last(disk, zone)) args->last_zone_capacity = zone->capacity; - if (!disk_need_zone_resources(disk)) - return 0; - - if (!args->conv_zones_bitmap) { - args->conv_zones_bitmap = - bitmap_zalloc(args->nr_zones, GFP_NOIO); - if (!args->conv_zones_bitmap) - return -ENOMEM; - } - - set_bit(idx, args->conv_zones_bitmap); + args->nr_conv_zones++; return 0; } @@ -1632,9 +2099,7 @@ static int blk_revalidate_seq_zone(struct blk_zone *zone, unsigned int idx, if (!queue_emulates_zone_append(disk->queue) || !disk->zone_wplugs_hash) return 0; - disk_zone_wplug_sync_wp_offset(disk, zone); - - wp_offset = blk_zone_wp_offset(zone); + wp_offset = disk_zone_wplug_sync_wp_offset(disk, zone); if (!wp_offset || wp_offset >= zone->capacity) return 0; @@ -1693,6 +2158,11 @@ static int blk_revalidate_zone_cb(struct blk_zone *zone, unsigned int idx, return -ENODEV; } + /* Check zone condition */ + ret = blk_revalidate_zone_cond(zone, idx, args); + if (ret) + return ret; + /* Check zone type */ switch (zone->type) { case BLK_ZONE_TYPE_CONVENTIONAL: @@ -1733,7 +2203,11 @@ int blk_revalidate_disk_zones(struct gendisk *disk) sector_t zone_sectors = q->limits.chunk_sectors; sector_t capacity = get_capacity(disk); struct blk_revalidate_zone_args args = { }; - unsigned int noio_flag; + unsigned int memflags, noio_flag; + struct blk_report_zones_args rep_args = { + .cb = blk_revalidate_zone_cb, + .data = &args, + }; int ret = -ENOMEM; if (WARN_ON_ONCE(!blk_queue_is_zoned(q))) @@ -1756,17 +2230,14 @@ int blk_revalidate_disk_zones(struct gendisk *disk) * Ensure that all memory allocations in this context are done as if * GFP_NOIO was specified. */ - args.disk = disk; - args.nr_zones = (capacity + zone_sectors - 1) >> ilog2(zone_sectors); noio_flag = memalloc_noio_save(); - ret = disk_revalidate_zone_resources(disk, args.nr_zones); + ret = disk_revalidate_zone_resources(disk, &args); if (ret) { memalloc_noio_restore(noio_flag); return ret; } - ret = disk->fops->report_zones(disk, 0, UINT_MAX, - blk_revalidate_zone_cb, &args); + ret = disk->fops->report_zones(disk, 0, UINT_MAX, &rep_args); if (!ret) { pr_warn("%s: No zones reported\n", disk->disk_name); ret = -ENODEV; @@ -1783,20 +2254,14 @@ int blk_revalidate_disk_zones(struct gendisk *disk) ret = -ENODEV; } - /* - * Set the new disk zone parameters only once the queue is frozen and - * all I/Os are completed. 
- */ if (ret > 0) - ret = disk_update_zone_resources(disk, &args); - else - pr_warn("%s: failed to revalidate zones\n", disk->disk_name); - if (ret) { - unsigned int memflags = blk_mq_freeze_queue(q); + return disk_update_zone_resources(disk, &args); - disk_free_zone_resources(disk); - blk_mq_unfreeze_queue(q, memflags); - } + pr_warn("%s: failed to revalidate zones\n", disk->disk_name); + + memflags = blk_mq_freeze_queue(q); + disk_free_zone_resources(disk); + blk_mq_unfreeze_queue(q, memflags); return ret; } @@ -1817,6 +2282,7 @@ EXPORT_SYMBOL_GPL(blk_revalidate_disk_zones); int blk_zone_issue_zeroout(struct block_device *bdev, sector_t sector, sector_t nr_sects, gfp_t gfp_mask) { + struct gendisk *disk = bdev->bd_disk; int ret; if (WARN_ON_ONCE(!bdev_is_zoned(bdev))) @@ -1832,7 +2298,7 @@ int blk_zone_issue_zeroout(struct block_device *bdev, sector_t sector, * pointer. Undo this using a report zone to update the zone write * pointer to the correct current value. */ - ret = disk_zone_sync_wp_offset(bdev->bd_disk, sector); + ret = disk->fops->report_zones(disk, sector, 1, NULL); if (ret != 1) return ret < 0 ? ret : -EIO; @@ -1851,18 +2317,22 @@ static void queue_zone_wplug_show(struct blk_zone_wplug *zwplug, unsigned int zwp_wp_offset, zwp_flags; unsigned int zwp_zone_no, zwp_ref; unsigned int zwp_bio_list_size; + enum blk_zone_cond zwp_cond; unsigned long flags; spin_lock_irqsave(&zwplug->lock, flags); zwp_zone_no = zwplug->zone_no; zwp_flags = zwplug->flags; zwp_ref = refcount_read(&zwplug->ref); + zwp_cond = zwplug->cond; zwp_wp_offset = zwplug->wp_offset; zwp_bio_list_size = bio_list_size(&zwplug->bio_list); spin_unlock_irqrestore(&zwplug->lock, flags); - seq_printf(m, "%u 0x%x %u %u %u\n", zwp_zone_no, zwp_flags, zwp_ref, - zwp_wp_offset, zwp_bio_list_size); + seq_printf(m, + "Zone no: %u, flags: 0x%x, ref: %u, cond: %s, wp ofst: %u, pending BIO: %u\n", + zwp_zone_no, zwp_flags, zwp_ref, blk_zone_cond_str(zwp_cond), + zwp_wp_offset, zwp_bio_list_size); } int queue_zone_wplugs_show(void *data, struct seq_file *m) diff --git a/block/blk.h b/block/blk.h index 170794632135..e4c433f62dfc 100644 --- a/block/blk.h +++ b/block/blk.h @@ -11,8 +11,7 @@ #include <xen/xen.h> #include "blk-crypto-internal.h" -struct elevator_type; -struct elevator_tags; +struct elv_change_ctx; /* * Default upper limit for the software max_sectors limit used for regular I/Os. 
@@ -333,8 +332,8 @@ bool blk_bio_list_merge(struct request_queue *q, struct list_head *list, bool blk_insert_flush(struct request *rq); -void elv_update_nr_hw_queues(struct request_queue *q, struct elevator_type *e, - struct elevator_tags *t); +void elv_update_nr_hw_queues(struct request_queue *q, + struct elv_change_ctx *ctx); void elevator_set_default(struct request_queue *q); void elevator_set_none(struct request_queue *q); @@ -377,7 +376,7 @@ static inline bool bio_may_need_split(struct bio *bio, if (bio->bi_vcnt != 1) return true; return bio->bi_io_vec->bv_len + bio->bi_io_vec->bv_offset > - lim->min_segment_size; + lim->max_fast_segment_size; } /** @@ -489,10 +488,24 @@ static inline bool blk_req_bio_is_zone_append(struct request *rq, void blk_zone_write_plug_bio_merged(struct bio *bio); void blk_zone_write_plug_init_request(struct request *rq); void blk_zone_append_update_request_bio(struct request *rq, struct bio *bio); +void blk_zone_mgmt_bio_endio(struct bio *bio); void blk_zone_write_plug_bio_endio(struct bio *bio); static inline void blk_zone_bio_endio(struct bio *bio) { /* + * Zone management BIOs may impact zone write plugs (e.g. a zone reset + * changes a zone write plug's write pointer offset), but these + * operations do not go through zone write plugging as they may operate + * on zones that do not have a zone write + * plug. blk_zone_mgmt_bio_endio() handles the potential changes to zone + * write plugs that are present. + */ + if (op_is_zone_mgmt(bio_op(bio))) { + blk_zone_mgmt_bio_endio(bio); + return; + } + + /* * For write BIOs to zoned devices, signal the completion of the BIO so * that the next write BIO can be submitted by zone write plugging. */ diff --git a/block/elevator.c b/block/elevator.c index e2ebfbf107b3..5b37ef44f52d 100644 --- a/block/elevator.c +++ b/block/elevator.c @@ -45,19 +45,6 @@ #include "blk-wbt.h" #include "blk-cgroup.h" -/* Holding context data for changing elevator */ -struct elv_change_ctx { - const char *name; - bool no_uevent; - - /* for unregistering old elevator */ - struct elevator_queue *old; - /* for registering new elevator */ - struct elevator_queue *new; - /* holds sched tags data */ - struct elevator_tags *et; -}; - static DEFINE_SPINLOCK(elv_list_lock); static LIST_HEAD(elv_list); @@ -134,7 +121,7 @@ static struct elevator_type *elevator_find_get(const char *name) static const struct kobj_type elv_ktype; struct elevator_queue *elevator_alloc(struct request_queue *q, - struct elevator_type *e, struct elevator_tags *et) + struct elevator_type *e, struct elevator_resources *res) { struct elevator_queue *eq; @@ -147,7 +134,8 @@ struct elevator_queue *elevator_alloc(struct request_queue *q, kobject_init(&eq->kobj, &elv_ktype); mutex_init(&eq->sysfs_lock); hash_init(eq->hash); - eq->et = et; + eq->et = res->et; + eq->elevator_data = res->data; return eq; } @@ -593,7 +581,7 @@ static int elevator_switch(struct request_queue *q, struct elv_change_ctx *ctx) } if (new_e) { - ret = blk_mq_init_sched(q, new_e, ctx->et); + ret = blk_mq_init_sched(q, new_e, &ctx->res); if (ret) goto out_unfreeze; ctx->new = q->elevator; @@ -617,7 +605,8 @@ out_unfreeze: return ret; } -static void elv_exit_and_release(struct request_queue *q) +static void elv_exit_and_release(struct elv_change_ctx *ctx, + struct request_queue *q) { struct elevator_queue *e; unsigned memflags; @@ -629,7 +618,7 @@ static void elv_exit_and_release(struct request_queue *q) mutex_unlock(&q->elevator_lock); blk_mq_unfreeze_queue(q, memflags); if (e) { -
blk_mq_free_sched_tags(e->et, q->tag_set); + blk_mq_free_sched_res(&ctx->res, ctx->type, q->tag_set); kobject_put(&e->kobj); } } @@ -640,11 +629,15 @@ static int elevator_change_done(struct request_queue *q, int ret = 0; if (ctx->old) { + struct elevator_resources res = { + .et = ctx->old->et, + .data = ctx->old->elevator_data + }; bool enable_wbt = test_bit(ELEVATOR_FLAG_ENABLE_WBT_ON_EXIT, &ctx->old->flags); elv_unregister_queue(q, ctx->old); - blk_mq_free_sched_tags(ctx->old->et, q->tag_set); + blk_mq_free_sched_res(&res, ctx->old->type, q->tag_set); kobject_put(&ctx->old->kobj); if (enable_wbt) wbt_enable_default(q->disk); @@ -652,7 +645,7 @@ static int elevator_change_done(struct request_queue *q, if (ctx->new) { ret = elv_register_queue(q, ctx->new, !ctx->no_uevent); if (ret) - elv_exit_and_release(q); + elv_exit_and_release(ctx, q); } return ret; } @@ -669,10 +662,10 @@ static int elevator_change(struct request_queue *q, struct elv_change_ctx *ctx) lockdep_assert_held(&set->update_nr_hwq_lock); if (strncmp(ctx->name, "none", 4)) { - ctx->et = blk_mq_alloc_sched_tags(set, set->nr_hw_queues, - blk_mq_default_nr_requests(set)); - if (!ctx->et) - return -ENOMEM; + ret = blk_mq_alloc_sched_res(q, ctx->type, &ctx->res, + set->nr_hw_queues); + if (ret) + return ret; } memflags = blk_mq_freeze_queue(q); @@ -693,11 +686,12 @@ static int elevator_change(struct request_queue *q, struct elv_change_ctx *ctx) blk_mq_unfreeze_queue(q, memflags); if (!ret) ret = elevator_change_done(q, ctx); + /* - * Free sched tags if it's allocated but we couldn't switch elevator. + * Free sched resource if it's allocated but we couldn't switch elevator. */ - if (ctx->et && !ctx->new) - blk_mq_free_sched_tags(ctx->et, set); + if (!ctx->new) + blk_mq_free_sched_res(&ctx->res, ctx->type, set); return ret; } @@ -706,32 +700,29 @@ static int elevator_change(struct request_queue *q, struct elv_change_ctx *ctx) * The I/O scheduler depends on the number of hardware queues, this forces a * reattachment when nr_hw_queues changes. */ -void elv_update_nr_hw_queues(struct request_queue *q, struct elevator_type *e, - struct elevator_tags *t) +void elv_update_nr_hw_queues(struct request_queue *q, + struct elv_change_ctx *ctx) { struct blk_mq_tag_set *set = q->tag_set; - struct elv_change_ctx ctx = {}; int ret = -ENODEV; WARN_ON_ONCE(q->mq_freeze_depth == 0); - if (e && !blk_queue_dying(q) && blk_queue_registered(q)) { - ctx.name = e->elevator_name; - ctx.et = t; - + if (ctx->type && !blk_queue_dying(q) && blk_queue_registered(q)) { mutex_lock(&q->elevator_lock); /* force to reattach elevator after nr_hw_queue is updated */ - ret = elevator_switch(q, &ctx); + ret = elevator_switch(q, ctx); mutex_unlock(&q->elevator_lock); } blk_mq_unfreeze_queue_nomemrestore(q); if (!ret) - WARN_ON_ONCE(elevator_change_done(q, &ctx)); + WARN_ON_ONCE(elevator_change_done(q, ctx)); + /* - * Free sched tags if it's allocated but we couldn't switch elevator. + * Free sched resource if it's allocated but we couldn't switch elevator. */ - if (t && !ctx.new) - blk_mq_free_sched_tags(t, set); + if (!ctx->new) + blk_mq_free_sched_res(&ctx->res, ctx->type, set); } /* @@ -745,7 +736,6 @@ void elevator_set_default(struct request_queue *q) .no_uevent = true, }; int err; - struct elevator_type *e; /* now we allow to switch elevator */ blk_queue_flag_clear(QUEUE_FLAG_NO_ELV_SWITCH, q); @@ -758,8 +748,8 @@ void elevator_set_default(struct request_queue *q) * have multiple queues or mq-deadline is not available, default * to "none". 
*/ - e = elevator_find_get(ctx.name); - if (!e) + ctx.type = elevator_find_get(ctx.name); + if (!ctx.type) return; if ((q->nr_hw_queues == 1 || @@ -769,7 +759,7 @@ void elevator_set_default(struct request_queue *q) pr_warn("\"%s\" elevator initialization, failed %d, falling back to \"none\"\n", ctx.name, err); } - elevator_put(e); + elevator_put(ctx.type); } void elevator_set_none(struct request_queue *q) @@ -818,6 +808,7 @@ ssize_t elv_iosched_store(struct gendisk *disk, const char *buf, ctx.name = strstrip(elevator_name); elv_iosched_load_module(ctx.name); + ctx.type = elevator_find_get(ctx.name); down_read(&set->update_nr_hwq_lock); if (!blk_queue_no_elv_switch(q)) { @@ -828,6 +819,9 @@ ssize_t elv_iosched_store(struct gendisk *disk, const char *buf, ret = -ENOENT; } up_read(&set->update_nr_hwq_lock); + + if (ctx.type) + elevator_put(ctx.type); return ret; } diff --git a/block/elevator.h b/block/elevator.h index c4d20155065e..a9d092c5a9e8 100644 --- a/block/elevator.h +++ b/block/elevator.h @@ -32,12 +32,36 @@ struct elevator_tags { struct blk_mq_tags *tags[]; }; +struct elevator_resources { + /* holds elevator data */ + void *data; + /* holds elevator tags */ + struct elevator_tags *et; +}; + +/* Holding context data for changing elevator */ +struct elv_change_ctx { + const char *name; + bool no_uevent; + + /* for unregistering old elevator */ + struct elevator_queue *old; + /* for registering new elevator */ + struct elevator_queue *new; + /* store elevator type */ + struct elevator_type *type; + /* store elevator resources */ + struct elevator_resources res; +}; + struct elevator_mq_ops { int (*init_sched)(struct request_queue *, struct elevator_queue *); void (*exit_sched)(struct elevator_queue *); int (*init_hctx)(struct blk_mq_hw_ctx *, unsigned int); void (*exit_hctx)(struct blk_mq_hw_ctx *, unsigned int); void (*depth_updated)(struct request_queue *); + void *(*alloc_sched_data)(struct request_queue *); + void (*free_sched_data)(void *); bool (*allow_merge)(struct request_queue *, struct request *, struct bio *); bool (*bio_merge)(struct request_queue *, struct bio *, unsigned int); @@ -147,7 +171,6 @@ extern bool elv_attempt_insert_merge(struct request_queue *, struct request *, struct list_head *); extern struct request *elv_former_request(struct request_queue *, struct request *); extern struct request *elv_latter_request(struct request_queue *, struct request *); -void elevator_init_mq(struct request_queue *q); /* * io scheduler registration @@ -163,7 +186,7 @@ ssize_t elv_iosched_store(struct gendisk *disk, const char *page, size_t count); extern bool elv_bio_merge_ok(struct request *, struct bio *); struct elevator_queue *elevator_alloc(struct request_queue *, - struct elevator_type *, struct elevator_tags *); + struct elevator_type *, struct elevator_resources *); /* * Helper functions. diff --git a/block/genhd.c b/block/genhd.c index 9bbc38d12792..69c75117ba2c 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -90,7 +90,7 @@ bool set_capacity_and_notify(struct gendisk *disk, sector_t size) (disk->flags & GENHD_FL_HIDDEN)) return false; - pr_info("%s: detected capacity change from %lld to %lld\n", + pr_info_ratelimited("%s: detected capacity change from %lld to %lld\n", disk->disk_name, capacity, size); /* @@ -795,11 +795,11 @@ static void disable_elv_switch(struct request_queue *q) * partitions associated with the gendisk, and unregisters the associated * request_queue. * - * This is the counter to the respective __device_add_disk() call. 
+ * This is the counter to the respective device_add_disk() call. * * The final removal of the struct gendisk happens when its refcount reaches 0 * with put_disk(), which should be called after del_gendisk(), if - * __device_add_disk() was used. + * device_add_disk() was used. * * Drivers exist which depend on the release of the gendisk to be synchronous, * it should not be deferred. @@ -1265,7 +1265,7 @@ static const struct attribute_group *disk_attr_groups[] = { * * This function releases all allocated resources of the gendisk. * - * Drivers which used __device_add_disk() have a gendisk with a request_queue + * Drivers which used device_add_disk() have a gendisk with a request_queue * assigned. Since the request_queue sits on top of the gendisk for these * drivers we also call blk_put_queue() for them, and we expect the * request_queue refcount to reach 0 at this point, and so the request_queue diff --git a/block/ioctl.c b/block/ioctl.c index d7489a56b33c..2b3ab9bfc413 100644 --- a/block/ioctl.c +++ b/block/ioctl.c @@ -581,6 +581,7 @@ static int blkdev_common_ioctl(struct block_device *bdev, blk_mode_t mode, case BLKGETDISKSEQ: return put_u64(argp, bdev->bd_disk->diskseq); case BLKREPORTZONE: + case BLKREPORTZONEV2: return blkdev_report_zones_ioctl(bdev, cmd, arg); case BLKRESETZONE: case BLKOPENZONE: @@ -691,6 +692,7 @@ long blkdev_ioctl(struct file *file, unsigned cmd, unsigned long arg) /* Incompatible alignment on i386 */ case BLKTRACESETUP: + case BLKTRACESETUP2: return blk_trace_ioctl(bdev, cmd, argp); default: break; @@ -769,14 +771,16 @@ struct blk_iou_cmd { bool nowait; }; -static void blk_cmd_complete(struct io_uring_cmd *cmd, unsigned int issue_flags) +static void blk_cmd_complete(struct io_tw_req tw_req, io_tw_token_t tw) { + struct io_uring_cmd *cmd = io_uring_cmd_from_tw(tw_req); struct blk_iou_cmd *bic = io_uring_cmd_to_pdu(cmd, struct blk_iou_cmd); if (bic->res == -EAGAIN && bic->nowait) io_uring_cmd_issue_blocking(cmd); else - io_uring_cmd_done(cmd, bic->res, issue_flags); + io_uring_cmd_done(cmd, bic->res, + IO_URING_CMD_TASK_WORK_ISSUE_FLAGS); } static void bio_cmd_bio_end_io(struct bio *bio) diff --git a/block/kyber-iosched.c b/block/kyber-iosched.c index 18efd6ef2a2b..c1b36ffd19ce 100644 --- a/block/kyber-iosched.c +++ b/block/kyber-iosched.c @@ -409,30 +409,42 @@ static void kyber_depth_updated(struct request_queue *q) static int kyber_init_sched(struct request_queue *q, struct elevator_queue *eq) { - struct kyber_queue_data *kqd; - - kqd = kyber_queue_data_alloc(q); - if (IS_ERR(kqd)) - return PTR_ERR(kqd); - blk_stat_enable_accounting(q); blk_queue_flag_clear(QUEUE_FLAG_SQ_SCHED, q); - eq->elevator_data = kqd; q->elevator = eq; kyber_depth_updated(q); return 0; } +static void *kyber_alloc_sched_data(struct request_queue *q) +{ + struct kyber_queue_data *kqd; + + kqd = kyber_queue_data_alloc(q); + if (IS_ERR(kqd)) + return NULL; + + return kqd; +} + static void kyber_exit_sched(struct elevator_queue *e) { struct kyber_queue_data *kqd = e->elevator_data; - int i; timer_shutdown_sync(&kqd->timer); blk_stat_disable_accounting(kqd->q); +} + +static void kyber_free_sched_data(void *elv_data) +{ + struct kyber_queue_data *kqd = elv_data; + int i; + + if (!kqd) + return; for (i = 0; i < KYBER_NUM_DOMAINS; i++) sbitmap_queue_free(&kqd->domain_tokens[i]); @@ -1004,6 +1016,8 @@ static struct elevator_type kyber_sched = { .exit_sched = kyber_exit_sched, .init_hctx = kyber_init_hctx, .exit_hctx = kyber_exit_hctx, + .alloc_sched_data = kyber_alloc_sched_data, + .free_sched_data 
= kyber_free_sched_data, .limit_depth = kyber_limit_depth, .bio_merge = kyber_bio_merge, .prepare_request = kyber_prepare_request, diff --git a/block/mq-deadline.c b/block/mq-deadline.c index 3e741d33142d..3e3719093aec 100644 --- a/block/mq-deadline.c +++ b/block/mq-deadline.c @@ -71,7 +71,6 @@ struct io_stats_per_prio { * present on both sort_list[] and fifo_list[]. */ struct dd_per_prio { - struct list_head dispatch; struct rb_root sort_list[DD_DIR_COUNT]; struct list_head fifo_list[DD_DIR_COUNT]; /* Position of the most recently dispatched request. */ @@ -84,6 +83,7 @@ struct deadline_data { * run time data */ + struct list_head dispatch; struct dd_per_prio per_prio[DD_PRIO_COUNT]; /* Data direction of latest dispatched request. */ @@ -306,6 +306,19 @@ static bool started_after(struct deadline_data *dd, struct request *rq, return time_after(start_time, latest_start); } +static struct request *dd_start_request(struct deadline_data *dd, + enum dd_data_dir data_dir, + struct request *rq) +{ + u8 ioprio_class = dd_rq_ioclass(rq); + enum dd_prio prio = ioprio_class_to_prio[ioprio_class]; + + dd->per_prio[prio].latest_pos[data_dir] = blk_rq_pos(rq); + dd->per_prio[prio].stats.dispatched++; + rq->rq_flags |= RQF_STARTED; + return rq; +} + /* * deadline_dispatch_requests selects the best request according to * read/write expire, fifo_batch, etc and with a start time <= @latest_start. @@ -316,21 +329,9 @@ static struct request *__dd_dispatch_request(struct deadline_data *dd, { struct request *rq, *next_rq; enum dd_data_dir data_dir; - enum dd_prio prio; - u8 ioprio_class; lockdep_assert_held(&dd->lock); - if (!list_empty(&per_prio->dispatch)) { - rq = list_first_entry(&per_prio->dispatch, struct request, - queuelist); - if (started_after(dd, rq, latest_start)) - return NULL; - list_del_init(&rq->queuelist); - data_dir = rq_data_dir(rq); - goto done; - } - /* * batches are currently reads XOR writes */ @@ -410,13 +411,7 @@ dispatch_request: */ dd->batching++; deadline_move_request(dd, per_prio, rq); -done: - ioprio_class = dd_rq_ioclass(rq); - prio = ioprio_class_to_prio[ioprio_class]; - dd->per_prio[prio].latest_pos[data_dir] = blk_rq_pos(rq); - dd->per_prio[prio].stats.dispatched++; - rq->rq_flags |= RQF_STARTED; - return rq; + return dd_start_request(dd, data_dir, rq); } /* @@ -463,6 +458,14 @@ static struct request *dd_dispatch_request(struct blk_mq_hw_ctx *hctx) enum dd_prio prio; spin_lock(&dd->lock); + + if (!list_empty(&dd->dispatch)) { + rq = list_first_entry(&dd->dispatch, struct request, queuelist); + list_del_init(&rq->queuelist); + dd_start_request(dd, rq_data_dir(rq), rq); + goto unlock; + } + rq = dd_dispatch_prio_aged_requests(dd, now); if (rq) goto unlock; @@ -551,10 +554,10 @@ static int dd_init_sched(struct request_queue *q, struct elevator_queue *eq) eq->elevator_data = dd; + INIT_LIST_HEAD(&dd->dispatch); for (prio = 0; prio <= DD_PRIO_MAX; prio++) { struct dd_per_prio *per_prio = &dd->per_prio[prio]; - INIT_LIST_HEAD(&per_prio->dispatch); INIT_LIST_HEAD(&per_prio->fifo_list[DD_READ]); INIT_LIST_HEAD(&per_prio->fifo_list[DD_WRITE]); per_prio->sort_list[DD_READ] = RB_ROOT; @@ -658,7 +661,7 @@ static void dd_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq, trace_block_rq_insert(rq); if (flags & BLK_MQ_INSERT_AT_HEAD) { - list_add(&rq->queuelist, &per_prio->dispatch); + list_add(&rq->queuelist, &dd->dispatch); rq->fifo_time = jiffies; } else { deadline_add_rq_rb(per_prio, rq); @@ -725,8 +728,7 @@ static void dd_finish_request(struct request *rq) static bool 
dd_has_work_for_prio(struct dd_per_prio *per_prio) { - return !list_empty_careful(&per_prio->dispatch) || - !list_empty_careful(&per_prio->fifo_list[DD_READ]) || + return !list_empty_careful(&per_prio->fifo_list[DD_READ]) || !list_empty_careful(&per_prio->fifo_list[DD_WRITE]); } @@ -735,6 +737,9 @@ static bool dd_has_work(struct blk_mq_hw_ctx *hctx) struct deadline_data *dd = hctx->queue->elevator->elevator_data; enum dd_prio prio; + if (!list_empty_careful(&dd->dispatch)) + return true; + for (prio = 0; prio <= DD_PRIO_MAX; prio++) if (dd_has_work_for_prio(&dd->per_prio[prio])) return true; @@ -943,49 +948,39 @@ static int dd_owned_by_driver_show(void *data, struct seq_file *m) return 0; } -#define DEADLINE_DISPATCH_ATTR(prio) \ -static void *deadline_dispatch##prio##_start(struct seq_file *m, \ - loff_t *pos) \ - __acquires(&dd->lock) \ -{ \ - struct request_queue *q = m->private; \ - struct deadline_data *dd = q->elevator->elevator_data; \ - struct dd_per_prio *per_prio = &dd->per_prio[prio]; \ - \ - spin_lock(&dd->lock); \ - return seq_list_start(&per_prio->dispatch, *pos); \ -} \ - \ -static void *deadline_dispatch##prio##_next(struct seq_file *m, \ - void *v, loff_t *pos) \ -{ \ - struct request_queue *q = m->private; \ - struct deadline_data *dd = q->elevator->elevator_data; \ - struct dd_per_prio *per_prio = &dd->per_prio[prio]; \ - \ - return seq_list_next(v, &per_prio->dispatch, pos); \ -} \ - \ -static void deadline_dispatch##prio##_stop(struct seq_file *m, void *v) \ - __releases(&dd->lock) \ -{ \ - struct request_queue *q = m->private; \ - struct deadline_data *dd = q->elevator->elevator_data; \ - \ - spin_unlock(&dd->lock); \ -} \ - \ -static const struct seq_operations deadline_dispatch##prio##_seq_ops = { \ - .start = deadline_dispatch##prio##_start, \ - .next = deadline_dispatch##prio##_next, \ - .stop = deadline_dispatch##prio##_stop, \ - .show = blk_mq_debugfs_rq_show, \ +static void *deadline_dispatch_start(struct seq_file *m, loff_t *pos) + __acquires(&dd->lock) +{ + struct request_queue *q = m->private; + struct deadline_data *dd = q->elevator->elevator_data; + + spin_lock(&dd->lock); + return seq_list_start(&dd->dispatch, *pos); +} + +static void *deadline_dispatch_next(struct seq_file *m, void *v, loff_t *pos) +{ + struct request_queue *q = m->private; + struct deadline_data *dd = q->elevator->elevator_data; + + return seq_list_next(v, &dd->dispatch, pos); +} + +static void deadline_dispatch_stop(struct seq_file *m, void *v) + __releases(&dd->lock) +{ + struct request_queue *q = m->private; + struct deadline_data *dd = q->elevator->elevator_data; + + spin_unlock(&dd->lock); } -DEADLINE_DISPATCH_ATTR(0); -DEADLINE_DISPATCH_ATTR(1); -DEADLINE_DISPATCH_ATTR(2); -#undef DEADLINE_DISPATCH_ATTR +static const struct seq_operations deadline_dispatch_seq_ops = { + .start = deadline_dispatch_start, + .next = deadline_dispatch_next, + .stop = deadline_dispatch_stop, + .show = blk_mq_debugfs_rq_show, +}; #define DEADLINE_QUEUE_DDIR_ATTRS(name) \ {#name "_fifo_list", 0400, \ @@ -1008,9 +1003,7 @@ static const struct blk_mq_debugfs_attr deadline_queue_debugfs_attrs[] = { {"batching", 0400, deadline_batching_show}, {"starved", 0400, deadline_starved_show}, {"async_depth", 0400, dd_async_depth_show}, - {"dispatch0", 0400, .seq_ops = &deadline_dispatch0_seq_ops}, - {"dispatch1", 0400, .seq_ops = &deadline_dispatch1_seq_ops}, - {"dispatch2", 0400, .seq_ops = &deadline_dispatch2_seq_ops}, + {"dispatch", 0400, .seq_ops = &deadline_dispatch_seq_ops}, {"owned_by_driver", 0400, 
dd_owned_by_driver_show}, {"queued", 0400, dd_queued_show}, {}, diff --git a/block/partitions/efi.c b/block/partitions/efi.c index 7acba66eed48..638261e9f2fb 100644 --- a/block/partitions/efi.c +++ b/block/partitions/efi.c @@ -215,8 +215,7 @@ check_hybrid: sz = le32_to_cpu(mbr->partition_record[part].size_in_lba); if (sz != (uint32_t) total_sectors - 1 && sz != 0xFFFFFFFF) pr_debug("GPT: mbr size in lba (%u) different than whole disk (%u).\n", - sz, min_t(uint32_t, - total_sectors - 1, 0xFFFFFFFF)); + sz, (uint32_t)min(total_sectors - 1, 0xFFFFFFFF)); } done: return ret;
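The BLKREPORTZONEV2 handling added to blkdev_report_zones_ioctl() above accepts only BLK_ZONE_REP_CACHED as an input flag and routes the request through blkdev_report_zones_cached(). As a rough illustration of the intended userspace usage, the sketch below is not part of this patch and assumes the UAPI side of the series defines BLKREPORTZONEV2 and BLK_ZONE_REP_CACHED in <linux/blkzoned.h> (those hunks are not shown in this block/ diff); the helper name is made up for the example.

/*
 * Hypothetical userspace sketch (not part of this patch): request a cached
 * zone report via BLKREPORTZONEV2, assuming the UAPI additions of this
 * series (BLKREPORTZONEV2, BLK_ZONE_REP_CACHED) are available.
 */
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/blkzoned.h>

int report_cached_zones(const char *dev, __u64 sector, unsigned int nr_zones)
{
	size_t sz = sizeof(struct blk_zone_report) +
		    nr_zones * sizeof(struct blk_zone);
	struct blk_zone_report *rep;
	unsigned int i;
	int fd, ret;

	fd = open(dev, O_RDONLY);
	if (fd < 0)
		return -1;

	rep = calloc(1, sz);
	if (!rep) {
		close(fd);
		return -1;
	}

	rep->sector = sector;
	rep->nr_zones = nr_zones;
	/* BLK_ZONE_REP_CACHED is the only valid input flag (see the mask above). */
	rep->flags = BLK_ZONE_REP_CACHED;

	ret = ioctl(fd, BLKREPORTZONEV2, rep);
	if (!ret) {
		/* The kernel updates nr_zones to the number of reported zones. */
		for (i = 0; i < rep->nr_zones; i++)
			printf("zone %u: start %llu, wp %llu, cond 0x%x\n", i,
			       (unsigned long long)rep->zones[i].start,
			       (unsigned long long)rep->zones[i].wp,
			       rep->zones[i].cond);
	}

	free(rep);
	close(fd);
	return ret;
}

The existing BLKREPORTZONE case in the switch statement above is left untouched, so current callers keep their behavior and their flags input remains uninterpreted.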

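For in-kernel users, blkdev_get_zone_info() exported above builds a single zone description from the cached condition array and the zone write plug state. The sketch below is illustrative only: the caller shown is hypothetical, and it relies on the BLK_ZONE_COND_ACTIVE condition value introduced elsewhere in this series, into which the implicit open, explicit open and closed conditions are collapsed.

/*
 * Illustrative only: a hypothetical in-kernel caller of the new
 * blkdev_get_zone_info() helper that checks whether a write at @sector
 * would land on the cached write pointer of the containing zone.
 */
#include <linux/blkdev.h>
#include <linux/blkzoned.h>

static bool zone_write_at_wp(struct block_device *bdev, sector_t sector)
{
	struct blk_zone zone;

	if (blkdev_get_zone_info(bdev, sector, &zone))
		return false;

	/* Conventional zones have no write pointer constraint. */
	if (zone.type == BLK_ZONE_TYPE_CONVENTIONAL)
		return true;

	/*
	 * Sequential zones: the cached condition collapses implicit open,
	 * explicit open and closed into BLK_ZONE_COND_ACTIVE.
	 */
	switch (zone.cond) {
	case BLK_ZONE_COND_EMPTY:
	case BLK_ZONE_COND_ACTIVE:
		return zone.wp == sector;
	default:
		return false;
	}
}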