diff options
Diffstat (limited to 'block/blk-mq.c')
| -rw-r--r-- | block/blk-mq.c | 152 |
1 files changed, 96 insertions, 56 deletions
diff --git a/block/blk-mq.c b/block/blk-mq.c index d626d32f6e57..4e96bb246247 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -376,6 +376,7 @@ void blk_rq_init(struct request_queue *q, struct request *rq) INIT_LIST_HEAD(&rq->queuelist); rq->q = q; rq->__sector = (sector_t) -1; + rq->phys_gap_bit = 0; INIT_HLIST_NODE(&rq->hash); RB_CLEAR_NODE(&rq->rb_node); rq->tag = BLK_MQ_NO_TAG; @@ -467,21 +468,26 @@ __blk_mq_alloc_requests_batch(struct blk_mq_alloc_data *data) unsigned long tag_mask; int i, nr = 0; - tag_mask = blk_mq_get_tags(data, data->nr_tags, &tag_offset); - if (unlikely(!tag_mask)) - return NULL; + do { + tag_mask = blk_mq_get_tags(data, data->nr_tags - nr, &tag_offset); + if (unlikely(!tag_mask)) { + if (nr == 0) + return NULL; + break; + } + tags = blk_mq_tags_from_data(data); + for (i = 0; tag_mask; i++) { + if (!(tag_mask & (1UL << i))) + continue; + tag = tag_offset + i; + prefetch(tags->static_rqs[tag]); + tag_mask &= ~(1UL << i); + rq = blk_mq_rq_ctx_init(data, tags, tag); + rq_list_add_head(data->cached_rqs, rq); + nr++; + } + } while (data->nr_tags > nr); - tags = blk_mq_tags_from_data(data); - for (i = 0; tag_mask; i++) { - if (!(tag_mask & (1UL << i))) - continue; - tag = tag_offset + i; - prefetch(tags->static_rqs[tag]); - tag_mask &= ~(1UL << i); - rq = blk_mq_rq_ctx_init(data, tags, tag); - rq_list_add_head(data->cached_rqs, rq); - nr++; - } if (!(data->rq_flags & RQF_SCHED_TAGS)) blk_mq_add_active_requests(data->hctx, nr); /* caller already holds a reference, add for remainder */ @@ -668,6 +674,7 @@ struct request *blk_mq_alloc_request(struct request_queue *q, blk_opf_t opf, goto out_queue_exit; } rq->__data_len = 0; + rq->phys_gap_bit = 0; rq->__sector = (sector_t) -1; rq->bio = rq->biotail = NULL; return rq; @@ -723,7 +730,7 @@ struct request *blk_mq_alloc_request_hctx(struct request_queue *q, * If not tell the caller that it should skip this queue. */ ret = -EXDEV; - data.hctx = xa_load(&q->hctx_table, hctx_idx); + data.hctx = q->queue_hw_ctx[hctx_idx]; if (!blk_mq_hw_queue_mapped(data.hctx)) goto out_queue_exit; cpu = cpumask_first_and(data.hctx->cpumask, cpu_online_mask); @@ -748,6 +755,7 @@ struct request *blk_mq_alloc_request_hctx(struct request_queue *q, rq = blk_mq_rq_ctx_init(&data, blk_mq_tags_from_data(&data), tag); blk_mq_rq_time_init(rq, alloc_time_ns); rq->__data_len = 0; + rq->phys_gap_bit = 0; rq->__sector = (sector_t) -1; rq->bio = rq->biotail = NULL; return rq; @@ -2674,6 +2682,8 @@ static void blk_mq_bio_to_request(struct request *rq, struct bio *bio, rq->bio = rq->biotail = bio; rq->__sector = bio->bi_iter.bi_sector; rq->__data_len = bio->bi_iter.bi_size; + rq->phys_gap_bit = bio->bi_bvec_gap_bit; + rq->nr_phys_segments = nr_segs; if (bio_integrity(bio)) rq->nr_integrity_segments = blk_rq_count_integrity_sg(rq->q, @@ -3380,6 +3390,7 @@ int blk_rq_prep_clone(struct request *rq, struct request *rq_src, } rq->nr_phys_segments = rq_src->nr_phys_segments; rq->nr_integrity_segments = rq_src->nr_integrity_segments; + rq->phys_gap_bit = rq_src->phys_gap_bit; if (rq->bio && blk_crypto_rq_bio_prep(rq, rq->bio, gfp_mask) < 0) goto free_and_out; @@ -3935,8 +3946,6 @@ static void blk_mq_exit_hctx(struct request_queue *q, blk_free_flush_queue_callback); hctx->fq = NULL; - xa_erase(&q->hctx_table, hctx_idx); - spin_lock(&q->unused_hctx_lock); list_add(&hctx->hctx_list, &q->unused_hctx_list); spin_unlock(&q->unused_hctx_lock); @@ -3978,14 +3987,8 @@ static int blk_mq_init_hctx(struct request_queue *q, hctx->numa_node)) goto exit_hctx; - if (xa_insert(&q->hctx_table, hctx_idx, hctx, GFP_KERNEL)) - goto exit_flush_rq; - return 0; - exit_flush_rq: - if (set->ops->exit_request) - set->ops->exit_request(set, hctx->fq->flush_rq, hctx_idx); exit_hctx: if (set->ops->exit_hctx) set->ops->exit_hctx(hctx, hctx_idx); @@ -4374,7 +4377,7 @@ void blk_mq_release(struct request_queue *q) kobject_put(&hctx->kobj); } - xa_destroy(&q->hctx_table); + kfree(q->queue_hw_ctx); /* * release .mq_kobj and sw queue's kobject now because @@ -4518,26 +4521,49 @@ static struct blk_mq_hw_ctx *blk_mq_alloc_and_init_hctx( static void __blk_mq_realloc_hw_ctxs(struct blk_mq_tag_set *set, struct request_queue *q) { - struct blk_mq_hw_ctx *hctx; - unsigned long i, j; + int i, j, end; + struct blk_mq_hw_ctx **hctxs = q->queue_hw_ctx; + + if (q->nr_hw_queues < set->nr_hw_queues) { + struct blk_mq_hw_ctx **new_hctxs; + + new_hctxs = kcalloc_node(set->nr_hw_queues, + sizeof(*new_hctxs), GFP_KERNEL, + set->numa_node); + if (!new_hctxs) + return; + if (hctxs) + memcpy(new_hctxs, hctxs, q->nr_hw_queues * + sizeof(*hctxs)); + rcu_assign_pointer(q->queue_hw_ctx, new_hctxs); + /* + * Make sure reading the old queue_hw_ctx from other + * context concurrently won't trigger uaf. + */ + synchronize_rcu_expedited(); + kfree(hctxs); + hctxs = new_hctxs; + } for (i = 0; i < set->nr_hw_queues; i++) { int old_node; int node = blk_mq_get_hctx_node(set, i); - struct blk_mq_hw_ctx *old_hctx = xa_load(&q->hctx_table, i); + struct blk_mq_hw_ctx *old_hctx = hctxs[i]; if (old_hctx) { old_node = old_hctx->numa_node; blk_mq_exit_hctx(q, set, old_hctx, i); } - if (!blk_mq_alloc_and_init_hctx(set, q, i, node)) { + hctxs[i] = blk_mq_alloc_and_init_hctx(set, q, i, node); + if (!hctxs[i]) { if (!old_hctx) break; pr_warn("Allocate new hctx on node %d fails, fallback to previous one on node %d\n", node, old_node); - hctx = blk_mq_alloc_and_init_hctx(set, q, i, old_node); - WARN_ON_ONCE(!hctx); + hctxs[i] = blk_mq_alloc_and_init_hctx(set, q, i, + old_node); + WARN_ON_ONCE(!hctxs[i]); } } /* @@ -4546,13 +4572,21 @@ static void __blk_mq_realloc_hw_ctxs(struct blk_mq_tag_set *set, */ if (i != set->nr_hw_queues) { j = q->nr_hw_queues; + end = i; } else { j = i; + end = q->nr_hw_queues; q->nr_hw_queues = set->nr_hw_queues; } - xa_for_each_start(&q->hctx_table, j, hctx, j) - blk_mq_exit_hctx(q, set, hctx, j); + for (; j < end; j++) { + struct blk_mq_hw_ctx *hctx = hctxs[j]; + + if (hctx) { + blk_mq_exit_hctx(q, set, hctx, j); + hctxs[j] = NULL; + } + } } static void blk_mq_realloc_hw_ctxs(struct blk_mq_tag_set *set, @@ -4588,8 +4622,6 @@ int blk_mq_init_allocated_queue(struct blk_mq_tag_set *set, INIT_LIST_HEAD(&q->unused_hctx_list); spin_lock_init(&q->unused_hctx_lock); - xa_init(&q->hctx_table); - blk_mq_realloc_hw_ctxs(set, q); if (!q->nr_hw_queues) goto err_hctxs; @@ -4983,27 +5015,28 @@ struct elevator_tags *blk_mq_update_nr_requests(struct request_queue *q, * Switch back to the elevator type stored in the xarray. */ static void blk_mq_elv_switch_back(struct request_queue *q, - struct xarray *elv_tbl, struct xarray *et_tbl) + struct xarray *elv_tbl) { - struct elevator_type *e = xa_load(elv_tbl, q->id); - struct elevator_tags *t = xa_load(et_tbl, q->id); + struct elv_change_ctx *ctx = xa_load(elv_tbl, q->id); + + if (WARN_ON_ONCE(!ctx)) + return; /* The elv_update_nr_hw_queues unfreezes the queue. */ - elv_update_nr_hw_queues(q, e, t); + elv_update_nr_hw_queues(q, ctx); /* Drop the reference acquired in blk_mq_elv_switch_none. */ - if (e) - elevator_put(e); + if (ctx->type) + elevator_put(ctx->type); } /* - * Stores elevator type in xarray and set current elevator to none. It uses - * q->id as an index to store the elevator type into the xarray. + * Stores elevator name and type in ctx and set current elevator to none. */ static int blk_mq_elv_switch_none(struct request_queue *q, struct xarray *elv_tbl) { - int ret = 0; + struct elv_change_ctx *ctx; lockdep_assert_held_write(&q->tag_set->update_nr_hwq_lock); @@ -5015,10 +5048,11 @@ static int blk_mq_elv_switch_none(struct request_queue *q, * can't run concurrently. */ if (q->elevator) { + ctx = xa_load(elv_tbl, q->id); + if (WARN_ON_ONCE(!ctx)) + return -ENOENT; - ret = xa_insert(elv_tbl, q->id, q->elevator->type, GFP_KERNEL); - if (WARN_ON_ONCE(ret)) - return ret; + ctx->name = q->elevator->type->elevator_name; /* * Before we switch elevator to 'none', take a reference to @@ -5029,9 +5063,14 @@ static int blk_mq_elv_switch_none(struct request_queue *q, */ __elevator_get(q->elevator->type); + /* + * Store elevator type so that we can release the reference + * taken above later. + */ + ctx->type = q->elevator->type; elevator_set_none(q); } - return ret; + return 0; } static void __blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set, @@ -5041,7 +5080,7 @@ static void __blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set, int prev_nr_hw_queues = set->nr_hw_queues; unsigned int memflags; int i; - struct xarray elv_tbl, et_tbl; + struct xarray elv_tbl; bool queues_frozen = false; lockdep_assert_held(&set->tag_list_lock); @@ -5055,11 +5094,12 @@ static void __blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set, memflags = memalloc_noio_save(); - xa_init(&et_tbl); - if (blk_mq_alloc_sched_tags_batch(&et_tbl, set, nr_hw_queues) < 0) - goto out_memalloc_restore; - xa_init(&elv_tbl); + if (blk_mq_alloc_sched_ctx_batch(&elv_tbl, set) < 0) + goto out_free_ctx; + + if (blk_mq_alloc_sched_res_batch(&elv_tbl, set, nr_hw_queues) < 0) + goto out_free_ctx; list_for_each_entry(q, &set->tag_list, tag_set_list) { blk_mq_debugfs_unregister_hctxs(q); @@ -5105,7 +5145,7 @@ switch_back: /* switch_back expects queue to be frozen */ if (!queues_frozen) blk_mq_freeze_queue_nomemsave(q); - blk_mq_elv_switch_back(q, &elv_tbl, &et_tbl); + blk_mq_elv_switch_back(q, &elv_tbl); } list_for_each_entry(q, &set->tag_list, tag_set_list) { @@ -5116,9 +5156,9 @@ switch_back: blk_mq_add_hw_queues_cpuhp(q); } +out_free_ctx: + blk_mq_free_sched_ctx_batch(&elv_tbl); xa_destroy(&elv_tbl); - xa_destroy(&et_tbl); -out_memalloc_restore: memalloc_noio_restore(memflags); /* Free the excess tags when nr_hw_queues shrink. */ @@ -5168,7 +5208,7 @@ int blk_mq_poll(struct request_queue *q, blk_qc_t cookie, { if (!blk_mq_can_poll(q)) return 0; - return blk_hctx_poll(q, xa_load(&q->hctx_table, cookie), iob, flags); + return blk_hctx_poll(q, q->queue_hw_ctx[cookie], iob, flags); } int blk_rq_poll(struct request *rq, struct io_comp_batch *iob, |
