Diffstat (limited to 'block/blk-mq.c')
-rw-r--r--  block/blk-mq.c  152
1 file changed, 96 insertions, 56 deletions
diff --git a/block/blk-mq.c b/block/blk-mq.c
index d626d32f6e57..4e96bb246247 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -376,6 +376,7 @@ void blk_rq_init(struct request_queue *q, struct request *rq)
INIT_LIST_HEAD(&rq->queuelist);
rq->q = q;
rq->__sector = (sector_t) -1;
+ rq->phys_gap_bit = 0;
INIT_HLIST_NODE(&rq->hash);
RB_CLEAR_NODE(&rq->rb_node);
rq->tag = BLK_MQ_NO_TAG;
@@ -467,21 +468,26 @@ __blk_mq_alloc_requests_batch(struct blk_mq_alloc_data *data)
unsigned long tag_mask;
int i, nr = 0;
- tag_mask = blk_mq_get_tags(data, data->nr_tags, &tag_offset);
- if (unlikely(!tag_mask))
- return NULL;
+ do {
+ tag_mask = blk_mq_get_tags(data, data->nr_tags - nr, &tag_offset);
+ if (unlikely(!tag_mask)) {
+ if (nr == 0)
+ return NULL;
+ break;
+ }
+ tags = blk_mq_tags_from_data(data);
+ for (i = 0; tag_mask; i++) {
+ if (!(tag_mask & (1UL << i)))
+ continue;
+ tag = tag_offset + i;
+ prefetch(tags->static_rqs[tag]);
+ tag_mask &= ~(1UL << i);
+ rq = blk_mq_rq_ctx_init(data, tags, tag);
+ rq_list_add_head(data->cached_rqs, rq);
+ nr++;
+ }
+ } while (data->nr_tags > nr);
- tags = blk_mq_tags_from_data(data);
- for (i = 0; tag_mask; i++) {
- if (!(tag_mask & (1UL << i)))
- continue;
- tag = tag_offset + i;
- prefetch(tags->static_rqs[tag]);
- tag_mask &= ~(1UL << i);
- rq = blk_mq_rq_ctx_init(data, tags, tag);
- rq_list_add_head(data->cached_rqs, rq);
- nr++;
- }
if (!(data->rq_flags & RQF_SCHED_TAGS))
blk_mq_add_active_requests(data->hctx, nr);
/* caller already holds a reference, add for remainder */
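
The rewritten allocator wraps the tag grab in a do/while loop: each pass asks blk_mq_get_tags() only for the remaining count (data->nr_tags - nr), walks the returned bitmask one set bit at a time, and keeps retrying until the whole batch is built or no tags are left, in which case a partial batch is kept as long as at least one request was set up. A minimal self-contained sketch of that retry-and-walk-the-bitmask pattern; fake_get_tags() and the single-word pool below are illustrative assumptions, not blk-mq code:

#include <stdio.h>

/* Pretend tag pool: grants up to 'want' tags as a bitmask, or 0 when empty. */
static unsigned long fake_get_tags(unsigned long *pool, int want, unsigned int *offset)
{
        unsigned long mask = 0;
        unsigned int i;

        *offset = 0;
        for (i = 0; i < 8 * sizeof(*pool) && want; i++) {
                if (*pool & (1UL << i)) {
                        *pool &= ~(1UL << i);
                        mask |= 1UL << i;
                        want--;
                }
        }
        return mask;
}

int main(void)
{
        unsigned long pool = 0x13;      /* only three tags free right now */
        int nr = 0, nr_tags = 5;

        do {
                unsigned int tag_offset;
                unsigned long tag_mask = fake_get_tags(&pool, nr_tags - nr, &tag_offset);
                int i;

                if (!tag_mask)
                        break;          /* keep the partial batch we already have */
                /* Walk the grant one set bit at a time, as the hunk above does. */
                for (i = 0; tag_mask; i++) {
                        if (!(tag_mask & (1UL << i)))
                                continue;
                        tag_mask &= ~(1UL << i);
                        printf("allocated tag %u\n", tag_offset + i);
                        nr++;
                }
        } while (nr_tags > nr);

        printf("batch size: %d of %d\n", nr, nr_tags);
        return 0;
}
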
@@ -668,6 +674,7 @@ struct request *blk_mq_alloc_request(struct request_queue *q, blk_opf_t opf,
goto out_queue_exit;
}
rq->__data_len = 0;
+ rq->phys_gap_bit = 0;
rq->__sector = (sector_t) -1;
rq->bio = rq->biotail = NULL;
return rq;
@@ -723,7 +730,7 @@ struct request *blk_mq_alloc_request_hctx(struct request_queue *q,
* If not tell the caller that it should skip this queue.
*/
ret = -EXDEV;
- data.hctx = xa_load(&q->hctx_table, hctx_idx);
+ data.hctx = q->queue_hw_ctx[hctx_idx];
if (!blk_mq_hw_queue_mapped(data.hctx))
goto out_queue_exit;
cpu = cpumask_first_and(data.hctx->cpumask, cpu_online_mask);
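
With the hardware contexts kept in a plain pointer array, this lookup becomes a direct index into q->queue_hw_ctx rather than an xa_load() walk. A tiny sketch of the lookup shape this gives the hot path; the structs and the bounds check are illustrative and not lifted from blk-mq:

#include <stdio.h>

struct fake_hctx { unsigned int queue_num; };

struct fake_queue {
        struct fake_hctx **queue_hw_ctx;        /* flat pointer array */
        unsigned int nr_hw_queues;
};

/* O(1) lookup: a plain index, no xarray traversal on the hot path. */
static struct fake_hctx *hctx_lookup(struct fake_queue *q, unsigned int idx)
{
        if (idx >= q->nr_hw_queues)             /* illustrative bounds check */
                return NULL;
        return q->queue_hw_ctx[idx];
}

int main(void)
{
        struct fake_hctx h0 = { .queue_num = 0 }, h1 = { .queue_num = 1 };
        struct fake_hctx *table[] = { &h0, &h1 };
        struct fake_queue q = { .queue_hw_ctx = table, .nr_hw_queues = 2 };
        struct fake_hctx *hctx = hctx_lookup(&q, 1);

        if (hctx)
                printf("found hctx %u\n", hctx->queue_num);
        return 0;
}
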
@@ -748,6 +755,7 @@ struct request *blk_mq_alloc_request_hctx(struct request_queue *q,
rq = blk_mq_rq_ctx_init(&data, blk_mq_tags_from_data(&data), tag);
blk_mq_rq_time_init(rq, alloc_time_ns);
rq->__data_len = 0;
+ rq->phys_gap_bit = 0;
rq->__sector = (sector_t) -1;
rq->bio = rq->biotail = NULL;
return rq;
@@ -2674,6 +2682,8 @@ static void blk_mq_bio_to_request(struct request *rq, struct bio *bio,
rq->bio = rq->biotail = bio;
rq->__sector = bio->bi_iter.bi_sector;
rq->__data_len = bio->bi_iter.bi_size;
+ rq->phys_gap_bit = bio->bi_bvec_gap_bit;
+
rq->nr_phys_segments = nr_segs;
if (bio_integrity(bio))
rq->nr_integrity_segments = blk_rq_count_integrity_sg(rq->q,
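
When the bio is attached to the request, its gap bit is copied into rq->phys_gap_bit so the value survives at the request level alongside the sector and length; this diff does not show where the bit is consumed, so its exact semantics are not asserted here. A small sketch of the copy-per-bio-attribute pattern, with field names that are assumptions rather than kernel layouts:

#include <stdio.h>

/* Illustrative stand-ins; field names are assumptions, not kernel layouts. */
struct fake_bio {
        unsigned long long sector;
        unsigned int size;
        unsigned char bvec_gap_bit;
};

struct fake_request {
        unsigned long long sector;
        unsigned int data_len;
        unsigned char phys_gap_bit;
};

/* Copy the per-bio attributes the request must keep once the bio is attached. */
static void bio_to_request(struct fake_request *rq, const struct fake_bio *bio)
{
        rq->sector = bio->sector;
        rq->data_len = bio->size;
        rq->phys_gap_bit = bio->bvec_gap_bit;   /* mirrors the hunk above */
}

int main(void)
{
        struct fake_bio bio = { .sector = 2048, .size = 4096, .bvec_gap_bit = 1 };
        struct fake_request rq = { 0 };

        bio_to_request(&rq, &bio);
        printf("rq gap bit %u, len %u\n", rq.phys_gap_bit, rq.data_len);
        return 0;
}
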
@@ -3380,6 +3390,7 @@ int blk_rq_prep_clone(struct request *rq, struct request *rq_src,
}
rq->nr_phys_segments = rq_src->nr_phys_segments;
rq->nr_integrity_segments = rq_src->nr_integrity_segments;
+ rq->phys_gap_bit = rq_src->phys_gap_bit;
if (rq->bio && blk_crypto_rq_bio_prep(rq, rq->bio, gfp_mask) < 0)
goto free_and_out;
@@ -3935,8 +3946,6 @@ static void blk_mq_exit_hctx(struct request_queue *q,
blk_free_flush_queue_callback);
hctx->fq = NULL;
- xa_erase(&q->hctx_table, hctx_idx);
-
spin_lock(&q->unused_hctx_lock);
list_add(&hctx->hctx_list, &q->unused_hctx_list);
spin_unlock(&q->unused_hctx_lock);
@@ -3978,14 +3987,8 @@ static int blk_mq_init_hctx(struct request_queue *q,
hctx->numa_node))
goto exit_hctx;
- if (xa_insert(&q->hctx_table, hctx_idx, hctx, GFP_KERNEL))
- goto exit_flush_rq;
-
return 0;
- exit_flush_rq:
- if (set->ops->exit_request)
- set->ops->exit_request(set, hctx->fq->flush_rq, hctx_idx);
exit_hctx:
if (set->ops->exit_hctx)
set->ops->exit_hctx(hctx, hctx_idx);
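
Because publishing the hctx into a lookup table is no longer a separate step that can fail, blk_mq_init_hctx() loses the xa_insert() failure point together with its exit_flush_rq unwind label, and blk_mq_exit_hctx() no longer needs the matching xa_erase(). The sketch below only illustrates the goto-unwind convention being trimmed; the step and buffer names are invented for illustration:

#include <stdio.h>
#include <stdlib.h>

/* Illustrative init path using the kernel's goto-unwind convention. */
static int init_object(void)
{
        char *flush_buf = malloc(32);
        char *driver_priv;

        if (!flush_buf)
                goto fail;
        driver_priv = malloc(64);
        if (!driver_priv)
                goto free_flush_buf;

        /*
         * A table-insert step here would need its own failure label; with
         * that step gone, the label and its unwind branch go away too.
         */
        printf("init complete\n");
        free(driver_priv);
        free(flush_buf);
        return 0;

free_flush_buf:
        free(flush_buf);
fail:
        return -1;
}

int main(void)
{
        return init_object() ? 1 : 0;
}
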
@@ -4374,7 +4377,7 @@ void blk_mq_release(struct request_queue *q)
kobject_put(&hctx->kobj);
}
- xa_destroy(&q->hctx_table);
+ kfree(q->queue_hw_ctx);
/*
* release .mq_kobj and sw queue's kobject now because
@@ -4518,26 +4521,49 @@ static struct blk_mq_hw_ctx *blk_mq_alloc_and_init_hctx(
static void __blk_mq_realloc_hw_ctxs(struct blk_mq_tag_set *set,
struct request_queue *q)
{
- struct blk_mq_hw_ctx *hctx;
- unsigned long i, j;
+ int i, j, end;
+ struct blk_mq_hw_ctx **hctxs = q->queue_hw_ctx;
+
+ if (q->nr_hw_queues < set->nr_hw_queues) {
+ struct blk_mq_hw_ctx **new_hctxs;
+
+ new_hctxs = kcalloc_node(set->nr_hw_queues,
+ sizeof(*new_hctxs), GFP_KERNEL,
+ set->numa_node);
+ if (!new_hctxs)
+ return;
+ if (hctxs)
+ memcpy(new_hctxs, hctxs, q->nr_hw_queues *
+ sizeof(*hctxs));
+ rcu_assign_pointer(q->queue_hw_ctx, new_hctxs);
+ /*
+ * Make sure reading the old queue_hw_ctx from other
+ * context concurrently won't trigger uaf.
+ */
+ synchronize_rcu_expedited();
+ kfree(hctxs);
+ hctxs = new_hctxs;
+ }
for (i = 0; i < set->nr_hw_queues; i++) {
int old_node;
int node = blk_mq_get_hctx_node(set, i);
- struct blk_mq_hw_ctx *old_hctx = xa_load(&q->hctx_table, i);
+ struct blk_mq_hw_ctx *old_hctx = hctxs[i];
if (old_hctx) {
old_node = old_hctx->numa_node;
blk_mq_exit_hctx(q, set, old_hctx, i);
}
- if (!blk_mq_alloc_and_init_hctx(set, q, i, node)) {
+ hctxs[i] = blk_mq_alloc_and_init_hctx(set, q, i, node);
+ if (!hctxs[i]) {
if (!old_hctx)
break;
pr_warn("Allocate new hctx on node %d fails, fallback to previous one on node %d\n",
node, old_node);
- hctx = blk_mq_alloc_and_init_hctx(set, q, i, old_node);
- WARN_ON_ONCE(!hctx);
+ hctxs[i] = blk_mq_alloc_and_init_hctx(set, q, i,
+ old_node);
+ WARN_ON_ONCE(!hctxs[i]);
}
}
/*
@@ -4546,13 +4572,21 @@ static void __blk_mq_realloc_hw_ctxs(struct blk_mq_tag_set *set,
*/
if (i != set->nr_hw_queues) {
j = q->nr_hw_queues;
+ end = i;
} else {
j = i;
+ end = q->nr_hw_queues;
q->nr_hw_queues = set->nr_hw_queues;
}
- xa_for_each_start(&q->hctx_table, j, hctx, j)
- blk_mq_exit_hctx(q, set, hctx, j);
+ for (; j < end; j++) {
+ struct blk_mq_hw_ctx *hctx = hctxs[j];
+
+ if (hctx) {
+ blk_mq_exit_hctx(q, set, hctx, j);
+ hctxs[j] = NULL;
+ }
+ }
}
static void blk_mq_realloc_hw_ctxs(struct blk_mq_tag_set *set,
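
Growing the hctx array above is copy-then-publish: a larger array is allocated, the old entries are memcpy'd across, the new pointer is published with rcu_assign_pointer(), and only after synchronize_rcu_expedited() guarantees no concurrent reader still holds the old queue_hw_ctx (the use-after-free the hunk's comment warns about) is the old array freed. A simplified userspace sketch of that publish-then-reclaim ordering, using a C11 release store in place of rcu_assign_pointer() and a comment where the grace-period wait would sit; readers and the locking around writers are omitted:

#include <stdatomic.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

struct hw_ctx { int id; };

static _Atomic(struct hw_ctx **) table;         /* published pointer array */
static size_t table_len;

/* Grow the array: allocate, copy, publish, then reclaim the old copy. */
static int grow_table(size_t new_len)
{
        struct hw_ctx **old_tbl = atomic_load_explicit(&table, memory_order_relaxed);
        struct hw_ctx **new_tbl = calloc(new_len, sizeof(*new_tbl));

        if (!new_tbl)
                return -1;
        if (old_tbl)
                memcpy(new_tbl, old_tbl, table_len * sizeof(*old_tbl));

        /*
         * Release store: a concurrent reader sees either the old array or
         * the fully copied new one, never a half-initialized pointer.
         */
        atomic_store_explicit(&table, new_tbl, memory_order_release);

        /*
         * In the kernel, synchronize_rcu_expedited() waits here until no
         * reader can still hold the old pointer; only then is it freed.
         */
        free(old_tbl);
        table_len = new_len;
        return 0;
}

int main(void)
{
        if (grow_table(4) || grow_table(8))
                return 1;
        printf("table grown to %zu slots\n", table_len);
        free(atomic_load_explicit(&table, memory_order_relaxed));
        return 0;
}
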
@@ -4588,8 +4622,6 @@ int blk_mq_init_allocated_queue(struct blk_mq_tag_set *set,
INIT_LIST_HEAD(&q->unused_hctx_list);
spin_lock_init(&q->unused_hctx_lock);
- xa_init(&q->hctx_table);
-
blk_mq_realloc_hw_ctxs(set, q);
if (!q->nr_hw_queues)
goto err_hctxs;
@@ -4983,27 +5015,28 @@ struct elevator_tags *blk_mq_update_nr_requests(struct request_queue *q,
* Switch back to the elevator type stored in the xarray.
*/
static void blk_mq_elv_switch_back(struct request_queue *q,
- struct xarray *elv_tbl, struct xarray *et_tbl)
+ struct xarray *elv_tbl)
{
- struct elevator_type *e = xa_load(elv_tbl, q->id);
- struct elevator_tags *t = xa_load(et_tbl, q->id);
+ struct elv_change_ctx *ctx = xa_load(elv_tbl, q->id);
+
+ if (WARN_ON_ONCE(!ctx))
+ return;
/* The elv_update_nr_hw_queues unfreezes the queue. */
- elv_update_nr_hw_queues(q, e, t);
+ elv_update_nr_hw_queues(q, ctx);
/* Drop the reference acquired in blk_mq_elv_switch_none. */
- if (e)
- elevator_put(e);
+ if (ctx->type)
+ elevator_put(ctx->type);
}
/*
- * Stores elevator type in xarray and set current elevator to none. It uses
- * q->id as an index to store the elevator type into the xarray.
+ * Stores elevator name and type in ctx and sets current elevator to none.
*/
static int blk_mq_elv_switch_none(struct request_queue *q,
struct xarray *elv_tbl)
{
- int ret = 0;
+ struct elv_change_ctx *ctx;
lockdep_assert_held_write(&q->tag_set->update_nr_hwq_lock);
@@ -5015,10 +5048,11 @@ static int blk_mq_elv_switch_none(struct request_queue *q,
* can't run concurrently.
*/
if (q->elevator) {
- ret = xa_insert(elv_tbl, q->id, q->elevator->type, GFP_KERNEL);
- if (WARN_ON_ONCE(ret))
- return ret;
+ ctx = xa_load(elv_tbl, q->id);
+ if (WARN_ON_ONCE(!ctx))
+ return -ENOENT;
+ ctx->name = q->elevator->type->elevator_name;
/*
* Before we switch elevator to 'none', take a reference to
@@ -5029,9 +5063,14 @@ static int blk_mq_elv_switch_none(struct request_queue *q,
*/
__elevator_get(q->elevator->type);
+ /*
+ * Store elevator type so that we can release the reference
+ * taken above later.
+ */
+ ctx->type = q->elevator->type;
elevator_set_none(q);
}
- return ret;
+ return 0;
}
static void __blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set,
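
Instead of parking the elevator type in one xarray and the elevator tags in a second (et_tbl), both switch_none and switch_back now go through a single per-queue elv_change_ctx looked up by q->id: it records the elevator name and holds the type reference taken by __elevator_get() until switch_back drops it with elevator_put(). A standalone sketch of that one-context-per-queue bookkeeping; the struct fields and helpers are illustrative assumptions, with a plain array standing in for the xarray keyed by queue id:

#include <stdio.h>

#define MAX_QUEUES 4

/* Illustrative stand-ins, not the kernel definitions. */
struct elevator_type { const char *elevator_name; int refcount; };

struct elv_change_ctx {                 /* one per queue, keyed by q->id */
        const char *name;               /* elevator to restore later */
        struct elevator_type *type;     /* reference dropped at switch-back */
};

static struct elv_change_ctx ctx_tbl[MAX_QUEUES];   /* stand-in for the xarray */

static void switch_none(int qid, struct elevator_type *cur)
{
        struct elv_change_ctx *ctx = &ctx_tbl[qid];

        ctx->name = cur->elevator_name;
        cur->refcount++;                /* models __elevator_get() */
        ctx->type = cur;
        /* ... the queue's elevator would be set to none here ... */
}

static void switch_back(int qid)
{
        struct elv_change_ctx *ctx = &ctx_tbl[qid];

        printf("restoring elevator '%s' on queue %d\n", ctx->name, qid);
        if (ctx->type)
                ctx->type->refcount--;  /* models elevator_put() */
}

int main(void)
{
        struct elevator_type mq_deadline = { .elevator_name = "mq-deadline" };

        switch_none(1, &mq_deadline);
        switch_back(1);
        printf("refcount back to %d\n", mq_deadline.refcount);
        return 0;
}
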
@@ -5041,7 +5080,7 @@ static void __blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set,
int prev_nr_hw_queues = set->nr_hw_queues;
unsigned int memflags;
int i;
- struct xarray elv_tbl, et_tbl;
+ struct xarray elv_tbl;
bool queues_frozen = false;
lockdep_assert_held(&set->tag_list_lock);
@@ -5055,11 +5094,12 @@ static void __blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set,
memflags = memalloc_noio_save();
- xa_init(&et_tbl);
- if (blk_mq_alloc_sched_tags_batch(&et_tbl, set, nr_hw_queues) < 0)
- goto out_memalloc_restore;
-
xa_init(&elv_tbl);
+ if (blk_mq_alloc_sched_ctx_batch(&elv_tbl, set) < 0)
+ goto out_free_ctx;
+
+ if (blk_mq_alloc_sched_res_batch(&elv_tbl, set, nr_hw_queues) < 0)
+ goto out_free_ctx;
list_for_each_entry(q, &set->tag_list, tag_set_list) {
blk_mq_debugfs_unregister_hctxs(q);
@@ -5105,7 +5145,7 @@ switch_back:
/* switch_back expects queue to be frozen */
if (!queues_frozen)
blk_mq_freeze_queue_nomemsave(q);
- blk_mq_elv_switch_back(q, &elv_tbl, &et_tbl);
+ blk_mq_elv_switch_back(q, &elv_tbl);
}
list_for_each_entry(q, &set->tag_list, tag_set_list) {
@@ -5116,9 +5156,9 @@ switch_back:
blk_mq_add_hw_queues_cpuhp(q);
}
+out_free_ctx:
+ blk_mq_free_sched_ctx_batch(&elv_tbl);
xa_destroy(&elv_tbl);
- xa_destroy(&et_tbl);
-out_memalloc_restore:
memalloc_noio_restore(memflags);
/* Free the excess tags when nr_hw_queues shrink. */
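
The nr_hw_queues update now allocates all per-queue contexts up front (blk_mq_alloc_sched_ctx_batch) and then the scheduler resources for the new queue count (blk_mq_alloc_sched_res_batch); both failure paths and the success path fall through the single out_free_ctx label that frees the batch before the xarray is destroyed. A small sketch of that allocate-everything-up-front, single-cleanup-label structure; the names below are illustrative, not the blk-mq helpers:

#include <stdio.h>
#include <stdlib.h>

#define NR_QUEUES 3

struct sched_ctx { int qid; void *res; };           /* illustrative */

static struct sched_ctx *ctxs[NR_QUEUES];

static int alloc_ctx_batch(void)
{
        for (int i = 0; i < NR_QUEUES; i++) {
                ctxs[i] = calloc(1, sizeof(*ctxs[i]));
                if (!ctxs[i])
                        return -1;                  /* partial batch freed by the caller */
                ctxs[i]->qid = i;
        }
        return 0;
}

static void free_ctx_batch(void)
{
        for (int i = 0; i < NR_QUEUES; i++) {
                if (!ctxs[i])
                        continue;
                free(ctxs[i]->res);
                free(ctxs[i]);
                ctxs[i] = NULL;
        }
}

static int update_queues(void)
{
        int ret = -1;

        if (alloc_ctx_batch() < 0)
                goto out_free_ctx;
        for (int i = 0; i < NR_QUEUES; i++) {       /* second allocation stage */
                ctxs[i]->res = malloc(64);
                if (!ctxs[i]->res)
                        goto out_free_ctx;
        }
        printf("all %d contexts ready, doing the update\n", NR_QUEUES);
        ret = 0;

out_free_ctx:                                       /* success and failure both land here */
        free_ctx_batch();
        return ret;
}

int main(void)
{
        return update_queues() ? 1 : 0;
}
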
@@ -5168,7 +5208,7 @@ int blk_mq_poll(struct request_queue *q, blk_qc_t cookie,
{
if (!blk_mq_can_poll(q))
return 0;
- return blk_hctx_poll(q, xa_load(&q->hctx_table, cookie), iob, flags);
+ return blk_hctx_poll(q, q->queue_hw_ctx[cookie], iob, flags);
}
int blk_rq_poll(struct request *rq, struct io_comp_batch *iob,