From af2c68fe94e8c0a628519b60ba070c5cf6526a99 Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Thu, 1 Aug 2019 15:50:40 -0700 Subject: block: Declare several function pointer arguments 'const' Make it clear to the compiler and also to humans that the functions that query request queue properties do not modify any member of the request_queue data structure. Reviewed-by: Johannes Thumshirn Cc: Christoph Hellwig Cc: Ming Lei Cc: Hannes Reinecke Signed-off-by: Bart Van Assche Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 32 ++++++++++++++++---------------- 1 file changed, 16 insertions(+), 16 deletions(-) (limited to 'include/linux/blkdev.h') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 1ef375dafb1c..96a29a72fd4a 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -1232,42 +1232,42 @@ enum blk_default_limits { BLK_SEG_BOUNDARY_MASK = 0xFFFFFFFFUL, }; -static inline unsigned long queue_segment_boundary(struct request_queue *q) +static inline unsigned long queue_segment_boundary(const struct request_queue *q) { return q->limits.seg_boundary_mask; } -static inline unsigned long queue_virt_boundary(struct request_queue *q) +static inline unsigned long queue_virt_boundary(const struct request_queue *q) { return q->limits.virt_boundary_mask; } -static inline unsigned int queue_max_sectors(struct request_queue *q) +static inline unsigned int queue_max_sectors(const struct request_queue *q) { return q->limits.max_sectors; } -static inline unsigned int queue_max_hw_sectors(struct request_queue *q) +static inline unsigned int queue_max_hw_sectors(const struct request_queue *q) { return q->limits.max_hw_sectors; } -static inline unsigned short queue_max_segments(struct request_queue *q) +static inline unsigned short queue_max_segments(const struct request_queue *q) { return q->limits.max_segments; } -static inline unsigned short queue_max_discard_segments(struct request_queue *q) +static inline unsigned short queue_max_discard_segments(const struct request_queue *q) { return q->limits.max_discard_segments; } -static inline unsigned int queue_max_segment_size(struct request_queue *q) +static inline unsigned int queue_max_segment_size(const struct request_queue *q) { return q->limits.max_segment_size; } -static inline unsigned short queue_logical_block_size(struct request_queue *q) +static inline unsigned short queue_logical_block_size(const struct request_queue *q) { int retval = 512; @@ -1282,7 +1282,7 @@ static inline unsigned short bdev_logical_block_size(struct block_device *bdev) return queue_logical_block_size(bdev_get_queue(bdev)); } -static inline unsigned int queue_physical_block_size(struct request_queue *q) +static inline unsigned int queue_physical_block_size(const struct request_queue *q) { return q->limits.physical_block_size; } @@ -1292,7 +1292,7 @@ static inline unsigned int bdev_physical_block_size(struct block_device *bdev) return queue_physical_block_size(bdev_get_queue(bdev)); } -static inline unsigned int queue_io_min(struct request_queue *q) +static inline unsigned int queue_io_min(const struct request_queue *q) { return q->limits.io_min; } @@ -1302,7 +1302,7 @@ static inline int bdev_io_min(struct block_device *bdev) return queue_io_min(bdev_get_queue(bdev)); } -static inline unsigned int queue_io_opt(struct request_queue *q) +static inline unsigned int queue_io_opt(const struct request_queue *q) { return q->limits.io_opt; } @@ -1312,7 +1312,7 @@ static inline int bdev_io_opt(struct block_device *bdev) return queue_io_opt(bdev_get_queue(bdev)); } -static inline int queue_alignment_offset(struct request_queue *q) +static inline int queue_alignment_offset(const struct request_queue *q) { if (q->limits.misaligned) return -1; @@ -1342,7 +1342,7 @@ static inline int bdev_alignment_offset(struct block_device *bdev) return q->limits.alignment_offset; } -static inline int queue_discard_alignment(struct request_queue *q) +static inline int queue_discard_alignment(const struct request_queue *q) { if (q->limits.discard_misaligned) return -1; @@ -1432,7 +1432,7 @@ static inline sector_t bdev_zone_sectors(struct block_device *bdev) return 0; } -static inline int queue_dma_alignment(struct request_queue *q) +static inline int queue_dma_alignment(const struct request_queue *q) { return q ? q->dma_alignment : 511; } @@ -1543,7 +1543,7 @@ static inline void blk_queue_max_integrity_segments(struct request_queue *q, } static inline unsigned short -queue_max_integrity_segments(struct request_queue *q) +queue_max_integrity_segments(const struct request_queue *q) { return q->limits.max_integrity_segments; } @@ -1626,7 +1626,7 @@ static inline void blk_queue_max_integrity_segments(struct request_queue *q, unsigned int segs) { } -static inline unsigned short queue_max_integrity_segments(struct request_queue *q) +static inline unsigned short queue_max_integrity_segments(const struct request_queue *q) { return 0; } -- cgit v1.2.3 From e84e8f0663956f45c747df5629046794cff93893 Mon Sep 17 00:00:00 2001 From: Chaitanya Kulkarni Date: Thu, 1 Aug 2019 10:26:35 -0700 Subject: block: add req op to reset all zones and flag This patch introduces a new request operation REQ_OP_ZONE_RESET_ALL. This is useful for the applications like mkfs where it needs to reset all the zones present on the underlying block device. As part for this patch we also introduce new QUEUE_FLAG_ZONE_RESETALL which indicates the queue zone reset all capability and corresponding helper macro. Reviewed-by: Damien Le Moal Reviewed-by: Hannes Reinecke Signed-off-by: Chaitanya Kulkarni Signed-off-by: Jens Axboe --- include/linux/blk_types.h | 2 ++ include/linux/blkdev.h | 3 +++ 2 files changed, 5 insertions(+) (limited to 'include/linux/blkdev.h') diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index 1b1fa1557e68..d6ce7b3ec8b1 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h @@ -282,6 +282,8 @@ enum req_opf { REQ_OP_ZONE_RESET = 6, /* write the same sector many times */ REQ_OP_WRITE_SAME = 7, + /* reset all the zone present on the device */ + REQ_OP_ZONE_RESET_ALL = 8, /* write the zero filled sector many times */ REQ_OP_WRITE_ZEROES = 9, diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 96a29a72fd4a..167bf879f072 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -611,6 +611,7 @@ struct request_queue { #define QUEUE_FLAG_SCSI_PASSTHROUGH 23 /* queue supports SCSI commands */ #define QUEUE_FLAG_QUIESCED 24 /* queue has been quiesced */ #define QUEUE_FLAG_PCI_P2PDMA 25 /* device supports PCI p2p requests */ +#define QUEUE_FLAG_ZONE_RESETALL 26 /* supports Zone Reset All */ #define QUEUE_FLAG_MQ_DEFAULT ((1 << QUEUE_FLAG_IO_STAT) | \ (1 << QUEUE_FLAG_SAME_COMP)) @@ -630,6 +631,8 @@ bool blk_queue_flag_test_and_set(unsigned int flag, struct request_queue *q); #define blk_queue_io_stat(q) test_bit(QUEUE_FLAG_IO_STAT, &(q)->queue_flags) #define blk_queue_add_random(q) test_bit(QUEUE_FLAG_ADD_RANDOM, &(q)->queue_flags) #define blk_queue_discard(q) test_bit(QUEUE_FLAG_DISCARD, &(q)->queue_flags) +#define blk_queue_zone_resetall(q) \ + test_bit(QUEUE_FLAG_ZONE_RESETALL, &(q)->queue_flags) #define blk_queue_secure_erase(q) \ (test_bit(QUEUE_FLAG_SECERASE, &(q)->queue_flags)) #define blk_queue_dax(q) test_bit(QUEUE_FLAG_DAX, &(q)->queue_flags) -- cgit v1.2.3 From 988721db93b2f5e6477cb0ea0b64ba9bcfd67778 Mon Sep 17 00:00:00 2001 From: Junxiao Bi Date: Fri, 16 Aug 2019 14:12:33 -0700 Subject: block: remove struct request_queue queue_head The dispatch list is not used any more, as the legacy block IO stack has been removed. Reviewed-by: Bart Van Assche Reviewed-by: Ming Lei Signed-off-by: Junxiao Bi Signed-off-by: Jens Axboe --- block/blk-core.c | 1 - include/linux/blkdev.h | 4 ---- 2 files changed, 5 deletions(-) (limited to 'include/linux/blkdev.h') diff --git a/block/blk-core.c b/block/blk-core.c index 834aea04718f..5d0d7441a443 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -482,7 +482,6 @@ struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id) if (!q) return NULL; - INIT_LIST_HEAD(&q->queue_head); q->last_merge = NULL; q->id = ida_simple_get(&blk_queue_ida, 0, 0, gfp_mask); diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 167bf879f072..4798bb25f1ee 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -391,10 +391,6 @@ static inline int blkdev_reset_zones_ioctl(struct block_device *bdev, #endif /* CONFIG_BLK_DEV_ZONED */ struct request_queue { - /* - * Together with queue_head for cacheline sharing - */ - struct list_head queue_head; struct request *last_merge; struct elevator_queue *elevator; -- cgit v1.2.3 From 58c898ba370e68d39470cd0d932b524682c1f9be Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Tue, 27 Aug 2019 19:01:47 +0800 Subject: block: add helper for checking if queue is registered There are 4 users which check if queue is registered, so add one helper to check it. Cc: Christoph Hellwig Cc: Hannes Reinecke Cc: Greg KH Cc: Mike Snitzer Cc: Bart Van Assche Reviewed-by: Bart Van Assche Signed-off-by: Ming Lei Signed-off-by: Jens Axboe --- block/blk-sysfs.c | 4 ++-- block/blk-wbt.c | 2 +- block/elevator.c | 2 +- include/linux/blkdev.h | 1 + 4 files changed, 5 insertions(+), 4 deletions(-) (limited to 'include/linux/blkdev.h') diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index 977c659dcd18..5b0b5224cfd4 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c @@ -942,7 +942,7 @@ int blk_register_queue(struct gendisk *disk) if (WARN_ON(!q)) return -ENXIO; - WARN_ONCE(test_bit(QUEUE_FLAG_REGISTERED, &q->queue_flags), + WARN_ONCE(blk_queue_registered(q), "%s is registering an already registered queue\n", kobject_name(&dev->kobj)); blk_queue_flag_set(QUEUE_FLAG_REGISTERED, q); @@ -1026,7 +1026,7 @@ void blk_unregister_queue(struct gendisk *disk) return; /* Return early if disk->queue was never registered. */ - if (!test_bit(QUEUE_FLAG_REGISTERED, &q->queue_flags)) + if (!blk_queue_registered(q)) return; /* diff --git a/block/blk-wbt.c b/block/blk-wbt.c index 313f45a37e9d..c4d3089e47f7 100644 --- a/block/blk-wbt.c +++ b/block/blk-wbt.c @@ -656,7 +656,7 @@ void wbt_enable_default(struct request_queue *q) return; /* Queue not registered? Maybe shutting down... */ - if (!test_bit(QUEUE_FLAG_REGISTERED, &q->queue_flags)) + if (!blk_queue_registered(q)) return; if (queue_is_mq(q) && IS_ENABLED(CONFIG_BLK_WBT_MQ)) diff --git a/block/elevator.c b/block/elevator.c index 33c15fb54ed1..03d923196569 100644 --- a/block/elevator.c +++ b/block/elevator.c @@ -656,7 +656,7 @@ static int __elevator_change(struct request_queue *q, const char *name) struct elevator_type *e; /* Make sure queue is not in the middle of being removed */ - if (!test_bit(QUEUE_FLAG_REGISTERED, &q->queue_flags)) + if (!blk_queue_registered(q)) return -ENOENT; /* diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 4798bb25f1ee..d5077f3fdfd6 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -643,6 +643,7 @@ bool blk_queue_flag_test_and_set(unsigned int flag, struct request_queue *q); #define blk_queue_quiesced(q) test_bit(QUEUE_FLAG_QUIESCED, &(q)->queue_flags) #define blk_queue_pm_only(q) atomic_read(&(q)->pm_only) #define blk_queue_fua(q) test_bit(QUEUE_FLAG_FUA, &(q)->queue_flags) +#define blk_queue_registered(q) test_bit(QUEUE_FLAG_REGISTERED, &(q)->queue_flags) extern void blk_set_pm_only(struct request_queue *q); extern void blk_clear_pm_only(struct request_queue *q); -- cgit v1.2.3 From cecf5d87ff2035127bb5a9ee054d0023a4a7cad3 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Tue, 27 Aug 2019 19:01:48 +0800 Subject: block: split .sysfs_lock into two locks The kernfs built-in lock of 'kn->count' is held in sysfs .show/.store path. Meantime, inside block's .show/.store callback, q->sysfs_lock is required. However, when mq & iosched kobjects are removed via blk_mq_unregister_dev() & elv_unregister_queue(), q->sysfs_lock is held too. This way causes AB-BA lock because the kernfs built-in lock of 'kn-count' is required inside kobject_del() too, see the lockdep warning[1]. On the other hand, it isn't necessary to acquire q->sysfs_lock for both blk_mq_unregister_dev() & elv_unregister_queue() because clearing REGISTERED flag prevents storing to 'queue/scheduler' from being happened. Also sysfs write(store) is exclusive, so no necessary to hold the lock for elv_unregister_queue() when it is called in switching elevator path. So split .sysfs_lock into two: one is still named as .sysfs_lock for covering sync .store, the other one is named as .sysfs_dir_lock for covering kobjects and related status change. sysfs itself can handle the race between add/remove kobjects and showing/storing attributes under kobjects. For switching scheduler via storing to 'queue/scheduler', we use the queue flag of QUEUE_FLAG_REGISTERED with .sysfs_lock for avoiding the race, then we can avoid to hold .sysfs_lock during removing/adding kobjects. [1] lockdep warning ====================================================== WARNING: possible circular locking dependency detected 5.3.0-rc3-00044-g73277fc75ea0 #1380 Not tainted ------------------------------------------------------ rmmod/777 is trying to acquire lock: 00000000ac50e981 (kn->count#202){++++}, at: kernfs_remove_by_name_ns+0x59/0x72 but task is already holding lock: 00000000fb16ae21 (&q->sysfs_lock){+.+.}, at: blk_unregister_queue+0x78/0x10b which lock already depends on the new lock. the existing dependency chain (in reverse order) is: -> #1 (&q->sysfs_lock){+.+.}: __lock_acquire+0x95f/0xa2f lock_acquire+0x1b4/0x1e8 __mutex_lock+0x14a/0xa9b blk_mq_hw_sysfs_show+0x63/0xb6 sysfs_kf_seq_show+0x11f/0x196 seq_read+0x2cd/0x5f2 vfs_read+0xc7/0x18c ksys_read+0xc4/0x13e do_syscall_64+0xa7/0x295 entry_SYSCALL_64_after_hwframe+0x49/0xbe -> #0 (kn->count#202){++++}: check_prev_add+0x5d2/0xc45 validate_chain+0xed3/0xf94 __lock_acquire+0x95f/0xa2f lock_acquire+0x1b4/0x1e8 __kernfs_remove+0x237/0x40b kernfs_remove_by_name_ns+0x59/0x72 remove_files+0x61/0x96 sysfs_remove_group+0x81/0xa4 sysfs_remove_groups+0x3b/0x44 kobject_del+0x44/0x94 blk_mq_unregister_dev+0x83/0xdd blk_unregister_queue+0xa0/0x10b del_gendisk+0x259/0x3fa null_del_dev+0x8b/0x1c3 [null_blk] null_exit+0x5c/0x95 [null_blk] __se_sys_delete_module+0x204/0x337 do_syscall_64+0xa7/0x295 entry_SYSCALL_64_after_hwframe+0x49/0xbe other info that might help us debug this: Possible unsafe locking scenario: CPU0 CPU1 ---- ---- lock(&q->sysfs_lock); lock(kn->count#202); lock(&q->sysfs_lock); lock(kn->count#202); *** DEADLOCK *** 2 locks held by rmmod/777: #0: 00000000e69bd9de (&lock){+.+.}, at: null_exit+0x2e/0x95 [null_blk] #1: 00000000fb16ae21 (&q->sysfs_lock){+.+.}, at: blk_unregister_queue+0x78/0x10b stack backtrace: CPU: 0 PID: 777 Comm: rmmod Not tainted 5.3.0-rc3-00044-g73277fc75ea0 #1380 Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS ?-20180724_192412-buildhw-07.phx4 Call Trace: dump_stack+0x9a/0xe6 check_noncircular+0x207/0x251 ? print_circular_bug+0x32a/0x32a ? find_usage_backwards+0x84/0xb0 check_prev_add+0x5d2/0xc45 validate_chain+0xed3/0xf94 ? check_prev_add+0xc45/0xc45 ? mark_lock+0x11b/0x804 ? check_usage_forwards+0x1ca/0x1ca __lock_acquire+0x95f/0xa2f lock_acquire+0x1b4/0x1e8 ? kernfs_remove_by_name_ns+0x59/0x72 __kernfs_remove+0x237/0x40b ? kernfs_remove_by_name_ns+0x59/0x72 ? kernfs_next_descendant_post+0x7d/0x7d ? strlen+0x10/0x23 ? strcmp+0x22/0x44 kernfs_remove_by_name_ns+0x59/0x72 remove_files+0x61/0x96 sysfs_remove_group+0x81/0xa4 sysfs_remove_groups+0x3b/0x44 kobject_del+0x44/0x94 blk_mq_unregister_dev+0x83/0xdd blk_unregister_queue+0xa0/0x10b del_gendisk+0x259/0x3fa ? disk_events_poll_msecs_store+0x12b/0x12b ? check_flags+0x1ea/0x204 ? mark_held_locks+0x1f/0x7a null_del_dev+0x8b/0x1c3 [null_blk] null_exit+0x5c/0x95 [null_blk] __se_sys_delete_module+0x204/0x337 ? free_module+0x39f/0x39f ? blkcg_maybe_throttle_current+0x8a/0x718 ? rwlock_bug+0x62/0x62 ? __blkcg_punt_bio_submit+0xd0/0xd0 ? trace_hardirqs_on_thunk+0x1a/0x20 ? mark_held_locks+0x1f/0x7a ? do_syscall_64+0x4c/0x295 do_syscall_64+0xa7/0x295 entry_SYSCALL_64_after_hwframe+0x49/0xbe RIP: 0033:0x7fb696cdbe6b Code: 73 01 c3 48 8b 0d 1d 20 0c 00 f7 d8 64 89 01 48 83 c8 ff c3 66 2e 0f 1f 84 00 00 008 RSP: 002b:00007ffec9588788 EFLAGS: 00000206 ORIG_RAX: 00000000000000b0 RAX: ffffffffffffffda RBX: 0000559e589137c0 RCX: 00007fb696cdbe6b RDX: 000000000000000a RSI: 0000000000000800 RDI: 0000559e58913828 RBP: 0000000000000000 R08: 00007ffec9587701 R09: 0000000000000000 R10: 00007fb696d4eae0 R11: 0000000000000206 R12: 00007ffec95889b0 R13: 00007ffec95896b3 R14: 0000559e58913260 R15: 0000559e589137c0 Cc: Christoph Hellwig Cc: Hannes Reinecke Cc: Greg KH Cc: Mike Snitzer Reviewed-by: Bart Van Assche Signed-off-by: Ming Lei Signed-off-by: Jens Axboe --- block/blk-core.c | 1 + block/blk-mq-sysfs.c | 12 +++++------ block/blk-sysfs.c | 46 ++++++++++++++++++++++++----------------- block/blk.h | 2 +- block/elevator.c | 55 ++++++++++++++++++++++++++++++++++++++++++-------- include/linux/blkdev.h | 1 + 6 files changed, 84 insertions(+), 33 deletions(-) (limited to 'include/linux/blkdev.h') diff --git a/block/blk-core.c b/block/blk-core.c index 5d0d7441a443..77807a5d7f9e 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -520,6 +520,7 @@ struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id) mutex_init(&q->blk_trace_mutex); #endif mutex_init(&q->sysfs_lock); + mutex_init(&q->sysfs_dir_lock); spin_lock_init(&q->queue_lock); init_waitqueue_head(&q->mq_freeze_wq); diff --git a/block/blk-mq-sysfs.c b/block/blk-mq-sysfs.c index 6ddde3774ebe..a0d3ce30fa08 100644 --- a/block/blk-mq-sysfs.c +++ b/block/blk-mq-sysfs.c @@ -270,7 +270,7 @@ void blk_mq_unregister_dev(struct device *dev, struct request_queue *q) struct blk_mq_hw_ctx *hctx; int i; - lockdep_assert_held(&q->sysfs_lock); + lockdep_assert_held(&q->sysfs_dir_lock); queue_for_each_hw_ctx(q, hctx, i) blk_mq_unregister_hctx(hctx); @@ -320,7 +320,7 @@ int __blk_mq_register_dev(struct device *dev, struct request_queue *q) int ret, i; WARN_ON_ONCE(!q->kobj.parent); - lockdep_assert_held(&q->sysfs_lock); + lockdep_assert_held(&q->sysfs_dir_lock); ret = kobject_add(q->mq_kobj, kobject_get(&dev->kobj), "%s", "mq"); if (ret < 0) @@ -354,7 +354,7 @@ void blk_mq_sysfs_unregister(struct request_queue *q) struct blk_mq_hw_ctx *hctx; int i; - mutex_lock(&q->sysfs_lock); + mutex_lock(&q->sysfs_dir_lock); if (!q->mq_sysfs_init_done) goto unlock; @@ -362,7 +362,7 @@ void blk_mq_sysfs_unregister(struct request_queue *q) blk_mq_unregister_hctx(hctx); unlock: - mutex_unlock(&q->sysfs_lock); + mutex_unlock(&q->sysfs_dir_lock); } int blk_mq_sysfs_register(struct request_queue *q) @@ -370,7 +370,7 @@ int blk_mq_sysfs_register(struct request_queue *q) struct blk_mq_hw_ctx *hctx; int i, ret = 0; - mutex_lock(&q->sysfs_lock); + mutex_lock(&q->sysfs_dir_lock); if (!q->mq_sysfs_init_done) goto unlock; @@ -381,7 +381,7 @@ int blk_mq_sysfs_register(struct request_queue *q) } unlock: - mutex_unlock(&q->sysfs_lock); + mutex_unlock(&q->sysfs_dir_lock); return ret; } diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index 5b0b5224cfd4..107513495220 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c @@ -938,6 +938,7 @@ int blk_register_queue(struct gendisk *disk) int ret; struct device *dev = disk_to_dev(disk); struct request_queue *q = disk->queue; + bool has_elevator = false; if (WARN_ON(!q)) return -ENXIO; @@ -945,7 +946,6 @@ int blk_register_queue(struct gendisk *disk) WARN_ONCE(blk_queue_registered(q), "%s is registering an already registered queue\n", kobject_name(&dev->kobj)); - blk_queue_flag_set(QUEUE_FLAG_REGISTERED, q); /* * SCSI probing may synchronously create and destroy a lot of @@ -965,8 +965,7 @@ int blk_register_queue(struct gendisk *disk) if (ret) return ret; - /* Prevent changes through sysfs until registration is completed. */ - mutex_lock(&q->sysfs_lock); + mutex_lock(&q->sysfs_dir_lock); ret = kobject_add(&q->kobj, kobject_get(&dev->kobj), "%s", "queue"); if (ret < 0) { @@ -987,26 +986,36 @@ int blk_register_queue(struct gendisk *disk) blk_mq_debugfs_register(q); } - kobject_uevent(&q->kobj, KOBJ_ADD); - - wbt_enable_default(q); - - blk_throtl_register_queue(q); - + /* + * The flag of QUEUE_FLAG_REGISTERED isn't set yet, so elevator + * switch won't happen at all. + */ if (q->elevator) { - ret = elv_register_queue(q); + ret = elv_register_queue(q, false); if (ret) { - mutex_unlock(&q->sysfs_lock); - kobject_uevent(&q->kobj, KOBJ_REMOVE); + mutex_unlock(&q->sysfs_dir_lock); kobject_del(&q->kobj); blk_trace_remove_sysfs(dev); kobject_put(&dev->kobj); return ret; } + has_elevator = true; } + + mutex_lock(&q->sysfs_lock); + blk_queue_flag_set(QUEUE_FLAG_REGISTERED, q); + wbt_enable_default(q); + blk_throtl_register_queue(q); + + /* Now everything is ready and send out KOBJ_ADD uevent */ + kobject_uevent(&q->kobj, KOBJ_ADD); + if (has_elevator) + kobject_uevent(&q->elevator->kobj, KOBJ_ADD); + mutex_unlock(&q->sysfs_lock); + ret = 0; unlock: - mutex_unlock(&q->sysfs_lock); + mutex_unlock(&q->sysfs_dir_lock); return ret; } EXPORT_SYMBOL_GPL(blk_register_queue); @@ -1021,6 +1030,7 @@ EXPORT_SYMBOL_GPL(blk_register_queue); void blk_unregister_queue(struct gendisk *disk) { struct request_queue *q = disk->queue; + bool has_elevator; if (WARN_ON(!q)) return; @@ -1035,25 +1045,25 @@ void blk_unregister_queue(struct gendisk *disk) * concurrent elv_iosched_store() calls. */ mutex_lock(&q->sysfs_lock); - blk_queue_flag_clear(QUEUE_FLAG_REGISTERED, q); + has_elevator = !!q->elevator; + mutex_unlock(&q->sysfs_lock); + mutex_lock(&q->sysfs_dir_lock); /* * Remove the sysfs attributes before unregistering the queue data * structures that can be modified through sysfs. */ if (queue_is_mq(q)) blk_mq_unregister_dev(disk_to_dev(disk), q); - mutex_unlock(&q->sysfs_lock); kobject_uevent(&q->kobj, KOBJ_REMOVE); kobject_del(&q->kobj); blk_trace_remove_sysfs(disk_to_dev(disk)); - mutex_lock(&q->sysfs_lock); - if (q->elevator) + if (has_elevator) elv_unregister_queue(q); - mutex_unlock(&q->sysfs_lock); + mutex_unlock(&q->sysfs_dir_lock); kobject_put(&disk_to_dev(disk)->kobj); } diff --git a/block/blk.h b/block/blk.h index de6b2e146d6e..e4619fc5c99a 100644 --- a/block/blk.h +++ b/block/blk.h @@ -188,7 +188,7 @@ int elevator_init_mq(struct request_queue *q); int elevator_switch_mq(struct request_queue *q, struct elevator_type *new_e); void __elevator_exit(struct request_queue *, struct elevator_queue *); -int elv_register_queue(struct request_queue *q); +int elv_register_queue(struct request_queue *q, bool uevent); void elv_unregister_queue(struct request_queue *q); static inline void elevator_exit(struct request_queue *q, diff --git a/block/elevator.c b/block/elevator.c index 03d923196569..4781c4205a5d 100644 --- a/block/elevator.c +++ b/block/elevator.c @@ -470,13 +470,16 @@ static struct kobj_type elv_ktype = { .release = elevator_release, }; -int elv_register_queue(struct request_queue *q) +/* + * elv_register_queue is called from either blk_register_queue or + * elevator_switch, elevator switch is prevented from being happen + * in the two paths, so it is safe to not hold q->sysfs_lock. + */ +int elv_register_queue(struct request_queue *q, bool uevent) { struct elevator_queue *e = q->elevator; int error; - lockdep_assert_held(&q->sysfs_lock); - error = kobject_add(&e->kobj, &q->kobj, "%s", "iosched"); if (!error) { struct elv_fs_entry *attr = e->type->elevator_attrs; @@ -487,24 +490,34 @@ int elv_register_queue(struct request_queue *q) attr++; } } - kobject_uevent(&e->kobj, KOBJ_ADD); + if (uevent) + kobject_uevent(&e->kobj, KOBJ_ADD); + + mutex_lock(&q->sysfs_lock); e->registered = 1; + mutex_unlock(&q->sysfs_lock); } return error; } +/* + * elv_unregister_queue is called from either blk_unregister_queue or + * elevator_switch, elevator switch is prevented from being happen + * in the two paths, so it is safe to not hold q->sysfs_lock. + */ void elv_unregister_queue(struct request_queue *q) { - lockdep_assert_held(&q->sysfs_lock); - if (q) { struct elevator_queue *e = q->elevator; kobject_uevent(&e->kobj, KOBJ_REMOVE); kobject_del(&e->kobj); + + mutex_lock(&q->sysfs_lock); e->registered = 0; /* Re-enable throttling in case elevator disabled it */ wbt_enable_default(q); + mutex_unlock(&q->sysfs_lock); } } @@ -567,10 +580,32 @@ int elevator_switch_mq(struct request_queue *q, lockdep_assert_held(&q->sysfs_lock); if (q->elevator) { - if (q->elevator->registered) + if (q->elevator->registered) { + mutex_unlock(&q->sysfs_lock); + + /* + * Concurrent elevator switch can't happen becasue + * sysfs write is always exclusively on same file. + * + * Also the elevator queue won't be freed after + * sysfs_lock is released becasue kobject_del() in + * blk_unregister_queue() waits for completion of + * .store & .show on its attributes. + */ elv_unregister_queue(q); + + mutex_lock(&q->sysfs_lock); + } ioc_clear_queue(q); elevator_exit(q, q->elevator); + + /* + * sysfs_lock may be dropped, so re-check if queue is + * unregistered. If yes, don't switch to new elevator + * any more + */ + if (!blk_queue_registered(q)) + return 0; } ret = blk_mq_init_sched(q, new_e); @@ -578,7 +613,11 @@ int elevator_switch_mq(struct request_queue *q, goto out; if (new_e) { - ret = elv_register_queue(q); + mutex_unlock(&q->sysfs_lock); + + ret = elv_register_queue(q, true); + + mutex_lock(&q->sysfs_lock); if (ret) { elevator_exit(q, q->elevator); goto out; diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index d5077f3fdfd6..1ac790178787 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -535,6 +535,7 @@ struct request_queue { struct delayed_work requeue_work; struct mutex sysfs_lock; + struct mutex sysfs_dir_lock; /* * for reusing dead hctx instance in case of updating -- cgit v1.2.3 From 6f816b4b746c2241540e537682d30d8e9997d674 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 28 Aug 2019 15:05:57 -0700 Subject: blk-mq: add optional request->alloc_time_ns There are currently two start time timestamps - start_time_ns and io_start_time_ns. The former marks the request allocation and and the second issue-to-device time. The planned io.weight controller needs to measure the total time bios take to execute after it leaves rq_qos including the time spent waiting for request to become available, which can easily dominate on saturated devices. This patch adds request->alloc_time_ns which records when the request allocation attempt started. As it isn't used for the usual stats, make it optional behind CONFIG_BLK_RQ_ALLOC_TIME and QUEUE_FLAG_RQ_ALLOC_TIME so that it can be compiled out when there are no users and it's active only on queues which need it even when compiled in. v2: s/pre_start_time/alloc_time/ and add CONFIG_BLK_RQ_ALLOC_TIME gating as suggested by Jens. Signed-off-by: Tejun Heo Signed-off-by: Jens Axboe --- block/Kconfig | 3 +++ block/blk-mq.c | 13 +++++++++++-- include/linux/blkdev.h | 13 ++++++++++++- 3 files changed, 26 insertions(+), 3 deletions(-) (limited to 'include/linux/blkdev.h') diff --git a/block/Kconfig b/block/Kconfig index 8b5f8e560eb4..1b62ad6d0e12 100644 --- a/block/Kconfig +++ b/block/Kconfig @@ -26,6 +26,9 @@ menuconfig BLOCK if BLOCK +config BLK_RQ_ALLOC_TIME + bool + config BLK_SCSI_REQUEST bool diff --git a/block/blk-mq.c b/block/blk-mq.c index cf768d0c2950..004411236034 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -291,7 +291,7 @@ static inline bool blk_mq_need_time_stamp(struct request *rq) } static struct request *blk_mq_rq_ctx_init(struct blk_mq_alloc_data *data, - unsigned int tag, unsigned int op) + unsigned int tag, unsigned int op, u64 alloc_time_ns) { struct blk_mq_tags *tags = blk_mq_tags_from_data(data); struct request *rq = tags->static_rqs[tag]; @@ -325,6 +325,9 @@ static struct request *blk_mq_rq_ctx_init(struct blk_mq_alloc_data *data, RB_CLEAR_NODE(&rq->rb_node); rq->rq_disk = NULL; rq->part = NULL; +#ifdef CONFIG_BLK_RQ_ALLOC_TIME + rq->alloc_time_ns = alloc_time_ns; +#endif if (blk_mq_need_time_stamp(rq)) rq->start_time_ns = ktime_get_ns(); else @@ -356,8 +359,14 @@ static struct request *blk_mq_get_request(struct request_queue *q, struct request *rq; unsigned int tag; bool clear_ctx_on_error = false; + u64 alloc_time_ns = 0; blk_queue_enter_live(q); + + /* alloc_time includes depth and tag waits */ + if (blk_queue_rq_alloc_time(q)) + alloc_time_ns = ktime_get_ns(); + data->q = q; if (likely(!data->ctx)) { data->ctx = blk_mq_get_ctx(q); @@ -393,7 +402,7 @@ static struct request *blk_mq_get_request(struct request_queue *q, return NULL; } - rq = blk_mq_rq_ctx_init(data, tag, data->cmd_flags); + rq = blk_mq_rq_ctx_init(data, tag, data->cmd_flags, alloc_time_ns); if (!op_is_flush(data->cmd_flags)) { rq->elv.icq = NULL; if (e && e->type->ops.prepare_request) { diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 1ac790178787..d0ad21e4771b 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -194,7 +194,11 @@ struct request { struct gendisk *rq_disk; struct hd_struct *part; - /* Time that I/O was submitted to the kernel. */ +#ifdef CONFIG_BLK_RQ_ALLOC_TIME + /* Time that the first bio started allocating this request. */ + u64 alloc_time_ns; +#endif + /* Time that this request was allocated for this IO. */ u64 start_time_ns; /* Time that I/O was submitted to the device. */ u64 io_start_time_ns; @@ -609,6 +613,7 @@ struct request_queue { #define QUEUE_FLAG_QUIESCED 24 /* queue has been quiesced */ #define QUEUE_FLAG_PCI_P2PDMA 25 /* device supports PCI p2p requests */ #define QUEUE_FLAG_ZONE_RESETALL 26 /* supports Zone Reset All */ +#define QUEUE_FLAG_RQ_ALLOC_TIME 27 /* record rq->alloc_time_ns */ #define QUEUE_FLAG_MQ_DEFAULT ((1 << QUEUE_FLAG_IO_STAT) | \ (1 << QUEUE_FLAG_SAME_COMP)) @@ -637,6 +642,12 @@ bool blk_queue_flag_test_and_set(unsigned int flag, struct request_queue *q); test_bit(QUEUE_FLAG_SCSI_PASSTHROUGH, &(q)->queue_flags) #define blk_queue_pci_p2pdma(q) \ test_bit(QUEUE_FLAG_PCI_P2PDMA, &(q)->queue_flags) +#ifdef CONFIG_BLK_RQ_ALLOC_TIME +#define blk_queue_rq_alloc_time(q) \ + test_bit(QUEUE_FLAG_RQ_ALLOC_TIME, &(q)->queue_flags) +#else +#define blk_queue_rq_alloc_time(q) false +#endif #define blk_noretry_request(rq) \ ((rq)->cmd_flags & (REQ_FAILFAST_DEV|REQ_FAILFAST_TRANSPORT| \ -- cgit v1.2.3 From 45147fb522bb459e79bdcb7504ee7ec8cfd4c12c Mon Sep 17 00:00:00 2001 From: Yoshihiro Shimoda Date: Wed, 28 Aug 2019 21:35:42 +0900 Subject: block: add a helper function to merge the segments This patch adds a helper function whether a queue can merge the segments by the DMA MAP layer (e.g. via IOMMU). Signed-off-by: Yoshihiro Shimoda Reviewed-by: Christoph Hellwig Reviewed-by: Simon Horman --- block/blk-settings.c | 23 +++++++++++++++++++++++ include/linux/blkdev.h | 2 ++ 2 files changed, 25 insertions(+) (limited to 'include/linux/blkdev.h') diff --git a/block/blk-settings.c b/block/blk-settings.c index 2c1831207a8f..c3632fc6d540 100644 --- a/block/blk-settings.c +++ b/block/blk-settings.c @@ -12,6 +12,7 @@ #include #include #include +#include #include "blk.h" #include "blk-wbt.h" @@ -832,6 +833,28 @@ void blk_queue_write_cache(struct request_queue *q, bool wc, bool fua) } EXPORT_SYMBOL_GPL(blk_queue_write_cache); +/** + * blk_queue_can_use_dma_map_merging - configure queue for merging segments. + * @q: the request queue for the device + * @dev: the device pointer for dma + * + * Tell the block layer about merging the segments by dma map of @q. + */ +bool blk_queue_can_use_dma_map_merging(struct request_queue *q, + struct device *dev) +{ + unsigned long boundary = dma_get_merge_boundary(dev); + + if (!boundary) + return false; + + /* No need to update max_segment_size. see blk_queue_virt_boundary() */ + blk_queue_virt_boundary(q, boundary); + + return true; +} +EXPORT_SYMBOL_GPL(blk_queue_can_use_dma_map_merging); + static int __init blk_settings_init(void) { blk_max_low_pfn = max_low_pfn - 1; diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 1ef375dafb1c..f6d55e2490dc 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -1085,6 +1085,8 @@ extern void blk_queue_dma_alignment(struct request_queue *, int); extern void blk_queue_update_dma_alignment(struct request_queue *, int); extern void blk_queue_rq_timeout(struct request_queue *, unsigned int); extern void blk_queue_write_cache(struct request_queue *q, bool enabled, bool fua); +extern bool blk_queue_can_use_dma_map_merging(struct request_queue *q, + struct device *dev); /* * Number of physical segments as sent to the device. -- cgit v1.2.3 From 68c43f133a754c7bf5cb1018bb16dc0821cc43a1 Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Thu, 5 Sep 2019 18:51:31 +0900 Subject: block: Introduce elevator features Introduce the definition of elevator features through the elevator_features flags in the elevator_type structure. Each flag can represent a feature supported by an elevator. The first feature defined by this patch is support for zoned block device sequential write constraint with the flag ELEVATOR_F_ZBD_SEQ_WRITE, which is implemented by the mq-deadline elevator using zone write locking. Other possible features are IO priorities, write hints, latency targets or single-LUN dual-actuator disks (for which the elevator could maintain one LBA ordered list per actuator). The required_elevator_features field is also added to the request_queue structure to allow a device driver to specify elevator feature flags that an elevator must support for the correct operation of the device (e.g. device drivers for zoned block devices can have the ELEVATOR_F_ZBD_SEQ_WRITE flag as a required feature). The helper function blk_queue_required_elevator_features() is defined for setting this new field. With these two new fields in place, the elevator functions elevator_match() and elevator_find() are modified to allow a user to set only an elevator with a set of features that satisfies the device required features. Elevators not matching the device requirements are not shown in the device sysfs queue/scheduler file to prevent their use. The "none" elevator can always be selected as before. Reviewed-by: Johannes Thumshirn Reviewed-by: Christoph Hellwig Reviewed-by: Ming Lei Signed-off-by: Damien Le Moal Signed-off-by: Jens Axboe --- block/blk-settings.c | 16 ++++++++++++++++ block/elevator.c | 49 +++++++++++++++++++++++++++++++++++++----------- block/mq-deadline.c | 1 + include/linux/blkdev.h | 4 ++++ include/linux/elevator.h | 8 ++++++++ 5 files changed, 67 insertions(+), 11 deletions(-) (limited to 'include/linux/blkdev.h') diff --git a/block/blk-settings.c b/block/blk-settings.c index a058997b9cce..6bd1e3b082d8 100644 --- a/block/blk-settings.c +++ b/block/blk-settings.c @@ -832,6 +832,22 @@ void blk_queue_write_cache(struct request_queue *q, bool wc, bool fua) } EXPORT_SYMBOL_GPL(blk_queue_write_cache); +/** + * blk_queue_required_elevator_features - Set a queue required elevator features + * @q: the request queue for the target device + * @features: Required elevator features OR'ed together + * + * Tell the block layer that for the device controlled through @q, only the + * only elevators that can be used are those that implement at least the set of + * features specified by @features. + */ +void blk_queue_required_elevator_features(struct request_queue *q, + unsigned int features) +{ + q->required_elevator_features = features; +} +EXPORT_SYMBOL_GPL(blk_queue_required_elevator_features); + static int __init blk_settings_init(void) { blk_max_low_pfn = max_low_pfn - 1; diff --git a/block/elevator.c b/block/elevator.c index 2944c129760c..ac7c8ad580ba 100644 --- a/block/elevator.c +++ b/block/elevator.c @@ -83,8 +83,26 @@ bool elv_bio_merge_ok(struct request *rq, struct bio *bio) } EXPORT_SYMBOL(elv_bio_merge_ok); -static bool elevator_match(const struct elevator_type *e, const char *name) +static inline bool elv_support_features(unsigned int elv_features, + unsigned int required_features) { + return (required_features & elv_features) == required_features; +} + +/** + * elevator_match - Test an elevator name and features + * @e: Scheduler to test + * @name: Elevator name to test + * @required_features: Features that the elevator must provide + * + * Return true is the elevator @e name matches @name and if @e provides all the + * the feratures spcified by @required_features. + */ +static bool elevator_match(const struct elevator_type *e, const char *name, + unsigned int required_features) +{ + if (!elv_support_features(e->elevator_features, required_features)) + return false; if (!strcmp(e->elevator_name, name)) return true; if (e->elevator_alias && !strcmp(e->elevator_alias, name)) @@ -93,15 +111,21 @@ static bool elevator_match(const struct elevator_type *e, const char *name) return false; } -/* - * Return scheduler with name 'name' +/** + * elevator_find - Find an elevator + * @name: Name of the elevator to find + * @required_features: Features that the elevator must provide + * + * Return the first registered scheduler with name @name and supporting the + * features @required_features and NULL otherwise. */ -static struct elevator_type *elevator_find(const char *name) +static struct elevator_type *elevator_find(const char *name, + unsigned int required_features) { struct elevator_type *e; list_for_each_entry(e, &elv_list, list) { - if (elevator_match(e, name)) + if (elevator_match(e, name, required_features)) return e; } @@ -120,12 +144,12 @@ static struct elevator_type *elevator_get(struct request_queue *q, spin_lock(&elv_list_lock); - e = elevator_find(name); + e = elevator_find(name, q->required_elevator_features); if (!e && try_loading) { spin_unlock(&elv_list_lock); request_module("%s-iosched", name); spin_lock(&elv_list_lock); - e = elevator_find(name); + e = elevator_find(name, q->required_elevator_features); } if (e && !try_module_get(e->elevator_owner)) @@ -525,7 +549,7 @@ int elv_register(struct elevator_type *e) /* register, don't allow duplicate names */ spin_lock(&elv_list_lock); - if (elevator_find(e->elevator_name)) { + if (elevator_find(e->elevator_name, 0)) { spin_unlock(&elv_list_lock); kmem_cache_destroy(e->icq_cache); return -EBUSY; @@ -709,7 +733,8 @@ static int __elevator_change(struct request_queue *q, const char *name) if (!e) return -EINVAL; - if (q->elevator && elevator_match(q->elevator->type, elevator_name)) { + if (q->elevator && + elevator_match(q->elevator->type, elevator_name, 0)) { elevator_put(e); return 0; } @@ -749,11 +774,13 @@ ssize_t elv_iosched_show(struct request_queue *q, char *name) spin_lock(&elv_list_lock); list_for_each_entry(__e, &elv_list, list) { - if (elv && elevator_match(elv, __e->elevator_name)) { + if (elv && elevator_match(elv, __e->elevator_name, 0)) { len += sprintf(name+len, "[%s] ", elv->elevator_name); continue; } - if (elv_support_iosched(q)) + if (elv_support_iosched(q) && + elevator_match(__e, __e->elevator_name, + q->required_elevator_features)) len += sprintf(name+len, "%s ", __e->elevator_name); } spin_unlock(&elv_list_lock); diff --git a/block/mq-deadline.c b/block/mq-deadline.c index 35e84bc0ec8c..b490f47fd553 100644 --- a/block/mq-deadline.c +++ b/block/mq-deadline.c @@ -794,6 +794,7 @@ static struct elevator_type mq_deadline = { .elevator_attrs = deadline_attrs, .elevator_name = "mq-deadline", .elevator_alias = "deadline", + .elevator_features = ELEVATOR_F_ZBD_SEQ_WRITE, .elevator_owner = THIS_MODULE, }; MODULE_ALIAS("mq-deadline-iosched"); diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index d0ad21e4771b..b196124e3240 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -496,6 +496,8 @@ struct request_queue { struct queue_limits limits; + unsigned int required_elevator_features; + #ifdef CONFIG_BLK_DEV_ZONED /* * Zoned block device information for request dispatch control. @@ -1097,6 +1099,8 @@ extern void blk_queue_dma_alignment(struct request_queue *, int); extern void blk_queue_update_dma_alignment(struct request_queue *, int); extern void blk_queue_rq_timeout(struct request_queue *, unsigned int); extern void blk_queue_write_cache(struct request_queue *q, bool enabled, bool fua); +extern void blk_queue_required_elevator_features(struct request_queue *q, + unsigned int features); /* * Number of physical segments as sent to the device. diff --git a/include/linux/elevator.h b/include/linux/elevator.h index 1dd014c9c87b..901bda352dcb 100644 --- a/include/linux/elevator.h +++ b/include/linux/elevator.h @@ -76,6 +76,7 @@ struct elevator_type struct elv_fs_entry *elevator_attrs; const char *elevator_name; const char *elevator_alias; + const unsigned int elevator_features; struct module *elevator_owner; #ifdef CONFIG_BLK_DEBUG_FS const struct blk_mq_debugfs_attr *queue_debugfs_attrs; @@ -165,5 +166,12 @@ extern struct request *elv_rb_find(struct rb_root *, sector_t); #define rq_entry_fifo(ptr) list_entry((ptr), struct request, queuelist) #define rq_fifo_clear(rq) list_del_init(&(rq)->queuelist) +/* + * Elevator features. + */ + +/* Supports zoned block devices sequential write constraint */ +#define ELEVATOR_F_ZBD_SEQ_WRITE (1U << 0) + #endif /* CONFIG_BLOCK */ #endif -- cgit v1.2.3 From 3d24430694077313c75c6b89f618db09943621e4 Mon Sep 17 00:00:00 2001 From: Hou Tao Date: Tue, 21 May 2019 15:59:03 +0800 Subject: block: make rq sector size accessible for block stats Currently rq->data_len will be decreased by partial completion or zeroed by completion, so when blk_stat_add() is invoked, data_len will be zero and there will never be samples in poll_cb because blk_mq_poll_stats_bkt() will return -1 if data_len is zero. We could move blk_stat_add() back to __blk_mq_complete_request(), but that would make the effort of trying to call ktime_get_ns() once in vain. Instead we can reuse throtl_size field, and use it for both block stats and block throttle, and adjust the logic in blk_mq_poll_stats_bkt() accordingly. Fixes: 4bc6339a583c ("block: move blk_stat_add() to __blk_mq_end_request()") Tested-by: Pavel Begunkov Signed-off-by: Hou Tao Signed-off-by: Jens Axboe --- block/blk-mq.c | 11 +++++------ block/blk-throttle.c | 3 ++- include/linux/blkdev.h | 15 ++++++++++++--- 3 files changed, 19 insertions(+), 10 deletions(-) (limited to 'include/linux/blkdev.h') diff --git a/block/blk-mq.c b/block/blk-mq.c index 3647776a0f6e..d30fabb583fd 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -44,12 +44,12 @@ static void blk_mq_poll_stats_fn(struct blk_stat_callback *cb); static int blk_mq_poll_stats_bkt(const struct request *rq) { - int ddir, bytes, bucket; + int ddir, sectors, bucket; ddir = rq_data_dir(rq); - bytes = blk_rq_bytes(rq); + sectors = blk_rq_stats_sectors(rq); - bucket = ddir + 2*(ilog2(bytes) - 9); + bucket = ddir + 2 * ilog2(sectors); if (bucket < 0) return -1; @@ -333,6 +333,7 @@ static struct request *blk_mq_rq_ctx_init(struct blk_mq_alloc_data *data, else rq->start_time_ns = 0; rq->io_start_time_ns = 0; + rq->stats_sectors = 0; rq->nr_phys_segments = 0; #if defined(CONFIG_BLK_DEV_INTEGRITY) rq->nr_integrity_segments = 0; @@ -681,9 +682,7 @@ void blk_mq_start_request(struct request *rq) if (test_bit(QUEUE_FLAG_STATS, &q->queue_flags)) { rq->io_start_time_ns = ktime_get_ns(); -#ifdef CONFIG_BLK_DEV_THROTTLING_LOW - rq->throtl_size = blk_rq_sectors(rq); -#endif + rq->stats_sectors = blk_rq_sectors(rq); rq->rq_flags |= RQF_STATS; rq_qos_issue(q, rq); } diff --git a/block/blk-throttle.c b/block/blk-throttle.c index 0445c998c377..18f773e52dfb 100644 --- a/block/blk-throttle.c +++ b/block/blk-throttle.c @@ -2248,7 +2248,8 @@ void blk_throtl_stat_add(struct request *rq, u64 time_ns) struct request_queue *q = rq->q; struct throtl_data *td = q->td; - throtl_track_latency(td, rq->throtl_size, req_op(rq), time_ns >> 10); + throtl_track_latency(td, blk_rq_stats_sectors(rq), req_op(rq), + time_ns >> 10); } void blk_throtl_bio_endio(struct bio *bio) diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index b196124e3240..3094f2d513b2 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -206,9 +206,12 @@ struct request { #ifdef CONFIG_BLK_WBT unsigned short wbt_flags; #endif -#ifdef CONFIG_BLK_DEV_THROTTLING_LOW - unsigned short throtl_size; -#endif + /* + * rq sectors used for blk stats. It has the same value + * with blk_rq_sectors(rq), except that it never be zeroed + * by completion. + */ + unsigned short stats_sectors; /* * Number of scatter-gather DMA addr+len pairs after @@ -917,6 +920,7 @@ static inline struct request_queue *bdev_get_queue(struct block_device *bdev) * blk_rq_err_bytes() : bytes left till the next error boundary * blk_rq_sectors() : sectors left in the entire request * blk_rq_cur_sectors() : sectors left in the current segment + * blk_rq_stats_sectors() : sectors of the entire request used for stats */ static inline sector_t blk_rq_pos(const struct request *rq) { @@ -945,6 +949,11 @@ static inline unsigned int blk_rq_cur_sectors(const struct request *rq) return blk_rq_cur_bytes(rq) >> SECTOR_SHIFT; } +static inline unsigned int blk_rq_stats_sectors(const struct request *rq) +{ + return rq->stats_sectors; +} + #ifdef CONFIG_BLK_DEV_ZONED static inline unsigned int blk_rq_zone_no(struct request *rq) { -- cgit v1.2.3 From 54d4e6ab91eb24b47a58403d8561206e916f0242 Mon Sep 17 00:00:00 2001 From: Max Gurtovoy Date: Mon, 16 Sep 2019 18:44:29 +0300 Subject: block: centralize PI remapping logic to the block layer Currently t10_pi_prepare/t10_pi_complete functions are called during the NVMe and SCSi layers command preparetion/completion, but their actual place should be the block layer since T10-PI is a general data integrity feature that is used by block storage protocols. Introduce .prepare_fn and .complete_fn callbacks within the integrity profile that each type can implement according to its needs. Suggested-by: Christoph Hellwig Reviewed-by: Christoph Hellwig Suggested-by: Martin K. Petersen Reviewed-by: Martin K. Petersen Signed-off-by: Max Gurtovoy Fixed to not call queue integrity functions if BLK_DEV_INTEGRITY isn't defined in the config. Signed-off-by: Jens Axboe --- block/blk-core.c | 7 +++ block/blk-integrity.c | 11 ++++ block/blk-mq.c | 6 ++ block/t10-pi.c | 144 ++++++++++++++++++++++++---------------------- drivers/md/dm-integrity.c | 10 ++++ drivers/nvme/host/core.c | 9 --- drivers/scsi/sd.c | 8 --- include/linux/blkdev.h | 4 ++ include/linux/t10-pi.h | 14 ----- 9 files changed, 114 insertions(+), 99 deletions(-) (limited to 'include/linux/blkdev.h') diff --git a/block/blk-core.c b/block/blk-core.c index 875e8d105067..d5e668ec751b 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -34,6 +34,7 @@ #include #include #include +#include #include #include #include @@ -1436,6 +1437,12 @@ bool blk_update_request(struct request *req, blk_status_t error, if (!req->bio) return false; +#ifdef CONFIG_BLK_DEV_INTEGRITY + if (blk_integrity_rq(req) && req_op(req) == REQ_OP_READ && + error == BLK_STS_OK) + req->q->integrity.profile->complete_fn(req, nr_bytes); +#endif + if (unlikely(error && !blk_rq_is_passthrough(req) && !(req->rq_flags & RQF_QUIET))) print_req_error(req, error, __func__); diff --git a/block/blk-integrity.c b/block/blk-integrity.c index ca39b4624cf8..ff1070edbb40 100644 --- a/block/blk-integrity.c +++ b/block/blk-integrity.c @@ -368,10 +368,21 @@ static blk_status_t blk_integrity_nop_fn(struct blk_integrity_iter *iter) return BLK_STS_OK; } +static void blk_integrity_nop_prepare(struct request *rq) +{ +} + +static void blk_integrity_nop_complete(struct request *rq, + unsigned int nr_bytes) +{ +} + static const struct blk_integrity_profile nop_profile = { .name = "nop", .generate_fn = blk_integrity_nop_fn, .verify_fn = blk_integrity_nop_fn, + .prepare_fn = blk_integrity_nop_prepare, + .complete_fn = blk_integrity_nop_complete, }; /** diff --git a/block/blk-mq.c b/block/blk-mq.c index 20a49be536b5..29275f5a996f 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -30,6 +30,7 @@ #include #include +#include #include "blk.h" #include "blk-mq.h" #include "blk-mq-debugfs.h" @@ -700,6 +701,11 @@ void blk_mq_start_request(struct request *rq) */ rq->nr_phys_segments++; } + +#ifdef CONFIG_BLK_DEV_INTEGRITY + if (blk_integrity_rq(rq) && req_op(rq) == REQ_OP_WRITE) + q->integrity.profile->prepare_fn(rq); +#endif } EXPORT_SYMBOL(blk_mq_start_request); diff --git a/block/t10-pi.c b/block/t10-pi.c index 7fed58705d1a..0c0120a672f9 100644 --- a/block/t10-pi.c +++ b/block/t10-pi.c @@ -120,76 +120,22 @@ static blk_status_t t10_pi_type1_verify_ip(struct blk_integrity_iter *iter) return t10_pi_verify(iter, t10_pi_ip_fn, T10_PI_TYPE1_PROTECTION); } -static blk_status_t t10_pi_type3_generate_crc(struct blk_integrity_iter *iter) -{ - return t10_pi_generate(iter, t10_pi_crc_fn, T10_PI_TYPE3_PROTECTION); -} - -static blk_status_t t10_pi_type3_generate_ip(struct blk_integrity_iter *iter) -{ - return t10_pi_generate(iter, t10_pi_ip_fn, T10_PI_TYPE3_PROTECTION); -} - -static blk_status_t t10_pi_type3_verify_crc(struct blk_integrity_iter *iter) -{ - return t10_pi_verify(iter, t10_pi_crc_fn, T10_PI_TYPE3_PROTECTION); -} - -static blk_status_t t10_pi_type3_verify_ip(struct blk_integrity_iter *iter) -{ - return t10_pi_verify(iter, t10_pi_ip_fn, T10_PI_TYPE3_PROTECTION); -} - -const struct blk_integrity_profile t10_pi_type1_crc = { - .name = "T10-DIF-TYPE1-CRC", - .generate_fn = t10_pi_type1_generate_crc, - .verify_fn = t10_pi_type1_verify_crc, -}; -EXPORT_SYMBOL(t10_pi_type1_crc); - -const struct blk_integrity_profile t10_pi_type1_ip = { - .name = "T10-DIF-TYPE1-IP", - .generate_fn = t10_pi_type1_generate_ip, - .verify_fn = t10_pi_type1_verify_ip, -}; -EXPORT_SYMBOL(t10_pi_type1_ip); - -const struct blk_integrity_profile t10_pi_type3_crc = { - .name = "T10-DIF-TYPE3-CRC", - .generate_fn = t10_pi_type3_generate_crc, - .verify_fn = t10_pi_type3_verify_crc, -}; -EXPORT_SYMBOL(t10_pi_type3_crc); - -const struct blk_integrity_profile t10_pi_type3_ip = { - .name = "T10-DIF-TYPE3-IP", - .generate_fn = t10_pi_type3_generate_ip, - .verify_fn = t10_pi_type3_verify_ip, -}; -EXPORT_SYMBOL(t10_pi_type3_ip); - /** - * t10_pi_prepare - prepare PI prior submitting request to device + * t10_pi_type1_prepare - prepare PI prior submitting request to device * @rq: request with PI that should be prepared - * @protection_type: PI type (Type 1/Type 2/Type 3) * * For Type 1/Type 2, the virtual start sector is the one that was * originally submitted by the block layer for the ref_tag usage. Due to * partitioning, MD/DM cloning, etc. the actual physical start sector is * likely to be different. Remap protection information to match the * physical LBA. - * - * Type 3 does not have a reference tag so no remapping is required. */ -void t10_pi_prepare(struct request *rq, u8 protection_type) +static void t10_pi_type1_prepare(struct request *rq) { const int tuple_sz = rq->q->integrity.tuple_size; u32 ref_tag = t10_pi_ref_tag(rq); struct bio *bio; - if (protection_type == T10_PI_TYPE3_PROTECTION) - return; - __rq_for_each_bio(bio, rq) { struct bio_integrity_payload *bip = bio_integrity(bio); u32 virt = bip_get_seed(bip) & 0xffffffff; @@ -222,13 +168,11 @@ void t10_pi_prepare(struct request *rq, u8 protection_type) bip->bip_flags |= BIP_MAPPED_INTEGRITY; } } -EXPORT_SYMBOL(t10_pi_prepare); /** - * t10_pi_complete - prepare PI prior returning request to the block layer + * t10_pi_type1_complete - prepare PI prior returning request to the blk layer * @rq: request with PI that should be prepared - * @protection_type: PI type (Type 1/Type 2/Type 3) - * @intervals: total elements to prepare + * @nr_bytes: total bytes to prepare * * For Type 1/Type 2, the virtual start sector is the one that was * originally submitted by the block layer for the ref_tag usage. Due to @@ -236,19 +180,14 @@ EXPORT_SYMBOL(t10_pi_prepare); * likely to be different. Since the physical start sector was submitted * to the device, we should remap it back to virtual values expected by the * block layer. - * - * Type 3 does not have a reference tag so no remapping is required. */ -void t10_pi_complete(struct request *rq, u8 protection_type, - unsigned int intervals) +static void t10_pi_type1_complete(struct request *rq, unsigned int nr_bytes) { + unsigned intervals = nr_bytes >> rq->q->integrity.interval_exp; const int tuple_sz = rq->q->integrity.tuple_size; u32 ref_tag = t10_pi_ref_tag(rq); struct bio *bio; - if (protection_type == T10_PI_TYPE3_PROTECTION) - return; - __rq_for_each_bio(bio, rq) { struct bio_integrity_payload *bip = bio_integrity(bio); u32 virt = bip_get_seed(bip) & 0xffffffff; @@ -276,4 +215,73 @@ void t10_pi_complete(struct request *rq, u8 protection_type, } } } -EXPORT_SYMBOL(t10_pi_complete); + +static blk_status_t t10_pi_type3_generate_crc(struct blk_integrity_iter *iter) +{ + return t10_pi_generate(iter, t10_pi_crc_fn, T10_PI_TYPE3_PROTECTION); +} + +static blk_status_t t10_pi_type3_generate_ip(struct blk_integrity_iter *iter) +{ + return t10_pi_generate(iter, t10_pi_ip_fn, T10_PI_TYPE3_PROTECTION); +} + +static blk_status_t t10_pi_type3_verify_crc(struct blk_integrity_iter *iter) +{ + return t10_pi_verify(iter, t10_pi_crc_fn, T10_PI_TYPE3_PROTECTION); +} + +static blk_status_t t10_pi_type3_verify_ip(struct blk_integrity_iter *iter) +{ + return t10_pi_verify(iter, t10_pi_ip_fn, T10_PI_TYPE3_PROTECTION); +} + +/** + * Type 3 does not have a reference tag so no remapping is required. + */ +static void t10_pi_type3_prepare(struct request *rq) +{ +} + +/** + * Type 3 does not have a reference tag so no remapping is required. + */ +static void t10_pi_type3_complete(struct request *rq, unsigned int nr_bytes) +{ +} + +const struct blk_integrity_profile t10_pi_type1_crc = { + .name = "T10-DIF-TYPE1-CRC", + .generate_fn = t10_pi_type1_generate_crc, + .verify_fn = t10_pi_type1_verify_crc, + .prepare_fn = t10_pi_type1_prepare, + .complete_fn = t10_pi_type1_complete, +}; +EXPORT_SYMBOL(t10_pi_type1_crc); + +const struct blk_integrity_profile t10_pi_type1_ip = { + .name = "T10-DIF-TYPE1-IP", + .generate_fn = t10_pi_type1_generate_ip, + .verify_fn = t10_pi_type1_verify_ip, + .prepare_fn = t10_pi_type1_prepare, + .complete_fn = t10_pi_type1_complete, +}; +EXPORT_SYMBOL(t10_pi_type1_ip); + +const struct blk_integrity_profile t10_pi_type3_crc = { + .name = "T10-DIF-TYPE3-CRC", + .generate_fn = t10_pi_type3_generate_crc, + .verify_fn = t10_pi_type3_verify_crc, + .prepare_fn = t10_pi_type3_prepare, + .complete_fn = t10_pi_type3_complete, +}; +EXPORT_SYMBOL(t10_pi_type3_crc); + +const struct blk_integrity_profile t10_pi_type3_ip = { + .name = "T10-DIF-TYPE3-IP", + .generate_fn = t10_pi_type3_generate_ip, + .verify_fn = t10_pi_type3_verify_ip, + .prepare_fn = t10_pi_type3_prepare, + .complete_fn = t10_pi_type3_complete, +}; +EXPORT_SYMBOL(t10_pi_type3_ip); diff --git a/drivers/md/dm-integrity.c b/drivers/md/dm-integrity.c index 9118ab85cb3a..dab4446fe7d8 100644 --- a/drivers/md/dm-integrity.c +++ b/drivers/md/dm-integrity.c @@ -345,6 +345,14 @@ static void __DEBUG_bytes(__u8 *bytes, size_t len, const char *msg, ...) #define DEBUG_bytes(bytes, len, msg, ...) do { } while (0) #endif +static void dm_integrity_prepare(struct request *rq) +{ +} + +static void dm_integrity_complete(struct request *rq, unsigned int nr_bytes) +{ +} + /* * DM Integrity profile, protection is performed layer above (dm-crypt) */ @@ -352,6 +360,8 @@ static const struct blk_integrity_profile dm_integrity_profile = { .name = "DM-DIF-EXT-TAG", .generate_fn = NULL, .verify_fn = NULL, + .prepare_fn = dm_integrity_prepare, + .complete_fn = dm_integrity_complete, }; static void dm_integrity_map_continue(struct dm_integrity_io *dio, bool from_map); diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 1ede1763a5ee..108f60b46804 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -666,8 +666,6 @@ static inline blk_status_t nvme_setup_rw(struct nvme_ns *ns, if (WARN_ON_ONCE(!nvme_ns_has_pi(ns))) return BLK_STS_NOTSUPP; control |= NVME_RW_PRINFO_PRACT; - } else if (req_op(req) == REQ_OP_WRITE) { - t10_pi_prepare(req, ns->pi_type); } switch (ns->pi_type) { @@ -690,13 +688,6 @@ static inline blk_status_t nvme_setup_rw(struct nvme_ns *ns, void nvme_cleanup_cmd(struct request *req) { - if (blk_integrity_rq(req) && req_op(req) == REQ_OP_READ && - nvme_req(req)->status == 0) { - struct nvme_ns *ns = req->rq_disk->private_data; - - t10_pi_complete(req, ns->pi_type, - blk_rq_bytes(req) >> ns->lba_shift); - } if (req->rq_flags & RQF_SPECIAL_PAYLOAD) { struct nvme_ns *ns = req->rq_disk->private_data; struct page *page = req->special_vec.bv_page; diff --git a/drivers/scsi/sd.c b/drivers/scsi/sd.c index 4b925552458f..0e8941caaa0d 100644 --- a/drivers/scsi/sd.c +++ b/drivers/scsi/sd.c @@ -1211,9 +1211,6 @@ static blk_status_t sd_setup_read_write_cmnd(struct scsi_cmnd *cmd) dix = scsi_prot_sg_count(cmd); dif = scsi_host_dif_capable(cmd->device->host, sdkp->protection_type); - if (write && dix) - t10_pi_prepare(cmd->request, sdkp->protection_type); - if (dif || dix) protect = sd_setup_protect_cmnd(cmd, dix, dif); else @@ -2054,11 +2051,6 @@ static int sd_done(struct scsi_cmnd *SCpnt) "sd_done: completed %d of %d bytes\n", good_bytes, scsi_bufflen(SCpnt))); - if (rq_data_dir(SCpnt->request) == READ && scsi_prot_sg_count(SCpnt) && - good_bytes) - t10_pi_complete(SCpnt->request, sdkp->protection_type, - good_bytes / scsi_prot_interval(SCpnt)); - return good_bytes; } diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 3094f2d513b2..6032bb740cf4 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -1522,10 +1522,14 @@ struct blk_integrity_iter { }; typedef blk_status_t (integrity_processing_fn) (struct blk_integrity_iter *); +typedef void (integrity_prepare_fn) (struct request *); +typedef void (integrity_complete_fn) (struct request *, unsigned int); struct blk_integrity_profile { integrity_processing_fn *generate_fn; integrity_processing_fn *verify_fn; + integrity_prepare_fn *prepare_fn; + integrity_complete_fn *complete_fn; const char *name; }; diff --git a/include/linux/t10-pi.h b/include/linux/t10-pi.h index 3e2a80cc7b56..96305a64a5a7 100644 --- a/include/linux/t10-pi.h +++ b/include/linux/t10-pi.h @@ -53,18 +53,4 @@ extern const struct blk_integrity_profile t10_pi_type1_ip; extern const struct blk_integrity_profile t10_pi_type3_crc; extern const struct blk_integrity_profile t10_pi_type3_ip; -#ifdef CONFIG_BLK_DEV_INTEGRITY -extern void t10_pi_prepare(struct request *rq, u8 protection_type); -extern void t10_pi_complete(struct request *rq, u8 protection_type, - unsigned int intervals); -#else -static inline void t10_pi_complete(struct request *rq, u8 protection_type, - unsigned int intervals) -{ -} -static inline void t10_pi_prepare(struct request *rq, u8 protection_type) -{ -} -#endif - #endif -- cgit v1.2.3