From 309dca309fc39a9e3c31b916393b74bd174fd74e Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sun, 24 Jan 2021 11:02:34 +0100 Subject: block: store a block_device pointer in struct bio Replace the gendisk pointer in struct bio with a pointer to the newly improved struct block device. From that the gendisk can be trivially accessed with an extra indirection, but it also allows to directly look up all information related to partition remapping. Signed-off-by: Christoph Hellwig Acked-by: Tejun Heo Signed-off-by: Jens Axboe --- include/linux/bio.h | 18 ++++++++---------- 1 file changed, 8 insertions(+), 10 deletions(-) (limited to 'include/linux/bio.h') diff --git a/include/linux/bio.h b/include/linux/bio.h index 1edda614f7ce..12af7aa5db37 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -483,24 +483,22 @@ extern void bvec_free(mempool_t *, struct bio_vec *, unsigned int); extern unsigned int bvec_nr_vecs(unsigned short idx); extern const char *bio_devname(struct bio *bio, char *buffer); -#define bio_set_dev(bio, bdev) \ -do { \ - if ((bio)->bi_disk != (bdev)->bd_disk) \ - bio_clear_flag(bio, BIO_THROTTLED);\ - (bio)->bi_disk = (bdev)->bd_disk; \ - (bio)->bi_partno = (bdev)->bd_partno; \ - bio_associate_blkg(bio); \ +#define bio_set_dev(bio, bdev) \ +do { \ + if ((bio)->bi_bdev != (bdev)) \ + bio_clear_flag(bio, BIO_THROTTLED); \ + (bio)->bi_bdev = (bdev); \ + bio_associate_blkg(bio); \ } while (0) #define bio_copy_dev(dst, src) \ do { \ - (dst)->bi_disk = (src)->bi_disk; \ - (dst)->bi_partno = (src)->bi_partno; \ + (dst)->bi_bdev = (src)->bi_bdev; \ bio_clone_blkg_association(dst, src); \ } while (0) #define bio_dev(bio) \ - disk_devt((bio)->bi_disk) + disk_devt((bio)->bi_bdev->bd_disk) #ifdef CONFIG_BLK_CGROUP void bio_associate_blkg(struct bio *bio); -- cgit v1.2.3 From 30c5d3456c272f0de0d7e7eb9fc355fa64a5f649 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sun, 24 Jan 2021 11:02:36 +0100 Subject: block: do not reassig ->bi_bdev when partition remapping There is no good reason to reassign ->bi_bdev when remapping the partition-relative block number to the device wide one, as all the information required by the drivers comes from the gendisk anyway. Keeping the original ->bi_bdev alive will allow to greatly simplify the partition-away I/O accounting. Signed-off-by: Christoph Hellwig Acked-by: Tejun Heo Signed-off-by: Jens Axboe --- block/blk-core.c | 5 +++-- include/linux/bio.h | 2 ++ include/linux/blk_types.h | 1 + 3 files changed, 6 insertions(+), 2 deletions(-) (limited to 'include/linux/bio.h') diff --git a/block/blk-core.c b/block/blk-core.c index 64f69022de96..1c1b97a82caa 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -752,7 +752,7 @@ static int blk_partition_remap(struct bio *bio) bio->bi_iter.bi_sector - p->bd_start_sect); } - bio->bi_bdev = bdev_whole(p); + bio_set_flag(bio, BIO_REMAPPED); return 0; } @@ -817,7 +817,8 @@ static noinline_for_stack bool submit_bio_checks(struct bio *bio) goto end_io; if (unlikely(bio_check_eod(bio))) goto end_io; - if (bio->bi_bdev->bd_partno && unlikely(blk_partition_remap(bio))) + if (bio->bi_bdev->bd_partno && !bio_flagged(bio, BIO_REMAPPED) && + unlikely(blk_partition_remap(bio))) goto end_io; /* diff --git a/include/linux/bio.h b/include/linux/bio.h index 12af7aa5db37..2f1155eabaff 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -485,6 +485,7 @@ extern const char *bio_devname(struct bio *bio, char *buffer); #define bio_set_dev(bio, bdev) \ do { \ + bio_clear_flag(bio, BIO_REMAPPED); \ if ((bio)->bi_bdev != (bdev)) \ bio_clear_flag(bio, BIO_THROTTLED); \ (bio)->bi_bdev = (bdev); \ @@ -493,6 +494,7 @@ do { \ #define bio_copy_dev(dst, src) \ do { \ + bio_clear_flag(dst, BIO_REMAPPED); \ (dst)->bi_bdev = (src)->bi_bdev; \ bio_clone_blkg_association(dst, src); \ } while (0) diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index 8ebd8be3e050..1bc6f6a01070 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h @@ -303,6 +303,7 @@ enum { * of this bio. */ BIO_CGROUP_ACCT, /* has been accounted to a cgroup */ BIO_TRACKED, /* set if bio goes through the rq_qos path */ + BIO_REMAPPED, BIO_FLAG_LAST }; -- cgit v1.2.3 From 9f180e315a93cde559ac1c9c4c5ce980aa681c1c Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Mon, 11 Jan 2021 11:05:54 +0800 Subject: block: don't allocate inline bvecs if this bioset needn't bvecs The inline bvecs won't be used if user needn't bvecs by not passing BIOSET_NEED_BVECS, so don't allocate bvecs in this situation. Reviewed-by: Christoph Hellwig Signed-off-by: Ming Lei Reviewed-by: Pavel Begunkov Tested-by: Pavel Begunkov Reviewed-by: Hannes Reinecke Signed-off-by: Jens Axboe --- block/bio.c | 7 +++++-- include/linux/bio.h | 1 + 2 files changed, 6 insertions(+), 2 deletions(-) (limited to 'include/linux/bio.h') diff --git a/block/bio.c b/block/bio.c index 87bf16460e0e..8ccda51dd831 100644 --- a/block/bio.c +++ b/block/bio.c @@ -89,8 +89,7 @@ fail_alloc_slab: static inline unsigned int bs_bio_slab_size(struct bio_set *bs) { - return bs->front_pad + sizeof(struct bio) + - BIO_INLINE_VECS * sizeof(struct bio_vec); + return bs->front_pad + sizeof(struct bio) + bs->back_pad; } static struct kmem_cache *bio_find_or_create_slab(struct bio_set *bs) @@ -1563,6 +1562,10 @@ int bioset_init(struct bio_set *bs, int flags) { bs->front_pad = front_pad; + if (flags & BIOSET_NEED_BVECS) + bs->back_pad = BIO_INLINE_VECS * sizeof(struct bio_vec); + else + bs->back_pad = 0; spin_lock_init(&bs->rescue_lock); bio_list_init(&bs->rescue_list); diff --git a/include/linux/bio.h b/include/linux/bio.h index 2f1155eabaff..5d8977aafa19 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -703,6 +703,7 @@ struct bio_set { mempool_t bvec_integrity_pool; #endif + unsigned int back_pad; /* * Deadlock avoidance for stacking block drivers: see comments in * bio_alloc_bioset() for details -- cgit v1.2.3 From eec716a1c18c796a69db0be5e2a6f282ba5bccd6 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Mon, 11 Jan 2021 11:05:56 +0800 Subject: block: move three bvec helpers declaration into private helper bvec_alloc(), bvec_free() and bvec_nr_vecs() are only used inside block layer core functions, no need to declare them in public header. Reviewed-by: Christoph Hellwig Signed-off-by: Ming Lei Reviewed-by: Pavel Begunkov Reviewed-by: Hannes Reinecke Signed-off-by: Jens Axboe --- block/blk.h | 4 ++++ include/linux/bio.h | 3 --- 2 files changed, 4 insertions(+), 3 deletions(-) (limited to 'include/linux/bio.h') diff --git a/block/blk.h b/block/blk.h index ab0aaf958553..0198335c5838 100644 --- a/block/blk.h +++ b/block/blk.h @@ -55,6 +55,10 @@ void blk_free_flush_queue(struct blk_flush_queue *q); void blk_freeze_queue(struct request_queue *q); +struct bio_vec *bvec_alloc(gfp_t, int, unsigned long *, mempool_t *); +void bvec_free(mempool_t *, struct bio_vec *, unsigned int); +unsigned int bvec_nr_vecs(unsigned short idx); + static inline bool biovec_phys_mergeable(struct request_queue *q, struct bio_vec *vec1, struct bio_vec *vec2) { diff --git a/include/linux/bio.h b/include/linux/bio.h index 5d8977aafa19..e135b500df5d 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -478,9 +478,6 @@ static inline void zero_fill_bio(struct bio *bio) zero_fill_bio_iter(bio, bio->bi_iter); } -extern struct bio_vec *bvec_alloc(gfp_t, int, unsigned long *, mempool_t *); -extern void bvec_free(mempool_t *, struct bio_vec *, unsigned int); -extern unsigned int bvec_nr_vecs(unsigned short idx); extern const char *bio_devname(struct bio *bio, char *buffer); #define bio_set_dev(bio, bdev) \ -- cgit v1.2.3 From 3e1a88ec96259282b9a8b45c3f1fda7a3ff4f6ea Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Sat, 9 Jan 2021 16:03:02 +0000 Subject: bio: add a helper calculating nr segments to alloc Add a helper function calculating the number of bvec segments we need to allocate to construct a bio. It doesn't change anything functionally, but will be used to not duplicate special cases in the future. Reviewed-by: Christoph Hellwig Signed-off-by: Pavel Begunkov Reviewed-by: Ming Lei Signed-off-by: Jens Axboe --- fs/block_dev.c | 7 ++++--- fs/iomap/direct-io.c | 9 ++++----- include/linux/bio.h | 10 ++++++++++ 3 files changed, 18 insertions(+), 8 deletions(-) (limited to 'include/linux/bio.h') diff --git a/fs/block_dev.c b/fs/block_dev.c index 3b8963e228a1..6f5bd9950baf 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -416,7 +416,7 @@ __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter, int nr_pages) dio->size += bio->bi_iter.bi_size; pos += bio->bi_iter.bi_size; - nr_pages = iov_iter_npages(iter, BIO_MAX_PAGES); + nr_pages = bio_iov_vecs_to_alloc(iter, BIO_MAX_PAGES); if (!nr_pages) { bool polled = false; @@ -481,9 +481,10 @@ blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter) { int nr_pages; - nr_pages = iov_iter_npages(iter, BIO_MAX_PAGES + 1); - if (!nr_pages) + if (!iov_iter_count(iter)) return 0; + + nr_pages = bio_iov_vecs_to_alloc(iter, BIO_MAX_PAGES + 1); if (is_sync_kiocb(iocb) && nr_pages <= BIO_MAX_PAGES) return __blkdev_direct_IO_simple(iocb, iter, nr_pages); diff --git a/fs/iomap/direct-io.c b/fs/iomap/direct-io.c index 933f234d5bec..ea1e8f696076 100644 --- a/fs/iomap/direct-io.c +++ b/fs/iomap/direct-io.c @@ -250,11 +250,8 @@ iomap_dio_bio_actor(struct inode *inode, loff_t pos, loff_t length, orig_count = iov_iter_count(dio->submit.iter); iov_iter_truncate(dio->submit.iter, length); - nr_pages = iov_iter_npages(dio->submit.iter, BIO_MAX_PAGES); - if (nr_pages <= 0) { - ret = nr_pages; + if (!iov_iter_count(dio->submit.iter)) goto out; - } if (need_zeroout) { /* zero out from the start of the block to the write offset */ @@ -263,6 +260,7 @@ iomap_dio_bio_actor(struct inode *inode, loff_t pos, loff_t length, iomap_dio_zero(dio, iomap, pos - pad, pad); } + nr_pages = bio_iov_vecs_to_alloc(dio->submit.iter, BIO_MAX_PAGES); do { size_t n; if (dio->error) { @@ -308,7 +306,8 @@ iomap_dio_bio_actor(struct inode *inode, loff_t pos, loff_t length, dio->size += n; copied += n; - nr_pages = iov_iter_npages(dio->submit.iter, BIO_MAX_PAGES); + nr_pages = bio_iov_vecs_to_alloc(dio->submit.iter, + BIO_MAX_PAGES); iomap_dio_submit_bio(dio, iomap, bio, pos); pos += n; } while (nr_pages); diff --git a/include/linux/bio.h b/include/linux/bio.h index e135b500df5d..9ddb19801a03 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -10,6 +10,7 @@ #include /* struct bio, bio_vec and BIO_* flags are defined in blk_types.h */ #include +#include #define BIO_DEBUG @@ -441,6 +442,15 @@ static inline void bio_wouldblock_error(struct bio *bio) bio_endio(bio); } +/* + * Calculate number of bvec segments that should be allocated to fit data + * pointed by @iter. + */ +static inline int bio_iov_vecs_to_alloc(struct iov_iter *iter, int max_segs) +{ + return iov_iter_npages(iter, max_segs); +} + struct request_queue; extern int submit_bio_wait(struct bio *bio); -- cgit v1.2.3 From c42bca92be928ce7dece5fc04cf68d0e37ee6718 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Sat, 9 Jan 2021 16:03:03 +0000 Subject: bio: don't copy bvec for direct IO The block layer spends quite a while in blkdev_direct_IO() to copy and initialise bio's bvec. However, if we've already got a bvec in the input iterator it might be reused in some cases, i.e. when new ITER_BVEC_FLAG_FIXED flag is set. Simple tests show considerable performance boost, and it also reduces memory footprint. Suggested-by: Matthew Wilcox Reviewed-by: Christoph Hellwig Signed-off-by: Pavel Begunkov Reviewed-by: Ming Lei Signed-off-by: Jens Axboe --- Documentation/filesystems/porting.rst | 9 +++++ block/bio.c | 67 +++++++++++++++-------------------- include/linux/bio.h | 5 ++- 3 files changed, 42 insertions(+), 39 deletions(-) (limited to 'include/linux/bio.h') diff --git a/Documentation/filesystems/porting.rst b/Documentation/filesystems/porting.rst index c722d94f29ea..1f8cf8e10b34 100644 --- a/Documentation/filesystems/porting.rst +++ b/Documentation/filesystems/porting.rst @@ -872,3 +872,12 @@ its result is kern_unmount() or kern_unmount_array(). zero-length bvec segments are disallowed, they must be filtered out before passed on to an iterator. + +--- + +**mandatory** + +For bvec based itererators bio_iov_iter_get_pages() now doesn't copy bvecs but +uses the one provided. Anyone issuing kiocb-I/O should ensure that the bvec and +page references stay until I/O has completed, i.e. until ->ki_complete() has +been called or returned with non -EIOCBQUEUED code. diff --git a/block/bio.c b/block/bio.c index 1cd8a2e79048..99040a7e6656 100644 --- a/block/bio.c +++ b/block/bio.c @@ -942,21 +942,17 @@ void bio_release_pages(struct bio *bio, bool mark_dirty) } EXPORT_SYMBOL_GPL(bio_release_pages); -static int __bio_iov_bvec_add_pages(struct bio *bio, struct iov_iter *iter) +static int bio_iov_bvec_set(struct bio *bio, struct iov_iter *iter) { - const struct bio_vec *bv = iter->bvec; - unsigned int len; - size_t size; - - if (WARN_ON_ONCE(iter->iov_offset > bv->bv_len)) - return -EINVAL; - - len = min_t(size_t, bv->bv_len - iter->iov_offset, iter->count); - size = bio_add_page(bio, bv->bv_page, len, - bv->bv_offset + iter->iov_offset); - if (unlikely(size != len)) - return -EINVAL; - iov_iter_advance(iter, size); + WARN_ON_ONCE(BVEC_POOL_IDX(bio) != 0); + + bio->bi_vcnt = iter->nr_segs; + bio->bi_max_vecs = iter->nr_segs; + bio->bi_io_vec = (struct bio_vec *)iter->bvec; + bio->bi_iter.bi_bvec_done = iter->iov_offset; + bio->bi_iter.bi_size = iter->count; + + iov_iter_advance(iter, iter->count); return 0; } @@ -1070,12 +1066,12 @@ static int __bio_iov_append_get_pages(struct bio *bio, struct iov_iter *iter) * This takes either an iterator pointing to user memory, or one pointing to * kernel pages (BVEC iterator). If we're adding user pages, we pin them and * map them into the kernel. On IO completion, the caller should put those - * pages. If we're adding kernel pages, and the caller told us it's safe to - * do so, we just have to add the pages to the bio directly. We don't grab an - * extra reference to those pages (the user should already have that), and we - * don't put the page on IO completion. The caller needs to check if the bio is - * flagged BIO_NO_PAGE_REF on IO completion. If it isn't, then pages should be - * released. + * pages. For bvec based iterators bio_iov_iter_get_pages() uses the provided + * bvecs rather than copying them. Hence anyone issuing kiocb based IO needs + * to ensure the bvecs and pages stay referenced until the submitted I/O is + * completed by a call to ->ki_complete() or returns with an error other than + * -EIOCBQUEUED. The caller needs to check if the bio is flagged BIO_NO_PAGE_REF + * on IO completion. If it isn't, then pages should be released. * * The function tries, but does not guarantee, to pin as many pages as * fit into the bio, or are requested in @iter, whatever is smaller. If @@ -1087,27 +1083,22 @@ static int __bio_iov_append_get_pages(struct bio *bio, struct iov_iter *iter) */ int bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter) { - const bool is_bvec = iov_iter_is_bvec(iter); - int ret; - - if (WARN_ON_ONCE(bio->bi_vcnt)) - return -EINVAL; + int ret = 0; - do { - if (bio_op(bio) == REQ_OP_ZONE_APPEND) { - if (WARN_ON_ONCE(is_bvec)) - return -EINVAL; - ret = __bio_iov_append_get_pages(bio, iter); - } else { - if (is_bvec) - ret = __bio_iov_bvec_add_pages(bio, iter); + if (iov_iter_is_bvec(iter)) { + if (WARN_ON_ONCE(bio_op(bio) == REQ_OP_ZONE_APPEND)) + return -EINVAL; + bio_iov_bvec_set(bio, iter); + bio_set_flag(bio, BIO_NO_PAGE_REF); + return 0; + } else { + do { + if (bio_op(bio) == REQ_OP_ZONE_APPEND) + ret = __bio_iov_append_get_pages(bio, iter); else ret = __bio_iov_iter_get_pages(bio, iter); - } - } while (!ret && iov_iter_count(iter) && !bio_full(bio, 0)); - - if (is_bvec) - bio_set_flag(bio, BIO_NO_PAGE_REF); + } while (!ret && iov_iter_count(iter) && !bio_full(bio, 0)); + } /* don't account direct I/O as memory stall */ bio_clear_flag(bio, BIO_WORKINGSET); diff --git a/include/linux/bio.h b/include/linux/bio.h index 9ddb19801a03..676870b2c88d 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -444,10 +444,13 @@ static inline void bio_wouldblock_error(struct bio *bio) /* * Calculate number of bvec segments that should be allocated to fit data - * pointed by @iter. + * pointed by @iter. If @iter is backed by bvec it's going to be reused + * instead of allocating a new one. */ static inline int bio_iov_vecs_to_alloc(struct iov_iter *iter, int max_segs) { + if (iov_iter_is_bvec(iter)) + return 0; return iov_iter_npages(iter, max_segs); } -- cgit v1.2.3 From 3175199ab0ac8c874ec25c6bf169f74888917435 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 26 Jan 2021 15:52:34 +0100 Subject: block: split bio_kmalloc from bio_alloc_bioset bio_kmalloc shares almost no logic with the bio_set based fast path in bio_alloc_bioset. Split it into an entirely separate implementation. Signed-off-by: Christoph Hellwig Reviewed-by: Johannes Thumshirn Reviewed-by: Chaitanya Kulkarni Acked-by: Damien Le Moal Signed-off-by: Jens Axboe --- block/bio.c | 167 ++++++++++++++++++++++++++-------------------------- include/linux/bio.h | 6 +- 2 files changed, 86 insertions(+), 87 deletions(-) (limited to 'include/linux/bio.h') diff --git a/block/bio.c b/block/bio.c index dfd7740a3230..d4375619348c 100644 --- a/block/bio.c +++ b/block/bio.c @@ -396,123 +396,101 @@ static void punt_bios_to_rescuer(struct bio_set *bs) * @nr_iovecs: number of iovecs to pre-allocate * @bs: the bio_set to allocate from. * - * Description: - * If @bs is NULL, uses kmalloc() to allocate the bio; else the allocation is - * backed by the @bs's mempool. + * Allocate a bio from the mempools in @bs. * - * When @bs is not NULL, if %__GFP_DIRECT_RECLAIM is set then bio_alloc will - * always be able to allocate a bio. This is due to the mempool guarantees. - * To make this work, callers must never allocate more than 1 bio at a time - * from this pool. Callers that need to allocate more than 1 bio must always - * submit the previously allocated bio for IO before attempting to allocate - * a new one. Failure to do so can cause deadlocks under memory pressure. + * If %__GFP_DIRECT_RECLAIM is set then bio_alloc will always be able to + * allocate a bio. This is due to the mempool guarantees. To make this work, + * callers must never allocate more than 1 bio at a time from the general pool. + * Callers that need to allocate more than 1 bio must always submit the + * previously allocated bio for IO before attempting to allocate a new one. + * Failure to do so can cause deadlocks under memory pressure. * - * Note that when running under submit_bio_noacct() (i.e. any block - * driver), bios are not submitted until after you return - see the code in - * submit_bio_noacct() that converts recursion into iteration, to prevent - * stack overflows. + * Note that when running under submit_bio_noacct() (i.e. any block driver), + * bios are not submitted until after you return - see the code in + * submit_bio_noacct() that converts recursion into iteration, to prevent + * stack overflows. * - * This would normally mean allocating multiple bios under - * submit_bio_noacct() would be susceptible to deadlocks, but we have - * deadlock avoidance code that resubmits any blocked bios from a rescuer - * thread. + * This would normally mean allocating multiple bios under submit_bio_noacct() + * would be susceptible to deadlocks, but we have + * deadlock avoidance code that resubmits any blocked bios from a rescuer + * thread. * - * However, we do not guarantee forward progress for allocations from other - * mempools. Doing multiple allocations from the same mempool under - * submit_bio_noacct() should be avoided - instead, use bio_set's front_pad - * for per bio allocations. + * However, we do not guarantee forward progress for allocations from other + * mempools. Doing multiple allocations from the same mempool under + * submit_bio_noacct() should be avoided - instead, use bio_set's front_pad + * for per bio allocations. * - * RETURNS: - * Pointer to new bio on success, NULL on failure. + * Returns: Pointer to new bio on success, NULL on failure. */ struct bio *bio_alloc_bioset(gfp_t gfp_mask, unsigned int nr_iovecs, struct bio_set *bs) { gfp_t saved_gfp = gfp_mask; - unsigned front_pad; - unsigned inline_vecs; - struct bio_vec *bvl = NULL; struct bio *bio; void *p; - if (!bs) { - if (nr_iovecs > UIO_MAXIOV) - return NULL; - - p = kmalloc(struct_size(bio, bi_inline_vecs, nr_iovecs), gfp_mask); - front_pad = 0; - inline_vecs = nr_iovecs; - } else { - /* should not use nobvec bioset for nr_iovecs > 0 */ - if (WARN_ON_ONCE(!mempool_initialized(&bs->bvec_pool) && - nr_iovecs > 0)) - return NULL; - /* - * submit_bio_noacct() converts recursion to iteration; this - * means if we're running beneath it, any bios we allocate and - * submit will not be submitted (and thus freed) until after we - * return. - * - * This exposes us to a potential deadlock if we allocate - * multiple bios from the same bio_set() while running - * underneath submit_bio_noacct(). If we were to allocate - * multiple bios (say a stacking block driver that was splitting - * bios), we would deadlock if we exhausted the mempool's - * reserve. - * - * We solve this, and guarantee forward progress, with a rescuer - * workqueue per bio_set. If we go to allocate and there are - * bios on current->bio_list, we first try the allocation - * without __GFP_DIRECT_RECLAIM; if that fails, we punt those - * bios we would be blocking to the rescuer workqueue before - * we retry with the original gfp_flags. - */ - - if (current->bio_list && - (!bio_list_empty(¤t->bio_list[0]) || - !bio_list_empty(¤t->bio_list[1])) && - bs->rescue_workqueue) - gfp_mask &= ~__GFP_DIRECT_RECLAIM; + /* should not use nobvec bioset for nr_iovecs > 0 */ + if (WARN_ON_ONCE(!mempool_initialized(&bs->bvec_pool) && nr_iovecs > 0)) + return NULL; + /* + * submit_bio_noacct() converts recursion to iteration; this means if + * we're running beneath it, any bios we allocate and submit will not be + * submitted (and thus freed) until after we return. + * + * This exposes us to a potential deadlock if we allocate multiple bios + * from the same bio_set() while running underneath submit_bio_noacct(). + * If we were to allocate multiple bios (say a stacking block driver + * that was splitting bios), we would deadlock if we exhausted the + * mempool's reserve. + * + * We solve this, and guarantee forward progress, with a rescuer + * workqueue per bio_set. If we go to allocate and there are bios on + * current->bio_list, we first try the allocation without + * __GFP_DIRECT_RECLAIM; if that fails, we punt those bios we would be + * blocking to the rescuer workqueue before we retry with the original + * gfp_flags. + */ + if (current->bio_list && + (!bio_list_empty(¤t->bio_list[0]) || + !bio_list_empty(¤t->bio_list[1])) && + bs->rescue_workqueue) + gfp_mask &= ~__GFP_DIRECT_RECLAIM; + + p = mempool_alloc(&bs->bio_pool, gfp_mask); + if (!p && gfp_mask != saved_gfp) { + punt_bios_to_rescuer(bs); + gfp_mask = saved_gfp; p = mempool_alloc(&bs->bio_pool, gfp_mask); - if (!p && gfp_mask != saved_gfp) { - punt_bios_to_rescuer(bs); - gfp_mask = saved_gfp; - p = mempool_alloc(&bs->bio_pool, gfp_mask); - } - - front_pad = bs->front_pad; - inline_vecs = BIO_INLINE_VECS; } - if (unlikely(!p)) return NULL; - bio = p + front_pad; - bio_init(bio, NULL, 0); - - if (nr_iovecs > inline_vecs) { + bio = p + bs->front_pad; + if (nr_iovecs > BIO_INLINE_VECS) { unsigned long idx = 0; + struct bio_vec *bvl = NULL; bvl = bvec_alloc(gfp_mask, nr_iovecs, &idx, &bs->bvec_pool); if (!bvl && gfp_mask != saved_gfp) { punt_bios_to_rescuer(bs); gfp_mask = saved_gfp; - bvl = bvec_alloc(gfp_mask, nr_iovecs, &idx, &bs->bvec_pool); + bvl = bvec_alloc(gfp_mask, nr_iovecs, &idx, + &bs->bvec_pool); } if (unlikely(!bvl)) goto err_free; bio->bi_flags |= idx << BVEC_POOL_OFFSET; - bio->bi_max_vecs = bvec_nr_vecs(idx); + bio_init(bio, bvl, bvec_nr_vecs(idx)); } else if (nr_iovecs) { - bvl = bio->bi_inline_vecs; - bio->bi_max_vecs = inline_vecs; + bio_init(bio, bio->bi_inline_vecs, BIO_INLINE_VECS); + } else { + bio_init(bio, NULL, 0); } bio->bi_pool = bs; - bio->bi_io_vec = bvl; return bio; err_free: @@ -521,6 +499,31 @@ err_free: } EXPORT_SYMBOL(bio_alloc_bioset); +/** + * bio_kmalloc - kmalloc a bio for I/O + * @gfp_mask: the GFP_* mask given to the slab allocator + * @nr_iovecs: number of iovecs to pre-allocate + * + * Use kmalloc to allocate and initialize a bio. + * + * Returns: Pointer to new bio on success, NULL on failure. + */ +struct bio *bio_kmalloc(gfp_t gfp_mask, unsigned int nr_iovecs) +{ + struct bio *bio; + + if (nr_iovecs > UIO_MAXIOV) + return NULL; + + bio = kmalloc(struct_size(bio, bi_inline_vecs, nr_iovecs), gfp_mask); + if (unlikely(!bio)) + return NULL; + bio_init(bio, nr_iovecs ? bio->bi_inline_vecs : NULL, nr_iovecs); + bio->bi_pool = NULL; + return bio; +} +EXPORT_SYMBOL(bio_kmalloc); + void zero_fill_bio_iter(struct bio *bio, struct bvec_iter start) { unsigned long flags; diff --git a/include/linux/bio.h b/include/linux/bio.h index 676870b2c88d..c74857cf1252 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -408,6 +408,7 @@ extern int biovec_init_pool(mempool_t *pool, int pool_entries); extern int bioset_init_from_src(struct bio_set *bs, struct bio_set *src); extern struct bio *bio_alloc_bioset(gfp_t, unsigned int, struct bio_set *); +struct bio *bio_kmalloc(gfp_t gfp_mask, unsigned int nr_iovecs); extern void bio_put(struct bio *); extern void __bio_clone_fast(struct bio *, struct bio *); @@ -420,11 +421,6 @@ static inline struct bio *bio_alloc(gfp_t gfp_mask, unsigned int nr_iovecs) return bio_alloc_bioset(gfp_mask, nr_iovecs, &fs_bio_set); } -static inline struct bio *bio_kmalloc(gfp_t gfp_mask, unsigned int nr_iovecs) -{ - return bio_alloc_bioset(gfp_mask, nr_iovecs, NULL); -} - extern blk_qc_t submit_bio(struct bio *); extern void bio_endio(struct bio *); -- cgit v1.2.3 From 6ac0b71537e1c14e7532408fe4aae553aa314237 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 2 Feb 2021 18:19:20 +0100 Subject: block: move struct biovec_slab to bio.c struct biovec_slab is only used inside of bio.c, so move it there. Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- block/bio.c | 6 ++++++ include/linux/bio.h | 6 ------ 2 files changed, 6 insertions(+), 6 deletions(-) (limited to 'include/linux/bio.h') diff --git a/block/bio.c b/block/bio.c index cee2d310f02e..2c359dadfdf6 100644 --- a/block/bio.c +++ b/block/bio.c @@ -25,6 +25,12 @@ #include "blk.h" #include "blk-rq-qos.h" +struct biovec_slab { + int nr_vecs; + char *name; + struct kmem_cache *slab; +}; + /* * if you change this list, also change bvec_alloc or things will * break badly! cannot be bigger than what you can fit into an diff --git a/include/linux/bio.h b/include/linux/bio.h index c74857cf1252..4a84207dd996 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -720,12 +720,6 @@ struct bio_set { struct workqueue_struct *rescue_workqueue; }; -struct biovec_slab { - int nr_vecs; - char *name; - struct kmem_cache *slab; -}; - static inline bool bioset_initialized(struct bio_set *bs) { return bs->bio_slab != NULL; -- cgit v1.2.3 From 0f2e6ab851ae146c468bc5151c302c6e2473f70a Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 2 Feb 2021 18:19:24 +0100 Subject: block: turn the nr_iovecs argument to bio_alloc* into an unsigned short The bi_max_vecs and bi_vcnt fields are defined as unsigned short, so don't allow passing larger values in. Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- block/bio.c | 4 ++-- include/linux/bio.h | 7 ++++--- 2 files changed, 6 insertions(+), 5 deletions(-) (limited to 'include/linux/bio.h') diff --git a/block/bio.c b/block/bio.c index ae241252ea14..3d28d4723f6f 100644 --- a/block/bio.c +++ b/block/bio.c @@ -407,7 +407,7 @@ static void punt_bios_to_rescuer(struct bio_set *bs) * * Returns: Pointer to new bio on success, NULL on failure. */ -struct bio *bio_alloc_bioset(gfp_t gfp_mask, unsigned int nr_iovecs, +struct bio *bio_alloc_bioset(gfp_t gfp_mask, unsigned short nr_iovecs, struct bio_set *bs) { gfp_t saved_gfp = gfp_mask; @@ -493,7 +493,7 @@ EXPORT_SYMBOL(bio_alloc_bioset); * * Returns: Pointer to new bio on success, NULL on failure. */ -struct bio *bio_kmalloc(gfp_t gfp_mask, unsigned int nr_iovecs) +struct bio *bio_kmalloc(gfp_t gfp_mask, unsigned short nr_iovecs) { struct bio *bio; diff --git a/include/linux/bio.h b/include/linux/bio.h index 4a84207dd996..9ceeb8ecdb7f 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -407,8 +407,9 @@ extern void bioset_exit(struct bio_set *); extern int biovec_init_pool(mempool_t *pool, int pool_entries); extern int bioset_init_from_src(struct bio_set *bs, struct bio_set *src); -extern struct bio *bio_alloc_bioset(gfp_t, unsigned int, struct bio_set *); -struct bio *bio_kmalloc(gfp_t gfp_mask, unsigned int nr_iovecs); +struct bio *bio_alloc_bioset(gfp_t gfp, unsigned short nr_iovecs, + struct bio_set *bs); +struct bio *bio_kmalloc(gfp_t gfp_mask, unsigned short nr_iovecs); extern void bio_put(struct bio *); extern void __bio_clone_fast(struct bio *, struct bio *); @@ -416,7 +417,7 @@ extern struct bio *bio_clone_fast(struct bio *, gfp_t, struct bio_set *); extern struct bio_set fs_bio_set; -static inline struct bio *bio_alloc(gfp_t gfp_mask, unsigned int nr_iovecs) +static inline struct bio *bio_alloc(gfp_t gfp_mask, unsigned short nr_iovecs) { return bio_alloc_bioset(gfp_mask, nr_iovecs, &fs_bio_set); } -- cgit v1.2.3 From 7a800a20ae6329e803c5c646b20811a6ae9ca136 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 2 Feb 2021 18:19:29 +0100 Subject: block: use bi_max_vecs to find the bvec pool Instead of encoding of the bvec pool using magic bio flags, just use a helper to find the pool based on the max_vecs value. Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- block/bio-integrity.c | 11 ++--- block/bio.c | 104 +++++++++++++++++++--------------------------- block/blk.h | 6 +-- include/linux/bio.h | 1 - include/linux/blk_types.h | 29 +------------ 5 files changed, 51 insertions(+), 100 deletions(-) (limited to 'include/linux/bio.h') diff --git a/block/bio-integrity.c b/block/bio-integrity.c index 19617fa326c3..dfa652122a2d 100644 --- a/block/bio-integrity.c +++ b/block/bio-integrity.c @@ -28,7 +28,7 @@ static void __bio_integrity_free(struct bio_set *bs, if (bs && mempool_initialized(&bs->bio_integrity_pool)) { if (bip->bip_vec) bvec_free(&bs->bvec_integrity_pool, bip->bip_vec, - bip->bip_slab); + bip->bip_max_vcnt); mempool_free(bip, &bs->bio_integrity_pool); } else { kfree(bip); @@ -70,14 +70,11 @@ struct bio_integrity_payload *bio_integrity_alloc(struct bio *bio, memset(bip, 0, sizeof(*bip)); if (nr_vecs > inline_vecs) { - unsigned long idx = 0; - - bip->bip_vec = bvec_alloc(gfp_mask, nr_vecs, &idx, - &bs->bvec_integrity_pool); + bip->bip_max_vcnt = nr_vecs; + bip->bip_vec = bvec_alloc(&bs->bvec_integrity_pool, + &bip->bip_max_vcnt, gfp_mask); if (!bip->bip_vec) goto err; - bip->bip_max_vcnt = bvec_nr_vecs(idx); - bip->bip_slab = idx; } else { bip->bip_vec = bip->bip_inline_vecs; bip->bip_max_vcnt = inline_vecs; diff --git a/block/bio.c b/block/bio.c index a36f955cd120..a0eabe2f8b07 100644 --- a/block/bio.c +++ b/block/bio.c @@ -36,6 +36,24 @@ static struct biovec_slab { { .nr_vecs = BIO_MAX_PAGES, .name = "biovec-max" }, }; +static struct biovec_slab *biovec_slab(unsigned short nr_vecs) +{ + switch (nr_vecs) { + /* smaller bios use inline vecs */ + case 5 ... 16: + return &bvec_slabs[0]; + case 17 ... 64: + return &bvec_slabs[1]; + case 65 ... 128: + return &bvec_slabs[2]; + case 129 ... BIO_MAX_PAGES: + return &bvec_slabs[3]; + default: + BUG(); + return NULL; + } +} + /* * fs_bio_set is the bio_set containing bio and iovec memory pools used by * IO code that does not need private memory pools. @@ -131,26 +149,14 @@ out: mutex_unlock(&bio_slab_lock); } -unsigned int bvec_nr_vecs(unsigned short idx) +void bvec_free(mempool_t *pool, struct bio_vec *bv, unsigned short nr_vecs) { - return bvec_slabs[--idx].nr_vecs; -} + BIO_BUG_ON(nr_vecs > BIO_MAX_PAGES); -void bvec_free(mempool_t *pool, struct bio_vec *bv, unsigned int idx) -{ - if (!idx) - return; - idx--; - - BIO_BUG_ON(idx >= BVEC_POOL_NR); - - if (idx == BVEC_POOL_MAX) { + if (nr_vecs == BIO_MAX_PAGES) mempool_free(bv, pool); - } else { - struct biovec_slab *bvs = bvec_slabs + idx; - - kmem_cache_free(bvs->slab, bv); - } + else if (nr_vecs > BIO_INLINE_VECS) + kmem_cache_free(biovec_slab(nr_vecs)->slab, bv); } /* @@ -163,48 +169,34 @@ static inline gfp_t bvec_alloc_gfp(gfp_t gfp) __GFP_NOMEMALLOC | __GFP_NORETRY | __GFP_NOWARN; } -struct bio_vec *bvec_alloc(gfp_t gfp_mask, int nr, unsigned long *idx, - mempool_t *pool) +struct bio_vec *bvec_alloc(mempool_t *pool, unsigned short *nr_vecs, + gfp_t gfp_mask) { + struct biovec_slab *bvs = biovec_slab(*nr_vecs); + + if (WARN_ON_ONCE(!bvs)) + return NULL; + /* - * see comment near bvec_array define! + * Upgrade the nr_vecs request to take full advantage of the allocation. + * We also rely on this in the bvec_free path. */ - switch (nr) { - /* smaller bios use inline vecs */ - case 5 ... 16: - *idx = 2; - break; - case 17 ... 64: - *idx = 3; - break; - case 65 ... 128: - *idx = 4; - break; - case 129 ... BIO_MAX_PAGES: - *idx = 5; - break; - default: - return NULL; - } + *nr_vecs = bvs->nr_vecs; /* * Try a slab allocation first for all smaller allocations. If that * fails and __GFP_DIRECT_RECLAIM is set retry with the mempool. * The mempool is sized to handle up to BIO_MAX_PAGES entries. */ - if (*idx < BVEC_POOL_MAX) { - struct biovec_slab *bvs = bvec_slabs + *idx; + if (*nr_vecs < BIO_MAX_PAGES) { struct bio_vec *bvl; bvl = kmem_cache_alloc(bvs->slab, bvec_alloc_gfp(gfp_mask)); - if (likely(bvl) || !(gfp_mask & __GFP_DIRECT_RECLAIM)) { - (*idx)++; + if (likely(bvl) || !(gfp_mask & __GFP_DIRECT_RECLAIM)) return bvl; - } - *idx = BVEC_POOL_MAX; + *nr_vecs = BIO_MAX_PAGES; } - (*idx)++; return mempool_alloc(pool, gfp_mask); } @@ -231,7 +223,7 @@ static void bio_free(struct bio *bio) bio_uninit(bio); if (bs) { - bvec_free(&bs->bvec_pool, bio->bi_io_vec, BVEC_POOL_IDX(bio)); + bvec_free(&bs->bvec_pool, bio->bi_io_vec, bio->bi_max_vecs); /* * If we have front padding, adjust the bio pointer before freeing @@ -275,12 +267,8 @@ EXPORT_SYMBOL(bio_init); */ void bio_reset(struct bio *bio) { - unsigned long flags = bio->bi_flags & (~0UL << BIO_RESET_BITS); - bio_uninit(bio); - memset(bio, 0, BIO_RESET_BYTES); - bio->bi_flags = flags; atomic_set(&bio->__bi_remaining, 1); } EXPORT_SYMBOL(bio_reset); @@ -453,22 +441,18 @@ struct bio *bio_alloc_bioset(gfp_t gfp_mask, unsigned short nr_iovecs, bio = p + bs->front_pad; if (nr_iovecs > BIO_INLINE_VECS) { - unsigned long idx = 0; struct bio_vec *bvl = NULL; - bvl = bvec_alloc(gfp_mask, nr_iovecs, &idx, &bs->bvec_pool); + bvl = bvec_alloc(&bs->bvec_pool, &nr_iovecs, gfp_mask); if (!bvl && gfp_mask != saved_gfp) { punt_bios_to_rescuer(bs); gfp_mask = saved_gfp; - bvl = bvec_alloc(gfp_mask, nr_iovecs, &idx, - &bs->bvec_pool); + bvl = bvec_alloc(&bs->bvec_pool, &nr_iovecs, gfp_mask); } - if (unlikely(!bvl)) goto err_free; - bio_init(bio, bvl, bvec_nr_vecs(idx)); - bio->bi_flags |= idx << BVEC_POOL_OFFSET; + bio_init(bio, bvl, nr_iovecs); } else if (nr_iovecs) { bio_init(bio, bio->bi_inline_vecs, BIO_INLINE_VECS); } else { @@ -644,7 +628,7 @@ EXPORT_SYMBOL(bio_put); */ void __bio_clone_fast(struct bio *bio, struct bio *bio_src) { - BUG_ON(bio->bi_pool && BVEC_POOL_IDX(bio)); + WARN_ON_ONCE(bio->bi_pool && bio->bi_max_vecs); /* * most users will be overriding ->bi_bdev with a new target, @@ -934,7 +918,7 @@ EXPORT_SYMBOL_GPL(bio_release_pages); static int bio_iov_bvec_set(struct bio *bio, struct iov_iter *iter) { - WARN_ON_ONCE(BVEC_POOL_IDX(bio) != 0); + WARN_ON_ONCE(bio->bi_max_vecs); bio->bi_vcnt = iter->nr_segs; bio->bi_io_vec = (struct bio_vec *)iter->bvec; @@ -1495,7 +1479,7 @@ EXPORT_SYMBOL_GPL(bio_trim); */ int biovec_init_pool(mempool_t *pool, int pool_entries) { - struct biovec_slab *bp = bvec_slabs + BVEC_POOL_MAX; + struct biovec_slab *bp = bvec_slabs + ARRAY_SIZE(bvec_slabs) - 1; return mempool_init_slab_pool(pool, pool_entries, bp->slab); } @@ -1605,8 +1589,6 @@ static int __init init_bio(void) { int i; - BUILD_BUG_ON(BIO_FLAG_LAST > BVEC_POOL_OFFSET); - bio_integrity_init(); for (i = 0; i < ARRAY_SIZE(bvec_slabs); i++) { diff --git a/block/blk.h b/block/blk.h index e022a0d0f2ce..bfc4d526f626 100644 --- a/block/blk.h +++ b/block/blk.h @@ -56,9 +56,9 @@ void blk_free_flush_queue(struct blk_flush_queue *q); void blk_freeze_queue(struct request_queue *q); #define BIO_INLINE_VECS 4 -struct bio_vec *bvec_alloc(gfp_t, int, unsigned long *, mempool_t *); -void bvec_free(mempool_t *, struct bio_vec *, unsigned int); -unsigned int bvec_nr_vecs(unsigned short idx); +struct bio_vec *bvec_alloc(mempool_t *pool, unsigned short *nr_vecs, + gfp_t gfp_mask); +void bvec_free(mempool_t *pool, struct bio_vec *bv, unsigned short nr_vecs); static inline bool biovec_phys_mergeable(struct request_queue *q, struct bio_vec *vec1, struct bio_vec *vec2) diff --git a/include/linux/bio.h b/include/linux/bio.h index 9ceeb8ecdb7f..3cbbaf76906e 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -329,7 +329,6 @@ struct bio_integrity_payload { struct bvec_iter bip_iter; - unsigned short bip_slab; /* slab the bip came from */ unsigned short bip_vcnt; /* # of integrity bio_vecs */ unsigned short bip_max_vcnt; /* integrity bio_vec slots */ unsigned short bip_flags; /* control flags */ diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index 1bc6f6a01070..db026b6ec15a 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h @@ -227,7 +227,7 @@ struct bio { * top bits REQ_OP. Use * accessors. */ - unsigned short bi_flags; /* status, etc and bvec pool number */ + unsigned short bi_flags; /* BIO_* below */ unsigned short bi_ioprio; unsigned short bi_write_hint; blk_status_t bi_status; @@ -307,33 +307,6 @@ enum { BIO_FLAG_LAST }; -/* See BVEC_POOL_OFFSET below before adding new flags */ - -/* - * We support 6 different bvec pools, the last one is magic in that it - * is backed by a mempool. - */ -#define BVEC_POOL_NR 6 -#define BVEC_POOL_MAX (BVEC_POOL_NR - 1) - -/* - * Top 3 bits of bio flags indicate the pool the bvecs came from. We add - * 1 to the actual index so that 0 indicates that there are no bvecs to be - * freed. - */ -#define BVEC_POOL_BITS (3) -#define BVEC_POOL_OFFSET (16 - BVEC_POOL_BITS) -#define BVEC_POOL_IDX(bio) ((bio)->bi_flags >> BVEC_POOL_OFFSET) -#if (1<< BVEC_POOL_BITS) < (BVEC_POOL_NR+1) -# error "BVEC_POOL_BITS is too small" -#endif - -/* - * Flags starting here get preserved by bio_reset() - this includes - * only BVEC_POOL_IDX() - */ -#define BIO_RESET_BITS BVEC_POOL_OFFSET - typedef __u32 __bitwise blk_mq_req_flags_t; /* -- cgit v1.2.3