From 23a55f4492fcf868d068da31a2cd30c15f46207d Mon Sep 17 00:00:00 2001 From: Ofir Gal Date: Thu, 18 Jul 2024 11:45:12 +0300 Subject: net: introduce helper sendpages_ok() Network drivers are using sendpage_ok() to check the first page of an iterator in order to disable MSG_SPLICE_PAGES. The iterator can represent list of contiguous pages. When MSG_SPLICE_PAGES is enabled skb_splice_from_iter() is being used, it requires all pages in the iterator to be sendable. Therefore it needs to check that each page is sendable. The patch introduces a helper sendpages_ok(), it returns true if all the contiguous pages are sendable. Drivers who want to send contiguous pages with MSG_SPLICE_PAGES may use this helper to check whether the page list is OK. If the helper does not return true, the driver should remove MSG_SPLICE_PAGES flag. Reviewed-by: Christoph Hellwig Reviewed-by: Sagi Grimberg Signed-off-by: Ofir Gal Acked-by: Jakub Kicinski Link: https://lore.kernel.org/r/20240718084515.3833733-2-ofir.gal@volumez.com Signed-off-by: Jens Axboe --- include/linux/net.h | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) (limited to 'include') diff --git a/include/linux/net.h b/include/linux/net.h index 688320b79fcc..b75bc534c1b3 100644 --- a/include/linux/net.h +++ b/include/linux/net.h @@ -322,6 +322,25 @@ static inline bool sendpage_ok(struct page *page) return !PageSlab(page) && page_count(page) >= 1; } +/* + * Check sendpage_ok on contiguous pages. + */ +static inline bool sendpages_ok(struct page *page, size_t len, size_t offset) +{ + struct page *p = page + (offset >> PAGE_SHIFT); + size_t count = 0; + + while (count < len) { + if (!sendpage_ok(p)) + return false; + + p++; + count += PAGE_SIZE; + } + + return true; +} + int kernel_sendmsg(struct socket *sock, struct msghdr *msg, struct kvec *vec, size_t num, size_t len); int kernel_sendmsg_locked(struct sock *sk, struct msghdr *msg, -- cgit v1.2.3 From 7543ae2269a855683a39af57048035f44bc8ef9c Mon Sep 17 00:00:00 2001 From: Wouter Verhelst Date: Thu, 25 Jul 2024 18:45:36 +0200 Subject: nbd: add support for rotational devices The NBD protocol defines the flag NBD_FLAG_ROTATIONAL to flag that the export in use should be treated as a rotational device. Add support for that flag to the kernel driver. Signed-off-by: Wouter Verhelst Reviewed-by: Eric Blake Link: https://lore.kernel.org/r/20240725164536.1275851-1-w@uter.be Signed-off-by: Jens Axboe --- drivers/block/nbd.c | 3 +++ include/uapi/linux/nbd.h | 3 ++- 2 files changed, 5 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c index 41a90150b501..5b1811b1ba5f 100644 --- a/drivers/block/nbd.c +++ b/drivers/block/nbd.c @@ -350,6 +350,9 @@ static int __nbd_set_size(struct nbd_device *nbd, loff_t bytesize, lim.features |= BLK_FEAT_WRITE_CACHE; lim.features &= ~BLK_FEAT_FUA; } + if (nbd->config->flags & NBD_FLAG_ROTATIONAL) + lim.features |= BLK_FEAT_ROTATIONAL; + lim.logical_block_size = blksize; lim.physical_block_size = blksize; error = queue_limits_commit_update(nbd->disk->queue, &lim); diff --git a/include/uapi/linux/nbd.h b/include/uapi/linux/nbd.h index 80ce0ef43afd..d75215f2c675 100644 --- a/include/uapi/linux/nbd.h +++ b/include/uapi/linux/nbd.h @@ -51,8 +51,9 @@ enum { #define NBD_FLAG_READ_ONLY (1 << 1) /* device is read-only */ #define NBD_FLAG_SEND_FLUSH (1 << 2) /* can flush writeback cache */ #define NBD_FLAG_SEND_FUA (1 << 3) /* send FUA (forced unit access) */ -/* there is a gap here to match userspace */ +#define NBD_FLAG_ROTATIONAL (1 << 4) /* device is rotational */ #define NBD_FLAG_SEND_TRIM (1 << 5) /* send trim/discard */ +/* there is a gap here to match userspace */ #define NBD_FLAG_CAN_MULTI_CONN (1 << 8) /* Server supports multiple connections per export. */ /* values for cmd flags in the upper 16 bits of request type */ -- cgit v1.2.3 From 5bc46b49c828a6dfaab80b71ecb63fe76a1096d2 Mon Sep 17 00:00:00 2001 From: Hannes Reinecke Date: Mon, 22 Jul 2024 14:02:20 +0200 Subject: nvme-tcp: check for invalidated or revoked key key_lookup() will always return a key, even if that key is revoked or invalidated. So check for invalid keys before continuing. Signed-off-by: Hannes Reinecke Reviewed-by: Sagi Grimberg Reviewed-by: Christoph Hellwig Signed-off-by: Keith Busch --- drivers/nvme/common/keyring.c | 22 ++++++++++++++++++++++ drivers/nvme/host/Kconfig | 1 + drivers/nvme/host/fabrics.c | 2 +- drivers/nvme/host/tcp.c | 2 +- include/linux/nvme-keyring.h | 6 +++++- 5 files changed, 30 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/drivers/nvme/common/keyring.c b/drivers/nvme/common/keyring.c index 05e89307c8aa..ed5167f942d8 100644 --- a/drivers/nvme/common/keyring.c +++ b/drivers/nvme/common/keyring.c @@ -20,6 +20,28 @@ key_serial_t nvme_keyring_id(void) } EXPORT_SYMBOL_GPL(nvme_keyring_id); +static bool nvme_tls_psk_revoked(struct key *psk) +{ + return test_bit(KEY_FLAG_REVOKED, &psk->flags) || + test_bit(KEY_FLAG_INVALIDATED, &psk->flags); +} + +struct key *nvme_tls_key_lookup(key_serial_t key_id) +{ + struct key *key = key_lookup(key_id); + + if (IS_ERR(key)) { + pr_err("key id %08x not found\n", key_id); + return key; + } + if (nvme_tls_psk_revoked(key)) { + pr_err("key id %08x revoked\n", key_id); + return ERR_PTR(-EKEYREVOKED); + } + return key; +} +EXPORT_SYMBOL_GPL(nvme_tls_key_lookup); + static void nvme_tls_psk_describe(const struct key *key, struct seq_file *m) { seq_puts(m, key->description); diff --git a/drivers/nvme/host/Kconfig b/drivers/nvme/host/Kconfig index a3caef75aa0a..883aaab2d83e 100644 --- a/drivers/nvme/host/Kconfig +++ b/drivers/nvme/host/Kconfig @@ -109,6 +109,7 @@ config NVME_HOST_AUTH bool "NVMe over Fabrics In-Band Authentication in host side" depends on NVME_CORE select NVME_AUTH + select NVME_KEYRING if NVME_TCP_TLS help This provides support for NVMe over Fabrics In-Band Authentication in host side. diff --git a/drivers/nvme/host/fabrics.c b/drivers/nvme/host/fabrics.c index f5f545fa0103..432efcbf9e2f 100644 --- a/drivers/nvme/host/fabrics.c +++ b/drivers/nvme/host/fabrics.c @@ -665,7 +665,7 @@ static struct key *nvmf_parse_key(int key_id) return ERR_PTR(-EINVAL); } - key = key_lookup(key_id); + key = nvme_tls_key_lookup(key_id); if (IS_ERR(key)) pr_err("key id %08x not found\n", key_id); else diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c index b40e95bee849..89c44413c593 100644 --- a/drivers/nvme/host/tcp.c +++ b/drivers/nvme/host/tcp.c @@ -1596,7 +1596,7 @@ static void nvme_tcp_tls_done(void *data, int status, key_serial_t pskid) goto out_complete; } - tls_key = key_lookup(pskid); + tls_key = nvme_tls_key_lookup(pskid); if (IS_ERR(tls_key)) { dev_warn(ctrl->ctrl.device, "queue %d: Invalid key %x\n", qid, pskid); diff --git a/include/linux/nvme-keyring.h b/include/linux/nvme-keyring.h index e10333d78dbb..19d2b256180f 100644 --- a/include/linux/nvme-keyring.h +++ b/include/linux/nvme-keyring.h @@ -12,7 +12,7 @@ key_serial_t nvme_tls_psk_default(struct key *keyring, const char *hostnqn, const char *subnqn); key_serial_t nvme_keyring_id(void); - +struct key *nvme_tls_key_lookup(key_serial_t key_id); #else static inline key_serial_t nvme_tls_psk_default(struct key *keyring, @@ -24,5 +24,9 @@ static inline key_serial_t nvme_keyring_id(void) { return 0; } +static inline struct key *nvme_tls_key_lookup(key_serial_t key_id) +{ + return ERR_PTR(-ENOTSUPP); +} #endif /* !CONFIG_NVME_KEYRING */ #endif /* _NVME_KEYRING_H */ -- cgit v1.2.3 From 03c3d7c74371a46d967fbf41628874ec04ddda96 Mon Sep 17 00:00:00 2001 From: Niklas Cassel Date: Thu, 15 Aug 2024 22:11:31 +0200 Subject: nvme-rdma: send cntlid in the RDMA_CM_REQUEST Private Data When sending a RDMA_CM_REQUEST, the NVMe RDMA Transport Specification allows you to populate the cntlid field in the RDMA_CM_REQUEST Private Data. The cntlid is returned by the target on completion of the first RDMA_CM_REQUEST command (which creates the admin queue). The cntlid field can then be populated by the host when the I/O queues are created (using additional RDMA_CM_REQUEST commands), such that the target can perform extra validation for additional RDMA_CM_REQUEST commands. This additional error code and error message is also added, such that nvme_rdma_cm_msg() will display the proper error message if the target fails the RDMA_CM_REQUEST command because of this extra validation. Signed-off-by: Niklas Cassel Reviewed-by: Sagi Grimberg Reviewed-by: Christoph Hellwig Signed-off-by: Keith Busch --- drivers/nvme/host/rdma.c | 2 ++ include/linux/nvme-rdma.h | 6 +++++- 2 files changed, 7 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c index 2eb33842f971..e3520a91147b 100644 --- a/drivers/nvme/host/rdma.c +++ b/drivers/nvme/host/rdma.c @@ -1876,6 +1876,8 @@ static int nvme_rdma_route_resolved(struct nvme_rdma_queue *queue) */ priv.hrqsize = cpu_to_le16(queue->queue_size); priv.hsqsize = cpu_to_le16(queue->ctrl->ctrl.sqsize); + /* cntlid should only be set when creating an I/O queue */ + priv.cntlid = cpu_to_le16(ctrl->ctrl.cntlid); } ret = rdma_connect_locked(queue->cm_id, ¶m); diff --git a/include/linux/nvme-rdma.h b/include/linux/nvme-rdma.h index eb2f04d636c8..97c5f00b9aa3 100644 --- a/include/linux/nvme-rdma.h +++ b/include/linux/nvme-rdma.h @@ -25,6 +25,7 @@ enum nvme_rdma_cm_status { NVME_RDMA_CM_NO_RSC = 0x06, NVME_RDMA_CM_INVALID_IRD = 0x07, NVME_RDMA_CM_INVALID_ORD = 0x08, + NVME_RDMA_CM_INVALID_CNTLID = 0x09, }; static inline const char *nvme_rdma_cm_msg(enum nvme_rdma_cm_status status) @@ -46,6 +47,8 @@ static inline const char *nvme_rdma_cm_msg(enum nvme_rdma_cm_status status) return "invalid IRD"; case NVME_RDMA_CM_INVALID_ORD: return "Invalid ORD"; + case NVME_RDMA_CM_INVALID_CNTLID: + return "invalid controller ID"; default: return "unrecognized reason"; } @@ -64,7 +67,8 @@ struct nvme_rdma_cm_req { __le16 qid; __le16 hrqsize; __le16 hsqsize; - u8 rsvd[24]; + __le16 cntlid; + u8 rsvd[22]; }; /** -- cgit v1.2.3 From cead0b8991713bdeb5b947758dd62287fcf8da40 Mon Sep 17 00:00:00 2001 From: Anuj Gupta Date: Mon, 26 Aug 2024 16:09:43 +0530 Subject: nvme: rename apptag and appmask to lbat and lbatm Rename apptag and appmask to lbat and lbatm so that it matches the field names used in NVMe spec. Signed-off-by: Anuj Gupta Signed-off-by: Kanchan Joshi Suggested-by: Christoph Hellwig Reviewed-by: Sagi Grimberg Signed-off-by: Keith Busch --- drivers/nvme/host/core.c | 4 ++-- drivers/nvme/host/ioctl.c | 4 ++-- drivers/nvme/host/rdma.c | 4 ++-- drivers/nvme/target/rdma.c | 4 ++-- include/linux/nvme.h | 8 ++++---- 5 files changed, 12 insertions(+), 12 deletions(-) (limited to 'include') diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 93aa4c10adb7..75ab62cb4aa4 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -987,8 +987,8 @@ static inline blk_status_t nvme_setup_rw(struct nvme_ns *ns, cmnd->rw.length = cpu_to_le16((blk_rq_bytes(req) >> ns->head->lba_shift) - 1); cmnd->rw.reftag = 0; - cmnd->rw.apptag = 0; - cmnd->rw.appmask = 0; + cmnd->rw.lbat = 0; + cmnd->rw.lbatm = 0; if (ns->head->ms) { /* diff --git a/drivers/nvme/host/ioctl.c b/drivers/nvme/host/ioctl.c index f1d58e70933f..850f81e08e7d 100644 --- a/drivers/nvme/host/ioctl.c +++ b/drivers/nvme/host/ioctl.c @@ -260,8 +260,8 @@ static int nvme_submit_io(struct nvme_ns *ns, struct nvme_user_io __user *uio) c.rw.control = cpu_to_le16(io.control); c.rw.dsmgmt = cpu_to_le32(io.dsmgmt); c.rw.reftag = cpu_to_le32(io.reftag); - c.rw.apptag = cpu_to_le16(io.apptag); - c.rw.appmask = cpu_to_le16(io.appmask); + c.rw.lbat = cpu_to_le16(io.apptag); + c.rw.lbatm = cpu_to_le16(io.appmask); return nvme_submit_user_cmd(ns->queue, &c, io.addr, length, metadata, meta_len, lower_32_bits(io.slba), NULL, 0, 0); diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c index e3520a91147b..15b5e06039a5 100644 --- a/drivers/nvme/host/rdma.c +++ b/drivers/nvme/host/rdma.c @@ -1363,8 +1363,8 @@ static void nvme_rdma_set_sig_domain(struct blk_integrity *bi, if (control & NVME_RW_PRINFO_PRCHK_REF) domain->sig.dif.ref_remap = true; - domain->sig.dif.app_tag = le16_to_cpu(cmd->rw.apptag); - domain->sig.dif.apptag_check_mask = le16_to_cpu(cmd->rw.appmask); + domain->sig.dif.app_tag = le16_to_cpu(cmd->rw.lbat); + domain->sig.dif.apptag_check_mask = le16_to_cpu(cmd->rw.lbatm); domain->sig.dif.app_escape = true; if (pi_type == NVME_NS_DPS_PI_TYPE3) domain->sig.dif.ref_escape = true; diff --git a/drivers/nvme/target/rdma.c b/drivers/nvme/target/rdma.c index 1eff8ca6a5f1..1b6264fa5803 100644 --- a/drivers/nvme/target/rdma.c +++ b/drivers/nvme/target/rdma.c @@ -578,8 +578,8 @@ static void nvmet_rdma_set_sig_domain(struct blk_integrity *bi, if (control & NVME_RW_PRINFO_PRCHK_REF) domain->sig.dif.ref_remap = true; - domain->sig.dif.app_tag = le16_to_cpu(cmd->rw.apptag); - domain->sig.dif.apptag_check_mask = le16_to_cpu(cmd->rw.appmask); + domain->sig.dif.app_tag = le16_to_cpu(cmd->rw.lbat); + domain->sig.dif.apptag_check_mask = le16_to_cpu(cmd->rw.lbatm); domain->sig.dif.app_escape = true; if (pi_type == NVME_NS_DPS_PI_TYPE3) domain->sig.dif.ref_escape = true; diff --git a/include/linux/nvme.h b/include/linux/nvme.h index 7b2ae2e43544..b58d9405d65e 100644 --- a/include/linux/nvme.h +++ b/include/linux/nvme.h @@ -987,8 +987,8 @@ struct nvme_rw_command { __le16 control; __le32 dsmgmt; __le32 reftag; - __le16 apptag; - __le16 appmask; + __le16 lbat; + __le16 lbatm; }; enum { @@ -1057,8 +1057,8 @@ struct nvme_write_zeroes_cmd { __le16 control; __le32 dsmgmt; __le32 reftag; - __le16 apptag; - __le16 appmask; + __le16 lbat; + __le16 lbatm; }; enum nvme_zone_mgmt_action { -- cgit v1.2.3 From b35243a447b9fe6457fa8e1352152b818436ba5a Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 26 Aug 2024 19:37:54 +0200 Subject: block: rework bio splitting The current setup with bio_may_exceed_limit and __bio_split_to_limits is a bit of a mess. Change it so that __bio_split_to_limits does all the work and is just a variant of bio_split_to_limits that returns nr_segs. This is done by inlining it and instead have the various bio_split_* helpers directly submit the potentially split bios. To support btrfs, the rw version has a lower level helper split out that just returns the offset to split. This turns out to nicely clean up the btrfs flow as well. Signed-off-by: Christoph Hellwig Acked-by: David Sterba Reviewed-by: Damien Le Moal Tested-by: Hans Holmberg Reviewed-by: Hans Holmberg Link: https://lore.kernel.org/r/20240826173820.1690925-2-hch@lst.de Signed-off-by: Jens Axboe --- block/blk-merge.c | 146 ++++++++++++++++++++-------------------------------- block/blk-mq.c | 11 ++-- block/blk.h | 63 ++++++++++++++++------- fs/btrfs/bio.c | 30 ++++++----- include/linux/bio.h | 4 +- 5 files changed, 125 insertions(+), 129 deletions(-) (limited to 'include') diff --git a/block/blk-merge.c b/block/blk-merge.c index de5281bcadc5..c7222c4685e0 100644 --- a/block/blk-merge.c +++ b/block/blk-merge.c @@ -105,9 +105,33 @@ static unsigned int bio_allowed_max_sectors(const struct queue_limits *lim) return round_down(UINT_MAX, lim->logical_block_size) >> SECTOR_SHIFT; } -static struct bio *bio_split_discard(struct bio *bio, - const struct queue_limits *lim, - unsigned *nsegs, struct bio_set *bs) +static struct bio *bio_submit_split(struct bio *bio, int split_sectors) +{ + if (unlikely(split_sectors < 0)) { + bio->bi_status = errno_to_blk_status(split_sectors); + bio_endio(bio); + return NULL; + } + + if (split_sectors) { + struct bio *split; + + split = bio_split(bio, split_sectors, GFP_NOIO, + &bio->bi_bdev->bd_disk->bio_split); + split->bi_opf |= REQ_NOMERGE; + blkcg_bio_issue_init(split); + bio_chain(split, bio); + trace_block_split(split, bio->bi_iter.bi_sector); + WARN_ON_ONCE(bio_zone_write_plugging(bio)); + submit_bio_noacct(bio); + return split; + } + + return bio; +} + +struct bio *bio_split_discard(struct bio *bio, const struct queue_limits *lim, + unsigned *nsegs) { unsigned int max_discard_sectors, granularity; sector_t tmp; @@ -121,10 +145,10 @@ static struct bio *bio_split_discard(struct bio *bio, min(lim->max_discard_sectors, bio_allowed_max_sectors(lim)); max_discard_sectors -= max_discard_sectors % granularity; if (unlikely(!max_discard_sectors)) - return NULL; + return bio; if (bio_sectors(bio) <= max_discard_sectors) - return NULL; + return bio; split_sectors = max_discard_sectors; @@ -139,19 +163,18 @@ static struct bio *bio_split_discard(struct bio *bio, if (split_sectors > tmp) split_sectors -= tmp; - return bio_split(bio, split_sectors, GFP_NOIO, bs); + return bio_submit_split(bio, split_sectors); } -static struct bio *bio_split_write_zeroes(struct bio *bio, - const struct queue_limits *lim, - unsigned *nsegs, struct bio_set *bs) +struct bio *bio_split_write_zeroes(struct bio *bio, + const struct queue_limits *lim, unsigned *nsegs) { *nsegs = 0; if (!lim->max_write_zeroes_sectors) - return NULL; + return bio; if (bio_sectors(bio) <= lim->max_write_zeroes_sectors) - return NULL; - return bio_split(bio, lim->max_write_zeroes_sectors, GFP_NOIO, bs); + return bio; + return bio_submit_split(bio, lim->max_write_zeroes_sectors); } static inline unsigned int blk_boundary_sectors(const struct queue_limits *lim, @@ -274,27 +297,19 @@ static bool bvec_split_segs(const struct queue_limits *lim, } /** - * bio_split_rw - split a bio in two bios + * bio_split_rw_at - check if and where to split a read/write bio * @bio: [in] bio to be split * @lim: [in] queue limits to split based on * @segs: [out] number of segments in the bio with the first half of the sectors - * @bs: [in] bio set to allocate the clone from * @max_bytes: [in] maximum number of bytes per bio * - * Clone @bio, update the bi_iter of the clone to represent the first sectors - * of @bio and update @bio->bi_iter to represent the remaining sectors. The - * following is guaranteed for the cloned bio: - * - That it has at most @max_bytes worth of data - * - That it has at most queue_max_segments(@q) segments. - * - * Except for discard requests the cloned bio will point at the bi_io_vec of - * the original bio. It is the responsibility of the caller to ensure that the - * original bio is not freed before the cloned bio. The caller is also - * responsible for ensuring that @bs is only destroyed after processing of the - * split bio has finished. + * Find out if @bio needs to be split to fit the queue limits in @lim and a + * maximum size of @max_bytes. Returns a negative error number if @bio can't be + * split, 0 if the bio doesn't have to be split, or a positive sector offset if + * @bio needs to be split. */ -struct bio *bio_split_rw(struct bio *bio, const struct queue_limits *lim, - unsigned *segs, struct bio_set *bs, unsigned max_bytes) +int bio_split_rw_at(struct bio *bio, const struct queue_limits *lim, + unsigned *segs, unsigned max_bytes) { struct bio_vec bv, bvprv, *bvprvp = NULL; struct bvec_iter iter; @@ -324,22 +339,17 @@ struct bio *bio_split_rw(struct bio *bio, const struct queue_limits *lim, } *segs = nsegs; - return NULL; + return 0; split: - if (bio->bi_opf & REQ_ATOMIC) { - bio->bi_status = BLK_STS_INVAL; - bio_endio(bio); - return ERR_PTR(-EINVAL); - } + if (bio->bi_opf & REQ_ATOMIC) + return -EINVAL; + /* * We can't sanely support splitting for a REQ_NOWAIT bio. End it * with EAGAIN if splitting is required and return an error pointer. */ - if (bio->bi_opf & REQ_NOWAIT) { - bio->bi_status = BLK_STS_AGAIN; - bio_endio(bio); - return ERR_PTR(-EAGAIN); - } + if (bio->bi_opf & REQ_NOWAIT) + return -EAGAIN; *segs = nsegs; @@ -356,58 +366,16 @@ split: * big IO can be trival, disable iopoll when split needed. */ bio_clear_polled(bio); - return bio_split(bio, bytes >> SECTOR_SHIFT, GFP_NOIO, bs); + return bytes >> SECTOR_SHIFT; } -EXPORT_SYMBOL_GPL(bio_split_rw); +EXPORT_SYMBOL_GPL(bio_split_rw_at); -/** - * __bio_split_to_limits - split a bio to fit the queue limits - * @bio: bio to be split - * @lim: queue limits to split based on - * @nr_segs: returns the number of segments in the returned bio - * - * Check if @bio needs splitting based on the queue limits, and if so split off - * a bio fitting the limits from the beginning of @bio and return it. @bio is - * shortened to the remainder and re-submitted. - * - * The split bio is allocated from @q->bio_split, which is provided by the - * block layer. - */ -struct bio *__bio_split_to_limits(struct bio *bio, - const struct queue_limits *lim, - unsigned int *nr_segs) +struct bio *bio_split_rw(struct bio *bio, const struct queue_limits *lim, + unsigned *nr_segs) { - struct bio_set *bs = &bio->bi_bdev->bd_disk->bio_split; - struct bio *split; - - switch (bio_op(bio)) { - case REQ_OP_DISCARD: - case REQ_OP_SECURE_ERASE: - split = bio_split_discard(bio, lim, nr_segs, bs); - break; - case REQ_OP_WRITE_ZEROES: - split = bio_split_write_zeroes(bio, lim, nr_segs, bs); - break; - default: - split = bio_split_rw(bio, lim, nr_segs, bs, - get_max_io_size(bio, lim) << SECTOR_SHIFT); - if (IS_ERR(split)) - return NULL; - break; - } - - if (split) { - /* there isn't chance to merge the split bio */ - split->bi_opf |= REQ_NOMERGE; - - blkcg_bio_issue_init(split); - bio_chain(split, bio); - trace_block_split(split, bio->bi_iter.bi_sector); - WARN_ON_ONCE(bio_zone_write_plugging(bio)); - submit_bio_noacct(bio); - return split; - } - return bio; + return bio_submit_split(bio, + bio_split_rw_at(bio, lim, nr_segs, + get_max_io_size(bio, lim) << SECTOR_SHIFT)); } /** @@ -426,9 +394,7 @@ struct bio *bio_split_to_limits(struct bio *bio) const struct queue_limits *lim = &bdev_get_queue(bio->bi_bdev)->limits; unsigned int nr_segs; - if (bio_may_exceed_limits(bio, lim)) - return __bio_split_to_limits(bio, lim, &nr_segs); - return bio; + return __bio_split_to_limits(bio, lim, &nr_segs); } EXPORT_SYMBOL(bio_split_to_limits); diff --git a/block/blk-mq.c b/block/blk-mq.c index e3c3c0c21b55..36abbaefe387 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -2939,7 +2939,7 @@ void blk_mq_submit_bio(struct bio *bio) struct blk_plug *plug = current->plug; const int is_sync = op_is_sync(bio->bi_opf); struct blk_mq_hw_ctx *hctx; - unsigned int nr_segs = 1; + unsigned int nr_segs; struct request *rq; blk_status_t ret; @@ -2981,11 +2981,10 @@ void blk_mq_submit_bio(struct bio *bio) goto queue_exit; } - if (unlikely(bio_may_exceed_limits(bio, &q->limits))) { - bio = __bio_split_to_limits(bio, &q->limits, &nr_segs); - if (!bio) - goto queue_exit; - } + bio = __bio_split_to_limits(bio, &q->limits, &nr_segs); + if (!bio) + goto queue_exit; + if (!bio_integrity_prep(bio)) goto queue_exit; diff --git a/block/blk.h b/block/blk.h index e180863f918b..0d8cd64c1260 100644 --- a/block/blk.h +++ b/block/blk.h @@ -331,33 +331,58 @@ ssize_t part_timeout_show(struct device *, struct device_attribute *, char *); ssize_t part_timeout_store(struct device *, struct device_attribute *, const char *, size_t); -static inline bool bio_may_exceed_limits(struct bio *bio, - const struct queue_limits *lim) +struct bio *bio_split_discard(struct bio *bio, const struct queue_limits *lim, + unsigned *nsegs); +struct bio *bio_split_write_zeroes(struct bio *bio, + const struct queue_limits *lim, unsigned *nsegs); +struct bio *bio_split_rw(struct bio *bio, const struct queue_limits *lim, + unsigned *nr_segs); + +/* + * All drivers must accept single-segments bios that are smaller than PAGE_SIZE. + * + * This is a quick and dirty check that relies on the fact that bi_io_vec[0] is + * always valid if a bio has data. The check might lead to occasional false + * positives when bios are cloned, but compared to the performance impact of + * cloned bios themselves the loop below doesn't matter anyway. + */ +static inline bool bio_may_need_split(struct bio *bio, + const struct queue_limits *lim) +{ + return lim->chunk_sectors || bio->bi_vcnt != 1 || + bio->bi_io_vec->bv_len + bio->bi_io_vec->bv_offset > PAGE_SIZE; +} + +/** + * __bio_split_to_limits - split a bio to fit the queue limits + * @bio: bio to be split + * @lim: queue limits to split based on + * @nr_segs: returns the number of segments in the returned bio + * + * Check if @bio needs splitting based on the queue limits, and if so split off + * a bio fitting the limits from the beginning of @bio and return it. @bio is + * shortened to the remainder and re-submitted. + * + * The split bio is allocated from @q->bio_split, which is provided by the + * block layer. + */ +static inline struct bio *__bio_split_to_limits(struct bio *bio, + const struct queue_limits *lim, unsigned int *nr_segs) { switch (bio_op(bio)) { + default: + if (bio_may_need_split(bio, lim)) + return bio_split_rw(bio, lim, nr_segs); + *nr_segs = 1; + return bio; case REQ_OP_DISCARD: case REQ_OP_SECURE_ERASE: + return bio_split_discard(bio, lim, nr_segs); case REQ_OP_WRITE_ZEROES: - return true; /* non-trivial splitting decisions */ - default: - break; + return bio_split_write_zeroes(bio, lim, nr_segs); } - - /* - * All drivers must accept single-segments bios that are <= PAGE_SIZE. - * This is a quick and dirty check that relies on the fact that - * bi_io_vec[0] is always valid if a bio has data. The check might - * lead to occasional false negatives when bios are cloned, but compared - * to the performance impact of cloned bios themselves the loop below - * doesn't matter anyway. - */ - return lim->chunk_sectors || bio->bi_vcnt != 1 || - bio->bi_io_vec->bv_len + bio->bi_io_vec->bv_offset > PAGE_SIZE; } -struct bio *__bio_split_to_limits(struct bio *bio, - const struct queue_limits *lim, - unsigned int *nr_segs); int ll_back_merge_fn(struct request *req, struct bio *bio, unsigned int nr_segs); bool blk_attempt_req_merge(struct request_queue *q, struct request *rq, diff --git a/fs/btrfs/bio.c b/fs/btrfs/bio.c index f04d93109960..5ae769184907 100644 --- a/fs/btrfs/bio.c +++ b/fs/btrfs/bio.c @@ -73,20 +73,13 @@ struct btrfs_bio *btrfs_bio_alloc(unsigned int nr_vecs, blk_opf_t opf, static struct btrfs_bio *btrfs_split_bio(struct btrfs_fs_info *fs_info, struct btrfs_bio *orig_bbio, - u64 map_length, bool use_append) + u64 map_length) { struct btrfs_bio *bbio; struct bio *bio; - if (use_append) { - unsigned int nr_segs; - - bio = bio_split_rw(&orig_bbio->bio, &fs_info->limits, &nr_segs, - &btrfs_clone_bioset, map_length); - } else { - bio = bio_split(&orig_bbio->bio, map_length >> SECTOR_SHIFT, - GFP_NOFS, &btrfs_clone_bioset); - } + bio = bio_split(&orig_bbio->bio, map_length >> SECTOR_SHIFT, GFP_NOFS, + &btrfs_clone_bioset); bbio = btrfs_bio(bio); btrfs_bio_init(bbio, fs_info, NULL, orig_bbio); bbio->inode = orig_bbio->inode; @@ -664,6 +657,19 @@ static bool btrfs_wq_submit_bio(struct btrfs_bio *bbio, return true; } +static u64 btrfs_append_map_length(struct btrfs_bio *bbio, u64 map_length) +{ + unsigned int nr_segs; + int sector_offset; + + map_length = min(map_length, bbio->fs_info->max_zone_append_size); + sector_offset = bio_split_rw_at(&bbio->bio, &bbio->fs_info->limits, + &nr_segs, map_length); + if (sector_offset) + return sector_offset << SECTOR_SHIFT; + return map_length; +} + static bool btrfs_submit_chunk(struct btrfs_bio *bbio, int mirror_num) { struct btrfs_inode *inode = bbio->inode; @@ -691,10 +697,10 @@ static bool btrfs_submit_chunk(struct btrfs_bio *bbio, int mirror_num) map_length = min(map_length, length); if (use_append) - map_length = min(map_length, fs_info->max_zone_append_size); + map_length = btrfs_append_map_length(bbio, map_length); if (map_length < length) { - bbio = btrfs_split_bio(fs_info, bbio, map_length, use_append); + bbio = btrfs_split_bio(fs_info, bbio, map_length); bio = &bbio->bio; } diff --git a/include/linux/bio.h b/include/linux/bio.h index a46e2047bea4..faceadb040f9 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -324,8 +324,8 @@ static inline void bio_next_folio(struct folio_iter *fi, struct bio *bio) void bio_trim(struct bio *bio, sector_t offset, sector_t size); extern struct bio *bio_split(struct bio *bio, int sectors, gfp_t gfp, struct bio_set *bs); -struct bio *bio_split_rw(struct bio *bio, const struct queue_limits *lim, - unsigned *segs, struct bio_set *bs, unsigned max_bytes); +int bio_split_rw_at(struct bio *bio, const struct queue_limits *lim, + unsigned *segs, unsigned max_bytes); /** * bio_next_split - get next @sectors from a bio, splitting if necessary -- cgit v1.2.3 From 379b122a3ec8033aa43cb70e8ecb6fb7f98aa68f Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 26 Aug 2024 19:37:55 +0200 Subject: block: constify the lim argument to queue_limits_max_zone_append_sectors queue_limits_max_zone_append_sectors doesn't change the lim argument, so mark it as const. Signed-off-by: Christoph Hellwig Reviewed-by: Damien Le Moal Tested-by: Hans Holmberg Reviewed-by: Hans Holmberg Link: https://lore.kernel.org/r/20240826173820.1690925-3-hch@lst.de Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index e85ec73a07d5..ec3ea5d1f99d 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -1187,7 +1187,8 @@ static inline unsigned int queue_max_segment_size(const struct request_queue *q) return q->limits.max_segment_size; } -static inline unsigned int queue_limits_max_zone_append_sectors(struct queue_limits *l) +static inline unsigned int +queue_limits_max_zone_append_sectors(const struct queue_limits *l) { unsigned int max_sectors = min(l->chunk_sectors, l->max_hw_sectors); -- cgit v1.2.3 From e49dacc71ec2621ce4c422cd5605d4d06f7807b0 Mon Sep 17 00:00:00 2001 From: Wouter Verhelst Date: Mon, 12 Aug 2024 15:20:37 +0200 Subject: nbd: implement the WRITE_ZEROES command The NBD protocol defines a message for zeroing out a region of an export Add support to the kernel driver for that message. Signed-off-by: Wouter Verhelst Cc: Eric Blake Reviewed-by: Damien Le Moal Link: https://lore.kernel.org/r/20240812133032.115134-3-w@uter.be Signed-off-by: Jens Axboe --- drivers/block/nbd.c | 8 ++++++++ include/uapi/linux/nbd.h | 5 ++++- 2 files changed, 12 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c index 4d06472bf112..eee632b84994 100644 --- a/drivers/block/nbd.c +++ b/drivers/block/nbd.c @@ -363,6 +363,8 @@ static int __nbd_set_size(struct nbd_device *nbd, loff_t bytesize, } if (nbd->config->flags & NBD_FLAG_ROTATIONAL) lim.features |= BLK_FEAT_ROTATIONAL; + if (nbd->config->flags & NBD_FLAG_SEND_WRITE_ZEROES) + lim.max_write_zeroes_sectors = UINT_MAX >> SECTOR_SHIFT; lim.logical_block_size = blksize; lim.physical_block_size = blksize; @@ -432,6 +434,8 @@ static u32 req_to_nbd_cmd_type(struct request *req) return NBD_CMD_WRITE; case REQ_OP_READ: return NBD_CMD_READ; + case REQ_OP_WRITE_ZEROES: + return NBD_CMD_WRITE_ZEROES; default: return U32_MAX; } @@ -648,6 +652,8 @@ static blk_status_t nbd_send_cmd(struct nbd_device *nbd, struct nbd_cmd *cmd, if (req->cmd_flags & REQ_FUA) nbd_cmd_flags |= NBD_CMD_FLAG_FUA; + if ((req->cmd_flags & REQ_NOUNMAP) && (type == NBD_CMD_WRITE_ZEROES)) + nbd_cmd_flags |= NBD_CMD_FLAG_NO_HOLE; /* We did a partial send previously, and we at least sent the whole * request struct, so just go and send the rest of the pages in the @@ -1717,6 +1723,8 @@ static int nbd_dbg_flags_show(struct seq_file *s, void *unused) seq_puts(s, "NBD_FLAG_SEND_FUA\n"); if (flags & NBD_FLAG_SEND_TRIM) seq_puts(s, "NBD_FLAG_SEND_TRIM\n"); + if (flags & NBD_FLAG_SEND_WRITE_ZEROES) + seq_puts(s, "NBD_FLAG_SEND_WRITE_ZEROES\n"); return 0; } diff --git a/include/uapi/linux/nbd.h b/include/uapi/linux/nbd.h index d75215f2c675..f1d468acfb25 100644 --- a/include/uapi/linux/nbd.h +++ b/include/uapi/linux/nbd.h @@ -42,8 +42,9 @@ enum { NBD_CMD_WRITE = 1, NBD_CMD_DISC = 2, NBD_CMD_FLUSH = 3, - NBD_CMD_TRIM = 4 + NBD_CMD_TRIM = 4, /* userspace defines additional extension commands */ + NBD_CMD_WRITE_ZEROES = 6, }; /* values for flags field, these are server interaction specific. */ @@ -53,11 +54,13 @@ enum { #define NBD_FLAG_SEND_FUA (1 << 3) /* send FUA (forced unit access) */ #define NBD_FLAG_ROTATIONAL (1 << 4) /* device is rotational */ #define NBD_FLAG_SEND_TRIM (1 << 5) /* send trim/discard */ +#define NBD_FLAG_SEND_WRITE_ZEROES (1 << 6) /* supports WRITE_ZEROES */ /* there is a gap here to match userspace */ #define NBD_FLAG_CAN_MULTI_CONN (1 << 8) /* Server supports multiple connections per export. */ /* values for cmd flags in the upper 16 bits of request type */ #define NBD_CMD_FLAG_FUA (1 << 16) /* FUA (forced unit access) op */ +#define NBD_CMD_FLAG_NO_HOLE (1 << 17) /* Do not punch a hole for WRITE_ZEROES */ /* These are client behavior specific flags. */ #define NBD_CFLAG_DESTROY_ON_DISCONNECT (1 << 0) /* delete the nbd device on -- cgit v1.2.3 From d3bfbfb1248498656cd25c51e41c1e31219bd0dd Mon Sep 17 00:00:00 2001 From: Kundan Kumar Date: Wed, 11 Sep 2024 12:19:34 +0530 Subject: mm: release number of pages of a folio Add a new function unpin_user_folio() to put the refs of a folio by npages count. The check for BIO_PAGE_PINNED flag is removed as it is already checked in bio_release_pages(). Signed-off-by: Kundan Kumar Tested-by: Luis Chamberlain Reviewed-by: Matthew Wilcox (Oracle) Link: https://lore.kernel.org/r/20240911064935.5630-4-kundan.kumar@samsung.com Signed-off-by: Jens Axboe --- include/linux/mm.h | 1 + mm/gup.c | 13 +++++++++++++ 2 files changed, 14 insertions(+) (limited to 'include') diff --git a/include/linux/mm.h b/include/linux/mm.h index c4b238a20b76..2fd88cd5997d 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1597,6 +1597,7 @@ void unpin_user_pages_dirty_lock(struct page **pages, unsigned long npages, void unpin_user_page_range_dirty_lock(struct page *page, unsigned long npages, bool make_dirty); void unpin_user_pages(struct page **pages, unsigned long npages); +void unpin_user_folio(struct folio *folio, unsigned long npages); void unpin_folios(struct folio **folios, unsigned long nfolios); static inline bool is_cow_mapping(vm_flags_t flags) diff --git a/mm/gup.c b/mm/gup.c index 54d0dc3831fb..02c46ae33028 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -415,6 +415,19 @@ void unpin_user_pages(struct page **pages, unsigned long npages) } EXPORT_SYMBOL(unpin_user_pages); +/** + * unpin_user_folio() - release pages of a folio + * @folio: pointer to folio to be released + * @npages: number of pages of same folio + * + * Release npages of the folio + */ +void unpin_user_folio(struct folio *folio, unsigned long npages) +{ + gup_put_folio(folio, npages, FOLL_PIN); +} +EXPORT_SYMBOL(unpin_user_folio); + /** * unpin_folios() - release an array of gup-pinned folios. * @folios: array of folios to be marked dirty and released. -- cgit v1.2.3