Diffstat (limited to 'drivers/block')
 drivers/block/drbd/drbd_bitmap.c   |  10
 drivers/block/drbd/drbd_receiver.c |  14
 drivers/block/floppy.c             |   2
 drivers/block/loop.c               |   4
 drivers/block/nbd.c                |  44
 drivers/block/null_blk/main.c      |  81
 drivers/block/null_blk/null_blk.h  |   3
 drivers/block/null_blk/zoned.c     |   6
 drivers/block/ps3disk.c            |   4
 drivers/block/rnbd/rnbd-proto.h    |  15
 drivers/block/rnull/configfs.rs    |   9
 drivers/block/rnull/rnull.rs       |   3
 drivers/block/ublk_drv.c           | 407
 drivers/block/virtio_blk.c         |  24
 drivers/block/zloop.c              | 160
 15 files changed, 461 insertions(+), 325 deletions(-)
diff --git a/drivers/block/drbd/drbd_bitmap.c b/drivers/block/drbd/drbd_bitmap.c
index 85ca000a0564..d90fa3e7f4cf 100644
--- a/drivers/block/drbd/drbd_bitmap.c
+++ b/drivers/block/drbd/drbd_bitmap.c
@@ -1210,7 +1210,7 @@ static int bm_rw(struct drbd_device *device, const unsigned int flags, unsigned
 	return err;
 }
 
-/**
+/*
  * drbd_bm_read() - Read the whole bitmap from its on disk location.
  * @device:	DRBD device.
  */
@@ -1221,7 +1221,7 @@ int drbd_bm_read(struct drbd_device *device,
 	return bm_rw(device, BM_AIO_READ, 0);
 }
 
-/**
+/*
  * drbd_bm_write() - Write the whole bitmap to its on disk location.
  * @device:	DRBD device.
  *
@@ -1233,7 +1233,7 @@ int drbd_bm_write(struct drbd_device *device,
 	return bm_rw(device, 0, 0);
 }
 
-/**
+/*
  * drbd_bm_write_all() - Write the whole bitmap to its on disk location.
  * @device:	DRBD device.
  *
@@ -1255,7 +1255,7 @@ int drbd_bm_write_lazy(struct drbd_device *device, unsigned upper_idx) __must_ho
 	return bm_rw(device, BM_AIO_COPY_PAGES, upper_idx);
 }
 
-/**
+/*
  * drbd_bm_write_copy_pages() - Write the whole bitmap to its on disk location.
  * @device:	DRBD device.
  *
@@ -1272,7 +1272,7 @@ int drbd_bm_write_copy_pages(struct drbd_device *device,
 	return bm_rw(device, BM_AIO_COPY_PAGES, 0);
 }
 
-/**
+/*
  * drbd_bm_write_hinted() - Write bitmap pages with "hint" marks, if they have changed.
  * @device:	DRBD device.
  */
diff --git a/drivers/block/drbd/drbd_receiver.c b/drivers/block/drbd/drbd_receiver.c
index caaf2781136d..3de919b6f0e1 100644
--- a/drivers/block/drbd/drbd_receiver.c
+++ b/drivers/block/drbd/drbd_receiver.c
@@ -450,7 +450,7 @@ static struct socket *drbd_try_connect(struct drbd_connection *connection)
 	 * a free one dynamically.
 	 */
 	what = "bind before connect";
-	err = sock->ops->bind(sock, (struct sockaddr *) &src_in6, my_addr_len);
+	err = sock->ops->bind(sock, (struct sockaddr_unsized *) &src_in6, my_addr_len);
 	if (err < 0)
 		goto out;
 
@@ -458,7 +458,7 @@ static struct socket *drbd_try_connect(struct drbd_connection *connection)
 	 * stay C_WF_CONNECTION, don't go Disconnecting! */
 	disconnect_on_error = 0;
 	what = "connect";
-	err = sock->ops->connect(sock, (struct sockaddr *) &peer_in6, peer_addr_len, 0);
+	err = sock->ops->connect(sock, (struct sockaddr_unsized *) &peer_in6, peer_addr_len, 0);
 
 out:
 	if (err < 0) {
@@ -537,7 +537,7 @@ static int prepare_listen_socket(struct drbd_connection *connection, struct acce
 	drbd_setbufsize(s_listen, sndbuf_size, rcvbuf_size);
 
 	what = "bind before listen";
-	err = s_listen->ops->bind(s_listen, (struct sockaddr *)&my_addr, my_addr_len);
+	err = s_listen->ops->bind(s_listen, (struct sockaddr_unsized *)&my_addr, my_addr_len);
 	if (err < 0)
 		goto out;
 
@@ -1736,13 +1736,13 @@ read_in_block(struct drbd_peer_device *peer_device, u64 id, sector_t sector,
 	page = peer_req->pages;
 	page_chain_for_each(page) {
 		unsigned len = min_t(int, ds, PAGE_SIZE);
-		data = kmap(page);
+		data = kmap_local_page(page);
 		err = drbd_recv_all_warn(peer_device->connection, data, len);
 		if (drbd_insert_fault(device, DRBD_FAULT_RECEIVE)) {
 			drbd_err(device, "Fault injection: Corrupting data on receive\n");
 			data[0] = data[0] ^ (unsigned long)-1;
 		}
-		kunmap(page);
+		kunmap_local(data);
 		if (err) {
 			drbd_free_peer_req(device, peer_req);
 			return NULL;
@@ -1777,7 +1777,7 @@ static int drbd_drain_block(struct drbd_peer_device *peer_device, int data_size)
 
 	page = drbd_alloc_pages(peer_device, 1, 1);
 
-	data = kmap(page);
+	data = kmap_local_page(page);
 	while (data_size) {
 		unsigned int len = min_t(int, data_size, PAGE_SIZE);
 
@@ -1786,7 +1786,7 @@ static int drbd_drain_block(struct drbd_peer_device *peer_device, int data_size)
 			break;
 		data_size -= len;
 	}
-	kunmap(page);
+	kunmap_local(data);
 	drbd_free_pages(peer_device->device, page);
 	return err;
 }
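The drbd_receiver.c hunks above replace kmap()/kunmap() with kmap_local_page()/kunmap_local(). Note the asymmetry: kunmap_local() takes the mapped address rather than the page. A minimal sketch of the pattern, with a hypothetical fill_page() helper that is not part of this patch:

#include <linux/highmem.h>
#include <linux/string.h>

/* Hypothetical helper: zero one page via a CPU-local mapping. */
static void fill_page(struct page *page)
{
	void *data = kmap_local_page(page);

	memset(data, 0, PAGE_SIZE);
	/* Unlike kunmap(), kunmap_local() takes the address, not the page. */
	kunmap_local(data);
}

Local kmaps are cheaper than the old global kmap() because they are per-CPU and do not require a system-wide flush; nested mappings must be unmapped in reverse order.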
diff --git a/drivers/block/floppy.c b/drivers/block/floppy.c
index 5336c3c5ca36..c28786e0fe1c 100644
--- a/drivers/block/floppy.c
+++ b/drivers/block/floppy.c
@@ -329,7 +329,7 @@ static bool initialized;
 * This default is used whenever the current disk size is unknown.
 * [Now it is rather a minimum]
 */
-#define MAX_DISK_SIZE 4 /* 3984 */
+#define MAX_DISK_SIZE (PAGE_SIZE / 1024)
 
 /*
 * globals used by 'result()'
diff --git a/drivers/block/loop.c b/drivers/block/loop.c
index 13ce229d450c..ebe751f39742 100644
--- a/drivers/block/loop.c
+++ b/drivers/block/loop.c
@@ -1908,6 +1908,10 @@ static void loop_handle_cmd(struct loop_cmd *cmd)
 		goto failed;
 	}
 
+	/* We can block in this context, so ignore REQ_NOWAIT. */
+	if (rq->cmd_flags & REQ_NOWAIT)
+		rq->cmd_flags &= ~REQ_NOWAIT;
+
 	if (cmd_blkcg_css)
 		kthread_associate_blkcg(cmd_blkcg_css);
 	if (cmd_memcg_css)
diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c
index 1188f32a5e5e..f6c33b21f69e 100644
--- a/drivers/block/nbd.c
+++ b/drivers/block/nbd.c
@@ -565,24 +565,27 @@ static int __sock_xmit(struct nbd_device *nbd, struct socket *sock, int send,
 	msg.msg_iter = *iter;
 
 	noreclaim_flag = memalloc_noreclaim_save();
-	do {
-		sock->sk->sk_allocation = GFP_NOIO | __GFP_MEMALLOC;
-		sock->sk->sk_use_task_frag = false;
-		msg.msg_flags = msg_flags | MSG_NOSIGNAL;
-
-		if (send)
-			result = sock_sendmsg(sock, &msg);
-		else
-			result = sock_recvmsg(sock, &msg, msg.msg_flags);
-
-		if (result <= 0) {
-			if (result == 0)
-				result = -EPIPE; /* short read */
-			break;
-		}
-		if (sent)
-			*sent += result;
-	} while (msg_data_left(&msg));
+
+	scoped_with_kernel_creds() {
+		do {
+			sock->sk->sk_allocation = GFP_NOIO | __GFP_MEMALLOC;
+			sock->sk->sk_use_task_frag = false;
+			msg.msg_flags = msg_flags | MSG_NOSIGNAL;
+
+			if (send)
+				result = sock_sendmsg(sock, &msg);
+			else
+				result = sock_recvmsg(sock, &msg, msg.msg_flags);
+
+			if (result <= 0) {
+				if (result == 0)
+					result = -EPIPE; /* short read */
+				break;
+			}
+			if (sent)
+				*sent += result;
+		} while (msg_data_left(&msg));
+	}
 
 	memalloc_noreclaim_restore(noreclaim_flag);
 
@@ -1018,9 +1021,9 @@ static void recv_work(struct work_struct *work)
 		nbd_mark_nsock_dead(nbd, nsock, 1);
 		mutex_unlock(&nsock->tx_lock);
 
-	nbd_config_put(nbd);
 	atomic_dec(&config->recv_threads);
 	wake_up(&config->recv_wq);
+	nbd_config_put(nbd);
 	kfree(args);
 }
 
@@ -2235,12 +2238,13 @@ again:
 
 	ret = nbd_start_device(nbd);
 out:
-	mutex_unlock(&nbd->config_lock);
 	if (!ret) {
 		set_bit(NBD_RT_HAS_CONFIG_REF, &config->runtime_flags);
 		refcount_inc(&nbd->config_refs);
 		nbd_connect_reply(info, nbd->index);
 	}
+	mutex_unlock(&nbd->config_lock);
+
 	nbd_config_put(nbd);
 	if (put_dev)
 		nbd_put(nbd);
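The nbd change above keeps socket transmission inside a memalloc_noreclaim_save()/restore() pair so that memory reclaim cannot recurse into the NBD socket while it is making forward progress, and (new in this series) runs it under kernel credentials via scoped_with_kernel_creds(). A sketch of the noreclaim half alone, with a hypothetical xmit_locked() wrapper:

#include <linux/sched/mm.h>
#include <net/sock.h>

/* Hypothetical wrapper: send on a socket used for swap/writeback I/O. */
static int xmit_locked(struct socket *sock, struct msghdr *msg)
{
	unsigned int noreclaim_flag;
	int ret;

	/* Block reclaim from re-entering this socket's I/O path. */
	noreclaim_flag = memalloc_noreclaim_save();
	sock->sk->sk_allocation = GFP_NOIO | __GFP_MEMALLOC;
	ret = sock_sendmsg(sock, msg);
	memalloc_noreclaim_restore(noreclaim_flag);

	return ret;
}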
diff --git a/drivers/block/null_blk/main.c b/drivers/block/null_blk/main.c
index f982027e8c85..c7c0fb79a6bf 100644
--- a/drivers/block/null_blk/main.c
+++ b/drivers/block/null_blk/main.c
@@ -1129,26 +1129,28 @@ again:
 	return 0;
 }
 
-static int copy_to_nullb(struct nullb *nullb, struct page *source,
-	unsigned int off, sector_t sector, size_t n, bool is_fua)
+static blk_status_t copy_to_nullb(struct nullb *nullb, void *source,
+	loff_t pos, size_t n, bool is_fua)
 {
 	size_t temp, count = 0;
-	unsigned int offset;
 	struct nullb_page *t_page;
+	sector_t sector;
 
 	while (count < n) {
-		temp = min_t(size_t, nullb->dev->blocksize, n - count);
+		temp = min3(nullb->dev->blocksize, n - count,
+			    PAGE_SIZE - offset_in_page(pos));
+		sector = pos >> SECTOR_SHIFT;
 
 		if (null_cache_active(nullb) && !is_fua)
 			null_make_cache_space(nullb, PAGE_SIZE);
 
-		offset = (sector & SECTOR_MASK) << SECTOR_SHIFT;
 		t_page = null_insert_page(nullb, sector,
 			!null_cache_active(nullb) || is_fua);
 		if (!t_page)
-			return -ENOSPC;
+			return BLK_STS_NOSPC;
 
-		memcpy_page(t_page->page, offset, source, off + count, temp);
+		memcpy_to_page(t_page->page, offset_in_page(pos),
+			       source + count, temp);
 
 		__set_bit(sector & SECTOR_MASK, t_page->bitmap);
 
@@ -1156,41 +1158,34 @@ static int copy_to_nullb(struct nullb *nullb, struct page *source,
 			null_free_sector(nullb, sector, true);
 
 		count += temp;
-		sector += temp >> SECTOR_SHIFT;
+		pos += temp;
 	}
 
-	return 0;
+	return BLK_STS_OK;
 }
 
-static int copy_from_nullb(struct nullb *nullb, struct page *dest,
-	unsigned int off, sector_t sector, size_t n)
+static void copy_from_nullb(struct nullb *nullb, void *dest, loff_t pos,
+	size_t n)
 {
 	size_t temp, count = 0;
-	unsigned int offset;
 	struct nullb_page *t_page;
+	sector_t sector;
 
 	while (count < n) {
-		temp = min_t(size_t, nullb->dev->blocksize, n - count);
+		temp = min3(nullb->dev->blocksize, n - count,
+			    PAGE_SIZE - offset_in_page(pos));
+		sector = pos >> SECTOR_SHIFT;
 
-		offset = (sector & SECTOR_MASK) << SECTOR_SHIFT;
 		t_page = null_lookup_page(nullb, sector, false,
 			!null_cache_active(nullb));
 
 		if (t_page)
-			memcpy_page(dest, off + count, t_page->page, offset,
-				temp);
+			memcpy_from_page(dest + count, t_page->page,
+					 offset_in_page(pos), temp);
 		else
-			memzero_page(dest, off + count, temp);
+			memset(dest + count, 0, temp);
 
 		count += temp;
-		sector += temp >> SECTOR_SHIFT;
+		pos += temp;
 	}
-
-	return 0;
-}
-
-static void nullb_fill_pattern(struct nullb *nullb, struct page *page,
-	unsigned int len, unsigned int off)
-{
-	memset_page(page, off, 0xff, len);
 }
 
 blk_status_t null_handle_discard(struct nullb_device *dev,
@@ -1234,34 +1229,39 @@ static blk_status_t null_handle_flush(struct nullb *nullb)
 	return errno_to_blk_status(err);
 }
 
-static int null_transfer(struct nullb *nullb, struct page *page,
-	unsigned int len, unsigned int off, bool is_write, sector_t sector,
+static blk_status_t null_transfer(struct nullb *nullb, struct page *page,
+	unsigned int len, unsigned int off, bool is_write, loff_t pos,
 	bool is_fua)
 {
 	struct nullb_device *dev = nullb->dev;
+	blk_status_t err = BLK_STS_OK;
 	unsigned int valid_len = len;
-	int err = 0;
+	void *p;
 
+	p = kmap_local_page(page) + off;
 	if (!is_write) {
-		if (dev->zoned)
+		if (dev->zoned) {
 			valid_len = null_zone_valid_read_len(nullb,
-				sector, len);
+				pos >> SECTOR_SHIFT, len);
+			if (valid_len && valid_len != len)
+				valid_len -= pos & (SECTOR_SIZE - 1);
+		}
 
 		if (valid_len) {
-			err = copy_from_nullb(nullb, page, off,
-				sector, valid_len);
+			copy_from_nullb(nullb, p, pos, valid_len);
 			off += valid_len;
 			len -= valid_len;
 		}
 
 		if (len)
-			nullb_fill_pattern(nullb, page, len, off);
+			memset(p + valid_len, 0xff, len);
 		flush_dcache_page(page);
 	} else {
 		flush_dcache_page(page);
-		err = copy_to_nullb(nullb, page, off, sector, len, is_fua);
+		err = copy_to_nullb(nullb, p, pos, len, is_fua);
 	}
+	kunmap_local(p);
 
 	return err;
 }
@@ -1274,9 +1274,9 @@ static blk_status_t null_handle_data_transfer(struct nullb_cmd *cmd,
 {
 	struct request *rq = blk_mq_rq_from_pdu(cmd);
 	struct nullb *nullb = cmd->nq->dev->nullb;
-	int err = 0;
+	blk_status_t err = BLK_STS_OK;
 	unsigned int len;
-	sector_t sector = blk_rq_pos(rq);
+	loff_t pos = blk_rq_pos(rq) << SECTOR_SHIFT;
 	unsigned int max_bytes = nr_sectors << SECTOR_SHIFT;
 	unsigned int transferred_bytes = 0;
 	struct req_iterator iter;
@@ -1288,18 +1288,18 @@ static blk_status_t null_handle_data_transfer(struct nullb_cmd *cmd,
 		if (transferred_bytes + len > max_bytes)
 			len = max_bytes - transferred_bytes;
 		err = null_transfer(nullb, bvec.bv_page, len, bvec.bv_offset,
-				    op_is_write(req_op(rq)), sector,
+				    op_is_write(req_op(rq)), pos,
 				    rq->cmd_flags & REQ_FUA);
 		if (err)
 			break;
-		sector += len >> SECTOR_SHIFT;
+		pos += len;
 		transferred_bytes += len;
 		if (transferred_bytes >= max_bytes)
 			break;
 	}
 	spin_unlock_irq(&nullb->lock);
 
-	return errno_to_blk_status(err);
+	return err;
 }
 
 static inline blk_status_t null_handle_throttled(struct nullb_cmd *cmd)
@@ -1949,6 +1949,7 @@ static int null_add_dev(struct nullb_device *dev)
 		.logical_block_size	= dev->blocksize,
 		.physical_block_size	= dev->blocksize,
 		.max_hw_sectors		= dev->max_sectors,
+		.dma_alignment		= 1,
 	};
 	struct nullb *nullb;
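The null_blk rework above switches the copy helpers from (page, offset, sector) triples to a kmapped pointer plus a byte position, and chunks every step so that a single copy never crosses a backing block or a page boundary. A standalone sketch of just that chunking rule (the helper name is hypothetical, the min3() expression mirrors copy_to_nullb()):

#include <linux/minmax.h>
#include <linux/mm.h>

/* Hypothetical: size of the next chunk of [pos, pos + n). */
static size_t next_chunk(size_t blocksize, loff_t pos, size_t n)
{
	/*
	 * A chunk may not span more than one backing block and may not
	 * cross a page boundary of the source/destination buffer.
	 */
	return min3(blocksize, n, (size_t)(PAGE_SIZE - offset_in_page(pos)));
}

Working in byte positions rather than sectors is what lets the driver advertise .dma_alignment = 1: transfers no longer have to start on a sector boundary within a page.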
diff --git a/drivers/block/null_blk/null_blk.h b/drivers/block/null_blk/null_blk.h
index 7bb6128dbaaf..6c4c4bbe7dad 100644
--- a/drivers/block/null_blk/null_blk.h
+++ b/drivers/block/null_blk/null_blk.h
@@ -143,7 +143,8 @@ int null_init_zoned_dev(struct nullb_device *dev, struct queue_limits *lim);
 int null_register_zoned_dev(struct nullb *nullb);
 void null_free_zoned_dev(struct nullb_device *dev);
 int null_report_zones(struct gendisk *disk, sector_t sector,
-		unsigned int nr_zones, report_zones_cb cb, void *data);
+		unsigned int nr_zones,
+		struct blk_report_zones_args *args);
 blk_status_t null_process_zoned_cmd(struct nullb_cmd *cmd,
 				    enum req_op op, sector_t sector,
 				    sector_t nr_sectors);
 size_t null_zone_valid_read_len(struct nullb *nullb,
diff --git a/drivers/block/null_blk/zoned.c b/drivers/block/null_blk/zoned.c
index 4e5728f45989..0ada35dc0989 100644
--- a/drivers/block/null_blk/zoned.c
+++ b/drivers/block/null_blk/zoned.c
@@ -191,7 +191,7 @@ void null_free_zoned_dev(struct nullb_device *dev)
 }
 
 int null_report_zones(struct gendisk *disk, sector_t sector,
-		unsigned int nr_zones, report_zones_cb cb, void *data)
+		unsigned int nr_zones, struct blk_report_zones_args *args)
 {
 	struct nullb *nullb = disk->private_data;
 	struct nullb_device *dev = nullb->dev;
@@ -225,7 +225,7 @@ int null_report_zones(struct gendisk *disk, sector_t sector,
 		blkz.capacity = zone->capacity;
 		null_unlock_zone(dev, zone);
 
-		error = cb(&blkz, i, data);
+		error = disk_report_zone(disk, &blkz, i, args);
 		if (error)
 			return error;
 	}
@@ -242,7 +242,7 @@ size_t null_zone_valid_read_len(struct nullb *nullb,
 {
 	struct nullb_device *dev = nullb->dev;
 	struct nullb_zone *zone = &dev->zones[null_zone_no(dev, sector)];
-	unsigned int nr_sectors = len >> SECTOR_SHIFT;
+	unsigned int nr_sectors = DIV_ROUND_UP(len, SECTOR_SIZE);
 
 	/* Read must be below the write pointer position */
 	if (zone->type == BLK_ZONE_TYPE_CONVENTIONAL ||
diff --git a/drivers/block/ps3disk.c b/drivers/block/ps3disk.c
index dc9e4a14b885..8892f218a814 100644
--- a/drivers/block/ps3disk.c
+++ b/drivers/block/ps3disk.c
@@ -85,10 +85,14 @@ static void ps3disk_scatter_gather(struct ps3_storage_device *dev,
 	struct bio_vec bvec;
 
 	rq_for_each_segment(bvec, req, iter) {
+		dev_dbg(&dev->sbd.core, "%s:%u: %u sectors from %llu\n",
+			__func__, __LINE__, bio_sectors(iter.bio),
+			iter.bio->bi_iter.bi_sector);
 		if (gather)
 			memcpy_from_bvec(dev->bounce_buf + offset, &bvec);
 		else
 			memcpy_to_bvec(&bvec, dev->bounce_buf + offset);
+		offset += bvec.bv_len;
 	}
 }
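null_blk above, and ublk, virtio_blk and zloop below, all move from the report_zones_cb callback to passing a struct blk_report_zones_args and calling disk_report_zone(). A sketch of a converted driver method, assuming only the new interface that this diff itself uses (demo_report_zones() is hypothetical):

#include <linux/blkdev.h>

/* Sketch: a driver's ->report_zones() in the new args-based style. */
static int demo_report_zones(struct gendisk *disk, sector_t sector,
			     unsigned int nr_zones,
			     struct blk_report_zones_args *args)
{
	struct blk_zone zone = {
		.start = sector,
		.len = disk->queue->limits.chunk_sectors,
		.type = BLK_ZONE_TYPE_SEQWRITE_REQ,
		.cond = BLK_ZONE_COND_EMPTY,
	};

	/* One zone reported; disk_report_zone() replaces the old cb(). */
	return disk_report_zone(disk, &zone, 0, args);
}

Passing one opaque args structure lets the block core validate each reported entry centrally instead of every driver invoking a bare function pointer.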
diff --git a/drivers/block/rnbd/rnbd-proto.h b/drivers/block/rnbd/rnbd-proto.h
index f35be51d213c..77360c2a6069 100644
--- a/drivers/block/rnbd/rnbd-proto.h
+++ b/drivers/block/rnbd/rnbd-proto.h
@@ -24,7 +24,7 @@
 #define RTRS_PORT 1234
 
 /**
- * enum rnbd_msg_types - RNBD message types
+ * enum rnbd_msg_type - RNBD message types
  * @RNBD_MSG_SESS_INFO:	initial session info from client to server
  * @RNBD_MSG_SESS_INFO_RSP:	initial session info from server to client
  * @RNBD_MSG_OPEN:	open (map) device request
@@ -47,10 +47,11 @@ enum rnbd_msg_type {
  */
 struct rnbd_msg_hdr {
 	__le16		type;
+	/* private: */
 	__le16		__padding;
 };
 
-/**
+/*
 * We allow to map RO many times and RW only once. We allow to map yet another
 * time RW, if MIGRATION is provided (second RW export can be required for
 * example for VM migration)
@@ -78,6 +79,7 @@ static const __maybe_unused struct {
 struct rnbd_msg_sess_info {
 	struct rnbd_msg_hdr hdr;
 	u8		ver;
+	/* private: */
 	u8		reserved[31];
 };
 
@@ -89,6 +91,7 @@ struct rnbd_msg_sess_info {
 struct rnbd_msg_sess_info_rsp {
 	struct rnbd_msg_hdr hdr;
 	u8		ver;
+	/* private: */
 	u8		reserved[31];
 };
 
@@ -97,13 +100,16 @@ struct rnbd_msg_sess_info_rsp {
 * @hdr:		message header
 * @access_mode:	the mode to open remote device, valid values see:
 *			enum rnbd_access_mode
- * @device_name:	device path on remote side
+ * @dev_name:		device path on remote side
 */
 struct rnbd_msg_open {
 	struct rnbd_msg_hdr hdr;
 	u8		access_mode;
+	/* private: */
 	u8		resv1;
+	/* public: */
 	s8		dev_name[NAME_MAX];
+	/* private: */
 	u8		reserved[3];
 };
 
@@ -155,6 +161,7 @@ struct rnbd_msg_open_rsp {
 	__le16		secure_discard;
 	u8		obsolete_rotational;
 	u8		cache_policy;
+	/* private: */
 	u8		reserved[10];
 };
 
@@ -187,7 +194,7 @@ struct rnbd_msg_io {
 * @RNBD_OP_DISCARD:	discard sectors
 * @RNBD_OP_SECURE_ERASE: securely erase sectors
 * @RNBD_OP_WRITE_ZEROES: write zeroes sectors
-
+ *
 * @RNBD_F_SYNC:	request is sync (sync write or read)
 * @RNBD_F_FUA:		forced unit access
 */
diff --git a/drivers/block/rnull/configfs.rs b/drivers/block/rnull/configfs.rs
index 8498e9bae6fd..6713a6d92391 100644
--- a/drivers/block/rnull/configfs.rs
+++ b/drivers/block/rnull/configfs.rs
@@ -1,12 +1,13 @@
 // SPDX-License-Identifier: GPL-2.0
 
 use super::{NullBlkDevice, THIS_MODULE};
-use core::fmt::{Display, Write};
 use kernel::{
     block::mq::gen_disk::{GenDisk, GenDiskBuilder},
     c_str,
     configfs::{self, AttributeOperations},
-    configfs_attrs, new_mutex,
+    configfs_attrs,
+    fmt::{self, Write as _},
+    new_mutex,
     page::PAGE_SIZE,
     prelude::*,
     str::{kstrtobool_bytes, CString},
@@ -99,8 +100,8 @@ impl TryFrom<u8> for IRQMode {
     }
 }
 
-impl Display for IRQMode {
-    fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
+impl fmt::Display for IRQMode {
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
         match self {
             Self::None => f.write_str("0")?,
             Self::Soft => f.write_str("1")?,
diff --git a/drivers/block/rnull/rnull.rs b/drivers/block/rnull/rnull.rs
index 1ec694d7f1a6..a9d5e575a2c4 100644
--- a/drivers/block/rnull/rnull.rs
+++ b/drivers/block/rnull/rnull.rs
@@ -17,8 +17,7 @@ use kernel::{
     error::Result,
     pr_info,
     prelude::*,
-    sync::Arc,
-    types::ARef,
+    sync::{aref::ARef, Arc},
 };
 use pin_init::PinInit;
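The rnbd-proto.h hunks above add "/* private: */" and "/* public: */" markers so that kernel-doc stops warning about intentionally undocumented padding and reserved fields. The convention in isolation, on a hypothetical wire message:

#include <linux/types.h>

/**
 * struct demo_msg - example wire message (illustration only)
 * @type: message type
 * @len:  payload length in bytes
 */
struct demo_msg {
	__le16 type;
	__le16 len;
	/* private: padding, hidden from the generated documentation */
	__le32 __padding;
};

Everything after a "/* private: */" marker is excluded from the rendered docs until a "/* public: */" marker switches documentation back on, which is exactly how rnbd_msg_open keeps dev_name documented between two reserved fields.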
diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c
index 0c74a41a6753..2c715df63f23 100644
--- a/drivers/block/ublk_drv.c
+++ b/drivers/block/ublk_drv.c
@@ -155,12 +155,13 @@ struct ublk_uring_cmd_pdu {
 */
 #define UBLK_REFCOUNT_INIT (REFCOUNT_MAX / 2)
 
+union ublk_io_buf {
+	__u64		addr;
+	struct ublk_auto_buf_reg auto_reg;
+};
+
 struct ublk_io {
-	/* userspace buffer address from io cmd */
-	union {
-		__u64	addr;
-		struct ublk_auto_buf_reg buf;
-	};
+	union ublk_io_buf buf;
 	unsigned int flags;
 	int res;
 
@@ -203,15 +204,12 @@ struct ublk_queue {
 	bool fail_io; /* copy of dev->state == UBLK_S_DEV_FAIL_IO */
 	spinlock_t		cancel_lock;
 	struct ublk_device *dev;
-	struct ublk_io ios[];
+	struct ublk_io ios[] __counted_by(q_depth);
 };
 
 struct ublk_device {
 	struct gendisk		*ub_disk;
 
-	char	*__queues;
-
-	unsigned int	queue_size;
 	struct ublksrv_ctrl_dev_info	dev_info;
 
 	struct blk_mq_tag_set	tag_set;
@@ -239,6 +237,8 @@ struct ublk_device {
 	bool canceling;
 	pid_t ublksrv_tgid;
 	struct delayed_work	exit_work;
+
+	struct ublk_queue	*queues[];
 };
 
 /* header of ublk_params */
@@ -265,7 +265,7 @@ static inline bool ublk_dev_is_zoned(const struct ublk_device *ub)
 	return ub->dev_info.flags & UBLK_F_ZONED;
 }
 
-static inline bool ublk_queue_is_zoned(struct ublk_queue *ubq)
+static inline bool ublk_queue_is_zoned(const struct ublk_queue *ubq)
 {
 	return ubq->flags & UBLK_F_ZONED;
 }
@@ -368,7 +368,7 @@ static void *ublk_alloc_report_buffer(struct ublk_device *ublk,
 }
 
 static int ublk_report_zones(struct gendisk *disk, sector_t sector,
-		unsigned int nr_zones, report_zones_cb cb, void *data)
+		unsigned int nr_zones, struct blk_report_zones_args *args)
 {
 	struct ublk_device *ub = disk->private_data;
 	unsigned int zone_size_sectors = disk->queue->limits.chunk_sectors;
@@ -431,7 +431,7 @@ free_req:
 			if (!zone->len)
 				break;
 
-			ret = cb(zone, i, data);
+			ret = disk_report_zone(disk, zone, i, args);
 			if (ret)
 				goto out;
 
@@ -499,7 +499,7 @@ static blk_status_t ublk_setup_iod_zoned(struct ublk_queue *ubq,
 	iod->op_flags = ublk_op | ublk_req_build_flags(req);
 	iod->nr_sectors = blk_rq_sectors(req);
 	iod->start_sector = blk_rq_pos(req);
-	iod->addr = io->addr;
+	iod->addr = io->buf.addr;
 
 	return BLK_STS_OK;
 }
@@ -781,7 +781,7 @@ static noinline void ublk_put_device(struct ublk_device *ub)
 static inline struct ublk_queue *ublk_get_queue(struct ublk_device *dev,
 		int qid)
 {
-	return (struct ublk_queue *)&(dev->__queues[qid * dev->queue_size]);
+	return dev->queues[qid];
 }
 
 static inline bool ublk_rq_has_data(const struct request *rq)
@@ -914,73 +914,6 @@ static const struct block_device_operations ub_fops = {
 	.report_zones	= ublk_report_zones,
 };
 
-#define UBLK_MAX_PIN_PAGES	32
-
-struct ublk_io_iter {
-	struct page *pages[UBLK_MAX_PIN_PAGES];
-	struct bio *bio;
-	struct bvec_iter iter;
-};
-
-/* return how many pages are copied */
-static void ublk_copy_io_pages(struct ublk_io_iter *data,
-		size_t total, size_t pg_off, int dir)
-{
-	unsigned done = 0;
-	unsigned pg_idx = 0;
-
-	while (done < total) {
-		struct bio_vec bv = bio_iter_iovec(data->bio, data->iter);
-		unsigned int bytes = min3(bv.bv_len, (unsigned)total - done,
-				(unsigned)(PAGE_SIZE - pg_off));
-		void *bv_buf = bvec_kmap_local(&bv);
-		void *pg_buf = kmap_local_page(data->pages[pg_idx]);
-
-		if (dir == ITER_DEST)
-			memcpy(pg_buf + pg_off, bv_buf, bytes);
-		else
-			memcpy(bv_buf, pg_buf + pg_off, bytes);
-
-		kunmap_local(pg_buf);
-		kunmap_local(bv_buf);
-
-		/* advance page array */
-		pg_off += bytes;
-		if (pg_off == PAGE_SIZE) {
-			pg_idx += 1;
-			pg_off = 0;
-		}
-
-		done += bytes;
-
-		/* advance bio */
-		bio_advance_iter_single(data->bio, &data->iter, bytes);
-		if (!data->iter.bi_size) {
-			data->bio = data->bio->bi_next;
-			if (data->bio == NULL)
-				break;
-			data->iter = data->bio->bi_iter;
-		}
-	}
-}
-
-static bool ublk_advance_io_iter(const struct request *req,
-		struct ublk_io_iter *iter, unsigned int offset)
-{
-	struct bio *bio = req->bio;
-
-	for_each_bio(bio) {
-		if (bio->bi_iter.bi_size > offset) {
-			iter->bio = bio;
-			iter->iter = bio->bi_iter;
-			bio_advance_iter(iter->bio, &iter->iter, offset);
-			return true;
-		}
-		offset -= bio->bi_iter.bi_size;
-	}
-	return false;
-}
-
 /*
 * Copy data between request pages and io_iter, and 'offset'
 * is the start point of linear offset of request.
@@ -988,34 +921,35 @@ static bool ublk_advance_io_iter(const struct request *req,
 */
 static size_t ublk_copy_user_pages(const struct request *req,
 		unsigned offset, struct iov_iter *uiter, int dir)
 {
-	struct ublk_io_iter iter;
+	struct req_iterator iter;
+	struct bio_vec bv;
 	size_t done = 0;
 
-	if (!ublk_advance_io_iter(req, &iter, offset))
-		return 0;
+	rq_for_each_segment(bv, req, iter) {
+		void *bv_buf;
+		size_t copied;
 
-	while (iov_iter_count(uiter) && iter.bio) {
-		unsigned nr_pages;
-		ssize_t len;
-		size_t off;
-		int i;
-
-		len = iov_iter_get_pages2(uiter, iter.pages,
-				iov_iter_count(uiter),
-				UBLK_MAX_PIN_PAGES, &off);
-		if (len <= 0)
-			return done;
-
-		ublk_copy_io_pages(&iter, len, off, dir);
-		nr_pages = DIV_ROUND_UP(len + off, PAGE_SIZE);
-		for (i = 0; i < nr_pages; i++) {
-			if (dir == ITER_DEST)
-				set_page_dirty(iter.pages[i]);
-			put_page(iter.pages[i]);
+		if (offset >= bv.bv_len) {
+			offset -= bv.bv_len;
+			continue;
 		}
-		done += len;
-	}
+
+		bv.bv_offset += offset;
+		bv.bv_len -= offset;
+		bv_buf = bvec_kmap_local(&bv);
+		if (dir == ITER_DEST)
+			copied = copy_to_iter(bv_buf, bv.bv_len, uiter);
+		else
+			copied = copy_from_iter(bv_buf, bv.bv_len, uiter);
+
+		kunmap_local(bv_buf);
+
+		done += copied;
+		if (copied < bv.bv_len)
+			break;
+
+		offset = 0;
+	}
 
 	return done;
 }
@@ -1030,8 +964,9 @@ static inline bool ublk_need_unmap_req(const struct request *req)
 	       (req_op(req) == REQ_OP_READ || req_op(req) == REQ_OP_DRV_IN);
 }
 
-static int ublk_map_io(const struct ublk_queue *ubq, const struct request *req,
-		const struct ublk_io *io)
+static unsigned int ublk_map_io(const struct ublk_queue *ubq,
+				const struct request *req,
+				const struct ublk_io *io)
 {
 	const unsigned int rq_bytes = blk_rq_bytes(req);
 
@@ -1047,13 +982,13 @@ static int ublk_map_io(const struct ublk_queue *ubq, const struct request *req,
 		struct iov_iter iter;
 		const int dir = ITER_DEST;
 
-		import_ubuf(dir, u64_to_user_ptr(io->addr), rq_bytes, &iter);
+		import_ubuf(dir, u64_to_user_ptr(io->buf.addr), rq_bytes, &iter);
 		return ublk_copy_user_pages(req, 0, &iter, dir);
 	}
 	return rq_bytes;
 }
 
-static int ublk_unmap_io(bool need_map,
+static unsigned int ublk_unmap_io(bool need_map,
 		const struct request *req,
 		const struct ublk_io *io)
 {
@@ -1068,7 +1003,7 @@ static int ublk_unmap_io(bool need_map,
 
 		WARN_ON_ONCE(io->res > rq_bytes);
 
-		import_ubuf(dir, u64_to_user_ptr(io->addr), io->res, &iter);
+		import_ubuf(dir, u64_to_user_ptr(io->buf.addr), io->res, &iter);
 		return ublk_copy_user_pages(req, 0, &iter, dir);
 	}
 	return rq_bytes;
@@ -1134,7 +1069,7 @@ static blk_status_t ublk_setup_iod(struct ublk_queue *ubq, struct request *req)
 	iod->op_flags = ublk_op | ublk_req_build_flags(req);
 	iod->nr_sectors = blk_rq_sectors(req);
 	iod->start_sector = blk_rq_pos(req);
-	iod->addr = io->addr;
+	iod->addr = io->buf.addr;
 
 	return BLK_STS_OK;
 }
@@ -1233,45 +1168,65 @@ static inline void __ublk_abort_rq(struct ublk_queue *ubq,
 }
 
 static void
-ublk_auto_buf_reg_fallback(const struct ublk_queue *ubq, struct ublk_io *io)
+ublk_auto_buf_reg_fallback(const struct ublk_queue *ubq, unsigned tag)
 {
-	unsigned tag = io - ubq->ios;
 	struct ublksrv_io_desc *iod = ublk_get_iod(ubq, tag);
 
 	iod->op_flags |= UBLK_IO_F_NEED_REG_BUF;
 }
 
-static bool ublk_auto_buf_reg(const struct ublk_queue *ubq, struct request *req,
-			      struct ublk_io *io, unsigned int issue_flags)
+enum auto_buf_reg_res {
+	AUTO_BUF_REG_FAIL,
+	AUTO_BUF_REG_FALLBACK,
+	AUTO_BUF_REG_OK,
+};
+
+static void ublk_prep_auto_buf_reg_io(const struct ublk_queue *ubq,
+				      struct request *req, struct ublk_io *io,
+				      struct io_uring_cmd *cmd,
+				      enum auto_buf_reg_res res)
+{
+	if (res == AUTO_BUF_REG_OK) {
+		io->task_registered_buffers = 1;
+		io->buf_ctx_handle = io_uring_cmd_ctx_handle(cmd);
+		io->flags |= UBLK_IO_FLAG_AUTO_BUF_REG;
+	}
+	ublk_init_req_ref(ubq, io);
+	__ublk_prep_compl_io_cmd(io, req);
+}
+
+static enum auto_buf_reg_res
+__ublk_do_auto_buf_reg(const struct ublk_queue *ubq, struct request *req,
+		       struct ublk_io *io, struct io_uring_cmd *cmd,
+		       unsigned int issue_flags)
 {
 	int ret;
 
-	ret = io_buffer_register_bvec(io->cmd, req, ublk_io_release,
-				      io->buf.index, issue_flags);
+	ret = io_buffer_register_bvec(cmd, req, ublk_io_release,
+				      io->buf.auto_reg.index, issue_flags);
 	if (ret) {
-		if (io->buf.flags & UBLK_AUTO_BUF_REG_FALLBACK) {
-			ublk_auto_buf_reg_fallback(ubq, io);
-			return true;
+		if (io->buf.auto_reg.flags & UBLK_AUTO_BUF_REG_FALLBACK) {
+			ublk_auto_buf_reg_fallback(ubq, req->tag);
+			return AUTO_BUF_REG_FALLBACK;
 		}
 		blk_mq_end_request(req, BLK_STS_IOERR);
-		return false;
+		return AUTO_BUF_REG_FAIL;
 	}
 
-	io->task_registered_buffers = 1;
-	io->buf_ctx_handle = io_uring_cmd_ctx_handle(io->cmd);
-	io->flags |= UBLK_IO_FLAG_AUTO_BUF_REG;
-	return true;
+	return AUTO_BUF_REG_OK;
 }
 
-static bool ublk_prep_auto_buf_reg(struct ublk_queue *ubq,
-				   struct request *req, struct ublk_io *io,
-				   unsigned int issue_flags)
+static void ublk_do_auto_buf_reg(const struct ublk_queue *ubq, struct request *req,
				 struct ublk_io *io, struct io_uring_cmd *cmd,
+				 unsigned int issue_flags)
 {
-	ublk_init_req_ref(ubq, io);
-	if (ublk_support_auto_buf_reg(ubq) && ublk_rq_has_data(req))
-		return ublk_auto_buf_reg(ubq, req, io, issue_flags);
+	enum auto_buf_reg_res res = __ublk_do_auto_buf_reg(ubq, req, io, cmd,
+							   issue_flags);
 
-	return true;
+	if (res != AUTO_BUF_REG_FAIL) {
+		ublk_prep_auto_buf_reg_io(ubq, req, io, cmd, res);
+		io_uring_cmd_done(cmd, UBLK_IO_RES_OK, issue_flags);
+	}
 }
 
 static bool ublk_start_io(const struct ublk_queue *ubq, struct request *req,
@@ -1302,10 +1257,9 @@ static bool ublk_start_io(const struct ublk_queue *ubq, struct request *req,
 	return true;
 }
 
-static void ublk_dispatch_req(struct ublk_queue *ubq,
-			      struct request *req,
-			      unsigned int issue_flags)
+static void ublk_dispatch_req(struct ublk_queue *ubq, struct request *req)
 {
+	unsigned int issue_flags = IO_URING_CMD_TASK_WORK_ISSUE_FLAGS;
 	int tag = req->tag;
 	struct ublk_io *io = &ubq->ios[tag];
 
@@ -1344,17 +1298,21 @@ static void ublk_dispatch_req(struct ublk_queue *ubq,
 	if (!ublk_start_io(ubq, req, io))
 		return;
 
-	if (ublk_prep_auto_buf_reg(ubq, req, io, issue_flags))
+	if (ublk_support_auto_buf_reg(ubq) && ublk_rq_has_data(req)) {
+		ublk_do_auto_buf_reg(ubq, req, io, io->cmd, issue_flags);
+	} else {
+		ublk_init_req_ref(ubq, io);
 		ublk_complete_io_cmd(io, req, UBLK_IO_RES_OK, issue_flags);
+	}
 }
 
-static void ublk_cmd_tw_cb(struct io_uring_cmd *cmd,
-			   unsigned int issue_flags)
+static void ublk_cmd_tw_cb(struct io_tw_req tw_req, io_tw_token_t tw)
 {
+	struct io_uring_cmd *cmd = io_uring_cmd_from_tw(tw_req);
 	struct ublk_uring_cmd_pdu *pdu = ublk_get_uring_cmd_pdu(cmd);
 	struct ublk_queue *ubq = pdu->ubq;
 
-	ublk_dispatch_req(ubq, pdu->req, issue_flags);
+	ublk_dispatch_req(ubq, pdu->req);
 }
 
 static void ublk_queue_cmd(struct ublk_queue *ubq, struct request *rq)
@@ -1366,9 +1324,9 @@ static void ublk_queue_cmd(struct ublk_queue *ubq, struct request *rq)
 	io_uring_cmd_complete_in_task(cmd, ublk_cmd_tw_cb);
 }
 
-static void ublk_cmd_list_tw_cb(struct io_uring_cmd *cmd,
-		unsigned int issue_flags)
+static void ublk_cmd_list_tw_cb(struct io_tw_req tw_req, io_tw_token_t tw)
 {
+	struct io_uring_cmd *cmd = io_uring_cmd_from_tw(tw_req);
 	struct ublk_uring_cmd_pdu *pdu = ublk_get_uring_cmd_pdu(cmd);
 	struct request *rq = pdu->req_list;
 	struct request *next;
 
@@ -1376,7 +1334,7 @@ static void ublk_cmd_list_tw_cb(struct io_uring_cmd *cmd,
 	do {
 		next = rq->rq_next;
 		rq->rq_next = NULL;
-		ublk_dispatch_req(rq->mq_hctx->driver_data, rq, issue_flags);
+		ublk_dispatch_req(rq->mq_hctx->driver_data, rq);
 		rq = next;
 	} while (rq);
 }
@@ -1537,7 +1495,7 @@ static void ublk_queue_reinit(struct ublk_device *ub, struct ublk_queue *ubq)
 		 */
 		io->flags &= UBLK_IO_FLAG_CANCELED;
 		io->cmd = NULL;
-		io->addr = 0;
+		io->buf.addr = 0;
 
 		/*
 		 * old task is PF_EXITING, put it now
@@ -2098,13 +2056,16 @@ static inline int ublk_check_cmd_op(u32 cmd_op)
 static inline int ublk_set_auto_buf_reg(struct ublk_io *io, struct io_uring_cmd *cmd)
 {
-	io->buf = ublk_sqe_addr_to_auto_buf_reg(READ_ONCE(cmd->sqe->addr));
+	struct ublk_auto_buf_reg buf;
 
-	if (io->buf.reserved0 || io->buf.reserved1)
+	buf = ublk_sqe_addr_to_auto_buf_reg(READ_ONCE(cmd->sqe->addr));
+
+	if (buf.reserved0 || buf.reserved1)
 		return -EINVAL;
 
-	if (io->buf.flags & ~UBLK_AUTO_BUF_REG_F_MASK)
+	if (buf.flags & ~UBLK_AUTO_BUF_REG_F_MASK)
 		return -EINVAL;
+
+	io->buf.auto_reg = buf;
 	return 0;
 }
 
@@ -2126,7 +2087,7 @@ static int ublk_handle_auto_buf_reg(struct ublk_io *io,
 		 * this ublk request gets stuck.
 		 */
 		if (io->buf_ctx_handle == io_uring_cmd_ctx_handle(cmd))
-			*buf_idx = io->buf.index;
+			*buf_idx = io->buf.auto_reg.index;
 	}
 
 	return ublk_set_auto_buf_reg(io, cmd);
@@ -2154,7 +2115,7 @@ ublk_config_io_buf(const struct ublk_device *ub, struct ublk_io *io,
 	if (ublk_dev_support_auto_buf_reg(ub))
 		return ublk_handle_auto_buf_reg(io, cmd, buf_idx);
 
-	io->addr = buf_addr;
+	io->buf.addr = buf_addr;
 	return 0;
 }
 
@@ -2272,39 +2233,41 @@ static int ublk_check_fetch_buf(const struct ublk_device *ub, __u64 buf_addr)
 	return 0;
 }
 
-static int ublk_fetch(struct io_uring_cmd *cmd, struct ublk_device *ub,
-		      struct ublk_io *io, __u64 buf_addr)
+static int __ublk_fetch(struct io_uring_cmd *cmd, struct ublk_device *ub,
+			struct ublk_io *io)
 {
-	int ret = 0;
-
-	/*
-	 * When handling FETCH command for setting up ublk uring queue,
-	 * ub->mutex is the innermost lock, and we won't block for handling
-	 * FETCH, so it is fine even for IO_URING_F_NONBLOCK.
-	 */
-	mutex_lock(&ub->mutex);
 	/* UBLK_IO_FETCH_REQ is only allowed before dev is setup */
-	if (ublk_dev_ready(ub)) {
-		ret = -EBUSY;
-		goto out;
-	}
+	if (ublk_dev_ready(ub))
+		return -EBUSY;
 
 	/* allow each command to be FETCHed at most once */
-	if (io->flags & UBLK_IO_FLAG_ACTIVE) {
-		ret = -EINVAL;
-		goto out;
-	}
+	if (io->flags & UBLK_IO_FLAG_ACTIVE)
		return -EINVAL;
 
 	WARN_ON_ONCE(io->flags & UBLK_IO_FLAG_OWNED_BY_SRV);
 
 	ublk_fill_io_cmd(io, cmd);
-	ret = ublk_config_io_buf(ub, io, cmd, buf_addr, NULL);
-	if (ret)
-		goto out;
-
 	WRITE_ONCE(io->task, get_task_struct(current));
 	ublk_mark_io_ready(ub);
-out:
+
+	return 0;
+}
+
+static int ublk_fetch(struct io_uring_cmd *cmd, struct ublk_device *ub,
+		      struct ublk_io *io, __u64 buf_addr)
+{
+	int ret;
+
+	/*
+	 * When handling FETCH command for setting up ublk uring queue,
+	 * ub->mutex is the innermost lock, and we won't block for handling
+	 * FETCH, so it is fine even for IO_URING_F_NONBLOCK.
+	 */
+	mutex_lock(&ub->mutex);
+	ret = __ublk_fetch(cmd, ub, io);
+	if (!ret)
+		ret = ublk_config_io_buf(ub, io, cmd, buf_addr, NULL);
 	mutex_unlock(&ub->mutex);
+
 	return ret;
 }
 
@@ -2351,7 +2314,7 @@ static bool ublk_get_data(const struct ublk_queue *ubq, struct ublk_io *io,
 	 */
 	io->flags &= ~UBLK_IO_FLAG_NEED_GET_DATA;
 	/* update iod->addr because ublksrv may have passed a new io buffer */
-	ublk_get_iod(ubq, req->tag)->addr = io->addr;
+	ublk_get_iod(ubq, req->tag)->addr = io->buf.addr;
 	pr_devel("%s: update iod->addr: qid %d tag %d io_flags %x addr %llx\n",
 			__func__, ubq->q_id, req->tag, io->flags,
 			ublk_get_iod(ubq, req->tag)->addr);
@@ -2367,7 +2330,7 @@ static int ublk_ch_uring_cmd_local(struct io_uring_cmd *cmd,
 	u16 buf_idx = UBLK_INVALID_BUF_IDX;
 	struct ublk_device *ub = cmd->file->private_data;
 	struct ublk_queue *ubq;
-	struct ublk_io *io;
+	struct ublk_io *io = NULL;
 	u32 cmd_op = cmd->cmd_op;
 	u16 q_id = READ_ONCE(ub_src->q_id);
 	u16 tag = READ_ONCE(ub_src->tag);
@@ -2488,7 +2451,7 @@ static int ublk_ch_uring_cmd_local(struct io_uring_cmd *cmd,
 
  out:
 	pr_devel("%s: complete: cmd op %d, tag %d ret %x io_flags %x\n",
-			__func__, cmd_op, tag, ret, io->flags);
+			__func__, cmd_op, tag, ret, io ? io->flags : 0);
 	return ret;
 }
 
@@ -2523,9 +2486,10 @@ fail_put:
 	return NULL;
 }
 
-static void ublk_ch_uring_cmd_cb(struct io_uring_cmd *cmd,
-		unsigned int issue_flags)
+static void ublk_ch_uring_cmd_cb(struct io_tw_req tw_req, io_tw_token_t tw)
 {
+	unsigned int issue_flags = IO_URING_CMD_TASK_WORK_ISSUE_FLAGS;
+	struct io_uring_cmd *cmd = io_uring_cmd_from_tw(tw_req);
 	int ret = ublk_ch_uring_cmd_local(cmd, issue_flags);
 
 	if (ret != -EIOCBQUEUED)
@@ -2575,9 +2539,6 @@ static struct request *ublk_check_and_get_req(struct kiocb *iocb,
 	size_t buf_off;
 	u16 tag, q_id;
 
-	if (!ub)
-		return ERR_PTR(-EACCES);
-
 	if (!user_backed_iter(iter))
 		return ERR_PTR(-EACCES);
 
@@ -2603,9 +2564,6 @@ static struct request *ublk_check_and_get_req(struct kiocb *iocb,
 	if (!req)
 		return ERR_PTR(-EINVAL);
 
-	if (!req->mq_hctx || !req->mq_hctx->driver_data)
-		goto fail;
-
 	if (!ublk_check_ubuf_dir(req, dir))
 		goto fail;
 
@@ -2662,9 +2620,13 @@ static const struct file_operations ublk_ch_fops = {
 
 static void ublk_deinit_queue(struct ublk_device *ub, int q_id)
 {
-	int size = ublk_queue_cmd_buf_size(ub);
-	struct ublk_queue *ubq = ublk_get_queue(ub, q_id);
-	int i;
+	struct ublk_queue *ubq = ub->queues[q_id];
+	int size, i;
+
+	if (!ubq)
+		return;
+
+	size = ublk_queue_cmd_buf_size(ub);
 
 	for (i = 0; i < ubq->q_depth; i++) {
 		struct ublk_io *io = &ubq->ios[i];
@@ -2676,57 +2638,76 @@ static void ublk_deinit_queue(struct ublk_device *ub, int q_id)
 
 	if (ubq->io_cmd_buf)
 		free_pages((unsigned long)ubq->io_cmd_buf, get_order(size));
+
+	kvfree(ubq);
+	ub->queues[q_id] = NULL;
+}
+
+static int ublk_get_queue_numa_node(struct ublk_device *ub, int q_id)
+{
+	unsigned int cpu;
+
+	/* Find first CPU mapped to this queue */
+	for_each_possible_cpu(cpu) {
+		if (ub->tag_set.map[HCTX_TYPE_DEFAULT].mq_map[cpu] == q_id)
+			return cpu_to_node(cpu);
+	}
+
+	return NUMA_NO_NODE;
 }
 
 static int ublk_init_queue(struct ublk_device *ub, int q_id)
 {
-	struct ublk_queue *ubq = ublk_get_queue(ub, q_id);
+	int depth = ub->dev_info.queue_depth;
 	gfp_t gfp_flags = GFP_KERNEL | __GFP_ZERO;
-	void *ptr;
+	struct ublk_queue *ubq;
+	struct page *page;
+	int numa_node;
 	int size;
 
+	/* Determine NUMA node based on queue's CPU affinity */
+	numa_node = ublk_get_queue_numa_node(ub, q_id);
+
+	/* Allocate queue structure on local NUMA node */
+	ubq = kvzalloc_node(struct_size(ubq, ios, depth), GFP_KERNEL,
+			    numa_node);
+	if (!ubq)
+		return -ENOMEM;
+
 	spin_lock_init(&ubq->cancel_lock);
 	ubq->flags = ub->dev_info.flags;
 	ubq->q_id = q_id;
-	ubq->q_depth = ub->dev_info.queue_depth;
+	ubq->q_depth = depth;
 	size = ublk_queue_cmd_buf_size(ub);
 
-	ptr = (void *) __get_free_pages(gfp_flags, get_order(size));
-	if (!ptr)
+	/* Allocate I/O command buffer on local NUMA node */
+	page = alloc_pages_node(numa_node, gfp_flags, get_order(size));
+	if (!page) {
+		kvfree(ubq);
 		return -ENOMEM;
+	}
+	ubq->io_cmd_buf = page_address(page);
 
-	ubq->io_cmd_buf = ptr;
+	ub->queues[q_id] = ubq;
 	ubq->dev = ub;
 	return 0;
 }
 
 static void ublk_deinit_queues(struct ublk_device *ub)
 {
-	int nr_queues = ub->dev_info.nr_hw_queues;
 	int i;
 
-	if (!ub->__queues)
-		return;
-
-	for (i = 0; i < nr_queues; i++)
+	for (i = 0; i < ub->dev_info.nr_hw_queues; i++)
 		ublk_deinit_queue(ub, i);
-	kvfree(ub->__queues);
 }
 
 static int ublk_init_queues(struct ublk_device *ub)
 {
-	int nr_queues = ub->dev_info.nr_hw_queues;
-	int depth = ub->dev_info.queue_depth;
-	int ubq_size = sizeof(struct ublk_queue) + depth * sizeof(struct ublk_io);
-	int i, ret = -ENOMEM;
+	int i, ret;
 
-	ub->queue_size = ubq_size;
-	ub->__queues = kvcalloc(nr_queues, ubq_size, GFP_KERNEL);
-	if (!ub->__queues)
-		return ret;
-
-	for (i = 0; i < nr_queues; i++) {
-		if (ublk_init_queue(ub, i))
+	for (i = 0; i < ub->dev_info.nr_hw_queues; i++) {
+		ret = ublk_init_queue(ub, i);
+		if (ret)
 			goto fail;
 	}
 
@@ -3128,7 +3109,7 @@ static int ublk_ctrl_add_dev(const struct ublksrv_ctrl_cmd *header)
 		goto out_unlock;
 
 	ret = -ENOMEM;
-	ub = kzalloc(sizeof(*ub), GFP_KERNEL);
+	ub = kzalloc(struct_size(ub, queues, info.nr_hw_queues), GFP_KERNEL);
 	if (!ub)
 		goto out_unlock;
 	mutex_init(&ub->mutex);
@@ -3178,17 +3159,17 @@ static int ublk_ctrl_add_dev(const struct ublksrv_ctrl_cmd *header)
 			ub->dev_info.nr_hw_queues, nr_cpu_ids);
 	ublk_align_max_io_size(ub);
 
-	ret = ublk_init_queues(ub);
+	ret = ublk_add_tag_set(ub);
 	if (ret)
 		goto out_free_dev_number;
 
-	ret = ublk_add_tag_set(ub);
+	ret = ublk_init_queues(ub);
 	if (ret)
-		goto out_deinit_queues;
+		goto out_free_tag_set;
 
 	ret = -EFAULT;
 	if (copy_to_user(argp, &ub->dev_info, sizeof(info)))
-		goto out_free_tag_set;
+		goto out_deinit_queues;
 
 	/*
 	 * Add the char dev so that ublksrv daemon can be setup.
@@ -3197,10 +3178,10 @@ static int ublk_ctrl_add_dev(const struct ublksrv_ctrl_cmd *header)
 	ret = ublk_add_chdev(ub);
 	goto out_unlock;
 
-out_free_tag_set:
-	blk_mq_free_tag_set(&ub->tag_set);
 out_deinit_queues:
 	ublk_deinit_queues(ub);
+out_free_tag_set:
+	blk_mq_free_tag_set(&ub->tag_set);
 out_free_dev_number:
 	ublk_free_dev_number(ub);
 out_free_ub:
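The ublk changes above replace one flat __queues buffer with a flexible array of per-queue pointers sized via struct_size(), annotate the per-queue I/O array with __counted_by(), and allocate each queue on the NUMA node of the CPUs mapped to it. A reduced sketch of that allocation scheme (demo_* names are hypothetical, error handling simplified):

#include <linux/overflow.h>
#include <linux/slab.h>
#include <linux/types.h>

struct demo_io {
	u64 addr;
};

struct demo_queue {
	int q_depth;
	struct demo_io ios[] __counted_by(q_depth);
};

/* Sketch: allocate one queue on a given NUMA node, as ublk_init_queue() does. */
static struct demo_queue *demo_alloc_queue(int depth, int numa_node)
{
	struct demo_queue *q;

	/* struct_size() guards the flexible-array multiplication against overflow. */
	q = kvzalloc_node(struct_size(q, ios, depth), GFP_KERNEL, numa_node);
	if (q)
		q->q_depth = depth;
	return q;
}

Tying each queue's memory to the node derived from the blk-mq CPU map keeps the hot per-I/O structures local to the CPUs that actually touch them.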
diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c
index f061420dfb10..357434bdae99 100644
--- a/drivers/block/virtio_blk.c
+++ b/drivers/block/virtio_blk.c
@@ -584,7 +584,8 @@ out:
 
 static int virtblk_parse_zone(struct virtio_blk *vblk,
 			       struct virtio_blk_zone_descriptor *entry,
-			       unsigned int idx, report_zones_cb cb, void *data)
+			       unsigned int idx,
+			       struct blk_report_zones_args *args)
 {
 	struct blk_zone zone = { };
 
@@ -650,12 +651,12 @@ static int virtblk_parse_zone(struct virtio_blk *vblk,
 	 * The callback below checks the validity of the reported
 	 * entry data, no need to further validate it here.
 	 */
-	return cb(&zone, idx, data);
+	return disk_report_zone(vblk->disk, &zone, idx, args);
 }
 
 static int virtblk_report_zones(struct gendisk *disk, sector_t sector,
-				unsigned int nr_zones, report_zones_cb cb,
-				void *data)
+				unsigned int nr_zones,
+				struct blk_report_zones_args *args)
 {
 	struct virtio_blk *vblk = disk->private_data;
 	struct virtio_blk_zone_report *report;
@@ -693,7 +694,7 @@ static int virtblk_report_zones(struct gendisk *disk, sector_t sector,
 
 		for (i = 0; i < nz && zone_idx < nr_zones; i++) {
 			ret = virtblk_parse_zone(vblk, &report->zones[i],
-						 zone_idx, cb, data);
+						 zone_idx, args);
 			if (ret)
 				goto fail_report;
 
@@ -1026,8 +1027,13 @@ static int init_vq(struct virtio_blk *vblk)
 out:
 	kfree(vqs);
 	kfree(vqs_info);
-	if (err)
+	if (err) {
 		kfree(vblk->vqs);
+		/*
+		 * Set to NULL to prevent freeing vqs again during freezing.
+		 */
+		vblk->vqs = NULL;
+	}
 	return err;
 }
 
@@ -1598,6 +1604,12 @@ static int virtblk_freeze_priv(struct virtio_device *vdev)
 
 	vdev->config->del_vqs(vdev);
 	kfree(vblk->vqs);
+	/*
+	 * Set to NULL to prevent freeing vqs again after a failed vqs
+	 * allocation during resume. Note that kfree() already handles NULL
+	 * pointers safely.
+	 */
+	vblk->vqs = NULL;
 
 	return 0;
 }
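Both virtio_blk hunks above pair kfree(vblk->vqs) with vblk->vqs = NULL so the freeze/resume path cannot double-free when a later allocation fails; since kfree(NULL) is a no-op, the teardown becomes idempotent. The idiom in isolation, on a hypothetical structure:

#include <linux/slab.h>

struct demo_dev {
	void *vqs;
};

/* Sketch: idempotent teardown, safe to call more than once. */
static void demo_teardown(struct demo_dev *d)
{
	kfree(d->vqs);
	/* Clear the pointer so a repeated teardown just sees kfree(NULL). */
	d->vqs = NULL;
}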
diff --git a/drivers/block/zloop.c b/drivers/block/zloop.c
index a423228e201b..3f50321aa4a7 100644
--- a/drivers/block/zloop.c
+++ b/drivers/block/zloop.c
@@ -32,6 +32,8 @@ enum {
 	ZLOOP_OPT_NR_QUEUES	= (1 << 6),
 	ZLOOP_OPT_QUEUE_DEPTH	= (1 << 7),
 	ZLOOP_OPT_BUFFERED_IO	= (1 << 8),
+	ZLOOP_OPT_ZONE_APPEND	= (1 << 9),
+	ZLOOP_OPT_ORDERED_ZONE_APPEND = (1 << 10),
 };
 
 static const match_table_t zloop_opt_tokens = {
@@ -44,6 +46,8 @@ static const match_table_t zloop_opt_tokens = {
 	{ ZLOOP_OPT_NR_QUEUES,		"nr_queues=%u"		},
 	{ ZLOOP_OPT_QUEUE_DEPTH,	"queue_depth=%u"	},
 	{ ZLOOP_OPT_BUFFERED_IO,	"buffered_io"		},
+	{ ZLOOP_OPT_ZONE_APPEND,	"zone_append=%u"	},
+	{ ZLOOP_OPT_ORDERED_ZONE_APPEND, "ordered_zone_append"	},
 	{ ZLOOP_OPT_ERR,		NULL			}
 };
 
@@ -56,6 +60,8 @@ static const match_table_t zloop_opt_tokens = {
 #define ZLOOP_DEF_NR_QUEUES		1
 #define ZLOOP_DEF_QUEUE_DEPTH		128
 #define ZLOOP_DEF_BUFFERED_IO		false
+#define ZLOOP_DEF_ZONE_APPEND		true
+#define ZLOOP_DEF_ORDERED_ZONE_APPEND	false
 
 /* Arbitrary limit on the zone size (16GB). */
 #define ZLOOP_MAX_ZONE_SIZE_MB	16384
@@ -71,6 +77,8 @@ struct zloop_options {
 	unsigned int		nr_queues;
 	unsigned int		queue_depth;
 	bool			buffered_io;
+	bool			zone_append;
+	bool			ordered_zone_append;
 };
 
 /*
@@ -92,6 +100,7 @@ struct zloop_zone {
 	unsigned long		flags;
 
 	struct mutex		lock;
+	spinlock_t		wp_lock;
 	enum blk_zone_cond	cond;
 	sector_t		start;
 	sector_t		wp;
@@ -108,6 +117,8 @@ struct zloop_device {
 	struct workqueue_struct *workqueue;
 	bool			buffered_io;
+	bool			zone_append;
+	bool			ordered_zone_append;
 
 	const char		*base_dir;
 	struct file		*data_dir;
@@ -147,6 +158,7 @@ static int zloop_update_seq_zone(struct zloop_device *zlo, unsigned int zone_no)
 	struct zloop_zone *zone = &zlo->zones[zone_no];
 	struct kstat stat;
 	sector_t file_sectors;
+	unsigned long flags;
 	int ret;
 
 	lockdep_assert_held(&zone->lock);
@@ -172,16 +184,18 @@ static int zloop_update_seq_zone(struct zloop_device *zlo, unsigned int zone_no)
 		return -EINVAL;
 	}
 
+	spin_lock_irqsave(&zone->wp_lock, flags);
 	if (!file_sectors) {
 		zone->cond = BLK_ZONE_COND_EMPTY;
 		zone->wp = zone->start;
 	} else if (file_sectors == zlo->zone_capacity) {
 		zone->cond = BLK_ZONE_COND_FULL;
-		zone->wp = zone->start + zlo->zone_size;
+		zone->wp = ULLONG_MAX;
 	} else {
 		zone->cond = BLK_ZONE_COND_CLOSED;
 		zone->wp = zone->start + file_sectors;
 	}
+	spin_unlock_irqrestore(&zone->wp_lock, flags);
 
 	return 0;
 }
@@ -225,6 +239,7 @@ unlock:
 static int zloop_close_zone(struct zloop_device *zlo, unsigned int zone_no)
 {
 	struct zloop_zone *zone = &zlo->zones[zone_no];
+	unsigned long flags;
 	int ret = 0;
 
 	if (test_bit(ZLOOP_ZONE_CONV, &zone->flags))
@@ -243,10 +258,12 @@ static int zloop_close_zone(struct zloop_device *zlo, unsigned int zone_no)
 		break;
 	case BLK_ZONE_COND_IMP_OPEN:
 	case BLK_ZONE_COND_EXP_OPEN:
+		spin_lock_irqsave(&zone->wp_lock, flags);
 		if (zone->wp == zone->start)
 			zone->cond = BLK_ZONE_COND_EMPTY;
 		else
 			zone->cond = BLK_ZONE_COND_CLOSED;
+		spin_unlock_irqrestore(&zone->wp_lock, flags);
 		break;
 	case BLK_ZONE_COND_EMPTY:
 	case BLK_ZONE_COND_FULL:
@@ -264,6 +281,7 @@ unlock:
 static int zloop_reset_zone(struct zloop_device *zlo, unsigned int zone_no)
 {
 	struct zloop_zone *zone = &zlo->zones[zone_no];
+	unsigned long flags;
 	int ret = 0;
 
 	if (test_bit(ZLOOP_ZONE_CONV, &zone->flags))
@@ -281,9 +299,11 @@ static int zloop_reset_zone(struct zloop_device *zlo, unsigned int zone_no)
 		goto unlock;
 	}
 
+	spin_lock_irqsave(&zone->wp_lock, flags);
 	zone->cond = BLK_ZONE_COND_EMPTY;
 	zone->wp = zone->start;
 	clear_bit(ZLOOP_ZONE_SEQ_ERROR, &zone->flags);
+	spin_unlock_irqrestore(&zone->wp_lock, flags);
 
 unlock:
 	mutex_unlock(&zone->lock);
@@ -308,6 +328,7 @@ static int zloop_reset_all_zones(struct zloop_device *zlo)
 static int zloop_finish_zone(struct zloop_device *zlo, unsigned int zone_no)
 {
 	struct zloop_zone *zone = &zlo->zones[zone_no];
+	unsigned long flags;
 	int ret = 0;
 
 	if (test_bit(ZLOOP_ZONE_CONV, &zone->flags))
@@ -325,9 +346,11 @@ static int zloop_finish_zone(struct zloop_device *zlo, unsigned int zone_no)
 		goto unlock;
 	}
 
+	spin_lock_irqsave(&zone->wp_lock, flags);
 	zone->cond = BLK_ZONE_COND_FULL;
-	zone->wp = zone->start + zlo->zone_size;
+	zone->wp = ULLONG_MAX;
 	clear_bit(ZLOOP_ZONE_SEQ_ERROR, &zone->flags);
+	spin_unlock_irqrestore(&zone->wp_lock, flags);
 
 unlock:
 	mutex_unlock(&zone->lock);
@@ -369,6 +392,7 @@ static void zloop_rw(struct zloop_cmd *cmd)
 	struct zloop_zone *zone;
 	struct iov_iter iter;
 	struct bio_vec tmp;
+	unsigned long flags;
 	sector_t zone_end;
 	int nr_bvec = 0;
 	int ret;
@@ -378,6 +402,11 @@ static void zloop_rw(struct zloop_cmd *cmd)
 	cmd->nr_sectors = nr_sectors;
 	cmd->ret = 0;
 
+	if (WARN_ON_ONCE(is_append && !zlo->zone_append)) {
+		ret = -EIO;
+		goto out;
+	}
+
 	/* We should never get an I/O beyond the device capacity. */
 	if (WARN_ON_ONCE(zone_no >= zlo->nr_zones)) {
 		ret = -EIO;
@@ -406,16 +435,31 @@ static void zloop_rw(struct zloop_cmd *cmd)
 	if (!test_bit(ZLOOP_ZONE_CONV, &zone->flags) && is_write) {
 		mutex_lock(&zone->lock);
 
-		if (is_append) {
-			sector = zone->wp;
-			cmd->sector = sector;
-		}
+		spin_lock_irqsave(&zone->wp_lock, flags);
 
 		/*
-		 * Write operations must be aligned to the write pointer and
-		 * fully contained within the zone capacity.
+		 * Zone append operations always go at the current write
+		 * pointer, but regular write operations must already be
+		 * aligned to the write pointer when submitted.
 		 */
-		if (sector != zone->wp || zone->wp + nr_sectors > zone_end) {
+		if (is_append) {
+			/*
+			 * If ordered zone append is in use, we already checked
+			 * and set the target sector in zloop_queue_rq().
+			 */
+			if (!zlo->ordered_zone_append) {
+				if (zone->cond == BLK_ZONE_COND_FULL ||
+				    zone->wp + nr_sectors > zone_end) {
+					spin_unlock_irqrestore(&zone->wp_lock,
+							       flags);
+					ret = -EIO;
+					goto unlock;
+				}
+				sector = zone->wp;
+			}
+			cmd->sector = sector;
+		} else if (sector != zone->wp) {
+			spin_unlock_irqrestore(&zone->wp_lock, flags);
 			pr_err("Zone %u: unaligned write: sect %llu, wp %llu\n",
 			       zone_no, sector, zone->wp);
 			ret = -EIO;
@@ -428,13 +472,19 @@ static void zloop_rw(struct zloop_cmd *cmd)
 			zone->cond = BLK_ZONE_COND_IMP_OPEN;
 
 		/*
-		 * Advance the write pointer of sequential zones. If the write
-		 * fails, the wp position will be corrected when the next I/O
-		 * copmpletes.
+		 * Advance the write pointer, unless ordered zone append is in
+		 * use. If the write fails, the write pointer position will be
+		 * corrected when the next I/O starts execution.
 		 */
-		zone->wp += nr_sectors;
-		if (zone->wp == zone_end)
-			zone->cond = BLK_ZONE_COND_FULL;
+		if (!is_append || !zlo->ordered_zone_append) {
+			zone->wp += nr_sectors;
+			if (zone->wp == zone_end) {
+				zone->cond = BLK_ZONE_COND_FULL;
+				zone->wp = ULLONG_MAX;
+			}
+		}
+
+		spin_unlock_irqrestore(&zone->wp_lock, flags);
 	}
 
 	rq_for_each_bvec(tmp, rq, rq_iter)
@@ -498,6 +548,10 @@ static void zloop_handle_cmd(struct zloop_cmd *cmd)
 	struct request *rq = blk_mq_rq_from_pdu(cmd);
 	struct zloop_device *zlo = rq->q->queuedata;
 
+	/* We can block in this context, so ignore REQ_NOWAIT. */
+	if (rq->cmd_flags & REQ_NOWAIT)
+		rq->cmd_flags &= ~REQ_NOWAIT;
+
 	switch (req_op(rq)) {
 	case REQ_OP_READ:
 	case REQ_OP_WRITE:
@@ -608,6 +662,35 @@ static void zloop_complete_rq(struct request *rq)
 	blk_mq_end_request(rq, sts);
 }
 
+static bool zloop_set_zone_append_sector(struct request *rq)
+{
+	struct zloop_device *zlo = rq->q->queuedata;
+	unsigned int zone_no = rq_zone_no(rq);
+	struct zloop_zone *zone = &zlo->zones[zone_no];
+	sector_t zone_end = zone->start + zlo->zone_capacity;
+	sector_t nr_sectors = blk_rq_sectors(rq);
+	unsigned long flags;
+
+	spin_lock_irqsave(&zone->wp_lock, flags);
+
+	if (zone->cond == BLK_ZONE_COND_FULL ||
+	    zone->wp + nr_sectors > zone_end) {
+		spin_unlock_irqrestore(&zone->wp_lock, flags);
+		return false;
+	}
+
+	rq->__sector = zone->wp;
+	zone->wp += blk_rq_sectors(rq);
+	if (zone->wp >= zone_end) {
+		zone->cond = BLK_ZONE_COND_FULL;
+		zone->wp = ULLONG_MAX;
+	}
+
+	spin_unlock_irqrestore(&zone->wp_lock, flags);
+
+	return true;
+}
+
 static blk_status_t zloop_queue_rq(struct blk_mq_hw_ctx *hctx,
 				   const struct blk_mq_queue_data *bd)
 {
@@ -618,6 +701,16 @@ static blk_status_t zloop_queue_rq(struct blk_mq_hw_ctx *hctx,
 	if (zlo->state == Zlo_deleting)
 		return BLK_STS_IOERR;
 
+	/*
+	 * If we need to strongly order zone append operations, set the request
+	 * sector to the zone write pointer location now instead of when the
+	 * command work runs.
+	 */
+	if (zlo->ordered_zone_append && req_op(rq) == REQ_OP_ZONE_APPEND) {
+		if (!zloop_set_zone_append_sector(rq))
+			return BLK_STS_IOERR;
+	}
+
 	blk_mq_start_request(rq);
 
 	INIT_WORK(&cmd->work, zloop_cmd_workfn);
@@ -647,11 +740,12 @@ static int zloop_open(struct gendisk *disk, blk_mode_t mode)
 }
 
 static int zloop_report_zones(struct gendisk *disk, sector_t sector,
-		unsigned int nr_zones, report_zones_cb cb, void *data)
+		unsigned int nr_zones, struct blk_report_zones_args *args)
 {
 	struct zloop_device *zlo = disk->private_data;
 	struct blk_zone blkz = {};
 	unsigned int first, i;
+	unsigned long flags;
 	int ret;
 
 	first = disk_zone_no(disk, sector);
@@ -675,7 +769,9 @@ static int zloop_report_zones(struct gendisk *disk, sector_t sector,
 
 		blkz.start = zone->start;
 		blkz.len = zlo->zone_size;
+		spin_lock_irqsave(&zone->wp_lock, flags);
 		blkz.wp = zone->wp;
+		spin_unlock_irqrestore(&zone->wp_lock, flags);
 		blkz.cond = zone->cond;
 		if (test_bit(ZLOOP_ZONE_CONV, &zone->flags)) {
 			blkz.type = BLK_ZONE_TYPE_CONVENTIONAL;
@@ -687,7 +783,7 @@ static int zloop_report_zones(struct gendisk *disk, sector_t sector,
 
 		mutex_unlock(&zone->lock);
 
-		ret = cb(&blkz, i, data);
+		ret = disk_report_zone(disk, &blkz, i, args);
 		if (ret)
 			return ret;
 	}
@@ -783,6 +879,7 @@ static int zloop_init_zone(struct zloop_device *zlo, struct zloop_options *opts,
 	int ret;
 
 	mutex_init(&zone->lock);
+	spin_lock_init(&zone->wp_lock);
 	zone->start = (sector_t)zone_no << zlo->zone_shift;
 
 	if (!restore)
@@ -884,7 +981,6 @@ static int zloop_ctl_add(struct zloop_options *opts)
 {
 	struct queue_limits lim = {
 		.max_hw_sectors		= SZ_1M >> SECTOR_SHIFT,
-		.max_hw_zone_append_sectors = SZ_1M >> SECTOR_SHIFT,
 		.chunk_sectors		= opts->zone_size,
 		.features		= BLK_FEAT_ZONED,
 	};
@@ -936,6 +1032,9 @@ static int zloop_ctl_add(struct zloop_options *opts)
 	zlo->nr_zones = nr_zones;
 	zlo->nr_conv_zones = opts->nr_conv_zones;
 	zlo->buffered_io = opts->buffered_io;
+	zlo->zone_append = opts->zone_append;
+	if (zlo->zone_append)
+		zlo->ordered_zone_append = opts->ordered_zone_append;
 
 	zlo->workqueue = alloc_workqueue("zloop%d", WQ_UNBOUND | WQ_FREEZABLE,
 					opts->nr_queues * opts->queue_depth, zlo->id);
@@ -976,6 +1075,8 @@ static int zloop_ctl_add(struct zloop_options *opts)
 
 	lim.physical_block_size = zlo->block_size;
 	lim.logical_block_size = zlo->block_size;
+	if (zlo->zone_append)
+		lim.max_hw_zone_append_sectors = lim.max_hw_sectors;
 
 	zlo->tag_set.ops = &zloop_mq_ops;
 	zlo->tag_set.nr_hw_queues = opts->nr_queues;
@@ -1016,10 +1117,14 @@ static int zloop_ctl_add(struct zloop_options *opts)
 	zlo->state = Zlo_live;
 	mutex_unlock(&zloop_ctl_mutex);
 
-	pr_info("Added device %d: %u zones of %llu MB, %u B block size\n",
+	pr_info("zloop: device %d, %u zones of %llu MiB, %u B block size\n",
 		zlo->id, zlo->nr_zones,
 		((sector_t)zlo->zone_size << SECTOR_SHIFT) >> 20,
 		zlo->block_size);
+	pr_info("zloop%d: using %s%s zone append\n",
+		zlo->id,
+		zlo->ordered_zone_append ? "ordered " : "",
+		zlo->zone_append ? "native" : "emulated");
 
 	return 0;
 
@@ -1106,6 +1211,8 @@ static int zloop_parse_options(struct zloop_options *opts, const char *buf)
 	opts->nr_queues = ZLOOP_DEF_NR_QUEUES;
 	opts->queue_depth = ZLOOP_DEF_QUEUE_DEPTH;
 	opts->buffered_io = ZLOOP_DEF_BUFFERED_IO;
+	opts->zone_append = ZLOOP_DEF_ZONE_APPEND;
+	opts->ordered_zone_append = ZLOOP_DEF_ORDERED_ZONE_APPEND;
 
 	if (!buf)
 		return 0;
@@ -1215,6 +1322,21 @@ static int zloop_parse_options(struct zloop_options *opts, const char *buf)
 		case ZLOOP_OPT_BUFFERED_IO:
 			opts->buffered_io = true;
 			break;
+		case ZLOOP_OPT_ZONE_APPEND:
+			if (match_uint(args, &token)) {
+				ret = -EINVAL;
+				goto out;
+			}
+			if (token != 0 && token != 1) {
+				pr_err("Invalid zone_append value\n");
+				ret = -EINVAL;
+				goto out;
+			}
+			opts->zone_append = token;
+			break;
+		case ZLOOP_OPT_ORDERED_ZONE_APPEND:
+			opts->ordered_zone_append = true;
+			break;
		case ZLOOP_OPT_ERR:
		default:
			pr_warn("unknown parameter or missing value '%s'\n", p);
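With ordered_zone_append, zloop resolves a zone append's target sector at queue time: the write pointer lives under a per-zone spinlock, the append is rejected if the zone is full or would overflow its capacity, and the pointer is poisoned with ULLONG_MAX once the zone fills. A reduced sketch of that allocation step, with types simplified from zloop_set_zone_append_sector():

#include <linux/blkdev.h>
#include <linux/kernel.h>
#include <linux/spinlock.h>

struct demo_zone {
	spinlock_t wp_lock;
	sector_t start;
	sector_t capacity;
	sector_t wp;	/* ULLONG_MAX once the zone is full */
};

/* Sketch: reserve nr_sectors at the write pointer; false if it won't fit. */
static bool demo_zone_append(struct demo_zone *z, sector_t nr_sectors,
			     sector_t *out_sector)
{
	sector_t zone_end = z->start + z->capacity;
	unsigned long flags;

	spin_lock_irqsave(&z->wp_lock, flags);
	if (z->wp == ULLONG_MAX || z->wp + nr_sectors > zone_end) {
		spin_unlock_irqrestore(&z->wp_lock, flags);
		return false;
	}
	*out_sector = z->wp;
	z->wp += nr_sectors;
	if (z->wp >= zone_end)
		z->wp = ULLONG_MAX;	/* poison: no further appends */
	spin_unlock_irqrestore(&z->wp_lock, flags);
	return true;
}

Deciding the sector before blk_mq_start_request() is what makes the appends "ordered": requests complete at the positions they were queued in, at the cost of rejecting appends early instead of letting the worker retry.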
