summaryrefslogtreecommitdiff
path: root/drivers/block
diff options
context:
space:
mode:
Diffstat (limited to 'drivers/block')
-rw-r--r--drivers/block/drbd/drbd_bitmap.c10
-rw-r--r--drivers/block/drbd/drbd_receiver.c14
-rw-r--r--drivers/block/floppy.c2
-rw-r--r--drivers/block/loop.c4
-rw-r--r--drivers/block/nbd.c59
-rw-r--r--drivers/block/null_blk/main.c81
-rw-r--r--drivers/block/null_blk/null_blk.h3
-rw-r--r--drivers/block/null_blk/zoned.c6
-rw-r--r--drivers/block/ps3disk.c4
-rw-r--r--drivers/block/rnbd/rnbd-proto.h15
-rw-r--r--drivers/block/rnull/configfs.rs9
-rw-r--r--drivers/block/rnull/rnull.rs3
-rw-r--r--drivers/block/ublk_drv.c407
-rw-r--r--drivers/block/virtio_blk.c24
-rw-r--r--drivers/block/zloop.c160
-rw-r--r--drivers/block/zram/zram_drv.c483
-rw-r--r--drivers/block/zram/zram_drv.h2
17 files changed, 837 insertions, 449 deletions
diff --git a/drivers/block/drbd/drbd_bitmap.c b/drivers/block/drbd/drbd_bitmap.c
index 85ca000a0564..d90fa3e7f4cf 100644
--- a/drivers/block/drbd/drbd_bitmap.c
+++ b/drivers/block/drbd/drbd_bitmap.c
@@ -1210,7 +1210,7 @@ static int bm_rw(struct drbd_device *device, const unsigned int flags, unsigned
return err;
}
-/**
+/*
* drbd_bm_read() - Read the whole bitmap from its on disk location.
* @device: DRBD device.
*/
@@ -1221,7 +1221,7 @@ int drbd_bm_read(struct drbd_device *device,
return bm_rw(device, BM_AIO_READ, 0);
}
-/**
+/*
* drbd_bm_write() - Write the whole bitmap to its on disk location.
* @device: DRBD device.
*
@@ -1233,7 +1233,7 @@ int drbd_bm_write(struct drbd_device *device,
return bm_rw(device, 0, 0);
}
-/**
+/*
* drbd_bm_write_all() - Write the whole bitmap to its on disk location.
* @device: DRBD device.
*
@@ -1255,7 +1255,7 @@ int drbd_bm_write_lazy(struct drbd_device *device, unsigned upper_idx) __must_ho
return bm_rw(device, BM_AIO_COPY_PAGES, upper_idx);
}
-/**
+/*
* drbd_bm_write_copy_pages() - Write the whole bitmap to its on disk location.
* @device: DRBD device.
*
@@ -1272,7 +1272,7 @@ int drbd_bm_write_copy_pages(struct drbd_device *device,
return bm_rw(device, BM_AIO_COPY_PAGES, 0);
}
-/**
+/*
* drbd_bm_write_hinted() - Write bitmap pages with "hint" marks, if they have changed.
* @device: DRBD device.
*/
diff --git a/drivers/block/drbd/drbd_receiver.c b/drivers/block/drbd/drbd_receiver.c
index caaf2781136d..3de919b6f0e1 100644
--- a/drivers/block/drbd/drbd_receiver.c
+++ b/drivers/block/drbd/drbd_receiver.c
@@ -450,7 +450,7 @@ static struct socket *drbd_try_connect(struct drbd_connection *connection)
* a free one dynamically.
*/
what = "bind before connect";
- err = sock->ops->bind(sock, (struct sockaddr *) &src_in6, my_addr_len);
+ err = sock->ops->bind(sock, (struct sockaddr_unsized *) &src_in6, my_addr_len);
if (err < 0)
goto out;
@@ -458,7 +458,7 @@ static struct socket *drbd_try_connect(struct drbd_connection *connection)
* stay C_WF_CONNECTION, don't go Disconnecting! */
disconnect_on_error = 0;
what = "connect";
- err = sock->ops->connect(sock, (struct sockaddr *) &peer_in6, peer_addr_len, 0);
+ err = sock->ops->connect(sock, (struct sockaddr_unsized *) &peer_in6, peer_addr_len, 0);
out:
if (err < 0) {
@@ -537,7 +537,7 @@ static int prepare_listen_socket(struct drbd_connection *connection, struct acce
drbd_setbufsize(s_listen, sndbuf_size, rcvbuf_size);
what = "bind before listen";
- err = s_listen->ops->bind(s_listen, (struct sockaddr *)&my_addr, my_addr_len);
+ err = s_listen->ops->bind(s_listen, (struct sockaddr_unsized *)&my_addr, my_addr_len);
if (err < 0)
goto out;
@@ -1736,13 +1736,13 @@ read_in_block(struct drbd_peer_device *peer_device, u64 id, sector_t sector,
page = peer_req->pages;
page_chain_for_each(page) {
unsigned len = min_t(int, ds, PAGE_SIZE);
- data = kmap(page);
+ data = kmap_local_page(page);
err = drbd_recv_all_warn(peer_device->connection, data, len);
if (drbd_insert_fault(device, DRBD_FAULT_RECEIVE)) {
drbd_err(device, "Fault injection: Corrupting data on receive\n");
data[0] = data[0] ^ (unsigned long)-1;
}
- kunmap(page);
+ kunmap_local(data);
if (err) {
drbd_free_peer_req(device, peer_req);
return NULL;
@@ -1777,7 +1777,7 @@ static int drbd_drain_block(struct drbd_peer_device *peer_device, int data_size)
page = drbd_alloc_pages(peer_device, 1, 1);
- data = kmap(page);
+ data = kmap_local_page(page);
while (data_size) {
unsigned int len = min_t(int, data_size, PAGE_SIZE);
@@ -1786,7 +1786,7 @@ static int drbd_drain_block(struct drbd_peer_device *peer_device, int data_size)
break;
data_size -= len;
}
- kunmap(page);
+ kunmap_local(data);
drbd_free_pages(peer_device->device, page);
return err;
}
diff --git a/drivers/block/floppy.c b/drivers/block/floppy.c
index 5336c3c5ca36..c28786e0fe1c 100644
--- a/drivers/block/floppy.c
+++ b/drivers/block/floppy.c
@@ -329,7 +329,7 @@ static bool initialized;
* This default is used whenever the current disk size is unknown.
* [Now it is rather a minimum]
*/
-#define MAX_DISK_SIZE 4 /* 3984 */
+#define MAX_DISK_SIZE (PAGE_SIZE / 1024)
/*
* globals used by 'result()'
diff --git a/drivers/block/loop.c b/drivers/block/loop.c
index 13ce229d450c..ebe751f39742 100644
--- a/drivers/block/loop.c
+++ b/drivers/block/loop.c
@@ -1908,6 +1908,10 @@ static void loop_handle_cmd(struct loop_cmd *cmd)
goto failed;
}
+ /* We can block in this context, so ignore REQ_NOWAIT. */
+ if (rq->cmd_flags & REQ_NOWAIT)
+ rq->cmd_flags &= ~REQ_NOWAIT;
+
if (cmd_blkcg_css)
kthread_associate_blkcg(cmd_blkcg_css);
if (cmd_memcg_css)
diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c
index a853c65ac65d..f6c33b21f69e 100644
--- a/drivers/block/nbd.c
+++ b/drivers/block/nbd.c
@@ -52,7 +52,6 @@
static DEFINE_IDR(nbd_index_idr);
static DEFINE_MUTEX(nbd_index_mutex);
static struct workqueue_struct *nbd_del_wq;
-static struct cred *nbd_cred;
static int nbd_total_devices = 0;
struct nbd_sock {
@@ -555,7 +554,6 @@ static int __sock_xmit(struct nbd_device *nbd, struct socket *sock, int send,
int result;
struct msghdr msg = {} ;
unsigned int noreclaim_flag;
- const struct cred *old_cred;
if (unlikely(!sock)) {
dev_err_ratelimited(disk_to_dev(nbd->disk),
@@ -564,33 +562,32 @@ static int __sock_xmit(struct nbd_device *nbd, struct socket *sock, int send,
return -EINVAL;
}
- old_cred = override_creds(nbd_cred);
-
msg.msg_iter = *iter;
noreclaim_flag = memalloc_noreclaim_save();
- do {
- sock->sk->sk_allocation = GFP_NOIO | __GFP_MEMALLOC;
- sock->sk->sk_use_task_frag = false;
- msg.msg_flags = msg_flags | MSG_NOSIGNAL;
-
- if (send)
- result = sock_sendmsg(sock, &msg);
- else
- result = sock_recvmsg(sock, &msg, msg.msg_flags);
-
- if (result <= 0) {
- if (result == 0)
- result = -EPIPE; /* short read */
- break;
- }
- if (sent)
- *sent += result;
- } while (msg_data_left(&msg));
- memalloc_noreclaim_restore(noreclaim_flag);
+ scoped_with_kernel_creds() {
+ do {
+ sock->sk->sk_allocation = GFP_NOIO | __GFP_MEMALLOC;
+ sock->sk->sk_use_task_frag = false;
+ msg.msg_flags = msg_flags | MSG_NOSIGNAL;
+
+ if (send)
+ result = sock_sendmsg(sock, &msg);
+ else
+ result = sock_recvmsg(sock, &msg, msg.msg_flags);
- revert_creds(old_cred);
+ if (result <= 0) {
+ if (result == 0)
+ result = -EPIPE; /* short read */
+ break;
+ }
+ if (sent)
+ *sent += result;
+ } while (msg_data_left(&msg));
+ }
+
+ memalloc_noreclaim_restore(noreclaim_flag);
return result;
}
@@ -1024,9 +1021,9 @@ static void recv_work(struct work_struct *work)
nbd_mark_nsock_dead(nbd, nsock, 1);
mutex_unlock(&nsock->tx_lock);
- nbd_config_put(nbd);
atomic_dec(&config->recv_threads);
wake_up(&config->recv_wq);
+ nbd_config_put(nbd);
kfree(args);
}
@@ -2241,12 +2238,13 @@ again:
ret = nbd_start_device(nbd);
out:
- mutex_unlock(&nbd->config_lock);
if (!ret) {
set_bit(NBD_RT_HAS_CONFIG_REF, &config->runtime_flags);
refcount_inc(&nbd->config_refs);
nbd_connect_reply(info, nbd->index);
}
+ mutex_unlock(&nbd->config_lock);
+
nbd_config_put(nbd);
if (put_dev)
nbd_put(nbd);
@@ -2683,15 +2681,7 @@ static int __init nbd_init(void)
return -ENOMEM;
}
- nbd_cred = prepare_kernel_cred(&init_task);
- if (!nbd_cred) {
- destroy_workqueue(nbd_del_wq);
- unregister_blkdev(NBD_MAJOR, "nbd");
- return -ENOMEM;
- }
-
if (genl_register_family(&nbd_genl_family)) {
- put_cred(nbd_cred);
destroy_workqueue(nbd_del_wq);
unregister_blkdev(NBD_MAJOR, "nbd");
return -EINVAL;
@@ -2746,7 +2736,6 @@ static void __exit nbd_cleanup(void)
/* Also wait for nbd_dev_remove_work() completes */
destroy_workqueue(nbd_del_wq);
- put_cred(nbd_cred);
idr_destroy(&nbd_index_idr);
unregister_blkdev(NBD_MAJOR, "nbd");
}
diff --git a/drivers/block/null_blk/main.c b/drivers/block/null_blk/main.c
index f982027e8c85..c7c0fb79a6bf 100644
--- a/drivers/block/null_blk/main.c
+++ b/drivers/block/null_blk/main.c
@@ -1129,26 +1129,28 @@ again:
return 0;
}
-static int copy_to_nullb(struct nullb *nullb, struct page *source,
- unsigned int off, sector_t sector, size_t n, bool is_fua)
+static blk_status_t copy_to_nullb(struct nullb *nullb, void *source,
+ loff_t pos, size_t n, bool is_fua)
{
size_t temp, count = 0;
- unsigned int offset;
struct nullb_page *t_page;
+ sector_t sector;
while (count < n) {
- temp = min_t(size_t, nullb->dev->blocksize, n - count);
+ temp = min3(nullb->dev->blocksize, n - count,
+ PAGE_SIZE - offset_in_page(pos));
+ sector = pos >> SECTOR_SHIFT;
if (null_cache_active(nullb) && !is_fua)
null_make_cache_space(nullb, PAGE_SIZE);
- offset = (sector & SECTOR_MASK) << SECTOR_SHIFT;
t_page = null_insert_page(nullb, sector,
!null_cache_active(nullb) || is_fua);
if (!t_page)
- return -ENOSPC;
+ return BLK_STS_NOSPC;
- memcpy_page(t_page->page, offset, source, off + count, temp);
+ memcpy_to_page(t_page->page, offset_in_page(pos),
+ source + count, temp);
__set_bit(sector & SECTOR_MASK, t_page->bitmap);
@@ -1156,41 +1158,34 @@ static int copy_to_nullb(struct nullb *nullb, struct page *source,
null_free_sector(nullb, sector, true);
count += temp;
- sector += temp >> SECTOR_SHIFT;
+ pos += temp;
}
- return 0;
+ return BLK_STS_OK;
}
-static int copy_from_nullb(struct nullb *nullb, struct page *dest,
- unsigned int off, sector_t sector, size_t n)
+static void copy_from_nullb(struct nullb *nullb, void *dest, loff_t pos,
+ size_t n)
{
size_t temp, count = 0;
- unsigned int offset;
struct nullb_page *t_page;
+ sector_t sector;
while (count < n) {
- temp = min_t(size_t, nullb->dev->blocksize, n - count);
+ temp = min3(nullb->dev->blocksize, n - count,
+ PAGE_SIZE - offset_in_page(pos));
+ sector = pos >> SECTOR_SHIFT;
- offset = (sector & SECTOR_MASK) << SECTOR_SHIFT;
t_page = null_lookup_page(nullb, sector, false,
!null_cache_active(nullb));
-
if (t_page)
- memcpy_page(dest, off + count, t_page->page, offset,
- temp);
+ memcpy_from_page(dest + count, t_page->page,
+ offset_in_page(pos), temp);
else
- memzero_page(dest, off + count, temp);
+ memset(dest + count, 0, temp);
count += temp;
- sector += temp >> SECTOR_SHIFT;
+ pos += temp;
}
- return 0;
-}
-
-static void nullb_fill_pattern(struct nullb *nullb, struct page *page,
- unsigned int len, unsigned int off)
-{
- memset_page(page, off, 0xff, len);
}
blk_status_t null_handle_discard(struct nullb_device *dev,
@@ -1234,34 +1229,39 @@ static blk_status_t null_handle_flush(struct nullb *nullb)
return errno_to_blk_status(err);
}
-static int null_transfer(struct nullb *nullb, struct page *page,
- unsigned int len, unsigned int off, bool is_write, sector_t sector,
+static blk_status_t null_transfer(struct nullb *nullb, struct page *page,
+ unsigned int len, unsigned int off, bool is_write, loff_t pos,
bool is_fua)
{
struct nullb_device *dev = nullb->dev;
+ blk_status_t err = BLK_STS_OK;
unsigned int valid_len = len;
- int err = 0;
+ void *p;
+ p = kmap_local_page(page) + off;
if (!is_write) {
- if (dev->zoned)
+ if (dev->zoned) {
valid_len = null_zone_valid_read_len(nullb,
- sector, len);
+ pos >> SECTOR_SHIFT, len);
+ if (valid_len && valid_len != len)
+ valid_len -= pos & (SECTOR_SIZE - 1);
+ }
if (valid_len) {
- err = copy_from_nullb(nullb, page, off,
- sector, valid_len);
+ copy_from_nullb(nullb, p, pos, valid_len);
off += valid_len;
len -= valid_len;
}
if (len)
- nullb_fill_pattern(nullb, page, len, off);
+ memset(p + valid_len, 0xff, len);
flush_dcache_page(page);
} else {
flush_dcache_page(page);
- err = copy_to_nullb(nullb, page, off, sector, len, is_fua);
+ err = copy_to_nullb(nullb, p, pos, len, is_fua);
}
+ kunmap_local(p);
return err;
}
@@ -1274,9 +1274,9 @@ static blk_status_t null_handle_data_transfer(struct nullb_cmd *cmd,
{
struct request *rq = blk_mq_rq_from_pdu(cmd);
struct nullb *nullb = cmd->nq->dev->nullb;
- int err = 0;
+ blk_status_t err = BLK_STS_OK;
unsigned int len;
- sector_t sector = blk_rq_pos(rq);
+ loff_t pos = blk_rq_pos(rq) << SECTOR_SHIFT;
unsigned int max_bytes = nr_sectors << SECTOR_SHIFT;
unsigned int transferred_bytes = 0;
struct req_iterator iter;
@@ -1288,18 +1288,18 @@ static blk_status_t null_handle_data_transfer(struct nullb_cmd *cmd,
if (transferred_bytes + len > max_bytes)
len = max_bytes - transferred_bytes;
err = null_transfer(nullb, bvec.bv_page, len, bvec.bv_offset,
- op_is_write(req_op(rq)), sector,
+ op_is_write(req_op(rq)), pos,
rq->cmd_flags & REQ_FUA);
if (err)
break;
- sector += len >> SECTOR_SHIFT;
+ pos += len;
transferred_bytes += len;
if (transferred_bytes >= max_bytes)
break;
}
spin_unlock_irq(&nullb->lock);
- return errno_to_blk_status(err);
+ return err;
}
static inline blk_status_t null_handle_throttled(struct nullb_cmd *cmd)
@@ -1949,6 +1949,7 @@ static int null_add_dev(struct nullb_device *dev)
.logical_block_size = dev->blocksize,
.physical_block_size = dev->blocksize,
.max_hw_sectors = dev->max_sectors,
+ .dma_alignment = 1,
};
struct nullb *nullb;
diff --git a/drivers/block/null_blk/null_blk.h b/drivers/block/null_blk/null_blk.h
index 7bb6128dbaaf..6c4c4bbe7dad 100644
--- a/drivers/block/null_blk/null_blk.h
+++ b/drivers/block/null_blk/null_blk.h
@@ -143,7 +143,8 @@ int null_init_zoned_dev(struct nullb_device *dev, struct queue_limits *lim);
int null_register_zoned_dev(struct nullb *nullb);
void null_free_zoned_dev(struct nullb_device *dev);
int null_report_zones(struct gendisk *disk, sector_t sector,
- unsigned int nr_zones, report_zones_cb cb, void *data);
+ unsigned int nr_zones,
+ struct blk_report_zones_args *args);
blk_status_t null_process_zoned_cmd(struct nullb_cmd *cmd, enum req_op op,
sector_t sector, sector_t nr_sectors);
size_t null_zone_valid_read_len(struct nullb *nullb,
diff --git a/drivers/block/null_blk/zoned.c b/drivers/block/null_blk/zoned.c
index 4e5728f45989..0ada35dc0989 100644
--- a/drivers/block/null_blk/zoned.c
+++ b/drivers/block/null_blk/zoned.c
@@ -191,7 +191,7 @@ void null_free_zoned_dev(struct nullb_device *dev)
}
int null_report_zones(struct gendisk *disk, sector_t sector,
- unsigned int nr_zones, report_zones_cb cb, void *data)
+ unsigned int nr_zones, struct blk_report_zones_args *args)
{
struct nullb *nullb = disk->private_data;
struct nullb_device *dev = nullb->dev;
@@ -225,7 +225,7 @@ int null_report_zones(struct gendisk *disk, sector_t sector,
blkz.capacity = zone->capacity;
null_unlock_zone(dev, zone);
- error = cb(&blkz, i, data);
+ error = disk_report_zone(disk, &blkz, i, args);
if (error)
return error;
}
@@ -242,7 +242,7 @@ size_t null_zone_valid_read_len(struct nullb *nullb,
{
struct nullb_device *dev = nullb->dev;
struct nullb_zone *zone = &dev->zones[null_zone_no(dev, sector)];
- unsigned int nr_sectors = len >> SECTOR_SHIFT;
+ unsigned int nr_sectors = DIV_ROUND_UP(len, SECTOR_SIZE);
/* Read must be below the write pointer position */
if (zone->type == BLK_ZONE_TYPE_CONVENTIONAL ||
diff --git a/drivers/block/ps3disk.c b/drivers/block/ps3disk.c
index dc9e4a14b885..8892f218a814 100644
--- a/drivers/block/ps3disk.c
+++ b/drivers/block/ps3disk.c
@@ -85,10 +85,14 @@ static void ps3disk_scatter_gather(struct ps3_storage_device *dev,
struct bio_vec bvec;
rq_for_each_segment(bvec, req, iter) {
+ dev_dbg(&dev->sbd.core, "%s:%u: %u sectors from %llu\n",
+ __func__, __LINE__, bio_sectors(iter.bio),
+ iter.bio->bi_iter.bi_sector);
if (gather)
memcpy_from_bvec(dev->bounce_buf + offset, &bvec);
else
memcpy_to_bvec(&bvec, dev->bounce_buf + offset);
+ offset += bvec.bv_len;
}
}
diff --git a/drivers/block/rnbd/rnbd-proto.h b/drivers/block/rnbd/rnbd-proto.h
index f35be51d213c..77360c2a6069 100644
--- a/drivers/block/rnbd/rnbd-proto.h
+++ b/drivers/block/rnbd/rnbd-proto.h
@@ -24,7 +24,7 @@
#define RTRS_PORT 1234
/**
- * enum rnbd_msg_types - RNBD message types
+ * enum rnbd_msg_type - RNBD message types
* @RNBD_MSG_SESS_INFO: initial session info from client to server
* @RNBD_MSG_SESS_INFO_RSP: initial session info from server to client
* @RNBD_MSG_OPEN: open (map) device request
@@ -47,10 +47,11 @@ enum rnbd_msg_type {
*/
struct rnbd_msg_hdr {
__le16 type;
+ /* private: */
__le16 __padding;
};
-/**
+/*
* We allow to map RO many times and RW only once. We allow to map yet another
* time RW, if MIGRATION is provided (second RW export can be required for
* example for VM migration)
@@ -78,6 +79,7 @@ static const __maybe_unused struct {
struct rnbd_msg_sess_info {
struct rnbd_msg_hdr hdr;
u8 ver;
+ /* private: */
u8 reserved[31];
};
@@ -89,6 +91,7 @@ struct rnbd_msg_sess_info {
struct rnbd_msg_sess_info_rsp {
struct rnbd_msg_hdr hdr;
u8 ver;
+ /* private: */
u8 reserved[31];
};
@@ -97,13 +100,16 @@ struct rnbd_msg_sess_info_rsp {
* @hdr: message header
* @access_mode: the mode to open remote device, valid values see:
* enum rnbd_access_mode
- * @device_name: device path on remote side
+ * @dev_name: device path on remote side
*/
struct rnbd_msg_open {
struct rnbd_msg_hdr hdr;
u8 access_mode;
+ /* private: */
u8 resv1;
+ /* public: */
s8 dev_name[NAME_MAX];
+ /* private: */
u8 reserved[3];
};
@@ -155,6 +161,7 @@ struct rnbd_msg_open_rsp {
__le16 secure_discard;
u8 obsolete_rotational;
u8 cache_policy;
+ /* private: */
u8 reserved[10];
};
@@ -187,7 +194,7 @@ struct rnbd_msg_io {
* @RNBD_OP_DISCARD: discard sectors
* @RNBD_OP_SECURE_ERASE: securely erase sectors
* @RNBD_OP_WRITE_ZEROES: write zeroes sectors
-
+ *
* @RNBD_F_SYNC: request is sync (sync write or read)
* @RNBD_F_FUA: forced unit access
*/
diff --git a/drivers/block/rnull/configfs.rs b/drivers/block/rnull/configfs.rs
index 8498e9bae6fd..6713a6d92391 100644
--- a/drivers/block/rnull/configfs.rs
+++ b/drivers/block/rnull/configfs.rs
@@ -1,12 +1,13 @@
// SPDX-License-Identifier: GPL-2.0
use super::{NullBlkDevice, THIS_MODULE};
-use core::fmt::{Display, Write};
use kernel::{
block::mq::gen_disk::{GenDisk, GenDiskBuilder},
c_str,
configfs::{self, AttributeOperations},
- configfs_attrs, new_mutex,
+ configfs_attrs,
+ fmt::{self, Write as _},
+ new_mutex,
page::PAGE_SIZE,
prelude::*,
str::{kstrtobool_bytes, CString},
@@ -99,8 +100,8 @@ impl TryFrom<u8> for IRQMode {
}
}
-impl Display for IRQMode {
- fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
+impl fmt::Display for IRQMode {
+ fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
match self {
Self::None => f.write_str("0")?,
Self::Soft => f.write_str("1")?,
diff --git a/drivers/block/rnull/rnull.rs b/drivers/block/rnull/rnull.rs
index 1ec694d7f1a6..a9d5e575a2c4 100644
--- a/drivers/block/rnull/rnull.rs
+++ b/drivers/block/rnull/rnull.rs
@@ -17,8 +17,7 @@ use kernel::{
error::Result,
pr_info,
prelude::*,
- sync::Arc,
- types::ARef,
+ sync::{aref::ARef, Arc},
};
use pin_init::PinInit;
diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c
index 0c74a41a6753..2c715df63f23 100644
--- a/drivers/block/ublk_drv.c
+++ b/drivers/block/ublk_drv.c
@@ -155,12 +155,13 @@ struct ublk_uring_cmd_pdu {
*/
#define UBLK_REFCOUNT_INIT (REFCOUNT_MAX / 2)
+union ublk_io_buf {
+ __u64 addr;
+ struct ublk_auto_buf_reg auto_reg;
+};
+
struct ublk_io {
- /* userspace buffer address from io cmd */
- union {
- __u64 addr;
- struct ublk_auto_buf_reg buf;
- };
+ union ublk_io_buf buf;
unsigned int flags;
int res;
@@ -203,15 +204,12 @@ struct ublk_queue {
bool fail_io; /* copy of dev->state == UBLK_S_DEV_FAIL_IO */
spinlock_t cancel_lock;
struct ublk_device *dev;
- struct ublk_io ios[];
+ struct ublk_io ios[] __counted_by(q_depth);
};
struct ublk_device {
struct gendisk *ub_disk;
- char *__queues;
-
- unsigned int queue_size;
struct ublksrv_ctrl_dev_info dev_info;
struct blk_mq_tag_set tag_set;
@@ -239,6 +237,8 @@ struct ublk_device {
bool canceling;
pid_t ublksrv_tgid;
struct delayed_work exit_work;
+
+ struct ublk_queue *queues[];
};
/* header of ublk_params */
@@ -265,7 +265,7 @@ static inline bool ublk_dev_is_zoned(const struct ublk_device *ub)
return ub->dev_info.flags & UBLK_F_ZONED;
}
-static inline bool ublk_queue_is_zoned(struct ublk_queue *ubq)
+static inline bool ublk_queue_is_zoned(const struct ublk_queue *ubq)
{
return ubq->flags & UBLK_F_ZONED;
}
@@ -368,7 +368,7 @@ static void *ublk_alloc_report_buffer(struct ublk_device *ublk,
}
static int ublk_report_zones(struct gendisk *disk, sector_t sector,
- unsigned int nr_zones, report_zones_cb cb, void *data)
+ unsigned int nr_zones, struct blk_report_zones_args *args)
{
struct ublk_device *ub = disk->private_data;
unsigned int zone_size_sectors = disk->queue->limits.chunk_sectors;
@@ -431,7 +431,7 @@ free_req:
if (!zone->len)
break;
- ret = cb(zone, i, data);
+ ret = disk_report_zone(disk, zone, i, args);
if (ret)
goto out;
@@ -499,7 +499,7 @@ static blk_status_t ublk_setup_iod_zoned(struct ublk_queue *ubq,
iod->op_flags = ublk_op | ublk_req_build_flags(req);
iod->nr_sectors = blk_rq_sectors(req);
iod->start_sector = blk_rq_pos(req);
- iod->addr = io->addr;
+ iod->addr = io->buf.addr;
return BLK_STS_OK;
}
@@ -781,7 +781,7 @@ static noinline void ublk_put_device(struct ublk_device *ub)
static inline struct ublk_queue *ublk_get_queue(struct ublk_device *dev,
int qid)
{
- return (struct ublk_queue *)&(dev->__queues[qid * dev->queue_size]);
+ return dev->queues[qid];
}
static inline bool ublk_rq_has_data(const struct request *rq)
@@ -914,73 +914,6 @@ static const struct block_device_operations ub_fops = {
.report_zones = ublk_report_zones,
};
-#define UBLK_MAX_PIN_PAGES 32
-
-struct ublk_io_iter {
- struct page *pages[UBLK_MAX_PIN_PAGES];
- struct bio *bio;
- struct bvec_iter iter;
-};
-
-/* return how many pages are copied */
-static void ublk_copy_io_pages(struct ublk_io_iter *data,
- size_t total, size_t pg_off, int dir)
-{
- unsigned done = 0;
- unsigned pg_idx = 0;
-
- while (done < total) {
- struct bio_vec bv = bio_iter_iovec(data->bio, data->iter);
- unsigned int bytes = min3(bv.bv_len, (unsigned)total - done,
- (unsigned)(PAGE_SIZE - pg_off));
- void *bv_buf = bvec_kmap_local(&bv);
- void *pg_buf = kmap_local_page(data->pages[pg_idx]);
-
- if (dir == ITER_DEST)
- memcpy(pg_buf + pg_off, bv_buf, bytes);
- else
- memcpy(bv_buf, pg_buf + pg_off, bytes);
-
- kunmap_local(pg_buf);
- kunmap_local(bv_buf);
-
- /* advance page array */
- pg_off += bytes;
- if (pg_off == PAGE_SIZE) {
- pg_idx += 1;
- pg_off = 0;
- }
-
- done += bytes;
-
- /* advance bio */
- bio_advance_iter_single(data->bio, &data->iter, bytes);
- if (!data->iter.bi_size) {
- data->bio = data->bio->bi_next;
- if (data->bio == NULL)
- break;
- data->iter = data->bio->bi_iter;
- }
- }
-}
-
-static bool ublk_advance_io_iter(const struct request *req,
- struct ublk_io_iter *iter, unsigned int offset)
-{
- struct bio *bio = req->bio;
-
- for_each_bio(bio) {
- if (bio->bi_iter.bi_size > offset) {
- iter->bio = bio;
- iter->iter = bio->bi_iter;
- bio_advance_iter(iter->bio, &iter->iter, offset);
- return true;
- }
- offset -= bio->bi_iter.bi_size;
- }
- return false;
-}
-
/*
* Copy data between request pages and io_iter, and 'offset'
* is the start point of linear offset of request.
@@ -988,34 +921,35 @@ static bool ublk_advance_io_iter(const struct request *req,
static size_t ublk_copy_user_pages(const struct request *req,
unsigned offset, struct iov_iter *uiter, int dir)
{
- struct ublk_io_iter iter;
+ struct req_iterator iter;
+ struct bio_vec bv;
size_t done = 0;
- if (!ublk_advance_io_iter(req, &iter, offset))
- return 0;
+ rq_for_each_segment(bv, req, iter) {
+ void *bv_buf;
+ size_t copied;
- while (iov_iter_count(uiter) && iter.bio) {
- unsigned nr_pages;
- ssize_t len;
- size_t off;
- int i;
-
- len = iov_iter_get_pages2(uiter, iter.pages,
- iov_iter_count(uiter),
- UBLK_MAX_PIN_PAGES, &off);
- if (len <= 0)
- return done;
-
- ublk_copy_io_pages(&iter, len, off, dir);
- nr_pages = DIV_ROUND_UP(len + off, PAGE_SIZE);
- for (i = 0; i < nr_pages; i++) {
- if (dir == ITER_DEST)
- set_page_dirty(iter.pages[i]);
- put_page(iter.pages[i]);
+ if (offset >= bv.bv_len) {
+ offset -= bv.bv_len;
+ continue;
}
- done += len;
- }
+ bv.bv_offset += offset;
+ bv.bv_len -= offset;
+ bv_buf = bvec_kmap_local(&bv);
+ if (dir == ITER_DEST)
+ copied = copy_to_iter(bv_buf, bv.bv_len, uiter);
+ else
+ copied = copy_from_iter(bv_buf, bv.bv_len, uiter);
+
+ kunmap_local(bv_buf);
+
+ done += copied;
+ if (copied < bv.bv_len)
+ break;
+
+ offset = 0;
+ }
return done;
}
@@ -1030,8 +964,9 @@ static inline bool ublk_need_unmap_req(const struct request *req)
(req_op(req) == REQ_OP_READ || req_op(req) == REQ_OP_DRV_IN);
}
-static int ublk_map_io(const struct ublk_queue *ubq, const struct request *req,
- const struct ublk_io *io)
+static unsigned int ublk_map_io(const struct ublk_queue *ubq,
+ const struct request *req,
+ const struct ublk_io *io)
{
const unsigned int rq_bytes = blk_rq_bytes(req);
@@ -1047,13 +982,13 @@ static int ublk_map_io(const struct ublk_queue *ubq, const struct request *req,
struct iov_iter iter;
const int dir = ITER_DEST;
- import_ubuf(dir, u64_to_user_ptr(io->addr), rq_bytes, &iter);
+ import_ubuf(dir, u64_to_user_ptr(io->buf.addr), rq_bytes, &iter);
return ublk_copy_user_pages(req, 0, &iter, dir);
}
return rq_bytes;
}
-static int ublk_unmap_io(bool need_map,
+static unsigned int ublk_unmap_io(bool need_map,
const struct request *req,
const struct ublk_io *io)
{
@@ -1068,7 +1003,7 @@ static int ublk_unmap_io(bool need_map,
WARN_ON_ONCE(io->res > rq_bytes);
- import_ubuf(dir, u64_to_user_ptr(io->addr), io->res, &iter);
+ import_ubuf(dir, u64_to_user_ptr(io->buf.addr), io->res, &iter);
return ublk_copy_user_pages(req, 0, &iter, dir);
}
return rq_bytes;
@@ -1134,7 +1069,7 @@ static blk_status_t ublk_setup_iod(struct ublk_queue *ubq, struct request *req)
iod->op_flags = ublk_op | ublk_req_build_flags(req);
iod->nr_sectors = blk_rq_sectors(req);
iod->start_sector = blk_rq_pos(req);
- iod->addr = io->addr;
+ iod->addr = io->buf.addr;
return BLK_STS_OK;
}
@@ -1233,45 +1168,65 @@ static inline void __ublk_abort_rq(struct ublk_queue *ubq,
}
static void
-ublk_auto_buf_reg_fallback(const struct ublk_queue *ubq, struct ublk_io *io)
+ublk_auto_buf_reg_fallback(const struct ublk_queue *ubq, unsigned tag)
{
- unsigned tag = io - ubq->ios;
struct ublksrv_io_desc *iod = ublk_get_iod(ubq, tag);
iod->op_flags |= UBLK_IO_F_NEED_REG_BUF;
}
-static bool ublk_auto_buf_reg(const struct ublk_queue *ubq, struct request *req,
- struct ublk_io *io, unsigned int issue_flags)
+enum auto_buf_reg_res {
+ AUTO_BUF_REG_FAIL,
+ AUTO_BUF_REG_FALLBACK,
+ AUTO_BUF_REG_OK,
+};
+
+static void ublk_prep_auto_buf_reg_io(const struct ublk_queue *ubq,
+ struct request *req, struct ublk_io *io,
+ struct io_uring_cmd *cmd,
+ enum auto_buf_reg_res res)
+{
+ if (res == AUTO_BUF_REG_OK) {
+ io->task_registered_buffers = 1;
+ io->buf_ctx_handle = io_uring_cmd_ctx_handle(cmd);
+ io->flags |= UBLK_IO_FLAG_AUTO_BUF_REG;
+ }
+ ublk_init_req_ref(ubq, io);
+ __ublk_prep_compl_io_cmd(io, req);
+}
+
+static enum auto_buf_reg_res
+__ublk_do_auto_buf_reg(const struct ublk_queue *ubq, struct request *req,
+ struct ublk_io *io, struct io_uring_cmd *cmd,
+ unsigned int issue_flags)
{
int ret;
- ret = io_buffer_register_bvec(io->cmd, req, ublk_io_release,
- io->buf.index, issue_flags);
+ ret = io_buffer_register_bvec(cmd, req, ublk_io_release,
+ io->buf.auto_reg.index, issue_flags);
if (ret) {
- if (io->buf.flags & UBLK_AUTO_BUF_REG_FALLBACK) {
- ublk_auto_buf_reg_fallback(ubq, io);
- return true;
+ if (io->buf.auto_reg.flags & UBLK_AUTO_BUF_REG_FALLBACK) {
+ ublk_auto_buf_reg_fallback(ubq, req->tag);
+ return AUTO_BUF_REG_FALLBACK;
}
blk_mq_end_request(req, BLK_STS_IOERR);
- return false;
+ return AUTO_BUF_REG_FAIL;
}
- io->task_registered_buffers = 1;
- io->buf_ctx_handle = io_uring_cmd_ctx_handle(io->cmd);
- io->flags |= UBLK_IO_FLAG_AUTO_BUF_REG;
- return true;
+ return AUTO_BUF_REG_OK;
}
-static bool ublk_prep_auto_buf_reg(struct ublk_queue *ubq,
- struct request *req, struct ublk_io *io,
- unsigned int issue_flags)
+static void ublk_do_auto_buf_reg(const struct ublk_queue *ubq, struct request *req,
+ struct ublk_io *io, struct io_uring_cmd *cmd,
+ unsigned int issue_flags)
{
- ublk_init_req_ref(ubq, io);
- if (ublk_support_auto_buf_reg(ubq) && ublk_rq_has_data(req))
- return ublk_auto_buf_reg(ubq, req, io, issue_flags);
+ enum auto_buf_reg_res res = __ublk_do_auto_buf_reg(ubq, req, io, cmd,
+ issue_flags);
- return true;
+ if (res != AUTO_BUF_REG_FAIL) {
+ ublk_prep_auto_buf_reg_io(ubq, req, io, cmd, res);
+ io_uring_cmd_done(cmd, UBLK_IO_RES_OK, issue_flags);
+ }
}
static bool ublk_start_io(const struct ublk_queue *ubq, struct request *req,
@@ -1302,10 +1257,9 @@ static bool ublk_start_io(const struct ublk_queue *ubq, struct request *req,
return true;
}
-static void ublk_dispatch_req(struct ublk_queue *ubq,
- struct request *req,
- unsigned int issue_flags)
+static void ublk_dispatch_req(struct ublk_queue *ubq, struct request *req)
{
+ unsigned int issue_flags = IO_URING_CMD_TASK_WORK_ISSUE_FLAGS;
int tag = req->tag;
struct ublk_io *io = &ubq->ios[tag];
@@ -1344,17 +1298,21 @@ static void ublk_dispatch_req(struct ublk_queue *ubq,
if (!ublk_start_io(ubq, req, io))
return;
- if (ublk_prep_auto_buf_reg(ubq, req, io, issue_flags))
+ if (ublk_support_auto_buf_reg(ubq) && ublk_rq_has_data(req)) {
+ ublk_do_auto_buf_reg(ubq, req, io, io->cmd, issue_flags);
+ } else {
+ ublk_init_req_ref(ubq, io);
ublk_complete_io_cmd(io, req, UBLK_IO_RES_OK, issue_flags);
+ }
}
-static void ublk_cmd_tw_cb(struct io_uring_cmd *cmd,
- unsigned int issue_flags)
+static void ublk_cmd_tw_cb(struct io_tw_req tw_req, io_tw_token_t tw)
{
+ struct io_uring_cmd *cmd = io_uring_cmd_from_tw(tw_req);
struct ublk_uring_cmd_pdu *pdu = ublk_get_uring_cmd_pdu(cmd);
struct ublk_queue *ubq = pdu->ubq;
- ublk_dispatch_req(ubq, pdu->req, issue_flags);
+ ublk_dispatch_req(ubq, pdu->req);
}
static void ublk_queue_cmd(struct ublk_queue *ubq, struct request *rq)
@@ -1366,9 +1324,9 @@ static void ublk_queue_cmd(struct ublk_queue *ubq, struct request *rq)
io_uring_cmd_complete_in_task(cmd, ublk_cmd_tw_cb);
}
-static void ublk_cmd_list_tw_cb(struct io_uring_cmd *cmd,
- unsigned int issue_flags)
+static void ublk_cmd_list_tw_cb(struct io_tw_req tw_req, io_tw_token_t tw)
{
+ struct io_uring_cmd *cmd = io_uring_cmd_from_tw(tw_req);
struct ublk_uring_cmd_pdu *pdu = ublk_get_uring_cmd_pdu(cmd);
struct request *rq = pdu->req_list;
struct request *next;
@@ -1376,7 +1334,7 @@ static void ublk_cmd_list_tw_cb(struct io_uring_cmd *cmd,
do {
next = rq->rq_next;
rq->rq_next = NULL;
- ublk_dispatch_req(rq->mq_hctx->driver_data, rq, issue_flags);
+ ublk_dispatch_req(rq->mq_hctx->driver_data, rq);
rq = next;
} while (rq);
}
@@ -1537,7 +1495,7 @@ static void ublk_queue_reinit(struct ublk_device *ub, struct ublk_queue *ubq)
*/
io->flags &= UBLK_IO_FLAG_CANCELED;
io->cmd = NULL;
- io->addr = 0;
+ io->buf.addr = 0;
/*
* old task is PF_EXITING, put it now
@@ -2098,13 +2056,16 @@ static inline int ublk_check_cmd_op(u32 cmd_op)
static inline int ublk_set_auto_buf_reg(struct ublk_io *io, struct io_uring_cmd *cmd)
{
- io->buf = ublk_sqe_addr_to_auto_buf_reg(READ_ONCE(cmd->sqe->addr));
+ struct ublk_auto_buf_reg buf;
- if (io->buf.reserved0 || io->buf.reserved1)
+ buf = ublk_sqe_addr_to_auto_buf_reg(READ_ONCE(cmd->sqe->addr));
+
+ if (buf.reserved0 || buf.reserved1)
return -EINVAL;
- if (io->buf.flags & ~UBLK_AUTO_BUF_REG_F_MASK)
+ if (buf.flags & ~UBLK_AUTO_BUF_REG_F_MASK)
return -EINVAL;
+ io->buf.auto_reg = buf;
return 0;
}
@@ -2126,7 +2087,7 @@ static int ublk_handle_auto_buf_reg(struct ublk_io *io,
* this ublk request gets stuck.
*/
if (io->buf_ctx_handle == io_uring_cmd_ctx_handle(cmd))
- *buf_idx = io->buf.index;
+ *buf_idx = io->buf.auto_reg.index;
}
return ublk_set_auto_buf_reg(io, cmd);
@@ -2154,7 +2115,7 @@ ublk_config_io_buf(const struct ublk_device *ub, struct ublk_io *io,
if (ublk_dev_support_auto_buf_reg(ub))
return ublk_handle_auto_buf_reg(io, cmd, buf_idx);
- io->addr = buf_addr;
+ io->buf.addr = buf_addr;
return 0;
}
@@ -2272,39 +2233,41 @@ static int ublk_check_fetch_buf(const struct ublk_device *ub, __u64 buf_addr)
return 0;
}
-static int ublk_fetch(struct io_uring_cmd *cmd, struct ublk_device *ub,
- struct ublk_io *io, __u64 buf_addr)
+static int __ublk_fetch(struct io_uring_cmd *cmd, struct ublk_device *ub,
+ struct ublk_io *io)
{
- int ret = 0;
-
- /*
- * When handling FETCH command for setting up ublk uring queue,
- * ub->mutex is the innermost lock, and we won't block for handling
- * FETCH, so it is fine even for IO_URING_F_NONBLOCK.
- */
- mutex_lock(&ub->mutex);
/* UBLK_IO_FETCH_REQ is only allowed before dev is setup */
- if (ublk_dev_ready(ub)) {
- ret = -EBUSY;
- goto out;
- }
+ if (ublk_dev_ready(ub))
+ return -EBUSY;
/* allow each command to be FETCHed at most once */
- if (io->flags & UBLK_IO_FLAG_ACTIVE) {
- ret = -EINVAL;
- goto out;
- }
+ if (io->flags & UBLK_IO_FLAG_ACTIVE)
+ return -EINVAL;
WARN_ON_ONCE(io->flags & UBLK_IO_FLAG_OWNED_BY_SRV);
ublk_fill_io_cmd(io, cmd);
- ret = ublk_config_io_buf(ub, io, cmd, buf_addr, NULL);
- if (ret)
- goto out;
WRITE_ONCE(io->task, get_task_struct(current));
ublk_mark_io_ready(ub);
-out:
+
+ return 0;
+}
+
+static int ublk_fetch(struct io_uring_cmd *cmd, struct ublk_device *ub,
+ struct ublk_io *io, __u64 buf_addr)
+{
+ int ret;
+
+ /*
+ * When handling FETCH command for setting up ublk uring queue,
+ * ub->mutex is the innermost lock, and we won't block for handling
+ * FETCH, so it is fine even for IO_URING_F_NONBLOCK.
+ */
+ mutex_lock(&ub->mutex);
+ ret = __ublk_fetch(cmd, ub, io);
+ if (!ret)
+ ret = ublk_config_io_buf(ub, io, cmd, buf_addr, NULL);
mutex_unlock(&ub->mutex);
return ret;
}
@@ -2351,7 +2314,7 @@ static bool ublk_get_data(const struct ublk_queue *ubq, struct ublk_io *io,
*/
io->flags &= ~UBLK_IO_FLAG_NEED_GET_DATA;
/* update iod->addr because ublksrv may have passed a new io buffer */
- ublk_get_iod(ubq, req->tag)->addr = io->addr;
+ ublk_get_iod(ubq, req->tag)->addr = io->buf.addr;
pr_devel("%s: update iod->addr: qid %d tag %d io_flags %x addr %llx\n",
__func__, ubq->q_id, req->tag, io->flags,
ublk_get_iod(ubq, req->tag)->addr);
@@ -2367,7 +2330,7 @@ static int ublk_ch_uring_cmd_local(struct io_uring_cmd *cmd,
u16 buf_idx = UBLK_INVALID_BUF_IDX;
struct ublk_device *ub = cmd->file->private_data;
struct ublk_queue *ubq;
- struct ublk_io *io;
+ struct ublk_io *io = NULL;
u32 cmd_op = cmd->cmd_op;
u16 q_id = READ_ONCE(ub_src->q_id);
u16 tag = READ_ONCE(ub_src->tag);
@@ -2488,7 +2451,7 @@ static int ublk_ch_uring_cmd_local(struct io_uring_cmd *cmd,
out:
pr_devel("%s: complete: cmd op %d, tag %d ret %x io_flags %x\n",
- __func__, cmd_op, tag, ret, io->flags);
+ __func__, cmd_op, tag, ret, io ? io->flags : 0);
return ret;
}
@@ -2523,9 +2486,10 @@ fail_put:
return NULL;
}
-static void ublk_ch_uring_cmd_cb(struct io_uring_cmd *cmd,
- unsigned int issue_flags)
+static void ublk_ch_uring_cmd_cb(struct io_tw_req tw_req, io_tw_token_t tw)
{
+ unsigned int issue_flags = IO_URING_CMD_TASK_WORK_ISSUE_FLAGS;
+ struct io_uring_cmd *cmd = io_uring_cmd_from_tw(tw_req);
int ret = ublk_ch_uring_cmd_local(cmd, issue_flags);
if (ret != -EIOCBQUEUED)
@@ -2575,9 +2539,6 @@ static struct request *ublk_check_and_get_req(struct kiocb *iocb,
size_t buf_off;
u16 tag, q_id;
- if (!ub)
- return ERR_PTR(-EACCES);
-
if (!user_backed_iter(iter))
return ERR_PTR(-EACCES);
@@ -2603,9 +2564,6 @@ static struct request *ublk_check_and_get_req(struct kiocb *iocb,
if (!req)
return ERR_PTR(-EINVAL);
- if (!req->mq_hctx || !req->mq_hctx->driver_data)
- goto fail;
-
if (!ublk_check_ubuf_dir(req, dir))
goto fail;
@@ -2662,9 +2620,13 @@ static const struct file_operations ublk_ch_fops = {
static void ublk_deinit_queue(struct ublk_device *ub, int q_id)
{
- int size = ublk_queue_cmd_buf_size(ub);
- struct ublk_queue *ubq = ublk_get_queue(ub, q_id);
- int i;
+ struct ublk_queue *ubq = ub->queues[q_id];
+ int size, i;
+
+ if (!ubq)
+ return;
+
+ size = ublk_queue_cmd_buf_size(ub);
for (i = 0; i < ubq->q_depth; i++) {
struct ublk_io *io = &ubq->ios[i];
@@ -2676,57 +2638,76 @@ static void ublk_deinit_queue(struct ublk_device *ub, int q_id)
if (ubq->io_cmd_buf)
free_pages((unsigned long)ubq->io_cmd_buf, get_order(size));
+
+ kvfree(ubq);
+ ub->queues[q_id] = NULL;
+}
+
+static int ublk_get_queue_numa_node(struct ublk_device *ub, int q_id)
+{
+ unsigned int cpu;
+
+ /* Find first CPU mapped to this queue */
+ for_each_possible_cpu(cpu) {
+ if (ub->tag_set.map[HCTX_TYPE_DEFAULT].mq_map[cpu] == q_id)
+ return cpu_to_node(cpu);
+ }
+
+ return NUMA_NO_NODE;
}
static int ublk_init_queue(struct ublk_device *ub, int q_id)
{
- struct ublk_queue *ubq = ublk_get_queue(ub, q_id);
+ int depth = ub->dev_info.queue_depth;
gfp_t gfp_flags = GFP_KERNEL | __GFP_ZERO;
- void *ptr;
+ struct ublk_queue *ubq;
+ struct page *page;
+ int numa_node;
int size;
+ /* Determine NUMA node based on queue's CPU affinity */
+ numa_node = ublk_get_queue_numa_node(ub, q_id);
+
+ /* Allocate queue structure on local NUMA node */
+ ubq = kvzalloc_node(struct_size(ubq, ios, depth), GFP_KERNEL,
+ numa_node);
+ if (!ubq)
+ return -ENOMEM;
+
spin_lock_init(&ubq->cancel_lock);
ubq->flags = ub->dev_info.flags;
ubq->q_id = q_id;
- ubq->q_depth = ub->dev_info.queue_depth;
+ ubq->q_depth = depth;
size = ublk_queue_cmd_buf_size(ub);
- ptr = (void *) __get_free_pages(gfp_flags, get_order(size));
- if (!ptr)
+ /* Allocate I/O command buffer on local NUMA node */
+ page = alloc_pages_node(numa_node, gfp_flags, get_order(size));
+ if (!page) {
+ kvfree(ubq);
return -ENOMEM;
+ }
+ ubq->io_cmd_buf = page_address(page);
- ubq->io_cmd_buf = ptr;
+ ub->queues[q_id] = ubq;
ubq->dev = ub;
return 0;
}
static void ublk_deinit_queues(struct ublk_device *ub)
{
- int nr_queues = ub->dev_info.nr_hw_queues;
int i;
- if (!ub->__queues)
- return;
-
- for (i = 0; i < nr_queues; i++)
+ for (i = 0; i < ub->dev_info.nr_hw_queues; i++)
ublk_deinit_queue(ub, i);
- kvfree(ub->__queues);
}
static int ublk_init_queues(struct ublk_device *ub)
{
- int nr_queues = ub->dev_info.nr_hw_queues;
- int depth = ub->dev_info.queue_depth;
- int ubq_size = sizeof(struct ublk_queue) + depth * sizeof(struct ublk_io);
- int i, ret = -ENOMEM;
+ int i, ret;
- ub->queue_size = ubq_size;
- ub->__queues = kvcalloc(nr_queues, ubq_size, GFP_KERNEL);
- if (!ub->__queues)
- return ret;
-
- for (i = 0; i < nr_queues; i++) {
- if (ublk_init_queue(ub, i))
+ for (i = 0; i < ub->dev_info.nr_hw_queues; i++) {
+ ret = ublk_init_queue(ub, i);
+ if (ret)
goto fail;
}
@@ -3128,7 +3109,7 @@ static int ublk_ctrl_add_dev(const struct ublksrv_ctrl_cmd *header)
goto out_unlock;
ret = -ENOMEM;
- ub = kzalloc(sizeof(*ub), GFP_KERNEL);
+ ub = kzalloc(struct_size(ub, queues, info.nr_hw_queues), GFP_KERNEL);
if (!ub)
goto out_unlock;
mutex_init(&ub->mutex);
@@ -3178,17 +3159,17 @@ static int ublk_ctrl_add_dev(const struct ublksrv_ctrl_cmd *header)
ub->dev_info.nr_hw_queues, nr_cpu_ids);
ublk_align_max_io_size(ub);
- ret = ublk_init_queues(ub);
+ ret = ublk_add_tag_set(ub);
if (ret)
goto out_free_dev_number;
- ret = ublk_add_tag_set(ub);
+ ret = ublk_init_queues(ub);
if (ret)
- goto out_deinit_queues;
+ goto out_free_tag_set;
ret = -EFAULT;
if (copy_to_user(argp, &ub->dev_info, sizeof(info)))
- goto out_free_tag_set;
+ goto out_deinit_queues;
/*
* Add the char dev so that ublksrv daemon can be setup.
@@ -3197,10 +3178,10 @@ static int ublk_ctrl_add_dev(const struct ublksrv_ctrl_cmd *header)
ret = ublk_add_chdev(ub);
goto out_unlock;
-out_free_tag_set:
- blk_mq_free_tag_set(&ub->tag_set);
out_deinit_queues:
ublk_deinit_queues(ub);
+out_free_tag_set:
+ blk_mq_free_tag_set(&ub->tag_set);
out_free_dev_number:
ublk_free_dev_number(ub);
out_free_ub:
diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c
index f061420dfb10..357434bdae99 100644
--- a/drivers/block/virtio_blk.c
+++ b/drivers/block/virtio_blk.c
@@ -584,7 +584,8 @@ out:
static int virtblk_parse_zone(struct virtio_blk *vblk,
struct virtio_blk_zone_descriptor *entry,
- unsigned int idx, report_zones_cb cb, void *data)
+ unsigned int idx,
+ struct blk_report_zones_args *args)
{
struct blk_zone zone = { };
@@ -650,12 +651,12 @@ static int virtblk_parse_zone(struct virtio_blk *vblk,
* The callback below checks the validity of the reported
* entry data, no need to further validate it here.
*/
- return cb(&zone, idx, data);
+ return disk_report_zone(vblk->disk, &zone, idx, args);
}
static int virtblk_report_zones(struct gendisk *disk, sector_t sector,
- unsigned int nr_zones, report_zones_cb cb,
- void *data)
+ unsigned int nr_zones,
+ struct blk_report_zones_args *args)
{
struct virtio_blk *vblk = disk->private_data;
struct virtio_blk_zone_report *report;
@@ -693,7 +694,7 @@ static int virtblk_report_zones(struct gendisk *disk, sector_t sector,
for (i = 0; i < nz && zone_idx < nr_zones; i++) {
ret = virtblk_parse_zone(vblk, &report->zones[i],
- zone_idx, cb, data);
+ zone_idx, args);
if (ret)
goto fail_report;
@@ -1026,8 +1027,13 @@ static int init_vq(struct virtio_blk *vblk)
out:
kfree(vqs);
kfree(vqs_info);
- if (err)
+ if (err) {
kfree(vblk->vqs);
+ /*
+ * Set to NULL to prevent freeing vqs again during freezing.
+ */
+ vblk->vqs = NULL;
+ }
return err;
}
@@ -1598,6 +1604,12 @@ static int virtblk_freeze_priv(struct virtio_device *vdev)
vdev->config->del_vqs(vdev);
kfree(vblk->vqs);
+ /*
+ * Set to NULL to prevent freeing vqs again after a failed vqs
+ * allocation during resume. Note that kfree() already handles NULL
+ * pointers safely.
+ */
+ vblk->vqs = NULL;
return 0;
}
diff --git a/drivers/block/zloop.c b/drivers/block/zloop.c
index a423228e201b..3f50321aa4a7 100644
--- a/drivers/block/zloop.c
+++ b/drivers/block/zloop.c
@@ -32,6 +32,8 @@ enum {
ZLOOP_OPT_NR_QUEUES = (1 << 6),
ZLOOP_OPT_QUEUE_DEPTH = (1 << 7),
ZLOOP_OPT_BUFFERED_IO = (1 << 8),
+ ZLOOP_OPT_ZONE_APPEND = (1 << 9),
+ ZLOOP_OPT_ORDERED_ZONE_APPEND = (1 << 10),
};
static const match_table_t zloop_opt_tokens = {
@@ -44,6 +46,8 @@ static const match_table_t zloop_opt_tokens = {
{ ZLOOP_OPT_NR_QUEUES, "nr_queues=%u" },
{ ZLOOP_OPT_QUEUE_DEPTH, "queue_depth=%u" },
{ ZLOOP_OPT_BUFFERED_IO, "buffered_io" },
+ { ZLOOP_OPT_ZONE_APPEND, "zone_append=%u" },
+ { ZLOOP_OPT_ORDERED_ZONE_APPEND, "ordered_zone_append" },
{ ZLOOP_OPT_ERR, NULL }
};
@@ -56,6 +60,8 @@ static const match_table_t zloop_opt_tokens = {
#define ZLOOP_DEF_NR_QUEUES 1
#define ZLOOP_DEF_QUEUE_DEPTH 128
#define ZLOOP_DEF_BUFFERED_IO false
+#define ZLOOP_DEF_ZONE_APPEND true
+#define ZLOOP_DEF_ORDERED_ZONE_APPEND false
/* Arbitrary limit on the zone size (16GB). */
#define ZLOOP_MAX_ZONE_SIZE_MB 16384
@@ -71,6 +77,8 @@ struct zloop_options {
unsigned int nr_queues;
unsigned int queue_depth;
bool buffered_io;
+ bool zone_append;
+ bool ordered_zone_append;
};
/*
@@ -92,6 +100,7 @@ struct zloop_zone {
unsigned long flags;
struct mutex lock;
+ spinlock_t wp_lock;
enum blk_zone_cond cond;
sector_t start;
sector_t wp;
@@ -108,6 +117,8 @@ struct zloop_device {
struct workqueue_struct *workqueue;
bool buffered_io;
+ bool zone_append;
+ bool ordered_zone_append;
const char *base_dir;
struct file *data_dir;
@@ -147,6 +158,7 @@ static int zloop_update_seq_zone(struct zloop_device *zlo, unsigned int zone_no)
struct zloop_zone *zone = &zlo->zones[zone_no];
struct kstat stat;
sector_t file_sectors;
+ unsigned long flags;
int ret;
lockdep_assert_held(&zone->lock);
@@ -172,16 +184,18 @@ static int zloop_update_seq_zone(struct zloop_device *zlo, unsigned int zone_no)
return -EINVAL;
}
+ spin_lock_irqsave(&zone->wp_lock, flags);
if (!file_sectors) {
zone->cond = BLK_ZONE_COND_EMPTY;
zone->wp = zone->start;
} else if (file_sectors == zlo->zone_capacity) {
zone->cond = BLK_ZONE_COND_FULL;
- zone->wp = zone->start + zlo->zone_size;
+ zone->wp = ULLONG_MAX;
} else {
zone->cond = BLK_ZONE_COND_CLOSED;
zone->wp = zone->start + file_sectors;
}
+ spin_unlock_irqrestore(&zone->wp_lock, flags);
return 0;
}
@@ -225,6 +239,7 @@ unlock:
static int zloop_close_zone(struct zloop_device *zlo, unsigned int zone_no)
{
struct zloop_zone *zone = &zlo->zones[zone_no];
+ unsigned long flags;
int ret = 0;
if (test_bit(ZLOOP_ZONE_CONV, &zone->flags))
@@ -243,10 +258,12 @@ static int zloop_close_zone(struct zloop_device *zlo, unsigned int zone_no)
break;
case BLK_ZONE_COND_IMP_OPEN:
case BLK_ZONE_COND_EXP_OPEN:
+ spin_lock_irqsave(&zone->wp_lock, flags);
if (zone->wp == zone->start)
zone->cond = BLK_ZONE_COND_EMPTY;
else
zone->cond = BLK_ZONE_COND_CLOSED;
+ spin_unlock_irqrestore(&zone->wp_lock, flags);
break;
case BLK_ZONE_COND_EMPTY:
case BLK_ZONE_COND_FULL:
@@ -264,6 +281,7 @@ unlock:
static int zloop_reset_zone(struct zloop_device *zlo, unsigned int zone_no)
{
struct zloop_zone *zone = &zlo->zones[zone_no];
+ unsigned long flags;
int ret = 0;
if (test_bit(ZLOOP_ZONE_CONV, &zone->flags))
@@ -281,9 +299,11 @@ static int zloop_reset_zone(struct zloop_device *zlo, unsigned int zone_no)
goto unlock;
}
+ spin_lock_irqsave(&zone->wp_lock, flags);
zone->cond = BLK_ZONE_COND_EMPTY;
zone->wp = zone->start;
clear_bit(ZLOOP_ZONE_SEQ_ERROR, &zone->flags);
+ spin_unlock_irqrestore(&zone->wp_lock, flags);
unlock:
mutex_unlock(&zone->lock);
@@ -308,6 +328,7 @@ static int zloop_reset_all_zones(struct zloop_device *zlo)
static int zloop_finish_zone(struct zloop_device *zlo, unsigned int zone_no)
{
struct zloop_zone *zone = &zlo->zones[zone_no];
+ unsigned long flags;
int ret = 0;
if (test_bit(ZLOOP_ZONE_CONV, &zone->flags))
@@ -325,9 +346,11 @@ static int zloop_finish_zone(struct zloop_device *zlo, unsigned int zone_no)
goto unlock;
}
+ spin_lock_irqsave(&zone->wp_lock, flags);
zone->cond = BLK_ZONE_COND_FULL;
- zone->wp = zone->start + zlo->zone_size;
+ zone->wp = ULLONG_MAX;
clear_bit(ZLOOP_ZONE_SEQ_ERROR, &zone->flags);
+ spin_unlock_irqrestore(&zone->wp_lock, flags);
unlock:
mutex_unlock(&zone->lock);
@@ -369,6 +392,7 @@ static void zloop_rw(struct zloop_cmd *cmd)
struct zloop_zone *zone;
struct iov_iter iter;
struct bio_vec tmp;
+ unsigned long flags;
sector_t zone_end;
int nr_bvec = 0;
int ret;
@@ -378,6 +402,11 @@ static void zloop_rw(struct zloop_cmd *cmd)
cmd->nr_sectors = nr_sectors;
cmd->ret = 0;
+ if (WARN_ON_ONCE(is_append && !zlo->zone_append)) {
+ ret = -EIO;
+ goto out;
+ }
+
/* We should never get an I/O beyond the device capacity. */
if (WARN_ON_ONCE(zone_no >= zlo->nr_zones)) {
ret = -EIO;
@@ -406,16 +435,31 @@ static void zloop_rw(struct zloop_cmd *cmd)
if (!test_bit(ZLOOP_ZONE_CONV, &zone->flags) && is_write) {
mutex_lock(&zone->lock);
- if (is_append) {
- sector = zone->wp;
- cmd->sector = sector;
- }
+ spin_lock_irqsave(&zone->wp_lock, flags);
/*
- * Write operations must be aligned to the write pointer and
- * fully contained within the zone capacity.
+ * Zone append operations always go at the current write
+ * pointer, but regular write operations must already be
+ * aligned to the write pointer when submitted.
*/
- if (sector != zone->wp || zone->wp + nr_sectors > zone_end) {
+ if (is_append) {
+ /*
+ * If ordered zone append is in use, we already checked
+ * and set the target sector in zloop_queue_rq().
+ */
+ if (!zlo->ordered_zone_append) {
+ if (zone->cond == BLK_ZONE_COND_FULL ||
+ zone->wp + nr_sectors > zone_end) {
+ spin_unlock_irqrestore(&zone->wp_lock,
+ flags);
+ ret = -EIO;
+ goto unlock;
+ }
+ sector = zone->wp;
+ }
+ cmd->sector = sector;
+ } else if (sector != zone->wp) {
+ spin_unlock_irqrestore(&zone->wp_lock, flags);
pr_err("Zone %u: unaligned write: sect %llu, wp %llu\n",
zone_no, sector, zone->wp);
ret = -EIO;
@@ -428,13 +472,19 @@ static void zloop_rw(struct zloop_cmd *cmd)
zone->cond = BLK_ZONE_COND_IMP_OPEN;
/*
- * Advance the write pointer of sequential zones. If the write
- * fails, the wp position will be corrected when the next I/O
- * copmpletes.
+ * Advance the write pointer, unless ordered zone append is in
+ * use. If the write fails, the write pointer position will be
+ * corrected when the next I/O starts execution.
*/
- zone->wp += nr_sectors;
- if (zone->wp == zone_end)
- zone->cond = BLK_ZONE_COND_FULL;
+ if (!is_append || !zlo->ordered_zone_append) {
+ zone->wp += nr_sectors;
+ if (zone->wp == zone_end) {
+ zone->cond = BLK_ZONE_COND_FULL;
+ zone->wp = ULLONG_MAX;
+ }
+ }
+
+ spin_unlock_irqrestore(&zone->wp_lock, flags);
}
rq_for_each_bvec(tmp, rq, rq_iter)
@@ -498,6 +548,10 @@ static void zloop_handle_cmd(struct zloop_cmd *cmd)
struct request *rq = blk_mq_rq_from_pdu(cmd);
struct zloop_device *zlo = rq->q->queuedata;
+ /* We can block in this context, so ignore REQ_NOWAIT. */
+ if (rq->cmd_flags & REQ_NOWAIT)
+ rq->cmd_flags &= ~REQ_NOWAIT;
+
switch (req_op(rq)) {
case REQ_OP_READ:
case REQ_OP_WRITE:
@@ -608,6 +662,35 @@ static void zloop_complete_rq(struct request *rq)
blk_mq_end_request(rq, sts);
}
+static bool zloop_set_zone_append_sector(struct request *rq)
+{
+ struct zloop_device *zlo = rq->q->queuedata;
+ unsigned int zone_no = rq_zone_no(rq);
+ struct zloop_zone *zone = &zlo->zones[zone_no];
+ sector_t zone_end = zone->start + zlo->zone_capacity;
+ sector_t nr_sectors = blk_rq_sectors(rq);
+ unsigned long flags;
+
+ spin_lock_irqsave(&zone->wp_lock, flags);
+
+ if (zone->cond == BLK_ZONE_COND_FULL ||
+ zone->wp + nr_sectors > zone_end) {
+ spin_unlock_irqrestore(&zone->wp_lock, flags);
+ return false;
+ }
+
+ rq->__sector = zone->wp;
+ zone->wp += blk_rq_sectors(rq);
+ if (zone->wp >= zone_end) {
+ zone->cond = BLK_ZONE_COND_FULL;
+ zone->wp = ULLONG_MAX;
+ }
+
+ spin_unlock_irqrestore(&zone->wp_lock, flags);
+
+ return true;
+}
+
static blk_status_t zloop_queue_rq(struct blk_mq_hw_ctx *hctx,
const struct blk_mq_queue_data *bd)
{
@@ -618,6 +701,16 @@ static blk_status_t zloop_queue_rq(struct blk_mq_hw_ctx *hctx,
if (zlo->state == Zlo_deleting)
return BLK_STS_IOERR;
+ /*
+ * If we need to strongly order zone append operations, set the request
+ * sector to the zone write pointer location now instead of when the
+ * command work runs.
+ */
+ if (zlo->ordered_zone_append && req_op(rq) == REQ_OP_ZONE_APPEND) {
+ if (!zloop_set_zone_append_sector(rq))
+ return BLK_STS_IOERR;
+ }
+
blk_mq_start_request(rq);
INIT_WORK(&cmd->work, zloop_cmd_workfn);
@@ -647,11 +740,12 @@ static int zloop_open(struct gendisk *disk, blk_mode_t mode)
}
static int zloop_report_zones(struct gendisk *disk, sector_t sector,
- unsigned int nr_zones, report_zones_cb cb, void *data)
+ unsigned int nr_zones, struct blk_report_zones_args *args)
{
struct zloop_device *zlo = disk->private_data;
struct blk_zone blkz = {};
unsigned int first, i;
+ unsigned long flags;
int ret;
first = disk_zone_no(disk, sector);
@@ -675,7 +769,9 @@ static int zloop_report_zones(struct gendisk *disk, sector_t sector,
blkz.start = zone->start;
blkz.len = zlo->zone_size;
+ spin_lock_irqsave(&zone->wp_lock, flags);
blkz.wp = zone->wp;
+ spin_unlock_irqrestore(&zone->wp_lock, flags);
blkz.cond = zone->cond;
if (test_bit(ZLOOP_ZONE_CONV, &zone->flags)) {
blkz.type = BLK_ZONE_TYPE_CONVENTIONAL;
@@ -687,7 +783,7 @@ static int zloop_report_zones(struct gendisk *disk, sector_t sector,
mutex_unlock(&zone->lock);
- ret = cb(&blkz, i, data);
+ ret = disk_report_zone(disk, &blkz, i, args);
if (ret)
return ret;
}
@@ -783,6 +879,7 @@ static int zloop_init_zone(struct zloop_device *zlo, struct zloop_options *opts,
int ret;
mutex_init(&zone->lock);
+ spin_lock_init(&zone->wp_lock);
zone->start = (sector_t)zone_no << zlo->zone_shift;
if (!restore)
@@ -884,7 +981,6 @@ static int zloop_ctl_add(struct zloop_options *opts)
{
struct queue_limits lim = {
.max_hw_sectors = SZ_1M >> SECTOR_SHIFT,
- .max_hw_zone_append_sectors = SZ_1M >> SECTOR_SHIFT,
.chunk_sectors = opts->zone_size,
.features = BLK_FEAT_ZONED,
};
@@ -936,6 +1032,9 @@ static int zloop_ctl_add(struct zloop_options *opts)
zlo->nr_zones = nr_zones;
zlo->nr_conv_zones = opts->nr_conv_zones;
zlo->buffered_io = opts->buffered_io;
+ zlo->zone_append = opts->zone_append;
+ if (zlo->zone_append)
+ zlo->ordered_zone_append = opts->ordered_zone_append;
zlo->workqueue = alloc_workqueue("zloop%d", WQ_UNBOUND | WQ_FREEZABLE,
opts->nr_queues * opts->queue_depth, zlo->id);
@@ -976,6 +1075,8 @@ static int zloop_ctl_add(struct zloop_options *opts)
lim.physical_block_size = zlo->block_size;
lim.logical_block_size = zlo->block_size;
+ if (zlo->zone_append)
+ lim.max_hw_zone_append_sectors = lim.max_hw_sectors;
zlo->tag_set.ops = &zloop_mq_ops;
zlo->tag_set.nr_hw_queues = opts->nr_queues;
@@ -1016,10 +1117,14 @@ static int zloop_ctl_add(struct zloop_options *opts)
zlo->state = Zlo_live;
mutex_unlock(&zloop_ctl_mutex);
- pr_info("Added device %d: %u zones of %llu MB, %u B block size\n",
+ pr_info("zloop: device %d, %u zones of %llu MiB, %u B block size\n",
zlo->id, zlo->nr_zones,
((sector_t)zlo->zone_size << SECTOR_SHIFT) >> 20,
zlo->block_size);
+ pr_info("zloop%d: using %s%s zone append\n",
+ zlo->id,
+ zlo->ordered_zone_append ? "ordered " : "",
+ zlo->zone_append ? "native" : "emulated");
return 0;
@@ -1106,6 +1211,8 @@ static int zloop_parse_options(struct zloop_options *opts, const char *buf)
opts->nr_queues = ZLOOP_DEF_NR_QUEUES;
opts->queue_depth = ZLOOP_DEF_QUEUE_DEPTH;
opts->buffered_io = ZLOOP_DEF_BUFFERED_IO;
+ opts->zone_append = ZLOOP_DEF_ZONE_APPEND;
+ opts->ordered_zone_append = ZLOOP_DEF_ORDERED_ZONE_APPEND;
if (!buf)
return 0;
@@ -1215,6 +1322,21 @@ static int zloop_parse_options(struct zloop_options *opts, const char *buf)
case ZLOOP_OPT_BUFFERED_IO:
opts->buffered_io = true;
break;
+ case ZLOOP_OPT_ZONE_APPEND:
+ if (match_uint(args, &token)) {
+ ret = -EINVAL;
+ goto out;
+ }
+ if (token != 0 && token != 1) {
+ pr_err("Invalid zone_append value\n");
+ ret = -EINVAL;
+ goto out;
+ }
+ opts->zone_append = token;
+ break;
+ case ZLOOP_OPT_ORDERED_ZONE_APPEND:
+ opts->ordered_zone_append = true;
+ break;
case ZLOOP_OPT_ERR:
default:
pr_warn("unknown parameter or missing value '%s'\n", p);
diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c
index a43074657531..5759823d6314 100644
--- a/drivers/block/zram/zram_drv.c
+++ b/drivers/block/zram/zram_drv.c
@@ -500,8 +500,31 @@ out:
}
#ifdef CONFIG_ZRAM_WRITEBACK
+#define INVALID_BDEV_BLOCK (~0UL)
+
+struct zram_wb_ctl {
+ /* idle list is accessed only by the writeback task, no concurency */
+ struct list_head idle_reqs;
+ /* done list is accessed concurrently, protect by done_lock */
+ struct list_head done_reqs;
+ wait_queue_head_t done_wait;
+ spinlock_t done_lock;
+ atomic_t num_inflight;
+};
+
+struct zram_wb_req {
+ unsigned long blk_idx;
+ struct page *page;
+ struct zram_pp_slot *pps;
+ struct bio_vec bio_vec;
+ struct bio bio;
+
+ struct list_head entry;
+};
+
static ssize_t writeback_limit_enable_store(struct device *dev,
- struct device_attribute *attr, const char *buf, size_t len)
+ struct device_attribute *attr,
+ const char *buf, size_t len)
{
struct zram *zram = dev_to_zram(dev);
u64 val;
@@ -510,33 +533,31 @@ static ssize_t writeback_limit_enable_store(struct device *dev,
if (kstrtoull(buf, 10, &val))
return ret;
- down_read(&zram->init_lock);
- spin_lock(&zram->wb_limit_lock);
+ down_write(&zram->init_lock);
zram->wb_limit_enable = val;
- spin_unlock(&zram->wb_limit_lock);
- up_read(&zram->init_lock);
+ up_write(&zram->init_lock);
ret = len;
return ret;
}
static ssize_t writeback_limit_enable_show(struct device *dev,
- struct device_attribute *attr, char *buf)
+ struct device_attribute *attr,
+ char *buf)
{
bool val;
struct zram *zram = dev_to_zram(dev);
down_read(&zram->init_lock);
- spin_lock(&zram->wb_limit_lock);
val = zram->wb_limit_enable;
- spin_unlock(&zram->wb_limit_lock);
up_read(&zram->init_lock);
return sysfs_emit(buf, "%d\n", val);
}
static ssize_t writeback_limit_store(struct device *dev,
- struct device_attribute *attr, const char *buf, size_t len)
+ struct device_attribute *attr,
+ const char *buf, size_t len)
{
struct zram *zram = dev_to_zram(dev);
u64 val;
@@ -545,31 +566,71 @@ static ssize_t writeback_limit_store(struct device *dev,
if (kstrtoull(buf, 10, &val))
return ret;
- down_read(&zram->init_lock);
- spin_lock(&zram->wb_limit_lock);
+ /*
+ * When the page size is greater than 4KB, if bd_wb_limit is set to
+ * a value that is not page - size aligned, it will cause value
+ * wrapping. For example, when the page size is set to 16KB and
+ * bd_wb_limit is set to 3, a single write - back operation will
+ * cause bd_wb_limit to become -1. Even more terrifying is that
+ * bd_wb_limit is an unsigned number.
+ */
+ val = rounddown(val, PAGE_SIZE / 4096);
+
+ down_write(&zram->init_lock);
zram->bd_wb_limit = val;
- spin_unlock(&zram->wb_limit_lock);
- up_read(&zram->init_lock);
+ up_write(&zram->init_lock);
ret = len;
return ret;
}
static ssize_t writeback_limit_show(struct device *dev,
- struct device_attribute *attr, char *buf)
+ struct device_attribute *attr, char *buf)
{
u64 val;
struct zram *zram = dev_to_zram(dev);
down_read(&zram->init_lock);
- spin_lock(&zram->wb_limit_lock);
val = zram->bd_wb_limit;
- spin_unlock(&zram->wb_limit_lock);
up_read(&zram->init_lock);
return sysfs_emit(buf, "%llu\n", val);
}
+static ssize_t writeback_batch_size_store(struct device *dev,
+ struct device_attribute *attr,
+ const char *buf, size_t len)
+{
+ struct zram *zram = dev_to_zram(dev);
+ u32 val;
+
+ if (kstrtouint(buf, 10, &val))
+ return -EINVAL;
+
+ if (!val)
+ return -EINVAL;
+
+ down_write(&zram->init_lock);
+ zram->wb_batch_size = val;
+ up_write(&zram->init_lock);
+
+ return len;
+}
+
+static ssize_t writeback_batch_size_show(struct device *dev,
+ struct device_attribute *attr,
+ char *buf)
+{
+ u32 val;
+ struct zram *zram = dev_to_zram(dev);
+
+ down_read(&zram->init_lock);
+ val = zram->wb_batch_size;
+ up_read(&zram->init_lock);
+
+ return sysfs_emit(buf, "%u\n", val);
+}
+
static void reset_bdev(struct zram *zram)
{
if (!zram->backing_dev)
@@ -697,23 +758,20 @@ out:
return err;
}
-static unsigned long alloc_block_bdev(struct zram *zram)
+static unsigned long zram_reserve_bdev_block(struct zram *zram)
{
- unsigned long blk_idx = 1;
-retry:
- /* skip 0 bit to confuse zram.handle = 0 */
- blk_idx = find_next_zero_bit(zram->bitmap, zram->nr_pages, blk_idx);
- if (blk_idx == zram->nr_pages)
- return 0;
+ unsigned long blk_idx;
- if (test_and_set_bit(blk_idx, zram->bitmap))
- goto retry;
+ blk_idx = find_next_zero_bit(zram->bitmap, zram->nr_pages, 0);
+ if (blk_idx == zram->nr_pages)
+ return INVALID_BDEV_BLOCK;
+ set_bit(blk_idx, zram->bitmap);
atomic64_inc(&zram->stats.bd_count);
return blk_idx;
}
-static void free_block_bdev(struct zram *zram, unsigned long blk_idx)
+static void zram_release_bdev_block(struct zram *zram, unsigned long blk_idx)
{
int was_set;
@@ -734,32 +792,249 @@ static void read_from_bdev_async(struct zram *zram, struct page *page,
submit_bio(bio);
}
-static int zram_writeback_slots(struct zram *zram, struct zram_pp_ctl *ctl)
+static void release_wb_req(struct zram_wb_req *req)
{
- unsigned long blk_idx = 0;
- struct page *page = NULL;
- struct zram_pp_slot *pps;
- struct bio_vec bio_vec;
- struct bio bio;
+ __free_page(req->page);
+ kfree(req);
+}
+
+static void release_wb_ctl(struct zram_wb_ctl *wb_ctl)
+{
+ if (!wb_ctl)
+ return;
+
+ /* We should never have inflight requests at this point */
+ WARN_ON(atomic_read(&wb_ctl->num_inflight));
+ WARN_ON(!list_empty(&wb_ctl->done_reqs));
+
+ while (!list_empty(&wb_ctl->idle_reqs)) {
+ struct zram_wb_req *req;
+
+ req = list_first_entry(&wb_ctl->idle_reqs,
+ struct zram_wb_req, entry);
+ list_del(&req->entry);
+ release_wb_req(req);
+ }
+
+ kfree(wb_ctl);
+}
+
+static struct zram_wb_ctl *init_wb_ctl(struct zram *zram)
+{
+ struct zram_wb_ctl *wb_ctl;
+ int i;
+
+ wb_ctl = kmalloc(sizeof(*wb_ctl), GFP_KERNEL);
+ if (!wb_ctl)
+ return NULL;
+
+ INIT_LIST_HEAD(&wb_ctl->idle_reqs);
+ INIT_LIST_HEAD(&wb_ctl->done_reqs);
+ atomic_set(&wb_ctl->num_inflight, 0);
+ init_waitqueue_head(&wb_ctl->done_wait);
+ spin_lock_init(&wb_ctl->done_lock);
+
+ for (i = 0; i < zram->wb_batch_size; i++) {
+ struct zram_wb_req *req;
+
+ /*
+ * This is fatal condition only if we couldn't allocate
+ * any requests at all. Otherwise we just work with the
+ * requests that we have successfully allocated, so that
+ * writeback can still proceed, even if there is only one
+ * request on the idle list.
+ */
+ req = kzalloc(sizeof(*req), GFP_KERNEL | __GFP_NOWARN);
+ if (!req)
+ break;
+
+ req->page = alloc_page(GFP_KERNEL | __GFP_NOWARN);
+ if (!req->page) {
+ kfree(req);
+ break;
+ }
+
+ list_add(&req->entry, &wb_ctl->idle_reqs);
+ }
+
+ /* We couldn't allocate any requests, so writeabck is not possible */
+ if (list_empty(&wb_ctl->idle_reqs))
+ goto release_wb_ctl;
+
+ return wb_ctl;
+
+release_wb_ctl:
+ release_wb_ctl(wb_ctl);
+ return NULL;
+}
+
+static void zram_account_writeback_rollback(struct zram *zram)
+{
+ lockdep_assert_held_read(&zram->init_lock);
+
+ if (zram->wb_limit_enable)
+ zram->bd_wb_limit += 1UL << (PAGE_SHIFT - 12);
+}
+
+static void zram_account_writeback_submit(struct zram *zram)
+{
+ lockdep_assert_held_read(&zram->init_lock);
+
+ if (zram->wb_limit_enable && zram->bd_wb_limit > 0)
+ zram->bd_wb_limit -= 1UL << (PAGE_SHIFT - 12);
+}
+
+static int zram_writeback_complete(struct zram *zram, struct zram_wb_req *req)
+{
+ u32 index = req->pps->index;
+ int err;
+
+ err = blk_status_to_errno(req->bio.bi_status);
+ if (err) {
+ /*
+ * Failed wb requests should not be accounted in wb_limit
+ * (if enabled).
+ */
+ zram_account_writeback_rollback(zram);
+ zram_release_bdev_block(zram, req->blk_idx);
+ return err;
+ }
+
+ atomic64_inc(&zram->stats.bd_writes);
+ zram_slot_lock(zram, index);
+ /*
+ * We release slot lock during writeback so slot can change under us:
+ * slot_free() or slot_free() and zram_write_page(). In both cases
+ * slot loses ZRAM_PP_SLOT flag. No concurrent post-processing can
+ * set ZRAM_PP_SLOT on such slots until current post-processing
+ * finishes.
+ */
+ if (!zram_test_flag(zram, index, ZRAM_PP_SLOT)) {
+ zram_release_bdev_block(zram, req->blk_idx);
+ goto out;
+ }
+
+ zram_free_page(zram, index);
+ zram_set_flag(zram, index, ZRAM_WB);
+ zram_set_handle(zram, index, req->blk_idx);
+ atomic64_inc(&zram->stats.pages_stored);
+
+out:
+ zram_slot_unlock(zram, index);
+ return 0;
+}
+
+static void zram_writeback_endio(struct bio *bio)
+{
+ struct zram_wb_req *req = container_of(bio, struct zram_wb_req, bio);
+ struct zram_wb_ctl *wb_ctl = bio->bi_private;
+ unsigned long flags;
+
+ spin_lock_irqsave(&wb_ctl->done_lock, flags);
+ list_add(&req->entry, &wb_ctl->done_reqs);
+ spin_unlock_irqrestore(&wb_ctl->done_lock, flags);
+
+ wake_up(&wb_ctl->done_wait);
+}
+
+static void zram_submit_wb_request(struct zram *zram,
+ struct zram_wb_ctl *wb_ctl,
+ struct zram_wb_req *req)
+{
+ /*
+ * wb_limit (if enabled) should be adjusted before submission,
+ * so that we don't over-submit.
+ */
+ zram_account_writeback_submit(zram);
+ atomic_inc(&wb_ctl->num_inflight);
+ req->bio.bi_private = wb_ctl;
+ submit_bio(&req->bio);
+}
+
+static int zram_complete_done_reqs(struct zram *zram,
+ struct zram_wb_ctl *wb_ctl)
+{
+ struct zram_wb_req *req;
+ unsigned long flags;
int ret = 0, err;
- u32 index;
- page = alloc_page(GFP_KERNEL);
- if (!page)
- return -ENOMEM;
+ while (atomic_read(&wb_ctl->num_inflight) > 0) {
+ spin_lock_irqsave(&wb_ctl->done_lock, flags);
+ req = list_first_entry_or_null(&wb_ctl->done_reqs,
+ struct zram_wb_req, entry);
+ if (req)
+ list_del(&req->entry);
+ spin_unlock_irqrestore(&wb_ctl->done_lock, flags);
+
+ /* ->num_inflight > 0 doesn't mean we have done requests */
+ if (!req)
+ break;
+
+ err = zram_writeback_complete(zram, req);
+ if (err)
+ ret = err;
+
+ atomic_dec(&wb_ctl->num_inflight);
+ release_pp_slot(zram, req->pps);
+ req->pps = NULL;
+
+ list_add(&req->entry, &wb_ctl->idle_reqs);
+ }
+
+ return ret;
+}
+
+static struct zram_wb_req *zram_select_idle_req(struct zram_wb_ctl *wb_ctl)
+{
+ struct zram_wb_req *req;
+
+ req = list_first_entry_or_null(&wb_ctl->idle_reqs,
+ struct zram_wb_req, entry);
+ if (req)
+ list_del(&req->entry);
+ return req;
+}
+
+static int zram_writeback_slots(struct zram *zram,
+ struct zram_pp_ctl *ctl,
+ struct zram_wb_ctl *wb_ctl)
+{
+ unsigned long blk_idx = INVALID_BDEV_BLOCK;
+ struct zram_wb_req *req = NULL;
+ struct zram_pp_slot *pps;
+ int ret = 0, err = 0;
+ u32 index = 0;
while ((pps = select_pp_slot(ctl))) {
- spin_lock(&zram->wb_limit_lock);
if (zram->wb_limit_enable && !zram->bd_wb_limit) {
- spin_unlock(&zram->wb_limit_lock);
ret = -EIO;
break;
}
- spin_unlock(&zram->wb_limit_lock);
- if (!blk_idx) {
- blk_idx = alloc_block_bdev(zram);
- if (!blk_idx) {
+ while (!req) {
+ req = zram_select_idle_req(wb_ctl);
+ if (req)
+ break;
+
+ wait_event(wb_ctl->done_wait,
+ !list_empty(&wb_ctl->done_reqs));
+
+ err = zram_complete_done_reqs(zram, wb_ctl);
+ /*
+ * BIO errors are not fatal, we continue and simply
+ * attempt to writeback the remaining objects (pages).
+ * At the same time we need to signal user-space that
+ * some writes (at least one, but also could be all of
+ * them) were not successful and we do so by returning
+ * the most recent BIO error.
+ */
+ if (err)
+ ret = err;
+ }
+
+ if (blk_idx == INVALID_BDEV_BLOCK) {
+ blk_idx = zram_reserve_bdev_block(zram);
+ if (blk_idx == INVALID_BDEV_BLOCK) {
ret = -ENOSPC;
break;
}
@@ -768,74 +1043,54 @@ static int zram_writeback_slots(struct zram *zram, struct zram_pp_ctl *ctl)
index = pps->index;
zram_slot_lock(zram, index);
/*
- * scan_slots() sets ZRAM_PP_SLOT and relases slot lock, so
+ * scan_slots() sets ZRAM_PP_SLOT and releases slot lock, so
* slots can change in the meantime. If slots are accessed or
* freed they lose ZRAM_PP_SLOT flag and hence we don't
* post-process them.
*/
if (!zram_test_flag(zram, index, ZRAM_PP_SLOT))
goto next;
- if (zram_read_from_zspool(zram, page, index))
+ if (zram_read_from_zspool(zram, req->page, index))
goto next;
zram_slot_unlock(zram, index);
- bio_init(&bio, zram->bdev, &bio_vec, 1,
- REQ_OP_WRITE | REQ_SYNC);
- bio.bi_iter.bi_sector = blk_idx * (PAGE_SIZE >> 9);
- __bio_add_page(&bio, page, PAGE_SIZE, 0);
-
- /*
- * XXX: A single page IO would be inefficient for write
- * but it would be not bad as starter.
- */
- err = submit_bio_wait(&bio);
- if (err) {
- release_pp_slot(zram, pps);
- /*
- * BIO errors are not fatal, we continue and simply
- * attempt to writeback the remaining objects (pages).
- * At the same time we need to signal user-space that
- * some writes (at least one, but also could be all of
- * them) were not successful and we do so by returning
- * the most recent BIO error.
- */
- ret = err;
- continue;
- }
-
- atomic64_inc(&zram->stats.bd_writes);
- zram_slot_lock(zram, index);
/*
- * Same as above, we release slot lock during writeback so
- * slot can change under us: slot_free() or slot_free() and
- * reallocation (zram_write_page()). In both cases slot loses
- * ZRAM_PP_SLOT flag. No concurrent post-processing can set
- * ZRAM_PP_SLOT on such slots until current post-processing
- * finishes.
+ * From now on pp-slot is owned by the req, remove it from
+ * its pp bucket.
*/
- if (!zram_test_flag(zram, index, ZRAM_PP_SLOT))
- goto next;
+ list_del_init(&pps->entry);
+
+ req->blk_idx = blk_idx;
+ req->pps = pps;
+ bio_init(&req->bio, zram->bdev, &req->bio_vec, 1, REQ_OP_WRITE);
+ req->bio.bi_iter.bi_sector = req->blk_idx * (PAGE_SIZE >> 9);
+ req->bio.bi_end_io = zram_writeback_endio;
+ __bio_add_page(&req->bio, req->page, PAGE_SIZE, 0);
+
+ zram_submit_wb_request(zram, wb_ctl, req);
+ blk_idx = INVALID_BDEV_BLOCK;
+ req = NULL;
+ cond_resched();
+ continue;
- zram_free_page(zram, index);
- zram_set_flag(zram, index, ZRAM_WB);
- zram_set_handle(zram, index, blk_idx);
- blk_idx = 0;
- atomic64_inc(&zram->stats.pages_stored);
- spin_lock(&zram->wb_limit_lock);
- if (zram->wb_limit_enable && zram->bd_wb_limit > 0)
- zram->bd_wb_limit -= 1UL << (PAGE_SHIFT - 12);
- spin_unlock(&zram->wb_limit_lock);
next:
zram_slot_unlock(zram, index);
release_pp_slot(zram, pps);
-
- cond_resched();
}
- if (blk_idx)
- free_block_bdev(zram, blk_idx);
- if (page)
- __free_page(page);
+ /*
+ * Selected idle req, but never submitted it due to some error or
+ * wb limit.
+ */
+ if (req)
+ release_wb_req(req);
+
+ while (atomic_read(&wb_ctl->num_inflight) > 0) {
+ wait_event(wb_ctl->done_wait, !list_empty(&wb_ctl->done_reqs));
+ err = zram_complete_done_reqs(zram, wb_ctl);
+ if (err)
+ ret = err;
+ }
return ret;
}
@@ -948,7 +1203,8 @@ static ssize_t writeback_store(struct device *dev,
struct zram *zram = dev_to_zram(dev);
u64 nr_pages = zram->disksize >> PAGE_SHIFT;
unsigned long lo = 0, hi = nr_pages;
- struct zram_pp_ctl *ctl = NULL;
+ struct zram_pp_ctl *pp_ctl = NULL;
+ struct zram_wb_ctl *wb_ctl = NULL;
char *args, *param, *val;
ssize_t ret = len;
int err, mode = 0;
@@ -970,8 +1226,14 @@ static ssize_t writeback_store(struct device *dev,
goto release_init_lock;
}
- ctl = init_pp_ctl();
- if (!ctl) {
+ pp_ctl = init_pp_ctl();
+ if (!pp_ctl) {
+ ret = -ENOMEM;
+ goto release_init_lock;
+ }
+
+ wb_ctl = init_wb_ctl(zram);
+ if (!wb_ctl) {
ret = -ENOMEM;
goto release_init_lock;
}
@@ -1000,7 +1262,7 @@ static ssize_t writeback_store(struct device *dev,
goto release_init_lock;
}
- scan_slots_for_writeback(zram, mode, lo, hi, ctl);
+ scan_slots_for_writeback(zram, mode, lo, hi, pp_ctl);
break;
}
@@ -1011,7 +1273,7 @@ static ssize_t writeback_store(struct device *dev,
goto release_init_lock;
}
- scan_slots_for_writeback(zram, mode, lo, hi, ctl);
+ scan_slots_for_writeback(zram, mode, lo, hi, pp_ctl);
break;
}
@@ -1022,7 +1284,7 @@ static ssize_t writeback_store(struct device *dev,
goto release_init_lock;
}
- scan_slots_for_writeback(zram, mode, lo, hi, ctl);
+ scan_slots_for_writeback(zram, mode, lo, hi, pp_ctl);
continue;
}
@@ -1033,17 +1295,18 @@ static ssize_t writeback_store(struct device *dev,
goto release_init_lock;
}
- scan_slots_for_writeback(zram, mode, lo, hi, ctl);
+ scan_slots_for_writeback(zram, mode, lo, hi, pp_ctl);
continue;
}
}
- err = zram_writeback_slots(zram, ctl);
+ err = zram_writeback_slots(zram, pp_ctl, wb_ctl);
if (err)
ret = err;
release_init_lock:
- release_pp_ctl(zram, ctl);
+ release_pp_ctl(zram, pp_ctl);
+ release_wb_ctl(wb_ctl);
atomic_set(&zram->pp_in_progress, 0);
up_read(&zram->init_lock);
@@ -1112,7 +1375,9 @@ static int read_from_bdev(struct zram *zram, struct page *page,
return -EIO;
}
-static void free_block_bdev(struct zram *zram, unsigned long blk_idx) {};
+static void zram_release_bdev_block(struct zram *zram, unsigned long blk_idx)
+{
+}
#endif
#ifdef CONFIG_ZRAM_MEMORY_TRACKING
@@ -1634,7 +1899,7 @@ static void zram_free_page(struct zram *zram, size_t index)
if (zram_test_flag(zram, index, ZRAM_WB)) {
zram_clear_flag(zram, index, ZRAM_WB);
- free_block_bdev(zram, zram_get_handle(zram, index));
+ zram_release_bdev_block(zram, zram_get_handle(zram, index));
goto out;
}
@@ -1740,14 +2005,14 @@ static int zram_read_page(struct zram *zram, struct page *page, u32 index,
ret = zram_read_from_zspool(zram, page, index);
zram_slot_unlock(zram, index);
} else {
+ unsigned long blk_idx = zram_get_handle(zram, index);
+
/*
* The slot should be unlocked before reading from the backing
* device.
*/
zram_slot_unlock(zram, index);
-
- ret = read_from_bdev(zram, page, zram_get_handle(zram, index),
- parent);
+ ret = read_from_bdev(zram, page, blk_idx, parent);
}
/* Should NEVER happen. Return bio error if it does. */
@@ -2610,6 +2875,7 @@ static DEVICE_ATTR_RW(backing_dev);
static DEVICE_ATTR_WO(writeback);
static DEVICE_ATTR_RW(writeback_limit);
static DEVICE_ATTR_RW(writeback_limit_enable);
+static DEVICE_ATTR_RW(writeback_batch_size);
#endif
#ifdef CONFIG_ZRAM_MULTI_COMP
static DEVICE_ATTR_RW(recomp_algorithm);
@@ -2631,6 +2897,7 @@ static struct attribute *zram_disk_attrs[] = {
&dev_attr_writeback.attr,
&dev_attr_writeback_limit.attr,
&dev_attr_writeback_limit_enable.attr,
+ &dev_attr_writeback_batch_size.attr,
#endif
&dev_attr_io_stat.attr,
&dev_attr_mm_stat.attr,
@@ -2692,7 +2959,7 @@ static int zram_add(void)
init_rwsem(&zram->init_lock);
#ifdef CONFIG_ZRAM_WRITEBACK
- spin_lock_init(&zram->wb_limit_lock);
+ zram->wb_batch_size = 32;
#endif
/* gendisk structure */
diff --git a/drivers/block/zram/zram_drv.h b/drivers/block/zram/zram_drv.h
index 6cee93f9c0d0..c6d94501376c 100644
--- a/drivers/block/zram/zram_drv.h
+++ b/drivers/block/zram/zram_drv.h
@@ -127,8 +127,8 @@ struct zram {
bool claim; /* Protected by disk->open_mutex */
#ifdef CONFIG_ZRAM_WRITEBACK
struct file *backing_dev;
- spinlock_t wb_limit_lock;
bool wb_limit_enable;
+ u32 wb_batch_size;
u64 bd_wb_limit;
struct block_device *bdev;
unsigned long *bitmap;