summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2025-08-01 14:17:48 -0700
committerLinus Torvalds <torvalds@linux-foundation.org>2025-08-01 14:17:48 -0700
commit821c9e515db512904250e1d460109a1dc4c7ef6b (patch)
tree4ed49f2833b0ceb172773a87053b6f116c2bea00
parent0bd0a41a5120f78685a132834865b0a631b9026a (diff)
parent6693731487a8145a9b039bc983d77edc47693855 (diff)
Merge tag 'for_linus' of git://git.kernel.org/pub/scm/linux/kernel/git/mst/vhost
Pull virtio updates from Michael Tsirkin: - vhost can now support legacy threading if enabled in Kconfig - vsock memory allocation strategies for large buffers have been improved, reducing pressure on kmalloc - vhost now supports the in-order feature. guest bits missed the merge window. - fixes, cleanups all over the place * tag 'for_linus' of git://git.kernel.org/pub/scm/linux/kernel/git/mst/vhost: (30 commits) vsock/virtio: Allocate nonlinear SKBs for handling large transmit buffers vsock/virtio: Rename virtio_vsock_skb_rx_put() vhost/vsock: Allocate nonlinear SKBs for handling large receive buffers vsock/virtio: Move SKB allocation lower-bound check to callers vsock/virtio: Rename virtio_vsock_alloc_skb() vsock/virtio: Resize receive buffers so that each SKB fits in a 4K page vsock/virtio: Move length check to callers of virtio_vsock_skb_rx_put() vsock/virtio: Validate length in packet header before skb_put() vhost/vsock: Avoid allocating arbitrarily-sized SKBs vhost_net: basic in_order support vhost: basic in order support vhost: fail early when __vhost_add_used() fails vhost: Reintroduce kthread API and add mode selection vdpa: Fix IDR memory leak in VDUSE module exit vdpa/mlx5: Fix release of uninitialized resources on error path vhost-scsi: Fix check for inline_sg_cnt exceeding preallocated limit virtio: virtio_dma_buf: fix missing parameter documentation vhost: Fix typos vhost: vringh: Remove unused functions vhost: vringh: Remove unused iotlb functions ...
-rw-r--r--drivers/gpu/drm/virtio/virtgpu_drv.c8
-rw-r--r--drivers/vdpa/mlx5/core/mr.c3
-rw-r--r--drivers/vdpa/mlx5/net/mlx5_vnet.c12
-rw-r--r--drivers/vdpa/vdpa_user/vduse_dev.c1
-rw-r--r--drivers/vhost/Kconfig18
-rw-r--r--drivers/vhost/net.c88
-rw-r--r--drivers/vhost/scsi.c24
-rw-r--r--drivers/vhost/vhost.c377
-rw-r--r--drivers/vhost/vhost.h30
-rw-r--r--drivers/vhost/vringh.c118
-rw-r--r--drivers/vhost/vsock.c15
-rw-r--r--drivers/virtio/virtio.c7
-rw-r--r--drivers/virtio/virtio_dma_buf.c2
-rw-r--r--drivers/virtio/virtio_mmio.c52
-rw-r--r--drivers/virtio/virtio_ring.c4
-rw-r--r--drivers/virtio/virtio_vdpa.c44
-rw-r--r--include/linux/virtio.h2
-rw-r--r--include/linux/virtio_vsock.h46
-rw-r--r--include/linux/vringh.h12
-rw-r--r--include/uapi/linux/vhost.h28
-rw-r--r--kernel/vhost_task.c2
-rw-r--r--net/vmw_vsock/virtio_transport.c20
-rw-r--r--net/vmw_vsock/virtio_transport_common.c3
23 files changed, 574 insertions, 342 deletions
diff --git a/drivers/gpu/drm/virtio/virtgpu_drv.c b/drivers/gpu/drm/virtio/virtgpu_drv.c
index e32e680c7197..71c6ccad4b99 100644
--- a/drivers/gpu/drm/virtio/virtgpu_drv.c
+++ b/drivers/gpu/drm/virtio/virtgpu_drv.c
@@ -130,10 +130,10 @@ static void virtio_gpu_remove(struct virtio_device *vdev)
static void virtio_gpu_shutdown(struct virtio_device *vdev)
{
- /*
- * drm does its own synchronization on shutdown.
- * Do nothing here, opt out of device reset.
- */
+ struct drm_device *dev = vdev->priv;
+
+ /* stop talking to the device */
+ drm_dev_unplug(dev);
}
static void virtio_gpu_config_changed(struct virtio_device *vdev)
diff --git a/drivers/vdpa/mlx5/core/mr.c b/drivers/vdpa/mlx5/core/mr.c
index 61424342c096..c7a20278bc3c 100644
--- a/drivers/vdpa/mlx5/core/mr.c
+++ b/drivers/vdpa/mlx5/core/mr.c
@@ -908,6 +908,9 @@ void mlx5_vdpa_destroy_mr_resources(struct mlx5_vdpa_dev *mvdev)
{
struct mlx5_vdpa_mr_resources *mres = &mvdev->mres;
+ if (!mres->wq_gc)
+ return;
+
atomic_set(&mres->shutdown, 1);
flush_delayed_work(&mres->gc_dwork_ent);
diff --git a/drivers/vdpa/mlx5/net/mlx5_vnet.c b/drivers/vdpa/mlx5/net/mlx5_vnet.c
index cccc49a08a1a..0ed2fc28e1ce 100644
--- a/drivers/vdpa/mlx5/net/mlx5_vnet.c
+++ b/drivers/vdpa/mlx5/net/mlx5_vnet.c
@@ -2491,7 +2491,7 @@ static void mlx5_vdpa_set_vq_num(struct vdpa_device *vdev, u16 idx, u32 num)
}
mvq = &ndev->vqs[idx];
- ndev->needs_teardown = num != mvq->num_ent;
+ ndev->needs_teardown |= num != mvq->num_ent;
mvq->num_ent = num;
}
@@ -3432,15 +3432,17 @@ static void mlx5_vdpa_free(struct vdpa_device *vdev)
ndev = to_mlx5_vdpa_ndev(mvdev);
+ /* Functions called here should be able to work with
+ * uninitialized resources.
+ */
free_fixed_resources(ndev);
mlx5_vdpa_clean_mrs(mvdev);
mlx5_vdpa_destroy_mr_resources(&ndev->mvdev);
- mlx5_cmd_cleanup_async_ctx(&mvdev->async_ctx);
-
if (!is_zero_ether_addr(ndev->config.mac)) {
pfmdev = pci_get_drvdata(pci_physfn(mvdev->mdev->pdev));
mlx5_mpfs_del_mac(pfmdev, ndev->config.mac);
}
+ mlx5_cmd_cleanup_async_ctx(&mvdev->async_ctx);
mlx5_vdpa_free_resources(&ndev->mvdev);
free_irqs(ndev);
kfree(ndev->event_cbs);
@@ -3888,6 +3890,8 @@ static int mlx5_vdpa_dev_add(struct vdpa_mgmt_dev *v_mdev, const char *name,
mvdev->actual_features =
(device_features & BIT_ULL(VIRTIO_F_VERSION_1));
+ mlx5_cmd_init_async_ctx(mdev, &mvdev->async_ctx);
+
ndev->vqs = kcalloc(max_vqs, sizeof(*ndev->vqs), GFP_KERNEL);
ndev->event_cbs = kcalloc(max_vqs + 1, sizeof(*ndev->event_cbs), GFP_KERNEL);
if (!ndev->vqs || !ndev->event_cbs) {
@@ -3960,8 +3964,6 @@ static int mlx5_vdpa_dev_add(struct vdpa_mgmt_dev *v_mdev, const char *name,
ndev->rqt_size = 1;
}
- mlx5_cmd_init_async_ctx(mdev, &mvdev->async_ctx);
-
ndev->mvdev.mlx_features = device_features;
mvdev->vdev.dma_dev = &mdev->pdev->dev;
err = mlx5_vdpa_alloc_resources(&ndev->mvdev);
diff --git a/drivers/vdpa/vdpa_user/vduse_dev.c b/drivers/vdpa/vdpa_user/vduse_dev.c
index 6a9a37351310..04620bb77203 100644
--- a/drivers/vdpa/vdpa_user/vduse_dev.c
+++ b/drivers/vdpa/vdpa_user/vduse_dev.c
@@ -2216,6 +2216,7 @@ static void vduse_exit(void)
cdev_del(&vduse_ctrl_cdev);
unregister_chrdev_region(vduse_major, VDUSE_DEV_MAX);
class_unregister(&vduse_class);
+ idr_destroy(&vduse_idr);
}
module_exit(vduse_exit);
diff --git a/drivers/vhost/Kconfig b/drivers/vhost/Kconfig
index 020d4fbb947c..bc0f38574497 100644
--- a/drivers/vhost/Kconfig
+++ b/drivers/vhost/Kconfig
@@ -95,4 +95,22 @@ config VHOST_CROSS_ENDIAN_LEGACY
If unsure, say "N".
+config VHOST_ENABLE_FORK_OWNER_CONTROL
+ bool "Enable VHOST_ENABLE_FORK_OWNER_CONTROL"
+ default y
+ help
+ This option enables two IOCTLs: VHOST_SET_FORK_FROM_OWNER and
+ VHOST_GET_FORK_FROM_OWNER. These allow userspace applications
+ to modify the vhost worker mode for vhost devices.
+
+ Also expose module parameter 'fork_from_owner_default' to allow users
+ to configure the default mode for vhost workers.
+
+ By default, `VHOST_ENABLE_FORK_OWNER_CONTROL` is set to `y`,
+ users can change the worker thread mode as needed.
+ If this config is disabled (n),the related IOCTLs and parameters will
+ be unavailable.
+
+ If unsure, say "Y".
+
endif
diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
index bfb774c273ea..6edac0c1ba9b 100644
--- a/drivers/vhost/net.c
+++ b/drivers/vhost/net.c
@@ -74,7 +74,8 @@ static const u64 vhost_net_features[VIRTIO_FEATURES_DWORDS] = {
(1ULL << VHOST_NET_F_VIRTIO_NET_HDR) |
(1ULL << VIRTIO_NET_F_MRG_RXBUF) |
(1ULL << VIRTIO_F_ACCESS_PLATFORM) |
- (1ULL << VIRTIO_F_RING_RESET),
+ (1ULL << VIRTIO_F_RING_RESET) |
+ (1ULL << VIRTIO_F_IN_ORDER),
VIRTIO_BIT(VIRTIO_NET_F_GUEST_UDP_TUNNEL_GSO) |
VIRTIO_BIT(VIRTIO_NET_F_HOST_UDP_TUNNEL_GSO),
};
@@ -376,7 +377,8 @@ static void vhost_zerocopy_signal_used(struct vhost_net *net,
while (j) {
add = min(UIO_MAXIOV - nvq->done_idx, j);
vhost_add_used_and_signal_n(vq->dev, vq,
- &vq->heads[nvq->done_idx], add);
+ &vq->heads[nvq->done_idx],
+ NULL, add);
nvq->done_idx = (nvq->done_idx + add) % UIO_MAXIOV;
j -= add;
}
@@ -451,7 +453,8 @@ static int vhost_net_enable_vq(struct vhost_net *n,
return vhost_poll_start(poll, sock->file);
}
-static void vhost_net_signal_used(struct vhost_net_virtqueue *nvq)
+static void vhost_net_signal_used(struct vhost_net_virtqueue *nvq,
+ unsigned int count)
{
struct vhost_virtqueue *vq = &nvq->vq;
struct vhost_dev *dev = vq->dev;
@@ -459,7 +462,8 @@ static void vhost_net_signal_used(struct vhost_net_virtqueue *nvq)
if (!nvq->done_idx)
return;
- vhost_add_used_and_signal_n(dev, vq, vq->heads, nvq->done_idx);
+ vhost_add_used_and_signal_n(dev, vq, vq->heads,
+ vq->nheads, count);
nvq->done_idx = 0;
}
@@ -468,6 +472,8 @@ static void vhost_tx_batch(struct vhost_net *net,
struct socket *sock,
struct msghdr *msghdr)
{
+ struct vhost_virtqueue *vq = &nvq->vq;
+ bool in_order = vhost_has_feature(vq, VIRTIO_F_IN_ORDER);
struct tun_msg_ctl ctl = {
.type = TUN_MSG_PTR,
.num = nvq->batched_xdp,
@@ -475,6 +481,11 @@ static void vhost_tx_batch(struct vhost_net *net,
};
int i, err;
+ if (in_order) {
+ vq->heads[0].len = 0;
+ vq->nheads[0] = nvq->done_idx;
+ }
+
if (nvq->batched_xdp == 0)
goto signal_used;
@@ -496,7 +507,7 @@ static void vhost_tx_batch(struct vhost_net *net,
}
signal_used:
- vhost_net_signal_used(nvq);
+ vhost_net_signal_used(nvq, in_order ? 1 : nvq->done_idx);
nvq->batched_xdp = 0;
}
@@ -750,6 +761,7 @@ static void handle_tx_copy(struct vhost_net *net, struct socket *sock)
int sent_pkts = 0;
bool sock_can_batch = (sock->sk->sk_sndbuf == INT_MAX);
bool busyloop_intr;
+ bool in_order = vhost_has_feature(vq, VIRTIO_F_IN_ORDER);
do {
busyloop_intr = false;
@@ -786,11 +798,13 @@ static void handle_tx_copy(struct vhost_net *net, struct socket *sock)
break;
}
- /* We can't build XDP buff, go for single
- * packet path but let's flush batched
- * packets.
- */
- vhost_tx_batch(net, nvq, sock, &msg);
+ if (nvq->batched_xdp) {
+ /* We can't build XDP buff, go for single
+ * packet path but let's flush batched
+ * packets.
+ */
+ vhost_tx_batch(net, nvq, sock, &msg);
+ }
msg.msg_control = NULL;
} else {
if (tx_can_batch(vq, total_len))
@@ -811,8 +825,12 @@ static void handle_tx_copy(struct vhost_net *net, struct socket *sock)
pr_debug("Truncated TX packet: len %d != %zd\n",
err, len);
done:
- vq->heads[nvq->done_idx].id = cpu_to_vhost32(vq, head);
- vq->heads[nvq->done_idx].len = 0;
+ if (in_order) {
+ vq->heads[0].id = cpu_to_vhost32(vq, head);
+ } else {
+ vq->heads[nvq->done_idx].id = cpu_to_vhost32(vq, head);
+ vq->heads[nvq->done_idx].len = 0;
+ }
++nvq->done_idx;
} while (likely(!vhost_exceeds_weight(vq, ++sent_pkts, total_len)));
@@ -991,7 +1009,7 @@ static int peek_head_len(struct vhost_net_virtqueue *rvq, struct sock *sk)
}
static int vhost_net_rx_peek_head_len(struct vhost_net *net, struct sock *sk,
- bool *busyloop_intr)
+ bool *busyloop_intr, unsigned int count)
{
struct vhost_net_virtqueue *rnvq = &net->vqs[VHOST_NET_VQ_RX];
struct vhost_net_virtqueue *tnvq = &net->vqs[VHOST_NET_VQ_TX];
@@ -1001,7 +1019,7 @@ static int vhost_net_rx_peek_head_len(struct vhost_net *net, struct sock *sk,
if (!len && rvq->busyloop_timeout) {
/* Flush batched heads first */
- vhost_net_signal_used(rnvq);
+ vhost_net_signal_used(rnvq, count);
/* Both tx vq and rx socket were polled here */
vhost_net_busy_poll(net, rvq, tvq, busyloop_intr, true);
@@ -1013,7 +1031,7 @@ static int vhost_net_rx_peek_head_len(struct vhost_net *net, struct sock *sk,
/* This is a multi-buffer version of vhost_get_desc, that works if
* vq has read descriptors only.
- * @vq - the relevant virtqueue
+ * @nvq - the relevant vhost_net virtqueue
* @datalen - data length we'll be reading
* @iovcount - returned count of io vectors we fill
* @log - vhost log
@@ -1021,14 +1039,17 @@ static int vhost_net_rx_peek_head_len(struct vhost_net *net, struct sock *sk,
* @quota - headcount quota, 1 for big buffer
* returns number of buffer heads allocated, negative on error
*/
-static int get_rx_bufs(struct vhost_virtqueue *vq,
+static int get_rx_bufs(struct vhost_net_virtqueue *nvq,
struct vring_used_elem *heads,
+ u16 *nheads,
int datalen,
unsigned *iovcount,
struct vhost_log *log,
unsigned *log_num,
unsigned int quota)
{
+ struct vhost_virtqueue *vq = &nvq->vq;
+ bool in_order = vhost_has_feature(vq, VIRTIO_F_IN_ORDER);
unsigned int out, in;
int seg = 0;
int headcount = 0;
@@ -1065,14 +1086,16 @@ static int get_rx_bufs(struct vhost_virtqueue *vq,
nlogs += *log_num;
log += *log_num;
}
- heads[headcount].id = cpu_to_vhost32(vq, d);
len = iov_length(vq->iov + seg, in);
- heads[headcount].len = cpu_to_vhost32(vq, len);
- datalen -= len;
+ if (!in_order) {
+ heads[headcount].id = cpu_to_vhost32(vq, d);
+ heads[headcount].len = cpu_to_vhost32(vq, len);
+ }
++headcount;
+ datalen -= len;
seg += in;
}
- heads[headcount - 1].len = cpu_to_vhost32(vq, len + datalen);
+
*iovcount = seg;
if (unlikely(log))
*log_num = nlogs;
@@ -1082,6 +1105,15 @@ static int get_rx_bufs(struct vhost_virtqueue *vq,
r = UIO_MAXIOV + 1;
goto err;
}
+
+ if (!in_order)
+ heads[headcount - 1].len = cpu_to_vhost32(vq, len + datalen);
+ else {
+ heads[0].len = cpu_to_vhost32(vq, len + datalen);
+ heads[0].id = cpu_to_vhost32(vq, d);
+ nheads[0] = headcount;
+ }
+
return headcount;
err:
vhost_discard_vq_desc(vq, headcount);
@@ -1094,6 +1126,8 @@ static void handle_rx(struct vhost_net *net)
{
struct vhost_net_virtqueue *nvq = &net->vqs[VHOST_NET_VQ_RX];
struct vhost_virtqueue *vq = &nvq->vq;
+ bool in_order = vhost_has_feature(vq, VIRTIO_F_IN_ORDER);
+ unsigned int count = 0;
unsigned in, log;
struct vhost_log *vq_log;
struct msghdr msg = {
@@ -1141,12 +1175,13 @@ static void handle_rx(struct vhost_net *net)
do {
sock_len = vhost_net_rx_peek_head_len(net, sock->sk,
- &busyloop_intr);
+ &busyloop_intr, count);
if (!sock_len)
break;
sock_len += sock_hlen;
vhost_len = sock_len + vhost_hlen;
- headcount = get_rx_bufs(vq, vq->heads + nvq->done_idx,
+ headcount = get_rx_bufs(nvq, vq->heads + count,
+ vq->nheads + count,
vhost_len, &in, vq_log, &log,
likely(mergeable) ? UIO_MAXIOV : 1);
/* On error, stop handling until the next kick. */
@@ -1222,8 +1257,11 @@ static void handle_rx(struct vhost_net *net)
goto out;
}
nvq->done_idx += headcount;
- if (nvq->done_idx > VHOST_NET_BATCH)
- vhost_net_signal_used(nvq);
+ count += in_order ? 1 : headcount;
+ if (nvq->done_idx > VHOST_NET_BATCH) {
+ vhost_net_signal_used(nvq, count);
+ count = 0;
+ }
if (unlikely(vq_log))
vhost_log_write(vq, vq_log, log, vhost_len,
vq->iov, in);
@@ -1235,7 +1273,7 @@ static void handle_rx(struct vhost_net *net)
else if (!sock_len)
vhost_net_enable_vq(net, vq);
out:
- vhost_net_signal_used(nvq);
+ vhost_net_signal_used(nvq, count);
mutex_unlock(&vq->mutex);
}
diff --git a/drivers/vhost/scsi.c b/drivers/vhost/scsi.c
index c12a0d4e6386..abf51332a5c5 100644
--- a/drivers/vhost/scsi.c
+++ b/drivers/vhost/scsi.c
@@ -71,7 +71,7 @@ static int vhost_scsi_set_inline_sg_cnt(const char *buf,
if (ret)
return ret;
- if (ret > VHOST_SCSI_PREALLOC_SGLS) {
+ if (cnt > VHOST_SCSI_PREALLOC_SGLS) {
pr_err("Max inline_sg_cnt is %u\n", VHOST_SCSI_PREALLOC_SGLS);
return -EINVAL;
}
@@ -152,7 +152,7 @@ struct vhost_scsi_nexus {
struct vhost_scsi_tpg {
/* Vhost port target portal group tag for TCM */
u16 tport_tpgt;
- /* Used to track number of TPG Port/Lun Links wrt to explict I_T Nexus shutdown */
+ /* Used to track number of TPG Port/Lun Links wrt to explicit I_T Nexus shutdown */
int tv_tpg_port_count;
/* Used for vhost_scsi device reference to tpg_nexus, protected by tv_tpg_mutex */
int tv_tpg_vhost_count;
@@ -311,12 +311,12 @@ static void vhost_scsi_init_inflight(struct vhost_scsi *vs,
mutex_lock(&vq->mutex);
- /* store old infight */
+ /* store old inflight */
idx = vs->vqs[i].inflight_idx;
if (old_inflight)
old_inflight[i] = &vs->vqs[i].inflights[idx];
- /* setup new infight */
+ /* setup new inflight */
vs->vqs[i].inflight_idx = idx ^ 1;
new_inflight = &vs->vqs[i].inflights[idx ^ 1];
kref_init(&new_inflight->kref);
@@ -1226,10 +1226,8 @@ vhost_scsi_get_req(struct vhost_virtqueue *vq, struct vhost_scsi_ctx *vc,
/* validated at handler entry */
vs_tpg = vhost_vq_get_backend(vq);
tpg = READ_ONCE(vs_tpg[*vc->target]);
- if (unlikely(!tpg)) {
- vq_err(vq, "Target 0x%x does not exist\n", *vc->target);
+ if (unlikely(!tpg))
goto out;
- }
}
if (tpgp)
@@ -1249,7 +1247,7 @@ vhost_scsi_setup_resp_iovs(struct vhost_scsi_cmd *cmd, struct iovec *in_iovs,
if (!in_iovs_cnt)
return 0;
/*
- * Initiator's normally just put the virtio_scsi_cmd_resp in the first
+ * Initiators normally just put the virtio_scsi_cmd_resp in the first
* iov, but just in case they wedged in some data with it we check for
* greater than or equal to the response struct.
*/
@@ -1457,7 +1455,7 @@ vhost_scsi_handle_vq(struct vhost_scsi *vs, struct vhost_virtqueue *vq)
cmd = vhost_scsi_get_cmd(vq, tag);
if (IS_ERR(cmd)) {
ret = PTR_ERR(cmd);
- vq_err(vq, "vhost_scsi_get_tag failed %dd\n", ret);
+ vq_err(vq, "vhost_scsi_get_tag failed %d\n", ret);
goto err;
}
cmd->tvc_vq = vq;
@@ -2609,7 +2607,7 @@ static int vhost_scsi_make_nexus(struct vhost_scsi_tpg *tpg,
return -ENOMEM;
}
/*
- * Since we are running in 'demo mode' this call with generate a
+ * Since we are running in 'demo mode' this call will generate a
* struct se_node_acl for the vhost_scsi struct se_portal_group with
* the SCSI Initiator port name of the passed configfs group 'name'.
*/
@@ -2915,7 +2913,7 @@ static ssize_t
vhost_scsi_wwn_version_show(struct config_item *item, char *page)
{
return sysfs_emit(page, "TCM_VHOST fabric module %s on %s/%s"
- "on "UTS_RELEASE"\n", VHOST_SCSI_VERSION, utsname()->sysname,
+ " on "UTS_RELEASE"\n", VHOST_SCSI_VERSION, utsname()->sysname,
utsname()->machine);
}
@@ -2983,13 +2981,13 @@ out_vhost_scsi_deregister:
vhost_scsi_deregister();
out:
return ret;
-};
+}
static void vhost_scsi_exit(void)
{
target_unregister_template(&vhost_scsi_ops);
vhost_scsi_deregister();
-};
+}
MODULE_DESCRIPTION("VHOST_SCSI series fabric driver");
MODULE_ALIAS("tcm_vhost");
diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
index 1094256a943c..23286e4d7b49 100644
--- a/drivers/vhost/vhost.c
+++ b/drivers/vhost/vhost.c
@@ -22,6 +22,7 @@
#include <linux/slab.h>
#include <linux/vmalloc.h>
#include <linux/kthread.h>
+#include <linux/cgroup.h>
#include <linux/module.h>
#include <linux/sort.h>
#include <linux/sched/mm.h>
@@ -41,6 +42,13 @@ static int max_iotlb_entries = 2048;
module_param(max_iotlb_entries, int, 0444);
MODULE_PARM_DESC(max_iotlb_entries,
"Maximum number of iotlb entries. (default: 2048)");
+static bool fork_from_owner_default = VHOST_FORK_OWNER_TASK;
+
+#ifdef CONFIG_VHOST_ENABLE_FORK_OWNER_CONTROL
+module_param(fork_from_owner_default, bool, 0444);
+MODULE_PARM_DESC(fork_from_owner_default,
+ "Set task mode as the default(default: Y)");
+#endif
enum {
VHOST_MEMORY_F_LOG = 0x1,
@@ -242,7 +250,7 @@ static void vhost_worker_queue(struct vhost_worker *worker,
* test_and_set_bit() implies a memory barrier.
*/
llist_add(&work->node, &worker->work_list);
- vhost_task_wake(worker->vtsk);
+ worker->ops->wakeup(worker);
}
}
@@ -364,6 +372,7 @@ static void vhost_vq_reset(struct vhost_dev *dev,
vq->avail = NULL;
vq->used = NULL;
vq->last_avail_idx = 0;
+ vq->next_avail_head = 0;
vq->avail_idx = 0;
vq->last_used_idx = 0;
vq->signalled_used = 0;
@@ -388,6 +397,44 @@ static void vhost_vq_reset(struct vhost_dev *dev,
__vhost_vq_meta_reset(vq);
}
+static int vhost_run_work_kthread_list(void *data)
+{
+ struct vhost_worker *worker = data;
+ struct vhost_work *work, *work_next;
+ struct vhost_dev *dev = worker->dev;
+ struct llist_node *node;
+
+ kthread_use_mm(dev->mm);
+
+ for (;;) {
+ /* mb paired w/ kthread_stop */
+ set_current_state(TASK_INTERRUPTIBLE);
+
+ if (kthread_should_stop()) {
+ __set_current_state(TASK_RUNNING);
+ break;
+ }
+ node = llist_del_all(&worker->work_list);
+ if (!node)
+ schedule();
+
+ node = llist_reverse_order(node);
+ /* make sure flag is seen after deletion */
+ smp_wmb();
+ llist_for_each_entry_safe(work, work_next, node, node) {
+ clear_bit(VHOST_WORK_QUEUED, &work->flags);
+ __set_current_state(TASK_RUNNING);
+ kcov_remote_start_common(worker->kcov_handle);
+ work->fn(work);
+ kcov_remote_stop();
+ cond_resched();
+ }
+ }
+ kthread_unuse_mm(dev->mm);
+
+ return 0;
+}
+
static bool vhost_run_work_list(void *data)
{
struct vhost_worker *worker = data;
@@ -455,6 +502,8 @@ static void vhost_vq_free_iovecs(struct vhost_virtqueue *vq)
vq->log = NULL;
kfree(vq->heads);
vq->heads = NULL;
+ kfree(vq->nheads);
+ vq->nheads = NULL;
}
/* Helper to allocate iovec buffers for all vqs. */
@@ -472,7 +521,9 @@ static long vhost_dev_alloc_iovecs(struct vhost_dev *dev)
GFP_KERNEL);
vq->heads = kmalloc_array(dev->iov_limit, sizeof(*vq->heads),
GFP_KERNEL);
- if (!vq->indirect || !vq->log || !vq->heads)
+ vq->nheads = kmalloc_array(dev->iov_limit, sizeof(*vq->nheads),
+ GFP_KERNEL);
+ if (!vq->indirect || !vq->log || !vq->heads || !vq->nheads)
goto err_nomem;
}
return 0;
@@ -552,6 +603,7 @@ void vhost_dev_init(struct vhost_dev *dev,
dev->byte_weight = byte_weight;
dev->use_worker = use_worker;
dev->msg_handler = msg_handler;
+ dev->fork_owner = fork_from_owner_default;
init_waitqueue_head(&dev->wait);
INIT_LIST_HEAD(&dev->read_list);
INIT_LIST_HEAD(&dev->pending_list);
@@ -581,6 +633,46 @@ long vhost_dev_check_owner(struct vhost_dev *dev)
}
EXPORT_SYMBOL_GPL(vhost_dev_check_owner);
+struct vhost_attach_cgroups_struct {
+ struct vhost_work work;
+ struct task_struct *owner;
+ int ret;
+};
+
+static void vhost_attach_cgroups_work(struct vhost_work *work)
+{
+ struct vhost_attach_cgroups_struct *s;
+
+ s = container_of(work, struct vhost_attach_cgroups_struct, work);
+ s->ret = cgroup_attach_task_all(s->owner, current);
+}
+
+static int vhost_attach_task_to_cgroups(struct vhost_worker *worker)
+{
+ struct vhost_attach_cgroups_struct attach;
+ int saved_cnt;
+
+ attach.owner = current;
+
+ vhost_work_init(&attach.work, vhost_attach_cgroups_work);
+ vhost_worker_queue(worker, &attach.work);
+
+ mutex_lock(&worker->mutex);
+
+ /*
+ * Bypass attachment_cnt check in __vhost_worker_flush:
+ * Temporarily change it to INT_MAX to bypass the check
+ */
+ saved_cnt = worker->attachment_cnt;
+ worker->attachment_cnt = INT_MAX;
+ __vhost_worker_flush(worker);
+ worker->attachment_cnt = saved_cnt;
+
+ mutex_unlock(&worker->mutex);
+
+ return attach.ret;
+}
+
/* Caller should have device mutex */
bool vhost_dev_has_owner(struct vhost_dev *dev)
{
@@ -594,10 +686,10 @@ static void vhost_attach_mm(struct vhost_dev *dev)
if (dev->use_worker) {
dev->mm = get_task_mm(current);
} else {
- /* vDPA device does not use worker thead, so there's
- * no need to hold the address space for mm. This help
+ /* vDPA device does not use worker thread, so there's
+ * no need to hold the address space for mm. This helps
* to avoid deadlock in the case of mmap() which may
- * held the refcnt of the file and depends on release
+ * hold the refcnt of the file and depends on release
* method to remove vma.
*/
dev->mm = current->mm;
@@ -626,7 +718,7 @@ static void vhost_worker_destroy(struct vhost_dev *dev,
WARN_ON(!llist_empty(&worker->work_list));
xa_erase(&dev->worker_xa, worker->id);
- vhost_task_stop(worker->vtsk);
+ worker->ops->stop(worker);
kfree(worker);
}
@@ -649,42 +741,115 @@ static void vhost_workers_free(struct vhost_dev *dev)
xa_destroy(&dev->worker_xa);
}
+static void vhost_task_wakeup(struct vhost_worker *worker)
+{
+ return vhost_task_wake(worker->vtsk);
+}
+
+static void vhost_kthread_wakeup(struct vhost_worker *worker)
+{
+ wake_up_process(worker->kthread_task);
+}
+
+static void vhost_task_do_stop(struct vhost_worker *worker)
+{
+ return vhost_task_stop(worker->vtsk);
+}
+
+static void vhost_kthread_do_stop(struct vhost_worker *worker)
+{
+ kthread_stop(worker->kthread_task);
+}
+
+static int vhost_task_worker_create(struct vhost_worker *worker,
+ struct vhost_dev *dev, const char *name)
+{
+ struct vhost_task *vtsk;
+ u32 id;
+ int ret;
+
+ vtsk = vhost_task_create(vhost_run_work_list, vhost_worker_killed,
+ worker, name);
+ if (IS_ERR(vtsk))
+ return PTR_ERR(vtsk);
+
+ worker->vtsk = vtsk;
+ vhost_task_start(vtsk);
+ ret = xa_alloc(&dev->worker_xa, &id, worker, xa_limit_32b, GFP_KERNEL);
+ if (ret < 0) {
+ vhost_task_do_stop(worker);
+ return ret;
+ }
+ worker->id = id;
+ return 0;
+}
+
+static int vhost_kthread_worker_create(struct vhost_worker *worker,
+ struct vhost_dev *dev, const char *name)
+{
+ struct task_struct *task;
+ u32 id;
+ int ret;
+
+ task = kthread_create(vhost_run_work_kthread_list, worker, "%s", name);
+ if (IS_ERR(task))
+ return PTR_ERR(task);
+
+ worker->kthread_task = task;
+ wake_up_process(task);
+ ret = xa_alloc(&dev->worker_xa, &id, worker, xa_limit_32b, GFP_KERNEL);
+ if (ret < 0)
+ goto stop_worker;
+
+ ret = vhost_attach_task_to_cgroups(worker);
+ if (ret)
+ goto stop_worker;
+
+ worker->id = id;
+ return 0;
+
+stop_worker:
+ vhost_kthread_do_stop(worker);
+ return ret;
+}
+
+static const struct vhost_worker_ops kthread_ops = {
+ .create = vhost_kthread_worker_create,
+ .stop = vhost_kthread_do_stop,
+ .wakeup = vhost_kthread_wakeup,
+};
+
+static const struct vhost_worker_ops vhost_task_ops = {
+ .create = vhost_task_worker_create,
+ .stop = vhost_task_do_stop,
+ .wakeup = vhost_task_wakeup,
+};
+
static struct vhost_worker *vhost_worker_create(struct vhost_dev *dev)
{
struct vhost_worker *worker;
- struct vhost_task *vtsk;
char name[TASK_COMM_LEN];
int ret;
- u32 id;
+ const struct vhost_worker_ops *ops = dev->fork_owner ? &vhost_task_ops :
+ &kthread_ops;
worker = kzalloc(sizeof(*worker), GFP_KERNEL_ACCOUNT);
if (!worker)
return NULL;
worker->dev = dev;
+ worker->ops = ops;
snprintf(name, sizeof(name), "vhost-%d", current->pid);
- vtsk = vhost_task_create(vhost_run_work_list, vhost_worker_killed,
- worker, name);
- if (IS_ERR(vtsk))
- goto free_worker;
-
mutex_init(&worker->mutex);
init_llist_head(&worker->work_list);
worker->kcov_handle = kcov_common_handle();
- worker->vtsk = vtsk;
-
- vhost_task_start(vtsk);
-
- ret = xa_alloc(&dev->worker_xa, &id, worker, xa_limit_32b, GFP_KERNEL);
+ ret = ops->create(worker, dev, name);
if (ret < 0)
- goto stop_worker;
- worker->id = id;
+ goto free_worker;
return worker;
-stop_worker:
- vhost_task_stop(vtsk);
free_worker:
kfree(worker);
return NULL;
@@ -731,7 +896,7 @@ static void __vhost_vq_attach_worker(struct vhost_virtqueue *vq,
* We don't want to call synchronize_rcu for every vq during setup
* because it will slow down VM startup. If we haven't done
* VHOST_SET_VRING_KICK and not done the driver specific
- * SET_ENDPOINT/RUNNUNG then we can skip the sync since there will
+ * SET_ENDPOINT/RUNNING then we can skip the sync since there will
* not be any works queued for scsi and net.
*/
mutex_lock(&vq->mutex);
@@ -865,6 +1030,14 @@ long vhost_worker_ioctl(struct vhost_dev *dev, unsigned int ioctl,
switch (ioctl) {
/* dev worker ioctls */
case VHOST_NEW_WORKER:
+ /*
+ * vhost_tasks will account for worker threads under the parent's
+ * NPROC value but kthreads do not. To avoid userspace overflowing
+ * the system with worker threads fork_owner must be true.
+ */
+ if (!dev->fork_owner)
+ return -EFAULT;
+
ret = vhost_new_worker(dev, &state);
if (!ret && copy_to_user(argp, &state, sizeof(state)))
ret = -EFAULT;
@@ -982,6 +1155,7 @@ void vhost_dev_reset_owner(struct vhost_dev *dev, struct vhost_iotlb *umem)
vhost_dev_cleanup(dev);
+ dev->fork_owner = fork_from_owner_default;
dev->umem = umem;
/* We don't need VQ locks below since vhost_dev_cleanup makes sure
* VQs aren't running.
@@ -1990,14 +2164,15 @@ long vhost_vring_ioctl(struct vhost_dev *d, unsigned int ioctl, void __user *arg
break;
}
if (vhost_has_feature(vq, VIRTIO_F_RING_PACKED)) {
- vq->last_avail_idx = s.num & 0xffff;
+ vq->next_avail_head = vq->last_avail_idx =
+ s.num & 0xffff;
vq->last_used_idx = (s.num >> 16) & 0xffff;
} else {
if (s.num > 0xffff) {
r = -EINVAL;
break;
}
- vq->last_avail_idx = s.num;
+ vq->next_avail_head = vq->last_avail_idx = s.num;
}
/* Forget the cached index value. */
vq->avail_idx = vq->last_avail_idx;
@@ -2135,6 +2310,45 @@ long vhost_dev_ioctl(struct vhost_dev *d, unsigned int ioctl, void __user *argp)
goto done;
}
+#ifdef CONFIG_VHOST_ENABLE_FORK_OWNER_CONTROL
+ if (ioctl == VHOST_SET_FORK_FROM_OWNER) {
+ /* Only allow modification before owner is set */
+ if (vhost_dev_has_owner(d)) {
+ r = -EBUSY;
+ goto done;
+ }
+ u8 fork_owner_val;
+
+ if (get_user(fork_owner_val, (u8 __user *)argp)) {
+ r = -EFAULT;
+ goto done;
+ }
+ if (fork_owner_val != VHOST_FORK_OWNER_TASK &&
+ fork_owner_val != VHOST_FORK_OWNER_KTHREAD) {
+ r = -EINVAL;
+ goto done;
+ }
+ d->fork_owner = !!fork_owner_val;
+ r = 0;
+ goto done;
+ }
+ if (ioctl == VHOST_GET_FORK_FROM_OWNER) {
+ u8 fork_owner_val = d->fork_owner;
+
+ if (fork_owner_val != VHOST_FORK_OWNER_TASK &&
+ fork_owner_val != VHOST_FORK_OWNER_KTHREAD) {
+ r = -EINVAL;
+ goto done;
+ }
+ if (put_user(fork_owner_val, (u8 __user *)argp)) {
+ r = -EFAULT;
+ goto done;
+ }
+ r = 0;
+ goto done;
+ }
+#endif
+
/* You must be the owner to do anything else */
r = vhost_dev_check_owner(d);
if (r)
@@ -2590,11 +2804,12 @@ int vhost_get_vq_desc(struct vhost_virtqueue *vq,
unsigned int *out_num, unsigned int *in_num,
struct vhost_log *log, unsigned int *log_num)
{
+ bool in_order = vhost_has_feature(vq, VIRTIO_F_IN_ORDER);
struct vring_desc desc;
unsigned int i, head, found = 0;
u16 last_avail_idx = vq->last_avail_idx;
__virtio16 ring_head;
- int ret, access;
+ int ret, access, c = 0;
if (vq->avail_idx == vq->last_avail_idx) {
ret = vhost_get_avail_idx(vq);
@@ -2605,17 +2820,21 @@ int vhost_get_vq_desc(struct vhost_virtqueue *vq,
return vq->num;
}
- /* Grab the next descriptor number they're advertising, and increment
- * the index we've seen. */
- if (unlikely(vhost_get_avail_head(vq, &ring_head, last_avail_idx))) {
- vq_err(vq, "Failed to read head: idx %d address %p\n",
- last_avail_idx,
- &vq->avail->ring[last_avail_idx % vq->num]);
- return -EFAULT;
+ if (in_order)
+ head = vq->next_avail_head & (vq->num - 1);
+ else {
+ /* Grab the next descriptor number they're
+ * advertising, and increment the index we've seen. */
+ if (unlikely(vhost_get_avail_head(vq, &ring_head,
+ last_avail_idx))) {
+ vq_err(vq, "Failed to read head: idx %d address %p\n",
+ last_avail_idx,
+ &vq->avail->ring[last_avail_idx % vq->num]);
+ return -EFAULT;
+ }
+ head = vhost16_to_cpu(vq, ring_head);
}
- head = vhost16_to_cpu(vq, ring_head);
-
/* If their number is silly, that's an error. */
if (unlikely(head >= vq->num)) {
vq_err(vq, "Guest says index %u > %u is available",
@@ -2658,6 +2877,7 @@ int vhost_get_vq_desc(struct vhost_virtqueue *vq,
"in indirect descriptor at idx %d\n", i);
return ret;
}
+ ++c;
continue;
}
@@ -2693,10 +2913,12 @@ int vhost_get_vq_desc(struct vhost_virtqueue *vq,
}
*out_num += ret;
}
+ ++c;
} while ((i = next_desc(vq, &desc)) != -1);
/* On success, increment avail index. */
vq->last_avail_idx++;
+ vq->next_avail_head += c;
/* Assume notifications from guest are disabled at this point,
* if they aren't we would need to update avail_event index. */
@@ -2720,8 +2942,9 @@ int vhost_add_used(struct vhost_virtqueue *vq, unsigned int head, int len)
cpu_to_vhost32(vq, head),
cpu_to_vhost32(vq, len)
};
+ u16 nheads = 1;
- return vhost_add_used_n(vq, &heads, 1);
+ return vhost_add_used_n(vq, &heads, &nheads, 1);
}
EXPORT_SYMBOL_GPL(vhost_add_used);
@@ -2757,10 +2980,9 @@ static int __vhost_add_used_n(struct vhost_virtqueue *vq,
return 0;
}
-/* After we've used one of their buffers, we tell them about it. We'll then
- * want to notify the guest, using eventfd. */
-int vhost_add_used_n(struct vhost_virtqueue *vq, struct vring_used_elem *heads,
- unsigned count)
+static int vhost_add_used_n_ooo(struct vhost_virtqueue *vq,
+ struct vring_used_elem *heads,
+ unsigned count)
{
int start, n, r;
@@ -2773,7 +2995,72 @@ int vhost_add_used_n(struct vhost_virtqueue *vq, struct vring_used_elem *heads,
heads += n;
count -= n;
}
- r = __vhost_add_used_n(vq, heads, count);
+ return __vhost_add_used_n(vq, heads, count);
+}
+
+static int vhost_add_used_n_in_order(struct vhost_virtqueue *vq,
+ struct vring_used_elem *heads,
+ const u16 *nheads,
+ unsigned count)
+{
+ vring_used_elem_t __user *used;
+ u16 old, new = vq->last_used_idx;
+ int start, i;
+
+ if (!nheads)
+ return -EINVAL;
+
+ start = vq->last_used_idx & (vq->num - 1);
+ used = vq->used->ring + start;
+
+ for (i = 0; i < count; i++) {
+ if (vhost_put_used(vq, &heads[i], start, 1)) {
+ vq_err(vq, "Failed to write used");
+ return -EFAULT;
+ }
+ start += nheads[i];
+ new += nheads[i];
+ if (start >= vq->num)
+ start -= vq->num;
+ }
+
+ if (unlikely(vq->log_used)) {
+ /* Make sure data is seen before log. */
+ smp_wmb();
+ /* Log used ring entry write. */
+ log_used(vq, ((void __user *)used - (void __user *)vq->used),
+ (vq->num - start) * sizeof *used);
+ if (start + count > vq->num)
+ log_used(vq, 0,
+ (start + count - vq->num) * sizeof *used);
+ }
+
+ old = vq->last_used_idx;
+ vq->last_used_idx = new;
+ /* If the driver never bothers to signal in a very long while,
+ * used index might wrap around. If that happens, invalidate
+ * signalled_used index we stored. TODO: make sure driver
+ * signals at least once in 2^16 and remove this. */
+ if (unlikely((u16)(new - vq->signalled_used) < (u16)(new - old)))
+ vq->signalled_used_valid = false;
+ return 0;
+}
+
+/* After we've used one of their buffers, we tell them about it. We'll then
+ * want to notify the guest, using eventfd. */
+int vhost_add_used_n(struct vhost_virtqueue *vq, struct vring_used_elem *heads,
+ u16 *nheads, unsigned count)
+{
+ bool in_order = vhost_has_feature(vq, VIRTIO_F_IN_ORDER);
+ int r;
+
+ if (!in_order || !nheads)
+ r = vhost_add_used_n_ooo(vq, heads, count);
+ else
+ r = vhost_add_used_n_in_order(vq, heads, nheads, count);
+
+ if (r < 0)
+ return r;
/* Make sure buffer is written before we update index. */
smp_wmb();
@@ -2853,14 +3140,16 @@ EXPORT_SYMBOL_GPL(vhost_add_used_and_signal);
/* multi-buffer version of vhost_add_used_and_signal */
void vhost_add_used_and_signal_n(struct vhost_dev *dev,
struct vhost_virtqueue *vq,
- struct vring_used_elem *heads, unsigned count)
+ struct vring_used_elem *heads,
+ u16 *nheads,
+ unsigned count)
{
- vhost_add_used_n(vq, heads, count);
+ vhost_add_used_n(vq, heads, nheads, count);
vhost_signal(dev, vq);
}
EXPORT_SYMBOL_GPL(vhost_add_used_and_signal_n);
-/* return true if we're sure that avaiable ring is empty */
+/* return true if we're sure that available ring is empty */
bool vhost_vq_avail_empty(struct vhost_dev *dev, struct vhost_virtqueue *vq)
{
int r;
diff --git a/drivers/vhost/vhost.h b/drivers/vhost/vhost.h
index d1aed35c4b07..621a6d9a8791 100644
--- a/drivers/vhost/vhost.h
+++ b/drivers/vhost/vhost.h
@@ -26,7 +26,18 @@ struct vhost_work {
unsigned long flags;
};
+struct vhost_worker;
+struct vhost_dev;
+
+struct vhost_worker_ops {
+ int (*create)(struct vhost_worker *worker, struct vhost_dev *dev,
+ const char *name);
+ void (*stop)(struct vhost_worker *worker);
+ void (*wakeup)(struct vhost_worker *worker);
+};
+
struct vhost_worker {
+ struct task_struct *kthread_task;
struct vhost_task *vtsk;
struct vhost_dev *dev;
/* Used to serialize device wide flushing with worker swapping. */
@@ -36,6 +47,7 @@ struct vhost_worker {
u32 id;
int attachment_cnt;
bool killed;
+ const struct vhost_worker_ops *ops;
};
/* Poll a file (eventfd or socket) */
@@ -103,6 +115,8 @@ struct vhost_virtqueue {
* Values are limited to 0x7fff, and the high bit is used as
* a wrap counter when using VIRTIO_F_RING_PACKED. */
u16 last_avail_idx;
+ /* Next avail ring head when VIRTIO_F_IN_ORDER is negoitated */
+ u16 next_avail_head;
/* Caches available index value from user. */
u16 avail_idx;
@@ -129,6 +143,7 @@ struct vhost_virtqueue {
struct iovec iotlb_iov[64];
struct iovec *indirect;
struct vring_used_elem *heads;
+ u16 *nheads;
/* Protected by virtqueue mutex. */
struct vhost_iotlb *umem;
struct vhost_iotlb *iotlb;
@@ -176,6 +191,16 @@ struct vhost_dev {
int byte_weight;
struct xarray worker_xa;
bool use_worker;
+ /*
+ * If fork_owner is true we use vhost_tasks to create
+ * the worker so all settings/limits like cgroups, NPROC,
+ * scheduler, etc are inherited from the owner. If false,
+ * we use kthreads and only attach to the same cgroups
+ * as the owner for compat with older kernels.
+ * here we use true as default value.
+ * The default value is set by fork_from_owner_default
+ */
+ bool fork_owner;
int (*msg_handler)(struct vhost_dev *dev, u32 asid,
struct vhost_iotlb_msg *msg);
};
@@ -213,11 +238,12 @@ bool vhost_vq_is_setup(struct vhost_virtqueue *vq);
int vhost_vq_init_access(struct vhost_virtqueue *);
int vhost_add_used(struct vhost_virtqueue *, unsigned int head, int len);
int vhost_add_used_n(struct vhost_virtqueue *, struct vring_used_elem *heads,
- unsigned count);
+ u16 *nheads, unsigned count);
void vhost_add_used_and_signal(struct vhost_dev *, struct vhost_virtqueue *,
unsigned int id, int len);
void vhost_add_used_and_signal_n(struct vhost_dev *, struct vhost_virtqueue *,
- struct vring_used_elem *heads, unsigned count);
+ struct vring_used_elem *heads, u16 *nheads,
+ unsigned count);
void vhost_signal(struct vhost_dev *, struct vhost_virtqueue *);
void vhost_disable_notify(struct vhost_dev *, struct vhost_virtqueue *);
bool vhost_vq_avail_empty(struct vhost_dev *, struct vhost_virtqueue *);
diff --git a/drivers/vhost/vringh.c b/drivers/vhost/vringh.c
index bbce65452701..9f27c3f6091b 100644
--- a/drivers/vhost/vringh.c
+++ b/drivers/vhost/vringh.c
@@ -780,22 +780,6 @@ ssize_t vringh_iov_push_user(struct vringh_iov *wiov,
EXPORT_SYMBOL(vringh_iov_push_user);
/**
- * vringh_abandon_user - we've decided not to handle the descriptor(s).
- * @vrh: the vring.
- * @num: the number of descriptors to put back (ie. num
- * vringh_get_user() to undo).
- *
- * The next vringh_get_user() will return the old descriptor(s) again.
- */
-void vringh_abandon_user(struct vringh *vrh, unsigned int num)
-{
- /* We only update vring_avail_event(vr) when we want to be notified,
- * so we haven't changed that yet. */
- vrh->last_avail_idx -= num;
-}
-EXPORT_SYMBOL(vringh_abandon_user);
-
-/**
* vringh_complete_user - we've finished with descriptor, publish it.
* @vrh: the vring.
* @head: the head as filled in by vringh_getdesc_user.
@@ -900,20 +884,6 @@ static inline int putused_kern(const struct vringh *vrh,
return 0;
}
-static inline int xfer_kern(const struct vringh *vrh, void *src,
- void *dst, size_t len)
-{
- memcpy(dst, src, len);
- return 0;
-}
-
-static inline int kern_xfer(const struct vringh *vrh, void *dst,
- void *src, size_t len)
-{
- memcpy(dst, src, len);
- return 0;
-}
-
/**
* vringh_init_kern - initialize a vringh for a kernelspace vring.
* @vrh: the vringh to initialize.
@@ -999,51 +969,6 @@ int vringh_getdesc_kern(struct vringh *vrh,
EXPORT_SYMBOL(vringh_getdesc_kern);
/**
- * vringh_iov_pull_kern - copy bytes from vring_iov.
- * @riov: the riov as passed to vringh_getdesc_kern() (updated as we consume)
- * @dst: the place to copy.
- * @len: the maximum length to copy.
- *
- * Returns the bytes copied <= len or a negative errno.
- */
-ssize_t vringh_iov_pull_kern(struct vringh_kiov *riov, void *dst, size_t len)
-{
- return vringh_iov_xfer(NULL, riov, dst, len, xfer_kern);
-}
-EXPORT_SYMBOL(vringh_iov_pull_kern);
-
-/**
- * vringh_iov_push_kern - copy bytes into vring_iov.
- * @wiov: the wiov as passed to vringh_getdesc_kern() (updated as we consume)
- * @src: the place to copy from.
- * @len: the maximum length to copy.
- *
- * Returns the bytes copied <= len or a negative errno.
- */
-ssize_t vringh_iov_push_kern(struct vringh_kiov *wiov,
- const void *src, size_t len)
-{
- return vringh_iov_xfer(NULL, wiov, (void *)src, len, kern_xfer);
-}
-EXPORT_SYMBOL(vringh_iov_push_kern);
-
-/**
- * vringh_abandon_kern - we've decided not to handle the descriptor(s).
- * @vrh: the vring.
- * @num: the number of descriptors to put back (ie. num
- * vringh_get_kern() to undo).
- *
- * The next vringh_get_kern() will return the old descriptor(s) again.
- */
-void vringh_abandon_kern(struct vringh *vrh, unsigned int num)
-{
- /* We only update vring_avail_event(vr) when we want to be notified,
- * so we haven't changed that yet. */
- vrh->last_avail_idx -= num;
-}
-EXPORT_SYMBOL(vringh_abandon_kern);
-
-/**
* vringh_complete_kern - we've finished with descriptor, publish it.
* @vrh: the vring.
* @head: the head as filled in by vringh_getdesc_kern.
@@ -1535,23 +1460,6 @@ ssize_t vringh_iov_push_iotlb(struct vringh *vrh,
EXPORT_SYMBOL(vringh_iov_push_iotlb);
/**
- * vringh_abandon_iotlb - we've decided not to handle the descriptor(s).
- * @vrh: the vring.
- * @num: the number of descriptors to put back (ie. num
- * vringh_get_iotlb() to undo).
- *
- * The next vringh_get_iotlb() will return the old descriptor(s) again.
- */
-void vringh_abandon_iotlb(struct vringh *vrh, unsigned int num)
-{
- /* We only update vring_avail_event(vr) when we want to be notified,
- * so we haven't changed that yet.
- */
- vrh->last_avail_idx -= num;
-}
-EXPORT_SYMBOL(vringh_abandon_iotlb);
-
-/**
* vringh_complete_iotlb - we've finished with descriptor, publish it.
* @vrh: the vring.
* @head: the head as filled in by vringh_getdesc_iotlb.
@@ -1572,32 +1480,6 @@ int vringh_complete_iotlb(struct vringh *vrh, u16 head, u32 len)
EXPORT_SYMBOL(vringh_complete_iotlb);
/**
- * vringh_notify_enable_iotlb - we want to know if something changes.
- * @vrh: the vring.
- *
- * This always enables notifications, but returns false if there are
- * now more buffers available in the vring.
- */
-bool vringh_notify_enable_iotlb(struct vringh *vrh)
-{
- return __vringh_notify_enable(vrh, getu16_iotlb, putu16_iotlb);
-}
-EXPORT_SYMBOL(vringh_notify_enable_iotlb);
-
-/**
- * vringh_notify_disable_iotlb - don't tell us if something changes.
- * @vrh: the vring.
- *
- * This is our normal running state: we disable and then only enable when
- * we're going to sleep.
- */
-void vringh_notify_disable_iotlb(struct vringh *vrh)
-{
- __vringh_notify_disable(vrh, putu16_iotlb);
-}
-EXPORT_SYMBOL(vringh_notify_disable_iotlb);
-
-/**
* vringh_need_notify_iotlb - must we tell the other side about used buffers?
* @vrh: the vring we've called vringh_complete_iotlb() on.
*
diff --git a/drivers/vhost/vsock.c b/drivers/vhost/vsock.c
index 802153e23073..ae01457ea2cd 100644
--- a/drivers/vhost/vsock.c
+++ b/drivers/vhost/vsock.c
@@ -344,6 +344,10 @@ vhost_vsock_alloc_skb(struct vhost_virtqueue *vq,
len = iov_length(vq->iov, out);
+ if (len < VIRTIO_VSOCK_SKB_HEADROOM ||
+ len > VIRTIO_VSOCK_MAX_PKT_BUF_SIZE + VIRTIO_VSOCK_SKB_HEADROOM)
+ return NULL;
+
/* len contains both payload and hdr */
skb = virtio_vsock_alloc_skb(len, GFP_KERNEL);
if (!skb)
@@ -367,18 +371,15 @@ vhost_vsock_alloc_skb(struct vhost_virtqueue *vq,
return skb;
/* The pkt is too big or the length in the header is invalid */
- if (payload_len > VIRTIO_VSOCK_MAX_PKT_BUF_SIZE ||
- payload_len + sizeof(*hdr) > len) {
+ if (payload_len + sizeof(*hdr) > len) {
kfree_skb(skb);
return NULL;
}
- virtio_vsock_skb_rx_put(skb);
+ virtio_vsock_skb_put(skb, payload_len);
- nbytes = copy_from_iter(skb->data, payload_len, &iov_iter);
- if (nbytes != payload_len) {
- vq_err(vq, "Expected %zu byte payload, got %zu bytes\n",
- payload_len, nbytes);
+ if (skb_copy_datagram_from_iter(skb, 0, &iov_iter, payload_len)) {
+ vq_err(vq, "Failed to copy %zu byte payload\n", payload_len);
kfree_skb(skb);
return NULL;
}
diff --git a/drivers/virtio/virtio.c b/drivers/virtio/virtio.c
index 5c48788cdbec..a09eb4d62f82 100644
--- a/drivers/virtio/virtio.c
+++ b/drivers/virtio/virtio.c
@@ -147,7 +147,7 @@ EXPORT_SYMBOL_GPL(virtio_config_changed);
/**
* virtio_config_driver_disable - disable config change reporting by drivers
- * @dev: the device to reset
+ * @dev: the device to disable
*
* This is only allowed to be called by a driver and disabling can't
* be nested.
@@ -162,7 +162,7 @@ EXPORT_SYMBOL_GPL(virtio_config_driver_disable);
/**
* virtio_config_driver_enable - enable config change reporting by drivers
- * @dev: the device to reset
+ * @dev: the device to enable
*
* This is only allowed to be called by a driver and enabling can't
* be nested.
@@ -512,7 +512,7 @@ out:
* On error, the caller must call put_device on &@dev->dev (and not kfree),
* as another code path may have obtained a reference to @dev.
*
- * Returns: 0 on suceess, -error on failure
+ * Returns: 0 on success, -error on failure
*/
int register_virtio_device(struct virtio_device *dev)
{
@@ -536,6 +536,7 @@ int register_virtio_device(struct virtio_device *dev)
goto out_ida_remove;
spin_lock_init(&dev->config_lock);
+ dev->config_driver_disabled = false;
dev->config_core_enabled = false;
dev->config_change_pending = false;
diff --git a/drivers/virtio/virtio_dma_buf.c b/drivers/virtio/virtio_dma_buf.c
index 3fe1d03b0645..95c10632f84a 100644
--- a/drivers/virtio/virtio_dma_buf.c
+++ b/drivers/virtio/virtio_dma_buf.c
@@ -36,6 +36,8 @@ EXPORT_SYMBOL(virtio_dma_buf_export);
/**
* virtio_dma_buf_attach - mandatory attach callback for virtio dma-bufs
+ * @dma_buf: [in] buffer to attach
+ * @attach: [in] attachment structure
*/
int virtio_dma_buf_attach(struct dma_buf *dma_buf,
struct dma_buf_attachment *attach)
diff --git a/drivers/virtio/virtio_mmio.c b/drivers/virtio/virtio_mmio.c
index 5d78c2d572ab..b152a1eca05a 100644
--- a/drivers/virtio/virtio_mmio.c
+++ b/drivers/virtio/virtio_mmio.c
@@ -65,7 +65,6 @@
#include <linux/platform_device.h>
#include <linux/pm.h>
#include <linux/slab.h>
-#include <linux/spinlock.h>
#include <linux/virtio.h>
#include <linux/virtio_config.h>
#include <uapi/linux/virtio_mmio.h>
@@ -88,22 +87,8 @@ struct virtio_mmio_device {
void __iomem *base;
unsigned long version;
-
- /* a list of queues so we can dispatch IRQs */
- spinlock_t lock;
- struct list_head virtqueues;
-};
-
-struct virtio_mmio_vq_info {
- /* the actual virtqueue */
- struct virtqueue *vq;
-
- /* the list node for the virtqueues list */
- struct list_head node;
};
-
-
/* Configuration interface */
static u64 vm_get_features(struct virtio_device *vdev)
@@ -300,9 +285,8 @@ static bool vm_notify_with_data(struct virtqueue *vq)
static irqreturn_t vm_interrupt(int irq, void *opaque)
{
struct virtio_mmio_device *vm_dev = opaque;
- struct virtio_mmio_vq_info *info;
+ struct virtqueue *vq;
unsigned long status;
- unsigned long flags;
irqreturn_t ret = IRQ_NONE;
/* Read and acknowledge interrupts */
@@ -315,10 +299,8 @@ static irqreturn_t vm_interrupt(int irq, void *opaque)
}
if (likely(status & VIRTIO_MMIO_INT_VRING)) {
- spin_lock_irqsave(&vm_dev->lock, flags);
- list_for_each_entry(info, &vm_dev->virtqueues, node)
- ret |= vring_interrupt(irq, info->vq);
- spin_unlock_irqrestore(&vm_dev->lock, flags);
+ virtio_device_for_each_vq(&vm_dev->vdev, vq)
+ ret |= vring_interrupt(irq, vq);
}
return ret;
@@ -329,14 +311,8 @@ static irqreturn_t vm_interrupt(int irq, void *opaque)
static void vm_del_vq(struct virtqueue *vq)
{
struct virtio_mmio_device *vm_dev = to_virtio_mmio_device(vq->vdev);
- struct virtio_mmio_vq_info *info = vq->priv;
- unsigned long flags;
unsigned int index = vq->index;
- spin_lock_irqsave(&vm_dev->lock, flags);
- list_del(&info->node);
- spin_unlock_irqrestore(&vm_dev->lock, flags);
-
/* Select and deactivate the queue */
writel(index, vm_dev->base + VIRTIO_MMIO_QUEUE_SEL);
if (vm_dev->version == 1) {
@@ -347,8 +323,6 @@ static void vm_del_vq(struct virtqueue *vq)
}
vring_del_virtqueue(vq);
-
- kfree(info);
}
static void vm_del_vqs(struct virtio_device *vdev)
@@ -375,9 +349,7 @@ static struct virtqueue *vm_setup_vq(struct virtio_device *vdev, unsigned int in
{
struct virtio_mmio_device *vm_dev = to_virtio_mmio_device(vdev);
bool (*notify)(struct virtqueue *vq);
- struct virtio_mmio_vq_info *info;
struct virtqueue *vq;
- unsigned long flags;
unsigned int num;
int err;
@@ -399,13 +371,6 @@ static struct virtqueue *vm_setup_vq(struct virtio_device *vdev, unsigned int in
goto error_available;
}
- /* Allocate and fill out our active queue description */
- info = kmalloc(sizeof(*info), GFP_KERNEL);
- if (!info) {
- err = -ENOMEM;
- goto error_kmalloc;
- }
-
num = readl(vm_dev->base + VIRTIO_MMIO_QUEUE_NUM_MAX);
if (num == 0) {
err = -ENOENT;
@@ -463,13 +428,6 @@ static struct virtqueue *vm_setup_vq(struct virtio_device *vdev, unsigned int in
writel(1, vm_dev->base + VIRTIO_MMIO_QUEUE_READY);
}
- vq->priv = info;
- info->vq = vq;
-
- spin_lock_irqsave(&vm_dev->lock, flags);
- list_add(&info->node, &vm_dev->virtqueues);
- spin_unlock_irqrestore(&vm_dev->lock, flags);
-
return vq;
error_bad_pfn:
@@ -481,8 +439,6 @@ error_new_virtqueue:
writel(0, vm_dev->base + VIRTIO_MMIO_QUEUE_READY);
WARN_ON(readl(vm_dev->base + VIRTIO_MMIO_QUEUE_READY));
}
- kfree(info);
-error_kmalloc:
error_available:
return ERR_PTR(err);
}
@@ -627,8 +583,6 @@ static int virtio_mmio_probe(struct platform_device *pdev)
vm_dev->vdev.dev.release = virtio_mmio_release_dev;
vm_dev->vdev.config = &virtio_mmio_config_ops;
vm_dev->pdev = pdev;
- INIT_LIST_HEAD(&vm_dev->virtqueues);
- spin_lock_init(&vm_dev->lock);
vm_dev->base = devm_platform_ioremap_resource(pdev, 0);
if (IS_ERR(vm_dev->base)) {
diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c
index 4397392bfef0..f5062061c408 100644
--- a/drivers/virtio/virtio_ring.c
+++ b/drivers/virtio/virtio_ring.c
@@ -2296,6 +2296,10 @@ static inline int virtqueue_add(struct virtqueue *_vq,
* at the same time (except where noted).
*
* Returns zero or a negative error (ie. ENOSPC, ENOMEM, EIO).
+ *
+ * NB: ENOSPC is a special code that is only returned on an attempt to add a
+ * buffer to a full VQ. It indicates that some buffers are outstanding and that
+ * the operation can be retried after some buffers have been used.
*/
int virtqueue_add_sgs(struct virtqueue *_vq,
struct scatterlist *sgs[],
diff --git a/drivers/virtio/virtio_vdpa.c b/drivers/virtio/virtio_vdpa.c
index a7b297dae489..657b07a60788 100644
--- a/drivers/virtio/virtio_vdpa.c
+++ b/drivers/virtio/virtio_vdpa.c
@@ -28,19 +28,6 @@ struct virtio_vdpa_device {
struct virtio_device vdev;
struct vdpa_device *vdpa;
u64 features;
-
- /* The lock to protect virtqueue list */
- spinlock_t lock;
- /* List of virtio_vdpa_vq_info */
- struct list_head virtqueues;
-};
-
-struct virtio_vdpa_vq_info {
- /* the actual virtqueue */
- struct virtqueue *vq;
-
- /* the list node for the virtqueues list */
- struct list_head node;
};
static inline struct virtio_vdpa_device *
@@ -135,9 +122,9 @@ static irqreturn_t virtio_vdpa_config_cb(void *private)
static irqreturn_t virtio_vdpa_virtqueue_cb(void *private)
{
- struct virtio_vdpa_vq_info *info = private;
+ struct virtqueue *vq = private;
- return vring_interrupt(0, info->vq);
+ return vring_interrupt(0, vq);
}
static struct virtqueue *
@@ -145,18 +132,15 @@ virtio_vdpa_setup_vq(struct virtio_device *vdev, unsigned int index,
void (*callback)(struct virtqueue *vq),
const char *name, bool ctx)
{
- struct virtio_vdpa_device *vd_dev = to_virtio_vdpa_device(vdev);
struct vdpa_device *vdpa = vd_get_vdpa(vdev);
struct device *dma_dev;
const struct vdpa_config_ops *ops = vdpa->config;
- struct virtio_vdpa_vq_info *info;
bool (*notify)(struct virtqueue *vq) = virtio_vdpa_notify;
struct vdpa_callback cb;
struct virtqueue *vq;
u64 desc_addr, driver_addr, device_addr;
/* Assume split virtqueue, switch to packed if necessary */
struct vdpa_vq_state state = {0};
- unsigned long flags;
u32 align, max_num, min_num = 1;
bool may_reduce_num = true;
int err;
@@ -179,10 +163,6 @@ virtio_vdpa_setup_vq(struct virtio_device *vdev, unsigned int index,
if (ops->get_vq_ready(vdpa, index))
return ERR_PTR(-ENOENT);
- /* Allocate and fill out our active queue description */
- info = kmalloc(sizeof(*info), GFP_KERNEL);
- if (!info)
- return ERR_PTR(-ENOMEM);
if (ops->get_vq_size)
max_num = ops->get_vq_size(vdpa, index);
else
@@ -217,7 +197,7 @@ virtio_vdpa_setup_vq(struct virtio_device *vdev, unsigned int index,
/* Setup virtqueue callback */
cb.callback = callback ? virtio_vdpa_virtqueue_cb : NULL;
- cb.private = info;
+ cb.private = vq;
cb.trigger = NULL;
ops->set_vq_cb(vdpa, index, &cb);
ops->set_vq_num(vdpa, index, virtqueue_get_vring_size(vq));
@@ -248,13 +228,6 @@ virtio_vdpa_setup_vq(struct virtio_device *vdev, unsigned int index,
ops->set_vq_ready(vdpa, index, 1);
- vq->priv = info;
- info->vq = vq;
-
- spin_lock_irqsave(&vd_dev->lock, flags);
- list_add(&info->node, &vd_dev->virtqueues);
- spin_unlock_irqrestore(&vd_dev->lock, flags);
-
return vq;
err_vq:
@@ -263,7 +236,6 @@ error_new_virtqueue:
ops->set_vq_ready(vdpa, index, 0);
/* VDPA driver should make sure vq is stopeed here */
WARN_ON(ops->get_vq_ready(vdpa, index));
- kfree(info);
return ERR_PTR(err);
}
@@ -272,20 +244,12 @@ static void virtio_vdpa_del_vq(struct virtqueue *vq)
struct virtio_vdpa_device *vd_dev = to_virtio_vdpa_device(vq->vdev);
struct vdpa_device *vdpa = vd_dev->vdpa;
const struct vdpa_config_ops *ops = vdpa->config;
- struct virtio_vdpa_vq_info *info = vq->priv;
unsigned int index = vq->index;
- unsigned long flags;
-
- spin_lock_irqsave(&vd_dev->lock, flags);
- list_del(&info->node);
- spin_unlock_irqrestore(&vd_dev->lock, flags);
/* Select and deactivate the queue (best effort) */
ops->set_vq_ready(vdpa, index, 0);
vring_del_virtqueue(vq);
-
- kfree(info);
}
static void virtio_vdpa_del_vqs(struct virtio_device *vdev)
@@ -502,8 +466,6 @@ static int virtio_vdpa_probe(struct vdpa_device *vdpa)
vd_dev->vdev.dev.release = virtio_vdpa_release_dev;
vd_dev->vdev.config = &virtio_vdpa_config_ops;
vd_dev->vdpa = vdpa;
- INIT_LIST_HEAD(&vd_dev->virtqueues);
- spin_lock_init(&vd_dev->lock);
vd_dev->vdev.id.device = ops->get_device_id(vdpa);
if (vd_dev->vdev.id.device == 0)
diff --git a/include/linux/virtio.h b/include/linux/virtio.h
index 04b90c88d164..db31fc6f4f1f 100644
--- a/include/linux/virtio.h
+++ b/include/linux/virtio.h
@@ -199,7 +199,7 @@ int virtio_device_reset_done(struct virtio_device *dev);
size_t virtio_max_dma_size(const struct virtio_device *vdev);
#define virtio_device_for_each_vq(vdev, vq) \
- list_for_each_entry(vq, &vdev->vqs, list)
+ list_for_each_entry(vq, &(vdev)->vqs, list)
/**
* struct virtio_driver - operations for a virtio I/O driver
diff --git a/include/linux/virtio_vsock.h b/include/linux/virtio_vsock.h
index 36fb3edfa403..0c67543a45c8 100644
--- a/include/linux/virtio_vsock.h
+++ b/include/linux/virtio_vsock.h
@@ -47,31 +47,50 @@ static inline void virtio_vsock_skb_clear_tap_delivered(struct sk_buff *skb)
VIRTIO_VSOCK_SKB_CB(skb)->tap_delivered = false;
}
-static inline void virtio_vsock_skb_rx_put(struct sk_buff *skb)
+static inline void virtio_vsock_skb_put(struct sk_buff *skb, u32 len)
{
- u32 len;
+ DEBUG_NET_WARN_ON_ONCE(skb->len);
- len = le32_to_cpu(virtio_vsock_hdr(skb)->len);
-
- if (len > 0)
+ if (skb_is_nonlinear(skb))
+ skb->len = len;
+ else
skb_put(skb, len);
}
-static inline struct sk_buff *virtio_vsock_alloc_skb(unsigned int size, gfp_t mask)
+static inline struct sk_buff *
+__virtio_vsock_alloc_skb_with_frags(unsigned int header_len,
+ unsigned int data_len,
+ gfp_t mask)
{
struct sk_buff *skb;
+ int err;
- if (size < VIRTIO_VSOCK_SKB_HEADROOM)
- return NULL;
-
- skb = alloc_skb(size, mask);
+ skb = alloc_skb_with_frags(header_len, data_len,
+ PAGE_ALLOC_COSTLY_ORDER, &err, mask);
if (!skb)
return NULL;
skb_reserve(skb, VIRTIO_VSOCK_SKB_HEADROOM);
+ skb->data_len = data_len;
return skb;
}
+static inline struct sk_buff *
+virtio_vsock_alloc_linear_skb(unsigned int size, gfp_t mask)
+{
+ return __virtio_vsock_alloc_skb_with_frags(size, 0, mask);
+}
+
+static inline struct sk_buff *virtio_vsock_alloc_skb(unsigned int size, gfp_t mask)
+{
+ if (size <= SKB_WITH_OVERHEAD(PAGE_SIZE << PAGE_ALLOC_COSTLY_ORDER))
+ return virtio_vsock_alloc_linear_skb(size, mask);
+
+ size -= VIRTIO_VSOCK_SKB_HEADROOM;
+ return __virtio_vsock_alloc_skb_with_frags(VIRTIO_VSOCK_SKB_HEADROOM,
+ size, mask);
+}
+
static inline void
virtio_vsock_skb_queue_head(struct sk_buff_head *list, struct sk_buff *skb)
{
@@ -111,7 +130,12 @@ static inline size_t virtio_vsock_skb_len(struct sk_buff *skb)
return (size_t)(skb_end_pointer(skb) - skb->head);
}
-#define VIRTIO_VSOCK_DEFAULT_RX_BUF_SIZE (1024 * 4)
+/* Dimension the RX SKB so that the entire thing fits exactly into
+ * a single 4KiB page. This avoids wasting memory due to alloc_skb()
+ * rounding up to the next page order and also means that we
+ * don't leave higher-order pages sitting around in the RX queue.
+ */
+#define VIRTIO_VSOCK_DEFAULT_RX_BUF_SIZE SKB_WITH_OVERHEAD(1024 * 4)
#define VIRTIO_VSOCK_MAX_BUF_SIZE 0xFFFFFFFFUL
#define VIRTIO_VSOCK_MAX_PKT_BUF_SIZE (1024 * 64)
diff --git a/include/linux/vringh.h b/include/linux/vringh.h
index c3a8117dabe8..49e7cbc9697a 100644
--- a/include/linux/vringh.h
+++ b/include/linux/vringh.h
@@ -175,9 +175,6 @@ int vringh_complete_multi_user(struct vringh *vrh,
const struct vring_used_elem used[],
unsigned num_used);
-/* Pretend we've never seen descriptor (for easy error handling). */
-void vringh_abandon_user(struct vringh *vrh, unsigned int num);
-
/* Do we need to fire the eventfd to notify the other side? */
int vringh_need_notify_user(struct vringh *vrh);
@@ -235,10 +232,6 @@ int vringh_getdesc_kern(struct vringh *vrh,
u16 *head,
gfp_t gfp);
-ssize_t vringh_iov_pull_kern(struct vringh_kiov *riov, void *dst, size_t len);
-ssize_t vringh_iov_push_kern(struct vringh_kiov *wiov,
- const void *src, size_t len);
-void vringh_abandon_kern(struct vringh *vrh, unsigned int num);
int vringh_complete_kern(struct vringh *vrh, u16 head, u32 len);
bool vringh_notify_enable_kern(struct vringh *vrh);
@@ -319,13 +312,8 @@ ssize_t vringh_iov_push_iotlb(struct vringh *vrh,
struct vringh_kiov *wiov,
const void *src, size_t len);
-void vringh_abandon_iotlb(struct vringh *vrh, unsigned int num);
-
int vringh_complete_iotlb(struct vringh *vrh, u16 head, u32 len);
-bool vringh_notify_enable_iotlb(struct vringh *vrh);
-void vringh_notify_disable_iotlb(struct vringh *vrh);
-
int vringh_need_notify_iotlb(struct vringh *vrh);
#endif /* CONFIG_VHOST_IOTLB */
diff --git a/include/uapi/linux/vhost.h b/include/uapi/linux/vhost.h
index d6ad01fbb8d2..283348b64af9 100644
--- a/include/uapi/linux/vhost.h
+++ b/include/uapi/linux/vhost.h
@@ -242,4 +242,32 @@
#define VHOST_SET_FEATURES_ARRAY _IOW(VHOST_VIRTIO, 0x83, \
struct vhost_features_array)
+/* fork_owner values for vhost */
+#define VHOST_FORK_OWNER_KTHREAD 0
+#define VHOST_FORK_OWNER_TASK 1
+
+/**
+ * VHOST_SET_FORK_FROM_OWNER - Set the fork_owner flag for the vhost device,
+ * This ioctl must called before VHOST_SET_OWNER.
+ * Only available when CONFIG_VHOST_ENABLE_FORK_OWNER_CONTROL=y
+ *
+ * @param fork_owner: An 8-bit value that determines the vhost thread mode
+ *
+ * When fork_owner is set to VHOST_FORK_OWNER_TASK(default value):
+ * - Vhost will create vhost worker as tasks forked from the owner,
+ * inheriting all of the owner's attributes.
+ *
+ * When fork_owner is set to VHOST_FORK_OWNER_KTHREAD:
+ * - Vhost will create vhost workers as kernel threads.
+ */
+#define VHOST_SET_FORK_FROM_OWNER _IOW(VHOST_VIRTIO, 0x83, __u8)
+
+/**
+ * VHOST_GET_FORK_OWNER - Get the current fork_owner flag for the vhost device.
+ * Only available when CONFIG_VHOST_ENABLE_FORK_OWNER_CONTROL=y
+ *
+ * @return: An 8-bit value indicating the current thread mode.
+ */
+#define VHOST_GET_FORK_FROM_OWNER _IOR(VHOST_VIRTIO, 0x84, __u8)
+
#endif
diff --git a/kernel/vhost_task.c b/kernel/vhost_task.c
index 2f844c279a3e..bc738fa90c1d 100644
--- a/kernel/vhost_task.c
+++ b/kernel/vhost_task.c
@@ -145,7 +145,7 @@ struct vhost_task *vhost_task_create(bool (*fn)(void *),
tsk = copy_process(NULL, 0, NUMA_NO_NODE, &args);
if (IS_ERR(tsk)) {
kfree(vtsk);
- return ERR_PTR(PTR_ERR(tsk));
+ return ERR_CAST(tsk);
}
vtsk->task = tsk;
diff --git a/net/vmw_vsock/virtio_transport.c b/net/vmw_vsock/virtio_transport.c
index f0e48e6911fc..b6569b0ca2bb 100644
--- a/net/vmw_vsock/virtio_transport.c
+++ b/net/vmw_vsock/virtio_transport.c
@@ -307,7 +307,7 @@ out_rcu:
static void virtio_vsock_rx_fill(struct virtio_vsock *vsock)
{
- int total_len = VIRTIO_VSOCK_DEFAULT_RX_BUF_SIZE + VIRTIO_VSOCK_SKB_HEADROOM;
+ int total_len = VIRTIO_VSOCK_DEFAULT_RX_BUF_SIZE;
struct scatterlist pkt, *p;
struct virtqueue *vq;
struct sk_buff *skb;
@@ -316,7 +316,7 @@ static void virtio_vsock_rx_fill(struct virtio_vsock *vsock)
vq = vsock->vqs[VSOCK_VQ_RX];
do {
- skb = virtio_vsock_alloc_skb(total_len, GFP_KERNEL);
+ skb = virtio_vsock_alloc_linear_skb(total_len, GFP_KERNEL);
if (!skb)
break;
@@ -624,8 +624,9 @@ static void virtio_transport_rx_work(struct work_struct *work)
do {
virtqueue_disable_cb(vq);
for (;;) {
+ unsigned int len, payload_len;
+ struct virtio_vsock_hdr *hdr;
struct sk_buff *skb;
- unsigned int len;
if (!virtio_transport_more_replies(vsock)) {
/* Stop rx until the device processes already
@@ -642,13 +643,22 @@ static void virtio_transport_rx_work(struct work_struct *work)
vsock->rx_buf_nr--;
/* Drop short/long packets */
- if (unlikely(len < sizeof(struct virtio_vsock_hdr) ||
+ if (unlikely(len < sizeof(*hdr) ||
len > virtio_vsock_skb_len(skb))) {
kfree_skb(skb);
continue;
}
- virtio_vsock_skb_rx_put(skb);
+ hdr = virtio_vsock_hdr(skb);
+ payload_len = le32_to_cpu(hdr->len);
+ if (unlikely(payload_len > len - sizeof(*hdr))) {
+ kfree_skb(skb);
+ continue;
+ }
+
+ if (payload_len)
+ virtio_vsock_skb_put(skb, payload_len);
+
virtio_transport_deliver_tap_pkt(skb);
virtio_transport_recv_pkt(&virtio_transport, skb);
}
diff --git a/net/vmw_vsock/virtio_transport_common.c b/net/vmw_vsock/virtio_transport_common.c
index 1b5d9896edae..fe92e5fa95b4 100644
--- a/net/vmw_vsock/virtio_transport_common.c
+++ b/net/vmw_vsock/virtio_transport_common.c
@@ -109,7 +109,8 @@ static int virtio_transport_fill_skb(struct sk_buff *skb,
return __zerocopy_sg_from_iter(info->msg, NULL, skb,
&info->msg->msg_iter, len, NULL);
- return memcpy_from_msg(skb_put(skb, len), info->msg, len);
+ virtio_vsock_skb_put(skb, len);
+ return skb_copy_datagram_from_iter(skb, 0, &info->msg->msg_iter, len);
}
static void virtio_transport_init_hdr(struct sk_buff *skb,