From cb9ccfb404e700dc0db59d68242d79fe386bb3f0 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 13 Jun 2025 17:05:19 -0600 Subject: io_uring/nop: add IORING_NOP_TW completion flag To test and profile the overhead of io_uring task_work and the various types of it, add IORING_NOP_TW which tells nop to signal completions through task_work rather than complete them inline. Signed-off-by: Jens Axboe --- include/uapi/linux/io_uring.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/uapi/linux/io_uring.h') diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h index cfd17e382082..8c3d43caab02 100644 --- a/include/uapi/linux/io_uring.h +++ b/include/uapi/linux/io_uring.h @@ -449,6 +449,7 @@ enum io_uring_msg_ring_flags { #define IORING_NOP_FILE (1U << 1) #define IORING_NOP_FIXED_FILE (1U << 2) #define IORING_NOP_FIXED_BUFFER (1U << 3) +#define IORING_NOP_TW (1U << 4) /* * IO completion data structure (Completion Queue Entry) -- cgit v1.2.3 From 9e4ed359b8efad0e8ad4510d8ad22bf0b060526a Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Mon, 16 Jun 2025 10:46:29 +0100 Subject: io_uring/netcmd: add tx timestamping cmd support Add a new socket command which returns tx time stamps to the user. It provide an alternative to the existing error queue recvmsg interface. The command works in a polled multishot mode, which means io_uring will poll the socket and keep posting timestamps until the request is cancelled or fails in any other way (e.g. with no space in the CQ). It reuses the net infra and grabs timestamps from the socket's error queue. The command requires IORING_SETUP_CQE32. All non-final CQEs (marked with IORING_CQE_F_MORE) have cqe->res set to the tskey, and the upper 16 bits of cqe->flags keep tstype (i.e. offset by IORING_CQE_BUFFER_SHIFT). The timevalue is store in the upper part of the extended CQE. The final completion won't have IORING_CQE_F_MORE and will have cqe->res storing 0/error. Suggested-by: Vadim Fedorenko Acked-by: Willem de Bruijn Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/92ee66e6b33b8de062a977843d825f58f21ecd37.1750065793.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- include/uapi/linux/io_uring.h | 16 +++++++++ io_uring/cmd_net.c | 82 +++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 98 insertions(+) (limited to 'include/uapi/linux/io_uring.h') diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h index 8c3d43caab02..85600ad0ac08 100644 --- a/include/uapi/linux/io_uring.h +++ b/include/uapi/linux/io_uring.h @@ -969,6 +969,22 @@ enum io_uring_socket_op { SOCKET_URING_OP_SIOCOUTQ, SOCKET_URING_OP_GETSOCKOPT, SOCKET_URING_OP_SETSOCKOPT, + SOCKET_URING_OP_TX_TIMESTAMP, +}; + +/* + * SOCKET_URING_OP_TX_TIMESTAMP definitions + */ + +#define IORING_TIMESTAMP_HW_SHIFT 16 +/* The cqe->flags bit from which the timestamp type is stored */ +#define IORING_TIMESTAMP_TYPE_SHIFT (IORING_TIMESTAMP_HW_SHIFT + 1) +/* The cqe->flags flag signifying whether it's a hardware timestamp */ +#define IORING_CQE_F_TSTAMP_HW ((__u32)1 << IORING_TIMESTAMP_HW_SHIFT); + +struct io_timespec { + __u64 tv_sec; + __u64 tv_nsec; }; /* Zero copy receive refill queue entry */ diff --git a/io_uring/cmd_net.c b/io_uring/cmd_net.c index e99170c7d41a..3866fe6ff541 100644 --- a/io_uring/cmd_net.c +++ b/io_uring/cmd_net.c @@ -1,5 +1,6 @@ #include #include +#include #include #include "uring_cmd.h" @@ -51,6 +52,85 @@ static inline int io_uring_cmd_setsockopt(struct socket *sock, optlen); } +static bool io_process_timestamp_skb(struct io_uring_cmd *cmd, struct sock *sk, + struct sk_buff *skb, unsigned issue_flags) +{ + struct sock_exterr_skb *serr = SKB_EXT_ERR(skb); + struct io_uring_cqe cqe[2]; + struct io_timespec *iots; + struct timespec64 ts; + u32 tstype, tskey; + int ret; + + BUILD_BUG_ON(sizeof(struct io_uring_cqe) != sizeof(struct io_timespec)); + + ret = skb_get_tx_timestamp(skb, sk, &ts); + if (ret < 0) + return false; + + tskey = serr->ee.ee_data; + tstype = serr->ee.ee_info; + + cqe->user_data = 0; + cqe->res = tskey; + cqe->flags = IORING_CQE_F_MORE; + cqe->flags |= tstype << IORING_TIMESTAMP_TYPE_SHIFT; + if (ret == SOF_TIMESTAMPING_TX_HARDWARE) + cqe->flags |= IORING_CQE_F_TSTAMP_HW; + + iots = (struct io_timespec *)&cqe[1]; + iots->tv_sec = ts.tv_sec; + iots->tv_nsec = ts.tv_nsec; + return io_uring_cmd_post_mshot_cqe32(cmd, issue_flags, cqe); +} + +static int io_uring_cmd_timestamp(struct socket *sock, + struct io_uring_cmd *cmd, + unsigned int issue_flags) +{ + struct sock *sk = sock->sk; + struct sk_buff_head *q = &sk->sk_error_queue; + struct sk_buff *skb, *tmp; + struct sk_buff_head list; + int ret; + + if (!(issue_flags & IO_URING_F_CQE32)) + return -EINVAL; + ret = io_cmd_poll_multishot(cmd, issue_flags, EPOLLERR); + if (unlikely(ret)) + return ret; + + if (skb_queue_empty_lockless(q)) + return -EAGAIN; + __skb_queue_head_init(&list); + + scoped_guard(spinlock_irq, &q->lock) { + skb_queue_walk_safe(q, skb, tmp) { + /* don't support skbs with payload */ + if (!skb_has_tx_timestamp(skb, sk) || skb->len) + continue; + __skb_unlink(skb, q); + __skb_queue_tail(&list, skb); + } + } + + while (1) { + skb = skb_peek(&list); + if (!skb) + break; + if (!io_process_timestamp_skb(cmd, sk, skb, issue_flags)) + break; + __skb_dequeue(&list); + consume_skb(skb); + } + + if (!unlikely(skb_queue_empty(&list))) { + scoped_guard(spinlock_irqsave, &q->lock) + skb_queue_splice(q, &list); + } + return -EAGAIN; +} + int io_uring_cmd_sock(struct io_uring_cmd *cmd, unsigned int issue_flags) { struct socket *sock = cmd->file->private_data; @@ -76,6 +156,8 @@ int io_uring_cmd_sock(struct io_uring_cmd *cmd, unsigned int issue_flags) return io_uring_cmd_getsockopt(sock, cmd, issue_flags); case SOCKET_URING_OP_SETSOCKOPT: return io_uring_cmd_setsockopt(sock, cmd, issue_flags); + case SOCKET_URING_OP_TX_TIMESTAMP: + return io_uring_cmd_timestamp(sock, cmd, issue_flags); default: return -EOPNOTSUPP; } -- cgit v1.2.3 From 94b2030968be70b33fed9a5514a5967c7f20aebc Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 30 Jun 2025 11:36:54 -0600 Subject: io_uring: remove errant ';' from IORING_CQE_F_TSTAMP_HW definition An errant ';' slipped into that definition, which will cause some compilers to complain when it's used in an application: timestamp.c:257:45: error: empty expression statement has no effect; remove unnecessary ';' to silence this warning [-Werror,-Wextra-semi-stmt] 257 | hwts = cqe->flags & IORING_CQE_F_TSTAMP_HW; | ^ Fixes: 9e4ed359b8ef ("io_uring/netcmd: add tx timestamping cmd support") Signed-off-by: Jens Axboe --- include/uapi/linux/io_uring.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/uapi/linux/io_uring.h') diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h index 85600ad0ac08..b6be063693c8 100644 --- a/include/uapi/linux/io_uring.h +++ b/include/uapi/linux/io_uring.h @@ -980,7 +980,7 @@ enum io_uring_socket_op { /* The cqe->flags bit from which the timestamp type is stored */ #define IORING_TIMESTAMP_TYPE_SHIFT (IORING_TIMESTAMP_HW_SHIFT + 1) /* The cqe->flags flag signifying whether it's a hardware timestamp */ -#define IORING_CQE_F_TSTAMP_HW ((__u32)1 << IORING_TIMESTAMP_HW_SHIFT); +#define IORING_CQE_F_TSTAMP_HW ((__u32)1 << IORING_TIMESTAMP_HW_SHIFT) struct io_timespec { __u64 tv_sec; -- cgit v1.2.3 From cf73d9970ea4f8cace5d8f02d2565a2723003112 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Wed, 2 Jul 2025 21:31:54 +0100 Subject: io_uring: don't use int for ABI __kernel_rwf_t is defined as int, the actual size of which is implementation defined. It won't go well if some compiler / archs ever defines it as i64, so replace it with __u32, hoping that there is no one using i16 for it. Cc: stable@vger.kernel.org Fixes: 2b188cc1bb857 ("Add io_uring IO interface") Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/47c666c4ee1df2018863af3a2028af18feef11ed.1751412511.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- include/uapi/linux/io_uring.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/uapi/linux/io_uring.h') diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h index b6be063693c8..b8a0e70ee2fd 100644 --- a/include/uapi/linux/io_uring.h +++ b/include/uapi/linux/io_uring.h @@ -50,7 +50,7 @@ struct io_uring_sqe { }; __u32 len; /* buffer size or number of iovecs */ union { - __kernel_rwf_t rw_flags; + __u32 rw_flags; __u32 fsync_flags; __u16 poll_events; /* compatibility */ __u32 poll32_events; /* word-reversed for BE */ -- cgit v1.2.3 From 6f02527729bd31ca4e473bff19fda4ccd5889148 Mon Sep 17 00:00:00 2001 From: Norman Maurer Date: Mon, 28 Jul 2025 20:59:53 -1000 Subject: io_uring/net: Allow to do vectorized send At the moment you have to use sendmsg for vectorized send. While this works it's suboptimal as it also means you need to allocate a struct msghdr that needs to be kept alive until a submission happens. We can remove this limitation by just allowing to use send directly. Signed-off-by: Norman Maurer Link: https://lore.kernel.org/r/20250729065952.26646-1-norman_maurer@apple.com [axboe: remove -EINVAL return for SENDMSG and SEND_VECTORIZED] [axboe: allow send_zc to set SEND_VECTORIZED too] Signed-off-by: Jens Axboe --- include/uapi/linux/io_uring.h | 4 ++++ io_uring/net.c | 9 +++++++-- 2 files changed, 11 insertions(+), 2 deletions(-) (limited to 'include/uapi/linux/io_uring.h') diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h index b8a0e70ee2fd..6957dc539d83 100644 --- a/include/uapi/linux/io_uring.h +++ b/include/uapi/linux/io_uring.h @@ -392,12 +392,16 @@ enum io_uring_op { * the starting buffer ID in cqe->flags as per * usual for provided buffer usage. The buffers * will be contiguous from the starting buffer ID. + * + * IORING_SEND_VECTORIZED If set, SEND[_ZC] will take a pointer to a io_vec + * to allow vectorized send operations. */ #define IORING_RECVSEND_POLL_FIRST (1U << 0) #define IORING_RECV_MULTISHOT (1U << 1) #define IORING_RECVSEND_FIXED_BUF (1U << 2) #define IORING_SEND_ZC_REPORT_USAGE (1U << 3) #define IORING_RECVSEND_BUNDLE (1U << 4) +#define IORING_SEND_VECTORIZED (1U << 5) /* * cqe.res for IORING_CQE_F_NOTIF if diff --git a/io_uring/net.c b/io_uring/net.c index 35585bdc59f3..dd96e355982f 100644 --- a/io_uring/net.c +++ b/io_uring/net.c @@ -382,6 +382,10 @@ static int io_send_setup(struct io_kiocb *req, const struct io_uring_sqe *sqe) } if (req->flags & REQ_F_BUFFER_SELECT) return 0; + + if (sr->flags & IORING_SEND_VECTORIZED) + return io_net_import_vec(req, kmsg, sr->buf, sr->len, ITER_SOURCE); + return import_ubuf(ITER_SOURCE, sr->buf, sr->len, &kmsg->msg.msg_iter); } @@ -409,7 +413,7 @@ static int io_sendmsg_setup(struct io_kiocb *req, const struct io_uring_sqe *sqe return io_net_import_vec(req, kmsg, msg.msg_iov, msg.msg_iovlen, ITER_SOURCE); } -#define SENDMSG_FLAGS (IORING_RECVSEND_POLL_FIRST | IORING_RECVSEND_BUNDLE) +#define SENDMSG_FLAGS (IORING_RECVSEND_POLL_FIRST | IORING_RECVSEND_BUNDLE | IORING_SEND_VECTORIZED) int io_sendmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { @@ -1318,7 +1322,8 @@ void io_send_zc_cleanup(struct io_kiocb *req) } #define IO_ZC_FLAGS_COMMON (IORING_RECVSEND_POLL_FIRST | IORING_RECVSEND_FIXED_BUF) -#define IO_ZC_FLAGS_VALID (IO_ZC_FLAGS_COMMON | IORING_SEND_ZC_REPORT_USAGE) +#define IO_ZC_FLAGS_VALID (IO_ZC_FLAGS_COMMON | IORING_SEND_ZC_REPORT_USAGE | \ + IORING_SEND_VECTORIZED) int io_send_zc_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { -- cgit v1.2.3