From b5709f6d26d65f6bb9711f4b5f98469fd507cb5b Mon Sep 17 00:00:00 2001
From: Amery Hung <ameryhung@gmail.com>
Date: Wed, 3 Dec 2025 15:37:44 -0800
Subject: bpf: Support associating BPF program with struct_ops

Add a new BPF command BPF_PROG_ASSOC_STRUCT_OPS to allow associating
a BPF program with a struct_ops map. This command takes a file
descriptor of a struct_ops map and a BPF program and sets
prog->aux->st_ops_assoc to the kdata of the struct_ops map.

The command does not accept a struct_ops program nor a non-struct_ops
map. Programs of a struct_ops map are automatically associated with the
map during map update. If a program is shared between two struct_ops
maps, prog->aux->st_ops_assoc will be poisoned to indicate that the
associated struct_ops is ambiguous. The pointer, once poisoned, cannot
be reset since we have lost track of associated struct_ops. For other
program types, the associated struct_ops map, once set, cannot be
changed later. This restriction may be lifted in the future if there is
a use case.

A kernel helper bpf_prog_get_assoc_struct_ops() can be used to retrieve
the associated struct_ops pointer. The returned pointer, if not NULL, is
guaranteed to be valid and point to a fully updated struct_ops struct.
For a struct_ops program reused in multiple struct_ops maps, the return
value will be NULL.

prog->aux->st_ops_assoc is protected by bumping the refcount for
non-struct_ops programs and RCU for struct_ops programs. Since it would
be inefficient to track programs associated with a struct_ops map, every
non-struct_ops program will bump the refcount of the map to make sure
st_ops_assoc stays valid. For a struct_ops program, it is protected by
RCU as map_free will wait for an RCU grace period before disassociating
the program with the map. The helper must be called in BPF program
context or RCU read-side critical section.

struct_ops implementers should note that the struct_ops returned may not
be initialized nor attached yet. The struct_ops implementer will be
responsible for tracking and checking the state of the associated
struct_ops map if the use case expects an initialized or attached
struct_ops.

Signed-off-by: Amery Hung <ameryhung@gmail.com>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Acked-by: Andrii Nakryiko <andrii@kernel.org>
Acked-by: Martin KaFai Lau <martin.lau@kernel.org>
Link: https://lore.kernel.org/bpf/20251203233748.668365-3-ameryhung@gmail.com
---
 include/uapi/linux/bpf.h | 17 +++++++++++++++++
 1 file changed, 17 insertions(+)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index f8d8513eda27..84ced3ed2d21 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -918,6 +918,16 @@ union bpf_iter_link_info {
  *		Number of bytes read from the stream on success, or -1 if an
  *		error occurred (in which case, *errno* is set appropriately).
  *
+ * BPF_PROG_ASSOC_STRUCT_OPS
+ * 	Description
+ * 		Associate a BPF program with a struct_ops map. The struct_ops
+ * 		map is identified by *map_fd* and the BPF program is
+ * 		identified by *prog_fd*.
+ *
+ * 	Return
+ * 		0 on success or -1 if an error occurred (in which case,
+ * 		*errno* is set appropriately).
+ *
  * NOTES
  *	eBPF objects (maps and programs) can be shared between processes.
  *
@@ -974,6 +984,7 @@ enum bpf_cmd {
 	BPF_PROG_BIND_MAP,
 	BPF_TOKEN_CREATE,
 	BPF_PROG_STREAM_READ_BY_FD,
+	BPF_PROG_ASSOC_STRUCT_OPS,
 	__MAX_BPF_CMD,
 };
 
@@ -1894,6 +1905,12 @@ union bpf_attr {
 		__u32		prog_fd;
 	} prog_stream_read;
 
+	struct {
+		__u32		map_fd;
+		__u32		prog_fd;
+		__u32		flags;
+	} prog_assoc_struct_ops;
+
 } __attribute__((aligned(8)));
 
 /* The description below is an attempt at providing documentation to eBPF
-- 
cgit v1.2.3


From cc6b66d661fda4fb94c0099dd92b83f8de5c1bf4 Mon Sep 17 00:00:00 2001
From: Zhu Lingshan <lingshan.zhu@amd.com>
Date: Thu, 21 Aug 2025 17:22:44 +0800
Subject: amdkfd: introduce new ioctl AMDKFD_IOC_CREATE_PROCESS

This commit implements a new ioctl AMDKFD_IOC_CREATE_PROCESS
that creates a new secondary kfd_process on the FD.

To keep backward compatibility, userspace programs need to invoke
this ioctl explicitly on a FD to create a secondary
kfd_process which replaces its primary kfd_process.

This commit bumps ioctl minor version.

Signed-off-by: Zhu Lingshan <lingshan.zhu@amd.com>
Reviewed-by: Felix Kuehling <felix.kuehling@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
---
 drivers/gpu/drm/amd/amdkfd/kfd_chardev.c | 45 ++++++++++++++++++++++++++++++++
 drivers/gpu/drm/amd/amdkfd/kfd_priv.h    |  1 +
 drivers/gpu/drm/amd/amdkfd/kfd_process.c |  3 +--
 include/uapi/linux/kfd_ioctl.h           |  8 ++++--
 4 files changed, 53 insertions(+), 4 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
index 092a2b8aaea1..041237861107 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
@@ -3150,6 +3150,48 @@ out:
 	return r;
 }
 
+/* userspace programs need to invoke this ioctl explicitly on a FD to
+ * create a secondary kfd_process which replaces its primary kfd_process
+ */
+static int kfd_ioctl_create_process(struct file *filep, struct kfd_process *p, void *data)
+{
+	struct kfd_process *process;
+
+	/* Reject missing pointers before p is dereferenced below */
+	if (!p || !filep->private_data)
+		return -EINVAL;
+
+	/* Each FD owns only one kfd_process */
+	if (p->context_id != KFD_CONTEXT_ID_PRIMARY)
+		return -EINVAL;
+
+	mutex_lock(&kfd_processes_mutex);
+	if (p != filep->private_data) {
+		mutex_unlock(&kfd_processes_mutex);
+		return -EINVAL;
+	}
+
+	process = create_process(current, false);
+	if (IS_ERR(process)) {
+		mutex_unlock(&kfd_processes_mutex);
+		return PTR_ERR(process);
+	}
+
+	filep->private_data = process;
+	mutex_unlock(&kfd_processes_mutex);
+
+	/* A sysfs failure is only logged; the ioctl still succeeds */
+	if (kfd_create_process_sysfs(process))
+		pr_warn("Failed to create sysfs entry for the kfd_process\n");
+
+	/* Each open() increases kref of the primary kfd_process,
+	 * so we need to reduce it here when we create a new secondary process replacing it
+	 */
+	kfd_unref_process(p);
+
+	return 0;
+}
+
 #define AMDKFD_IOCTL_DEF(ioctl, _func, _flags) \
 	[_IOC_NR(ioctl)] = {.cmd = ioctl, .func = _func, .flags = _flags, \
 			    .cmd_drv = 0, .name = #ioctl}
@@ -3268,6 +3310,9 @@ static const struct amdkfd_ioctl_desc amdkfd_ioctls[] = {
 
 	AMDKFD_IOCTL_DEF(AMDKFD_IOC_DBG_TRAP,
 			kfd_ioctl_set_debug_trap, 0),
+
+	AMDKFD_IOCTL_DEF(AMDKFD_IOC_CREATE_PROCESS,
+			kfd_ioctl_create_process, 0),
 };
 
 #define AMDKFD_CORE_IOCTL_COUNT	ARRAY_SIZE(amdkfd_ioctls)
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
index 399f32689678..12f640a9370a 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
@@ -1053,6 +1053,7 @@ struct amdkfd_ioctl_desc {
 };
 bool kfd_dev_is_large_bar(struct kfd_node *dev);
 
+struct kfd_process *create_process(const struct task_struct *thread, bool primary);
 int kfd_process_create_wq(void);
 void kfd_process_destroy_wq(void);
 void kfd_cleanup_processes(void);
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process.c b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
index c52e56aa9316..b4982da9234b 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_process.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
@@ -68,7 +68,6 @@ static struct workqueue_struct *kfd_restore_wq;
 static struct kfd_process *find_process(const struct task_struct *thread,
 					bool ref);
 static void kfd_process_ref_release(struct kref *ref);
-static struct kfd_process *create_process(const struct task_struct *thread, bool primary);
 
 static void evict_process_worker(struct work_struct *work);
 static void restore_process_worker(struct work_struct *work);
@@ -1582,7 +1581,7 @@ void kfd_process_set_trap_debug_flag(struct qcm_process_device *qpd,
  * On return the kfd_process is fully operational and will be freed when the
  * mm is released
  */
-static struct kfd_process *create_process(const struct task_struct *thread, bool primary)
+struct kfd_process *create_process(const struct task_struct *thread, bool primary)
 {
 	struct kfd_process *process;
 	struct mmu_notifier *mn;
diff --git a/include/uapi/linux/kfd_ioctl.h b/include/uapi/linux/kfd_ioctl.h
index 5d1727a6d040..84aa24c02715 100644
--- a/include/uapi/linux/kfd_ioctl.h
+++ b/include/uapi/linux/kfd_ioctl.h
@@ -44,9 +44,10 @@
  * - 1.16 - Add contiguous VRAM allocation flag
  * - 1.17 - Add SDMA queue creation with target SDMA engine ID
  * - 1.18 - Rename pad in set_memory_policy_args to misc_process_flag
+ * - 1.19 - Add a new ioctl to craete secondary kfd processes
  */
 #define KFD_IOCTL_MAJOR_VERSION 1
-#define KFD_IOCTL_MINOR_VERSION 18
+#define KFD_IOCTL_MINOR_VERSION 19
 
 struct kfd_ioctl_get_version_args {
 	__u32 major_version;	/* from KFD */
@@ -1671,7 +1672,10 @@ struct kfd_ioctl_dbg_trap_args {
 #define AMDKFD_IOC_DBG_TRAP			\
 		AMDKFD_IOWR(0x26, struct kfd_ioctl_dbg_trap_args)
 
+#define AMDKFD_IOC_CREATE_PROCESS		\
+		AMDKFD_IO(0x27)
+
 #define AMDKFD_COMMAND_START		0x01
-#define AMDKFD_COMMAND_END		0x27
+#define AMDKFD_COMMAND_END		0x28
 
 #endif
-- 
cgit v1.2.3


From 7a5fb05b5b18e531989aa55b10dfa4be0633207e Mon Sep 17 00:00:00 2001
From: Mario Limonciello <mario.limonciello@amd.com>
Date: Fri, 5 Dec 2025 08:04:41 -0600
Subject: amdkfd: Bump ABI to indicate presence of Trap handler support for
 expert scheduling

commit 0f0c8a6983db ("drm/amdkfd: Trap handler support for expert
scheduling mode") introduced support for a trap handler in expert
scheduling mode. However, userspace needs to know whether or not trap
handler support is present.

Bump the KFD IOCTL API so that userspace can key off this to decide.

Suggested-by: Stella Laurenzo <stella.laurenzo@amd.com>
Fixes: 423888879412 ("drm/amdkfd: Trap handler support for expert scheduling mode")
Reviewed-by: Kent Russell <kent.russell@amd.com>
Signed-off-by: Mario Limonciello <mario.limonciello@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
---
 include/uapi/linux/kfd_ioctl.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/kfd_ioctl.h b/include/uapi/linux/kfd_ioctl.h
index 84aa24c02715..4d0c1a53f9d5 100644
--- a/include/uapi/linux/kfd_ioctl.h
+++ b/include/uapi/linux/kfd_ioctl.h
@@ -45,9 +45,10 @@
  * - 1.17 - Add SDMA queue creation with target SDMA engine ID
  * - 1.18 - Rename pad in set_memory_policy_args to misc_process_flag
  * - 1.19 - Add a new ioctl to craete secondary kfd processes
+ * - 1.20 - Trap handler support for expert scheduling mode available
  */
 #define KFD_IOCTL_MAJOR_VERSION 1
-#define KFD_IOCTL_MINOR_VERSION 19
+#define KFD_IOCTL_MINOR_VERSION 20
 
 struct kfd_ioctl_get_version_args {
 	__u32 major_version;	/* from KFD */
-- 
cgit v1.2.3


From 0e5032237ee5530147fbdf33134297e1490d5ec3 Mon Sep 17 00:00:00 2001
From: Bhavik Sachdev <b.sachdev1904@gmail.com>
Date: Sat, 29 Nov 2025 14:41:21 +0530
Subject: statmount: accept fd as a parameter

Extend `struct mnt_id_req` to take in a fd and introduce STATMOUNT_BY_FD
flag. When a valid fd is provided and STATMOUNT_BY_FD is set, statmount
will return mountinfo about the mount the fd is on.

This even works for "unmounted" mounts (mounts that have been umounted
using umount2(mnt, MNT_DETACH)), if you have access to a file descriptor
on that mount. These "unmounted" mounts will have no mountpoint and no
valid mount namespace. Hence, we unset the STATMOUNT_MNT_POINT and
STATMOUNT_MNT_NS_ID in statmount.mask for "unmounted" mounts.

In case of STATMOUNT_BY_FD, given that we already have access to an fd
on the mount, accessing mount information without a capability check
seems fine because of the following reasons:

- All fs related information is available via fstatfs() without any
  capability check.
- Mount information is also available via /proc/pid/mountinfo (without
  any capability check).
- We have access to a fd on the mount, which tells us that we had
  access to the mount at some point (or someone that had access gave
  us the fd), so we should be able to access mount info.

Co-developed-by: Pavel Tikhomirov <ptikhomirov@virtuozzo.com>
Signed-off-by: Pavel Tikhomirov <ptikhomirov@virtuozzo.com>
Signed-off-by: Bhavik Sachdev <b.sachdev1904@gmail.com>
Link: https://patch.msgid.link/20251129091455.757724-3-b.sachdev1904@gmail.com
Acked-by: Andrei Vagin <avagin@gmail.com>
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 fs/namespace.c             | 102 +++++++++++++++++++++++++++++----------------
 include/uapi/linux/mount.h |  10 ++++-
 2 files changed, 76 insertions(+), 36 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/fs/namespace.c b/fs/namespace.c
index f6879f282dae..ec3b16fedd9f 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -5547,31 +5547,49 @@ static int grab_requested_root(struct mnt_namespace *ns, struct path *root)
 
 /* locks: namespace_shared */
 static int do_statmount(struct kstatmount *s, u64 mnt_id, u64 mnt_ns_id,
-			struct mnt_namespace *ns)
+                        struct file *mnt_file, struct mnt_namespace *ns)
 {
-	struct mount *m;
 	int err;
 
-	/* Has the namespace already been emptied? */
-	if (mnt_ns_id && mnt_ns_empty(ns))
-		return -ENOENT;
+	if (mnt_file) {
+		WARN_ON_ONCE(ns != NULL);
 
-	s->mnt = lookup_mnt_in_ns(mnt_id, ns);
-	if (!s->mnt)
-		return -ENOENT;
+		s->mnt = mnt_file->f_path.mnt;
+		ns = real_mount(s->mnt)->mnt_ns;
+		if (!ns)
+			/*
+			 * We can't set mount point and mnt_ns_id since we don't have a
+			 * ns for the mount. This can happen if the mount is unmounted
+			 * with MNT_DETACH.
+			 */
+			s->mask &= ~(STATMOUNT_MNT_POINT | STATMOUNT_MNT_NS_ID);
+	} else {
+		/* Has the namespace already been emptied? */
+		if (mnt_ns_id && mnt_ns_empty(ns))
+			return -ENOENT;
 
-	err = grab_requested_root(ns, &s->root);
-	if (err)
-		return err;
+		s->mnt = lookup_mnt_in_ns(mnt_id, ns);
+		if (!s->mnt)
+			return -ENOENT;
+	}
 
-	/*
-	 * Don't trigger audit denials. We just want to determine what
-	 * mounts to show users.
-	 */
-	m = real_mount(s->mnt);
-	if (!is_path_reachable(m, m->mnt.mnt_root, &s->root) &&
-	    !ns_capable_noaudit(ns->user_ns, CAP_SYS_ADMIN))
-		return -EPERM;
+	if (ns) {
+		err = grab_requested_root(ns, &s->root);
+		if (err)
+			return err;
+
+		if (!mnt_file) {
+			struct mount *m;
+			/*
+			 * Don't trigger audit denials. We just want to determine what
+			 * mounts to show users.
+			 */
+			m = real_mount(s->mnt);
+			if (!is_path_reachable(m, m->mnt.mnt_root, &s->root) &&
+			    !ns_capable_noaudit(ns->user_ns, CAP_SYS_ADMIN))
+				return -EPERM;
+		}
+	}
 
 	err = security_sb_statfs(s->mnt->mnt_root);
 	if (err)
@@ -5693,7 +5711,7 @@ static int prepare_kstatmount(struct kstatmount *ks, struct mnt_id_req *kreq,
 }
 
 static int copy_mnt_id_req(const struct mnt_id_req __user *req,
-			   struct mnt_id_req *kreq)
+			   struct mnt_id_req *kreq, unsigned int flags)
 {
 	int ret;
 	size_t usize;
@@ -5711,11 +5729,17 @@ static int copy_mnt_id_req(const struct mnt_id_req __user *req,
 	ret = copy_struct_from_user(kreq, sizeof(*kreq), req, usize);
 	if (ret)
 		return ret;
-	if (kreq->mnt_ns_fd != 0 && kreq->mnt_ns_id)
-		return -EINVAL;
-	/* The first valid unique mount id is MNT_UNIQUE_ID_OFFSET + 1. */
-	if (kreq->mnt_id <= MNT_UNIQUE_ID_OFFSET)
-		return -EINVAL;
+
+	if (flags & STATMOUNT_BY_FD) {
+		if (kreq->mnt_id || kreq->mnt_ns_id)
+			return -EINVAL;
+	} else {
+		if (kreq->mnt_ns_fd != 0 && kreq->mnt_ns_id)
+			return -EINVAL;
+		/* The first valid unique mount id is MNT_UNIQUE_ID_OFFSET + 1. */
+		if (kreq->mnt_id <= MNT_UNIQUE_ID_OFFSET)
+			return -EINVAL;
+	}
 	return 0;
 }
 
@@ -5762,25 +5786,33 @@ SYSCALL_DEFINE4(statmount, const struct mnt_id_req __user *, req,
 {
 	struct mnt_namespace *ns __free(mnt_ns_release) = NULL;
 	struct kstatmount *ks __free(kfree) = NULL;
+	struct file *mnt_file __free(fput) = NULL;
 	struct mnt_id_req kreq;
 	/* We currently support retrieval of 3 strings. */
 	size_t seq_size = 3 * PATH_MAX;
 	int ret;
 
-	if (flags)
+	if (flags & ~STATMOUNT_BY_FD)
 		return -EINVAL;
 
-	ret = copy_mnt_id_req(req, &kreq);
+	ret = copy_mnt_id_req(req, &kreq, flags);
 	if (ret)
 		return ret;
 
-	ns = grab_requested_mnt_ns(&kreq);
-	if (IS_ERR(ns))
-		return PTR_ERR(ns);
+	if (flags & STATMOUNT_BY_FD) {
+		mnt_file = fget_raw(kreq.mnt_fd);
+		if (!mnt_file)
+			return -EBADF;
+		/* do_statmount sets ns in case of STATMOUNT_BY_FD */
+	} else {
+		ns = grab_requested_mnt_ns(&kreq);
+		if (IS_ERR(ns))
+			return PTR_ERR(ns);
 
-	if (kreq.mnt_ns_id && (ns != current->nsproxy->mnt_ns) &&
-	    !ns_capable_noaudit(ns->user_ns, CAP_SYS_ADMIN))
-		return -EPERM;
+		if (kreq.mnt_ns_id && (ns != current->nsproxy->mnt_ns) &&
+		    !ns_capable_noaudit(ns->user_ns, CAP_SYS_ADMIN))
+			return -EPERM;
+	}
 
 	ks = kmalloc(sizeof(*ks), GFP_KERNEL_ACCOUNT);
 	if (!ks)
@@ -5792,7 +5824,7 @@ retry:
 		return ret;
 
 	scoped_guard(namespace_shared)
-		ret = do_statmount(ks, kreq.mnt_id, kreq.mnt_ns_id, ns);
+		ret = do_statmount(ks, kreq.mnt_id, kreq.mnt_ns_id, mnt_file, ns);
 
 	if (!ret)
 		ret = copy_statmount_to_user(ks);
@@ -5932,7 +5964,7 @@ SYSCALL_DEFINE4(listmount, const struct mnt_id_req __user *, req,
 	if (!access_ok(mnt_ids, nr_mnt_ids * sizeof(*mnt_ids)))
 		return -EFAULT;
 
-	ret = copy_mnt_id_req(req, &kreq);
+	ret = copy_mnt_id_req(req, &kreq, 0);
 	if (ret)
 		return ret;
 
diff --git a/include/uapi/linux/mount.h b/include/uapi/linux/mount.h
index 5d3f8c9e3a62..18c624405268 100644
--- a/include/uapi/linux/mount.h
+++ b/include/uapi/linux/mount.h
@@ -197,7 +197,10 @@ struct statmount {
  */
 struct mnt_id_req {
 	__u32 size;
-	__u32 mnt_ns_fd;
+	union {
+		__u32 mnt_ns_fd;
+		__u32 mnt_fd;
+	};
 	__u64 mnt_id;
 	__u64 param;
 	__u64 mnt_ns_id;
@@ -232,4 +235,9 @@ struct mnt_id_req {
 #define LSMT_ROOT		0xffffffffffffffff	/* root mount */
 #define LISTMOUNT_REVERSE	(1 << 0) /* List later mounts first */
 
+/*
+ * @flag bits for statmount(2)
+ */
+#define STATMOUNT_BY_FD		0x00000001U	/* want mountinfo for given fd */
+
 #endif /* _UAPI_LINUX_MOUNT_H */
-- 
cgit v1.2.3


From e83f63da2ac776fbc30861e4ce8b798df6ee8a7a Mon Sep 17 00:00:00 2001
From: Jonathan Kim <jonathan.kim@amd.com>
Date: Mon, 23 Jun 2025 14:12:58 -0400
Subject: drm/amdkfd: allow debug subscription to lds violations on gfx 1250

GFX 1250 allows the debugger to subscribe to LDS out-of-range read/write
memory violations.
Bump IOCTL minor version and flag KFD capabilities for enablement
hint.

Signed-off-by: Jonathan Kim <jonathan.kim@amd.com>
Reviewed-by: Felix Kuehling <felix.kuehling@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
---
 drivers/gpu/drm/amd/amdkfd/kfd_debug.c    | 7 +++++++
 drivers/gpu/drm/amd/amdkfd/kfd_topology.c | 4 ++++
 include/uapi/linux/kfd_ioctl.h            | 4 +++-
 include/uapi/linux/kfd_sysfs.h            | 3 ++-
 4 files changed, 16 insertions(+), 2 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_debug.c b/drivers/gpu/drm/amd/amdkfd/kfd_debug.c
index ba9a09b6589a..f83e1238c1b3 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_debug.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_debug.c
@@ -519,6 +519,7 @@ int kfd_dbg_trap_set_flags(struct kfd_process *target, uint32_t *flags)
 		struct kfd_topology_device *topo_dev =
 				kfd_topology_device_by_id(target->pdds[i]->dev->id);
 		uint32_t caps = topo_dev->node_props.capability;
+		uint32_t caps2 = topo_dev->node_props.capability2;
 
 		if (!(caps & HSA_CAP_TRAP_DEBUG_PRECISE_MEMORY_OPERATIONS_SUPPORTED) &&
 			(*flags & KFD_DBG_TRAP_FLAG_SINGLE_MEM_OP)) {
@@ -531,6 +532,12 @@ int kfd_dbg_trap_set_flags(struct kfd_process *target, uint32_t *flags)
 			*flags = prev_flags;
 			return -EACCES;
 		}
+
+		if (!(caps2 & HSA_CAP2_TRAP_DEBUG_LDS_OUT_OF_ADDR_RANGE_SUPPORTED) &&
+		    (*flags & KFD_DBG_TRAP_FLAG_LDS_OUT_OF_ADDR_RANGE)) {
+			*flags = prev_flags;
+			return -EACCES;
+		}
 	}
 
 	target->dbg_flags = *flags;
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_topology.c b/drivers/gpu/drm/amd/amdkfd/kfd_topology.c
index a0990dd2378c..7a402c9c1b6e 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_topology.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_topology.c
@@ -2028,6 +2028,10 @@ static void kfd_topology_set_capabilities(struct kfd_topology_device *dev)
 		if (KFD_GC_VERSION(dev->gpu) >= IP_VERSION(12, 0, 0))
 			dev->node_props.capability |=
 				HSA_CAP_TRAP_DEBUG_PRECISE_ALU_OPERATIONS_SUPPORTED;
+
+		if (KFD_GC_VERSION(dev->gpu) >= IP_VERSION(12, 1, 0))
+			dev->node_props.capability2 |=
+				HSA_CAP2_TRAP_DEBUG_LDS_OUT_OF_ADDR_RANGE_SUPPORTED;
 	}
 
 	kfd_topology_set_dbg_firmware_support(dev);
diff --git a/include/uapi/linux/kfd_ioctl.h b/include/uapi/linux/kfd_ioctl.h
index 4d0c1a53f9d5..6e91875c10ba 100644
--- a/include/uapi/linux/kfd_ioctl.h
+++ b/include/uapi/linux/kfd_ioctl.h
@@ -46,9 +46,10 @@
  * - 1.18 - Rename pad in set_memory_policy_args to misc_process_flag
  * - 1.19 - Add a new ioctl to craete secondary kfd processes
  * - 1.20 - Trap handler support for expert scheduling mode available
+ * - 1.21 - Debugger support to subscribe to LDS out-of-address exceptions
  */
 #define KFD_IOCTL_MAJOR_VERSION 1
-#define KFD_IOCTL_MINOR_VERSION 20
+#define KFD_IOCTL_MINOR_VERSION 21
 
 struct kfd_ioctl_get_version_args {
 	__u32 major_version;	/* from KFD */
@@ -947,6 +948,7 @@ enum kfd_dbg_trap_address_watch_mode {
 enum kfd_dbg_trap_flags {
 	KFD_DBG_TRAP_FLAG_SINGLE_MEM_OP = 1,
 	KFD_DBG_TRAP_FLAG_SINGLE_ALU_OP = 2,
+	KFD_DBG_TRAP_FLAG_LDS_OUT_OF_ADDR_RANGE = 4
 };
 
 /* Trap exceptions */
diff --git a/include/uapi/linux/kfd_sysfs.h b/include/uapi/linux/kfd_sysfs.h
index 1125fe47959f..0b6ce2f3c887 100644
--- a/include/uapi/linux/kfd_sysfs.h
+++ b/include/uapi/linux/kfd_sysfs.h
@@ -64,7 +64,8 @@
 #define HSA_CAP_RESERVED					0x000f8000
 
 #define HSA_CAP2_PER_SDMA_QUEUE_RESET_SUPPORTED			0x00000001
-#define HSA_CAP2_RESERVED					0xfffffffe
+#define HSA_CAP2_TRAP_DEBUG_LDS_OUT_OF_ADDR_RANGE_SUPPORTED	0x00000002
+#define HSA_CAP2_RESERVED					0xfffffffc
 
 /* debug_prop bits in node properties */
 #define HSA_DBG_WATCH_ADDR_MASK_LO_BIT_MASK     0x0000000f
-- 
cgit v1.2.3


From cb8fe62f87ad21f4c174aec480694c9b4b8b01c4 Mon Sep 17 00:00:00 2001
From: Randy Dunlap <rdunlap@infradead.org>
Date: Sat, 20 Dec 2025 03:04:26 +0900
Subject: nilfs2: convert nilfs_super_block to kernel-doc

Eliminate 40+ kernel-doc warnings in nilfs2_ondisk.h by converting
all of the struct member comments to kernel-doc comments.

Fix one misnamed struct member in nilfs_direct_node.

Object files before and after are the same size and content.

Examples of warnings:
Warning: include/uapi/linux/nilfs2_ondisk.h:202 struct member 's_rev_level'
 not described in 'nilfs_super_block'
Warning: include/uapi/linux/nilfs2_ondisk.h:202 struct member
 's_minor_rev_level' not described in 'nilfs_super_block'
Warning: include/uapi/linux/nilfs2_ondisk.h:202 struct member 's_magic'
 not described in 'nilfs_super_block'
Warning: include/uapi/linux/nilfs2_ondisk.h:202 struct member 's_bytes'
 not described in 'nilfs_super_block'
Warning: include/uapi/linux/nilfs2_ondisk.h:202 struct member 's_flags'
 not described in 'nilfs_super_block'

Signed-off-by: Randy Dunlap <rdunlap@infradead.org>
Signed-off-by: Ryusuke Konishi <konishi.ryusuke@gmail.com>
Signed-off-by: Viacheslav Dubeyko <slava@dubeyko.com>
---
 include/uapi/linux/nilfs2_ondisk.h | 163 ++++++++++++++++++++++---------------
 1 file changed, 97 insertions(+), 66 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/nilfs2_ondisk.h b/include/uapi/linux/nilfs2_ondisk.h
index 3196cc44a002..b3442b16ff6a 100644
--- a/include/uapi/linux/nilfs2_ondisk.h
+++ b/include/uapi/linux/nilfs2_ondisk.h
@@ -133,73 +133,104 @@ struct nilfs_super_root {
 
 /**
  * struct nilfs_super_block - structure of super block on disk
+ * @s_rev_level:		Revision level
+ * @s_minor_rev_level:		minor revision level
+ * @s_magic:			Magic signature
+ * @s_bytes:			Bytes count of CRC calculation for
+ *				  this structure.  s_reserved is excluded.
+ * @s_flags:			flags
+ * @s_crc_seed:			Seed value of CRC calculation
+ * @s_sum:			Check sum of super block
+ * @s_log_block_size:		Block size represented as follows:
+ *				  blocksize = 1 << (s_log_block_size + 10)
+ * @s_nsegments:		Number of segments in filesystem
+ * @s_dev_size:			block device size in bytes
+ * @s_first_data_block:		1st seg disk block number
+ * @s_blocks_per_segment:	number of blocks per full segment
+ * @s_r_segments_percentage:	Reserved segments percentage
+ * @s_last_cno:			Last checkpoint number
+ * @s_last_pseg:		disk block addr pseg written last
+ * @s_last_seq:			seq. number of seg written last
+ * @s_free_blocks_count:	Free blocks count
+ * @s_ctime:			Creation time (execution time of newfs)
+ * @s_mtime:			Mount time
+ * @s_wtime:			Write time
+ * @s_mnt_count:		Mount count
+ * @s_max_mnt_count:		Maximal mount count
+ * @s_state:			File system state
+ * @s_errors:			Behaviour when detecting errors
+ * @s_lastcheck:		time of last check
+ * @s_checkinterval:		max. time between checks
+ * @s_creator_os:		OS
+ * @s_def_resuid:		Default uid for reserved blocks
+ * @s_def_resgid:		Default gid for reserved blocks
+ * @s_first_ino:		First non-reserved inode
+ * @s_inode_size:		Size of an inode
+ * @s_dat_entry_size:		Size of a dat entry
+ * @s_checkpoint_size:		Size of a checkpoint
+ * @s_segment_usage_size:	Size of a segment usage
+ * @s_uuid:			128-bit uuid for volume
+ * @s_volume_name:		volume name
+ * @s_c_interval:		Commit interval of segment
+ * @s_c_block_max:		Threshold of data amount for the
+ *				  segment construction
+ * @s_feature_compat:		Compatible feature set
+ * @s_feature_compat_ro:	Read-only compatible feature set
+ * @s_feature_incompat:		Incompatible feature set
+ * @s_reserved:			padding to the end of the block
  */
 struct nilfs_super_block {
-/*00*/	__le32	s_rev_level;		/* Revision level */
-	__le16	s_minor_rev_level;	/* minor revision level */
-	__le16	s_magic;		/* Magic signature */
-
-	__le16  s_bytes;		/*
-					 * Bytes count of CRC calculation
-					 * for this structure. s_reserved
-					 * is excluded.
-					 */
-	__le16  s_flags;		/* flags */
-	__le32  s_crc_seed;		/* Seed value of CRC calculation */
-/*10*/	__le32	s_sum;			/* Check sum of super block */
-
-	__le32	s_log_block_size;	/*
-					 * Block size represented as follows
-					 * blocksize =
-					 *     1 << (s_log_block_size + 10)
-					 */
-	__le64  s_nsegments;		/* Number of segments in filesystem */
-/*20*/	__le64  s_dev_size;		/* block device size in bytes */
-	__le64	s_first_data_block;	/* 1st seg disk block number */
-/*30*/	__le32  s_blocks_per_segment;   /* number of blocks per full segment */
-	__le32	s_r_segments_percentage; /* Reserved segments percentage */
-
-	__le64  s_last_cno;		/* Last checkpoint number */
-/*40*/	__le64  s_last_pseg;		/* disk block addr pseg written last */
-	__le64  s_last_seq;             /* seq. number of seg written last */
-/*50*/	__le64	s_free_blocks_count;	/* Free blocks count */
-
-	__le64	s_ctime;		/*
-					 * Creation time (execution time of
-					 * newfs)
-					 */
-/*60*/	__le64	s_mtime;		/* Mount time */
-	__le64	s_wtime;		/* Write time */
-/*70*/	__le16	s_mnt_count;		/* Mount count */
-	__le16	s_max_mnt_count;	/* Maximal mount count */
-	__le16	s_state;		/* File system state */
-	__le16	s_errors;		/* Behaviour when detecting errors */
-	__le64	s_lastcheck;		/* time of last check */
-
-/*80*/	__le32	s_checkinterval;	/* max. time between checks */
-	__le32	s_creator_os;		/* OS */
-	__le16	s_def_resuid;		/* Default uid for reserved blocks */
-	__le16	s_def_resgid;		/* Default gid for reserved blocks */
-	__le32	s_first_ino;		/* First non-reserved inode */
-
-/*90*/	__le16  s_inode_size;		/* Size of an inode */
-	__le16  s_dat_entry_size;       /* Size of a dat entry */
-	__le16  s_checkpoint_size;      /* Size of a checkpoint */
-	__le16	s_segment_usage_size;	/* Size of a segment usage */
-
-/*98*/	__u8	s_uuid[16];		/* 128-bit uuid for volume */
-/*A8*/	char	s_volume_name[80]	/* volume name */
-			__kernel_nonstring;
-
-/*F8*/	__le32  s_c_interval;           /* Commit interval of segment */
-	__le32  s_c_block_max;          /*
-					 * Threshold of data amount for
-					 * the segment construction
-					 */
-/*100*/	__le64  s_feature_compat;	/* Compatible feature set */
-	__le64  s_feature_compat_ro;	/* Read-only compatible feature set */
-	__le64  s_feature_incompat;	/* Incompatible feature set */
-	__u32	s_reserved[186];	/* padding to the end of the block */
+/*00*/	__le32	s_rev_level;
+	__le16	s_minor_rev_level;
+	__le16	s_magic;
+
+	__le16  s_bytes;
+	__le16  s_flags;
+	__le32  s_crc_seed;
+/*10*/	__le32	s_sum;
+
+	__le32	s_log_block_size;
+	__le64  s_nsegments;
+/*20*/	__le64  s_dev_size;
+	__le64	s_first_data_block;
+/*30*/	__le32  s_blocks_per_segment;
+	__le32	s_r_segments_percentage;
+
+	__le64  s_last_cno;
+/*40*/	__le64  s_last_pseg;
+	__le64  s_last_seq;
+/*50*/	__le64	s_free_blocks_count;
+
+	__le64	s_ctime;
+/*60*/	__le64	s_mtime;
+	__le64	s_wtime;
+/*70*/	__le16	s_mnt_count;
+	__le16	s_max_mnt_count;
+	__le16	s_state;
+	__le16	s_errors;
+	__le64	s_lastcheck;
+
+/*80*/	__le32	s_checkinterval;
+	__le32	s_creator_os;
+	__le16	s_def_resuid;
+	__le16	s_def_resgid;
+	__le32	s_first_ino;
+
+/*90*/	__le16  s_inode_size;
+	__le16  s_dat_entry_size;
+	__le16  s_checkpoint_size;
+	__le16	s_segment_usage_size;
+
+/*98*/	__u8	s_uuid[16];
+/*A8*/	char	s_volume_name[80]	__kernel_nonstring;
+
+/*F8*/	__le32  s_c_interval;
+	__le32  s_c_block_max;
+
+/*100*/	__le64  s_feature_compat;
+	__le64  s_feature_compat_ro;
+	__le64  s_feature_incompat;
+	__u32	s_reserved[186];
 };
 
 /*
@@ -449,7 +480,7 @@ struct nilfs_btree_node {
 /**
  * struct nilfs_direct_node - header of built-in bmap array
  * @dn_flags: flags
- * @dn_pad: padding
+ * @pad: padding
  */
 struct nilfs_direct_node {
 	__u8 dn_flags;
-- 
cgit v1.2.3


From 6fd8a09f48d6fee184207f4e15e939898a3947f9 Mon Sep 17 00:00:00 2001
From: Ryusuke Konishi <konishi.ryusuke@gmail.com>
Date: Sat, 20 Dec 2025 03:04:27 +0900
Subject: nilfs2: fix missing struct keywords in nilfs2_api.h kernel-doc

Eliminate the following kernel-doc warnings in nilfs2_api.h:

Warning: include/uapi/linux/nilfs2_api.h:65 cannot understand function
 prototype: 'struct nilfs_suinfo'
Warning: include/uapi/linux/nilfs2_api.h:101 cannot understand function
 prototype: 'struct nilfs_suinfo_update'

This ensures that the documentation for nilfs_suinfo and
nilfs_suinfo_update is correctly parsed and generated by adding the
missing 'struct' keyword to their kernel-doc comments.

Signed-off-by: Ryusuke Konishi <konishi.ryusuke@gmail.com>
Signed-off-by: Viacheslav Dubeyko <slava@dubeyko.com>
---
 include/uapi/linux/nilfs2_api.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/nilfs2_api.h b/include/uapi/linux/nilfs2_api.h
index 8b9b89104f3d..d1b6fcde2fb8 100644
--- a/include/uapi/linux/nilfs2_api.h
+++ b/include/uapi/linux/nilfs2_api.h
@@ -58,7 +58,7 @@ NILFS_CPINFO_FNS(INVALID, invalid)
 NILFS_CPINFO_FNS(MINOR, minor)
 
 /**
- * nilfs_suinfo - segment usage information
+ * struct nilfs_suinfo - segment usage information
  * @sui_lastmod: timestamp of last modification
  * @sui_nblocks: number of written blocks in segment
  * @sui_flags: segment usage flags
@@ -93,7 +93,7 @@ static inline int nilfs_suinfo_clean(const struct nilfs_suinfo *si)
 }
 
 /**
- * nilfs_suinfo_update - segment usage information update
+ * struct nilfs_suinfo_update - segment usage information update
  * @sup_segnum: segment number
  * @sup_flags: flags for which fields are active in sup_sui
  * @sup_reserved: reserved necessary for alignment
-- 
cgit v1.2.3


From 98b9f207afa53aff2edb0e52910c4348b456b37d Mon Sep 17 00:00:00 2001
From: Thomas Weißschuh <thomas.weissschuh@linutronix.de>
Date: Mon, 22 Dec 2025 09:04:13 +0100
Subject: dmaengine: idxd: uapi: use UAPI types
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Using libc types and headers from the UAPI headers is problematic as it
introduces a dependency on a full C toolchain.

Use the fixed-width integer types provided by the UAPI headers instead.

Signed-off-by: Thomas Weißschuh <thomas.weissschuh@linutronix.de>
Acked-by: Arnd Bergmann <arnd@arndb.de>
Link: https://patch.msgid.link/20251222-uapi-idxd-v1-1-baa183adb20d@linutronix.de
Signed-off-by: Vinod Koul <vkoul@kernel.org>
---
 include/uapi/linux/idxd.h | 270 +++++++++++++++++++++++-----------------------
 1 file changed, 133 insertions(+), 137 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/idxd.h b/include/uapi/linux/idxd.h
index 3d1987e1bb2d..fdcc8eefb925 100644
--- a/include/uapi/linux/idxd.h
+++ b/include/uapi/linux/idxd.h
@@ -3,11 +3,7 @@
 #ifndef _USR_IDXD_H_
 #define _USR_IDXD_H_
 
-#ifdef __KERNEL__
 #include <linux/types.h>
-#else
-#include <stdint.h>
-#endif
 
 /* Driver command error status */
 enum idxd_scmd_stat {
@@ -176,132 +172,132 @@ enum iax_completion_status {
 #define DSA_COMP_STATUS(status)		((status) & DSA_COMP_STATUS_MASK)
 
 struct dsa_hw_desc {
-	uint32_t	pasid:20;
-	uint32_t	rsvd:11;
-	uint32_t	priv:1;
-	uint32_t	flags:24;
-	uint32_t	opcode:8;
-	uint64_t	completion_addr;
+	__u32	pasid:20;
+	__u32	rsvd:11;
+	__u32	priv:1;
+	__u32	flags:24;
+	__u32	opcode:8;
+	__u64	completion_addr;
 	union {
-		uint64_t	src_addr;
-		uint64_t	rdback_addr;
-		uint64_t	pattern;
-		uint64_t	desc_list_addr;
-		uint64_t	pattern_lower;
-		uint64_t	transl_fetch_addr;
+		__u64	src_addr;
+		__u64	rdback_addr;
+		__u64	pattern;
+		__u64	desc_list_addr;
+		__u64	pattern_lower;
+		__u64	transl_fetch_addr;
 	};
 	union {
-		uint64_t	dst_addr;
-		uint64_t	rdback_addr2;
-		uint64_t	src2_addr;
-		uint64_t	comp_pattern;
+		__u64	dst_addr;
+		__u64	rdback_addr2;
+		__u64	src2_addr;
+		__u64	comp_pattern;
 	};
 	union {
-		uint32_t	xfer_size;
-		uint32_t	desc_count;
-		uint32_t	region_size;
+		__u32	xfer_size;
+		__u32	desc_count;
+		__u32	region_size;
 	};
-	uint16_t	int_handle;
-	uint16_t	rsvd1;
+	__u16	int_handle;
+	__u16	rsvd1;
 	union {
-		uint8_t		expected_res;
+		__u8		expected_res;
 		/* create delta record */
 		struct {
-			uint64_t	delta_addr;
-			uint32_t	max_delta_size;
-			uint32_t 	delt_rsvd;
-			uint8_t 	expected_res_mask;
+			__u64	delta_addr;
+			__u32	max_delta_size;
+			__u32	delt_rsvd;
+			__u8	expected_res_mask;
 		};
-		uint32_t	delta_rec_size;
-		uint64_t	dest2;
+		__u32	delta_rec_size;
+		__u64	dest2;
 		/* CRC */
 		struct {
-			uint32_t	crc_seed;
-			uint32_t	crc_rsvd;
-			uint64_t	seed_addr;
+			__u32	crc_seed;
+			__u32	crc_rsvd;
+			__u64	seed_addr;
 		};
 		/* DIF check or strip */
 		struct {
-			uint8_t		src_dif_flags;
-			uint8_t		dif_chk_res;
-			uint8_t		dif_chk_flags;
-			uint8_t		dif_chk_res2[5];
-			uint32_t	chk_ref_tag_seed;
-			uint16_t	chk_app_tag_mask;
-			uint16_t	chk_app_tag_seed;
+			__u8	src_dif_flags;
+			__u8	dif_chk_res;
+			__u8	dif_chk_flags;
+			__u8	dif_chk_res2[5];
+			__u32	chk_ref_tag_seed;
+			__u16	chk_app_tag_mask;
+			__u16	chk_app_tag_seed;
 		};
 		/* DIF insert */
 		struct {
-			uint8_t		dif_ins_res;
-			uint8_t		dest_dif_flag;
-			uint8_t		dif_ins_flags;
-			uint8_t		dif_ins_res2[13];
-			uint32_t	ins_ref_tag_seed;
-			uint16_t	ins_app_tag_mask;
-			uint16_t	ins_app_tag_seed;
+			__u8	dif_ins_res;
+			__u8	dest_dif_flag;
+			__u8	dif_ins_flags;
+			__u8	dif_ins_res2[13];
+			__u32	ins_ref_tag_seed;
+			__u16	ins_app_tag_mask;
+			__u16	ins_app_tag_seed;
 		};
 		/* DIF update */
 		struct {
-			uint8_t		src_upd_flags;
-			uint8_t		upd_dest_flags;
-			uint8_t		dif_upd_flags;
-			uint8_t		dif_upd_res[5];
-			uint32_t	src_ref_tag_seed;
-			uint16_t	src_app_tag_mask;
-			uint16_t	src_app_tag_seed;
-			uint32_t	dest_ref_tag_seed;
-			uint16_t	dest_app_tag_mask;
-			uint16_t	dest_app_tag_seed;
+			__u8	src_upd_flags;
+			__u8	upd_dest_flags;
+			__u8	dif_upd_flags;
+			__u8	dif_upd_res[5];
+			__u32	src_ref_tag_seed;
+			__u16	src_app_tag_mask;
+			__u16	src_app_tag_seed;
+			__u32	dest_ref_tag_seed;
+			__u16	dest_app_tag_mask;
+			__u16	dest_app_tag_seed;
 		};
 
 		/* Fill */
-		uint64_t	pattern_upper;
+		__u64	pattern_upper;
 
 		/* Translation fetch */
 		struct {
-			uint64_t	transl_fetch_res;
-			uint32_t	region_stride;
+			__u64	transl_fetch_res;
+			__u32	region_stride;
 		};
 
 		/* DIX generate */
 		struct {
-			uint8_t		dix_gen_res;
-			uint8_t		dest_dif_flags;
-			uint8_t		dif_flags;
-			uint8_t		dix_gen_res2[13];
-			uint32_t	ref_tag_seed;
-			uint16_t	app_tag_mask;
-			uint16_t	app_tag_seed;
+			__u8	dix_gen_res;
+			__u8	dest_dif_flags;
+			__u8	dif_flags;
+			__u8	dix_gen_res2[13];
+			__u32	ref_tag_seed;
+			__u16	app_tag_mask;
+			__u16	app_tag_seed;
 		};
 
-		uint8_t		op_specific[24];
+		__u8		op_specific[24];
 	};
 } __attribute__((packed));
 
 struct iax_hw_desc {
-	uint32_t        pasid:20;
-	uint32_t        rsvd:11;
-	uint32_t        priv:1;
-	uint32_t        flags:24;
-	uint32_t        opcode:8;
-	uint64_t        completion_addr;
-	uint64_t        src1_addr;
-	uint64_t        dst_addr;
-	uint32_t        src1_size;
-	uint16_t        int_handle;
+	__u32        pasid:20;
+	__u32        rsvd:11;
+	__u32        priv:1;
+	__u32        flags:24;
+	__u32        opcode:8;
+	__u64        completion_addr;
+	__u64        src1_addr;
+	__u64        dst_addr;
+	__u32        src1_size;
+	__u16        int_handle;
 	union {
-		uint16_t        compr_flags;
-		uint16_t        decompr_flags;
+		__u16        compr_flags;
+		__u16        decompr_flags;
 	};
-	uint64_t        src2_addr;
-	uint32_t        max_dst_size;
-	uint32_t        src2_size;
-	uint32_t	filter_flags;
-	uint32_t	num_inputs;
+	__u64	src2_addr;
+	__u32	max_dst_size;
+	__u32	src2_size;
+	__u32	filter_flags;
+	__u32	num_inputs;
 } __attribute__((packed));
 
 struct dsa_raw_desc {
-	uint64_t	field[8];
+	__u64	field[8];
 } __attribute__((packed));
 
 /*
@@ -309,91 +305,91 @@ struct dsa_raw_desc {
  * volatile and prevent the compiler from optimize the read.
  */
 struct dsa_completion_record {
-	volatile uint8_t	status;
+	volatile __u8	status;
 	union {
-		uint8_t		result;
-		uint8_t		dif_status;
+		__u8		result;
+		__u8		dif_status;
 	};
-	uint8_t			fault_info;
-	uint8_t			rsvd;
+	__u8			fault_info;
+	__u8			rsvd;
 	union {
-		uint32_t		bytes_completed;
-		uint32_t		descs_completed;
+		__u32		bytes_completed;
+		__u32		descs_completed;
 	};
-	uint64_t		fault_addr;
+	__u64		fault_addr;
 	union {
 		/* common record */
 		struct {
-			uint32_t	invalid_flags:24;
-			uint32_t	rsvd2:8;
+			__u32	invalid_flags:24;
+			__u32	rsvd2:8;
 		};
 
-		uint32_t	delta_rec_size;
-		uint64_t	crc_val;
+		__u32	delta_rec_size;
+		__u64	crc_val;
 
 		/* DIF check & strip */
 		struct {
-			uint32_t	dif_chk_ref_tag;
-			uint16_t	dif_chk_app_tag_mask;
-			uint16_t	dif_chk_app_tag;
+			__u32	dif_chk_ref_tag;
+			__u16	dif_chk_app_tag_mask;
+			__u16	dif_chk_app_tag;
 		};
 
 		/* DIF insert */
 		struct {
-			uint64_t	dif_ins_res;
-			uint32_t	dif_ins_ref_tag;
-			uint16_t	dif_ins_app_tag_mask;
-			uint16_t	dif_ins_app_tag;
+			__u64	dif_ins_res;
+			__u32	dif_ins_ref_tag;
+			__u16	dif_ins_app_tag_mask;
+			__u16	dif_ins_app_tag;
 		};
 
 		/* DIF update */
 		struct {
-			uint32_t	dif_upd_src_ref_tag;
-			uint16_t	dif_upd_src_app_tag_mask;
-			uint16_t	dif_upd_src_app_tag;
-			uint32_t	dif_upd_dest_ref_tag;
-			uint16_t	dif_upd_dest_app_tag_mask;
-			uint16_t	dif_upd_dest_app_tag;
+			__u32	dif_upd_src_ref_tag;
+			__u16	dif_upd_src_app_tag_mask;
+			__u16	dif_upd_src_app_tag;
+			__u32	dif_upd_dest_ref_tag;
+			__u16	dif_upd_dest_app_tag_mask;
+			__u16	dif_upd_dest_app_tag;
 		};
 
 		/* DIX generate */
 		struct {
-			uint64_t	dix_gen_res;
-			uint32_t	dix_ref_tag;
-			uint16_t	dix_app_tag_mask;
-			uint16_t	dix_app_tag;
+			__u64	dix_gen_res;
+			__u32	dix_ref_tag;
+			__u16	dix_app_tag_mask;
+			__u16	dix_app_tag;
 		};
 
-		uint8_t		op_specific[16];
+		__u8		op_specific[16];
 	};
 } __attribute__((packed));
 
 struct dsa_raw_completion_record {
-	uint64_t	field[4];
+	__u64	field[4];
 } __attribute__((packed));
 
 struct iax_completion_record {
-	volatile uint8_t        status;
-	uint8_t                 error_code;
-	uint8_t			fault_info;
-	uint8_t			rsvd;
-	uint32_t                bytes_completed;
-	uint64_t                fault_addr;
-	uint32_t                invalid_flags;
-	uint32_t                rsvd2;
-	uint32_t                output_size;
-	uint8_t                 output_bits;
-	uint8_t                 rsvd3;
-	uint16_t                xor_csum;
-	uint32_t                crc;
-	uint32_t                min;
-	uint32_t                max;
-	uint32_t                sum;
-	uint64_t                rsvd4[2];
+	volatile __u8        status;
+	__u8                 error_code;
+	__u8		     fault_info;
+	__u8		     rsvd;
+	__u32                bytes_completed;
+	__u64                fault_addr;
+	__u32                invalid_flags;
+	__u32                rsvd2;
+	__u32                output_size;
+	__u8                 output_bits;
+	__u8                 rsvd3;
+	__u16                xor_csum;
+	__u32                crc;
+	__u32                min;
+	__u32                max;
+	__u32                sum;
+	__u64                rsvd4[2];
 } __attribute__((packed));
 
 struct iax_raw_completion_record {
-	uint64_t	field[8];
+	__u64	field[8];
 } __attribute__((packed));
 
 #endif
-- 
cgit v1.2.3


From 9e541b3cee70a3bbe86b176c903c23b29fe033cd Mon Sep 17 00:00:00 2001
From: Shuai Xue <xueshuai@linux.alibaba.com>
Date: Wed, 10 Dec 2025 21:29:05 +0800
Subject: PCI: trace: Add generic RAS tracepoint for hotplug event
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Hotplug events are critical indicators for analyzing hardware health, and
surprise link downs can significantly impact system performance and
reliability.

Define a new TRACING_SYSTEM named "pci", add a generic RAS tracepoint
for hotplug event to help health checks. Add enum pci_hotplug_event in
include/uapi/linux/pci.h so applications like rasdaemon can register
tracepoint event handlers for it.

The following output is generated when a device is hotplugged:

  $ echo 1 > /sys/kernel/debug/tracing/events/pci/pci_hp_event/enable
  $ cat /sys/kernel/debug/tracing/trace_pipe
     irq/51-pciehp-88      [001] .....  1311.177459: pci_hp_event: 0000:00:02.0 slot:10, event:CARD_PRESENT

     irq/51-pciehp-88      [001] .....  1311.177566: pci_hp_event: 0000:00:02.0 slot:10, event:LINK_UP

Suggested-by: Lukas Wunner <lukas@wunner.de>
Signed-off-by: Shuai Xue <xueshuai@linux.alibaba.com>
Signed-off-by: Bjorn Helgaas <bhelgaas@google.com>
Reviewed-by: Lukas Wunner <lukas@wunner.de>
Reviewed-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
Reviewed-by: Steven Rostedt (Google) <rostedt@goodmis.org> # for trace event
Reviewed-by: Ilpo Järvinen <ilpo.jarvinen@linux.intel.com>
Link: https://patch.msgid.link/20251210132907.58799-2-xueshuai@linux.alibaba.com
---
 drivers/pci/Makefile              |  3 ++
 drivers/pci/hotplug/pciehp_ctrl.c | 31 +++++++++++++----
 drivers/pci/trace.c               | 11 ++++++
 include/trace/events/pci.h        | 72 +++++++++++++++++++++++++++++++++++++++
 include/uapi/linux/pci.h          |  7 ++++
 5 files changed, 118 insertions(+), 6 deletions(-)
 create mode 100644 drivers/pci/trace.c
 create mode 100644 include/trace/events/pci.h

(limited to 'include/uapi/linux')

diff --git a/drivers/pci/Makefile b/drivers/pci/Makefile
index e10cfe5a280b..8c259a9a8796 100644
--- a/drivers/pci/Makefile
+++ b/drivers/pci/Makefile
@@ -47,3 +47,6 @@ obj-y				+= controller/
 obj-y				+= switch/
 
 subdir-ccflags-$(CONFIG_PCI_DEBUG) := -DDEBUG
+
+CFLAGS_trace.o := -I$(src)
+obj-$(CONFIG_TRACING)		+= trace.o
diff --git a/drivers/pci/hotplug/pciehp_ctrl.c b/drivers/pci/hotplug/pciehp_ctrl.c
index bcc938d4420f..7805f697a02c 100644
--- a/drivers/pci/hotplug/pciehp_ctrl.c
+++ b/drivers/pci/hotplug/pciehp_ctrl.c
@@ -19,6 +19,7 @@
 #include <linux/types.h>
 #include <linux/pm_runtime.h>
 #include <linux/pci.h>
+#include <trace/events/pci.h>
 
 #include "../pci.h"
 #include "pciehp.h"
@@ -244,12 +245,20 @@ void pciehp_handle_presence_or_link_change(struct controller *ctrl, u32 events)
 	case ON_STATE:
 		ctrl->state = POWEROFF_STATE;
 		mutex_unlock(&ctrl->state_lock);
-		if (events & PCI_EXP_SLTSTA_DLLSC)
+		if (events & PCI_EXP_SLTSTA_DLLSC) {
 			ctrl_info(ctrl, "Slot(%s): Link Down\n",
 				  slot_name(ctrl));
-		if (events & PCI_EXP_SLTSTA_PDC)
+			trace_pci_hp_event(pci_name(ctrl->pcie->port),
+					   slot_name(ctrl),
+					   PCI_HOTPLUG_LINK_DOWN);
+		}
+		if (events & PCI_EXP_SLTSTA_PDC) {
 			ctrl_info(ctrl, "Slot(%s): Card not present\n",
 				  slot_name(ctrl));
+			trace_pci_hp_event(pci_name(ctrl->pcie->port),
+					   slot_name(ctrl),
+					   PCI_HOTPLUG_CARD_NOT_PRESENT);
+		}
 		pciehp_disable_slot(ctrl, SURPRISE_REMOVAL);
 		break;
 	default:
@@ -269,6 +278,9 @@ void pciehp_handle_presence_or_link_change(struct controller *ctrl, u32 events)
 					      INDICATOR_NOOP);
 			ctrl_info(ctrl, "Slot(%s): Card not present\n",
 				  slot_name(ctrl));
+			trace_pci_hp_event(pci_name(ctrl->pcie->port),
+					   slot_name(ctrl),
+					   PCI_HOTPLUG_CARD_NOT_PRESENT);
 		}
 		mutex_unlock(&ctrl->state_lock);
 		return;
@@ -281,12 +293,19 @@ void pciehp_handle_presence_or_link_change(struct controller *ctrl, u32 events)
 	case OFF_STATE:
 		ctrl->state = POWERON_STATE;
 		mutex_unlock(&ctrl->state_lock);
-		if (present)
+		if (present) {
 			ctrl_info(ctrl, "Slot(%s): Card present\n",
 				  slot_name(ctrl));
-		if (link_active)
-			ctrl_info(ctrl, "Slot(%s): Link Up\n",
-				  slot_name(ctrl));
+			trace_pci_hp_event(pci_name(ctrl->pcie->port),
+					   slot_name(ctrl),
+					   PCI_HOTPLUG_CARD_PRESENT);
+		}
+		if (link_active) {
+			ctrl_info(ctrl, "Slot(%s): Link Up\n", slot_name(ctrl));
+			trace_pci_hp_event(pci_name(ctrl->pcie->port),
+					   slot_name(ctrl),
+					   PCI_HOTPLUG_LINK_UP);
+		}
 		ctrl->request_result = pciehp_enable_slot(ctrl);
 		break;
 	default:
diff --git a/drivers/pci/trace.c b/drivers/pci/trace.c
new file mode 100644
index 000000000000..cf11abca8602
--- /dev/null
+++ b/drivers/pci/trace.c
@@ -0,0 +1,11 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Tracepoints for PCI system
+ *
+ * Copyright (C) 2025 Alibaba Corporation
+ */
+
+#include <linux/pci.h>
+
+#define CREATE_TRACE_POINTS
+#include <trace/events/pci.h>
diff --git a/include/trace/events/pci.h b/include/trace/events/pci.h
new file mode 100644
index 000000000000..39e512a167ee
--- /dev/null
+++ b/include/trace/events/pci.h
@@ -0,0 +1,72 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM pci
+
+#if !defined(_TRACE_HW_EVENT_PCI_H) || defined(TRACE_HEADER_MULTI_READ)
+#define _TRACE_HW_EVENT_PCI_H
+
+#include <linux/tracepoint.h>
+
+#define PCI_HOTPLUG_EVENT						\
+	EM(PCI_HOTPLUG_LINK_UP,			"LINK_UP")		\
+	EM(PCI_HOTPLUG_LINK_DOWN,		"LINK_DOWN")		\
+	EM(PCI_HOTPLUG_CARD_PRESENT,		"CARD_PRESENT")		\
+	EMe(PCI_HOTPLUG_CARD_NOT_PRESENT,	"CARD_NOT_PRESENT")
+
+/* Enums require being exported to userspace, for user tool parsing */
+#undef EM
+#undef EMe
+#define EM(a, b)	TRACE_DEFINE_ENUM(a);
+#define EMe(a, b)	TRACE_DEFINE_ENUM(a);
+
+PCI_HOTPLUG_EVENT
+
+/*
+ * Now redefine the EM() and EMe() macros to map the enums to the strings
+ * that will be printed in the output.
+ */
+#undef EM
+#undef EMe
+#define EM(a, b)	{a, b},
+#define EMe(a, b)	{a, b}
+
+/*
+ * Note: For generic PCI hotplug events, we pass already-resolved strings
+ * (port_name, slot) instead of driver-specific structures like 'struct
+ * controller'.  This is because different PCI hotplug drivers (pciehp, cpqphp,
+ * ibmphp, shpchp) define their own versions of 'struct controller' with
+ * different fields and helper functions. Using driver-specific structures would
+ * make the tracepoint interface non-generic and cause compatibility issues
+ * across different drivers.
+ *
+ * The TP_printk() format must not end in "\n": the trace output already ends
+ * each event with a newline, so a trailing "\n" adds a blank line per event.
+ */
+TRACE_EVENT(pci_hp_event,
+
+	TP_PROTO(const char *port_name, const char *slot, const int event),
+
+	TP_ARGS(port_name, slot, event),
+
+	TP_STRUCT__entry(
+		__string(	port_name,	port_name	)
+		__string(	slot,		slot		)
+		__field(	int,		event	)
+	),
+
+	TP_fast_assign(
+		__assign_str(port_name);
+		__assign_str(slot);
+		__entry->event = event;
+	),
+
+	TP_printk("%s slot:%s, event:%s",
+		__get_str(port_name), __get_str(slot),
+		__print_symbolic(__entry->event, PCI_HOTPLUG_EVENT)
+	)
+);
+
+#endif /* _TRACE_HW_EVENT_PCI_H */
+
+/* This part must be outside protection */
+#include <trace/define_trace.h>
diff --git a/include/uapi/linux/pci.h b/include/uapi/linux/pci.h
index a769eefc5139..4f150028965d 100644
--- a/include/uapi/linux/pci.h
+++ b/include/uapi/linux/pci.h
@@ -39,4 +39,11 @@
 #define PCIIOC_MMAP_IS_MEM	(PCIIOC_BASE | 0x02)	/* Set mmap state to MEM space. */
 #define PCIIOC_WRITE_COMBINE	(PCIIOC_BASE | 0x03)	/* Enable/disable write-combining. */
 
+enum pci_hotplug_event {
+	PCI_HOTPLUG_LINK_UP,
+	PCI_HOTPLUG_LINK_DOWN,
+	PCI_HOTPLUG_CARD_PRESENT,
+	PCI_HOTPLUG_CARD_NOT_PRESENT,
+};
+
 #endif /* _UAPILINUX_PCI_H */
-- 
cgit v1.2.3


From 3c4629b68dbe18e454cce4b864c530268cffbeed Mon Sep 17 00:00:00 2001
From: Thomas Weißschuh <thomas.weissschuh@linutronix.de>
Date: Mon, 22 Dec 2025 09:00:33 +0100
Subject: virtio: uapi: avoid usage of libc types
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Using libc types and headers from the UAPI headers is problematic as it
introduces a dependency on a full C toolchain.

On Linux 'unsigned long' works as a replacement for 'uintptr_t' and does
not depend on libc.

Signed-off-by: Thomas Weißschuh <thomas.weissschuh@linutronix.de>
Acked-by: Arnd Bergmann <arnd@arndb.de>
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
Message-Id: <20251222-uapi-virtio-v1-1-29390f87bcad@linutronix.de>
---
 include/uapi/linux/virtio_ring.h | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/virtio_ring.h b/include/uapi/linux/virtio_ring.h
index f8c20d3de8da..3c478582a3c2 100644
--- a/include/uapi/linux/virtio_ring.h
+++ b/include/uapi/linux/virtio_ring.h
@@ -31,9 +31,6 @@
  * SUCH DAMAGE.
  *
  * Copyright Rusty Russell IBM Corporation 2007. */
-#ifndef __KERNEL__
-#include <stdint.h>
-#endif
 #include <linux/types.h>
 #include <linux/virtio_types.h>
 
@@ -202,7 +199,7 @@ static inline void vring_init(struct vring *vr, unsigned int num, void *p,
 	vr->num = num;
 	vr->desc = p;
 	vr->avail = (struct vring_avail *)((char *)p + num * sizeof(struct vring_desc));
-	vr->used = (void *)(((uintptr_t)&vr->avail->ring[num] + sizeof(__virtio16)
+	vr->used = (void *)(((unsigned long)&vr->avail->ring[num] + sizeof(__virtio16)
 		+ align-1) & ~(align - 1));
 }
 
-- 
cgit v1.2.3


From 40fc797ba18328e57ed1cb213b4b5e48f86f4c7c Mon Sep 17 00:00:00 2001
From: Carlos Llamas <cmllamas@google.com>
Date: Mon, 15 Dec 2025 18:17:09 +0000
Subject: binder: fix trivial typo in uapi header

As reported by codespell:

  include/uapi/linux/android/binder.h:281: interupted ==> interrupted

Signed-off-by: Carlos Llamas <cmllamas@google.com>
Reviewed-by: Alice Ryhl <aliceryhl@google.com>
Link: https://patch.msgid.link/20251215181724.3811977-1-cmllamas@google.com
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 include/uapi/linux/android/binder.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/android/binder.h b/include/uapi/linux/android/binder.h
index 03ee4c7010d7..701cad36de43 100644
--- a/include/uapi/linux/android/binder.h
+++ b/include/uapi/linux/android/binder.h
@@ -278,7 +278,7 @@ enum {
  * NOTE: Two special error codes you should check for when calling
  * in to the driver are:
  *
- * EINTR -- The operation has been interupted.  This should be
+ * EINTR -- The operation has been interrupted.  This should be
  * handled by retrying the ioctl() until a different error code
  * is returned.
  *
-- 
cgit v1.2.3


From c51bb53d5c68041dd02f66d9b638cda33647623e Mon Sep 17 00:00:00 2001
From: David Yat Sin <David.YatSin@amd.com>
Date: Tue, 18 Mar 2025 19:49:55 +0000
Subject: drm/amdkfd: Add metadata ring buffer for compute

Add support for separate ring-buffer for metadata packets when using
compute queues. Userspace applications allocate the metadata ring-buffer
and the queue ring-buffer with a single allocation. The metadata
ring-buffer starts after the queue ring-buffer.

Signed-off-by: David Yat Sin <David.YatSin@amd.com>
Reviewed-by: Philip Yang <Philip.Yang@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
---
 drivers/gpu/drm/amd/amdkfd/kfd_chardev.c           |  8 ++++++++
 drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v12_1.c | 21 +++++++++++++++++++++
 drivers/gpu/drm/amd/amdkfd/kfd_priv.h              |  3 ++-
 drivers/gpu/drm/amd/amdkfd/kfd_queue.c             |  7 +++++--
 include/uapi/linux/kfd_ioctl.h                     |  5 +++--
 5 files changed, 39 insertions(+), 5 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
index 041237861107..88621cb7d409 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
@@ -221,6 +221,11 @@ static int set_queue_properties_from_user(struct queue_properties *q_properties,
 		pr_debug("Size lower. clamped to KFD_MIN_QUEUE_RING_SIZE");
 	}
 
+	if ((args->metadata_ring_size != 0) && !is_power_of_2(args->metadata_ring_size)) {
+		pr_err("Metadata ring size must be a power of 2 or 0\n");
+		return -EINVAL;
+	}
+
 	if (!access_ok((const void __user *) args->read_pointer_address,
 			sizeof(uint32_t))) {
 		pr_err("Can't access read pointer\n");
@@ -255,6 +260,9 @@ static int set_queue_properties_from_user(struct queue_properties *q_properties,
 	q_properties->priority = args->queue_priority;
 	q_properties->queue_address = args->ring_base_address;
 	q_properties->queue_size = args->ring_size;
+	if (args->queue_type == KFD_IOC_QUEUE_TYPE_COMPUTE_AQL)
+		q_properties->metadata_queue_size = args->metadata_ring_size;
+
 	q_properties->read_ptr = (void __user *)args->read_pointer_address;
 	q_properties->write_ptr = (void __user *)args->write_pointer_address;
 	q_properties->eop_ring_buffer_address = args->eop_buffer_address;
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v12_1.c b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v12_1.c
index f1c2c9e8cf6b..a06b4e89af8a 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v12_1.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v12_1.c
@@ -266,6 +266,27 @@ static void update_mqd(struct mqd_manager *mm, void *mqd,
 	m->cp_hqd_pq_base_lo = lower_32_bits((uint64_t)q->queue_address >> 8);
 	m->cp_hqd_pq_base_hi = upper_32_bits((uint64_t)q->queue_address >> 8);
 
+	if (q->metadata_queue_size) {
+		/* On GC 12.1 a metadata packet is 64 DWs, 4 times the size of an AQL packet */
+		if (q->metadata_queue_size == q->queue_size * 4) {
+			/*
+			 * User application allocates main queue ring and metadata queue ring
+			 * with a single allocation. metadata queue ring starts after main
+			 * queue ring.
+			 */
+			m->cp_hqd_kd_base =
+				lower_32_bits((q->queue_address + q->queue_size) >> 8);
+			m->cp_hqd_kd_base_hi =
+				upper_32_bits((q->queue_address + q->queue_size) >> 8);
+
+			m->cp_hqd_kd_cntl |= CP_HQD_KD_CNTL__KD_FETCHER_ENABLE_MASK;
+			/* KD_SIZE = 2 for metadata packet = 64 DWs */
+			m->cp_hqd_kd_cntl |= 2 << CP_HQD_KD_CNTL__KD_SIZE__SHIFT;
+		} else {
+			pr_warn("Invalid metadata ring size, metadata queue will be ignored\n");
+		}
+	}
+
 	m->cp_hqd_pq_rptr_report_addr_lo = lower_32_bits((uint64_t)q->read_ptr);
 	m->cp_hqd_pq_rptr_report_addr_hi = upper_32_bits((uint64_t)q->read_ptr);
 	m->cp_hqd_pq_wptr_poll_addr_lo = lower_32_bits((uint64_t)q->write_ptr);
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
index ebc637c38c04..d798baa7e52e 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
@@ -506,7 +506,8 @@ struct queue_properties {
 	enum kfd_queue_format format;
 	unsigned int queue_id;
 	uint64_t queue_address;
-	uint64_t  queue_size;
+	uint64_t queue_size;
+	uint64_t metadata_queue_size;
 	uint32_t priority;
 	uint32_t queue_percent;
 	void __user *read_ptr;
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_queue.c b/drivers/gpu/drm/amd/amdkfd/kfd_queue.c
index 56c97189e7f1..1b465fdb2c64 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_queue.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_queue.c
@@ -247,9 +247,12 @@ int kfd_queue_acquire_buffers(struct kfd_process_device *pdd, struct queue_prope
 	    properties->format == KFD_QUEUE_FORMAT_AQL &&
 	    topo_dev->node_props.gfx_target_version >= 70000 &&
 	    topo_dev->node_props.gfx_target_version < 90000)
-		expected_queue_size = properties->queue_size / 2;
+		/* metadata_queue_size not supported on GFX7/GFX8 */
+		expected_queue_size =
+			properties->queue_size / 2;
 	else
-		expected_queue_size = properties->queue_size;
+		expected_queue_size =
+			properties->queue_size + properties->metadata_queue_size;
 
 	vm = drm_priv_to_vm(pdd->drm_priv);
 	err = amdgpu_bo_reserve(vm->root.bo, false);
diff --git a/include/uapi/linux/kfd_ioctl.h b/include/uapi/linux/kfd_ioctl.h
index 6e91875c10ba..047bcb1cc078 100644
--- a/include/uapi/linux/kfd_ioctl.h
+++ b/include/uapi/linux/kfd_ioctl.h
@@ -47,9 +47,10 @@
  * - 1.19 - Add a new ioctl to craete secondary kfd processes
  * - 1.20 - Trap handler support for expert scheduling mode available
  * - 1.21 - Debugger support to subscribe to LDS out-of-address exceptions
+ * - 1.22 - Add queue creation with metadata ring base address
  */
 #define KFD_IOCTL_MAJOR_VERSION 1
-#define KFD_IOCTL_MINOR_VERSION 21
+#define KFD_IOCTL_MINOR_VERSION 22
 
 struct kfd_ioctl_get_version_args {
 	__u32 major_version;	/* from KFD */
@@ -87,7 +88,7 @@ struct kfd_ioctl_create_queue_args {
 	__u32 ctx_save_restore_size;	/* to KFD */
 	__u32 ctl_stack_size;		/* to KFD */
 	__u32 sdma_engine_id;		/* to KFD */
-	__u32 pad;
+	__u32 metadata_ring_size;	/* to KFD */
 };
 
 struct kfd_ioctl_destroy_queue_args {
-- 
cgit v1.2.3


From 22cd0db47f4f65ebe8afc8c34ab120c47c73da2a Mon Sep 17 00:00:00 2001
From: Jacopo Mondi <jacopo.mondi@ideasonboard.com>
Date: Mon, 15 Dec 2025 13:08:12 +0100
Subject: media: uapi: mali-c55-config: Remove version identifier

The Mali C55 driver uses the v4l2-isp framework, which defines its own
versioning number which does not need to be defined again in each
platform-specific header.

Remove the definition of mali_c55_param_buffer_version enumeration from
the Mali C55 uAPI header.

Signed-off-by: Jacopo Mondi <jacopo.mondi@ideasonboard.com>
Reviewed-by: Daniel Scally <dan.scally@ideasonboard.com>
Signed-off-by: Hans Verkuil <hverkuil+cisco@kernel.org>
---
 include/uapi/linux/media/arm/mali-c55-config.h | 9 ---------
 1 file changed, 9 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/media/arm/mali-c55-config.h b/include/uapi/linux/media/arm/mali-c55-config.h
index 109082c5694f..3d335f950eeb 100644
--- a/include/uapi/linux/media/arm/mali-c55-config.h
+++ b/include/uapi/linux/media/arm/mali-c55-config.h
@@ -194,15 +194,6 @@ struct mali_c55_stats_buffer {
 	__u32 reserved3[15];
 } __attribute__((packed));
 
-/**
- * enum mali_c55_param_buffer_version - Mali-C55 parameters block versioning
- *
- * @MALI_C55_PARAM_BUFFER_V1: First version of Mali-C55 parameters block
- */
-enum mali_c55_param_buffer_version {
-	MALI_C55_PARAM_BUFFER_V1,
-};
-
 /**
  * enum mali_c55_param_block_type - Enumeration of Mali-C55 parameter blocks
  *
-- 
cgit v1.2.3


From 2b421662c7887a0649fe409155a1f101562d0fa9 Mon Sep 17 00:00:00 2001
From: Leon Hwang <leon.hwang@linux.dev>
Date: Wed, 7 Jan 2026 10:20:16 +0800
Subject: bpf: Introduce BPF_F_CPU and BPF_F_ALL_CPUS flags

Introduce BPF_F_CPU and BPF_F_ALL_CPUS flags and check them for the
following APIs:

* 'map_lookup_elem()'
* 'map_update_elem()'
* 'generic_map_lookup_batch()'
* 'generic_map_update_batch()'

And, get the correct value size for these APIs.

Acked-by: Andrii Nakryiko <andrii@kernel.org>
Signed-off-by: Leon Hwang <leon.hwang@linux.dev>
Link: https://lore.kernel.org/r/20260107022022.12843-2-leon.hwang@linux.dev
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/bpf.h            | 23 ++++++++++++++++++++++-
 include/uapi/linux/bpf.h       |  2 ++
 kernel/bpf/syscall.c           | 31 +++++++++++++++++--------------
 tools/include/uapi/linux/bpf.h |  2 ++
 4 files changed, 43 insertions(+), 15 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index a63e47d2109c..108bab1bda9d 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -3915,14 +3915,35 @@ bpf_prog_update_insn_ptrs(struct bpf_prog *prog, u32 *offsets, void *image)
 }
 #endif
 
+static inline bool bpf_map_supports_cpu_flags(enum bpf_map_type map_type)
+{
+	return false;
+}
+
 static inline int bpf_map_check_op_flags(struct bpf_map *map, u64 flags, u64 allowed_flags)
 {
-	if (flags & ~allowed_flags)
+	u32 cpu;
+
+	if ((u32)flags & ~allowed_flags)
 		return -EINVAL;
 
 	if ((flags & BPF_F_LOCK) && !btf_record_has_field(map->record, BPF_SPIN_LOCK))
 		return -EINVAL;
 
+	if (!(flags & BPF_F_CPU) && flags >> 32)
+		return -EINVAL;
+
+	if (flags & (BPF_F_CPU | BPF_F_ALL_CPUS)) {
+		if (!bpf_map_supports_cpu_flags(map->map_type))
+			return -EINVAL;
+		if ((flags & BPF_F_CPU) && (flags & BPF_F_ALL_CPUS))
+			return -EINVAL;
+
+		cpu = flags >> 32;
+		if ((flags & BPF_F_CPU) && cpu >= num_possible_cpus())
+			return -ERANGE;
+	}
+
 	return 0;
 }
 
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 84ced3ed2d21..2a2ade4be60f 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -1384,6 +1384,8 @@ enum {
 	BPF_NOEXIST	= 1, /* create new element if it didn't exist */
 	BPF_EXIST	= 2, /* update existing element */
 	BPF_F_LOCK	= 4, /* spin_lock-ed map_lookup/map_update */
+	BPF_F_CPU	= 8, /* cpu flag for percpu maps, upper 32-bit of flags is a cpu number */
+	BPF_F_ALL_CPUS	= 16, /* update value across all CPUs for percpu maps */
 };
 
 /* flags for BPF_MAP_CREATE command */
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 6dd2ad2f9e81..e8cfe9d67e64 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -133,12 +133,14 @@ bool bpf_map_write_active(const struct bpf_map *map)
 	return atomic64_read(&map->writecnt) != 0;
 }
 
-static u32 bpf_map_value_size(const struct bpf_map *map)
-{
-	if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH ||
-	    map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH ||
-	    map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY ||
-	    map->map_type == BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE)
+static u32 bpf_map_value_size(const struct bpf_map *map, u64 flags)
+{
+	if (flags & (BPF_F_CPU | BPF_F_ALL_CPUS))
+		return map->value_size;
+	else if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH ||
+		 map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH ||
+		 map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY ||
+		 map->map_type == BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE)
 		return round_up(map->value_size, 8) * num_possible_cpus();
 	else if (IS_FD_MAP(map))
 		return sizeof(u32);
@@ -1729,7 +1731,7 @@ static int map_lookup_elem(union bpf_attr *attr)
 	if (!(map_get_sys_perms(map, f) & FMODE_CAN_READ))
 		return -EPERM;
 
-	err = bpf_map_check_op_flags(map, attr->flags, BPF_F_LOCK);
+	err = bpf_map_check_op_flags(map, attr->flags, BPF_F_LOCK | BPF_F_CPU);
 	if (err)
 		return err;
 
@@ -1737,7 +1739,7 @@ static int map_lookup_elem(union bpf_attr *attr)
 	if (IS_ERR(key))
 		return PTR_ERR(key);
 
-	value_size = bpf_map_value_size(map);
+	value_size = bpf_map_value_size(map, attr->flags);
 
 	err = -ENOMEM;
 	value = kvmalloc(value_size, GFP_USER | __GFP_NOWARN);
@@ -1804,7 +1806,7 @@ static int map_update_elem(union bpf_attr *attr, bpfptr_t uattr)
 		goto err_put;
 	}
 
-	value_size = bpf_map_value_size(map);
+	value_size = bpf_map_value_size(map, attr->flags);
 	value = kvmemdup_bpfptr(uvalue, value_size);
 	if (IS_ERR(value)) {
 		err = PTR_ERR(value);
@@ -2000,11 +2002,12 @@ int generic_map_update_batch(struct bpf_map *map, struct file *map_file,
 	void *key, *value;
 	int err = 0;
 
-	err = bpf_map_check_op_flags(map, attr->batch.elem_flags, BPF_F_LOCK);
+	err = bpf_map_check_op_flags(map, attr->batch.elem_flags,
+				     BPF_F_LOCK | BPF_F_CPU | BPF_F_ALL_CPUS);
 	if (err)
 		return err;
 
-	value_size = bpf_map_value_size(map);
+	value_size = bpf_map_value_size(map, attr->batch.elem_flags);
 
 	max_count = attr->batch.count;
 	if (!max_count)
@@ -2059,11 +2062,11 @@ int generic_map_lookup_batch(struct bpf_map *map,
 	u32 value_size, cp, max_count;
 	int err;
 
-	err = bpf_map_check_op_flags(map, attr->batch.elem_flags, BPF_F_LOCK);
+	err = bpf_map_check_op_flags(map, attr->batch.elem_flags, BPF_F_LOCK | BPF_F_CPU);
 	if (err)
 		return err;
 
-	value_size = bpf_map_value_size(map);
+	value_size = bpf_map_value_size(map, attr->batch.elem_flags);
 
 	max_count = attr->batch.count;
 	if (!max_count)
@@ -2185,7 +2188,7 @@ static int map_lookup_and_delete_elem(union bpf_attr *attr)
 		goto err_put;
 	}
 
-	value_size = bpf_map_value_size(map);
+	value_size = bpf_map_value_size(map, 0);
 
 	err = -ENOMEM;
 	value = kvmalloc(value_size, GFP_USER | __GFP_NOWARN);
diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index 6b92b0847ec2..b816bc53d2e1 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -1384,6 +1384,8 @@ enum {
 	BPF_NOEXIST	= 1, /* create new element if it didn't exist */
 	BPF_EXIST	= 2, /* update existing element */
 	BPF_F_LOCK	= 4, /* spin_lock-ed map_lookup/map_update */
+	BPF_F_CPU	= 8, /* cpu flag for percpu maps, upper 32-bit of flags is a cpu number */
+	BPF_F_ALL_CPUS	= 16, /* update value across all CPUs for percpu maps */
 };
 
 /* flags for BPF_MAP_CREATE command */
-- 
cgit v1.2.3


From caa07a815d6ee32586beb66f67e7e3c103a02efd Mon Sep 17 00:00:00 2001
From: Changwoo Min <changwoo@igalia.com>
Date: Thu, 8 Jan 2026 14:32:10 +0900
Subject: PM: EM: Rename em.yaml to dev-energymodel.yaml
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The EM YNL specification used many acronyms, including ‘em’, ‘pd’,
‘ps’, etc. While the acronyms are short and convenient, they could be
confusing. So, let’s spell them out to be more specific. The following
changes were made in the spec. Note that the protocol name cannot exceed
GENL_NAMSIZ (16).

  em           -> dev-energymodel
  pds          -> perf-domains
  pd           -> perf-domain
  pd-id        -> perf-domain-id
  pd-table     -> perf-table
  ps           -> perf-state
  get-pds      -> get-perf-domains
  get-pd-table -> get-perf-table
  pd-created   -> perf-domain-created
  pd-updated   -> perf-domain-updated
  pd-deleted   -> perf-domain-deleted

In addition, doc strings were added to the spec, based on the comments in
energy_model.h. Two flag attributes (perf-state-flags and
perf-domain-flags) were added for easily interpreting the bit flags.

Finally, the autogenerated files and em_netlink.c were updated accordingly
to reflect the name changes.

Suggested-by: Donald Hunter <donald.hunter@gmail.com>
Reviewed-by: Lukasz Luba <lukasz.luba@arm.com>
Reviewed-by: Donald Hunter <donald.hunter@gmail.com>
Signed-off-by: Changwoo Min <changwoo@igalia.com>
Link: https://patch.msgid.link/20260108053212.642478-3-changwoo@igalia.com
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 Documentation/netlink/specs/dev-energymodel.yaml | 175 +++++++++++++++++++++++
 Documentation/netlink/specs/em.yaml              | 116 ---------------
 MAINTAINERS                                      |   8 +-
 include/uapi/linux/dev_energymodel.h             |  89 ++++++++++++
 include/uapi/linux/energy_model.h                |  63 --------
 kernel/power/em_netlink.c                        | 135 ++++++++++-------
 kernel/power/em_netlink_autogen.c                |  44 +++---
 kernel/power/em_netlink_autogen.h                |  20 +--
 8 files changed, 384 insertions(+), 266 deletions(-)
 create mode 100644 Documentation/netlink/specs/dev-energymodel.yaml
 delete mode 100644 Documentation/netlink/specs/em.yaml
 create mode 100644 include/uapi/linux/dev_energymodel.h
 delete mode 100644 include/uapi/linux/energy_model.h

(limited to 'include/uapi/linux')

diff --git a/Documentation/netlink/specs/dev-energymodel.yaml b/Documentation/netlink/specs/dev-energymodel.yaml
new file mode 100644
index 000000000000..cbc4bc38f23c
--- /dev/null
+++ b/Documentation/netlink/specs/dev-energymodel.yaml
@@ -0,0 +1,175 @@
+# SPDX-License-Identifier: ((GPL-2.0 WITH Linux-syscall-note) OR BSD-3-Clause)
+#
+# Copyright (c) 2025 Valve Corporation.
+#
+---
+name: dev-energymodel
+
+doc: |
+  Energy model netlink interface to notify its changes.
+
+protocol: genetlink
+
+uapi-header: linux/dev_energymodel.h
+
+definitions:
+  -
+    type: flags
+    name: perf-state-flags
+    entries:
+      -
+        name: perf-state-inefficient
+        doc: >-
+          The performance state is inefficient. There is in this perf-domain,
+          another performance state with a higher frequency but a lower or
+          equal power cost.
+  -
+    type: flags
+    name: perf-domain-flags
+    entries:
+      -
+        name: perf-domain-microwatts
+        doc: >-
+          The power values are in micro-Watts or some other scale.
+      -
+        name: perf-domain-skip-inefficiencies
+        doc: >-
+          Skip inefficient states when estimating energy consumption.
+      -
+        name: perf-domain-artificial
+        doc: >-
+          The power values are artificial and might be created by platform
+          missing real power information.
+
+attribute-sets:
+  -
+    name: perf-domains
+    doc: >-
+      Information on all the performance domains.
+    attributes:
+      -
+        name: perf-domain
+        type: nest
+        nested-attributes: perf-domain
+        multi-attr: true
+  -
+    name: perf-domain
+    doc: >-
+      Information on a single performance domain.
+    attributes:
+      -
+        name: pad
+        type: pad
+      -
+        name: perf-domain-id
+        type: u32
+        doc: >-
+          A unique ID number for each performance domain.
+      -
+        name: flags
+        type: u64
+        doc: >-
+          Bitmask of performance domain flags.
+        enum: perf-domain-flags
+      -
+        name: cpus
+        type: string
+        doc: >-
+          CPUs that belong to this performance domain.
+  -
+    name: perf-table
+    doc: >-
+      Performance states table.
+    attributes:
+      -
+        name: perf-domain-id
+        type: u32
+        doc: >-
+          A unique ID number for each performance domain.
+      -
+        name: perf-state
+        type: nest
+        nested-attributes: perf-state
+        multi-attr: true
+  -
+    name: perf-state
+    doc: >-
+      Performance state of a performance domain.
+    attributes:
+      -
+        name: pad
+        type: pad
+      -
+        name: performance
+        type: u64
+        doc: >-
+          CPU performance (capacity) at a given frequency.
+      -
+        name: frequency
+        type: u64
+        doc: >-
+          The frequency in KHz, for consistency with CPUFreq.
+      -
+        name: power
+        type: u64
+        doc: >-
+          The power consumed at this level (by 1 CPU or by a registered
+          device). It can be a total power: static and dynamic.
+      -
+        name: cost
+        type: u64
+        doc: >-
+          The cost coefficient associated with this level, used during energy
+          calculation. Equal to: power * max_frequency / frequency.
+      -
+        name: flags
+        type: u64
+        doc: >-
+          Bitmask of performance state flags.
+        enum: perf-state-flags
+
+operations:
+  list:
+    -
+      name: get-perf-domains
+      attribute-set: perf-domains
+      doc: Get the list of information for all performance domains.
+      do:
+        reply:
+          attributes:
+            - perf-domain
+    -
+      name: get-perf-table
+      attribute-set: perf-table
+      doc: Get the energy model table of a performance domain.
+      do:
+        request:
+          attributes:
+            - perf-domain-id
+        reply:
+          attributes:
+            - perf-domain-id
+            - perf-state
+    -
+      name: perf-domain-created
+      doc: A performance domain is created.
+      notify: get-perf-table
+      mcgrp: event
+    -
+      name: perf-domain-updated
+      doc: A performance domain is updated.
+      notify: get-perf-table
+      mcgrp: event
+    -
+      name: perf-domain-deleted
+      doc: A performance domain is deleted.
+      attribute-set: perf-table
+      event:
+        attributes:
+          - perf-domain-id
+      mcgrp: event
+
+mcast-groups:
+  list:
+    -
+      name: event
diff --git a/Documentation/netlink/specs/em.yaml b/Documentation/netlink/specs/em.yaml
deleted file mode 100644
index 0c595a874f08..000000000000
--- a/Documentation/netlink/specs/em.yaml
+++ /dev/null
@@ -1,116 +0,0 @@
-# SPDX-License-Identifier: ((GPL-2.0 WITH Linux-syscall-note) OR BSD-3-Clause)
-#
-# Copyright (c) 2025 Valve Corporation.
-#
----
-name: em
-
-doc: |
-  Energy model netlink interface to notify its changes.
-
-protocol: genetlink
-
-uapi-header: linux/energy_model.h
-
-attribute-sets:
-  -
-    name: pds
-    attributes:
-      -
-        name: pd
-        type: nest
-        nested-attributes: pd
-        multi-attr: true
-  -
-    name: pd
-    attributes:
-      -
-        name: pad
-        type: pad
-      -
-        name: pd-id
-        type: u32
-      -
-        name: flags
-        type: u64
-      -
-        name: cpus
-        type: string
-  -
-    name: pd-table
-    attributes:
-      -
-        name: pd-id
-        type: u32
-      -
-        name: ps
-        type: nest
-        nested-attributes: ps
-        multi-attr: true
-  -
-    name: ps
-    attributes:
-      -
-        name: pad
-        type: pad
-      -
-        name: performance
-        type: u64
-      -
-        name: frequency
-        type: u64
-      -
-        name: power
-        type: u64
-      -
-        name: cost
-        type: u64
-      -
-        name: flags
-        type: u64
-
-operations:
-  list:
-    -
-      name: get-pds
-      attribute-set: pds
-      doc: Get the list of information for all performance domains.
-      do:
-        reply:
-          attributes:
-            - pd
-    -
-      name: get-pd-table
-      attribute-set: pd-table
-      doc: Get the energy model table of a performance domain.
-      do:
-        request:
-          attributes:
-            - pd-id
-        reply:
-          attributes:
-            - pd-id
-            - ps
-    -
-      name: pd-created
-      doc: A performance domain is created.
-      notify: get-pd-table
-      mcgrp: event
-    -
-      name: pd-updated
-      doc: A performance domain is updated.
-      notify: get-pd-table
-      mcgrp: event
-    -
-      name: pd-deleted
-      doc: A performance domain is deleted.
-      attribute-set: pd-table
-      event:
-        attributes:
-          - pd-id
-      mcgrp: event
-
-mcast-groups:
-  list:
-    -
-      name: event
diff --git a/MAINTAINERS b/MAINTAINERS
index 765ad2daa218..1e208243b28e 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -9298,12 +9298,12 @@ M:	Lukasz Luba <lukasz.luba@arm.com>
 M:	"Rafael J. Wysocki" <rafael@kernel.org>
 L:	linux-pm@vger.kernel.org
 S:	Maintained
-F:	kernel/power/energy_model.c
-F:	include/linux/energy_model.h
+F:	Documentation/netlink/specs/dev-energymodel.yaml
 F:	Documentation/power/energy-model.rst
-F:	Documentation/netlink/specs/em.yaml
-F:	include/uapi/linux/energy_model.h
+F:	include/linux/energy_model.h
+F:	include/uapi/linux/dev_energymodel.h
 F:	kernel/power/em_netlink*.*
+F:	kernel/power/energy_model.c
 
 EPAPR HYPERVISOR BYTE CHANNEL DEVICE DRIVER
 M:	Laurentiu Tudor <laurentiu.tudor@nxp.com>
diff --git a/include/uapi/linux/dev_energymodel.h b/include/uapi/linux/dev_energymodel.h
new file mode 100644
index 000000000000..3399967e1f93
--- /dev/null
+++ b/include/uapi/linux/dev_energymodel.h
@@ -0,0 +1,89 @@
+/* SPDX-License-Identifier: ((GPL-2.0 WITH Linux-syscall-note) OR BSD-3-Clause) */
+/* Do not edit directly, auto-generated from: */
+/*	Documentation/netlink/specs/dev-energymodel.yaml */
+/* YNL-GEN uapi header */
+/* To regenerate run: tools/net/ynl/ynl-regen.sh */
+
+#ifndef _UAPI_LINUX_DEV_ENERGYMODEL_H
+#define _UAPI_LINUX_DEV_ENERGYMODEL_H
+
+#define DEV_ENERGYMODEL_FAMILY_NAME	"dev-energymodel"
+#define DEV_ENERGYMODEL_FAMILY_VERSION	1
+
+/**
+ * enum dev_energymodel_perf_state_flags
+ * @DEV_ENERGYMODEL_PERF_STATE_FLAGS_PERF_STATE_INEFFICIENT: The performance
+ *   state is inefficient. There is in this perf-domain, another performance
+ *   state with a higher frequency but a lower or equal power cost.
+ */
+enum dev_energymodel_perf_state_flags {
+	DEV_ENERGYMODEL_PERF_STATE_FLAGS_PERF_STATE_INEFFICIENT = 1,
+};
+
+/**
+ * enum dev_energymodel_perf_domain_flags
+ * @DEV_ENERGYMODEL_PERF_DOMAIN_FLAGS_PERF_DOMAIN_MICROWATTS: The power values
+ *   are in micro-Watts or some other scale.
+ * @DEV_ENERGYMODEL_PERF_DOMAIN_FLAGS_PERF_DOMAIN_SKIP_INEFFICIENCIES: Skip
+ *   inefficient states when estimating energy consumption.
+ * @DEV_ENERGYMODEL_PERF_DOMAIN_FLAGS_PERF_DOMAIN_ARTIFICIAL: The power values
+ *   are artificial and might be created by platform missing real power
+ *   information.
+ */
+enum dev_energymodel_perf_domain_flags {
+	DEV_ENERGYMODEL_PERF_DOMAIN_FLAGS_PERF_DOMAIN_MICROWATTS = 1,
+	DEV_ENERGYMODEL_PERF_DOMAIN_FLAGS_PERF_DOMAIN_SKIP_INEFFICIENCIES = 2,
+	DEV_ENERGYMODEL_PERF_DOMAIN_FLAGS_PERF_DOMAIN_ARTIFICIAL = 4,
+};
+
+enum {
+	DEV_ENERGYMODEL_A_PERF_DOMAINS_PERF_DOMAIN = 1,
+
+	__DEV_ENERGYMODEL_A_PERF_DOMAINS_MAX,
+	DEV_ENERGYMODEL_A_PERF_DOMAINS_MAX = (__DEV_ENERGYMODEL_A_PERF_DOMAINS_MAX - 1)
+};
+
+enum {
+	DEV_ENERGYMODEL_A_PERF_DOMAIN_PAD = 1,
+	DEV_ENERGYMODEL_A_PERF_DOMAIN_PERF_DOMAIN_ID,
+	DEV_ENERGYMODEL_A_PERF_DOMAIN_FLAGS,
+	DEV_ENERGYMODEL_A_PERF_DOMAIN_CPUS,
+
+	__DEV_ENERGYMODEL_A_PERF_DOMAIN_MAX,
+	DEV_ENERGYMODEL_A_PERF_DOMAIN_MAX = (__DEV_ENERGYMODEL_A_PERF_DOMAIN_MAX - 1)
+};
+
+enum {
+	DEV_ENERGYMODEL_A_PERF_TABLE_PERF_DOMAIN_ID = 1,
+	DEV_ENERGYMODEL_A_PERF_TABLE_PERF_STATE,
+
+	__DEV_ENERGYMODEL_A_PERF_TABLE_MAX,
+	DEV_ENERGYMODEL_A_PERF_TABLE_MAX = (__DEV_ENERGYMODEL_A_PERF_TABLE_MAX - 1)
+};
+
+enum {
+	DEV_ENERGYMODEL_A_PERF_STATE_PAD = 1,
+	DEV_ENERGYMODEL_A_PERF_STATE_PERFORMANCE,
+	DEV_ENERGYMODEL_A_PERF_STATE_FREQUENCY,
+	DEV_ENERGYMODEL_A_PERF_STATE_POWER,
+	DEV_ENERGYMODEL_A_PERF_STATE_COST,
+	DEV_ENERGYMODEL_A_PERF_STATE_FLAGS,
+
+	__DEV_ENERGYMODEL_A_PERF_STATE_MAX,
+	DEV_ENERGYMODEL_A_PERF_STATE_MAX = (__DEV_ENERGYMODEL_A_PERF_STATE_MAX - 1)
+};
+
+enum {
+	DEV_ENERGYMODEL_CMD_GET_PERF_DOMAINS = 1,
+	DEV_ENERGYMODEL_CMD_GET_PERF_TABLE,
+	DEV_ENERGYMODEL_CMD_PERF_DOMAIN_CREATED,
+	DEV_ENERGYMODEL_CMD_PERF_DOMAIN_UPDATED,
+	DEV_ENERGYMODEL_CMD_PERF_DOMAIN_DELETED,
+
+	__DEV_ENERGYMODEL_CMD_MAX,
+	DEV_ENERGYMODEL_CMD_MAX = (__DEV_ENERGYMODEL_CMD_MAX - 1)
+};
+
+#define DEV_ENERGYMODEL_MCGRP_EVENT	"event"
+
+#endif /* _UAPI_LINUX_DEV_ENERGYMODEL_H */
diff --git a/include/uapi/linux/energy_model.h b/include/uapi/linux/energy_model.h
deleted file mode 100644
index 0bcad967854f..000000000000
--- a/include/uapi/linux/energy_model.h
+++ /dev/null
@@ -1,63 +0,0 @@
-/* SPDX-License-Identifier: ((GPL-2.0 WITH Linux-syscall-note) OR BSD-3-Clause) */
-/* Do not edit directly, auto-generated from: */
-/*	Documentation/netlink/specs/em.yaml */
-/* YNL-GEN uapi header */
-/* To regenerate run: tools/net/ynl/ynl-regen.sh */
-
-#ifndef _UAPI_LINUX_ENERGY_MODEL_H
-#define _UAPI_LINUX_ENERGY_MODEL_H
-
-#define EM_FAMILY_NAME		"em"
-#define EM_FAMILY_VERSION	1
-
-enum {
-	EM_A_PDS_PD = 1,
-
-	__EM_A_PDS_MAX,
-	EM_A_PDS_MAX = (__EM_A_PDS_MAX - 1)
-};
-
-enum {
-	EM_A_PD_PAD = 1,
-	EM_A_PD_PD_ID,
-	EM_A_PD_FLAGS,
-	EM_A_PD_CPUS,
-
-	__EM_A_PD_MAX,
-	EM_A_PD_MAX = (__EM_A_PD_MAX - 1)
-};
-
-enum {
-	EM_A_PD_TABLE_PD_ID = 1,
-	EM_A_PD_TABLE_PS,
-
-	__EM_A_PD_TABLE_MAX,
-	EM_A_PD_TABLE_MAX = (__EM_A_PD_TABLE_MAX - 1)
-};
-
-enum {
-	EM_A_PS_PAD = 1,
-	EM_A_PS_PERFORMANCE,
-	EM_A_PS_FREQUENCY,
-	EM_A_PS_POWER,
-	EM_A_PS_COST,
-	EM_A_PS_FLAGS,
-
-	__EM_A_PS_MAX,
-	EM_A_PS_MAX = (__EM_A_PS_MAX - 1)
-};
-
-enum {
-	EM_CMD_GET_PDS = 1,
-	EM_CMD_GET_PD_TABLE,
-	EM_CMD_PD_CREATED,
-	EM_CMD_PD_UPDATED,
-	EM_CMD_PD_DELETED,
-
-	__EM_CMD_MAX,
-	EM_CMD_MAX = (__EM_CMD_MAX - 1)
-};
-
-#define EM_MCGRP_EVENT	"event"
-
-#endif /* _UAPI_LINUX_ENERGY_MODEL_H */
diff --git a/kernel/power/em_netlink.c b/kernel/power/em_netlink.c
index 4b85da138a06..6f6238c465bb 100644
--- a/kernel/power/em_netlink.c
+++ b/kernel/power/em_netlink.c
@@ -12,27 +12,31 @@
 #include <linux/energy_model.h>
 #include <net/sock.h>
 #include <net/genetlink.h>
-#include <uapi/linux/energy_model.h>
+#include <uapi/linux/dev_energymodel.h>
 
 #include "em_netlink.h"
 #include "em_netlink_autogen.h"
 
-#define EM_A_PD_CPUS_LEN		256
+#define DEV_ENERGYMODEL_A_PERF_DOMAIN_CPUS_LEN		256
 
 /*************************** Command encoding ********************************/
 static int __em_nl_get_pd_size(struct em_perf_domain *pd, void *data)
 {
-	char cpus_buf[EM_A_PD_CPUS_LEN];
+	char cpus_buf[DEV_ENERGYMODEL_A_PERF_DOMAIN_CPUS_LEN];
 	int *tot_msg_sz = data;
 	int msg_sz, cpus_sz;
 
 	cpus_sz = snprintf(cpus_buf, sizeof(cpus_buf), "%*pb",
 			   cpumask_pr_args(to_cpumask(pd->cpus)));
 
-	msg_sz = nla_total_size(0) +			/* EM_A_PDS_PD */
-		 nla_total_size(sizeof(u32)) +		/* EM_A_PD_PD_ID */
-		 nla_total_size_64bit(sizeof(u64)) +	/* EM_A_PD_FLAGS */
-		 nla_total_size(cpus_sz);		/* EM_A_PD_CPUS */
+	msg_sz = nla_total_size(0) +
+		 /* DEV_ENERGYMODEL_A_PERF_DOMAINS_PERF_DOMAIN */
+		 nla_total_size(sizeof(u32)) +
+		 /* DEV_ENERGYMODEL_A_PERF_DOMAIN_PERF_DOMAIN_ID */
+		 nla_total_size_64bit(sizeof(u64)) +
+		 /* DEV_ENERGYMODEL_A_PERF_DOMAIN_FLAGS */
+		 nla_total_size(cpus_sz);
+		 /* DEV_ENERGYMODEL_A_PERF_DOMAIN_CPUS */
 
 	*tot_msg_sz += nlmsg_total_size(genlmsg_msg_size(msg_sz));
 	return 0;
@@ -40,23 +44,26 @@ static int __em_nl_get_pd_size(struct em_perf_domain *pd, void *data)
 
 static int __em_nl_get_pd(struct em_perf_domain *pd, void *data)
 {
-	char cpus_buf[EM_A_PD_CPUS_LEN];
+	char cpus_buf[DEV_ENERGYMODEL_A_PERF_DOMAIN_CPUS_LEN];
 	struct sk_buff *msg = data;
 	struct nlattr *entry;
 
-	entry = nla_nest_start(msg, EM_A_PDS_PD);
+	entry = nla_nest_start(msg,
+			       DEV_ENERGYMODEL_A_PERF_DOMAINS_PERF_DOMAIN);
 	if (!entry)
 		goto out_cancel_nest;
 
-	if (nla_put_u32(msg, EM_A_PD_PD_ID, pd->id))
+	if (nla_put_u32(msg, DEV_ENERGYMODEL_A_PERF_DOMAIN_PERF_DOMAIN_ID,
+			pd->id))
 		goto out_cancel_nest;
 
-	if (nla_put_u64_64bit(msg, EM_A_PD_FLAGS, pd->flags, EM_A_PD_PAD))
+	if (nla_put_u64_64bit(msg, DEV_ENERGYMODEL_A_PERF_DOMAIN_FLAGS,
+			      pd->flags, DEV_ENERGYMODEL_A_PERF_DOMAIN_PAD))
 		goto out_cancel_nest;
 
 	snprintf(cpus_buf, sizeof(cpus_buf), "%*pb",
 		 cpumask_pr_args(to_cpumask(pd->cpus)));
-	if (nla_put_string(msg, EM_A_PD_CPUS, cpus_buf))
+	if (nla_put_string(msg, DEV_ENERGYMODEL_A_PERF_DOMAIN_CPUS, cpus_buf))
 		goto out_cancel_nest;
 
 	nla_nest_end(msg, entry);
@@ -69,7 +76,8 @@ out_cancel_nest:
 	return -EMSGSIZE;
 }
 
-int em_nl_get_pds_doit(struct sk_buff *skb, struct genl_info *info)
+int dev_energymodel_nl_get_perf_domains_doit(struct sk_buff *skb,
+					      struct genl_info *info)
 {
 	struct sk_buff *msg;
 	void *hdr;
@@ -82,7 +90,7 @@ int em_nl_get_pds_doit(struct sk_buff *skb, struct genl_info *info)
 	if (!msg)
 		return -ENOMEM;
 
-	hdr = genlmsg_put_reply(msg, info, &em_nl_family, 0, cmd);
+	hdr = genlmsg_put_reply(msg, info, &dev_energymodel_nl_family, 0, cmd);
 	if (!hdr)
 		goto out_free_msg;
 
@@ -107,10 +115,10 @@ static struct em_perf_domain *__em_nl_get_pd_table_id(struct nlattr **attrs)
 	struct em_perf_domain *pd;
 	int id;
 
-	if (!attrs[EM_A_PD_TABLE_PD_ID])
+	if (!attrs[DEV_ENERGYMODEL_A_PERF_TABLE_PERF_DOMAIN_ID])
 		return NULL;
 
-	id = nla_get_u32(attrs[EM_A_PD_TABLE_PD_ID]);
+	id = nla_get_u32(attrs[DEV_ENERGYMODEL_A_PERF_TABLE_PERF_DOMAIN_ID]);
 	pd = em_perf_domain_get_by_id(id);
 	return pd;
 }
@@ -119,25 +127,34 @@ static int __em_nl_get_pd_table_size(const struct em_perf_domain *pd)
 {
 	int id_sz, ps_sz;
 
-	id_sz = nla_total_size(sizeof(u32));		/* EM_A_PD_TABLE_PD_ID */
-	ps_sz = nla_total_size(0) +			/* EM_A_PD_TABLE_PS */
-		nla_total_size_64bit(sizeof(u64)) +	/* EM_A_PS_PERFORMANCE */
-		nla_total_size_64bit(sizeof(u64)) +	/* EM_A_PS_FREQUENCY */
-		nla_total_size_64bit(sizeof(u64)) +	/* EM_A_PS_POWER */
-		nla_total_size_64bit(sizeof(u64)) +	/* EM_A_PS_COST */
-		nla_total_size_64bit(sizeof(u64));	/* EM_A_PS_FLAGS */
+	id_sz = nla_total_size(sizeof(u32));
+		/* DEV_ENERGYMODEL_A_PERF_TABLE_PERF_DOMAIN_ID */
+	ps_sz = nla_total_size(0) +
+		/* DEV_ENERGYMODEL_A_PERF_TABLE_PERF_STATE */
+		nla_total_size_64bit(sizeof(u64)) +
+		/* DEV_ENERGYMODEL_A_PERF_STATE_PERFORMANCE */
+		nla_total_size_64bit(sizeof(u64)) +
+		/* DEV_ENERGYMODEL_A_PERF_STATE_FREQUENCY */
+		nla_total_size_64bit(sizeof(u64)) +
+		/* DEV_ENERGYMODEL_A_PERF_STATE_POWER */
+		nla_total_size_64bit(sizeof(u64)) +
+		/* DEV_ENERGYMODEL_A_PERF_STATE_COST */
+		nla_total_size_64bit(sizeof(u64));
+		/* DEV_ENERGYMODEL_A_PERF_STATE_FLAGS */
 	ps_sz *= pd->nr_perf_states;
 
 	return nlmsg_total_size(genlmsg_msg_size(id_sz + ps_sz));
 }
 
-static int __em_nl_get_pd_table(struct sk_buff *msg, const struct em_perf_domain *pd)
+static
+int __em_nl_get_pd_table(struct sk_buff *msg, const struct em_perf_domain *pd)
 {
 	struct em_perf_state *table, *ps;
 	struct nlattr *entry;
 	int i;
 
-	if (nla_put_u32(msg, EM_A_PD_TABLE_PD_ID, pd->id))
+	if (nla_put_u32(msg, DEV_ENERGYMODEL_A_PERF_TABLE_PERF_DOMAIN_ID,
+			pd->id))
 		goto out_err;
 
 	rcu_read_lock();
@@ -146,24 +163,35 @@ static int __em_nl_get_pd_table(struct sk_buff *msg, const struct em_perf_domain
 	for (i = 0; i < pd->nr_perf_states; i++) {
 		ps = &table[i];
 
-		entry = nla_nest_start(msg, EM_A_PD_TABLE_PS);
+		entry = nla_nest_start(msg,
+				       DEV_ENERGYMODEL_A_PERF_TABLE_PERF_STATE);
 		if (!entry)
 			goto out_unlock_ps;
 
-		if (nla_put_u64_64bit(msg, EM_A_PS_PERFORMANCE,
-				      ps->performance, EM_A_PS_PAD))
+		if (nla_put_u64_64bit(msg,
+				      DEV_ENERGYMODEL_A_PERF_STATE_PERFORMANCE,
+				      ps->performance,
+				      DEV_ENERGYMODEL_A_PERF_STATE_PAD))
 			goto out_cancel_ps_nest;
-		if (nla_put_u64_64bit(msg, EM_A_PS_FREQUENCY,
-				      ps->frequency, EM_A_PS_PAD))
+		if (nla_put_u64_64bit(msg,
+				      DEV_ENERGYMODEL_A_PERF_STATE_FREQUENCY,
+				      ps->frequency,
+				      DEV_ENERGYMODEL_A_PERF_STATE_PAD))
 			goto out_cancel_ps_nest;
-		if (nla_put_u64_64bit(msg, EM_A_PS_POWER,
-				      ps->power, EM_A_PS_PAD))
+		if (nla_put_u64_64bit(msg,
+				      DEV_ENERGYMODEL_A_PERF_STATE_POWER,
+				      ps->power,
+				      DEV_ENERGYMODEL_A_PERF_STATE_PAD))
 			goto out_cancel_ps_nest;
-		if (nla_put_u64_64bit(msg, EM_A_PS_COST,
-				      ps->cost, EM_A_PS_PAD))
+		if (nla_put_u64_64bit(msg,
+				      DEV_ENERGYMODEL_A_PERF_STATE_COST,
+				      ps->cost,
+				      DEV_ENERGYMODEL_A_PERF_STATE_PAD))
 			goto out_cancel_ps_nest;
-		if (nla_put_u64_64bit(msg, EM_A_PS_FLAGS,
-				      ps->flags, EM_A_PS_PAD))
+		if (nla_put_u64_64bit(msg,
+				      DEV_ENERGYMODEL_A_PERF_STATE_FLAGS,
+				      ps->flags,
+				      DEV_ENERGYMODEL_A_PERF_STATE_PAD))
 			goto out_cancel_ps_nest;
 
 		nla_nest_end(msg, entry);
@@ -179,7 +207,8 @@ out_err:
 	return -EMSGSIZE;
 }
 
-int em_nl_get_pd_table_doit(struct sk_buff *skb, struct genl_info *info)
+int dev_energymodel_nl_get_perf_table_doit(struct sk_buff *skb,
+					    struct genl_info *info)
 {
 	int cmd = info->genlhdr->cmd;
 	int msg_sz, ret = -EMSGSIZE;
@@ -197,7 +226,7 @@ int em_nl_get_pd_table_doit(struct sk_buff *skb, struct genl_info *info)
 	if (!msg)
 		return -ENOMEM;
 
-	hdr = genlmsg_put_reply(msg, info, &em_nl_family, 0, cmd);
+	hdr = genlmsg_put_reply(msg, info, &dev_energymodel_nl_family, 0, cmd);
 	if (!hdr)
 		goto out_free_msg;
 
@@ -221,7 +250,7 @@ static void __em_notify_pd_table(const struct em_perf_domain *pd, int ntf_type)
 	int msg_sz, ret = -EMSGSIZE;
 	void *hdr;
 
-	if (!genl_has_listeners(&em_nl_family, &init_net, EM_NLGRP_EVENT))
+	if (!genl_has_listeners(&dev_energymodel_nl_family, &init_net, DEV_ENERGYMODEL_NLGRP_EVENT))
 		return;
 
 	msg_sz = __em_nl_get_pd_table_size(pd);
@@ -230,7 +259,7 @@ static void __em_notify_pd_table(const struct em_perf_domain *pd, int ntf_type)
 	if (!msg)
 		return;
 
-	hdr = genlmsg_put(msg, 0, 0, &em_nl_family, 0, ntf_type);
+	hdr = genlmsg_put(msg, 0, 0, &dev_energymodel_nl_family, 0, ntf_type);
 	if (!hdr)
 		goto out_free_msg;
 
@@ -240,28 +269,28 @@ static void __em_notify_pd_table(const struct em_perf_domain *pd, int ntf_type)
 
 	genlmsg_end(msg, hdr);
 
-	genlmsg_multicast(&em_nl_family, msg, 0, EM_NLGRP_EVENT, GFP_KERNEL);
+	genlmsg_multicast(&dev_energymodel_nl_family, msg, 0,
+			  DEV_ENERGYMODEL_NLGRP_EVENT, GFP_KERNEL);
 
 	return;
 
 out_free_msg:
 	nlmsg_free(msg);
-	return;
 }
 
 void em_notify_pd_created(const struct em_perf_domain *pd)
 {
-	__em_notify_pd_table(pd, EM_CMD_PD_CREATED);
+	__em_notify_pd_table(pd, DEV_ENERGYMODEL_CMD_PERF_DOMAIN_CREATED);
 }
 
 void em_notify_pd_updated(const struct em_perf_domain *pd)
 {
-	__em_notify_pd_table(pd, EM_CMD_PD_UPDATED);
+	__em_notify_pd_table(pd, DEV_ENERGYMODEL_CMD_PERF_DOMAIN_UPDATED);
 }
 
 static int __em_notify_pd_deleted_size(const struct em_perf_domain *pd)
 {
-	int id_sz = nla_total_size(sizeof(u32)); /* EM_A_PD_TABLE_PD_ID */
+	int id_sz = nla_total_size(sizeof(u32)); /* DEV_ENERGYMODEL_A_PERF_TABLE_PERF_DOMAIN_ID */
 
 	return nlmsg_total_size(genlmsg_msg_size(id_sz));
 }
@@ -272,7 +301,8 @@ void em_notify_pd_deleted(const struct em_perf_domain *pd)
 	void *hdr;
 	int msg_sz;
 
-	if (!genl_has_listeners(&em_nl_family, &init_net, EM_NLGRP_EVENT))
+	if (!genl_has_listeners(&dev_energymodel_nl_family, &init_net,
+				DEV_ENERGYMODEL_NLGRP_EVENT))
 		return;
 
 	msg_sz = __em_notify_pd_deleted_size(pd);
@@ -281,28 +311,29 @@ void em_notify_pd_deleted(const struct em_perf_domain *pd)
 	if (!msg)
 		return;
 
-	hdr = genlmsg_put(msg, 0, 0, &em_nl_family, 0, EM_CMD_PD_DELETED);
+	hdr = genlmsg_put(msg, 0, 0, &dev_energymodel_nl_family, 0,
+			  DEV_ENERGYMODEL_CMD_PERF_DOMAIN_DELETED);
 	if (!hdr)
 		goto out_free_msg;
 
-	if (nla_put_u32(msg, EM_A_PD_TABLE_PD_ID, pd->id)) {
+	if (nla_put_u32(msg, DEV_ENERGYMODEL_A_PERF_TABLE_PERF_DOMAIN_ID,
+			pd->id))
 		goto out_free_msg;
-	}
 
 	genlmsg_end(msg, hdr);
 
-	genlmsg_multicast(&em_nl_family, msg, 0, EM_NLGRP_EVENT, GFP_KERNEL);
+	genlmsg_multicast(&dev_energymodel_nl_family, msg, 0,
+			  DEV_ENERGYMODEL_NLGRP_EVENT, GFP_KERNEL);
 
 	return;
 
 out_free_msg:
 	nlmsg_free(msg);
-	return;
 }
 
 /**************************** Initialization *********************************/
 static int __init em_netlink_init(void)
 {
-	return genl_register_family(&em_nl_family);
+	return genl_register_family(&dev_energymodel_nl_family);
 }
 postcore_initcall(em_netlink_init);
diff --git a/kernel/power/em_netlink_autogen.c b/kernel/power/em_netlink_autogen.c
index ceb3b2bb6ebe..44acef0e7df2 100644
--- a/kernel/power/em_netlink_autogen.c
+++ b/kernel/power/em_netlink_autogen.c
@@ -1,6 +1,6 @@
 // SPDX-License-Identifier: ((GPL-2.0 WITH Linux-syscall-note) OR BSD-3-Clause)
 /* Do not edit directly, auto-generated from: */
-/*	Documentation/netlink/specs/em.yaml */
+/*	Documentation/netlink/specs/dev-energymodel.yaml */
 /* YNL-GEN kernel source */
 /* To regenerate run: tools/net/ynl/ynl-regen.sh */
 
@@ -9,41 +9,41 @@
 
 #include "em_netlink_autogen.h"
 
-#include <uapi/linux/energy_model.h>
+#include <uapi/linux/dev_energymodel.h>
 
-/* EM_CMD_GET_PD_TABLE - do */
-static const struct nla_policy em_get_pd_table_nl_policy[EM_A_PD_TABLE_PD_ID + 1] = {
-	[EM_A_PD_TABLE_PD_ID] = { .type = NLA_U32, },
+/* DEV_ENERGYMODEL_CMD_GET_PERF_TABLE - do */
+static const struct nla_policy dev_energymodel_get_perf_table_nl_policy[DEV_ENERGYMODEL_A_PERF_TABLE_PERF_DOMAIN_ID + 1] = {
+	[DEV_ENERGYMODEL_A_PERF_TABLE_PERF_DOMAIN_ID] = { .type = NLA_U32, },
 };
 
-/* Ops table for em */
-static const struct genl_split_ops em_nl_ops[] = {
+/* Ops table for dev_energymodel */
+static const struct genl_split_ops dev_energymodel_nl_ops[] = {
 	{
-		.cmd	= EM_CMD_GET_PDS,
-		.doit	= em_nl_get_pds_doit,
+		.cmd	= DEV_ENERGYMODEL_CMD_GET_PERF_DOMAINS,
+		.doit	= dev_energymodel_nl_get_perf_domains_doit,
 		.flags	= GENL_CMD_CAP_DO,
 	},
 	{
-		.cmd		= EM_CMD_GET_PD_TABLE,
-		.doit		= em_nl_get_pd_table_doit,
-		.policy		= em_get_pd_table_nl_policy,
-		.maxattr	= EM_A_PD_TABLE_PD_ID,
+		.cmd		= DEV_ENERGYMODEL_CMD_GET_PERF_TABLE,
+		.doit		= dev_energymodel_nl_get_perf_table_doit,
+		.policy		= dev_energymodel_get_perf_table_nl_policy,
+		.maxattr	= DEV_ENERGYMODEL_A_PERF_TABLE_PERF_DOMAIN_ID,
 		.flags		= GENL_CMD_CAP_DO,
 	},
 };
 
-static const struct genl_multicast_group em_nl_mcgrps[] = {
-	[EM_NLGRP_EVENT] = { "event", },
+static const struct genl_multicast_group dev_energymodel_nl_mcgrps[] = {
+	[DEV_ENERGYMODEL_NLGRP_EVENT] = { "event", },
 };
 
-struct genl_family em_nl_family __ro_after_init = {
-	.name		= EM_FAMILY_NAME,
-	.version	= EM_FAMILY_VERSION,
+struct genl_family dev_energymodel_nl_family __ro_after_init = {
+	.name		= DEV_ENERGYMODEL_FAMILY_NAME,
+	.version	= DEV_ENERGYMODEL_FAMILY_VERSION,
 	.netnsok	= true,
 	.parallel_ops	= true,
 	.module		= THIS_MODULE,
-	.split_ops	= em_nl_ops,
-	.n_split_ops	= ARRAY_SIZE(em_nl_ops),
-	.mcgrps		= em_nl_mcgrps,
-	.n_mcgrps	= ARRAY_SIZE(em_nl_mcgrps),
+	.split_ops	= dev_energymodel_nl_ops,
+	.n_split_ops	= ARRAY_SIZE(dev_energymodel_nl_ops),
+	.mcgrps		= dev_energymodel_nl_mcgrps,
+	.n_mcgrps	= ARRAY_SIZE(dev_energymodel_nl_mcgrps),
 };
diff --git a/kernel/power/em_netlink_autogen.h b/kernel/power/em_netlink_autogen.h
index 140ab548103c..f7e4bddcbd53 100644
--- a/kernel/power/em_netlink_autogen.h
+++ b/kernel/power/em_netlink_autogen.h
@@ -1,24 +1,26 @@
 /* SPDX-License-Identifier: ((GPL-2.0 WITH Linux-syscall-note) OR BSD-3-Clause) */
 /* Do not edit directly, auto-generated from: */
-/*	Documentation/netlink/specs/em.yaml */
+/*	Documentation/netlink/specs/dev-energymodel.yaml */
 /* YNL-GEN kernel header */
 /* To regenerate run: tools/net/ynl/ynl-regen.sh */
 
-#ifndef _LINUX_EM_GEN_H
-#define _LINUX_EM_GEN_H
+#ifndef _LINUX_DEV_ENERGYMODEL_GEN_H
+#define _LINUX_DEV_ENERGYMODEL_GEN_H
 
 #include <net/netlink.h>
 #include <net/genetlink.h>
 
-#include <uapi/linux/energy_model.h>
+#include <uapi/linux/dev_energymodel.h>
 
-int em_nl_get_pds_doit(struct sk_buff *skb, struct genl_info *info);
-int em_nl_get_pd_table_doit(struct sk_buff *skb, struct genl_info *info);
+int dev_energymodel_nl_get_perf_domains_doit(struct sk_buff *skb,
+					     struct genl_info *info);
+int dev_energymodel_nl_get_perf_table_doit(struct sk_buff *skb,
+					   struct genl_info *info);
 
 enum {
-	EM_NLGRP_EVENT,
+	DEV_ENERGYMODEL_NLGRP_EVENT,
 };
 
-extern struct genl_family em_nl_family;
+extern struct genl_family dev_energymodel_nl_family;
 
-#endif /* _LINUX_EM_GEN_H */
+#endif /* _LINUX_DEV_ENERGYMODEL_GEN_H */
-- 
cgit v1.2.3


From 380ff27af25e49e2cb2ff8fd0ecd7c95be2976ee Mon Sep 17 00:00:00 2001
From: Changwoo Min <changwoo@igalia.com>
Date: Thu, 8 Jan 2026 14:32:12 +0900
Subject: PM: EM: Add dump to get-perf-domains in the EM YNL spec

Add dump to get-perf-domains, so that a user can fetch either information
about a specific performance domain with do or information about all
performance domains with dump. Share the reply format of do and dump using
perf-domain-attrs, so remove perf-domains. The YNL spec, autogenerated
files, and the do implementation are updated, and the dump implementation
is added.

Suggested-by: Donald Hunter <donald.hunter@gmail.com>
Reviewed-by: Lukasz Luba <lukasz.luba@arm.com>
Reviewed-by: Donald Hunter <donald.hunter@gmail.com>
Signed-off-by: Changwoo Min <changwoo@igalia.com>
Link: https://patch.msgid.link/20260108053212.642478-5-changwoo@igalia.com
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 Documentation/netlink/specs/dev-energymodel.yaml | 25 +++++----
 include/uapi/linux/dev_energymodel.h             |  7 ---
 kernel/power/em_netlink.c                        | 68 ++++++++++++++++++------
 kernel/power/em_netlink_autogen.c                | 16 +++++-
 kernel/power/em_netlink_autogen.h                |  2 +
 5 files changed, 80 insertions(+), 38 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/Documentation/netlink/specs/dev-energymodel.yaml b/Documentation/netlink/specs/dev-energymodel.yaml
index af8b8f72f722..11faabfdfbe8 100644
--- a/Documentation/netlink/specs/dev-energymodel.yaml
+++ b/Documentation/netlink/specs/dev-energymodel.yaml
@@ -42,16 +42,6 @@ definitions:
           missing real power information.
 
 attribute-sets:
-  -
-    name: perf-domains
-    doc: >-
-      Information on all the performance domains.
-    attributes:
-      -
-        name: perf-domain
-        type: nest
-        nested-attributes: perf-domain
-        multi-attr: true
   -
     name: perf-domain
     doc: >-
@@ -133,12 +123,21 @@ operations:
   list:
     -
       name: get-perf-domains
-      attribute-set: perf-domains
+      attribute-set: perf-domain
       doc: Get the list of information for all performance domains.
       do:
-        reply:
+        request:
           attributes:
-            - perf-domain
+            - perf-domain-id
+        reply:
+          attributes: &perf-domain-attrs
+            - pad
+            - perf-domain-id
+            - flags
+            - cpus
+      dump:
+        reply:
+          attributes: *perf-domain-attrs
     -
       name: get-perf-table
       attribute-set: perf-table
diff --git a/include/uapi/linux/dev_energymodel.h b/include/uapi/linux/dev_energymodel.h
index 3399967e1f93..355d8885c9a0 100644
--- a/include/uapi/linux/dev_energymodel.h
+++ b/include/uapi/linux/dev_energymodel.h
@@ -36,13 +36,6 @@ enum dev_energymodel_perf_domain_flags {
 	DEV_ENERGYMODEL_PERF_DOMAIN_FLAGS_PERF_DOMAIN_ARTIFICIAL = 4,
 };
 
-enum {
-	DEV_ENERGYMODEL_A_PERF_DOMAINS_PERF_DOMAIN = 1,
-
-	__DEV_ENERGYMODEL_A_PERF_DOMAINS_MAX,
-	DEV_ENERGYMODEL_A_PERF_DOMAINS_MAX = (__DEV_ENERGYMODEL_A_PERF_DOMAINS_MAX - 1)
-};
-
 enum {
 	DEV_ENERGYMODEL_A_PERF_DOMAIN_PAD = 1,
 	DEV_ENERGYMODEL_A_PERF_DOMAIN_PERF_DOMAIN_ID,
diff --git a/kernel/power/em_netlink.c b/kernel/power/em_netlink.c
index b6edb018c65a..5a611d3950fd 100644
--- a/kernel/power/em_netlink.c
+++ b/kernel/power/em_netlink.c
@@ -18,6 +18,13 @@
 #include "em_netlink_autogen.h"
 
 /*************************** Command encoding ********************************/
+struct dump_ctx {
+	int idx;
+	int start;
+	struct sk_buff *skb;
+	struct netlink_callback *cb;
+};
+
 static int __em_nl_get_pd_size(struct em_perf_domain *pd, void *data)
 {
 	int nr_cpus, msg_sz, cpus_sz;
@@ -43,14 +50,8 @@ static int __em_nl_get_pd(struct em_perf_domain *pd, void *data)
 {
 	struct sk_buff *msg = data;
 	struct cpumask *cpumask;
-	struct nlattr *entry;
 	int cpu;
 
-	entry = nla_nest_start(msg,
-			       DEV_ENERGYMODEL_A_PERF_DOMAINS_PERF_DOMAIN);
-	if (!entry)
-		goto out_cancel_nest;
-
 	if (nla_put_u32(msg, DEV_ENERGYMODEL_A_PERF_DOMAIN_PERF_DOMAIN_ID,
 			pd->id))
 		goto out_cancel_nest;
@@ -66,26 +67,50 @@ static int __em_nl_get_pd(struct em_perf_domain *pd, void *data)
 			goto out_cancel_nest;
 	}
 
-	nla_nest_end(msg, entry);
-
 	return 0;
 
 out_cancel_nest:
-	nla_nest_cancel(msg, entry);
-
 	return -EMSGSIZE;
 }
 
+static int __em_nl_get_pd_for_dump(struct em_perf_domain *pd, void *data)
+{
+	const struct genl_info *info;
+	struct dump_ctx *ctx = data;
+	void *hdr;
+	int ret;
+
+	if (ctx->idx++ < ctx->start)
+		return 0;
+
+	info = genl_info_dump(ctx->cb);
+	hdr = genlmsg_iput(ctx->skb, info);
+	if (!hdr) {
+		genlmsg_cancel(ctx->skb, hdr);
+		return -EMSGSIZE;
+	}
+
+	ret = __em_nl_get_pd(pd, ctx->skb);
+	genlmsg_end(ctx->skb, hdr);
+	return ret;
+}
+
 int dev_energymodel_nl_get_perf_domains_doit(struct sk_buff *skb,
 					      struct genl_info *info)
 {
+	int id, ret = -EMSGSIZE, msg_sz = 0;
+	int cmd = info->genlhdr->cmd;
+	struct em_perf_domain *pd;
 	struct sk_buff *msg;
 	void *hdr;
-	int cmd = info->genlhdr->cmd;
-	int ret = -EMSGSIZE, msg_sz = 0;
 
-	for_each_em_perf_domain(__em_nl_get_pd_size, &msg_sz);
+	if (!info->attrs[DEV_ENERGYMODEL_A_PERF_DOMAIN_PERF_DOMAIN_ID])
+		return -EINVAL;
 
+	id = nla_get_u32(info->attrs[DEV_ENERGYMODEL_A_PERF_DOMAIN_PERF_DOMAIN_ID]);
+	pd = em_perf_domain_get_by_id(id);
+
+	__em_nl_get_pd_size(pd, &msg_sz);
 	msg = genlmsg_new(msg_sz, GFP_KERNEL);
 	if (!msg)
 		return -ENOMEM;
@@ -94,10 +119,9 @@ int dev_energymodel_nl_get_perf_domains_doit(struct sk_buff *skb,
 	if (!hdr)
 		goto out_free_msg;
 
-	ret = for_each_em_perf_domain(__em_nl_get_pd, msg);
+	ret = __em_nl_get_pd(pd, msg);
 	if (ret)
 		goto out_cancel_msg;
-
 	genlmsg_end(msg, hdr);
 
 	return genlmsg_reply(msg, info);
@@ -106,10 +130,22 @@ out_cancel_msg:
 	genlmsg_cancel(msg, hdr);
 out_free_msg:
 	nlmsg_free(msg);
-
 	return ret;
 }
 
+int dev_energymodel_nl_get_perf_domains_dumpit(struct sk_buff *skb,
+						struct netlink_callback *cb)
+{
+	struct dump_ctx ctx = {
+		.idx = 0,
+		.start = cb->args[0],
+		.skb = skb,
+		.cb = cb,
+	};
+
+	return for_each_em_perf_domain(__em_nl_get_pd_for_dump, &ctx);
+}
+
 static struct em_perf_domain *__em_nl_get_pd_table_id(struct nlattr **attrs)
 {
 	struct em_perf_domain *pd;
diff --git a/kernel/power/em_netlink_autogen.c b/kernel/power/em_netlink_autogen.c
index 44acef0e7df2..fedd473e4244 100644
--- a/kernel/power/em_netlink_autogen.c
+++ b/kernel/power/em_netlink_autogen.c
@@ -11,6 +11,11 @@
 
 #include <uapi/linux/dev_energymodel.h>
 
+/* DEV_ENERGYMODEL_CMD_GET_PERF_DOMAINS - do */
+static const struct nla_policy dev_energymodel_get_perf_domains_nl_policy[DEV_ENERGYMODEL_A_PERF_DOMAIN_PERF_DOMAIN_ID + 1] = {
+	[DEV_ENERGYMODEL_A_PERF_DOMAIN_PERF_DOMAIN_ID] = { .type = NLA_U32, },
+};
+
 /* DEV_ENERGYMODEL_CMD_GET_PERF_TABLE - do */
 static const struct nla_policy dev_energymodel_get_perf_table_nl_policy[DEV_ENERGYMODEL_A_PERF_TABLE_PERF_DOMAIN_ID + 1] = {
 	[DEV_ENERGYMODEL_A_PERF_TABLE_PERF_DOMAIN_ID] = { .type = NLA_U32, },
@@ -18,10 +23,17 @@ static const struct nla_policy dev_energymodel_get_perf_table_nl_policy[DEV_ENER
 
 /* Ops table for dev_energymodel */
 static const struct genl_split_ops dev_energymodel_nl_ops[] = {
+	{
+		.cmd		= DEV_ENERGYMODEL_CMD_GET_PERF_DOMAINS,
+		.doit		= dev_energymodel_nl_get_perf_domains_doit,
+		.policy		= dev_energymodel_get_perf_domains_nl_policy,
+		.maxattr	= DEV_ENERGYMODEL_A_PERF_DOMAIN_PERF_DOMAIN_ID,
+		.flags		= GENL_CMD_CAP_DO,
+	},
 	{
 		.cmd	= DEV_ENERGYMODEL_CMD_GET_PERF_DOMAINS,
-		.doit	= dev_energymodel_nl_get_perf_domains_doit,
-		.flags	= GENL_CMD_CAP_DO,
+		.dumpit	= dev_energymodel_nl_get_perf_domains_dumpit,
+		.flags	= GENL_CMD_CAP_DUMP,
 	},
 	{
 		.cmd		= DEV_ENERGYMODEL_CMD_GET_PERF_TABLE,
diff --git a/kernel/power/em_netlink_autogen.h b/kernel/power/em_netlink_autogen.h
index f7e4bddcbd53..5caf2f7e18a5 100644
--- a/kernel/power/em_netlink_autogen.h
+++ b/kernel/power/em_netlink_autogen.h
@@ -14,6 +14,8 @@
 
 int dev_energymodel_nl_get_perf_domains_doit(struct sk_buff *skb,
 					     struct genl_info *info);
+int dev_energymodel_nl_get_perf_domains_dumpit(struct sk_buff *skb,
+					       struct netlink_callback *cb);
 int dev_energymodel_nl_get_perf_table_doit(struct sk_buff *skb,
 					   struct genl_info *info);
 
-- 
cgit v1.2.3


From c279e83953d937470f8a6e69b69f62608714f13f Mon Sep 17 00:00:00 2001
From: Nicolin Chen <nicolinc@nvidia.com>
Date: Mon, 15 Dec 2025 13:42:19 -0800
Subject: iommu: Introduce pci_dev_reset_iommu_prepare/done()

PCIe permits a device to ignore ATS invalidation TLPs while processing a
reset. This creates a problem visible to the OS where an ATS invalidation
command will time out. E.g. an SVA domain will have no coordination with a
reset event and can racily issue ATS invalidations to a resetting device.

The OS should do something to mitigate this as we do not want production
systems to be reporting critical ATS failures, especially in a hypervisor
environment. Broadly, OS could arrange to ignore the timeouts, block page
table mutations to prevent invalidations, or disable and block ATS.

The PCIe r6.0, sec 10.3.1 IMPLEMENTATION NOTE recommends SW to disable and
block ATS before initiating a Function Level Reset. It also mentions that
other reset methods could have the same vulnerability as well.

Provide a callback from the PCI subsystem that will enclose the reset and
have the iommu core temporarily change all the attached RID/PASID domains to
group->blocking_domain so that the IOMMU hardware would fence any incoming
ATS queries. And IOMMU drivers should also synchronously stop issuing new
ATS invalidations and wait for all ATS invalidations to complete. This can
avoid any ATS invalidation timeouts.

However, if there is a domain attachment/replacement happening during an
ongoing reset, ATS routines may be re-activated between the two function
calls. So, introduce a new resetting_domain in the iommu_group structure
to reject any concurrent attach_dev/set_dev_pasid call during a reset for
a concern of compatibility failure. Since this changes the behavior of an
attach operation, update the uAPI accordingly.

Note that there are two corner cases:
 1. Devices in the same iommu_group
    Since an attachment is always per iommu_group, this means that any
    sibling devices in the iommu_group cannot change domain, to prevent
    race conditions.
 2. An SR-IOV PF that is being reset while its VF is not
    In such case, the VF itself is already broken. So, there is no point
    in preventing PF from going through the iommu reset.

Reviewed-by: Lu Baolu <baolu.lu@linux.intel.com>
Reviewed-by: Kevin Tian <kevin.tian@intel.com>
Reviewed-by: Jason Gunthorpe <jgg@nvidia.com>
Tested-by: Dheeraj Kumar Srivastava <dheerajkumar.srivastava@amd.com>
Signed-off-by: Nicolin Chen <nicolinc@nvidia.com>
Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>
---
 drivers/iommu/iommu.c     | 173 ++++++++++++++++++++++++++++++++++++++++++++++
 include/linux/iommu.h     |  13 ++++
 include/uapi/linux/vfio.h |   4 ++
 3 files changed, 190 insertions(+)

(limited to 'include/uapi/linux')

diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c
index 672597100e9a..0665dedd91b2 100644
--- a/drivers/iommu/iommu.c
+++ b/drivers/iommu/iommu.c
@@ -61,6 +61,11 @@ struct iommu_group {
 	int id;
 	struct iommu_domain *default_domain;
 	struct iommu_domain *blocking_domain;
+	/*
+	 * During a group device reset, @resetting_domain points to the physical
+	 * domain, while @domain points to the attached domain before the reset.
+	 */
+	struct iommu_domain *resetting_domain;
 	struct iommu_domain *domain;
 	struct list_head entry;
 	unsigned int owner_cnt;
@@ -2195,6 +2200,15 @@ int iommu_deferred_attach(struct device *dev, struct iommu_domain *domain)
 
 	guard(mutex)(&dev->iommu_group->mutex);
 
+	/*
+	 * This is a concurrent attach during a device reset. Reject it until
+	 * pci_dev_reset_iommu_done() attaches the device to group->domain.
+	 *
+	 * Note that this might fail the iommu_dma_map(). But there's nothing
+	 * more we can do here.
+	 */
+	if (dev->iommu_group->resetting_domain)
+		return -EBUSY;
 	return __iommu_attach_device(domain, dev, NULL);
 }
 
@@ -2253,6 +2267,17 @@ struct iommu_domain *iommu_driver_get_domain_for_dev(struct device *dev)
 
 	lockdep_assert_held(&group->mutex);
 
+	/*
+	 * Driver handles the low-level __iommu_attach_device(), including the
+	 * one invoked by pci_dev_reset_iommu_done() re-attaching the device to
+	 * the cached group->domain. In this case, the driver must get the old
+	 * domain from group->resetting_domain rather than group->domain. This
+	 * prevents it from re-attaching the device from group->domain (old) to
+	 * group->domain (new).
+	 */
+	if (group->resetting_domain)
+		return group->resetting_domain;
+
 	return group->domain;
 }
 EXPORT_SYMBOL_GPL(iommu_driver_get_domain_for_dev);
@@ -2409,6 +2434,13 @@ static int __iommu_group_set_domain_internal(struct iommu_group *group,
 	if (WARN_ON(!new_domain))
 		return -EINVAL;
 
+	/*
+	 * This is a concurrent attach during a device reset. Reject it until
+	 * pci_dev_reset_iommu_done() attaches the device to group->domain.
+	 */
+	if (group->resetting_domain)
+		return -EBUSY;
+
 	/*
 	 * Changing the domain is done by calling attach_dev() on the new
 	 * domain. This switch does not have to be atomic and DMA can be
@@ -3527,6 +3559,16 @@ int iommu_attach_device_pasid(struct iommu_domain *domain,
 		return -EINVAL;
 
 	mutex_lock(&group->mutex);
+
+	/*
+	 * This is a concurrent attach during a device reset. Reject it until
+	 * pci_dev_reset_iommu_done() attaches the device to group->domain.
+	 */
+	if (group->resetting_domain) {
+		ret = -EBUSY;
+		goto out_unlock;
+	}
+
 	for_each_group_device(group, device) {
 		/*
 		 * Skip PASID validation for devices without PASID support
@@ -3610,6 +3652,16 @@ int iommu_replace_device_pasid(struct iommu_domain *domain,
 		return -EINVAL;
 
 	mutex_lock(&group->mutex);
+
+	/*
+	 * This is a concurrent attach during a device reset. Reject it until
+	 * pci_dev_reset_iommu_done() attaches the device to group->domain.
+	 */
+	if (group->resetting_domain) {
+		ret = -EBUSY;
+		goto out_unlock;
+	}
+
 	entry = iommu_make_pasid_array_entry(domain, handle);
 	curr = xa_cmpxchg(&group->pasid_array, pasid, NULL,
 			  XA_ZERO_ENTRY, GFP_KERNEL);
@@ -3867,6 +3919,127 @@ err_unlock:
 }
 EXPORT_SYMBOL_NS_GPL(iommu_replace_group_handle, "IOMMUFD_INTERNAL");
 
+/**
+ * pci_dev_reset_iommu_prepare() - Block IOMMU to prepare for a PCI device reset
+ * @pdev: PCI device that is going to enter a reset routine
+ *
+ * The PCIe r6.0, sec 10.3.1 IMPLEMENTATION NOTE recommends to disable and block
+ * ATS before initiating a reset. This means that a PCIe device during the reset
+ * routine wants to block any IOMMU activity: translation and ATS invalidation.
+ *
+ * This function attaches the device's RID/PASID(s) to group->blocking_domain,
+ * setting group->resetting_domain. This lets the IOMMU driver pause any IOMMU
+ * activity while leaving the group->domain pointer intact. Later, when the
+ * reset is finished, pci_dev_reset_iommu_done() can restore everything.
+ *
+ * Caller must use pci_dev_reset_iommu_prepare() with pci_dev_reset_iommu_done()
+ * before/after the core-level reset routine, to unset the resetting_domain.
+ *
+ * Return: 0 on success or negative error code if the preparation failed.
+ *
+ * These two functions are designed to be used by PCI reset functions that would
+ * not invoke any racy iommu_release_device(), since PCI sysfs node gets removed
+ * before it notifies with a BUS_NOTIFY_REMOVED_DEVICE. When using them in other
+ * cases, callers must ensure there will be no racy iommu_release_device() call,
+ * which otherwise would UAF the dev->iommu_group pointer.
+ */
+int pci_dev_reset_iommu_prepare(struct pci_dev *pdev)
+{
+	struct iommu_group *group = pdev->dev.iommu_group;
+	unsigned long pasid;
+	void *entry;
+	int ret;
+
+	if (!pci_ats_supported(pdev) || !dev_has_iommu(&pdev->dev))
+		return 0;
+
+	guard(mutex)(&group->mutex);
+
+	/* Re-entry is not allowed */
+	if (WARN_ON(group->resetting_domain))
+		return -EBUSY;
+
+	ret = __iommu_group_alloc_blocking_domain(group);
+	if (ret)
+		return ret;
+
+	/* Stage RID domain at blocking_domain while retaining group->domain */
+	if (group->domain != group->blocking_domain) {
+		ret = __iommu_attach_device(group->blocking_domain, &pdev->dev,
+					    group->domain);
+		if (ret)
+			return ret;
+	}
+
+	/*
+	 * Stage PASID domains at blocking_domain while retaining pasid_array.
+	 *
+	 * The pasid_array is mostly fenced by group->mutex, except one reader
+	 * in iommu_attach_handle_get(), so it's safe to read without xa_lock.
+	 */
+	xa_for_each_start(&group->pasid_array, pasid, entry, 1)
+		iommu_remove_dev_pasid(&pdev->dev, pasid,
+				       pasid_array_entry_to_domain(entry));
+
+	group->resetting_domain = group->blocking_domain;
+	return ret;
+}
+EXPORT_SYMBOL_GPL(pci_dev_reset_iommu_prepare);
+
+/**
+ * pci_dev_reset_iommu_done() - Restore IOMMU after a PCI device reset is done
+ * @pdev: PCI device that has finished a reset routine
+ *
+ * After a PCIe device finishes a reset routine, it wants to restore its
+ * IOMMU activity, including new translation as well as cache invalidation, by
+ * re-attaching all of the device's RID/PASID(s) back to the domains retained in
+ * the core-level structure.
+ *
+ * Caller must pair it with a successful pci_dev_reset_iommu_prepare().
+ *
+ * Note that, although unlikely, there is a risk that re-attaching domains might
+ * fail due to some unexpected happening like OOM.
+ */
+void pci_dev_reset_iommu_done(struct pci_dev *pdev)
+{
+	struct iommu_group *group = pdev->dev.iommu_group;
+	unsigned long pasid;
+	void *entry;
+
+	if (!pci_ats_supported(pdev) || !dev_has_iommu(&pdev->dev))
+		return;
+
+	guard(mutex)(&group->mutex);
+
+	/* pci_dev_reset_iommu_prepare() was bypassed for the device */
+	if (!group->resetting_domain)
+		return;
+
+	/* pci_dev_reset_iommu_prepare() was not successfully called */
+	if (WARN_ON(!group->blocking_domain))
+		return;
+
+	/* Re-attach RID domain back to group->domain */
+	if (group->domain != group->blocking_domain) {
+		WARN_ON(__iommu_attach_device(group->domain, &pdev->dev,
+					      group->blocking_domain));
+	}
+
+	/*
+	 * Re-attach PASID domains back to the domains retained in pasid_array.
+	 *
+	 * The pasid_array is mostly fenced by group->mutex, except one reader
+	 * in iommu_attach_handle_get(), so it's safe to read without xa_lock.
+	 */
+	xa_for_each_start(&group->pasid_array, pasid, entry, 1)
+		WARN_ON(__iommu_set_group_pasid(
+			pasid_array_entry_to_domain(entry), group, pasid,
+			group->blocking_domain));
+
+	group->resetting_domain = NULL;
+}
+EXPORT_SYMBOL_GPL(pci_dev_reset_iommu_done);
+
 #if IS_ENABLED(CONFIG_IRQ_MSI_IOMMU)
 /**
  * iommu_dma_prepare_msi() - Map the MSI page in the IOMMU domain
diff --git a/include/linux/iommu.h b/include/linux/iommu.h
index ff097df318b9..54b8b48c762e 100644
--- a/include/linux/iommu.h
+++ b/include/linux/iommu.h
@@ -1188,6 +1188,10 @@ void iommu_detach_device_pasid(struct iommu_domain *domain,
 			       struct device *dev, ioasid_t pasid);
 ioasid_t iommu_alloc_global_pasid(struct device *dev);
 void iommu_free_global_pasid(ioasid_t pasid);
+
+/* PCI device reset functions */
+int pci_dev_reset_iommu_prepare(struct pci_dev *pdev);
+void pci_dev_reset_iommu_done(struct pci_dev *pdev);
 #else /* CONFIG_IOMMU_API */
 
 struct iommu_ops {};
@@ -1511,6 +1515,15 @@ static inline ioasid_t iommu_alloc_global_pasid(struct device *dev)
 }
 
 static inline void iommu_free_global_pasid(ioasid_t pasid) {}
+
+static inline int pci_dev_reset_iommu_prepare(struct pci_dev *pdev)
+{
+	return 0;
+}
+
+static inline void pci_dev_reset_iommu_done(struct pci_dev *pdev)
+{
+}
 #endif /* CONFIG_IOMMU_API */
 
 #ifdef CONFIG_IRQ_MSI_IOMMU
diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h
index ac2329f24141..bb7b89330d35 100644
--- a/include/uapi/linux/vfio.h
+++ b/include/uapi/linux/vfio.h
@@ -964,6 +964,10 @@ struct vfio_device_bind_iommufd {
  * hwpt corresponding to the given pt_id.
  *
  * Return: 0 on success, -errno on failure.
+ *
+ * When a device is resetting, -EBUSY will be returned to reject any concurrent
+ * attachment to the resetting device itself or any sibling device in the IOMMU
+ * group having the resetting device.
  */
 struct vfio_device_attach_iommufd_pt {
 	__u32	argsz;
-- 
cgit v1.2.3


From bc87b14594e30720a5c1546c24e0f5f08d34eb40 Mon Sep 17 00:00:00 2001
From: Michael Chan <michael.chan@broadcom.com>
Date: Thu, 8 Jan 2026 10:35:21 -0800
Subject: bnxt_en: Implement ethtool_ops -> get_link_ext_state()

Map the link_down_reason from the FW to the ethtool link_ext_state
when it is available.  Also log it to the link down dmesg when it is
available.  Add 2 new link_ext_state enums to the UAPI:

ETHTOOL_LINK_EXT_STATE_OTP_SPEED_VIOLATION
ETHTOOL_LINK_EXT_STATE_BMC_REQUEST_DOWN

to cover OTP (one-time-programmable) speed restrictions and
BMC (Baseboard management controller) forcing the link down.

Reviewed-by: Andy Gospodarek <andrew.gospodarek@broadcom.com>
Reviewed-by: Kalesh AP <kalesh-anakkur.purayil@broadcom.com>
Reviewed-by: Pavan Chebbi <pavan.chebbi@broadcom.com>
Signed-off-by: Michael Chan <michael.chan@broadcom.com>
Link: https://patch.msgid.link/20260108183521.215610-7-michael.chan@broadcom.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 drivers/net/ethernet/broadcom/bnxt/bnxt.c         | 25 +++++++++++++++-
 drivers/net/ethernet/broadcom/bnxt/bnxt.h         |  1 +
 drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c | 35 +++++++++++++++++++++++
 include/uapi/linux/ethtool.h                      |  2 ++
 4 files changed, 62 insertions(+), 1 deletion(-)

(limited to 'include/uapi/linux')

diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c b/drivers/net/ethernet/broadcom/bnxt/bnxt.c
index 9902babd82cb..cb78614d4108 100644
--- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c
+++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c
@@ -11915,6 +11915,26 @@ static char *bnxt_report_fec(struct bnxt_link_info *link_info)
 	}
 }
 
+static char *bnxt_link_down_reason(struct bnxt_link_info *link_info)
+{
+	u8 reason = link_info->link_down_reason;
+
+	/* Multiple bits can be set, we report 1 bit only in order of
+	 * priority.
+	 */
+	if (reason & PORT_PHY_QCFG_RESP_LINK_DOWN_REASON_RF)
+		return "(Remote fault)";
+	if (reason & PORT_PHY_QCFG_RESP_LINK_DOWN_REASON_OTP_SPEED_VIOLATION)
+		return "(OTP Speed limit violation)";
+	if (reason & PORT_PHY_QCFG_RESP_LINK_DOWN_REASON_CABLE_REMOVED)
+		return "(Cable removed)";
+	if (reason & PORT_PHY_QCFG_RESP_LINK_DOWN_REASON_MODULE_FAULT)
+		return "(Module fault)";
+	if (reason & PORT_PHY_QCFG_RESP_LINK_DOWN_REASON_BMC_REQUEST)
+		return "(BMC request down)";
+	return "";
+}
+
 void bnxt_report_link(struct bnxt *bp)
 {
 	if (BNXT_LINK_IS_UP(bp)) {
@@ -11972,8 +11992,10 @@ void bnxt_report_link(struct bnxt *bp)
 				    (fec & BNXT_FEC_AUTONEG) ? "on" : "off",
 				    bnxt_report_fec(&bp->link_info));
 	} else {
+		char *str = bnxt_link_down_reason(&bp->link_info);
+
 		netif_carrier_off(bp->dev);
-		netdev_err(bp->dev, "NIC Link is Down\n");
+		netdev_err(bp->dev, "NIC Link is Down %s\n", str);
 	}
 }
 
@@ -12173,6 +12195,7 @@ int bnxt_update_link(struct bnxt *bp, bool chng_link_state)
 	link_info->phy_addr = resp->eee_config_phy_addr &
 			      PORT_PHY_QCFG_RESP_PHY_ADDR_MASK;
 	link_info->module_status = resp->module_status;
+	link_info->link_down_reason = resp->link_down_reason;
 
 	if (bp->phy_flags & BNXT_PHY_FL_EEE_CAP) {
 		struct ethtool_keee *eee = &bp->eee;
diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.h b/drivers/net/ethernet/broadcom/bnxt/bnxt.h
index 3afd1d5e364a..e441a002ddef 100644
--- a/drivers/net/ethernet/broadcom/bnxt/bnxt.h
+++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.h
@@ -1553,6 +1553,7 @@ struct bnxt_link_info {
 #define BNXT_LINK_STATE_DOWN	1
 #define BNXT_LINK_STATE_UP	2
 #define BNXT_LINK_IS_UP(bp)	((bp)->link_info.link_state == BNXT_LINK_STATE_UP)
+	u8			link_down_reason;
 	u8			active_lanes;
 	u8			duplex;
 #define BNXT_LINK_DUPLEX_HALF	PORT_PHY_QCFG_RESP_DUPLEX_STATE_HALF
diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c b/drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c
index 4dfae7b61c76..6b15fedbb16f 100644
--- a/drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c
+++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c
@@ -3432,6 +3432,40 @@ static u32 bnxt_get_link(struct net_device *dev)
 	return BNXT_LINK_IS_UP(bp);
 }
 
+static int bnxt_get_link_ext_state(struct net_device *dev,
+				   struct ethtool_link_ext_state_info *info)
+{
+	struct bnxt *bp = netdev_priv(dev);
+	u8 reason;
+
+	if (BNXT_LINK_IS_UP(bp))
+		return -ENODATA;
+
+	reason = bp->link_info.link_down_reason;
+	if (reason & PORT_PHY_QCFG_RESP_LINK_DOWN_REASON_RF) {
+		info->link_ext_state = ETHTOOL_LINK_EXT_STATE_LINK_TRAINING_FAILURE;
+		info->link_training = ETHTOOL_LINK_EXT_SUBSTATE_LT_REMOTE_FAULT;
+		return 0;
+	}
+	if (reason & PORT_PHY_QCFG_RESP_LINK_DOWN_REASON_CABLE_REMOVED) {
+		info->link_ext_state = ETHTOOL_LINK_EXT_STATE_NO_CABLE;
+		return 0;
+	}
+	if (reason & PORT_PHY_QCFG_RESP_LINK_DOWN_REASON_OTP_SPEED_VIOLATION) {
+		info->link_ext_state = ETHTOOL_LINK_EXT_STATE_OTP_SPEED_VIOLATION;
+		return 0;
+	}
+	if (reason & PORT_PHY_QCFG_RESP_LINK_DOWN_REASON_MODULE_FAULT) {
+		info->link_ext_state = ETHTOOL_LINK_EXT_STATE_MODULE;
+		return 0;
+	}
+	if (reason & PORT_PHY_QCFG_RESP_LINK_DOWN_REASON_BMC_REQUEST) {
+		info->link_ext_state = ETHTOOL_LINK_EXT_STATE_BMC_REQUEST_DOWN;
+		return 0;
+	}
+	return -ENODATA;
+}
+
 int bnxt_hwrm_nvm_get_dev_info(struct bnxt *bp,
 			       struct hwrm_nvm_get_dev_info_output *nvm_dev_info)
 {
@@ -5711,6 +5745,7 @@ const struct ethtool_ops bnxt_ethtool_ops = {
 	.get_eeprom             = bnxt_get_eeprom,
 	.set_eeprom		= bnxt_set_eeprom,
 	.get_link		= bnxt_get_link,
+	.get_link_ext_state	= bnxt_get_link_ext_state,
 	.get_link_ext_stats	= bnxt_get_link_ext_stats,
 	.get_eee		= bnxt_get_eee,
 	.set_eee		= bnxt_set_eee,
diff --git a/include/uapi/linux/ethtool.h b/include/uapi/linux/ethtool.h
index eb7ff2602fbb..5daa8f225b67 100644
--- a/include/uapi/linux/ethtool.h
+++ b/include/uapi/linux/ethtool.h
@@ -603,6 +603,8 @@ enum ethtool_link_ext_state {
 	ETHTOOL_LINK_EXT_STATE_POWER_BUDGET_EXCEEDED,
 	ETHTOOL_LINK_EXT_STATE_OVERHEAT,
 	ETHTOOL_LINK_EXT_STATE_MODULE,
+	ETHTOOL_LINK_EXT_STATE_OTP_SPEED_VIOLATION,
+	ETHTOOL_LINK_EXT_STATE_BMC_REQUEST_DOWN,
 };
 
 /* More information in addition to ETHTOOL_LINK_EXT_STATE_AUTONEG. */
-- 
cgit v1.2.3


From 576ee5dfd459abe8e29bee8b204cd259e60b4e18 Mon Sep 17 00:00:00 2001
From: Christian Brauner <brauner@kernel.org>
Date: Mon, 12 Jan 2026 16:47:10 +0100
Subject: fs: add immutable rootfs

Currently pivot_root() doesn't work on the real rootfs because it
cannot be unmounted. Userspace has to do a recursive removal of the
initramfs contents manually before continuing the boot.

Really all we want from the real rootfs is to serve as the parent mount
for anything that is actually useful such as the tmpfs or ramfs for
initramfs unpacking or the rootfs itself. There's no need for the real
rootfs to actually be anything meaningful or useful. Add an immutable
rootfs called "nullfs" that can be selected via the "nullfs_rootfs"
kernel command line option.

The kernel will mount a tmpfs/ramfs on top of it, unpack the initramfs
and fire up userspace which mounts the rootfs and can then just do:

  chdir(rootfs);
  pivot_root(".", ".");
  umount2(".", MNT_DETACH);

and be done with it. (Ofc, userspace can also choose to retain the
initramfs contents by using something like pivot_root(".", "/initramfs")
without unmounting it.)

Technically this also means that the rootfs mount in unprivileged
namespaces doesn't need to become MNT_LOCKED anymore as it's guaranteed
that the immutable rootfs remains permanently empty so there cannot be
anything revealed by unmounting the covering mount.

In the future this will also allow us to create completely empty mount
namespaces without the risk of leaking anything.

systemd already handles this all correctly as it tries to pivot_root()
first and falls back to MS_MOVE only when that fails.

This goes back to various discussions in previous years and an LPC 2024
presentation about this very topic.

Link: https://patch.msgid.link/20260112-work-immutable-rootfs-v2-3-88dd1c34a204@kernel.org
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 fs/Makefile                |  2 +-
 fs/mount.h                 |  1 +
 fs/namespace.c             | 82 +++++++++++++++++++++++++++++++++++++++-------
 fs/nullfs.c                | 70 +++++++++++++++++++++++++++++++++++++++
 include/uapi/linux/magic.h |  1 +
 init/do_mounts.c           | 14 ++++++++
 init/do_mounts.h           |  1 +
 7 files changed, 159 insertions(+), 12 deletions(-)
 create mode 100644 fs/nullfs.c

(limited to 'include/uapi/linux')

diff --git a/fs/Makefile b/fs/Makefile
index a04274a3c854..becf133e4791 100644
--- a/fs/Makefile
+++ b/fs/Makefile
@@ -16,7 +16,7 @@ obj-y :=	open.o read_write.o file_table.o super.o \
 		stack.o fs_struct.o statfs.o fs_pin.o nsfs.o \
 		fs_dirent.o fs_context.o fs_parser.o fsopen.o init.o \
 		kernel_read_file.o mnt_idmapping.o remap_range.o pidfs.o \
-		file_attr.o
+		file_attr.o nullfs.o
 
 obj-$(CONFIG_BUFFER_HEAD)	+= buffer.o mpage.o
 obj-$(CONFIG_PROC_FS)		+= proc_namespace.o
diff --git a/fs/mount.h b/fs/mount.h
index 2d28ef2a3aed..e0816c11a198 100644
--- a/fs/mount.h
+++ b/fs/mount.h
@@ -5,6 +5,7 @@
 #include <linux/ns_common.h>
 #include <linux/fs_pin.h>
 
+extern struct file_system_type nullfs_fs_type;
 extern struct list_head notify_list;
 
 struct mnt_namespace {
diff --git a/fs/namespace.c b/fs/namespace.c
index 9261f56ccc81..a44ebb2f1161 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -75,6 +75,17 @@ static int __init initramfs_options_setup(char *str)
 
 __setup("initramfs_options=", initramfs_options_setup);
 
+bool nullfs_rootfs = false;
+
+static int __init nullfs_rootfs_setup(char *str)
+{
+	if (*str)
+		return 0;
+	nullfs_rootfs = true;
+	return 1;
+}
+__setup("nullfs_rootfs", nullfs_rootfs_setup);
+
 static u64 event;
 static DEFINE_XARRAY_FLAGS(mnt_id_xa, XA_FLAGS_ALLOC);
 static DEFINE_IDA(mnt_group_ida);
@@ -4582,8 +4593,9 @@ int path_pivot_root(struct path *new, struct path *old)
  * pointed to by put_old must yield the same directory as new_root. No other
  * file system may be mounted on put_old. After all, new_root is a mountpoint.
  *
- * Also, the current root cannot be on the 'rootfs' (initial ramfs) filesystem.
- * See Documentation/filesystems/ramfs-rootfs-initramfs.rst for alternatives
+ * Also, the current root cannot be on the 'rootfs' (initial ramfs) filesystem
+ * unless the kernel was booted with "nullfs_rootfs". See
+ * Documentation/filesystems/ramfs-rootfs-initramfs.rst for alternatives
  * in this situation.
  *
  * Notes:
@@ -5976,24 +5988,72 @@ struct mnt_namespace init_mnt_ns = {
 
 static void __init init_mount_tree(void)
 {
-	struct vfsmount *mnt;
-	struct mount *m;
+	struct vfsmount *mnt, *nullfs_mnt;
+	struct mount *mnt_root;
 	struct path root;
 
+	/*
+	 * When nullfs is used, we create two mounts:
+	 *
+	 * (1) nullfs with mount id 1
+	 * (2) mutable rootfs with mount id 2
+	 *
+	 * with (2) mounted on top of (1).
+	 */
+	if (nullfs_rootfs) {
+		nullfs_mnt = vfs_kern_mount(&nullfs_fs_type, 0, "nullfs", NULL);
+		if (IS_ERR(nullfs_mnt))
+			panic("VFS: Failed to create nullfs");
+	}
+
 	mnt = vfs_kern_mount(&rootfs_fs_type, 0, "rootfs", initramfs_options);
 	if (IS_ERR(mnt))
 		panic("Can't create rootfs");
 
-	m = real_mount(mnt);
-	init_mnt_ns.root = m;
-	init_mnt_ns.nr_mounts = 1;
-	mnt_add_to_ns(&init_mnt_ns, m);
+	if (nullfs_rootfs) {
+		VFS_WARN_ON_ONCE(real_mount(nullfs_mnt)->mnt_id != 1);
+		VFS_WARN_ON_ONCE(real_mount(mnt)->mnt_id != 2);
+
+		/* The namespace root is the nullfs mnt. */
+		mnt_root		= real_mount(nullfs_mnt);
+		init_mnt_ns.root	= mnt_root;
+
+		/* Mount mutable rootfs on top of nullfs. */
+		root.mnt		= nullfs_mnt;
+		root.dentry		= nullfs_mnt->mnt_root;
+
+		LOCK_MOUNT_EXACT(mp, &root);
+		if (unlikely(IS_ERR(mp.parent)))
+			panic("VFS: Failed to mount rootfs on nullfs");
+		scoped_guard(mount_writer)
+			attach_mnt(real_mount(mnt), mp.parent, mp.mp);
+
+		pr_info("VFS: Finished mounting rootfs on nullfs\n");
+	} else {
+		VFS_WARN_ON_ONCE(real_mount(mnt)->mnt_id != 1);
+
+		/* The namespace root is the mutable rootfs. */
+		mnt_root		= real_mount(mnt);
+		init_mnt_ns.root	= mnt_root;
+	}
+
+	/*
+	 * We've dropped all locks here but that's fine. Not only are we
+	 * the only task that's running, there's also no other mount
+	 * namespace in existence and the initial mount namespace is
+	 * completely empty until we add the mounts we just created.
+	 */
+	for (struct mount *p = mnt_root; p; p = next_mnt(p, mnt_root)) {
+		mnt_add_to_ns(&init_mnt_ns, p);
+		init_mnt_ns.nr_mounts++;
+	}
+
 	init_task.nsproxy->mnt_ns = &init_mnt_ns;
 	get_mnt_ns(&init_mnt_ns);
 
-	root.mnt = mnt;
-	root.dentry = mnt->mnt_root;
-
+	/* The root and pwd always point to the mutable rootfs. */
+	root.mnt	= mnt;
+	root.dentry	= mnt->mnt_root;
 	set_fs_pwd(current->fs, &root);
 	set_fs_root(current->fs, &root);
 
diff --git a/fs/nullfs.c b/fs/nullfs.c
new file mode 100644
index 000000000000..fdbd3e5d3d71
--- /dev/null
+++ b/fs/nullfs.c
@@ -0,0 +1,70 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/* Copyright (c) 2026 Christian Brauner <brauner@kernel.org> */
+#include <linux/fs/super_types.h>
+#include <linux/fs_context.h>
+#include <linux/magic.h>
+
+static const struct super_operations nullfs_super_operations = {
+	.statfs	= simple_statfs,
+};
+
+static int nullfs_fs_fill_super(struct super_block *s, struct fs_context *fc)
+{
+	struct inode *inode;
+
+	s->s_maxbytes		= MAX_LFS_FILESIZE;
+	s->s_blocksize		= PAGE_SIZE;
+	s->s_blocksize_bits	= PAGE_SHIFT;
+	s->s_magic		= NULL_FS_MAGIC;
+	s->s_op			= &nullfs_super_operations;
+	s->s_export_op		= NULL;
+	s->s_xattr		= NULL;
+	s->s_time_gran		= 1;
+	s->s_d_flags		= 0;
+
+	inode = new_inode(s);
+	if (!inode)
+		return -ENOMEM;
+
+	/* nullfs is permanently empty... */
+	make_empty_dir_inode(inode);
+	simple_inode_init_ts(inode);
+	inode->i_ino	= 1;
+	/* ... and immutable. */
+	inode->i_flags |= S_IMMUTABLE;
+
+	s->s_root = d_make_root(inode);
+	if (!s->s_root)
+		return -ENOMEM;
+
+	return 0;
+}
+
+/*
+ * For now this is a single global instance. If needed we can make it
+ * mountable by userspace at which point we will need to make it
+ * multi-instance.
+ */
+static int nullfs_fs_get_tree(struct fs_context *fc)
+{
+	return get_tree_single(fc, nullfs_fs_fill_super);
+}
+
+static const struct fs_context_operations nullfs_fs_context_ops = {
+	.get_tree	= nullfs_fs_get_tree,
+};
+
+static int nullfs_init_fs_context(struct fs_context *fc)
+{
+	fc->ops		= &nullfs_fs_context_ops;
+	fc->global	= true;
+	fc->sb_flags	= SB_NOUSER;
+	fc->s_iflags	= SB_I_NOEXEC | SB_I_NODEV;
+	return 0;
+}
+
+struct file_system_type nullfs_fs_type = {
+	.name			= "nullfs",
+	.init_fs_context	= nullfs_init_fs_context,
+	.kill_sb		= kill_anon_super,
+};
diff --git a/include/uapi/linux/magic.h b/include/uapi/linux/magic.h
index 638ca21b7a90..4f2da935a76c 100644
--- a/include/uapi/linux/magic.h
+++ b/include/uapi/linux/magic.h
@@ -104,5 +104,6 @@
 #define SECRETMEM_MAGIC		0x5345434d	/* "SECM" */
 #define PID_FS_MAGIC		0x50494446	/* "PIDF" */
 #define GUEST_MEMFD_MAGIC	0x474d454d	/* "GMEM" */
+#define NULL_FS_MAGIC		0x4E554C4C	/* "NULL" */
 
 #endif /* __LINUX_MAGIC_H__ */
diff --git a/init/do_mounts.c b/init/do_mounts.c
index defbbf1d55f7..675397c8a7a4 100644
--- a/init/do_mounts.c
+++ b/init/do_mounts.c
@@ -492,6 +492,20 @@ void __init prepare_namespace(void)
 	mount_root(saved_root_name);
 out:
 	devtmpfs_mount();
+
+	if (nullfs_rootfs) {
+		if (init_pivot_root(".", ".")) {
+			pr_err("VFS: Failed to pivot into new rootfs\n");
+			return;
+		}
+		if (init_umount(".", MNT_DETACH)) {
+			pr_err("VFS: Failed to unmount old rootfs\n");
+			return;
+		}
+		pr_info("VFS: Pivoted into new rootfs\n");
+		return;
+	}
+
 	init_mount(".", "/", NULL, MS_MOVE, NULL);
 	init_chroot(".");
 }
diff --git a/init/do_mounts.h b/init/do_mounts.h
index 6069ea3eb80d..fbfee810aa89 100644
--- a/init/do_mounts.h
+++ b/init/do_mounts.h
@@ -15,6 +15,7 @@
 void  mount_root_generic(char *name, char *pretty_name, int flags);
 void  mount_root(char *root_device_name);
 extern int root_mountflags;
+extern bool nullfs_rootfs;
 
 static inline __init int create_dev(char *name, dev_t dev)
 {
-- 
cgit v1.2.3


From 6abbb8703aeeb645a681ab6ad155e0b450413787 Mon Sep 17 00:00:00 2001
From: Günther Noack <gnoack3000@gmail.com>
Date: Sun, 11 Jan 2026 18:52:04 +0100
Subject: landlock: Clarify documentation for the IOCTL access right
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Move the description of the LANDLOCK_ACCESS_FS_IOCTL_DEV access right
together with the file access rights.

This group of access rights applies to files (in this case device
files), and they can be added to file or directory inodes using
landlock_add_rule(2).  The check for that works the same for all file
access rights, including LANDLOCK_ACCESS_FS_IOCTL_DEV.

Invoking ioctl(2) on directory FDs cannot currently be restricted
with Landlock.  Having it grouped separately in the documentation is a
remnant from earlier revisions of the LANDLOCK_ACCESS_FS_IOCTL_DEV
patch set.

Link: https://lore.kernel.org/all/20260108.Thaex5ruach2@digikod.net/
Signed-off-by: Günther Noack <gnoack3000@gmail.com>
Link: https://lore.kernel.org/r/20260111175203.6545-2-gnoack3000@gmail.com
Signed-off-by: Mickaël Salaün <mic@digikod.net>
---
 include/uapi/linux/landlock.h | 37 +++++++++++++++++--------------------
 1 file changed, 17 insertions(+), 20 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/landlock.h b/include/uapi/linux/landlock.h
index f030adc462ee..75fd7f5e6cc3 100644
--- a/include/uapi/linux/landlock.h
+++ b/include/uapi/linux/landlock.h
@@ -216,6 +216,23 @@ struct landlock_net_port_attr {
  *   :manpage:`ftruncate(2)`, :manpage:`creat(2)`, or :manpage:`open(2)` with
  *   ``O_TRUNC``.  This access right is available since the third version of the
  *   Landlock ABI.
+ * - %LANDLOCK_ACCESS_FS_IOCTL_DEV: Invoke :manpage:`ioctl(2)` commands on an opened
+ *   character or block device.
+ *
+ *   This access right applies to all `ioctl(2)` commands implemented by device
+ *   drivers.  However, the following common IOCTL commands continue to be
+ *   invokable independent of the %LANDLOCK_ACCESS_FS_IOCTL_DEV right:
+ *
+ *   * IOCTL commands targeting file descriptors (``FIOCLEX``, ``FIONCLEX``),
+ *   * IOCTL commands targeting file descriptions (``FIONBIO``, ``FIOASYNC``),
+ *   * IOCTL commands targeting file systems (``FIFREEZE``, ``FITHAW``,
+ *     ``FIGETBSZ``, ``FS_IOC_GETFSUUID``, ``FS_IOC_GETFSSYSFSPATH``)
+ *   * Some IOCTL commands which do not make sense when used with devices, but
+ *     whose implementations are safe and return the right error codes
+ *     (``FS_IOC_FIEMAP``, ``FICLONE``, ``FICLONERANGE``, ``FIDEDUPERANGE``)
+ *
+ *   This access right is available since the fifth version of the Landlock
+ *   ABI.
  *
  * Whether an opened file can be truncated with :manpage:`ftruncate(2)` or used
  * with `ioctl(2)` is determined during :manpage:`open(2)`, in the same way as
@@ -275,26 +292,6 @@ struct landlock_net_port_attr {
  *   If multiple requirements are not met, the ``EACCES`` error code takes
  *   precedence over ``EXDEV``.
  *
- * The following access right applies both to files and directories:
- *
- * - %LANDLOCK_ACCESS_FS_IOCTL_DEV: Invoke :manpage:`ioctl(2)` commands on an opened
- *   character or block device.
- *
- *   This access right applies to all `ioctl(2)` commands implemented by device
- *   drivers.  However, the following common IOCTL commands continue to be
- *   invokable independent of the %LANDLOCK_ACCESS_FS_IOCTL_DEV right:
- *
- *   * IOCTL commands targeting file descriptors (``FIOCLEX``, ``FIONCLEX``),
- *   * IOCTL commands targeting file descriptions (``FIONBIO``, ``FIOASYNC``),
- *   * IOCTL commands targeting file systems (``FIFREEZE``, ``FITHAW``,
- *     ``FIGETBSZ``, ``FS_IOC_GETFSUUID``, ``FS_IOC_GETFSSYSFSPATH``)
- *   * Some IOCTL commands which do not make sense when used with devices, but
- *     whose implementations are safe and return the right error codes
- *     (``FS_IOC_FIEMAP``, ``FICLONE``, ``FICLONERANGE``, ``FIDEDUPERANGE``)
- *
- *   This access right is available since the fifth version of the Landlock
- *   ABI.
- *
  * .. warning::
  *
  *   It is currently not possible to restrict some file-related actions
-- 
cgit v1.2.3


From 98bf2256855eb682433a33e6a7c4bce35191ca99 Mon Sep 17 00:00:00 2001
From: Stanley Zhang <stazhang@purestorage.com>
Date: Thu, 8 Jan 2026 02:19:31 -0700
Subject: ublk: support UBLK_PARAM_TYPE_INTEGRITY in device creation

Add a feature flag UBLK_F_INTEGRITY for a ublk server to request
integrity/metadata support when creating a ublk device. The ublk server
can also check for the feature flag on the created device or the result
of UBLK_U_CMD_GET_FEATURES to tell if the ublk driver supports it.
UBLK_F_INTEGRITY requires UBLK_F_USER_COPY, as user copy is the only
data copy mode initially supported for integrity data.
Add UBLK_PARAM_TYPE_INTEGRITY and struct ublk_param_integrity to struct
ublk_params to specify the integrity params of a ublk device.
UBLK_PARAM_TYPE_INTEGRITY requires UBLK_F_INTEGRITY and a nonzero
metadata_size. The LBMD_PI_CAP_* and LBMD_PI_CSUM_* values from the
linux/fs.h UAPI header are used for the flags and csum_type fields.
If the UBLK_PARAM_TYPE_INTEGRITY flag is set, validate the integrity
parameters and apply them to the blk_integrity limits.
The struct ublk_param_integrity validations are based on the checks in
blk_validate_integrity_limits(). Any invalid parameters should be
rejected before being applied to struct blk_integrity.

[csander: drop redundant pi_tuple_size field, use block metadata UAPI
 constants, add param validation]

Signed-off-by: Stanley Zhang <stazhang@purestorage.com>
Signed-off-by: Caleb Sander Mateos <csander@purestorage.com>
Reviewed-by: Ming Lei <ming.lei@redhat.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 drivers/block/ublk_drv.c      | 101 +++++++++++++++++++++++++++++++++++++++++-
 include/uapi/linux/ublk_cmd.h |  19 ++++++++
 2 files changed, 119 insertions(+), 1 deletion(-)

(limited to 'include/uapi/linux')

diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c
index 53df4bfa2c92..a4d62e8e4f6b 100644
--- a/drivers/block/ublk_drv.c
+++ b/drivers/block/ublk_drv.c
@@ -44,6 +44,8 @@
 #include <linux/task_work.h>
 #include <linux/namei.h>
 #include <linux/kref.h>
+#include <linux/blk-integrity.h>
+#include <uapi/linux/fs.h>
 #include <uapi/linux/ublk_cmd.h>
 
 #define UBLK_MINORS		(1U << MINORBITS)
@@ -83,7 +85,8 @@
 #define UBLK_PARAM_TYPE_ALL                                \
 	(UBLK_PARAM_TYPE_BASIC | UBLK_PARAM_TYPE_DISCARD | \
 	 UBLK_PARAM_TYPE_DEVT | UBLK_PARAM_TYPE_ZONED |    \
-	 UBLK_PARAM_TYPE_DMA_ALIGN | UBLK_PARAM_TYPE_SEGMENT)
+	 UBLK_PARAM_TYPE_DMA_ALIGN | UBLK_PARAM_TYPE_SEGMENT | \
+	 UBLK_PARAM_TYPE_INTEGRITY)
 
 struct ublk_uring_cmd_pdu {
 	/*
@@ -301,6 +304,11 @@ static inline bool ublk_queue_is_zoned(const struct ublk_queue *ubq)
 	return ubq->flags & UBLK_F_ZONED;
 }
 
+static inline bool ublk_dev_support_integrity(const struct ublk_device *ub)
+{
+	return ub->dev_info.flags & UBLK_F_INTEGRITY;
+}
+
 #ifdef CONFIG_BLK_DEV_ZONED
 
 struct ublk_zoned_report_desc {
@@ -616,6 +624,53 @@ static void ublk_dev_param_basic_apply(struct ublk_device *ub)
 	set_capacity(ub->ub_disk, p->dev_sectors);
 }
 
+static int ublk_integrity_flags(u32 flags)
+{
+	int ret_flags = 0;
+
+	if (flags & LBMD_PI_CAP_INTEGRITY) {
+		flags &= ~LBMD_PI_CAP_INTEGRITY;
+		ret_flags |= BLK_INTEGRITY_DEVICE_CAPABLE;
+	}
+	if (flags & LBMD_PI_CAP_REFTAG) {
+		flags &= ~LBMD_PI_CAP_REFTAG;
+		ret_flags |= BLK_INTEGRITY_REF_TAG;
+	}
+	return flags ? -EINVAL : ret_flags;
+}
+
+static int ublk_integrity_pi_tuple_size(u8 csum_type)
+{
+	switch (csum_type) {
+	case LBMD_PI_CSUM_NONE:
+		return 0;
+	case LBMD_PI_CSUM_IP:
+	case LBMD_PI_CSUM_CRC16_T10DIF:
+		return 8;
+	case LBMD_PI_CSUM_CRC64_NVME:
+		return 16;
+	default:
+		return -EINVAL;
+	}
+}
+
+static enum blk_integrity_checksum ublk_integrity_csum_type(u8 csum_type)
+{
+	switch (csum_type) {
+	case LBMD_PI_CSUM_NONE:
+		return BLK_INTEGRITY_CSUM_NONE;
+	case LBMD_PI_CSUM_IP:
+		return BLK_INTEGRITY_CSUM_IP;
+	case LBMD_PI_CSUM_CRC16_T10DIF:
+		return BLK_INTEGRITY_CSUM_CRC;
+	case LBMD_PI_CSUM_CRC64_NVME:
+		return BLK_INTEGRITY_CSUM_CRC64;
+	default:
+		WARN_ON_ONCE(1);
+		return BLK_INTEGRITY_CSUM_NONE;
+	}
+}
+
 static int ublk_validate_params(const struct ublk_device *ub)
 {
 	/* basic param is the only one which must be set */
@@ -678,6 +733,29 @@ static int ublk_validate_params(const struct ublk_device *ub)
 			return -EINVAL;
 	}
 
+	if (ub->params.types & UBLK_PARAM_TYPE_INTEGRITY) {
+		const struct ublk_param_integrity *p = &ub->params.integrity;
+		int pi_tuple_size = ublk_integrity_pi_tuple_size(p->csum_type);
+		int flags = ublk_integrity_flags(p->flags);
+
+		if (!ublk_dev_support_integrity(ub))
+			return -EINVAL;
+		if (flags < 0)
+			return flags;
+		if (pi_tuple_size < 0)
+			return pi_tuple_size;
+		if (!p->metadata_size)
+			return -EINVAL;
+		if (p->csum_type == LBMD_PI_CSUM_NONE &&
+		    p->flags & LBMD_PI_CAP_REFTAG)
+			return -EINVAL;
+		if (p->pi_offset + pi_tuple_size > p->metadata_size)
+			return -EINVAL;
+		if (p->interval_exp < SECTOR_SHIFT ||
+		    p->interval_exp > ub->params.basic.logical_bs_shift)
+			return -EINVAL;
+	}
+
 	return 0;
 }
 
@@ -2950,6 +3028,23 @@ static int ublk_ctrl_start_dev(struct ublk_device *ub,
 		lim.max_segments = ub->params.seg.max_segments;
 	}
 
+	if (ub->params.types & UBLK_PARAM_TYPE_INTEGRITY) {
+		const struct ublk_param_integrity *p = &ub->params.integrity;
+		int pi_tuple_size = ublk_integrity_pi_tuple_size(p->csum_type);
+
+		lim.max_integrity_segments =
+			p->max_integrity_segments ?: USHRT_MAX;
+		lim.integrity = (struct blk_integrity) {
+			.flags = ublk_integrity_flags(p->flags),
+			.csum_type = ublk_integrity_csum_type(p->csum_type),
+			.metadata_size = p->metadata_size,
+			.pi_offset = p->pi_offset,
+			.interval_exp = p->interval_exp,
+			.tag_size = p->tag_size,
+			.pi_tuple_size = pi_tuple_size,
+		};
+	}
+
 	if (wait_for_completion_interruptible(&ub->completion) != 0)
 		return -EINTR;
 
@@ -3140,6 +3235,10 @@ static int ublk_ctrl_add_dev(const struct ublksrv_ctrl_cmd *header)
 			return -EINVAL;
 	}
 
+	/* User copy is required to access integrity buffer */
+	if (info.flags & UBLK_F_INTEGRITY && !(info.flags & UBLK_F_USER_COPY))
+		return -EINVAL;
+
 	/* the created device is always owned by current user */
 	ublk_store_owner_uid_gid(&info.owner_uid, &info.owner_gid);
 
diff --git a/include/uapi/linux/ublk_cmd.h b/include/uapi/linux/ublk_cmd.h
index ec77dabba45b..4c141d7e4710 100644
--- a/include/uapi/linux/ublk_cmd.h
+++ b/include/uapi/linux/ublk_cmd.h
@@ -311,6 +311,12 @@
  */
 #define UBLK_F_BUF_REG_OFF_DAEMON (1ULL << 14)
 
+/*
+ * ublk device supports requests with integrity/metadata buffer.
+ * Requires UBLK_F_USER_COPY.
+ */
+#define UBLK_F_INTEGRITY (1ULL << 16)
+
 /* device state */
 #define UBLK_S_DEV_DEAD	0
 #define UBLK_S_DEV_LIVE	1
@@ -600,6 +606,17 @@ struct ublk_param_segment {
 	__u8	pad[2];
 };
 
+struct ublk_param_integrity {
+	__u32	flags; /* LBMD_PI_CAP_* from linux/fs.h */
+	__u16	max_integrity_segments; /* 0 means no limit */
+	__u8	interval_exp;
+	__u8	metadata_size; /* UBLK_PARAM_TYPE_INTEGRITY requires nonzero */
+	__u8	pi_offset;
+	__u8	csum_type; /* LBMD_PI_CSUM_* from linux/fs.h */
+	__u8	tag_size;
+	__u8	pad[5];
+};
+
 struct ublk_params {
 	/*
 	 * Total length of parameters, userspace has to set 'len' for both
@@ -614,6 +631,7 @@ struct ublk_params {
 #define UBLK_PARAM_TYPE_ZONED           (1 << 3)
 #define UBLK_PARAM_TYPE_DMA_ALIGN       (1 << 4)
 #define UBLK_PARAM_TYPE_SEGMENT         (1 << 5)
+#define UBLK_PARAM_TYPE_INTEGRITY       (1 << 6) /* requires UBLK_F_INTEGRITY */
 	__u32	types;			/* types of parameter included */
 
 	struct ublk_param_basic		basic;
@@ -622,6 +640,7 @@ struct ublk_params {
 	struct ublk_param_zoned	zoned;
 	struct ublk_param_dma_align	dma;
 	struct ublk_param_segment	seg;
+	struct ublk_param_integrity	integrity;
 };
 
 #endif
-- 
cgit v1.2.3


From f82f0a16a8270b17211254beeb123d11a0f279cd Mon Sep 17 00:00:00 2001
From: Caleb Sander Mateos <csander@purestorage.com>
Date: Thu, 8 Jan 2026 02:19:32 -0700
Subject: ublk: set UBLK_IO_F_INTEGRITY in ublksrv_io_desc

Indicate to the ublk server when an incoming request has integrity data
by setting UBLK_IO_F_INTEGRITY in the ublksrv_io_desc's op_flags field.

Signed-off-by: Caleb Sander Mateos <csander@purestorage.com>
Reviewed-by: Ming Lei <ming.lei@redhat.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 drivers/block/ublk_drv.c      | 3 +++
 include/uapi/linux/ublk_cmd.h | 2 ++
 2 files changed, 5 insertions(+)

(limited to 'include/uapi/linux')

diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c
index a4d62e8e4f6b..fc7de2985a20 100644
--- a/drivers/block/ublk_drv.c
+++ b/drivers/block/ublk_drv.c
@@ -1113,6 +1113,9 @@ static inline unsigned int ublk_req_build_flags(struct request *req)
 	if (req->cmd_flags & REQ_SWAP)
 		flags |= UBLK_IO_F_SWAP;
 
+	if (blk_integrity_rq(req))
+		flags |= UBLK_IO_F_INTEGRITY;
+
 	return flags;
 }
 
diff --git a/include/uapi/linux/ublk_cmd.h b/include/uapi/linux/ublk_cmd.h
index 4c141d7e4710..dfde4aee39eb 100644
--- a/include/uapi/linux/ublk_cmd.h
+++ b/include/uapi/linux/ublk_cmd.h
@@ -414,6 +414,8 @@ struct ublksrv_ctrl_dev_info {
  * passed in.
  */
 #define		UBLK_IO_F_NEED_REG_BUF		(1U << 17)
+/* Request has an integrity data buffer */
+#define		UBLK_IO_F_INTEGRITY		(1UL << 18)
 
 /*
  * io cmd is described by this structure, and stored in share memory, indexed
-- 
cgit v1.2.3


From be82a89066d595da334f6e153ababcedc3f92ad6 Mon Sep 17 00:00:00 2001
From: Stanley Zhang <stazhang@purestorage.com>
Date: Thu, 8 Jan 2026 02:19:37 -0700
Subject: ublk: implement integrity user copy

Add a function ublk_copy_user_integrity() to copy integrity information
between a request and a user iov_iter. This mirrors the existing
ublk_copy_user_pages() but operates on request integrity data instead of
regular data. Check UBLKSRV_IO_INTEGRITY_FLAG in iocb->ki_pos in
ublk_user_copy() to choose between copying data or integrity data.

[csander: change offset units from data bytes to integrity data bytes,
 fix CONFIG_BLK_DEV_INTEGRITY=n build, rebase on user copy refactor]

Signed-off-by: Stanley Zhang <stazhang@purestorage.com>
Signed-off-by: Caleb Sander Mateos <csander@purestorage.com>
Reviewed-by: Ming Lei <ming.lei@redhat.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 drivers/block/ublk_drv.c      | 53 +++++++++++++++++++++++++++++++++++++++++--
 include/uapi/linux/ublk_cmd.h |  4 ++++
 2 files changed, 55 insertions(+), 2 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c
index d428a25121db..5c441f507c43 100644
--- a/drivers/block/ublk_drv.c
+++ b/drivers/block/ublk_drv.c
@@ -1040,6 +1040,33 @@ static size_t ublk_copy_user_pages(const struct request *req,
 	return done;
 }
 
+#ifdef CONFIG_BLK_DEV_INTEGRITY
+static size_t ublk_copy_user_integrity(const struct request *req,
+		unsigned offset, struct iov_iter *uiter, int dir)
+{
+	size_t done = 0;
+	struct bio *bio = req->bio;
+	struct bvec_iter iter;
+	struct bio_vec iv;
+
+	if (!blk_integrity_rq(req))
+		return 0;
+
+	bio_for_each_integrity_vec(iv, bio, iter) {
+		if (!ublk_copy_user_bvec(&iv, &offset, uiter, dir, &done))
+			break;
+	}
+
+	return done;
+}
+#else /* #ifdef CONFIG_BLK_DEV_INTEGRITY */
+static size_t ublk_copy_user_integrity(const struct request *req,
+		unsigned offset, struct iov_iter *uiter, int dir)
+{
+	return 0;
+}
+#endif /* #ifdef CONFIG_BLK_DEV_INTEGRITY */
+
 static inline bool ublk_need_map_req(const struct request *req)
 {
 	return ublk_rq_has_data(req) && req_op(req) == REQ_OP_WRITE;
@@ -2668,6 +2695,8 @@ ublk_user_copy(struct kiocb *iocb, struct iov_iter *iter, int dir)
 	struct ublk_queue *ubq;
 	struct request *req;
 	struct ublk_io *io;
+	unsigned data_len;
+	bool is_integrity;
 	size_t buf_off;
 	u16 tag, q_id;
 	ssize_t ret;
@@ -2681,6 +2710,10 @@ ublk_user_copy(struct kiocb *iocb, struct iov_iter *iter, int dir)
 	tag = ublk_pos_to_tag(iocb->ki_pos);
 	q_id = ublk_pos_to_hwq(iocb->ki_pos);
 	buf_off = ublk_pos_to_buf_off(iocb->ki_pos);
+	is_integrity = !!(iocb->ki_pos & UBLKSRV_IO_INTEGRITY_FLAG);
+
+	if (unlikely(!ublk_dev_support_integrity(ub) && is_integrity))
+		return -EINVAL;
 
 	if (q_id >= ub->dev_info.nr_hw_queues)
 		return -EINVAL;
@@ -2697,7 +2730,14 @@ ublk_user_copy(struct kiocb *iocb, struct iov_iter *iter, int dir)
 	if (!req)
 		return -EINVAL;
 
-	if (buf_off > blk_rq_bytes(req)) {
+	if (is_integrity) {
+		struct blk_integrity *bi = &req->q->limits.integrity;
+
+		data_len = bio_integrity_bytes(bi, blk_rq_sectors(req));
+	} else {
+		data_len = blk_rq_bytes(req);
+	}
+	if (buf_off > data_len) {
 		ret = -EINVAL;
 		goto out;
 	}
@@ -2707,7 +2747,10 @@ ublk_user_copy(struct kiocb *iocb, struct iov_iter *iter, int dir)
 		goto out;
 	}
 
-	ret = ublk_copy_user_pages(req, buf_off, iter, dir);
+	if (is_integrity)
+		ret = ublk_copy_user_integrity(req, buf_off, iter, dir);
+	else
+		ret = ublk_copy_user_pages(req, buf_off, iter, dir);
 
 out:
 	ublk_put_req_ref(io, req);
@@ -3948,6 +3991,12 @@ static int __init ublk_init(void)
 
 	BUILD_BUG_ON((u64)UBLKSRV_IO_BUF_OFFSET +
 			UBLKSRV_IO_BUF_TOTAL_SIZE < UBLKSRV_IO_BUF_OFFSET);
+	/*
+	 * Ensure UBLKSRV_IO_BUF_OFFSET + UBLKSRV_IO_BUF_TOTAL_SIZE
+	 * doesn't overflow into UBLKSRV_IO_INTEGRITY_FLAG
+	 */
+	BUILD_BUG_ON(UBLKSRV_IO_BUF_OFFSET + UBLKSRV_IO_BUF_TOTAL_SIZE >=
+		     UBLKSRV_IO_INTEGRITY_FLAG);
 	BUILD_BUG_ON(sizeof(struct ublk_auto_buf_reg) != 8);
 
 	init_waitqueue_head(&ublk_idr_wq);
diff --git a/include/uapi/linux/ublk_cmd.h b/include/uapi/linux/ublk_cmd.h
index dfde4aee39eb..61ac5d8e1078 100644
--- a/include/uapi/linux/ublk_cmd.h
+++ b/include/uapi/linux/ublk_cmd.h
@@ -134,6 +134,10 @@
 #define UBLKSRV_IO_BUF_TOTAL_BITS	(UBLK_QID_OFF + UBLK_QID_BITS)
 #define UBLKSRV_IO_BUF_TOTAL_SIZE	(1ULL << UBLKSRV_IO_BUF_TOTAL_BITS)
 
+/* Copy to/from request integrity buffer instead of data buffer */
+#define UBLK_INTEGRITY_FLAG_OFF 62
+#define UBLKSRV_IO_INTEGRITY_FLAG (1ULL << UBLK_INTEGRITY_FLAG_OFF)
+
 /*
  * ublk server can register data buffers for incoming I/O requests with a sparse
  * io_uring buffer table. The request buffer can then be used as the data buffer
-- 
cgit v1.2.3


From e6ce36ccc86f6d447808a6e620f56d440d74aa19 Mon Sep 17 00:00:00 2001
From: Askar Safin <safinaskar@gmail.com>
Date: Wed, 19 Nov 2025 22:24:07 +0000
Subject: init: remove /proc/sys/kernel/real-root-dev

It is not used anymore.

Signed-off-by: Askar Safin <safinaskar@gmail.com>
Link: https://patch.msgid.link/20251119222407.3333257-4-safinaskar@gmail.com
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 Documentation/admin-guide/sysctl/kernel.rst |  6 ------
 include/uapi/linux/sysctl.h                 |  1 -
 init/do_mounts_initrd.c                     | 20 --------------------
 3 files changed, 27 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/Documentation/admin-guide/sysctl/kernel.rst b/Documentation/admin-guide/sysctl/kernel.rst
index 239da22c4e28..bb577fac76a0 100644
--- a/Documentation/admin-guide/sysctl/kernel.rst
+++ b/Documentation/admin-guide/sysctl/kernel.rst
@@ -1235,12 +1235,6 @@ that support this feature.
 ==  ===========================================================================
 
 
-real-root-dev
-=============
-
-See Documentation/admin-guide/initrd.rst.
-
-
 reboot-cmd (SPARC only)
 =======================
 
diff --git a/include/uapi/linux/sysctl.h b/include/uapi/linux/sysctl.h
index 63d1464cb71c..1c7fe0f4dca4 100644
--- a/include/uapi/linux/sysctl.h
+++ b/include/uapi/linux/sysctl.h
@@ -92,7 +92,6 @@ enum
 	KERN_DOMAINNAME=8,	/* string: domainname */
 
 	KERN_PANIC=15,		/* int: panic timeout */
-	KERN_REALROOTDEV=16,	/* real root device to mount after initrd */
 
 	KERN_SPARC_REBOOT=21,	/* reboot command on Sparc */
 	KERN_CTLALTDEL=22,	/* int: allow ctl-alt-del to reboot */
diff --git a/init/do_mounts_initrd.c b/init/do_mounts_initrd.c
index fe335dbc95e0..892e69ab41c4 100644
--- a/init/do_mounts_initrd.c
+++ b/init/do_mounts_initrd.c
@@ -8,31 +8,11 @@
 
 unsigned long initrd_start, initrd_end;
 int initrd_below_start_ok;
-static unsigned int real_root_dev;	/* do_proc_dointvec cannot handle kdev_t */
 static int __initdata mount_initrd = 1;
 
 phys_addr_t phys_initrd_start __initdata;
 unsigned long phys_initrd_size __initdata;
 
-#ifdef CONFIG_SYSCTL
-static const struct ctl_table kern_do_mounts_initrd_table[] = {
-	{
-		.procname       = "real-root-dev",
-		.data           = &real_root_dev,
-		.maxlen         = sizeof(int),
-		.mode           = 0644,
-		.proc_handler   = proc_dointvec,
-	},
-};
-
-static __init int kernel_do_mounts_initrd_sysctls_init(void)
-{
-	register_sysctl_init("kernel", kern_do_mounts_initrd_table);
-	return 0;
-}
-late_initcall(kernel_do_mounts_initrd_sysctls_init);
-#endif /* CONFIG_SYSCTL */
-
 static int __init no_initrd(char *str)
 {
 	pr_warn("noinitrd option is deprecated and will be removed soon\n");
-- 
cgit v1.2.3


From e1cbdf78f60c35a1a320ca401852fd6a73624a4a Mon Sep 17 00:00:00 2001
From: Lachlan Hodges <lachlan.hodges@morsemicro.com>
Date: Fri, 9 Jan 2026 19:14:39 +1100
Subject: wifi: cfg80211: include S1G_NO_PRIMARY flag when sending channel

When sending a channel, ensure we include the IEEE80211_CHAN_S1G_NO_PRIMARY
flag.

Signed-off-by: Lachlan Hodges <lachlan.hodges@morsemicro.com>
Link: https://patch.msgid.link/20260109081439.3168-1-lachlan.hodges@morsemicro.com
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/uapi/linux/nl80211.h | 4 ++++
 net/wireless/nl80211.c       | 3 +++
 2 files changed, 7 insertions(+)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h
index 8134f10e4e6c..964e1c779cdd 100644
--- a/include/uapi/linux/nl80211.h
+++ b/include/uapi/linux/nl80211.h
@@ -4444,6 +4444,9 @@ enum nl80211_wmm_rule {
  *	channel in current regulatory domain.
  * @NL80211_FREQUENCY_ATTR_NO_16MHZ: 16 MHz operation is not allowed on this
  *	channel in current regulatory domain.
+ * @NL80211_FREQUENCY_ATTR_S1G_NO_PRIMARY: Channel is not permitted for use
+ *	as a primary channel. Does not prevent the channel from existing
+ *	as a non-primary subchannel. Only applicable to S1G channels.
  * @NL80211_FREQUENCY_ATTR_MAX: highest frequency attribute number
  *	currently defined
  * @__NL80211_FREQUENCY_ATTR_AFTER_LAST: internal use
@@ -4492,6 +4495,7 @@ enum nl80211_frequency_attr {
 	NL80211_FREQUENCY_ATTR_NO_4MHZ,
 	NL80211_FREQUENCY_ATTR_NO_8MHZ,
 	NL80211_FREQUENCY_ATTR_NO_16MHZ,
+	NL80211_FREQUENCY_ATTR_S1G_NO_PRIMARY,
 
 	/* keep last */
 	__NL80211_FREQUENCY_ATTR_AFTER_LAST,
diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c
index c961cd42a832..225580507a4b 100644
--- a/net/wireless/nl80211.c
+++ b/net/wireless/nl80211.c
@@ -1314,6 +1314,9 @@ static int nl80211_msg_put_channel(struct sk_buff *msg, struct wiphy *wiphy,
 		if ((chan->flags & IEEE80211_CHAN_NO_16MHZ) &&
 		    nla_put_flag(msg, NL80211_FREQUENCY_ATTR_NO_16MHZ))
 			goto nla_put_failure;
+		if ((chan->flags & IEEE80211_CHAN_S1G_NO_PRIMARY) &&
+		    nla_put_flag(msg, NL80211_FREQUENCY_ATTR_S1G_NO_PRIMARY))
+			goto nla_put_failure;
 	}
 
 	if (nla_put_u32(msg, NL80211_FREQUENCY_ATTR_MAX_TX_POWER,
-- 
cgit v1.2.3


From 93ada1b3da398b492c45429cef1a1c9651d5c7ba Mon Sep 17 00:00:00 2001
From: Yoav Cohen <yoav@nvidia.com>
Date: Tue, 13 Jan 2026 00:05:01 +0200
Subject: ublk: add UBLK_CMD_TRY_STOP_DEV command

Add a best-effort stop command, UBLK_CMD_TRY_STOP_DEV, which only stops a
ublk device when it has no active openers.

Unlike UBLK_CMD_STOP_DEV, this command does not disrupt existing users.
New opens are blocked only after disk_openers has reached zero; if the
device is busy, the command returns -EBUSY and leaves it running.

The ub->block_open flag is used only to close a race with an in-progress
open and does not otherwise change open behavior.

Advertise support via the UBLK_F_SAFE_STOP_DEV feature flag.

Signed-off-by: Yoav Cohen <yoav@nvidia.com>
Reviewed-by: Ming Lei <ming.lei@redhat.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 drivers/block/ublk_drv.c      | 44 +++++++++++++++++++++++++++++++++++++++++--
 include/uapi/linux/ublk_cmd.h |  9 ++++++++-
 2 files changed, 50 insertions(+), 3 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c
index 73490890242b..aaf94d2fb789 100644
--- a/drivers/block/ublk_drv.c
+++ b/drivers/block/ublk_drv.c
@@ -56,6 +56,7 @@
 #define UBLK_CMD_DEL_DEV_ASYNC	_IOC_NR(UBLK_U_CMD_DEL_DEV_ASYNC)
 #define UBLK_CMD_UPDATE_SIZE	_IOC_NR(UBLK_U_CMD_UPDATE_SIZE)
 #define UBLK_CMD_QUIESCE_DEV	_IOC_NR(UBLK_U_CMD_QUIESCE_DEV)
+#define UBLK_CMD_TRY_STOP_DEV	_IOC_NR(UBLK_U_CMD_TRY_STOP_DEV)
 
 #define UBLK_IO_REGISTER_IO_BUF		_IOC_NR(UBLK_U_IO_REGISTER_IO_BUF)
 #define UBLK_IO_UNREGISTER_IO_BUF	_IOC_NR(UBLK_U_IO_UNREGISTER_IO_BUF)
@@ -76,7 +77,8 @@
 		| UBLK_F_QUIESCE \
 		| UBLK_F_PER_IO_DAEMON \
 		| UBLK_F_BUF_REG_OFF_DAEMON \
-		| (IS_ENABLED(CONFIG_BLK_DEV_INTEGRITY) ? UBLK_F_INTEGRITY : 0))
+		| (IS_ENABLED(CONFIG_BLK_DEV_INTEGRITY) ? UBLK_F_INTEGRITY : 0) \
+		| UBLK_F_SAFE_STOP_DEV)
 
 #define UBLK_F_ALL_RECOVERY_FLAGS (UBLK_F_USER_RECOVERY \
 		| UBLK_F_USER_RECOVERY_REISSUE \
@@ -243,6 +245,8 @@ struct ublk_device {
 	struct delayed_work	exit_work;
 	struct work_struct	partition_scan_work;
 
+	bool			block_open; /* protected by open_mutex */
+
 	struct ublk_queue       *queues[];
 };
 
@@ -984,6 +988,9 @@ static int ublk_open(struct gendisk *disk, blk_mode_t mode)
 			return -EPERM;
 	}
 
+	if (ub->block_open)
+		return -ENXIO;
+
 	return 0;
 }
 
@@ -3343,7 +3350,8 @@ static int ublk_ctrl_add_dev(const struct ublksrv_ctrl_cmd *header)
 	ub->dev_info.flags |= UBLK_F_CMD_IOCTL_ENCODE |
 		UBLK_F_URING_CMD_COMP_IN_TASK |
 		UBLK_F_PER_IO_DAEMON |
-		UBLK_F_BUF_REG_OFF_DAEMON;
+		UBLK_F_BUF_REG_OFF_DAEMON |
+		UBLK_F_SAFE_STOP_DEV;
 
 	/* GET_DATA isn't needed any more with USER_COPY or ZERO COPY */
 	if (ub->dev_info.flags & (UBLK_F_USER_COPY | UBLK_F_SUPPORT_ZERO_COPY |
@@ -3464,6 +3472,34 @@ static void ublk_ctrl_stop_dev(struct ublk_device *ub)
 	ublk_stop_dev(ub);
 }
 
+static int ublk_ctrl_try_stop_dev(struct ublk_device *ub)
+{
+	struct gendisk *disk;
+	int ret = 0;
+
+	disk = ublk_get_disk(ub);
+	if (!disk)
+		return -ENODEV;
+
+	mutex_lock(&disk->open_mutex);
+	if (disk_openers(disk) > 0) {
+		ret = -EBUSY;
+		goto unlock;
+	}
+	ub->block_open = true;
+	/* release open_mutex as del_gendisk() will reacquire it */
+	mutex_unlock(&disk->open_mutex);
+
+	ublk_ctrl_stop_dev(ub);
+	goto out;
+
+unlock:
+	mutex_unlock(&disk->open_mutex);
+out:
+	ublk_put_disk(disk);
+	return ret;
+}
+
 static int ublk_ctrl_get_dev_info(struct ublk_device *ub,
 		const struct ublksrv_ctrl_cmd *header)
 {
@@ -3859,6 +3895,7 @@ static int ublk_ctrl_uring_cmd_permission(struct ublk_device *ub,
 	case UBLK_CMD_END_USER_RECOVERY:
 	case UBLK_CMD_UPDATE_SIZE:
 	case UBLK_CMD_QUIESCE_DEV:
+	case UBLK_CMD_TRY_STOP_DEV:
 		mask = MAY_READ | MAY_WRITE;
 		break;
 	default:
@@ -3972,6 +4009,9 @@ static int ublk_ctrl_uring_cmd(struct io_uring_cmd *cmd,
 	case UBLK_CMD_QUIESCE_DEV:
 		ret = ublk_ctrl_quiesce_dev(ub, header);
 		break;
+	case UBLK_CMD_TRY_STOP_DEV:
+		ret = ublk_ctrl_try_stop_dev(ub);
+		break;
 	default:
 		ret = -EOPNOTSUPP;
 		break;
diff --git a/include/uapi/linux/ublk_cmd.h b/include/uapi/linux/ublk_cmd.h
index 61ac5d8e1078..90f47da4f435 100644
--- a/include/uapi/linux/ublk_cmd.h
+++ b/include/uapi/linux/ublk_cmd.h
@@ -55,7 +55,8 @@
 	_IOWR('u', 0x15, struct ublksrv_ctrl_cmd)
 #define UBLK_U_CMD_QUIESCE_DEV		\
 	_IOWR('u', 0x16, struct ublksrv_ctrl_cmd)
-
+#define UBLK_U_CMD_TRY_STOP_DEV		\
+	_IOWR('u', 0x17, struct ublksrv_ctrl_cmd)
 /*
  * 64bits are enough now, and it should be easy to extend in case of
  * running out of feature flags
@@ -321,6 +322,12 @@
  */
 #define UBLK_F_INTEGRITY (1ULL << 16)
 
+/*
+ * The device supports the UBLK_CMD_TRY_STOP_DEV command, which
+ * allows stopping the device only if there are no openers.
+ */
+#define UBLK_F_SAFE_STOP_DEV	(1ULL << 17)
+
 /* device state */
 #define UBLK_S_DEV_DEAD	0
 #define UBLK_S_DEV_LIVE	1
-- 
cgit v1.2.3


From 406fc2e9ca65e0df345ebf4ce95aa87cb6416f35 Mon Sep 17 00:00:00 2001
From: Deepa Guthyappa Madivalara <deepa.madivalara@oss.qualcomm.com>
Date: Wed, 10 Dec 2025 10:59:04 -0800
Subject: media: uapi: videodev2: Add support for AV1 stateful decoder

Introduce a new pixel format, V4L2_PIX_FMT_AV1, to the
Video4Linux2(V4L2) API. This format is intended for AV1
bitstreams in stateful decoding/encoding workflows.
The fourcc code 'AV01' is used to distinguish
this format from the existing V4L2_PIX_FMT_AV1_FRAME,
which is used for stateless AV1 decoder implementation.

Reviewed-by: Bryan O'Donoghue <bryan.odonoghue@linaro.org>
Reviewed-by: Nicolas Dufresne <nicolas.dufresne@collabora.com>
Reviewed-by: Hans Verkuil <hverkuil+cisco@kernel.org>
Signed-off-by: Deepa Guthyappa Madivalara <deepa.madivalara@oss.qualcomm.com>
Tested-by: Val Packett <val@packett.cool>
Signed-off-by: Bryan O'Donoghue <bod@kernel.org>
Signed-off-by: Hans Verkuil <hverkuil+cisco@kernel.org>
---
 Documentation/userspace-api/media/v4l/pixfmt-compressed.rst | 8 ++++++++
 include/uapi/linux/videodev2.h                              | 1 +
 2 files changed, 9 insertions(+)

(limited to 'include/uapi/linux')

diff --git a/Documentation/userspace-api/media/v4l/pixfmt-compressed.rst b/Documentation/userspace-api/media/v4l/pixfmt-compressed.rst
index c7efb0465db6..235f955d3cd5 100644
--- a/Documentation/userspace-api/media/v4l/pixfmt-compressed.rst
+++ b/Documentation/userspace-api/media/v4l/pixfmt-compressed.rst
@@ -275,6 +275,14 @@ Compressed Formats
         of macroblocks to decode a full corresponding frame to the matching
         capture buffer.
 
+    * .. _V4L2-PIX-FMT-AV1:
+
+      - ``V4L2_PIX_FMT_AV1``
+      - 'AV01'
+      - AV1 compressed video frame. This format is adapted for implementing an
+        AV1 pipeline. The decoder implements a stateful video decoder and expects
+        one Temporal Unit per buffer in OBU stream format.
+        The encoder generates one Temporal Unit per buffer.
 .. raw:: latex
 
     \normalsize
diff --git a/include/uapi/linux/videodev2.h b/include/uapi/linux/videodev2.h
index add08188f068..848e86617d5c 100644
--- a/include/uapi/linux/videodev2.h
+++ b/include/uapi/linux/videodev2.h
@@ -775,6 +775,7 @@ struct v4l2_pix_format {
 #define V4L2_PIX_FMT_H264_SLICE v4l2_fourcc('S', '2', '6', '4') /* H264 parsed slices */
 #define V4L2_PIX_FMT_HEVC_SLICE v4l2_fourcc('S', '2', '6', '5') /* HEVC parsed slices */
 #define V4L2_PIX_FMT_AV1_FRAME v4l2_fourcc('A', 'V', '1', 'F') /* AV1 parsed frame */
+#define V4L2_PIX_FMT_AV1      v4l2_fourcc('A', 'V', '0', '1') /* AV1 */
 #define V4L2_PIX_FMT_SPK      v4l2_fourcc('S', 'P', 'K', '0') /* Sorenson Spark */
 #define V4L2_PIX_FMT_RV30     v4l2_fourcc('R', 'V', '3', '0') /* RealVideo 8 */
 #define V4L2_PIX_FMT_RV40     v4l2_fourcc('R', 'V', '4', '0') /* RealVideo 9 & 10 */
-- 
cgit v1.2.3


From 1bddd758bac21fbbd8a06af746ec7b6d878a9d2c Mon Sep 17 00:00:00 2001
From: Jonas Köppeler <j.koeppeler@tu-berlin.de>
Date: Fri, 9 Jan 2026 14:15:34 +0100
Subject: net/sched: sch_cake: share shaper state across sub-instances of
 cake_mq
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This commit adds shared shaper state across the cake instances beneath a
cake_mq qdisc. It works by periodically tracking the number of active
instances, and scaling the configured rate by the number of active
queues.

The scan is lockless and simply reads the qlen and the last_active state
variable of each of the instances configured beneath the parent cake_mq
instance. Locking is not required since the values are only updated by
the owning instance, and eventual consistency is sufficient for the
purpose of estimating the number of active queues.

The interval for scanning the number of active queues is set to 200 us.
We found this to be a good tradeoff between overhead and response time.
For a detailed analysis of this aspect see the Netdevconf talk:

https://netdevconf.info/0x19/docs/netdev-0x19-paper16-talk-paper.pdf

Reviewed-by: Jamal Hadi Salim <jhs@mojatatu.com>
Signed-off-by: Jonas Köppeler <j.koeppeler@tu-berlin.de>
Signed-off-by: Toke Høiland-Jørgensen <toke@redhat.com>
Link: https://patch.msgid.link/20260109-mq-cake-sub-qdisc-v8-5-8d613fece5d8@redhat.com
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
---
 Documentation/netlink/specs/tc.yaml |  3 +++
 include/uapi/linux/pkt_sched.h      |  1 +
 net/sched/sch_cake.c                | 51 +++++++++++++++++++++++++++++++++++++
 3 files changed, 55 insertions(+)

(limited to 'include/uapi/linux')

diff --git a/Documentation/netlink/specs/tc.yaml b/Documentation/netlink/specs/tc.yaml
index b398f7a46dae..2e663333a279 100644
--- a/Documentation/netlink/specs/tc.yaml
+++ b/Documentation/netlink/specs/tc.yaml
@@ -2207,6 +2207,9 @@ attribute-sets:
       -
         name: blue-timer-us
         type: s32
+      -
+        name: active-queues
+        type: u32
   -
     name: cake-tin-stats-attrs
     name-prefix: tca-cake-tin-stats-
diff --git a/include/uapi/linux/pkt_sched.h b/include/uapi/linux/pkt_sched.h
index c2da76e78bad..66e8072f44df 100644
--- a/include/uapi/linux/pkt_sched.h
+++ b/include/uapi/linux/pkt_sched.h
@@ -1036,6 +1036,7 @@ enum {
 	TCA_CAKE_STATS_DROP_NEXT_US,
 	TCA_CAKE_STATS_P_DROP,
 	TCA_CAKE_STATS_BLUE_TIMER_US,
+	TCA_CAKE_STATS_ACTIVE_QUEUES,
 	__TCA_CAKE_STATS_MAX
 };
 #define TCA_CAKE_STATS_MAX (__TCA_CAKE_STATS_MAX - 1)
diff --git a/net/sched/sch_cake.c b/net/sched/sch_cake.c
index 2e60e7980558..e30ef7f8ee68 100644
--- a/net/sched/sch_cake.c
+++ b/net/sched/sch_cake.c
@@ -202,6 +202,7 @@ struct cake_sched_config {
 	u64		rate_bps;
 	u64		interval;
 	u64		target;
+	u64		sync_time;
 	u32		buffer_config_limit;
 	u32		fwmark_mask;
 	u16		fwmark_shft;
@@ -258,6 +259,11 @@ struct cake_sched_data {
 	u16		max_adjlen;
 	u16		min_netlen;
 	u16		min_adjlen;
+
+	/* mq sync state */
+	u64		last_checked_active;
+	u64		last_active;
+	u32		active_queues;
 };
 
 enum {
@@ -384,6 +390,8 @@ static const u32 inv_sqrt_cache[REC_INV_SQRT_CACHE] = {
 	1239850263, 1191209601, 1147878294, 1108955788
 };
 
+static void cake_set_rate(struct cake_tin_data *b, u64 rate, u32 mtu,
+			  u64 target_ns, u64 rtt_est_ns);
 /* http://en.wikipedia.org/wiki/Methods_of_computing_square_roots
  * new_invsqrt = (invsqrt / 2) * (3 - count * invsqrt^2)
  *
@@ -2004,6 +2012,40 @@ static struct sk_buff *cake_dequeue(struct Qdisc *sch)
 	u64 delay;
 	u32 len;
 
+	if (q->config->is_shared && now - q->last_checked_active >= q->config->sync_time) {
+		struct net_device *dev = qdisc_dev(sch);
+		struct cake_sched_data *other_priv;
+		u64 new_rate = q->config->rate_bps;
+		u64 other_qlen, other_last_active;
+		struct Qdisc *other_sch;
+		u32 num_active_qs = 1;
+		unsigned int ntx;
+
+		for (ntx = 0; ntx < dev->num_tx_queues; ntx++) {
+			other_sch = rcu_dereference(netdev_get_tx_queue(dev, ntx)->qdisc_sleeping);
+			other_priv = qdisc_priv(other_sch);
+
+			if (other_priv == q)
+				continue;
+
+			other_qlen = READ_ONCE(other_sch->q.qlen);
+			other_last_active = READ_ONCE(other_priv->last_active);
+
+			if (other_qlen || other_last_active > q->last_checked_active)
+				num_active_qs++;
+		}
+
+		if (num_active_qs > 1)
+			new_rate = div64_u64(q->config->rate_bps, num_active_qs);
+
+		/* mtu = 0 is used to only update the rate and not mess with cobalt params */
+		cake_set_rate(b, new_rate, 0, 0, 0);
+		q->last_checked_active = now;
+		q->active_queues = num_active_qs;
+		q->rate_ns = b->tin_rate_ns;
+		q->rate_shft = b->tin_rate_shft;
+	}
+
 begin:
 	if (!sch->q.qlen)
 		return NULL;
@@ -2203,6 +2245,7 @@ retry:
 
 	b->tin_ecn_mark += !!flow->cvars.ecn_marked;
 	qdisc_bstats_update(sch, skb);
+	WRITE_ONCE(q->last_active, now);
 
 	/* collect delay stats */
 	delay = ktime_to_ns(ktime_sub(now, cobalt_get_enqueue_time(skb)));
@@ -2303,6 +2346,9 @@ static void cake_set_rate(struct cake_tin_data *b, u64 rate, u32 mtu,
 	b->tin_rate_ns   = rate_ns;
 	b->tin_rate_shft = rate_shft;
 
+	if (mtu == 0)
+		return;
+
 	byte_target_ns = (byte_target * rate_ns) >> rate_shft;
 
 	b->cparams.target = max((byte_target_ns * 3) / 2, target_ns);
@@ -2769,6 +2815,7 @@ static void cake_config_init(struct cake_sched_config *q, bool is_shared)
 			       */
 	q->rate_flags |= CAKE_FLAG_SPLIT_GSO;
 	q->is_shared = is_shared;
+	q->sync_time = 200 * NSEC_PER_USEC;
 }
 
 static int cake_init(struct Qdisc *sch, struct nlattr *opt,
@@ -2842,6 +2889,9 @@ static int cake_init(struct Qdisc *sch, struct nlattr *opt,
 	qd->avg_peak_bandwidth = q->rate_bps;
 	qd->min_netlen = ~0;
 	qd->min_adjlen = ~0;
+	qd->active_queues = 0;
+	qd->last_checked_active = 0;
+
 	return 0;
 err:
 	kvfree(qd->config);
@@ -2974,6 +3024,7 @@ static int cake_dump_stats(struct Qdisc *sch, struct gnet_dump *d)
 	PUT_STAT_U32(MAX_ADJLEN, q->max_adjlen);
 	PUT_STAT_U32(MIN_NETLEN, q->min_netlen);
 	PUT_STAT_U32(MIN_ADJLEN, q->min_adjlen);
+	PUT_STAT_U32(ACTIVE_QUEUES, q->active_queues);
 
 #undef PUT_STAT_U32
 #undef PUT_STAT_U64
-- 
cgit v1.2.3


From f29c852149f94dc1975c64fa919b3dd62db04d23 Mon Sep 17 00:00:00 2001
From: Ainy Kumari <ainy.kumari@oss.qualcomm.com>
Date: Wed, 14 Jan 2026 16:48:52 +0530
Subject: wifi: cfg80211: add support for EPPKE Authentication Protocol

Add an extended feature flag NL80211_EXT_FEATURE_EPPKE to allow a
driver to indicate support for the Enhanced Privacy Protection Key
Exchange (EPPKE) authentication protocol in non-AP STA mode, as
defined in "IEEE P802.11bi/D3.0, 12.16.9".

In case of SME in userspace, the Authentication frame body is prepared
in userspace while the driver finalizes the Authentication frame once
it receives the required fields and elements. The driver indicates
support for EPPKE using the extended feature flag so that userspace
can initiate EPPKE authentication.

When the feature flag is set, process EPPKE Authentication frames from
userspace in non-AP STA mode. If the flag is not set, reject EPPKE
Authentication frames.

Define a new authentication type NL80211_AUTHTYPE_EPPKE for EPPKE.

Signed-off-by: Ainy Kumari <ainy.kumari@oss.qualcomm.com>
Co-developed-by: Kavita Kavita <kavita.kavita@oss.qualcomm.com>
Signed-off-by: Kavita Kavita <kavita.kavita@oss.qualcomm.com>
Link: https://patch.msgid.link/20260114111900.2196941-2-kavita.kavita@oss.qualcomm.com
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/linux/ieee80211.h    |  1 +
 include/uapi/linux/nl80211.h |  7 +++++++
 net/wireless/nl80211.c       | 14 ++++++++++++--
 3 files changed, 20 insertions(+), 2 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/include/linux/ieee80211.h b/include/linux/ieee80211.h
index 96439de55f07..fbde215c25aa 100644
--- a/include/linux/ieee80211.h
+++ b/include/linux/ieee80211.h
@@ -1351,6 +1351,7 @@ struct ieee80211_tdls_data {
 #define WLAN_AUTH_FILS_SK 4
 #define WLAN_AUTH_FILS_SK_PFS 5
 #define WLAN_AUTH_FILS_PK 6
+#define WLAN_AUTH_EPPKE 9
 #define WLAN_AUTH_LEAP 128
 
 #define WLAN_AUTH_CHALLENGE_LEN 128
diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h
index 964e1c779cdd..351d4d176f87 100644
--- a/include/uapi/linux/nl80211.h
+++ b/include/uapi/linux/nl80211.h
@@ -5429,6 +5429,7 @@ enum nl80211_bss_status {
  * @NL80211_AUTHTYPE_FILS_SK: Fast Initial Link Setup shared key
  * @NL80211_AUTHTYPE_FILS_SK_PFS: Fast Initial Link Setup shared key with PFS
  * @NL80211_AUTHTYPE_FILS_PK: Fast Initial Link Setup public key
+ * @NL80211_AUTHTYPE_EPPKE: Enhanced Privacy Protection Key Exchange
  * @__NL80211_AUTHTYPE_NUM: internal
  * @NL80211_AUTHTYPE_MAX: maximum valid auth algorithm
  * @NL80211_AUTHTYPE_AUTOMATIC: determine automatically (if necessary by
@@ -5444,6 +5445,7 @@ enum nl80211_auth_type {
 	NL80211_AUTHTYPE_FILS_SK,
 	NL80211_AUTHTYPE_FILS_SK_PFS,
 	NL80211_AUTHTYPE_FILS_PK,
+	NL80211_AUTHTYPE_EPPKE,
 
 	/* keep last */
 	__NL80211_AUTHTYPE_NUM,
@@ -6748,6 +6750,10 @@ enum nl80211_feature_flags {
  * @NL80211_EXT_FEATURE_BEACON_RATE_EHT: Driver supports beacon rate
  *	configuration (AP/mesh) with EHT rates.
  *
+ * @NL80211_EXT_FEATURE_EPPKE: Driver supports Enhanced Privacy Protection
+ *	Key Exchange (EPPKE) with user space SME (NL80211_CMD_AUTHENTICATE)
+ *	in non-AP STA mode.
+ *
  * @NUM_NL80211_EXT_FEATURES: number of extended features.
  * @MAX_NL80211_EXT_FEATURES: highest extended feature index.
  */
@@ -6824,6 +6830,7 @@ enum nl80211_ext_feature_index {
 	NL80211_EXT_FEATURE_DFS_CONCURRENT,
 	NL80211_EXT_FEATURE_SPP_AMSDU_SUPPORT,
 	NL80211_EXT_FEATURE_BEACON_RATE_EHT,
+	NL80211_EXT_FEATURE_EPPKE,
 
 	/* add new features before the definition below */
 	NUM_NL80211_EXT_FEATURES,
diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c
index 225580507a4b..8f3a27b7d4fd 100644
--- a/net/wireless/nl80211.c
+++ b/net/wireless/nl80211.c
@@ -6473,6 +6473,10 @@ static bool nl80211_valid_auth_type(struct cfg80211_registered_device *rdev,
 		     auth_type == NL80211_AUTHTYPE_FILS_SK_PFS ||
 		     auth_type == NL80211_AUTHTYPE_FILS_PK))
 			return false;
+		if (!wiphy_ext_feature_isset(&rdev->wiphy,
+					     NL80211_EXT_FEATURE_EPPKE) &&
+		    auth_type == NL80211_AUTHTYPE_EPPKE)
+			return false;
 		return true;
 	case NL80211_CMD_CONNECT:
 		if (!(rdev->wiphy.features & NL80211_FEATURE_SAE) &&
@@ -6490,6 +6494,10 @@ static bool nl80211_valid_auth_type(struct cfg80211_registered_device *rdev,
 			    NL80211_EXT_FEATURE_FILS_SK_OFFLOAD) &&
 		    auth_type == NL80211_AUTHTYPE_FILS_SK)
 			return false;
+		if (!wiphy_ext_feature_isset(&rdev->wiphy,
+					     NL80211_EXT_FEATURE_EPPKE) &&
+		    auth_type == NL80211_AUTHTYPE_EPPKE)
+			return false;
 		return true;
 	case NL80211_CMD_START_AP:
 		if (!wiphy_ext_feature_isset(&rdev->wiphy,
@@ -11956,7 +11964,8 @@ static int nl80211_authenticate(struct sk_buff *skb, struct genl_info *info)
 	if ((auth_type == NL80211_AUTHTYPE_SAE ||
 	     auth_type == NL80211_AUTHTYPE_FILS_SK ||
 	     auth_type == NL80211_AUTHTYPE_FILS_SK_PFS ||
-	     auth_type == NL80211_AUTHTYPE_FILS_PK) &&
+	     auth_type == NL80211_AUTHTYPE_FILS_PK ||
+	     auth_type == NL80211_AUTHTYPE_EPPKE) &&
 	    !info->attrs[NL80211_ATTR_AUTH_DATA])
 		return -EINVAL;
 
@@ -11964,7 +11973,8 @@ static int nl80211_authenticate(struct sk_buff *skb, struct genl_info *info)
 		if (auth_type != NL80211_AUTHTYPE_SAE &&
 		    auth_type != NL80211_AUTHTYPE_FILS_SK &&
 		    auth_type != NL80211_AUTHTYPE_FILS_SK_PFS &&
-		    auth_type != NL80211_AUTHTYPE_FILS_PK)
+		    auth_type != NL80211_AUTHTYPE_FILS_PK &&
+		    auth_type != NL80211_AUTHTYPE_EPPKE)
 			return -EINVAL;
 		req.auth_data = nla_data(info->attrs[NL80211_ATTR_AUTH_DATA]);
 		req.auth_data_len = nla_len(info->attrs[NL80211_ATTR_AUTH_DATA]);
-- 
cgit v1.2.3


From 9d17a040c15d4b99484f13cf08dd45a9e308beeb Mon Sep 17 00:00:00 2001
From: Ainy Kumari <ainy.kumari@oss.qualcomm.com>
Date: Wed, 14 Jan 2026 16:48:53 +0530
Subject: wifi: cfg80211: add feature flag for (re)association frame encryption

Introduce an extended feature flag that allows drivers to signal
support for encryption of (Re)Association Request and Response frames
in both non-AP STA and AP mode, as specified in
"IEEE P802.11bi/D3.0, 12.16.6".

Signed-off-by: Ainy Kumari <ainy.kumari@oss.qualcomm.com>
Signed-off-by: Kavita Kavita <kavita.kavita@oss.qualcomm.com>
Link: https://patch.msgid.link/20260114111900.2196941-3-kavita.kavita@oss.qualcomm.com
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/uapi/linux/nl80211.h | 6 ++++++
 1 file changed, 6 insertions(+)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h
index 351d4d176f87..60573334e086 100644
--- a/include/uapi/linux/nl80211.h
+++ b/include/uapi/linux/nl80211.h
@@ -6754,6 +6754,11 @@ enum nl80211_feature_flags {
  *	Key Exchange (EPPKE) with user space SME (NL80211_CMD_AUTHENTICATE)
  *	in non-AP STA mode.
  *
+ * @NL80211_EXT_FEATURE_ASSOC_FRAME_ENCRYPTION: This specifies that the
+ *	driver supports encryption of (Re)Association Request and Response
+ *	frames in both non-AP STA and AP mode as specified in
+ *	"IEEE P802.11bi/D3.0, 12.16.6".
+ *
  * @NUM_NL80211_EXT_FEATURES: number of extended features.
  * @MAX_NL80211_EXT_FEATURES: highest extended feature index.
  */
@@ -6831,6 +6836,7 @@ enum nl80211_ext_feature_index {
 	NL80211_EXT_FEATURE_SPP_AMSDU_SUPPORT,
 	NL80211_EXT_FEATURE_BEACON_RATE_EHT,
 	NL80211_EXT_FEATURE_EPPKE,
+	NL80211_EXT_FEATURE_ASSOC_FRAME_ENCRYPTION,
 
 	/* add new features before the definition below */
 	NUM_NL80211_EXT_FEATURES,
-- 
cgit v1.2.3


From 6ee3a22c61cdf57d71592ec9f3b9439cd5d0c75f Mon Sep 17 00:00:00 2001
From: Sai Pratyusha Magam <sai.magam@oss.qualcomm.com>
Date: Wed, 14 Jan 2026 16:48:55 +0530
Subject: wifi: nl80211: Add support for EPP peer indication

Introduce a new netlink attribute NL80211_ATTR_EPP_PEER
to be used with NL80211_CMD_NEW_STA and
NL80211_CMD_ADD_LINK_STA for the userspace to indicate
that a non-AP STA is an Enhanced Privacy Protection (EPP)
peer.

Co-developed-by: Rohan Dutta <quic_drohan@quicinc.com>
Signed-off-by: Rohan Dutta <quic_drohan@quicinc.com>
Signed-off-by: Sai Pratyusha Magam <sai.magam@oss.qualcomm.com>
Signed-off-by: Kavita Kavita <kavita.kavita@oss.qualcomm.com>
Link: https://patch.msgid.link/20260114111900.2196941-5-kavita.kavita@oss.qualcomm.com
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/net/cfg80211.h       | 2 ++
 include/uapi/linux/nl80211.h | 5 +++++
 net/wireless/nl80211.c       | 5 +++++
 3 files changed, 12 insertions(+)

(limited to 'include/uapi/linux')

diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h
index cbccedf32228..6d8e35a0dde4 100644
--- a/include/net/cfg80211.h
+++ b/include/net/cfg80211.h
@@ -1785,6 +1785,7 @@ struct cfg80211_ttlm_params {
  *	present/updated
  * @eml_cap: EML capabilities of this station
  * @link_sta_params: link related params.
+ * @epp_peer: EPP peer indication
  */
 struct station_parameters {
 	struct net_device *vlan;
@@ -1811,6 +1812,7 @@ struct station_parameters {
 	bool eml_cap_present;
 	u16 eml_cap;
 	struct link_station_parameters link_sta_params;
+	bool epp_peer;
 };
 
 /**
diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h
index 60573334e086..eb92296457c9 100644
--- a/include/uapi/linux/nl80211.h
+++ b/include/uapi/linux/nl80211.h
@@ -2973,6 +2973,9 @@ enum nl80211_commands {
  *	primary channel is 2 MHz wide, and the control channel designates
  *	the 1 MHz primary subchannel within that 2 MHz primary.
  *
+ * @NL80211_ATTR_EPP_PEER: A flag attribute to indicate if the peer is an EPP
+ *	STA. Used with %NL80211_CMD_NEW_STA and %NL80211_CMD_ADD_LINK_STA
+ *
  * @NUM_NL80211_ATTR: total number of nl80211_attrs available
  * @NL80211_ATTR_MAX: highest attribute number currently defined
  * @__NL80211_ATTR_AFTER_LAST: internal use
@@ -3541,6 +3544,8 @@ enum nl80211_attrs {
 
 	NL80211_ATTR_S1G_PRIMARY_2MHZ,
 
+	NL80211_ATTR_EPP_PEER,
+
 	/* add attributes here, update the policy in nl80211.c */
 
 	__NL80211_ATTR_AFTER_LAST,
diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c
index df159a5f1816..3d74bea09ba3 100644
--- a/net/wireless/nl80211.c
+++ b/net/wireless/nl80211.c
@@ -932,6 +932,7 @@ static const struct nla_policy nl80211_policy[NUM_NL80211_ATTR] = {
 		NLA_POLICY_NESTED(nl80211_s1g_short_beacon),
 	[NL80211_ATTR_BSS_PARAM] = { .type = NLA_FLAG },
 	[NL80211_ATTR_S1G_PRIMARY_2MHZ] = { .type = NLA_FLAG },
+	[NL80211_ATTR_EPP_PEER] = { .type = NLA_FLAG },
 };
 
 /* policy for the key attributes */
@@ -8792,6 +8793,10 @@ static int nl80211_new_station(struct sk_buff *skb, struct genl_info *info)
 			goto out;
 		}
 	}
+
+	params.epp_peer =
+		nla_get_flag(info->attrs[NL80211_ATTR_EPP_PEER]);
+
 	err = rdev_add_station(rdev, dev, mac_addr, &params);
 out:
 	dev_put(params.vlan);
-- 
cgit v1.2.3


From dacbfc16780837aa3e00c684d89492d211fd809f Mon Sep 17 00:00:00 2001
From: Thorsten Blum <thorsten.blum@linux.dev>
Date: Mon, 5 Jan 2026 13:24:03 +0100
Subject: crypto: af_alg - Annotate struct af_alg_iv with __counted_by

Add the __counted_by() compiler attribute to the flexible array member
'iv' to improve access bounds-checking via CONFIG_UBSAN_BOUNDS and
CONFIG_FORTIFY_SOURCE.

Reviewed-by: Simon Horman <horms@kernel.org>
Signed-off-by: Thorsten Blum <thorsten.blum@linux.dev>
Link: https://patch.msgid.link/20260105122402.2685-2-thorsten.blum@linux.dev
Signed-off-by: Kees Cook <kees@kernel.org>
---
 include/uapi/linux/if_alg.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/if_alg.h b/include/uapi/linux/if_alg.h
index b35871cbeed7..4f51e198ac2e 100644
--- a/include/uapi/linux/if_alg.h
+++ b/include/uapi/linux/if_alg.h
@@ -42,7 +42,7 @@ struct sockaddr_alg_new {
 
 struct af_alg_iv {
 	__u32	ivlen;
-	__u8	iv[];
+	__u8	iv[] __counted_by(ivlen);
 };
 
 /* Socket options */
-- 
cgit v1.2.3


From d2bdcde9626cbea0c44a6aaa33b440c8adf81e09 Mon Sep 17 00:00:00 2001
From: Dapeng Mi <dapeng1.mi@linux.intel.com>
Date: Wed, 14 Jan 2026 09:17:45 +0800
Subject: perf/x86/intel: Add support for PEBS memory auxiliary info field in
 DMR

With the introduction of the OMR feature, the PEBS memory auxiliary info
field for load and store latency events has been restructured for DMR.

The memory auxiliary info field's bit[8] indicates whether an L2 cache
miss occurred for a memory load or store instruction. If bit[8] is 0,
it signifies no L2 cache miss, and bits[7:0] specify the exact cache data
source (up to the L2 cache level). If bit[8] is 1, bits[7:0] represent
the OMR encoding, indicating the specific L3 cache or memory region
involved in the memory access.

A significant enhancement for OMR encoding is the ability to provide
up to 8 fine-grained memory regions in addition to the cache region,
offering more detailed insights into memory access regions.

For detailed information on the memory auxiliary info encoding, please
refer to section 16.2 "PEBS LOAD LATENCY AND STORE LATENCY FACILITY" in
the ISE documentation.

This patch ensures that the PEBS memory auxiliary info field is correctly
interpreted and utilized in DMR.

Signed-off-by: Dapeng Mi <dapeng1.mi@linux.intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://patch.msgid.link/20260114011750.350569-3-dapeng1.mi@linux.intel.com
---
 arch/x86/events/intel/ds.c            | 140 ++++++++++++++++++++++++++++++++++
 arch/x86/events/perf_event.h          |   2 +
 include/uapi/linux/perf_event.h       |  27 ++++++-
 tools/include/uapi/linux/perf_event.h |  27 ++++++-
 4 files changed, 190 insertions(+), 6 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/arch/x86/events/intel/ds.c b/arch/x86/events/intel/ds.c
index feb1c3cf63e4..272e652f25fc 100644
--- a/arch/x86/events/intel/ds.c
+++ b/arch/x86/events/intel/ds.c
@@ -34,6 +34,17 @@ struct pebs_record_32 {
 
  */
 
+union omr_encoding {
+	struct {
+		u8 omr_source : 4;
+		u8 omr_remote : 1;
+		u8 omr_hitm : 1;
+		u8 omr_snoop : 1;
+		u8 omr_promoted : 1;
+	};
+	u8 omr_full;
+};
+
 union intel_x86_pebs_dse {
 	u64 val;
 	struct {
@@ -73,6 +84,18 @@ union intel_x86_pebs_dse {
 		unsigned int lnc_addr_blk:1;
 		unsigned int ld_reserved6:18;
 	};
+	struct {
+		unsigned int pnc_dse: 8;
+		unsigned int pnc_l2_miss:1;
+		unsigned int pnc_stlb_clean_hit:1;
+		unsigned int pnc_stlb_any_hit:1;
+		unsigned int pnc_stlb_miss:1;
+		unsigned int pnc_locked:1;
+		unsigned int pnc_data_blk:1;
+		unsigned int pnc_addr_blk:1;
+		unsigned int pnc_fb_full:1;
+		unsigned int ld_reserved8:16;
+	};
 };
 
 
@@ -228,6 +251,85 @@ void __init intel_pmu_pebs_data_source_lnl(void)
 	__intel_pmu_pebs_data_source_cmt(data_source);
 }
 
+/* Version for Panthercove and later */
+
+/* L2 hit */
+#define PNC_PEBS_DATA_SOURCE_MAX	16
+static u64 pnc_pebs_l2_hit_data_source[PNC_PEBS_DATA_SOURCE_MAX] = {
+	P(OP, LOAD) | P(LVL, NA) | LEVEL(NA) | P(SNOOP, NA),	/* 0x00: non-cache access */
+	OP_LH               | LEVEL(L0) | P(SNOOP, NONE),	/* 0x01: L0 hit */
+	OP_LH | P(LVL, L1)  | LEVEL(L1) | P(SNOOP, NONE),	/* 0x02: L1 hit */
+	OP_LH | P(LVL, LFB) | LEVEL(LFB) | P(SNOOP, NONE),	/* 0x03: L1 Miss Handling Buffer hit */
+	OP_LH | P(LVL, L2)  | LEVEL(L2) | P(SNOOP, NONE),	/* 0x04: L2 Hit Clean */
+	0,							/* 0x05: Reserved */
+	0,							/* 0x06: Reserved */
+	OP_LH | P(LVL, L2)  | LEVEL(L2) | P(SNOOP, HIT),	/* 0x07: L2 Hit Snoop HIT */
+	OP_LH | P(LVL, L2)  | LEVEL(L2) | P(SNOOP, HITM),	/* 0x08: L2 Hit Snoop Hit Modified */
+	OP_LH | P(LVL, L2)  | LEVEL(L2) | P(SNOOP, MISS),	/* 0x09: Prefetch Promotion */
+	OP_LH | P(LVL, L2)  | LEVEL(L2) | P(SNOOP, MISS),	/* 0x0a: Cross Core Prefetch Promotion */
+	0,							/* 0x0b: Reserved */
+	0,							/* 0x0c: Reserved */
+	0,							/* 0x0d: Reserved */
+	0,							/* 0x0e: Reserved */
+	OP_LH | P(LVL, UNC) | LEVEL(NA) | P(SNOOP, NONE),	/* 0x0f: uncached */
+};
+
+/* L2 miss */
+#define OMR_DATA_SOURCE_MAX		16
+static u64 omr_data_source[OMR_DATA_SOURCE_MAX] = {
+	P(OP, LOAD) | P(LVL, NA) | LEVEL(NA) | P(SNOOP, NA),	/* 0x00: invalid */
+	0,							/* 0x01: Reserved */
+	OP_LH | P(LVL, L3) | LEVEL(L3) | P(REGION, L_SHARE),	/* 0x02: local CA shared cache */
+	OP_LH | P(LVL, L3) | LEVEL(L3) | P(REGION, L_NON_SHARE),/* 0x03: local CA non-shared cache */
+	OP_LH | P(LVL, L3) | LEVEL(L3) | P(REGION, O_IO),	/* 0x04: other CA IO agent */
+	OP_LH | P(LVL, L3) | LEVEL(L3) | P(REGION, O_SHARE),	/* 0x05: other CA shared cache */
+	OP_LH | P(LVL, L3) | LEVEL(L3) | P(REGION, O_NON_SHARE),/* 0x06: other CA non-shared cache */
+	OP_LH | LEVEL(RAM) | P(REGION, MMIO),			/* 0x07: MMIO */
+	OP_LH | LEVEL(RAM) | P(REGION, MEM0),			/* 0x08: Memory region 0 */
+	OP_LH | LEVEL(RAM) | P(REGION, MEM1),			/* 0x09: Memory region 1 */
+	OP_LH | LEVEL(RAM) | P(REGION, MEM2),			/* 0x0a: Memory region 2 */
+	OP_LH | LEVEL(RAM) | P(REGION, MEM3),			/* 0x0b: Memory region 3 */
+	OP_LH | LEVEL(RAM) | P(REGION, MEM4),			/* 0x0c: Memory region 4 */
+	OP_LH | LEVEL(RAM) | P(REGION, MEM5),			/* 0x0d: Memory region 5 */
+	OP_LH | LEVEL(RAM) | P(REGION, MEM6),			/* 0x0e: Memory region 6 */
+	OP_LH | LEVEL(RAM) | P(REGION, MEM7),			/* 0x0f: Memory region 7 */
+};
+
+/* Map an OMR encoding (PEBS aux info bits[7:0] on L2 miss) to perf_mem_data_src bits. */
+static u64 parse_omr_data_source(u8 dse)
+{
+	union omr_encoding omr = { .omr_full = dse };
+	u64 val = omr_data_source[omr.omr_source];
+
+	if (omr.omr_source > 0x1 && omr.omr_source < 0x7)
+		val |= omr.omr_remote ? P(LVL, REM_CCE1) : 0;
+	else if (omr.omr_source > 0x7)
+		val |= omr.omr_remote ? P(LVL, REM_RAM1) : P(LVL, LOC_RAM);
+
+	if (omr.omr_remote)
+		val |= REM;
+
+	val |= omr.omr_hitm ? P(SNOOP, HITM) : P(SNOOP, HIT);
+
+	if (omr.omr_source == 0x2) {
+		/* 2-bit snoop code: omr_snoop is bit 0, omr_promoted is bit 1 */
+		u8 snoop = omr.omr_snoop | (omr.omr_promoted << 1);
+
+		if (snoop == 0x0)
+			val |= P(SNOOP, NA);
+		else if (snoop == 0x1)
+			val |= P(SNOOP, MISS);
+		else if (snoop == 0x2)
+			val |= P(SNOOP, HIT);
+		else if (snoop == 0x3)
+			val |= P(SNOOP, NONE);
+	} else if (omr.omr_source > 0x2 && omr.omr_source < 0x7) {
+		val |= omr.omr_snoop ? P(SNOOPX, FWD) : 0;
+	}
+
+	return val;
+}
+
 static u64 precise_store_data(u64 status)
 {
 	union intel_x86_pebs_dse dse;
@@ -411,6 +513,44 @@ u64 arl_h_latency_data(struct perf_event *event, u64 status)
 	return lnl_latency_data(event, status);
 }
 
+u64 pnc_latency_data(struct perf_event *event, u64 status)
+{
+	union intel_x86_pebs_dse dse;
+	union perf_mem_data_src src;
+	u64 val;
+
+	dse.val = status;
+
+	if (!dse.pnc_l2_miss)
+		val = pnc_pebs_l2_hit_data_source[dse.pnc_dse & 0xf];
+	else
+		val = parse_omr_data_source(dse.pnc_dse);
+
+	if (!val)
+		val = P(OP, LOAD) | LEVEL(NA) | P(SNOOP, NA);
+
+	if (dse.pnc_stlb_miss)
+		val |= P(TLB, MISS) | P(TLB, L2);
+	else
+		val |= P(TLB, HIT) | P(TLB, L1) | P(TLB, L2);
+
+	if (dse.pnc_locked)
+		val |= P(LOCK, LOCKED);
+
+	if (dse.pnc_data_blk)
+		val |= P(BLK, DATA);
+	if (dse.pnc_addr_blk)
+		val |= P(BLK, ADDR);
+	if (!dse.pnc_data_blk && !dse.pnc_addr_blk)
+		val |= P(BLK, NA);
+
+	src.val = val;
+	if (event->hw.flags & PERF_X86_EVENT_PEBS_ST_HSW)
+		src.mem_op = P(OP, STORE);
+
+	return src.val;
+}
+
 static u64 load_latency_data(struct perf_event *event, u64 status)
 {
 	union intel_x86_pebs_dse dse;
diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h
index 586e3fdfe6d8..bd501c2a0f73 100644
--- a/arch/x86/events/perf_event.h
+++ b/arch/x86/events/perf_event.h
@@ -1664,6 +1664,8 @@ u64 lnl_latency_data(struct perf_event *event, u64 status);
 
 u64 arl_h_latency_data(struct perf_event *event, u64 status);
 
+u64 pnc_latency_data(struct perf_event *event, u64 status);
+
 extern struct event_constraint intel_core2_pebs_event_constraints[];
 
 extern struct event_constraint intel_atom_pebs_event_constraints[];
diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h
index c44a8fb3e418..533393ec94d0 100644
--- a/include/uapi/linux/perf_event.h
+++ b/include/uapi/linux/perf_event.h
@@ -1330,14 +1330,16 @@ union perf_mem_data_src {
 			mem_snoopx  :  2, /* Snoop mode, ext */
 			mem_blk     :  3, /* Access blocked */
 			mem_hops    :  3, /* Hop level */
-			mem_rsvd    : 18;
+			mem_region  :  5, /* cache/memory regions */
+			mem_rsvd    : 13;
 	};
 };
 #elif defined(__BIG_ENDIAN_BITFIELD)
 union perf_mem_data_src {
 	__u64 val;
 	struct {
-		__u64	mem_rsvd    : 18,
+		__u64	mem_rsvd    : 13,
+			mem_region  :  5, /* cache/memory regions */
 			mem_hops    :  3, /* Hop level */
 			mem_blk     :  3, /* Access blocked */
 			mem_snoopx  :  2, /* Snoop mode, ext */
@@ -1394,7 +1396,7 @@ union perf_mem_data_src {
 #define PERF_MEM_LVLNUM_L4			0x0004 /* L4 */
 #define PERF_MEM_LVLNUM_L2_MHB			0x0005 /* L2 Miss Handling Buffer */
 #define PERF_MEM_LVLNUM_MSC			0x0006 /* Memory-side Cache */
-/* 0x007 available */
+#define PERF_MEM_LVLNUM_L0			0x0007 /* L0 */
 #define PERF_MEM_LVLNUM_UNC			0x0008 /* Uncached */
 #define PERF_MEM_LVLNUM_CXL			0x0009 /* CXL */
 #define PERF_MEM_LVLNUM_IO			0x000a /* I/O */
@@ -1447,6 +1449,25 @@ union perf_mem_data_src {
 /* 5-7 available */
 #define PERF_MEM_HOPS_SHIFT			43
 
+/* Cache/Memory region */
+#define PERF_MEM_REGION_NA		0x0  /* Invalid */
+#define PERF_MEM_REGION_RSVD		0x01 /* Reserved */
+#define PERF_MEM_REGION_L_SHARE		0x02 /* Local CA shared cache */
+#define PERF_MEM_REGION_L_NON_SHARE	0x03 /* Local CA non-shared cache */
+#define PERF_MEM_REGION_O_IO		0x04 /* Other CA IO agent */
+#define PERF_MEM_REGION_O_SHARE		0x05 /* Other CA shared cache */
+#define PERF_MEM_REGION_O_NON_SHARE	0x06 /* Other CA non-shared cache */
+#define PERF_MEM_REGION_MMIO		0x07 /* MMIO */
+#define PERF_MEM_REGION_MEM0		0x08 /* Memory region 0 */
+#define PERF_MEM_REGION_MEM1		0x09 /* Memory region 1 */
+#define PERF_MEM_REGION_MEM2		0x0a /* Memory region 2 */
+#define PERF_MEM_REGION_MEM3		0x0b /* Memory region 3 */
+#define PERF_MEM_REGION_MEM4		0x0c /* Memory region 4 */
+#define PERF_MEM_REGION_MEM5		0x0d /* Memory region 5 */
+#define PERF_MEM_REGION_MEM6		0x0e /* Memory region 6 */
+#define PERF_MEM_REGION_MEM7		0x0f /* Memory region 7 */
+#define PERF_MEM_REGION_SHIFT		46
+
 #define PERF_MEM_S(a, s) \
 	(((__u64)PERF_MEM_##a##_##s) << PERF_MEM_##a##_SHIFT)
 
diff --git a/tools/include/uapi/linux/perf_event.h b/tools/include/uapi/linux/perf_event.h
index c44a8fb3e418..d4b99610a3b0 100644
--- a/tools/include/uapi/linux/perf_event.h
+++ b/tools/include/uapi/linux/perf_event.h
@@ -1330,14 +1330,16 @@ union perf_mem_data_src {
 			mem_snoopx  :  2, /* Snoop mode, ext */
 			mem_blk     :  3, /* Access blocked */
 			mem_hops    :  3, /* Hop level */
-			mem_rsvd    : 18;
+			mem_region  :  5, /* cache/memory regions */
+			mem_rsvd    : 13;
 	};
 };
 #elif defined(__BIG_ENDIAN_BITFIELD)
 union perf_mem_data_src {
 	__u64 val;
 	struct {
-		__u64	mem_rsvd    : 18,
+		__u64	mem_rsvd    : 13,
+			mem_region  :  5, /* cache/memory regions */
 			mem_hops    :  3, /* Hop level */
 			mem_blk     :  3, /* Access blocked */
 			mem_snoopx  :  2, /* Snoop mode, ext */
@@ -1394,7 +1396,7 @@ union perf_mem_data_src {
 #define PERF_MEM_LVLNUM_L4			0x0004 /* L4 */
 #define PERF_MEM_LVLNUM_L2_MHB			0x0005 /* L2 Miss Handling Buffer */
 #define PERF_MEM_LVLNUM_MSC			0x0006 /* Memory-side Cache */
-/* 0x007 available */
+#define PERF_MEM_LVLNUM_L0			0x0007   /* L0 */
 #define PERF_MEM_LVLNUM_UNC			0x0008 /* Uncached */
 #define PERF_MEM_LVLNUM_CXL			0x0009 /* CXL */
 #define PERF_MEM_LVLNUM_IO			0x000a /* I/O */
@@ -1447,6 +1449,25 @@ union perf_mem_data_src {
 /* 5-7 available */
 #define PERF_MEM_HOPS_SHIFT			43
 
+/* Cache/Memory region */
+#define PERF_MEM_REGION_NA		0x0  /* Invalid */
+#define PERF_MEM_REGION_RSVD		0x01 /* Reserved */
+#define PERF_MEM_REGION_L_SHARE		0x02 /* Local CA shared cache */
+#define PERF_MEM_REGION_L_NON_SHARE	0x03 /* Local CA non-shared cache */
+#define PERF_MEM_REGION_O_IO		0x04 /* Other CA IO agent */
+#define PERF_MEM_REGION_O_SHARE		0x05 /* Other CA shared cache */
+#define PERF_MEM_REGION_O_NON_SHARE	0x06 /* Other CA non-shared cache */
+#define PERF_MEM_REGION_MMIO		0x07 /* MMIO */
+#define PERF_MEM_REGION_MEM0		0x08 /* Memory region 0 */
+#define PERF_MEM_REGION_MEM1		0x09 /* Memory region 1 */
+#define PERF_MEM_REGION_MEM2		0x0a /* Memory region 2 */
+#define PERF_MEM_REGION_MEM3		0x0b /* Memory region 3 */
+#define PERF_MEM_REGION_MEM4		0x0c /* Memory region 4 */
+#define PERF_MEM_REGION_MEM5		0x0d /* Memory region 5 */
+#define PERF_MEM_REGION_MEM6		0x0e /* Memory region 6 */
+#define PERF_MEM_REGION_MEM7		0x0f /* Memory region 7 */
+#define PERF_MEM_REGION_SHIFT		46
+
 #define PERF_MEM_S(a, s) \
 	(((__u64)PERF_MEM_##a##_##s) << PERF_MEM_##a##_SHIFT)
 
-- 
cgit v1.2.3


From 567873005dca1be0a3b3e2e309a8f0de14d2b827 Mon Sep 17 00:00:00 2001
From: Gal Pressman <gal@nvidia.com>
Date: Thu, 15 Jan 2026 08:05:44 +0200
Subject: ethtool: Clarify len/n_stats fields in/out semantics

Document that the 'len' field in ethtool_gstrings and 'n_stats' field in
ethtool_stats optionally serve dual purposes: on entry they specify the
number of items requested, and on return they indicate the number
actually returned (which is not necessarily the same).

Signed-off-by: Gal Pressman <gal@nvidia.com>
Reviewed-by: Dragos Tatulea <dtatulea@nvidia.com>
Link: https://patch.msgid.link/20260115060544.481550-1-gal@nvidia.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/uapi/linux/ethtool.h | 18 ++++++++++++++++--
 1 file changed, 16 insertions(+), 2 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/ethtool.h b/include/uapi/linux/ethtool.h
index 5daa8f225b67..bbfe6e1cf01b 100644
--- a/include/uapi/linux/ethtool.h
+++ b/include/uapi/linux/ethtool.h
@@ -1096,13 +1096,20 @@ enum ethtool_module_fw_flash_status {
  * struct ethtool_gstrings - string set for data tagging
  * @cmd: Command number = %ETHTOOL_GSTRINGS
  * @string_set: String set ID; one of &enum ethtool_stringset
- * @len: On return, the number of strings in the string set
+ * @len: Number of strings in the string set
  * @data: Buffer for strings.  Each string is null-padded to a size of
  *	%ETH_GSTRING_LEN.
  *
  * Users must use %ETHTOOL_GSSET_INFO to find the number of strings in
  * the string set.  They must allocate a buffer of the appropriate
  * size immediately following this structure.
+ *
+ * Setting @len on input is optional (though preferred), but must be zeroed
+ * otherwise.
+ * When set, @len will return the requested count if it matches the actual
+ * count; otherwise, it will be zero.
+ * This prevents issues when the number of strings is different than the
+ * userspace allocation.
  */
 struct ethtool_gstrings {
 	__u32	cmd;
@@ -1179,13 +1186,20 @@ struct ethtool_test {
 /**
  * struct ethtool_stats - device-specific statistics
  * @cmd: Command number = %ETHTOOL_GSTATS
- * @n_stats: On return, the number of statistics
+ * @n_stats: Number of statistics
  * @data: Array of statistics
  *
  * Users must use %ETHTOOL_GSSET_INFO or %ETHTOOL_GDRVINFO to find the
  * number of statistics that will be returned.  They must allocate a
  * buffer of the appropriate size (8 * number of statistics)
  * immediately following this structure.
+ *
+ * Setting @n_stats on input is optional (though preferred), but must be zeroed
+ * otherwise.
+ * When set, @n_stats will return the requested count if it matches the actual
+ * count; otherwise, it will be zero.
+ * This prevents issues when the number of stats is different than the
+ * userspace allocation.
  */
 struct ethtool_stats {
 	__u32	cmd;
-- 
cgit v1.2.3


From d89ccbf3dde727d91a242a5a3f3b70a90579b057 Mon Sep 17 00:00:00 2001
From: Richard Leitner <richard.leitner@linux.dev>
Date: Tue, 9 Dec 2025 23:44:36 +0100
Subject: media: v4l: ctrls: add a control for flash/strobe duration

Add a V4L2_CID_FLASH_DURATION control to set the duration of a
flash/strobe pulse. This controls the length of the flash/strobe pulse
output by the device (typically a camera sensor) and connected to the flash
controller. This is different to the V4L2_CID_FLASH_TIMEOUT control,
which is implemented by the flash controller and defines a limit after
which the flash is "forcefully" turned off again.

Reviewed-by: Laurent Pinchart <laurent.pinchart@ideasonboard.com>
Signed-off-by: Richard Leitner <richard.leitner@linux.dev>
Signed-off-by: Sakari Ailus <sakari.ailus@linux.intel.com>
Signed-off-by: Hans Verkuil <hverkuil+cisco@kernel.org>
---
 drivers/media/v4l2-core/v4l2-ctrls-defs.c | 1 +
 include/uapi/linux/v4l2-controls.h        | 1 +
 2 files changed, 2 insertions(+)

(limited to 'include/uapi/linux')

diff --git a/drivers/media/v4l2-core/v4l2-ctrls-defs.c b/drivers/media/v4l2-core/v4l2-ctrls-defs.c
index ad41f65374e2..4848423205ff 100644
--- a/drivers/media/v4l2-core/v4l2-ctrls-defs.c
+++ b/drivers/media/v4l2-core/v4l2-ctrls-defs.c
@@ -1135,6 +1135,7 @@ const char *v4l2_ctrl_get_name(u32 id)
 	case V4L2_CID_FLASH_FAULT:		return "Faults";
 	case V4L2_CID_FLASH_CHARGE:		return "Charge";
 	case V4L2_CID_FLASH_READY:		return "Ready to Strobe";
+	case V4L2_CID_FLASH_DURATION:		return "Strobe Duration";
 
 	/* JPEG encoder controls */
 	/* Keep the order of the 'case's the same as in v4l2-controls.h! */
diff --git a/include/uapi/linux/v4l2-controls.h b/include/uapi/linux/v4l2-controls.h
index f84ed133a6c9..357845830fe9 100644
--- a/include/uapi/linux/v4l2-controls.h
+++ b/include/uapi/linux/v4l2-controls.h
@@ -1192,6 +1192,7 @@ enum v4l2_flash_strobe_source {
 
 #define V4L2_CID_FLASH_CHARGE			(V4L2_CID_FLASH_CLASS_BASE + 11)
 #define V4L2_CID_FLASH_READY			(V4L2_CID_FLASH_CLASS_BASE + 12)
+#define V4L2_CID_FLASH_DURATION			(V4L2_CID_FLASH_CLASS_BASE + 13)
 
 
 /* JPEG-class control IDs */
-- 
cgit v1.2.3


From 5be4154f6255d92d9d2ad5da658d7d33a655386f Mon Sep 17 00:00:00 2001
From: Richard Leitner <richard.leitner@linux.dev>
Date: Tue, 9 Dec 2025 23:44:37 +0100
Subject: media: v4l: ctrls: add a control for enabling strobe output

Add a control V4L2_CID_FLASH_STROBE_OE to en- or disable the
strobe output of v4l2 devices (most likely sensors).

Signed-off-by: Richard Leitner <richard.leitner@linux.dev>
Signed-off-by: Sakari Ailus <sakari.ailus@linux.intel.com>
Signed-off-by: Hans Verkuil <hverkuil+cisco@kernel.org>
---
 drivers/media/v4l2-core/v4l2-ctrls-defs.c | 2 ++
 include/uapi/linux/v4l2-controls.h        | 1 +
 2 files changed, 3 insertions(+)

(limited to 'include/uapi/linux')

diff --git a/drivers/media/v4l2-core/v4l2-ctrls-defs.c b/drivers/media/v4l2-core/v4l2-ctrls-defs.c
index 4848423205ff..765aeeec84fe 100644
--- a/drivers/media/v4l2-core/v4l2-ctrls-defs.c
+++ b/drivers/media/v4l2-core/v4l2-ctrls-defs.c
@@ -1136,6 +1136,7 @@ const char *v4l2_ctrl_get_name(u32 id)
 	case V4L2_CID_FLASH_CHARGE:		return "Charge";
 	case V4L2_CID_FLASH_READY:		return "Ready to Strobe";
 	case V4L2_CID_FLASH_DURATION:		return "Strobe Duration";
+	case V4L2_CID_FLASH_STROBE_OE:		return "Strobe Output Enable";
 
 	/* JPEG encoder controls */
 	/* Keep the order of the 'case's the same as in v4l2-controls.h! */
@@ -1282,6 +1283,7 @@ void v4l2_ctrl_fill(u32 id, const char **name, enum v4l2_ctrl_type *type,
 	case V4L2_CID_FLASH_STROBE_STATUS:
 	case V4L2_CID_FLASH_CHARGE:
 	case V4L2_CID_FLASH_READY:
+	case V4L2_CID_FLASH_STROBE_OE:
 	case V4L2_CID_MPEG_VIDEO_DECODER_MPEG4_DEBLOCK_FILTER:
 	case V4L2_CID_MPEG_VIDEO_DECODER_SLICE_INTERFACE:
 	case V4L2_CID_MPEG_VIDEO_DEC_DISPLAY_DELAY_ENABLE:
diff --git a/include/uapi/linux/v4l2-controls.h b/include/uapi/linux/v4l2-controls.h
index 357845830fe9..572622e4535e 100644
--- a/include/uapi/linux/v4l2-controls.h
+++ b/include/uapi/linux/v4l2-controls.h
@@ -1193,6 +1193,7 @@ enum v4l2_flash_strobe_source {
 #define V4L2_CID_FLASH_CHARGE			(V4L2_CID_FLASH_CLASS_BASE + 11)
 #define V4L2_CID_FLASH_READY			(V4L2_CID_FLASH_CLASS_BASE + 12)
 #define V4L2_CID_FLASH_DURATION			(V4L2_CID_FLASH_CLASS_BASE + 13)
+#define V4L2_CID_FLASH_STROBE_OE		(V4L2_CID_FLASH_CLASS_BASE + 14)
 
 
 /* JPEG-class control IDs */
-- 
cgit v1.2.3


From 10d28cffb3f6ec7ad67f0a4cd32c2afa92909452 Mon Sep 17 00:00:00 2001
From: Ian Abbott <abbotti@mev.co.uk>
Date: Wed, 3 Dec 2025 16:24:38 +0000
Subject: comedi: Fix getting range information for subdevices 16 to 255

The `COMEDI_RANGEINFO` ioctl does not work properly for subdevice
indices above 15.  Currently, the only in-tree COMEDI drivers that
support more than 16 subdevices are the "8255" driver and the
"comedi_bond" driver.  Making the ioctl work for subdevice indices up to
255 is achievable.  It needs minor changes to the handling of the
`COMEDI_RANGEINFO` and `COMEDI_CHANINFO` ioctls that should be mostly
harmless to user-space, apart from making them less broken.  Details
follow...

The `COMEDI_RANGEINFO` ioctl command gets the list of supported ranges
(usually with units of volts or milliamps) for a COMEDI subdevice or
channel.  (Only some subdevices have per-channel range tables, indicated
by the `SDF_RANGETYPE` flag in the subdevice information.)  It uses a
`range_type` value and a user-space pointer, both supplied by
user-space, but the `range_type` value should match what was obtained
using the `COMEDI_CHANINFO` ioctl (if the subdevice has per-channel
range tables)  or `COMEDI_SUBDINFO` ioctl (if the subdevice uses a
single range table for all channels).  Bits 15 to 0 of the `range_type`
value contain the length of the range table, which is the only part that
user-space should care about (so it can use a suitably sized buffer to
fetch the range table).  Bits 23 to 16 store the channel index, which is
assumed to be no more than 255 if the subdevice has per-channel range
tables, and is set to 0 if the subdevice has a single range table.  For
`range_type` values produced by the `COMEDI_SUBDINFO` ioctl, bits 31 to
24 contain the subdevice index, which is assumed to be no more than 255.
But for `range_type` values produced by the `COMEDI_CHANINFO` ioctl,
bits 27 to 24 contain the subdevice index, which is assumed to be no
more than 15, and bits 31 to 28 contain the COMEDI device's minor device
number for some unknown reason lost in the mists of time.  The
`COMEDI_RANGEINFO` ioctl extracts the length from bits 15 to 0 of the
user-supplied `range_type` value, extracts the channel index from bits
23 to 16 (only used if the subdevice has per-channel range tables),
extracts the subdevice index from bits 27 to 24, and ignores bits 31 to
28.  So for subdevice indices 16 to 255, the `COMEDI_SUBDINFO` or
`COMEDI_CHANINFO` ioctl will report a `range_type` value that doesn't
work with the `COMEDI_RANGEINFO` ioctl.  It will either get the range
table for the subdevice index modulo 16, or will fail with `-EINVAL`.

To fix this, always use bits 31 to 24 of the `range_type` value to hold
the subdevice index (assumed to be no more than 255).  This affects the
`COMEDI_CHANINFO` and `COMEDI_RANGEINFO` ioctls.  There should not be
anything in user-space that depends on the old, broken usage, although
it may now see different values in bits 31 to 28 of the `range_type`
values reported by the `COMEDI_CHANINFO` ioctl for subdevices that have
per-channel range tables.  User-space should not be trying to decode bits
31 to 16 of the `range_type` values anyway.

Fixes: ed9eccbe8970 ("Staging: add comedi core")
Cc: stable@vger.kernel.org #5.17+
Signed-off-by: Ian Abbott <abbotti@mev.co.uk>
Link: https://patch.msgid.link/20251203162438.176841-1-abbotti@mev.co.uk
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/comedi/comedi_fops.c | 2 +-
 drivers/comedi/range.c       | 2 +-
 include/uapi/linux/comedi.h  | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/drivers/comedi/comedi_fops.c b/drivers/comedi/comedi_fops.c
index 657c98cd723e..2c3eb9e89571 100644
--- a/drivers/comedi/comedi_fops.c
+++ b/drivers/comedi/comedi_fops.c
@@ -1155,7 +1155,7 @@ static int do_chaninfo_ioctl(struct comedi_device *dev,
 		for (i = 0; i < s->n_chan; i++) {
 			int x;
 
-			x = (dev->minor << 28) | (it->subdev << 24) | (i << 16) |
+			x = (it->subdev << 24) | (i << 16) |
 			    (s->range_table_list[i]->length);
 			if (put_user(x, it->rangelist + i))
 				return -EFAULT;
diff --git a/drivers/comedi/range.c b/drivers/comedi/range.c
index 8f43cf88d784..5b8f662365e3 100644
--- a/drivers/comedi/range.c
+++ b/drivers/comedi/range.c
@@ -52,7 +52,7 @@ int do_rangeinfo_ioctl(struct comedi_device *dev,
 	const struct comedi_lrange *lr;
 	struct comedi_subdevice *s;
 
-	subd = (it->range_type >> 24) & 0xf;
+	subd = (it->range_type >> 24) & 0xff;
 	chan = (it->range_type >> 16) & 0xff;
 
 	if (!dev->attached)
diff --git a/include/uapi/linux/comedi.h b/include/uapi/linux/comedi.h
index 7314e5ee0a1e..798ec9a39e12 100644
--- a/include/uapi/linux/comedi.h
+++ b/include/uapi/linux/comedi.h
@@ -640,7 +640,7 @@ struct comedi_chaninfo {
 
 /**
  * struct comedi_rangeinfo - used to retrieve the range table for a channel
- * @range_type:		Encodes subdevice index (bits 27:24), channel index
+ * @range_type:		Encodes subdevice index (bits 31:24), channel index
  *			(bits 23:16) and range table length (bits 15:0).
  * @range_ptr:		Pointer to array of @struct comedi_krange to be filled
  *			in with the range table for the channel or subdevice.
-- 
cgit v1.2.3


From 9b8a0ba68246a61d903ce62c35c303b1501df28b Mon Sep 17 00:00:00 2001
From: Christian Brauner <brauner@kernel.org>
Date: Mon, 29 Dec 2025 14:03:24 +0100
Subject: mount: add OPEN_TREE_NAMESPACE

When creating containers the setup usually involves using CLONE_NEWNS
via clone3() or unshare(). This copies the caller's complete mount
namespace. The runtime will also assemble a new rootfs and then use
pivot_root() to switch the old mount tree with the new rootfs. Afterward
it will recursively umount the old mount tree thereby getting rid of all
mounts.

On a basic system here where the mount table isn't particularly large
this still copies about 30 mounts. Copying all of these mounts only to
get rid of them later is pretty wasteful.

This is exacerbated if intermediary mount namespaces are used that only
exist for a very short amount of time and are immediately destroyed
again causing a ton of mounts to be copied and destroyed needlessly.

With a large mount table and a system where thousands or tens of thousands
of containers are spawned in parallel this quickly becomes a bottleneck
increasing contention on the semaphore.

Extend open_tree() with a new OPEN_TREE_NAMESPACE flag. Similar to
OPEN_TREE_CLONE only the indicated mount tree is copied. Instead of
returning a file descriptor referring to that mount tree
OPEN_TREE_NAMESPACE will cause open_tree() to return a file descriptor
to a new mount namespace. In that new mount namespace the copied mount
tree has been mounted on top of a copy of the real rootfs.

The caller can setns() into that mount namespace and perform any
additionally required setup such as move_mount() detached mounts in
there.

This allows OPEN_TREE_NAMESPACE to function as a combined
unshare(CLONE_NEWNS) and pivot_root().

A caller may for example choose to create an extremely minimal rootfs:

fd_mntns = open_tree(-EBADF, "/var/lib/containers/wootwoot", OPEN_TREE_NAMESPACE);

This will create a mount namespace where "wootwoot" has become the
rootfs mounted on top of the real rootfs. The caller can now setns()
into this new mount namespace and assemble additional mounts.

This also works with user namespaces:

unshare(CLONE_NEWUSER);
fd_mntns = open_tree(-EBADF, "/var/lib/containers/wootwoot", OPEN_TREE_NAMESPACE);

which creates a new mount namespace owned by the earlier created user
namespace with "wootwoot" as the rootfs mounted on top of the real
rootfs.

Link: https://patch.msgid.link/20251229-work-empty-namespace-v1-1-bfb24c7b061f@kernel.org
Tested-by: Jeff Layton <jlayton@kernel.org>
Reviewed-by: Aleksa Sarai <cyphar@cyphar.com>
Reviewed-by: Jeff Layton <jlayton@kernel.org>
Suggested-by: Christian Brauner <brauner@kernel.org>
Suggested-by: Aleksa Sarai <cyphar@cyphar.com>
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 fs/internal.h              |   1 +
 fs/namespace.c             | 163 ++++++++++++++++++++++++++++++++++++++++-----
 fs/nsfs.c                  |  13 ++++
 include/uapi/linux/mount.h |   3 +-
 4 files changed, 163 insertions(+), 17 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/fs/internal.h b/fs/internal.h
index e333b105337a..f6932e639f36 100644
--- a/fs/internal.h
+++ b/fs/internal.h
@@ -246,6 +246,7 @@ extern void mnt_pin_kill(struct mount *m);
  */
 extern const struct dentry_operations ns_dentry_operations;
 int open_namespace(struct ns_common *ns);
+struct file *open_namespace_file(struct ns_common *ns);
 
 /*
  * fs/stat.c:
diff --git a/fs/namespace.c b/fs/namespace.c
index ec3b16fedd9f..59557019e422 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -2796,6 +2796,9 @@ static inline void unlock_mount(struct pinned_mountpoint *m)
 		__unlock_mount(m);
 }
 
+static void lock_mount_exact(const struct path *path,
+			     struct pinned_mountpoint *mp);
+
 #define LOCK_MOUNT_MAYBE_BENEATH(mp, path, beneath) \
 	struct pinned_mountpoint mp __cleanup(unlock_mount) = {}; \
 	do_lock_mount((path), &mp, (beneath))
@@ -2946,10 +2949,11 @@ static inline bool may_copy_tree(const struct path *path)
 	return check_anonymous_mnt(mnt);
 }
 
-
-static struct mount *__do_loopback(const struct path *old_path, int recurse)
+static struct mount *__do_loopback(const struct path *old_path,
+				   unsigned int flags, unsigned int copy_flags)
 {
 	struct mount *old = real_mount(old_path->mnt);
+	bool recurse = flags & AT_RECURSIVE;
 
 	if (IS_MNT_UNBINDABLE(old))
 		return ERR_PTR(-EINVAL);
@@ -2960,10 +2964,22 @@ static struct mount *__do_loopback(const struct path *old_path, int recurse)
 	if (!recurse && __has_locked_children(old, old_path->dentry))
 		return ERR_PTR(-EINVAL);
 
+	/*
+	 * When creating a new mount namespace we don't want to copy over
+	 * mounts of mount namespaces to avoid the risk of cycles and also to
+	 * minimize the default complex interdependencies between mount
+	 * namespaces.
+	 *
+	 * We could ofc just check whether all mount namespace files aren't
+	 * creating cycles but really let's keep this simple.
+	 */
+	if (!(flags & OPEN_TREE_NAMESPACE))
+		copy_flags |= CL_COPY_MNT_NS_FILE;
+
 	if (recurse)
-		return copy_tree(old, old_path->dentry, CL_COPY_MNT_NS_FILE);
-	else
-		return clone_mnt(old, old_path->dentry, 0);
+		return copy_tree(old, old_path->dentry, copy_flags);
+
+	return clone_mnt(old, old_path->dentry, copy_flags);
 }
 
 /*
@@ -2974,7 +2990,9 @@ static int do_loopback(const struct path *path, const char *old_name,
 {
 	struct path old_path __free(path_put) = {};
 	struct mount *mnt = NULL;
+	unsigned int flags = recurse ? AT_RECURSIVE : 0;
 	int err;
+
 	if (!old_name || !*old_name)
 		return -EINVAL;
 	err = kern_path(old_name, LOOKUP_FOLLOW|LOOKUP_AUTOMOUNT, &old_path);
@@ -2991,7 +3009,7 @@ static int do_loopback(const struct path *path, const char *old_name,
 	if (!check_mnt(mp.parent))
 		return -EINVAL;
 
-	mnt = __do_loopback(&old_path, recurse);
+	mnt = __do_loopback(&old_path, flags, 0);
 	if (IS_ERR(mnt))
 		return PTR_ERR(mnt);
 
@@ -3004,7 +3022,7 @@ static int do_loopback(const struct path *path, const char *old_name,
 	return err;
 }
 
-static struct mnt_namespace *get_detached_copy(const struct path *path, bool recursive)
+static struct mnt_namespace *get_detached_copy(const struct path *path, unsigned int flags)
 {
 	struct mnt_namespace *ns, *mnt_ns = current->nsproxy->mnt_ns, *src_mnt_ns;
 	struct user_namespace *user_ns = mnt_ns->user_ns;
@@ -3029,7 +3047,7 @@ static struct mnt_namespace *get_detached_copy(const struct path *path, bool rec
 			ns->seq_origin = src_mnt_ns->ns.ns_id;
 	}
 
-	mnt = __do_loopback(path, recursive);
+	mnt = __do_loopback(path, flags, 0);
 	if (IS_ERR(mnt)) {
 		emptied_ns = ns;
 		return ERR_CAST(mnt);
@@ -3043,9 +3061,9 @@ static struct mnt_namespace *get_detached_copy(const struct path *path, bool rec
 	return ns;
 }
 
-static struct file *open_detached_copy(struct path *path, bool recursive)
+static struct file *open_detached_copy(struct path *path, unsigned int flags)
 {
-	struct mnt_namespace *ns = get_detached_copy(path, recursive);
+	struct mnt_namespace *ns = get_detached_copy(path, flags);
 	struct file *file;
 
 	if (IS_ERR(ns))
@@ -3061,21 +3079,122 @@ static struct file *open_detached_copy(struct path *path, bool recursive)
 	return file;
 }
 
+DEFINE_FREE(put_empty_mnt_ns, struct mnt_namespace *,
+	    if (!IS_ERR_OR_NULL(_T)) free_mnt_ns(_T))
+
+static struct mnt_namespace *create_new_namespace(struct path *path, unsigned int flags)
+{
+	struct mnt_namespace *new_ns __free(put_empty_mnt_ns) = NULL;
+	struct path to_path __free(path_put) = {};
+	struct mnt_namespace *ns = current->nsproxy->mnt_ns;
+	struct user_namespace *user_ns = current_user_ns();
+	struct mount *new_ns_root;
+	struct mount *mnt;
+	unsigned int copy_flags = 0;
+	bool locked = false;
+
+	if (user_ns != ns->user_ns)
+		copy_flags |= CL_SLAVE;
+
+	new_ns = alloc_mnt_ns(user_ns, false);
+	if (IS_ERR(new_ns))
+		return ERR_CAST(new_ns);
+
+	scoped_guard(namespace_excl) {
+		new_ns_root = clone_mnt(ns->root, ns->root->mnt.mnt_root, copy_flags);
+		if (IS_ERR(new_ns_root))
+			return ERR_CAST(new_ns_root);
+
+		/*
+		 * If the real rootfs had a locked mount on top of it somewhere
+		 * in the stack, lock the new mount tree as well so it can't be
+		 * exposed.
+		 */
+		mnt = ns->root;
+		while (mnt->overmount) {
+			mnt = mnt->overmount;
+			if (mnt->mnt.mnt_flags & MNT_LOCKED)
+				locked = true;
+		}
+	}
+
+	/*
+	 * We dropped the namespace semaphore so we can actually lock
+	 * the copy for mounting. The copied mount isn't attached to any
+	 * mount namespace and it is thus excluded from any propagation.
+	 * So realistically we're isolated and the mount can't be
+	 * overmounted.
+	 */
+
+	/* Borrow the reference from clone_mnt(). */
+	to_path.mnt = &new_ns_root->mnt;
+	to_path.dentry = dget(new_ns_root->mnt.mnt_root);
+
+	/* Now lock for actual mounting. */
+	LOCK_MOUNT_EXACT(mp, &to_path);
+	if (unlikely(IS_ERR(mp.parent)))
+		return ERR_CAST(mp.parent);
+
+	/*
+	 * We don't emulate unshare()ing a mount namespace. We stick to the
+	 * restrictions of creating detached bind-mounts. It has a lot
+	 * saner and simpler semantics.
+	 */
+	mnt = __do_loopback(path, flags, copy_flags);
+	if (IS_ERR(mnt))
+		return ERR_CAST(mnt);
+
+	scoped_guard(mount_writer) {
+		if (locked)
+			mnt->mnt.mnt_flags |= MNT_LOCKED;
+		/*
+		 * Now mount the detached tree on top of the copy of the
+		 * real rootfs we created.
+		 */
+		attach_mnt(mnt, new_ns_root, mp.mp);
+		if (user_ns != ns->user_ns)
+			lock_mnt_tree(new_ns_root);
+	}
+
+	/* Add all mounts to the new namespace. */
+	for (struct mount *p = new_ns_root; p; p = next_mnt(p, new_ns_root)) {
+		mnt_add_to_ns(new_ns, p);
+		new_ns->nr_mounts++;
+	}
+
+	new_ns->root = real_mount(no_free_ptr(to_path.mnt));
+	ns_tree_add_raw(new_ns);
+	return no_free_ptr(new_ns);
+}
+
+static struct file *open_new_namespace(struct path *path, unsigned int flags)
+{
+	struct mnt_namespace *new_ns;
+
+	new_ns = create_new_namespace(path, flags);
+	if (IS_ERR(new_ns))
+		return ERR_CAST(new_ns);
+	return open_namespace_file(to_ns_common(new_ns));
+}
+
 static struct file *vfs_open_tree(int dfd, const char __user *filename, unsigned int flags)
 {
 	int ret;
 	struct path path __free(path_put) = {};
 	int lookup_flags = LOOKUP_AUTOMOUNT | LOOKUP_FOLLOW;
-	bool detached = flags & OPEN_TREE_CLONE;
 
 	BUILD_BUG_ON(OPEN_TREE_CLOEXEC != O_CLOEXEC);
 
 	if (flags & ~(AT_EMPTY_PATH | AT_NO_AUTOMOUNT | AT_RECURSIVE |
 		      AT_SYMLINK_NOFOLLOW | OPEN_TREE_CLONE |
-		      OPEN_TREE_CLOEXEC))
+		      OPEN_TREE_CLOEXEC | OPEN_TREE_NAMESPACE))
 		return ERR_PTR(-EINVAL);
 
-	if ((flags & (AT_RECURSIVE | OPEN_TREE_CLONE)) == AT_RECURSIVE)
+	if ((flags & (AT_RECURSIVE | OPEN_TREE_CLONE | OPEN_TREE_NAMESPACE)) ==
+	    AT_RECURSIVE)
+		return ERR_PTR(-EINVAL);
+
+	if (hweight32(flags & (OPEN_TREE_CLONE | OPEN_TREE_NAMESPACE)) > 1)
 		return ERR_PTR(-EINVAL);
 
 	if (flags & AT_NO_AUTOMOUNT)
@@ -3085,15 +3204,27 @@ static struct file *vfs_open_tree(int dfd, const char __user *filename, unsigned
 	if (flags & AT_EMPTY_PATH)
 		lookup_flags |= LOOKUP_EMPTY;
 
-	if (detached && !may_mount())
+	/*
+	 * If we create a new mount namespace with the cloned mount tree we
+	 * just care about being privileged over our current user namespace.
+	 * The new mount namespace will be owned by it.
+	 */
+	if ((flags & OPEN_TREE_NAMESPACE) &&
+	    !ns_capable(current_user_ns(), CAP_SYS_ADMIN))
+		return ERR_PTR(-EPERM);
+
+	if ((flags & OPEN_TREE_CLONE) && !may_mount())
 		return ERR_PTR(-EPERM);
 
 	ret = user_path_at(dfd, filename, lookup_flags, &path);
 	if (unlikely(ret))
 		return ERR_PTR(ret);
 
-	if (detached)
-		return open_detached_copy(&path, flags & AT_RECURSIVE);
+	if (flags & OPEN_TREE_NAMESPACE)
+		return open_new_namespace(&path, flags);
+
+	if (flags & OPEN_TREE_CLONE)
+		return open_detached_copy(&path, flags);
 
 	return dentry_open(&path, O_PATH, current_cred());
 }
diff --git a/fs/nsfs.c b/fs/nsfs.c
index bf27d5da91f1..db91de208645 100644
--- a/fs/nsfs.c
+++ b/fs/nsfs.c
@@ -99,6 +99,19 @@ int ns_get_path(struct path *path, struct task_struct *task,
 	return ns_get_path_cb(path, ns_get_path_task, &args);
 }
 
+struct file *open_namespace_file(struct ns_common *ns)
+{
+	struct path path __free(path_put) = {};
+	int err;
+
+	/* call first to consume reference */
+	err = path_from_stashed(&ns->stashed, nsfs_mnt, ns, &path);
+	if (err < 0)
+		return ERR_PTR(err);
+
+	return dentry_open(&path, O_RDONLY, current_cred());
+}
+
 /**
  * open_namespace - open a namespace
  * @ns: the namespace to open
diff --git a/include/uapi/linux/mount.h b/include/uapi/linux/mount.h
index 18c624405268..d9d86598d100 100644
--- a/include/uapi/linux/mount.h
+++ b/include/uapi/linux/mount.h
@@ -61,7 +61,8 @@
 /*
  * open_tree() flags.
  */
-#define OPEN_TREE_CLONE		1		/* Clone the target tree and attach the clone */
+#define OPEN_TREE_CLONE		(1 << 0)	/* Clone the target tree and attach the clone */
+#define OPEN_TREE_NAMESPACE	(1 << 1)	/* Clone the target tree into a new mount namespace */
 #define OPEN_TREE_CLOEXEC	O_CLOEXEC	/* Close the file on execve() */
 
 /*
-- 
cgit v1.2.3


From 1e5271393d777f6159d896943b4c44c4f3ecff52 Mon Sep 17 00:00:00 2001
From: Thomas Weißschuh <thomas.weissschuh@linutronix.de>
Date: Thu, 15 Jan 2026 08:35:44 +0100
Subject: hyper-v: Mark inner union in hv_kvp_exchg_msg_value as packed
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The unpacked union within a packed struct generates alignment warnings
on clang for 32-bit ARM:

./usr/include/linux/hyperv.h:361:2: error: field  within 'struct hv_kvp_exchg_msg_value'
  is less aligned than 'union hv_kvp_exchg_msg_value::(anonymous at ./usr/include/linux/hyperv.h:361:2)'
  and is usually due to 'struct hv_kvp_exchg_msg_value' being packed,
  which can lead to unaligned accesses [-Werror,-Wunaligned-access]
     361 |         union {
         |         ^

With the recent changes to compile-test the UAPI headers in more cases,
this warning in combination with CONFIG_WERROR breaks the build.

Fix the warning.

Reported-by: kernel test robot <lkp@intel.com>
Closes: https://lore.kernel.org/oe-kbuild-all/202512140314.DzDxpIVn-lkp@intel.com/
Reported-by: Nathan Chancellor <nathan@kernel.org>
Closes: https://lore.kernel.org/linux-kbuild/20260110-uapi-test-disable-headers-arm-clang-unaligned-access-v1-1-b7b0fa541daa@kernel.org/
Suggested-by: Arnd Bergmann <arnd@arndb.de>
Link: https://lore.kernel.org/linux-kbuild/29b2e736-d462-45b7-a0a9-85f8d8a3de56@app.fastmail.com/
Signed-off-by: Thomas Weißschuh <thomas.weissschuh@linutronix.de>
Acked-by: Wei Liu (Microsoft) <wei.liu@kernel.org>
Tested-by: Nicolas Schier <nsc@kernel.org>
Reviewed-by: Nicolas Schier <nsc@kernel.org>
Acked-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Link: https://patch.msgid.link/20260115-kbuild-alignment-vbox-v1-1-076aed1623ff@linutronix.de
Signed-off-by: Nathan Chancellor <nathan@kernel.org>
---
 include/uapi/linux/hyperv.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/hyperv.h b/include/uapi/linux/hyperv.h
index aaa502a7bff4..1749b35ab2c2 100644
--- a/include/uapi/linux/hyperv.h
+++ b/include/uapi/linux/hyperv.h
@@ -362,7 +362,7 @@ struct hv_kvp_exchg_msg_value {
 		__u8 value[HV_KVP_EXCHANGE_MAX_VALUE_SIZE];
 		__u32 value_u32;
 		__u64 value_u64;
-	};
+	} __attribute__((packed));
 } __attribute__((packed));
 
 struct hv_kvp_msg_enumerate {
-- 
cgit v1.2.3


From c25d01e1c4f2d43f47af87c00e223f5ca7c71792 Mon Sep 17 00:00:00 2001
From: Thomas Weißschuh <thomas.weissschuh@linutronix.de>
Date: Thu, 15 Jan 2026 08:35:45 +0100
Subject: virt: vbox: uapi: Mark inner unions in packed structs as packed
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The unpacked unions within a packed struct generate alignment warnings
on clang for 32-bit ARM:

./usr/include/linux/vbox_vmmdev_types.h:239:4: error: field u within 'struct vmmdev_hgcm_function_parameter32'
  is less aligned than 'union (unnamed union at ./usr/include/linux/vbox_vmmdev_types.h:223:2)'
  and is usually due to 'struct vmmdev_hgcm_function_parameter32' being packed,
  which can lead to unaligned accesses [-Werror,-Wunaligned-access]
     239 |         } u;
         |           ^

./usr/include/linux/vbox_vmmdev_types.h:254:6: error: field u within
  'struct vmmdev_hgcm_function_parameter64::(anonymous union)::(unnamed at ./usr/include/linux/vbox_vmmdev_types.h:249:3)'
  is less aligned than 'union (unnamed union at ./usr/include/linux/vbox_vmmdev_types.h:251:4)' and is usually due to
  'struct vmmdev_hgcm_function_parameter64::(anonymous union)::(unnamed at ./usr/include/linux/vbox_vmmdev_types.h:249:3)'
  being packed, which can lead to unaligned accesses [-Werror,-Wunaligned-access]

With the recent changes to compile-test the UAPI headers in more cases,
these warnings in combination with CONFIG_WERROR break the build.

Fix the warnings.

Reported-by: kernel test robot <lkp@intel.com>
Closes: https://lore.kernel.org/oe-kbuild-all/202512140314.DzDxpIVn-lkp@intel.com/
Reported-by: Nathan Chancellor <nathan@kernel.org>
Closes: https://lore.kernel.org/linux-kbuild/20260110-uapi-test-disable-headers-arm-clang-unaligned-access-v1-1-b7b0fa541daa@kernel.org/
Suggested-by: Arnd Bergmann <arnd@arndb.de>
Link: https://lore.kernel.org/linux-kbuild/29b2e736-d462-45b7-a0a9-85f8d8a3de56@app.fastmail.com/
Signed-off-by: Thomas Weißschuh <thomas.weissschuh@linutronix.de>
Tested-by: Nicolas Schier <nsc@kernel.org>
Reviewed-by: Nicolas Schier <nsc@kernel.org>
Acked-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Link: https://patch.msgid.link/20260115-kbuild-alignment-vbox-v1-2-076aed1623ff@linutronix.de
Signed-off-by: Nathan Chancellor <nathan@kernel.org>
---
 include/uapi/linux/vbox_vmmdev_types.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/vbox_vmmdev_types.h b/include/uapi/linux/vbox_vmmdev_types.h
index 6073858d52a2..11f3627c3729 100644
--- a/include/uapi/linux/vbox_vmmdev_types.h
+++ b/include/uapi/linux/vbox_vmmdev_types.h
@@ -236,7 +236,7 @@ struct vmmdev_hgcm_function_parameter32 {
 			/** Relative to the request header. */
 			__u32 offset;
 		} page_list;
-	} u;
+	} __packed u;
 } __packed;
 VMMDEV_ASSERT_SIZE(vmmdev_hgcm_function_parameter32, 4 + 8);
 
@@ -251,7 +251,7 @@ struct vmmdev_hgcm_function_parameter64 {
 			union {
 				__u64 phys_addr;
 				__u64 linear_addr;
-			} u;
+			} __packed u;
 		} __packed pointer;
 		struct {
 			/** Size of the buffer described by the page list. */
-- 
cgit v1.2.3


From 150a04d817d8f5be5a4f92799827cdc8d7e45989 Mon Sep 17 00:00:00 2001
From: Bill Wendling <morbo@google.com>
Date: Fri, 16 Jan 2026 00:57:57 +0000
Subject: compiler_types.h: Attributes: Add __counted_by_ptr macro

Introduce __counted_by_ptr(), which works like __counted_by(), but for
pointer struct members.

struct foo {
	int a, b, c;
	char *buffer __counted_by_ptr(bytes);
	short nr_bars;
	struct bar *bars __counted_by_ptr(nr_bars);
	size_t bytes;
};

Because "counted_by" can only be applied to pointer members in very
recent compiler versions, its application ends up needing to be distinct
from flexible array "counted_by" annotations, hence a separate macro.

This is a reworking of Kees' previous patch [1].

Link: https://lore.kernel.org/all/20251020220118.1226740-1-kees@kernel.org/ [1]
Co-developed-by: Kees Cook <kees@kernel.org>
Signed-off-by: Bill Wendling <morbo@google.com>
Link: https://patch.msgid.link/20260116005838.2419118-1-morbo@google.com
Signed-off-by: Kees Cook <kees@kernel.org>
---
 Makefile                       |  6 ++++++
 include/linux/compiler_types.h | 18 +++++++++++++++++-
 include/uapi/linux/stddef.h    |  4 ++++
 init/Kconfig                   |  7 +++++++
 4 files changed, 34 insertions(+), 1 deletion(-)

(limited to 'include/uapi/linux')

diff --git a/Makefile b/Makefile
index 3cd00b62cde9..c0751976cdee 100644
--- a/Makefile
+++ b/Makefile
@@ -952,6 +952,12 @@ KBUILD_CFLAGS	+= $(CC_AUTO_VAR_INIT_ZERO_ENABLER)
 endif
 endif
 
+ifdef CONFIG_CC_IS_CLANG
+ifdef CONFIG_CC_HAS_COUNTED_BY_PTR
+KBUILD_CFLAGS	+= -fexperimental-late-parse-attributes
+endif
+endif
+
 # Explicitly clear padding bits during variable initialization
 KBUILD_CFLAGS += $(call cc-option,-fzero-init-padding-bits=all)
 
diff --git a/include/linux/compiler_types.h b/include/linux/compiler_types.h
index d3318a3c2577..d095beb904ea 100644
--- a/include/linux/compiler_types.h
+++ b/include/linux/compiler_types.h
@@ -369,7 +369,7 @@ struct ftrace_likely_data {
  * Optional: only supported since clang >= 18
  *
  *   gcc: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=108896
- * clang: https://github.com/llvm/llvm-project/pull/76348
+ * clang: https://clang.llvm.org/docs/AttributeReference.html#counted-by-counted-by-or-null-sized-by-sized-by-or-null
  *
  * __bdos on clang < 19.1.2 can erroneously return 0:
  * https://github.com/llvm/llvm-project/pull/110497
@@ -383,6 +383,22 @@ struct ftrace_likely_data {
 # define __counted_by(member)
 #endif
 
+/*
+ * Runtime track number of objects pointed to by a pointer member for use by
+ * CONFIG_FORTIFY_SOURCE and CONFIG_UBSAN_BOUNDS.
+ *
+ * Optional: only supported since gcc >= 16
+ * Optional: only supported since clang >= 22
+ *
+ *   gcc: https://gcc.gnu.org/pipermail/gcc-patches/2025-April/681727.html
+ * clang: https://clang.llvm.org/docs/AttributeReference.html#counted-by-counted-by-or-null-sized-by-sized-by-or-null
+ */
+#ifdef CONFIG_CC_HAS_COUNTED_BY_PTR
+#define __counted_by_ptr(member)	__attribute__((__counted_by__(member)))
+#else
+#define __counted_by_ptr(member)
+#endif
+
 /*
  * Optional: only supported since gcc >= 15
  * Optional: not supported by Clang
diff --git a/include/uapi/linux/stddef.h b/include/uapi/linux/stddef.h
index 9a28f7d9a334..111b097ec00b 100644
--- a/include/uapi/linux/stddef.h
+++ b/include/uapi/linux/stddef.h
@@ -72,6 +72,10 @@
 #define __counted_by_be(m)
 #endif
 
+#ifndef __counted_by_ptr
+#define __counted_by_ptr(m)
+#endif
+
 #ifdef __KERNEL__
 #define __kernel_nonstring	__nonstring
 #else
diff --git a/init/Kconfig b/init/Kconfig
index fa79feb8fe57..96b7cd481eaa 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -143,6 +143,13 @@ config CC_HAS_COUNTED_BY
 	# https://gcc.gnu.org/bugzilla/show_bug.cgi?id=108896
 	default y if CC_IS_GCC && GCC_VERSION >= 150100
 
+config CC_HAS_COUNTED_BY_PTR
+	bool
+	# supported since clang 22
+	default y if CC_IS_CLANG && CLANG_VERSION >= 220000
+	# supported since gcc 16.0.0
+	default y if CC_IS_GCC && GCC_VERSION >= 160000
+
 config CC_HAS_MULTIDIMENSIONAL_NONSTRING
 	def_bool $(success,echo 'char tag[][4] __attribute__((__nonstring__)) = { };' | $(CC) $(CLANG_FLAGS) -x c - -c -o /dev/null -Werror)
 
-- 
cgit v1.2.3


From ca9d74eb5f6aea6eee746aae648382332dbcf24e Mon Sep 17 00:00:00 2001
From: Thomas Weißschuh <thomas.weissschuh@linutronix.de>
Date: Tue, 13 Jan 2026 08:44:17 +0100
Subject: uapi: add INT_MAX and INT_MIN constants
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Some UAPI headers use INT_MAX and INT_MIN. Currently they include
<limits.h> for their definitions, which introduces a problematic
dependency on libc.

Add custom, namespaced definitions of INT_MAX and INT_MIN using the
same values as the regular kernel code.
These definitions are not added to uapi/linux/limits.h, as that header
will conflict with libc definitions on some platforms.

Signed-off-by: Thomas Weißschuh <thomas.weissschuh@linutronix.de>
Link: https://patch.msgid.link/20260113-uapi-limits-v2-1-93c20f4b2c1a@linutronix.de
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/uapi/linux/typelimits.h | 8 ++++++++
 1 file changed, 8 insertions(+)
 create mode 100644 include/uapi/linux/typelimits.h

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/typelimits.h b/include/uapi/linux/typelimits.h
new file mode 100644
index 000000000000..8166c639b518
--- /dev/null
+++ b/include/uapi/linux/typelimits.h
@@ -0,0 +1,8 @@
+/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
+#ifndef _UAPI_LINUX_TYPELIMITS_H
+#define _UAPI_LINUX_TYPELIMITS_H
+
+#define __KERNEL_INT_MAX ((int)(~0U >> 1))
+#define __KERNEL_INT_MIN (-__KERNEL_INT_MAX - 1)
+
+#endif /* _UAPI_LINUX_TYPELIMITS_H */
-- 
cgit v1.2.3


From a8a11e5237aed71b7f5f9d33c554ef06fe974311 Mon Sep 17 00:00:00 2001
From: Thomas Weißschuh <thomas.weissschuh@linutronix.de>
Date: Tue, 13 Jan 2026 08:44:18 +0100
Subject: ethtool: uapi: Use UAPI definition of INT_MAX
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Using <limits.h> to gain access to INT_MAX introduces a dependency on a
libc, which UAPI headers should not do.

Use the equivalent UAPI constant.

Signed-off-by: Thomas Weißschuh <thomas.weissschuh@linutronix.de>
Link: https://patch.msgid.link/20260113-uapi-limits-v2-2-93c20f4b2c1a@linutronix.de
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/uapi/linux/ethtool.h | 7 ++-----
 1 file changed, 2 insertions(+), 5 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/ethtool.h b/include/uapi/linux/ethtool.h
index bbfe6e1cf01b..ce9aeb65a8e1 100644
--- a/include/uapi/linux/ethtool.h
+++ b/include/uapi/linux/ethtool.h
@@ -15,13 +15,10 @@
 #define _UAPI_LINUX_ETHTOOL_H
 
 #include <linux/const.h>
+#include <linux/typelimits.h>
 #include <linux/types.h>
 #include <linux/if_ether.h>
 
-#ifndef __KERNEL__
-#include <limits.h> /* for INT_MAX */
-#endif
-
 /* All structures exposed to userland should be defined such that they
  * have the same layout for 32-bit and 64-bit userland.
  */
@@ -2216,7 +2213,7 @@ enum ethtool_link_mode_bit_indices {
 
 static inline int ethtool_validate_speed(__u32 speed)
 {
-	return speed <= INT_MAX || speed == (__u32)SPEED_UNKNOWN;
+	return speed <= __KERNEL_INT_MAX || speed == (__u32)SPEED_UNKNOWN;
 }
 
 /* Duplex, half or full. */
-- 
cgit v1.2.3


From 0b3877bec78b0f26a280078a15f8992426de1db7 Mon Sep 17 00:00:00 2001
From: Thomas Weißschuh <thomas.weissschuh@linutronix.de>
Date: Tue, 13 Jan 2026 08:44:19 +0100
Subject: netfilter: uapi: Use UAPI definition of INT_MAX and INT_MIN
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Using <limits.h> to gain access to INT_MAX and INT_MIN introduces a
dependency on a libc, which UAPI headers should not do.

Use the equivalent UAPI constants.

Signed-off-by: Thomas Weißschuh <thomas.weissschuh@linutronix.de>
Link: https://patch.msgid.link/20260113-uapi-limits-v2-3-93c20f4b2c1a@linutronix.de
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/uapi/linux/netfilter_bridge.h | 9 +++------
 include/uapi/linux/netfilter_ipv4.h   | 9 ++++-----
 include/uapi/linux/netfilter_ipv6.h   | 7 +++----
 3 files changed, 10 insertions(+), 15 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/netfilter_bridge.h b/include/uapi/linux/netfilter_bridge.h
index 1610fdbab98d..f6e8d1e05c97 100644
--- a/include/uapi/linux/netfilter_bridge.h
+++ b/include/uapi/linux/netfilter_bridge.h
@@ -10,10 +10,7 @@
 #include <linux/if_ether.h>
 #include <linux/if_vlan.h>
 #include <linux/if_pppox.h>
-
-#ifndef __KERNEL__
-#include <limits.h> /* for INT_MIN, INT_MAX */
-#endif
+#include <linux/typelimits.h>
 
 /* Bridge Hooks */
 /* After promisc drops, checksum checks. */
@@ -31,14 +28,14 @@
 #define NF_BR_NUMHOOKS		6
 
 enum nf_br_hook_priorities {
-	NF_BR_PRI_FIRST = INT_MIN,
+	NF_BR_PRI_FIRST = __KERNEL_INT_MIN,
 	NF_BR_PRI_NAT_DST_BRIDGED = -300,
 	NF_BR_PRI_FILTER_BRIDGED = -200,
 	NF_BR_PRI_BRNF = 0,
 	NF_BR_PRI_NAT_DST_OTHER = 100,
 	NF_BR_PRI_FILTER_OTHER = 200,
 	NF_BR_PRI_NAT_SRC = 300,
-	NF_BR_PRI_LAST = INT_MAX,
+	NF_BR_PRI_LAST = __KERNEL_INT_MAX,
 };
 
 #endif /* _UAPI__LINUX_BRIDGE_NETFILTER_H */
diff --git a/include/uapi/linux/netfilter_ipv4.h b/include/uapi/linux/netfilter_ipv4.h
index 155e77d6a42d..439d3c59862b 100644
--- a/include/uapi/linux/netfilter_ipv4.h
+++ b/include/uapi/linux/netfilter_ipv4.h
@@ -7,12 +7,11 @@
 
 
 #include <linux/netfilter.h>
+#include <linux/typelimits.h>
 
 /* only for userspace compatibility */
 #ifndef __KERNEL__
 
-#include <limits.h> /* for INT_MIN, INT_MAX */
-
 /* IP Hooks */
 /* After promisc drops, checksum checks. */
 #define NF_IP_PRE_ROUTING	0
@@ -28,7 +27,7 @@
 #endif /* ! __KERNEL__ */
 
 enum nf_ip_hook_priorities {
-	NF_IP_PRI_FIRST = INT_MIN,
+	NF_IP_PRI_FIRST = __KERNEL_INT_MIN,
 	NF_IP_PRI_RAW_BEFORE_DEFRAG = -450,
 	NF_IP_PRI_CONNTRACK_DEFRAG = -400,
 	NF_IP_PRI_RAW = -300,
@@ -41,8 +40,8 @@ enum nf_ip_hook_priorities {
 	NF_IP_PRI_NAT_SRC = 100,
 	NF_IP_PRI_SELINUX_LAST = 225,
 	NF_IP_PRI_CONNTRACK_HELPER = 300,
-	NF_IP_PRI_CONNTRACK_CONFIRM = INT_MAX,
-	NF_IP_PRI_LAST = INT_MAX,
+	NF_IP_PRI_CONNTRACK_CONFIRM = __KERNEL_INT_MAX,
+	NF_IP_PRI_LAST = __KERNEL_INT_MAX,
 };
 
 /* Arguments for setsockopt SOL_IP: */
diff --git a/include/uapi/linux/netfilter_ipv6.h b/include/uapi/linux/netfilter_ipv6.h
index 80aa9b0799af..0e40d00b37fa 100644
--- a/include/uapi/linux/netfilter_ipv6.h
+++ b/include/uapi/linux/netfilter_ipv6.h
@@ -10,12 +10,11 @@
 
 
 #include <linux/netfilter.h>
+#include <linux/typelimits.h>
 
 /* only for userspace compatibility */
 #ifndef __KERNEL__
 
-#include <limits.h> /* for INT_MIN, INT_MAX */
-
 /* IP6 Hooks */
 /* After promisc drops, checksum checks. */
 #define NF_IP6_PRE_ROUTING	0
@@ -32,7 +31,7 @@
 
 
 enum nf_ip6_hook_priorities {
-	NF_IP6_PRI_FIRST = INT_MIN,
+	NF_IP6_PRI_FIRST = __KERNEL_INT_MIN,
 	NF_IP6_PRI_RAW_BEFORE_DEFRAG = -450,
 	NF_IP6_PRI_CONNTRACK_DEFRAG = -400,
 	NF_IP6_PRI_RAW = -300,
@@ -45,7 +44,7 @@ enum nf_ip6_hook_priorities {
 	NF_IP6_PRI_NAT_SRC = 100,
 	NF_IP6_PRI_SELINUX_LAST = 225,
 	NF_IP6_PRI_CONNTRACK_HELPER = 300,
-	NF_IP6_PRI_LAST = INT_MAX,
+	NF_IP6_PRI_LAST = __KERNEL_INT_MAX,
 };
 
 
-- 
cgit v1.2.3


From 7d8b06ecc45bd679dec58d2cc2bd86223d4e076d Mon Sep 17 00:00:00 2001
From: Suravee Suthikulpanit <suravee.suthikulpanit@amd.com>
Date: Thu, 15 Jan 2026 06:08:02 +0000
Subject: iommu/amd: Add support for hw_info for iommu capability query

AMD IOMMU Extended Feature (EFR) and Extended Feature 2 (EFR2) registers
specify features supported by each IOMMU hardware instance.
The IOMMU driver checks the feature-specific bits before enabling
each feature at run time.

For IOMMUFD, the hypervisor passes the raw value of amd_iommu_efr and
amd_iommu_efr2 to VMM via iommufd IOMMU_DEVICE_GET_HW_INFO ioctl.

Reviewed-by: Nicolin Chen <nicolinc@nvidia.com>
Reviewed-by: Vasant Hegde <vasant.hegde@amd.com>
Reviewed-by: Jason Gunthorpe <jgg@nvidia.com>
Signed-off-by: Suravee Suthikulpanit <suravee.suthikulpanit@amd.com>
Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>
---
 drivers/iommu/amd/Kconfig    | 10 ++++++++++
 drivers/iommu/amd/Makefile   |  1 +
 drivers/iommu/amd/iommu.c    |  2 ++
 drivers/iommu/amd/iommufd.c  | 31 +++++++++++++++++++++++++++++++
 drivers/iommu/amd/iommufd.h  | 15 +++++++++++++++
 include/uapi/linux/iommufd.h | 28 ++++++++++++++++++++++++++++
 6 files changed, 87 insertions(+)
 create mode 100644 drivers/iommu/amd/iommufd.c
 create mode 100644 drivers/iommu/amd/iommufd.h

(limited to 'include/uapi/linux')

diff --git a/drivers/iommu/amd/Kconfig b/drivers/iommu/amd/Kconfig
index f2acf471cb5d..588355ff7eb7 100644
--- a/drivers/iommu/amd/Kconfig
+++ b/drivers/iommu/amd/Kconfig
@@ -30,6 +30,16 @@ config AMD_IOMMU
 	  your BIOS for an option to enable it or if you have an IVRS ACPI
 	  table.
 
+config AMD_IOMMU_IOMMUFD
+	bool "Enable IOMMUFD features for AMD IOMMU (EXPERIMENTAL)"
+	depends on IOMMUFD
+	depends on AMD_IOMMU
+	help
+	  Support for IOMMUFD features intended to support virtual machines
+	  with accelerated virtual IOMMUs.
+
+	  Say Y here if you are doing development and testing on this feature.
+
 config AMD_IOMMU_DEBUGFS
 	bool "Enable AMD IOMMU internals in DebugFS"
 	depends on AMD_IOMMU && IOMMU_DEBUGFS
diff --git a/drivers/iommu/amd/Makefile b/drivers/iommu/amd/Makefile
index 5412a563c697..41f053b49dce 100644
--- a/drivers/iommu/amd/Makefile
+++ b/drivers/iommu/amd/Makefile
@@ -1,3 +1,4 @@
 # SPDX-License-Identifier: GPL-2.0-only
 obj-y += iommu.o init.o quirks.o ppr.o pasid.o
+obj-$(CONFIG_AMD_IOMMU_IOMMUFD) += iommufd.o
 obj-$(CONFIG_AMD_IOMMU_DEBUGFS) += debugfs.o
diff --git a/drivers/iommu/amd/iommu.c b/drivers/iommu/amd/iommu.c
index d7f457338de7..d550a7e431ac 100644
--- a/drivers/iommu/amd/iommu.c
+++ b/drivers/iommu/amd/iommu.c
@@ -43,6 +43,7 @@
 #include <linux/generic_pt/iommu.h>
 
 #include "amd_iommu.h"
+#include "iommufd.h"
 #include "../irq_remapping.h"
 #include "../iommu-pages.h"
 
@@ -3083,6 +3084,7 @@ static bool amd_iommu_enforce_cache_coherency(struct iommu_domain *domain)
 
 const struct iommu_ops amd_iommu_ops = {
 	.capable = amd_iommu_capable,
+	.hw_info = amd_iommufd_hw_info,
 	.blocked_domain = &blocked_domain,
 	.release_domain = &blocked_domain,
 	.identity_domain = &identity_domain.domain,
diff --git a/drivers/iommu/amd/iommufd.c b/drivers/iommu/amd/iommufd.c
new file mode 100644
index 000000000000..72eaaa923d04
--- /dev/null
+++ b/drivers/iommu/amd/iommufd.c
@@ -0,0 +1,31 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (C) 2025 Advanced Micro Devices, Inc.
+ */
+
+#include <linux/iommu.h>
+
+#include "iommufd.h"
+#include "amd_iommu.h"
+#include "amd_iommu_types.h"
+
+void *amd_iommufd_hw_info(struct device *dev, u32 *length, u32 *type)
+{
+	struct iommu_hw_info_amd *hwinfo;
+
+	if (*type != IOMMU_HW_INFO_TYPE_DEFAULT &&
+	    *type != IOMMU_HW_INFO_TYPE_AMD)
+		return ERR_PTR(-EOPNOTSUPP);
+
+	hwinfo = kzalloc(sizeof(*hwinfo), GFP_KERNEL);
+	if (!hwinfo)
+		return ERR_PTR(-ENOMEM);
+
+	*length = sizeof(*hwinfo);
+	*type = IOMMU_HW_INFO_TYPE_AMD;
+
+	hwinfo->efr = amd_iommu_efr;
+	hwinfo->efr2 = amd_iommu_efr2;
+
+	return hwinfo;
+}
diff --git a/drivers/iommu/amd/iommufd.h b/drivers/iommu/amd/iommufd.h
new file mode 100644
index 000000000000..f880be80a30d
--- /dev/null
+++ b/drivers/iommu/amd/iommufd.h
@@ -0,0 +1,15 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright (C) 2025 Advanced Micro Devices, Inc.
+ */
+
+#ifndef AMD_IOMMUFD_H
+#define AMD_IOMMUFD_H
+
+#if IS_ENABLED(CONFIG_AMD_IOMMU_IOMMUFD)
+void *amd_iommufd_hw_info(struct device *dev, u32 *length, u32 *type);
+#else
+#define amd_iommufd_hw_info NULL
+#endif /* CONFIG_AMD_IOMMU_IOMMUFD */
+
+#endif /* AMD_IOMMUFD_H */
diff --git a/include/uapi/linux/iommufd.h b/include/uapi/linux/iommufd.h
index 2c41920b641d..3db37f6042a0 100644
--- a/include/uapi/linux/iommufd.h
+++ b/include/uapi/linux/iommufd.h
@@ -623,6 +623,32 @@ struct iommu_hw_info_tegra241_cmdqv {
 	__u8 __reserved;
 };
 
+/**
+ * struct iommu_hw_info_amd - AMD IOMMU device info
+ *
+ * @efr : Value of AMD IOMMU Extended Feature Register (EFR)
+ * @efr2: Value of AMD IOMMU Extended Feature 2 Register (EFR2)
+ *
+ * Please see the description of these registers in the following sections of
+ * the AMD I/O Virtualization Technology (IOMMU) Specification.
+ * (https://docs.amd.com/v/u/en-US/48882_3.10_PUB)
+ *
+ * - MMIO Offset 0030h IOMMU Extended Feature Register
+ * - MMIO Offset 01A0h IOMMU Extended Feature 2 Register
+ *
+ * Note: The EFR and EFR2 are raw values reported by hardware.
+ * VMM is responsible to determine the appropriate flags to be exposed to
+ * the VM since certain features are not currently supported by the kernel
+ * for HW-vIOMMU.
+ *
+ * Current VMM-allowed list of feature flags are:
+ * - EFR[GTSup, GASup, GioSup, PPRSup, EPHSup, GATS, GLX, PASmax]
+ */
+struct iommu_hw_info_amd {
+	__aligned_u64 efr;
+	__aligned_u64 efr2;
+};
+
 /**
  * enum iommu_hw_info_type - IOMMU Hardware Info Types
  * @IOMMU_HW_INFO_TYPE_NONE: Output by the drivers that do not report hardware
@@ -632,6 +658,7 @@ struct iommu_hw_info_tegra241_cmdqv {
  * @IOMMU_HW_INFO_TYPE_ARM_SMMUV3: ARM SMMUv3 iommu info type
  * @IOMMU_HW_INFO_TYPE_TEGRA241_CMDQV: NVIDIA Tegra241 CMDQV (extension for ARM
  *                                     SMMUv3) info type
+ * @IOMMU_HW_INFO_TYPE_AMD: AMD IOMMU info type
  */
 enum iommu_hw_info_type {
 	IOMMU_HW_INFO_TYPE_NONE = 0,
@@ -639,6 +666,7 @@ enum iommu_hw_info_type {
 	IOMMU_HW_INFO_TYPE_INTEL_VTD = 1,
 	IOMMU_HW_INFO_TYPE_ARM_SMMUV3 = 2,
 	IOMMU_HW_INFO_TYPE_TEGRA241_CMDQV = 3,
+	IOMMU_HW_INFO_TYPE_AMD = 4,
 };
 
 /**
-- 
cgit v1.2.3


From e05698c10d980ac0a0b57ed81ec9353b9e9533c6 Mon Sep 17 00:00:00 2001
From: Suravee Suthikulpanit <suravee.suthikulpanit@amd.com>
Date: Thu, 15 Jan 2026 06:08:06 +0000
Subject: iommufd: Introduce data struct for AMD nested domain allocation

Introduce IOMMU_HWPT_DATA_AMD_GUEST data type for IOMMU guest page table,
which is used for stage-1 in nested translation. The data structure
contains information necessary for setting up the AMD HW-vIOMMU support.

Reviewed-by: Jason Gunthorpe <jgg@nvidia.com>
Reviewed-by: Nicolin Chen <nicolinc@nvidia.com>
Reviewed-by: Vasant Hegde <vasant.hegde@amd.com>
Signed-off-by: Suravee Suthikulpanit <suravee.suthikulpanit@amd.com>
Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>
---
 include/uapi/linux/iommufd.h | 11 +++++++++++
 1 file changed, 11 insertions(+)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/iommufd.h b/include/uapi/linux/iommufd.h
index 3db37f6042a0..1dafbc552d37 100644
--- a/include/uapi/linux/iommufd.h
+++ b/include/uapi/linux/iommufd.h
@@ -465,16 +465,27 @@ struct iommu_hwpt_arm_smmuv3 {
 	__aligned_le64 ste[2];
 };
 
+/**
+ * struct iommu_hwpt_amd_guest - AMD IOMMU guest I/O page table data
+ *				 (IOMMU_HWPT_DATA_AMD_GUEST)
+ * @dte: Guest Device Table Entry (DTE)
+ */
+struct iommu_hwpt_amd_guest {
+	__aligned_u64 dte[4];
+};
+
 /**
  * enum iommu_hwpt_data_type - IOMMU HWPT Data Type
  * @IOMMU_HWPT_DATA_NONE: no data
  * @IOMMU_HWPT_DATA_VTD_S1: Intel VT-d stage-1 page table
  * @IOMMU_HWPT_DATA_ARM_SMMUV3: ARM SMMUv3 Context Descriptor Table
+ * @IOMMU_HWPT_DATA_AMD_GUEST: AMD IOMMU guest page table
  */
 enum iommu_hwpt_data_type {
 	IOMMU_HWPT_DATA_NONE = 0,
 	IOMMU_HWPT_DATA_VTD_S1 = 1,
 	IOMMU_HWPT_DATA_ARM_SMMUV3 = 2,
+	IOMMU_HWPT_DATA_AMD_GUEST = 3,
 };
 
 /**
-- 
cgit v1.2.3


From cd16edba1c6a24af138e1a5ded2711231fffa99f Mon Sep 17 00:00:00 2001
From: Arnd Bergmann <arnd@arndb.de>
Date: Thu, 4 Dec 2025 11:19:10 +0100
Subject: ext4: fix ext4_tune_sb_params padding

The padding at the end of struct ext4_tune_sb_params is architecture
specific and in particular is different between x86-32 and x86-64,
since the __u64 member only enforces struct alignment on the latter.

This shows up as a new warning when test-building the headers with
-Wpadded:

include/linux/ext4.h:144:1: error: padding struct size to alignment boundary with 4 bytes [-Werror=padded]

All members inside the structure are naturally aligned, so the only
difference here is the amount of padding at the end. Make the padding
explicit, to have a consistent sizeof(struct ext4_tune_sb_params) of
232 on all architectures and avoid adding compat ioctl handling for
EXT4_IOC_GET_TUNE_SB_PARAM/EXT4_IOC_SET_TUNE_SB_PARAM.

This is an ABI break on x86-32 but hopefully this can go into 6.18.y early
enough as a fixup so no actual users will be affected.  Alternatively, the
kernel could handle the ioctl commands for both sizes (232 and 228 bytes)
on all architectures.

Fixes: 04a91570ac67 ("ext4: implemet new ioctls to set and get superblock parameters")
Signed-off-by: Arnd Bergmann <arnd@arndb.de>
Reviewed-by: Jan Kara <jack@suse.cz>
Link: https://patch.msgid.link/20251204101914.1037148-1-arnd@kernel.org
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
Cc: stable@kernel.org
---
 include/uapi/linux/ext4.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/ext4.h b/include/uapi/linux/ext4.h
index 411dcc1e4a35..9c683991c32f 100644
--- a/include/uapi/linux/ext4.h
+++ b/include/uapi/linux/ext4.h
@@ -139,7 +139,7 @@ struct ext4_tune_sb_params {
 	__u32 clear_feature_incompat_mask;
 	__u32 clear_feature_ro_compat_mask;
 	__u8  mount_opts[64];
-	__u8  pad[64];
+	__u8  pad[68];
 };
 
 #define EXT4_TUNE_FL_ERRORS_BEHAVIOR	0x00000001
-- 
cgit v1.2.3


From 8a42938a28941da29bf3e4cd2af877b0d5d929e1 Mon Sep 17 00:00:00 2001
From: Miri Korenblit <miriam.rachel.korenblit@intel.com>
Date: Wed, 7 Jan 2026 14:22:54 +0200
Subject: wifi: nl80211: ignore cluster id after NAN started

After NAN was started, cluster id updates from the user space should not
happen, since the device already started a cluster with the
previously provided id.

Since NL80211_CMD_CHANGE_NAN_CONFIG requires to set the full NAN
configuration, we can't require that NL80211_NAN_CONF_CLUSTER_ID won't
be included in this command, and keeping the last configured value just
to be able to compare it against the new one seems a bit overkill.

Therefore, just ignore cluster id in this command and clarify the
documentation.

Signed-off-by: Miri Korenblit <miriam.rachel.korenblit@intel.com>
Link: https://patch.msgid.link/20260107142229.fb55e5853269.I10d18c8f69d98b28916596d6da4207c15ea4abb5@changeid
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/uapi/linux/nl80211.h |  4 +++-
 net/wireless/nl80211.c       | 11 ++++++-----
 2 files changed, 9 insertions(+), 6 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h
index eb92296457c9..b0f050e36fa4 100644
--- a/include/uapi/linux/nl80211.h
+++ b/include/uapi/linux/nl80211.h
@@ -11,7 +11,7 @@
  * Copyright 2008 Jouni Malinen <jouni.malinen@atheros.com>
  * Copyright 2008 Colin McCabe <colin@cozybit.com>
  * Copyright 2015-2017	Intel Deutschland GmbH
- * Copyright (C) 2018-2025 Intel Corporation
+ * Copyright (C) 2018-2026 Intel Corporation
  *
  * Permission to use, copy, modify, and/or distribute this software for any
  * purpose with or without fee is hereby granted, provided that the above
@@ -7454,6 +7454,8 @@ enum nl80211_nan_band_conf_attributes {
  *	address that can take values from 50-6F-9A-01-00-00 to
  *	50-6F-9A-01-FF-FF. This attribute is optional. If not present,
  *	a random Cluster ID will be chosen.
+ *	This attribute will be ignored in NL80211_CMD_CHANGE_NAN_CONFIG
+ *	since after NAN was started, the cluster ID can no longer change.
  * @NL80211_NAN_CONF_EXTRA_ATTRS: Additional NAN attributes to be
  *	published in the beacons. This is an optional byte array.
  * @NL80211_NAN_CONF_VENDOR_ELEMS: Vendor-specific elements that will
diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c
index bcf30c5f5042..56cc5ed33ea3 100644
--- a/net/wireless/nl80211.c
+++ b/net/wireless/nl80211.c
@@ -5,7 +5,7 @@
  * Copyright 2006-2010	Johannes Berg <johannes@sipsolutions.net>
  * Copyright 2013-2014  Intel Mobile Communications GmbH
  * Copyright 2015-2017	Intel Deutschland GmbH
- * Copyright (C) 2018-2025 Intel Corporation
+ * Copyright (C) 2018-2026 Intel Corporation
  */
 
 #include <linux/if.h>
@@ -15583,7 +15583,8 @@ static int nl80211_parse_nan_band_config(struct wiphy *wiphy,
 static int nl80211_parse_nan_conf(struct wiphy *wiphy,
 				  struct genl_info *info,
 				  struct cfg80211_nan_conf *conf,
-				  u32 *changed_flags)
+				  u32 *changed_flags,
+				  bool start)
 {
 	struct nlattr *attrs[NL80211_NAN_CONF_ATTR_MAX + 1];
 	int err, rem;
@@ -15630,7 +15631,7 @@ static int nl80211_parse_nan_conf(struct wiphy *wiphy,
 		return err;
 
 	changed |= CFG80211_NAN_CONF_CHANGED_CONFIG;
-	if (attrs[NL80211_NAN_CONF_CLUSTER_ID])
+	if (attrs[NL80211_NAN_CONF_CLUSTER_ID] && start)
 		conf->cluster_id =
 			nla_data(attrs[NL80211_NAN_CONF_CLUSTER_ID]);
 
@@ -15741,7 +15742,7 @@ static int nl80211_start_nan(struct sk_buff *skb, struct genl_info *info)
 	if (!info->attrs[NL80211_ATTR_NAN_MASTER_PREF])
 		return -EINVAL;
 
-	err = nl80211_parse_nan_conf(&rdev->wiphy, info, &conf, NULL);
+	err = nl80211_parse_nan_conf(&rdev->wiphy, info, &conf, NULL, true);
 	if (err)
 		return err;
 
@@ -16107,7 +16108,7 @@ static int nl80211_nan_change_config(struct sk_buff *skb,
 	if (!wdev_running(wdev))
 		return -ENOTCONN;
 
-	err = nl80211_parse_nan_conf(&rdev->wiphy, info, &conf, &changed);
+	err = nl80211_parse_nan_conf(&rdev->wiphy, info, &conf, &changed, false);
 	if (err)
 		return err;
 
-- 
cgit v1.2.3


From a9927022c4491ba44249af079e8799ce56f8053c Mon Sep 17 00:00:00 2001
From: Mika Westerberg <mika.westerberg@linux.intel.com>
Date: Thu, 15 Jan 2026 12:56:44 +0100
Subject: net: ethtool: Add support for 80Gbps speed

USB4 v2 link used in peer-to-peer networking is symmetric 80Gbps so in
order to support reading this link speed, add support for it to ethtool.

Signed-off-by: Mika Westerberg <mika.westerberg@linux.intel.com>
Reviewed-by: Andrew Lunn <andrew@lunn.ch>
Link: https://patch.msgid.link/20260115115646.328898-3-mika.westerberg@linux.intel.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 drivers/net/phy/phy-caps.h   | 1 +
 drivers/net/phy/phy-core.c   | 2 ++
 drivers/net/phy/phy_caps.c   | 2 ++
 drivers/net/phy/phylink.c    | 1 +
 include/linux/phylink.h      | 7 ++++---
 include/uapi/linux/ethtool.h | 1 +
 6 files changed, 11 insertions(+), 3 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/drivers/net/phy/phy-caps.h b/drivers/net/phy/phy-caps.h
index 5f3f757e0b2f..421088e6f6e8 100644
--- a/drivers/net/phy/phy-caps.h
+++ b/drivers/net/phy/phy-caps.h
@@ -25,6 +25,7 @@ enum {
 	LINK_CAPA_40000FD,
 	LINK_CAPA_50000FD,
 	LINK_CAPA_56000FD,
+	LINK_CAPA_80000FD,
 	LINK_CAPA_100000FD,
 	LINK_CAPA_200000FD,
 	LINK_CAPA_400000FD,
diff --git a/drivers/net/phy/phy-core.c b/drivers/net/phy/phy-core.c
index 3badf6e84554..d7a4a977fc8a 100644
--- a/drivers/net/phy/phy-core.c
+++ b/drivers/net/phy/phy-core.c
@@ -48,6 +48,8 @@ const char *phy_speed_to_str(int speed)
 		return "50Gbps";
 	case SPEED_56000:
 		return "56Gbps";
+	case SPEED_80000:
+		return "80Gbps";
 	case SPEED_100000:
 		return "100Gbps";
 	case SPEED_200000:
diff --git a/drivers/net/phy/phy_caps.c b/drivers/net/phy/phy_caps.c
index 17a63c931335..942d43191561 100644
--- a/drivers/net/phy/phy_caps.c
+++ b/drivers/net/phy/phy_caps.c
@@ -21,6 +21,7 @@ static struct link_capabilities link_caps[__LINK_CAPA_MAX] __ro_after_init = {
 	{ SPEED_40000, DUPLEX_FULL, {0} }, /* LINK_CAPA_40000FD */
 	{ SPEED_50000, DUPLEX_FULL, {0} }, /* LINK_CAPA_50000FD */
 	{ SPEED_56000, DUPLEX_FULL, {0} }, /* LINK_CAPA_56000FD */
+	{ SPEED_80000, DUPLEX_FULL, {0} }, /* LINK_CAPA_80000FD */
 	{ SPEED_100000, DUPLEX_FULL, {0} }, /* LINK_CAPA_100000FD */
 	{ SPEED_200000, DUPLEX_FULL, {0} }, /* LINK_CAPA_200000FD */
 	{ SPEED_400000, DUPLEX_FULL, {0} }, /* LINK_CAPA_400000FD */
@@ -49,6 +50,7 @@ static int speed_duplex_to_capa(int speed, unsigned int duplex)
 	case SPEED_40000: return LINK_CAPA_40000FD;
 	case SPEED_50000: return LINK_CAPA_50000FD;
 	case SPEED_56000: return LINK_CAPA_56000FD;
+	case SPEED_80000: return LINK_CAPA_80000FD;
 	case SPEED_100000: return LINK_CAPA_100000FD;
 	case SPEED_200000: return LINK_CAPA_200000FD;
 	case SPEED_400000: return LINK_CAPA_400000FD;
diff --git a/drivers/net/phy/phylink.c b/drivers/net/phy/phylink.c
index 43d8380aaefb..c8fd6b91cdd4 100644
--- a/drivers/net/phy/phylink.c
+++ b/drivers/net/phy/phylink.c
@@ -311,6 +311,7 @@ static struct {
 	{ MAC_400000FD, SPEED_400000, DUPLEX_FULL, BIT(LINK_CAPA_400000FD) },
 	{ MAC_200000FD, SPEED_200000, DUPLEX_FULL, BIT(LINK_CAPA_200000FD) },
 	{ MAC_100000FD, SPEED_100000, DUPLEX_FULL, BIT(LINK_CAPA_100000FD) },
+	{ MAC_80000FD,  SPEED_80000,  DUPLEX_FULL, BIT(LINK_CAPA_80000FD) },
 	{ MAC_56000FD,  SPEED_56000,  DUPLEX_FULL, BIT(LINK_CAPA_56000FD) },
 	{ MAC_50000FD,  SPEED_50000,  DUPLEX_FULL, BIT(LINK_CAPA_50000FD) },
 	{ MAC_40000FD,  SPEED_40000,  DUPLEX_FULL, BIT(LINK_CAPA_40000FD) },
diff --git a/include/linux/phylink.h b/include/linux/phylink.h
index 38363e566ac3..20996f5778d1 100644
--- a/include/linux/phylink.h
+++ b/include/linux/phylink.h
@@ -90,9 +90,10 @@ enum {
 	MAC_40000FD	= BIT(13),
 	MAC_50000FD	= BIT(14),
 	MAC_56000FD	= BIT(15),
-	MAC_100000FD	= BIT(16),
-	MAC_200000FD	= BIT(17),
-	MAC_400000FD	= BIT(18),
+	MAC_80000FD	= BIT(16),
+	MAC_100000FD	= BIT(17),
+	MAC_200000FD	= BIT(18),
+	MAC_400000FD	= BIT(19),
 };
 
 static inline bool phylink_autoneg_inband(unsigned int mode)
diff --git a/include/uapi/linux/ethtool.h b/include/uapi/linux/ethtool.h
index ce9aeb65a8e1..b74b80508553 100644
--- a/include/uapi/linux/ethtool.h
+++ b/include/uapi/linux/ethtool.h
@@ -2203,6 +2203,7 @@ enum ethtool_link_mode_bit_indices {
 #define SPEED_40000		40000
 #define SPEED_50000		50000
 #define SPEED_56000		56000
+#define SPEED_80000		80000
 #define SPEED_100000		100000
 #define SPEED_200000		200000
 #define SPEED_400000		400000
-- 
cgit v1.2.3


From 50b359896fe55d0443ed550e1fabba71d242031a Mon Sep 17 00:00:00 2001
From: Benjamin Berg <benjamin.berg@intel.com>
Date: Sun, 18 Jan 2026 09:51:15 +0200
Subject: wifi: cfg80211: ignore link disabled flag from userspace

When the AP has an advertised TID to Link Mapping (TTLM) it shall
include the element in the association response. As such, when this
element is present it needs to be used for the currently dormant links.
See Draft P802.11REVmf_D1.0 section 35.3.7.2.3 ("Negotiation of TTLM")
for the details. The flag is also not usable in case userspace wants to
specify a negotiated TTLM during association.

Note that for the link reconfiguration case, mac80211 did not use the
information. Draft P802.11REVmf_D1.0 states in section 35.3.6.4 ("Link
reconfiguration to the setup links") that we "shall operate with all the
TIDs mapped to the newly added links ..."

All this means that the flag is not needed. The implementation should
parse the information from the association response.

Signed-off-by: Benjamin Berg <benjamin.berg@intel.com>
Reviewed-by: Johannes Berg <johannes.berg@intel.com>
Reviewed-by: Ilan Peer <ilan.peer@intel.com>
Signed-off-by: Miri Korenblit <miriam.rachel.korenblit@intel.com>
Link: https://patch.msgid.link/20260118093904.754e057896a5.Ifd06f5ef839a93bfd54d0593dc932870f95f3242@changeid
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/net/cfg80211.h       |  3 ---
 include/uapi/linux/nl80211.h |  5 +++--
 net/wireless/nl80211.c       | 10 ----------
 3 files changed, 3 insertions(+), 15 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h
index 899f267b7cf9..2900202588a5 100644
--- a/include/net/cfg80211.h
+++ b/include/net/cfg80211.h
@@ -3221,8 +3221,6 @@ struct cfg80211_auth_request {
  *	if this is %NULL for a link, that link is not requested
  * @elems: extra elements for the per-STA profile for this link
  * @elems_len: length of the elements
- * @disabled: If set this link should be included during association etc. but it
- *	should not be used until enabled by the AP MLD.
  * @error: per-link error code, must be <= 0. If there is an error, then the
  *	operation as a whole must fail.
  */
@@ -3230,7 +3228,6 @@ struct cfg80211_assoc_link {
 	struct cfg80211_bss *bss;
 	const u8 *elems;
 	size_t elems_len;
-	bool disabled;
 	int error;
 };
 
diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h
index 8134f10e4e6c..8433bac48112 100644
--- a/include/uapi/linux/nl80211.h
+++ b/include/uapi/linux/nl80211.h
@@ -2880,8 +2880,9 @@ enum nl80211_commands {
  *	index. If the userspace includes more RNR elements than number of
  *	MBSSID elements then these will be added in every EMA beacon.
  *
- * @NL80211_ATTR_MLO_LINK_DISABLED: Flag attribute indicating that the link is
- *	disabled.
+ * @NL80211_ATTR_MLO_LINK_DISABLED: Unused. It was used to indicate that a link
+ *	is disabled during association. However, the AP will send the
+ *	information by including a TTLM in the association response.
  *
  * @NL80211_ATTR_BSS_DUMP_INCLUDE_USE_DATA: Include BSS usage data, i.e.
  *	include BSSes that can only be used in restricted scenarios and/or
diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c
index c961cd42a832..03efd45c007f 100644
--- a/net/wireless/nl80211.c
+++ b/net/wireless/nl80211.c
@@ -12241,9 +12241,6 @@ static int nl80211_process_links(struct cfg80211_registered_device *rdev,
 				return -EINVAL;
 			}
 		}
-
-		links[link_id].disabled =
-			nla_get_flag(attrs[NL80211_ATTR_MLO_LINK_DISABLED]);
 	}
 
 	return 0;
@@ -12423,13 +12420,6 @@ static int nl80211_associate(struct sk_buff *skb, struct genl_info *info)
 			goto free;
 		}
 
-		if (req.links[req.link_id].disabled) {
-			GENL_SET_ERR_MSG(info,
-					 "cannot have assoc link disabled");
-			err = -EINVAL;
-			goto free;
-		}
-
 		if (info->attrs[NL80211_ATTR_ASSOC_MLD_EXT_CAPA_OPS])
 			req.ext_mld_capa_ops =
 				nla_get_u16(info->attrs[NL80211_ATTR_ASSOC_MLD_EXT_CAPA_OPS]);
-- 
cgit v1.2.3


From a5546e18f77c0cb15d434bf5b92647687fe483e3 Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <daniel@iogearbox.net>
Date: Thu, 15 Jan 2026 09:25:48 +0100
Subject: net: Add queue-create operation

Add a ynl netdev family operation called queue-create that creates a
new queue on a netdevice:

      name: queue-create
      attribute-set: queue
      flags: [admin-perm]
      do:
        request:
          attributes:
            - ifindex
            - type
            - lease
        reply: &queue-create-op
          attributes:
            - id

This is a generic operation such that it can be extended for various
use cases in future. Right now it is mandatory to specify ifindex,
the queue type (which is enforced to be rx), and a lease. The newly created
queue id is returned to the caller.

A queue from a virtual device can have a lease which refers to another
queue from a physical device. This is useful for memory providers
and AF_XDP operations which take an ifindex and queue id to allow
applications to bind against virtual devices in containers. The lease
couples both queues together and allows operations to be proxied from
a virtual device in a container to the physical device.

In future, the nested lease attribute can be lifted and made optional
for other use-cases such as dynamic queue creation for physical
netdevs. The lack of lease and the specification of the physical
device as an ifindex will imply that we need a real queue to be
allocated. Similarly, the queue type enforcement to rx can then be
lifted as well to support tx.

An early implementation had only driver-specific integration [0], but
in order for other virtual devices to reuse, it makes sense to have
this as a generic API in core net.

For leasing queues, the virtual netdev must have real_num_rx_queues
less than num_rx_queues at the time of calling queue-create. The
queue-type must be rx as only rx queues are supported for leasing
for now. We also enforce that the queue-create ifindex must point
to a virtual device, and that the nested lease attribute's ifindex
must point to a physical device. The nested lease attribute set
contains a netns-id attribute which is currently only intended for
dumping as part of the queue-get operation. Also, it is modeled as
an s32 type similarly as done elsewhere in the stack.

Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Co-developed-by: David Wei <dw@davidwei.uk>
Signed-off-by: David Wei <dw@davidwei.uk>
Link: https://bpfconf.ebpf.io/bpfconf2025/bpfconf2025_material/lsfmmbpf_2025_netkit_borkmann.pdf [0]
Acked-by: Stanislav Fomichev <sdf@fomichev.me>
Reviewed-by: Nikolay Aleksandrov <razor@blackwall.org>
Link: https://patch.msgid.link/20260115082603.219152-2-daniel@iogearbox.net
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
---
 Documentation/netlink/specs/netdev.yaml | 44 +++++++++++++++++++++++++++++++++
 include/uapi/linux/netdev.h             | 11 +++++++++
 net/core/netdev-genl-gen.c              | 20 +++++++++++++++
 net/core/netdev-genl-gen.h              |  2 ++
 net/core/netdev-genl.c                  |  5 ++++
 tools/include/uapi/linux/netdev.h       | 11 +++++++++
 6 files changed, 93 insertions(+)

(limited to 'include/uapi/linux')

diff --git a/Documentation/netlink/specs/netdev.yaml b/Documentation/netlink/specs/netdev.yaml
index 596c306ce52b..b86db8656eac 100644
--- a/Documentation/netlink/specs/netdev.yaml
+++ b/Documentation/netlink/specs/netdev.yaml
@@ -339,6 +339,15 @@ attribute-sets:
         doc: XSK information for this queue, if any.
         type: nest
         nested-attributes: xsk-info
+      -
+        name: lease
+        doc: |
+          A queue from a virtual device can have a lease which refers to
+          another queue from a physical device. This is useful for memory
+          providers and AF_XDP operations which take an ifindex and queue id
+          to allow applications to bind against virtual devices in containers.
+        type: nest
+        nested-attributes: lease
   -
     name: qstats
     doc: |
@@ -537,6 +546,24 @@ attribute-sets:
         name: id
       -
         name: type
+  -
+    name: lease
+    attributes:
+      -
+        name: ifindex
+        doc: The netdev ifindex to lease the queue from.
+        type: u32
+        checks:
+          min: 1
+      -
+        name: queue
+        doc: The netdev queue to lease from.
+        type: nest
+        nested-attributes: queue-id
+      -
+        name: netns-id
+        doc: The network namespace id of the netdev.
+        type: s32
   -
     name: dmabuf
     attributes:
@@ -686,6 +713,7 @@ operations:
             - dmabuf
             - io-uring
             - xsk
+            - lease
       dump:
         request:
           attributes:
@@ -797,6 +825,22 @@ operations:
         reply:
           attributes:
             - id
+    -
+      name: queue-create
+      doc: |
+        Create a new queue for the given netdevice. Whether this operation
+        is supported depends on the device and the driver.
+      attribute-set: queue
+      flags: [admin-perm]
+      do:
+        request:
+          attributes:
+            - ifindex
+            - type
+            - lease
+        reply: &queue-create-op
+          attributes:
+            - id
 
 kernel-family:
   headers: ["net/netdev_netlink.h"]
diff --git a/include/uapi/linux/netdev.h b/include/uapi/linux/netdev.h
index e0b579a1df4f..7df1056a35fd 100644
--- a/include/uapi/linux/netdev.h
+++ b/include/uapi/linux/netdev.h
@@ -160,6 +160,7 @@ enum {
 	NETDEV_A_QUEUE_DMABUF,
 	NETDEV_A_QUEUE_IO_URING,
 	NETDEV_A_QUEUE_XSK,
+	NETDEV_A_QUEUE_LEASE,
 
 	__NETDEV_A_QUEUE_MAX,
 	NETDEV_A_QUEUE_MAX = (__NETDEV_A_QUEUE_MAX - 1)
@@ -202,6 +203,15 @@ enum {
 	NETDEV_A_QSTATS_MAX = (__NETDEV_A_QSTATS_MAX - 1)
 };
 
+enum {
+	NETDEV_A_LEASE_IFINDEX = 1,
+	NETDEV_A_LEASE_QUEUE,
+	NETDEV_A_LEASE_NETNS_ID,
+
+	__NETDEV_A_LEASE_MAX,
+	NETDEV_A_LEASE_MAX = (__NETDEV_A_LEASE_MAX - 1)
+};
+
 enum {
 	NETDEV_A_DMABUF_IFINDEX = 1,
 	NETDEV_A_DMABUF_QUEUES,
@@ -228,6 +238,7 @@ enum {
 	NETDEV_CMD_BIND_RX,
 	NETDEV_CMD_NAPI_SET,
 	NETDEV_CMD_BIND_TX,
+	NETDEV_CMD_QUEUE_CREATE,
 
 	__NETDEV_CMD_MAX,
 	NETDEV_CMD_MAX = (__NETDEV_CMD_MAX - 1)
diff --git a/net/core/netdev-genl-gen.c b/net/core/netdev-genl-gen.c
index ba673e81716f..52ba99c019e7 100644
--- a/net/core/netdev-genl-gen.c
+++ b/net/core/netdev-genl-gen.c
@@ -28,6 +28,12 @@ static const struct netlink_range_validation netdev_a_napi_defer_hard_irqs_range
 };
 
 /* Common nested types */
+const struct nla_policy netdev_lease_nl_policy[NETDEV_A_LEASE_NETNS_ID + 1] = {
+	[NETDEV_A_LEASE_IFINDEX] = NLA_POLICY_MIN(NLA_U32, 1),
+	[NETDEV_A_LEASE_QUEUE] = NLA_POLICY_NESTED(netdev_queue_id_nl_policy),
+	[NETDEV_A_LEASE_NETNS_ID] = { .type = NLA_S32, },
+};
+
 const struct nla_policy netdev_page_pool_info_nl_policy[NETDEV_A_PAGE_POOL_IFINDEX + 1] = {
 	[NETDEV_A_PAGE_POOL_ID] = NLA_POLICY_FULL_RANGE(NLA_UINT, &netdev_a_page_pool_id_range),
 	[NETDEV_A_PAGE_POOL_IFINDEX] = NLA_POLICY_FULL_RANGE(NLA_U32, &netdev_a_page_pool_ifindex_range),
@@ -107,6 +113,13 @@ static const struct nla_policy netdev_bind_tx_nl_policy[NETDEV_A_DMABUF_FD + 1]
 	[NETDEV_A_DMABUF_FD] = { .type = NLA_U32, },
 };
 
+/* NETDEV_CMD_QUEUE_CREATE - do */
+static const struct nla_policy netdev_queue_create_nl_policy[NETDEV_A_QUEUE_LEASE + 1] = {
+	[NETDEV_A_QUEUE_IFINDEX] = NLA_POLICY_MIN(NLA_U32, 1),
+	[NETDEV_A_QUEUE_TYPE] = NLA_POLICY_MAX(NLA_U32, 1),
+	[NETDEV_A_QUEUE_LEASE] = NLA_POLICY_NESTED(netdev_lease_nl_policy),
+};
+
 /* Ops table for netdev */
 static const struct genl_split_ops netdev_nl_ops[] = {
 	{
@@ -205,6 +218,13 @@ static const struct genl_split_ops netdev_nl_ops[] = {
 		.maxattr	= NETDEV_A_DMABUF_FD,
 		.flags		= GENL_CMD_CAP_DO,
 	},
+	{
+		.cmd		= NETDEV_CMD_QUEUE_CREATE,
+		.doit		= netdev_nl_queue_create_doit,
+		.policy		= netdev_queue_create_nl_policy,
+		.maxattr	= NETDEV_A_QUEUE_LEASE,
+		.flags		= GENL_ADMIN_PERM | GENL_CMD_CAP_DO,
+	},
 };
 
 static const struct genl_multicast_group netdev_nl_mcgrps[] = {
diff --git a/net/core/netdev-genl-gen.h b/net/core/netdev-genl-gen.h
index cffc08517a41..d71b435d72c1 100644
--- a/net/core/netdev-genl-gen.h
+++ b/net/core/netdev-genl-gen.h
@@ -14,6 +14,7 @@
 #include <net/netdev_netlink.h>
 
 /* Common nested types */
+extern const struct nla_policy netdev_lease_nl_policy[NETDEV_A_LEASE_NETNS_ID + 1];
 extern const struct nla_policy netdev_page_pool_info_nl_policy[NETDEV_A_PAGE_POOL_IFINDEX + 1];
 extern const struct nla_policy netdev_queue_id_nl_policy[NETDEV_A_QUEUE_TYPE + 1];
 
@@ -36,6 +37,7 @@ int netdev_nl_qstats_get_dumpit(struct sk_buff *skb,
 int netdev_nl_bind_rx_doit(struct sk_buff *skb, struct genl_info *info);
 int netdev_nl_napi_set_doit(struct sk_buff *skb, struct genl_info *info);
 int netdev_nl_bind_tx_doit(struct sk_buff *skb, struct genl_info *info);
+int netdev_nl_queue_create_doit(struct sk_buff *skb, struct genl_info *info);
 
 enum {
 	NETDEV_NLGRP_MGMT,
diff --git a/net/core/netdev-genl.c b/net/core/netdev-genl.c
index 470fabbeacd9..aae75431858d 100644
--- a/net/core/netdev-genl.c
+++ b/net/core/netdev-genl.c
@@ -1120,6 +1120,11 @@ err_genlmsg_free:
 	return err;
 }
 
+int netdev_nl_queue_create_doit(struct sk_buff *skb, struct genl_info *info)
+{
+	return -EOPNOTSUPP;
+}
+
 void netdev_nl_sock_priv_init(struct netdev_nl_sock *priv)
 {
 	INIT_LIST_HEAD(&priv->bindings);
diff --git a/tools/include/uapi/linux/netdev.h b/tools/include/uapi/linux/netdev.h
index e0b579a1df4f..7df1056a35fd 100644
--- a/tools/include/uapi/linux/netdev.h
+++ b/tools/include/uapi/linux/netdev.h
@@ -160,6 +160,7 @@ enum {
 	NETDEV_A_QUEUE_DMABUF,
 	NETDEV_A_QUEUE_IO_URING,
 	NETDEV_A_QUEUE_XSK,
+	NETDEV_A_QUEUE_LEASE,
 
 	__NETDEV_A_QUEUE_MAX,
 	NETDEV_A_QUEUE_MAX = (__NETDEV_A_QUEUE_MAX - 1)
@@ -202,6 +203,15 @@ enum {
 	NETDEV_A_QSTATS_MAX = (__NETDEV_A_QSTATS_MAX - 1)
 };
 
+enum {
+	NETDEV_A_LEASE_IFINDEX = 1,
+	NETDEV_A_LEASE_QUEUE,
+	NETDEV_A_LEASE_NETNS_ID,
+
+	__NETDEV_A_LEASE_MAX,
+	NETDEV_A_LEASE_MAX = (__NETDEV_A_LEASE_MAX - 1)
+};
+
 enum {
 	NETDEV_A_DMABUF_IFINDEX = 1,
 	NETDEV_A_DMABUF_QUEUES,
@@ -228,6 +238,7 @@ enum {
 	NETDEV_CMD_BIND_RX,
 	NETDEV_CMD_NAPI_SET,
 	NETDEV_CMD_BIND_TX,
+	NETDEV_CMD_QUEUE_CREATE,
 
 	__NETDEV_CMD_MAX,
 	NETDEV_CMD_MAX = (__NETDEV_CMD_MAX - 1)
-- 
cgit v1.2.3


From b5c3fa4a0b16d4a7d0bd0e5626a13fec0024030a Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <daniel@iogearbox.net>
Date: Thu, 15 Jan 2026 09:25:56 +0100
Subject: netkit: Add single device mode for netkit

Add a single device mode for netkit instead of netkit pairs. The primary
target for the paired devices is to connect network namespaces, of course,
and support has been implemented in projects like Cilium [0]. For the rxq
leasing the plan is to support two main scenarios related to single device
mode:

* For the use-case of io_uring zero-copy, the control plane can either
  set up a netkit pair where the peer device can perform rxq leasing which
  is then tied to the lifetime of the peer device, or the control plane
  can use a regular netkit pair to connect the hostns to a Pod/container
  and dynamically add/remove rxq leasing through a single device without
  having to interrupt the device pair. In the case of io_uring, the memory
  pool is used as skb non-linear pages, and thus the skb will go its way
  through the regular stack into netkit. Things like the netkit policy when
  no BPF is attached or skb scrubbing etc apply as-is in case the paired
  devices are used, or if the backend memory is tied to the single device
  and traffic goes through a paired device.

* For the use-case of AF_XDP, the control plane needs to use netkit in the
  single device mode. The single device mode currently enforces only a
  pass policy when no BPF is attached, and does not yet support BPF link
  attachments for AF_XDP. skbs sent to that device get dropped at the
  moment. Given AF_XDP operates at a lower layer of the stack, tying this
  to the netkit pair did not make sense. In future, the plan is to allow
  BPF at the XDP layer which can: i) process traffic coming from the AF_XDP
  application (e.g. QEMU with AF_XDP backend) to filter egress traffic or
  to push selected egress traffic up to the single netkit device to the
  local stack (e.g. DHCP requests), and ii) vice versa, push skbs sent to the
  single netkit into the AF_XDP application (e.g. DHCP replies). Also,
  the control-plane can dynamically manage rxq leasing for the single
  netkit device without having to interrupt (e.g. down/up cycle) the main
  netkit pair for the Pod which has traffic going in and out.

Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Co-developed-by: David Wei <dw@davidwei.uk>
Signed-off-by: David Wei <dw@davidwei.uk>
Reviewed-by: Jordan Rife <jordan@jrife.io>
Reviewed-by: Nikolay Aleksandrov <razor@blackwall.org>
Link: https://docs.cilium.io/en/stable/operations/performance/tuning/#netkit-device-mode [0]
Link: https://patch.msgid.link/20260115082603.219152-10-daniel@iogearbox.net
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
---
 drivers/net/netkit.c         | 110 +++++++++++++++++++++++++++----------------
 include/uapi/linux/if_link.h |   6 +++
 2 files changed, 76 insertions(+), 40 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/drivers/net/netkit.c b/drivers/net/netkit.c
index 0a2fef7caccb..76332a98af92 100644
--- a/drivers/net/netkit.c
+++ b/drivers/net/netkit.c
@@ -26,6 +26,7 @@ struct netkit {
 
 	__cacheline_group_begin(netkit_slowpath);
 	enum netkit_mode mode;
+	enum netkit_pairing pair;
 	bool primary;
 	u32 headroom;
 	__cacheline_group_end(netkit_slowpath);
@@ -135,6 +136,10 @@ static int netkit_open(struct net_device *dev)
 	struct netkit *nk = netkit_priv(dev);
 	struct net_device *peer = rtnl_dereference(nk->peer);
 
+	if (nk->pair == NETKIT_DEVICE_SINGLE) {
+		netif_carrier_on(dev);
+		return 0;
+	}
 	if (!peer)
 		return -ENOTCONN;
 	if (peer->flags & IFF_UP) {
@@ -335,6 +340,7 @@ static int netkit_new_link(struct net_device *dev,
 	enum netkit_scrub scrub_prim = NETKIT_SCRUB_DEFAULT;
 	enum netkit_scrub scrub_peer = NETKIT_SCRUB_DEFAULT;
 	struct nlattr *peer_tb[IFLA_MAX + 1], **tbp, *attr;
+	enum netkit_pairing pair = NETKIT_DEVICE_PAIR;
 	enum netkit_action policy_prim = NETKIT_PASS;
 	enum netkit_action policy_peer = NETKIT_PASS;
 	struct nlattr **data = params->data;
@@ -343,7 +349,8 @@ static int netkit_new_link(struct net_device *dev,
 	struct nlattr **tb = params->tb;
 	u16 headroom = 0, tailroom = 0;
 	struct ifinfomsg *ifmp = NULL;
-	struct net_device *peer;
+	struct net_device *peer = NULL;
+	bool seen_peer = false;
 	char ifname[IFNAMSIZ];
 	struct netkit *nk;
 	int err;
@@ -380,6 +387,12 @@ static int netkit_new_link(struct net_device *dev,
 			headroom = nla_get_u16(data[IFLA_NETKIT_HEADROOM]);
 		if (data[IFLA_NETKIT_TAILROOM])
 			tailroom = nla_get_u16(data[IFLA_NETKIT_TAILROOM]);
+		if (data[IFLA_NETKIT_PAIRING])
+			pair = nla_get_u32(data[IFLA_NETKIT_PAIRING]);
+
+		seen_peer = data[IFLA_NETKIT_PEER_INFO] ||
+			    data[IFLA_NETKIT_PEER_SCRUB] ||
+			    data[IFLA_NETKIT_PEER_POLICY];
 	}
 
 	if (ifmp && tbp[IFLA_IFNAME]) {
@@ -392,45 +405,46 @@ static int netkit_new_link(struct net_device *dev,
 	if (mode != NETKIT_L2 &&
 	    (tb[IFLA_ADDRESS] || tbp[IFLA_ADDRESS]))
 		return -EOPNOTSUPP;
+	if (pair == NETKIT_DEVICE_SINGLE &&
+	    (tb != tbp || seen_peer || policy_prim != NETKIT_PASS))
+		return -EOPNOTSUPP;
 
-	peer = rtnl_create_link(peer_net, ifname, ifname_assign_type,
-				&netkit_link_ops, tbp, extack);
-	if (IS_ERR(peer))
-		return PTR_ERR(peer);
-
-	netif_inherit_tso_max(peer, dev);
-	if (headroom) {
-		peer->needed_headroom = headroom;
-		dev->needed_headroom = headroom;
-	}
-	if (tailroom) {
-		peer->needed_tailroom = tailroom;
-		dev->needed_tailroom = tailroom;
-	}
-
-	if (mode == NETKIT_L2 && !(ifmp && tbp[IFLA_ADDRESS]))
-		eth_hw_addr_random(peer);
-	if (ifmp && dev->ifindex)
-		peer->ifindex = ifmp->ifi_index;
-
-	nk = netkit_priv(peer);
-	nk->primary = false;
-	nk->policy = policy_peer;
-	nk->scrub = scrub_peer;
-	nk->mode = mode;
-	nk->headroom = headroom;
-	bpf_mprog_bundle_init(&nk->bundle);
+	if (pair == NETKIT_DEVICE_PAIR) {
+		peer = rtnl_create_link(peer_net, ifname, ifname_assign_type,
+					&netkit_link_ops, tbp, extack);
+		if (IS_ERR(peer))
+			return PTR_ERR(peer);
+
+		netif_inherit_tso_max(peer, dev);
+		if (headroom)
+			peer->needed_headroom = headroom;
+		if (tailroom)
+			peer->needed_tailroom = tailroom;
+		if (mode == NETKIT_L2 && !(ifmp && tbp[IFLA_ADDRESS]))
+			eth_hw_addr_random(peer);
+		if (ifmp && dev->ifindex)
+			peer->ifindex = ifmp->ifi_index;
 
-	err = register_netdevice(peer);
-	if (err < 0)
-		goto err_register_peer;
-	netif_carrier_off(peer);
-	if (mode == NETKIT_L2)
-		dev_change_flags(peer, peer->flags & ~IFF_NOARP, NULL);
+		nk = netkit_priv(peer);
+		nk->primary = false;
+		nk->policy = policy_peer;
+		nk->scrub = scrub_peer;
+		nk->mode = mode;
+		nk->pair = pair;
+		nk->headroom = headroom;
+		bpf_mprog_bundle_init(&nk->bundle);
+
+		err = register_netdevice(peer);
+		if (err < 0)
+			goto err_register_peer;
+		netif_carrier_off(peer);
+		if (mode == NETKIT_L2)
+			dev_change_flags(peer, peer->flags & ~IFF_NOARP, NULL);
 
-	err = rtnl_configure_link(peer, NULL, 0, NULL);
-	if (err < 0)
-		goto err_configure_peer;
+		err = rtnl_configure_link(peer, NULL, 0, NULL);
+		if (err < 0)
+			goto err_configure_peer;
+	}
 
 	if (mode == NETKIT_L2 && !tb[IFLA_ADDRESS])
 		eth_hw_addr_random(dev);
@@ -438,12 +452,17 @@ static int netkit_new_link(struct net_device *dev,
 		nla_strscpy(dev->name, tb[IFLA_IFNAME], IFNAMSIZ);
 	else
 		strscpy(dev->name, "nk%d", IFNAMSIZ);
+	if (headroom)
+		dev->needed_headroom = headroom;
+	if (tailroom)
+		dev->needed_tailroom = tailroom;
 
 	nk = netkit_priv(dev);
 	nk->primary = true;
 	nk->policy = policy_prim;
 	nk->scrub = scrub_prim;
 	nk->mode = mode;
+	nk->pair = pair;
 	nk->headroom = headroom;
 	bpf_mprog_bundle_init(&nk->bundle);
 
@@ -455,10 +474,12 @@ static int netkit_new_link(struct net_device *dev,
 		dev_change_flags(dev, dev->flags & ~IFF_NOARP, NULL);
 
 	rcu_assign_pointer(netkit_priv(dev)->peer, peer);
-	rcu_assign_pointer(netkit_priv(peer)->peer, dev);
+	if (peer)
+		rcu_assign_pointer(netkit_priv(peer)->peer, dev);
 	return 0;
 err_configure_peer:
-	unregister_netdevice(peer);
+	if (peer)
+		unregister_netdevice(peer);
 	return err;
 err_register_peer:
 	free_netdev(peer);
@@ -518,6 +539,8 @@ static struct net_device *netkit_dev_fetch(struct net *net, u32 ifindex, u32 whi
 	nk = netkit_priv(dev);
 	if (!nk->primary)
 		return ERR_PTR(-EACCES);
+	if (nk->pair == NETKIT_DEVICE_SINGLE)
+		return ERR_PTR(-EOPNOTSUPP);
 	if (which == BPF_NETKIT_PEER) {
 		dev = rcu_dereference_rtnl(nk->peer);
 		if (!dev)
@@ -879,6 +902,7 @@ static int netkit_change_link(struct net_device *dev, struct nlattr *tb[],
 		{ IFLA_NETKIT_PEER_INFO,  "peer info" },
 		{ IFLA_NETKIT_HEADROOM,   "headroom" },
 		{ IFLA_NETKIT_TAILROOM,   "tailroom" },
+		{ IFLA_NETKIT_PAIRING,    "pairing" },
 	};
 
 	if (!nk->primary) {
@@ -898,9 +922,11 @@ static int netkit_change_link(struct net_device *dev, struct nlattr *tb[],
 	}
 
 	if (data[IFLA_NETKIT_POLICY]) {
+		err = -EOPNOTSUPP;
 		attr = data[IFLA_NETKIT_POLICY];
 		policy = nla_get_u32(attr);
-		err = netkit_check_policy(policy, attr, extack);
+		if (nk->pair == NETKIT_DEVICE_PAIR)
+			err = netkit_check_policy(policy, attr, extack);
 		if (err)
 			return err;
 		WRITE_ONCE(nk->policy, policy);
@@ -931,6 +957,7 @@ static size_t netkit_get_size(const struct net_device *dev)
 	       nla_total_size(sizeof(u8))  + /* IFLA_NETKIT_PRIMARY */
 	       nla_total_size(sizeof(u16)) + /* IFLA_NETKIT_HEADROOM */
 	       nla_total_size(sizeof(u16)) + /* IFLA_NETKIT_TAILROOM */
+	       nla_total_size(sizeof(u32)) + /* IFLA_NETKIT_PAIRING */
 	       0;
 }
 
@@ -951,6 +978,8 @@ static int netkit_fill_info(struct sk_buff *skb, const struct net_device *dev)
 		return -EMSGSIZE;
 	if (nla_put_u16(skb, IFLA_NETKIT_TAILROOM, dev->needed_tailroom))
 		return -EMSGSIZE;
+	if (nla_put_u32(skb, IFLA_NETKIT_PAIRING, nk->pair))
+		return -EMSGSIZE;
 
 	if (peer) {
 		nk = netkit_priv(peer);
@@ -972,6 +1001,7 @@ static const struct nla_policy netkit_policy[IFLA_NETKIT_MAX + 1] = {
 	[IFLA_NETKIT_TAILROOM]		= { .type = NLA_U16 },
 	[IFLA_NETKIT_SCRUB]		= NLA_POLICY_MAX(NLA_U32, NETKIT_SCRUB_DEFAULT),
 	[IFLA_NETKIT_PEER_SCRUB]	= NLA_POLICY_MAX(NLA_U32, NETKIT_SCRUB_DEFAULT),
+	[IFLA_NETKIT_PAIRING]		= NLA_POLICY_MAX(NLA_U32, NETKIT_DEVICE_SINGLE),
 	[IFLA_NETKIT_PRIMARY]		= { .type = NLA_REJECT,
 					    .reject_message = "Primary attribute is read-only" },
 };
diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h
index 3b491d96e52e..bbd565757298 100644
--- a/include/uapi/linux/if_link.h
+++ b/include/uapi/linux/if_link.h
@@ -1296,6 +1296,11 @@ enum netkit_mode {
 	NETKIT_L3,
 };
 
+enum netkit_pairing {
+	NETKIT_DEVICE_PAIR,
+	NETKIT_DEVICE_SINGLE,
+};
+
 /* NETKIT_SCRUB_NONE leaves clearing skb->{mark,priority} up to
  * the BPF program if attached. This also means the latter can
  * consume the two fields if they were populated earlier.
@@ -1320,6 +1325,7 @@ enum {
 	IFLA_NETKIT_PEER_SCRUB,
 	IFLA_NETKIT_HEADROOM,
 	IFLA_NETKIT_TAILROOM,
+	IFLA_NETKIT_PAIRING,
 	__IFLA_NETKIT_MAX,
 };
 #define IFLA_NETKIT_MAX	(__IFLA_NETKIT_MAX - 1)
-- 
cgit v1.2.3


From 8766d61a1d33cb5f15bfdd6ce9832bbe1fc649c2 Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <kuba@kernel.org>
Date: Tue, 20 Jan 2026 18:04:55 -0800
Subject: Revert "Merge branch
 'netkit-support-for-io_uring-zero-copy-and-af_xdp'"

This reverts commit 77b9c4a438fc66e2ab004c411056b3fb71a54f2c, reversing
changes made to 4515ec4ad58a37e70a9e1256c0b993958c9b7497:

 931420a2fc36 ("selftests/net: Add netkit container tests")
 ab771c938d9a ("selftests/net: Make NetDrvContEnv support queue leasing")
 6be87fbb2776 ("selftests/net: Add env for container based tests")
 61d99ce3dfc2 ("selftests/net: Add bpf skb forwarding program")
 920da3634194 ("netkit: Add xsk support for af_xdp applications")
 eef51113f8af ("netkit: Add netkit notifier to check for unregistering devices")
 b5ef109d22d4 ("netkit: Implement rtnl_link_ops->alloc and ndo_queue_create")
 b5c3fa4a0b16 ("netkit: Add single device mode for netkit")
 0073d2fd679d ("xsk: Proxy pool management for leased queues")
 1ecea95dd3b5 ("xsk: Extend xsk_rcv_check validation")
 804bf334d08a ("net: Proxy netdev_queue_get_dma_dev for leased queues")
 0caa9a8ddec3 ("net: Proxy net_mp_{open,close}_rxq for leased queues")
 ff8889ff9107 ("net, ethtool: Disallow leased real rxqs to be resized")
 9e2103f36110 ("net: Add lease info to queue-get response")
 31127deddef4 ("net: Implement netdev_nl_queue_create_doit")
 a5546e18f77c ("net: Add queue-create operation")

The series will conflict with io_uring work, and the code needs more
polish.

Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 Documentation/netlink/specs/netdev.yaml            |  44 ---
 drivers/net/netkit.c                               | 360 ++++-----------------
 include/linux/netdevice.h                          |   6 -
 include/net/netdev_queues.h                        |  19 +-
 include/net/netdev_rx_queue.h                      |  21 +-
 include/net/page_pool/memory_provider.h            |   4 +-
 include/net/xdp_sock_drv.h                         |   2 +-
 include/uapi/linux/if_link.h                       |   6 -
 include/uapi/linux/netdev.h                        |  11 -
 net/core/dev.c                                     |   7 -
 net/core/dev.h                                     |   2 -
 net/core/netdev-genl-gen.c                         |  20 --
 net/core/netdev-genl-gen.h                         |   2 -
 net/core/netdev-genl.c                             | 185 -----------
 net/core/netdev_queues.c                           |  74 +----
 net/core/netdev_rx_queue.c                         | 169 ++--------
 net/ethtool/channels.c                             |  12 +-
 net/ethtool/ioctl.c                                |   9 +-
 net/xdp/xsk.c                                      |  79 +----
 tools/include/uapi/linux/netdev.h                  |  11 -
 tools/testing/selftests/drivers/net/README.rst     |   7 -
 tools/testing/selftests/drivers/net/hw/Makefile    |   2 -
 .../selftests/drivers/net/hw/lib/py/__init__.py    |   7 +-
 .../selftests/drivers/net/hw/nk_forward.bpf.c      |  49 ---
 tools/testing/selftests/drivers/net/hw/nk_netns.py |  23 --
 .../testing/selftests/drivers/net/hw/nk_qlease.py  |  55 ----
 .../selftests/drivers/net/lib/py/__init__.py       |   7 +-
 tools/testing/selftests/drivers/net/lib/py/env.py  | 157 ---------
 28 files changed, 117 insertions(+), 1233 deletions(-)
 delete mode 100644 tools/testing/selftests/drivers/net/hw/nk_forward.bpf.c
 delete mode 100755 tools/testing/selftests/drivers/net/hw/nk_netns.py
 delete mode 100755 tools/testing/selftests/drivers/net/hw/nk_qlease.py

(limited to 'include/uapi/linux')

diff --git a/Documentation/netlink/specs/netdev.yaml b/Documentation/netlink/specs/netdev.yaml
index b86db8656eac..596c306ce52b 100644
--- a/Documentation/netlink/specs/netdev.yaml
+++ b/Documentation/netlink/specs/netdev.yaml
@@ -339,15 +339,6 @@ attribute-sets:
         doc: XSK information for this queue, if any.
         type: nest
         nested-attributes: xsk-info
-      -
-        name: lease
-        doc: |
-          A queue from a virtual device can have a lease which refers to
-          another queue from a physical device. This is useful for memory
-          providers and AF_XDP operations which take an ifindex and queue id
-          to allow applications to bind against virtual devices in containers.
-        type: nest
-        nested-attributes: lease
   -
     name: qstats
     doc: |
@@ -546,24 +537,6 @@ attribute-sets:
         name: id
       -
         name: type
-  -
-    name: lease
-    attributes:
-      -
-        name: ifindex
-        doc: The netdev ifindex to lease the queue from.
-        type: u32
-        checks:
-          min: 1
-      -
-        name: queue
-        doc: The netdev queue to lease from.
-        type: nest
-        nested-attributes: queue-id
-      -
-        name: netns-id
-        doc: The network namespace id of the netdev.
-        type: s32
   -
     name: dmabuf
     attributes:
@@ -713,7 +686,6 @@ operations:
             - dmabuf
             - io-uring
             - xsk
-            - lease
       dump:
         request:
           attributes:
@@ -825,22 +797,6 @@ operations:
         reply:
           attributes:
             - id
-    -
-      name: queue-create
-      doc: |
-        Create a new queue for the given netdevice. Whether this operation
-        is supported depends on the device and the driver.
-      attribute-set: queue
-      flags: [admin-perm]
-      do:
-        request:
-          attributes:
-            - ifindex
-            - type
-            - lease
-        reply: &queue-create-op
-          attributes:
-            - id
 
 kernel-family:
   headers: ["net/netdev_netlink.h"]
diff --git a/drivers/net/netkit.c b/drivers/net/netkit.c
index 0519f855d062..0a2fef7caccb 100644
--- a/drivers/net/netkit.c
+++ b/drivers/net/netkit.c
@@ -9,21 +9,11 @@
 #include <linux/bpf_mprog.h>
 #include <linux/indirect_call_wrapper.h>
 
-#include <net/netdev_lock.h>
-#include <net/netdev_queues.h>
-#include <net/netdev_rx_queue.h>
-#include <net/xdp_sock_drv.h>
 #include <net/netkit.h>
 #include <net/dst.h>
 #include <net/tcx.h>
 
-#define NETKIT_DRV_NAME	"netkit"
-
-#define NETKIT_NUM_RX_QUEUES_MAX  1024
-#define NETKIT_NUM_TX_QUEUES_MAX  1
-
-#define NETKIT_NUM_RX_QUEUES_REAL 1
-#define NETKIT_NUM_TX_QUEUES_REAL 1
+#define DRV_NAME "netkit"
 
 struct netkit {
 	__cacheline_group_begin(netkit_fastpath);
@@ -36,7 +26,6 @@ struct netkit {
 
 	__cacheline_group_begin(netkit_slowpath);
 	enum netkit_mode mode;
-	enum netkit_pairing pair;
 	bool primary;
 	u32 headroom;
 	__cacheline_group_end(netkit_slowpath);
@@ -47,8 +36,6 @@ struct netkit_link {
 	struct net_device *dev;
 };
 
-static struct rtnl_link_ops netkit_link_ops;
-
 static __always_inline int
 netkit_run(const struct bpf_mprog_entry *entry, struct sk_buff *skb,
 	   enum netkit_action ret)
@@ -148,10 +135,6 @@ static int netkit_open(struct net_device *dev)
 	struct netkit *nk = netkit_priv(dev);
 	struct net_device *peer = rtnl_dereference(nk->peer);
 
-	if (nk->pair == NETKIT_DEVICE_SINGLE) {
-		netif_carrier_on(dev);
-		return 0;
-	}
 	if (!peer)
 		return -ENOTCONN;
 	if (peer->flags & IFF_UP) {
@@ -236,86 +219,9 @@ static void netkit_get_stats(struct net_device *dev,
 	stats->tx_dropped = DEV_STATS_READ(dev, tx_dropped);
 }
 
-static bool netkit_xsk_supported_at_phys(const struct net_device *dev)
-{
-	if (!dev->netdev_ops->ndo_bpf ||
-	    !dev->netdev_ops->ndo_xdp_xmit ||
-	    !dev->netdev_ops->ndo_xsk_wakeup)
-		return false;
-	if ((dev->xdp_features & NETDEV_XDP_ACT_XSK) != NETDEV_XDP_ACT_XSK)
-		return false;
-	return true;
-}
-
-static int netkit_xsk(struct net_device *dev, struct netdev_bpf *xdp)
-{
-	struct netkit *nk = netkit_priv(dev);
-	struct netdev_bpf xdp_lower;
-	struct netdev_rx_queue *rxq;
-	struct net_device *phys;
-	int ret = -EBUSY;
-
-	switch (xdp->command) {
-	case XDP_SETUP_XSK_POOL:
-		if (nk->pair == NETKIT_DEVICE_PAIR)
-			return -EOPNOTSUPP;
-		if (xdp->xsk.queue_id >= dev->real_num_rx_queues)
-			return -EINVAL;
-
-		rxq = __netif_get_rx_queue(dev, xdp->xsk.queue_id);
-		if (!rxq->lease)
-			return -EOPNOTSUPP;
-
-		phys = rxq->lease->dev;
-		if (!netkit_xsk_supported_at_phys(phys))
-			return -EOPNOTSUPP;
-
-		memcpy(&xdp_lower, xdp, sizeof(xdp_lower));
-		xdp_lower.xsk.queue_id = get_netdev_rx_queue_index(rxq->lease);
-		break;
-	case XDP_SETUP_PROG:
-		return -EPERM;
-	default:
-		return -EINVAL;
-	}
-
-	netdev_lock(phys);
-	if (!dev_get_min_mp_channel_count(phys))
-		ret = phys->netdev_ops->ndo_bpf(phys, &xdp_lower);
-	netdev_unlock(phys);
-	return ret;
-}
-
-static int netkit_xsk_wakeup(struct net_device *dev, u32 queue_id, u32 flags)
-{
-	struct netdev_rx_queue *rxq;
-	struct net_device *phys;
-
-	if (queue_id >= dev->real_num_rx_queues)
-		return -EINVAL;
-
-	rxq = __netif_get_rx_queue(dev, queue_id);
-	if (!rxq->lease)
-		return -EOPNOTSUPP;
-
-	phys = rxq->lease->dev;
-	if (!netkit_xsk_supported_at_phys(phys))
-		return -EOPNOTSUPP;
-
-	return phys->netdev_ops->ndo_xsk_wakeup(phys,
-			get_netdev_rx_queue_index(rxq->lease), flags);
-}
-
-static int netkit_init(struct net_device *dev)
-{
-	netdev_lockdep_set_classes(dev);
-	return 0;
-}
-
 static void netkit_uninit(struct net_device *dev);
 
 static const struct net_device_ops netkit_netdev_ops = {
-	.ndo_init		= netkit_init,
 	.ndo_open		= netkit_open,
 	.ndo_stop		= netkit_close,
 	.ndo_start_xmit		= netkit_xmit,
@@ -326,95 +232,19 @@ static const struct net_device_ops netkit_netdev_ops = {
 	.ndo_get_peer_dev	= netkit_peer_dev,
 	.ndo_get_stats64	= netkit_get_stats,
 	.ndo_uninit		= netkit_uninit,
-	.ndo_bpf		= netkit_xsk,
-	.ndo_xsk_wakeup		= netkit_xsk_wakeup,
 	.ndo_features_check	= passthru_features_check,
 };
 
 static void netkit_get_drvinfo(struct net_device *dev,
 			       struct ethtool_drvinfo *info)
 {
-	strscpy(info->driver, NETKIT_DRV_NAME, sizeof(info->driver));
+	strscpy(info->driver, DRV_NAME, sizeof(info->driver));
 }
 
 static const struct ethtool_ops netkit_ethtool_ops = {
 	.get_drvinfo		= netkit_get_drvinfo,
 };
 
-static int netkit_queue_create(struct net_device *dev)
-{
-	struct netkit *nk = netkit_priv(dev);
-	u32 rxq_count_old, rxq_count_new;
-	int err;
-
-	rxq_count_old = dev->real_num_rx_queues;
-	rxq_count_new = rxq_count_old + 1;
-
-	/* Only allow to lease a queue in single device mode or to
-	 * lease against the peer device which then ends up in the
-	 * target netns.
-	 */
-	if (nk->pair == NETKIT_DEVICE_PAIR && nk->primary)
-		return -EOPNOTSUPP;
-
-	if (netif_running(dev))
-		netif_carrier_off(dev);
-	err = netif_set_real_num_rx_queues(dev, rxq_count_new);
-	if (netif_running(dev))
-		netif_carrier_on(dev);
-
-	return err ? : rxq_count_old;
-}
-
-static const struct netdev_queue_mgmt_ops netkit_queue_mgmt_ops = {
-	.ndo_queue_create	= netkit_queue_create,
-};
-
-static struct net_device *netkit_alloc(struct nlattr *tb[],
-				       const char *ifname,
-				       unsigned char name_assign_type,
-				       unsigned int num_tx_queues,
-				       unsigned int num_rx_queues)
-{
-	const struct rtnl_link_ops *ops = &netkit_link_ops;
-	struct net_device *dev;
-
-	if (num_tx_queues > NETKIT_NUM_TX_QUEUES_MAX ||
-	    num_rx_queues > NETKIT_NUM_RX_QUEUES_MAX)
-		return ERR_PTR(-EOPNOTSUPP);
-
-	dev = alloc_netdev_mqs(ops->priv_size, ifname,
-			       name_assign_type, ops->setup,
-			       num_tx_queues, num_rx_queues);
-	if (dev) {
-		dev->real_num_tx_queues = NETKIT_NUM_TX_QUEUES_REAL;
-		dev->real_num_rx_queues = NETKIT_NUM_RX_QUEUES_REAL;
-	}
-	return dev;
-}
-
-static void netkit_queue_unlease(struct net_device *dev)
-{
-	struct netdev_rx_queue *rxq, *rxq_lease;
-	struct net_device *dev_lease;
-	int i;
-
-	if (dev->real_num_rx_queues == 1)
-		return;
-
-	netdev_lock(dev);
-	for (i = 1; i < dev->real_num_rx_queues; i++) {
-		rxq = __netif_get_rx_queue(dev, i);
-		rxq_lease = rxq->lease;
-		dev_lease = rxq_lease->dev;
-
-		netdev_lock(dev_lease);
-		netdev_rx_queue_unlease(rxq, rxq_lease);
-		netdev_unlock(dev_lease);
-	}
-	netdev_unlock(dev);
-}
-
 static void netkit_setup(struct net_device *dev)
 {
 	static const netdev_features_t netkit_features_hw_vlan =
@@ -445,20 +275,18 @@ static void netkit_setup(struct net_device *dev)
 	dev->priv_flags |= IFF_DISABLE_NETPOLL;
 	dev->lltx = true;
 
-	dev->netdev_ops     = &netkit_netdev_ops;
-	dev->ethtool_ops    = &netkit_ethtool_ops;
-	dev->queue_mgmt_ops = &netkit_queue_mgmt_ops;
+	dev->ethtool_ops = &netkit_ethtool_ops;
+	dev->netdev_ops  = &netkit_netdev_ops;
 
 	dev->features |= netkit_features;
 	dev->hw_features = netkit_features;
 	dev->hw_enc_features = netkit_features;
 	dev->mpls_features = NETIF_F_HW_CSUM | NETIF_F_GSO_SOFTWARE;
 	dev->vlan_features = dev->features & ~netkit_features_hw_vlan;
+
 	dev->needs_free_netdev = true;
 
 	netif_set_tso_max_size(dev, GSO_MAX_SIZE);
-
-	xdp_set_features_flag(dev, NETDEV_XDP_ACT_XSK);
 }
 
 static struct net *netkit_get_link_net(const struct net_device *dev)
@@ -497,6 +325,8 @@ static int netkit_validate(struct nlattr *tb[], struct nlattr *data[],
 	return 0;
 }
 
+static struct rtnl_link_ops netkit_link_ops;
+
 static int netkit_new_link(struct net_device *dev,
 			   struct rtnl_newlink_params *params,
 			   struct netlink_ext_ack *extack)
@@ -505,7 +335,6 @@ static int netkit_new_link(struct net_device *dev,
 	enum netkit_scrub scrub_prim = NETKIT_SCRUB_DEFAULT;
 	enum netkit_scrub scrub_peer = NETKIT_SCRUB_DEFAULT;
 	struct nlattr *peer_tb[IFLA_MAX + 1], **tbp, *attr;
-	enum netkit_pairing pair = NETKIT_DEVICE_PAIR;
 	enum netkit_action policy_prim = NETKIT_PASS;
 	enum netkit_action policy_peer = NETKIT_PASS;
 	struct nlattr **data = params->data;
@@ -514,8 +343,7 @@ static int netkit_new_link(struct net_device *dev,
 	struct nlattr **tb = params->tb;
 	u16 headroom = 0, tailroom = 0;
 	struct ifinfomsg *ifmp = NULL;
-	struct net_device *peer = NULL;
-	bool seen_peer = false;
+	struct net_device *peer;
 	char ifname[IFNAMSIZ];
 	struct netkit *nk;
 	int err;
@@ -552,12 +380,6 @@ static int netkit_new_link(struct net_device *dev,
 			headroom = nla_get_u16(data[IFLA_NETKIT_HEADROOM]);
 		if (data[IFLA_NETKIT_TAILROOM])
 			tailroom = nla_get_u16(data[IFLA_NETKIT_TAILROOM]);
-		if (data[IFLA_NETKIT_PAIRING])
-			pair = nla_get_u32(data[IFLA_NETKIT_PAIRING]);
-
-		seen_peer = data[IFLA_NETKIT_PEER_INFO] ||
-			    data[IFLA_NETKIT_PEER_SCRUB] ||
-			    data[IFLA_NETKIT_PEER_POLICY];
 	}
 
 	if (ifmp && tbp[IFLA_IFNAME]) {
@@ -570,46 +392,45 @@ static int netkit_new_link(struct net_device *dev,
 	if (mode != NETKIT_L2 &&
 	    (tb[IFLA_ADDRESS] || tbp[IFLA_ADDRESS]))
 		return -EOPNOTSUPP;
-	if (pair == NETKIT_DEVICE_SINGLE &&
-	    (tb != tbp || seen_peer || policy_prim != NETKIT_PASS))
-		return -EOPNOTSUPP;
 
-	if (pair == NETKIT_DEVICE_PAIR) {
-		peer = rtnl_create_link(peer_net, ifname, ifname_assign_type,
-					&netkit_link_ops, tbp, extack);
-		if (IS_ERR(peer))
-			return PTR_ERR(peer);
-
-		netif_inherit_tso_max(peer, dev);
-		if (headroom)
-			peer->needed_headroom = headroom;
-		if (tailroom)
-			peer->needed_tailroom = tailroom;
-		if (mode == NETKIT_L2 && !(ifmp && tbp[IFLA_ADDRESS]))
-			eth_hw_addr_random(peer);
-		if (ifmp && dev->ifindex)
-			peer->ifindex = ifmp->ifi_index;
+	peer = rtnl_create_link(peer_net, ifname, ifname_assign_type,
+				&netkit_link_ops, tbp, extack);
+	if (IS_ERR(peer))
+		return PTR_ERR(peer);
 
-		nk = netkit_priv(peer);
-		nk->primary = false;
-		nk->policy = policy_peer;
-		nk->scrub = scrub_peer;
-		nk->mode = mode;
-		nk->pair = pair;
-		nk->headroom = headroom;
-		bpf_mprog_bundle_init(&nk->bundle);
-
-		err = register_netdevice(peer);
-		if (err < 0)
-			goto err_register_peer;
-		netif_carrier_off(peer);
-		if (mode == NETKIT_L2)
-			dev_change_flags(peer, peer->flags & ~IFF_NOARP, NULL);
-
-		err = rtnl_configure_link(peer, NULL, 0, NULL);
-		if (err < 0)
-			goto err_configure_peer;
+	netif_inherit_tso_max(peer, dev);
+	if (headroom) {
+		peer->needed_headroom = headroom;
+		dev->needed_headroom = headroom;
 	}
+	if (tailroom) {
+		peer->needed_tailroom = tailroom;
+		dev->needed_tailroom = tailroom;
+	}
+
+	if (mode == NETKIT_L2 && !(ifmp && tbp[IFLA_ADDRESS]))
+		eth_hw_addr_random(peer);
+	if (ifmp && dev->ifindex)
+		peer->ifindex = ifmp->ifi_index;
+
+	nk = netkit_priv(peer);
+	nk->primary = false;
+	nk->policy = policy_peer;
+	nk->scrub = scrub_peer;
+	nk->mode = mode;
+	nk->headroom = headroom;
+	bpf_mprog_bundle_init(&nk->bundle);
+
+	err = register_netdevice(peer);
+	if (err < 0)
+		goto err_register_peer;
+	netif_carrier_off(peer);
+	if (mode == NETKIT_L2)
+		dev_change_flags(peer, peer->flags & ~IFF_NOARP, NULL);
+
+	err = rtnl_configure_link(peer, NULL, 0, NULL);
+	if (err < 0)
+		goto err_configure_peer;
 
 	if (mode == NETKIT_L2 && !tb[IFLA_ADDRESS])
 		eth_hw_addr_random(dev);
@@ -617,17 +438,12 @@ static int netkit_new_link(struct net_device *dev,
 		nla_strscpy(dev->name, tb[IFLA_IFNAME], IFNAMSIZ);
 	else
 		strscpy(dev->name, "nk%d", IFNAMSIZ);
-	if (headroom)
-		dev->needed_headroom = headroom;
-	if (tailroom)
-		dev->needed_tailroom = tailroom;
 
 	nk = netkit_priv(dev);
 	nk->primary = true;
 	nk->policy = policy_prim;
 	nk->scrub = scrub_prim;
 	nk->mode = mode;
-	nk->pair = pair;
 	nk->headroom = headroom;
 	bpf_mprog_bundle_init(&nk->bundle);
 
@@ -639,12 +455,10 @@ static int netkit_new_link(struct net_device *dev,
 		dev_change_flags(dev, dev->flags & ~IFF_NOARP, NULL);
 
 	rcu_assign_pointer(netkit_priv(dev)->peer, peer);
-	if (peer)
-		rcu_assign_pointer(netkit_priv(peer)->peer, dev);
+	rcu_assign_pointer(netkit_priv(peer)->peer, dev);
 	return 0;
 err_configure_peer:
-	if (peer)
-		unregister_netdevice(peer);
+	unregister_netdevice(peer);
 	return err;
 err_register_peer:
 	free_netdev(peer);
@@ -704,8 +518,6 @@ static struct net_device *netkit_dev_fetch(struct net *net, u32 ifindex, u32 whi
 	nk = netkit_priv(dev);
 	if (!nk->primary)
 		return ERR_PTR(-EACCES);
-	if (nk->pair == NETKIT_DEVICE_SINGLE)
-		return ERR_PTR(-EOPNOTSUPP);
 	if (which == BPF_NETKIT_PEER) {
 		dev = rcu_dereference_rtnl(nk->peer);
 		if (!dev)
@@ -1032,7 +844,6 @@ static void netkit_release_all(struct net_device *dev)
 static void netkit_uninit(struct net_device *dev)
 {
 	netkit_release_all(dev);
-	netkit_queue_unlease(dev);
 }
 
 static void netkit_del_link(struct net_device *dev, struct list_head *head)
@@ -1068,7 +879,6 @@ static int netkit_change_link(struct net_device *dev, struct nlattr *tb[],
 		{ IFLA_NETKIT_PEER_INFO,  "peer info" },
 		{ IFLA_NETKIT_HEADROOM,   "headroom" },
 		{ IFLA_NETKIT_TAILROOM,   "tailroom" },
-		{ IFLA_NETKIT_PAIRING,    "pairing" },
 	};
 
 	if (!nk->primary) {
@@ -1088,11 +898,9 @@ static int netkit_change_link(struct net_device *dev, struct nlattr *tb[],
 	}
 
 	if (data[IFLA_NETKIT_POLICY]) {
-		err = -EOPNOTSUPP;
 		attr = data[IFLA_NETKIT_POLICY];
 		policy = nla_get_u32(attr);
-		if (nk->pair == NETKIT_DEVICE_PAIR)
-			err = netkit_check_policy(policy, attr, extack);
+		err = netkit_check_policy(policy, attr, extack);
 		if (err)
 			return err;
 		WRITE_ONCE(nk->policy, policy);
@@ -1113,48 +921,6 @@ static int netkit_change_link(struct net_device *dev, struct nlattr *tb[],
 	return 0;
 }
 
-static void netkit_check_lease_unregister(struct net_device *dev)
-{
-	LIST_HEAD(list_kill);
-	u32 q_idx;
-
-	if (READ_ONCE(dev->reg_state) != NETREG_UNREGISTERING ||
-	    !dev->dev.parent)
-		return;
-
-	netdev_lock_ops(dev);
-	for (q_idx = 0; q_idx < dev->real_num_rx_queues; q_idx++) {
-		struct net_device *tmp = dev;
-		u32 tmp_q_idx = q_idx;
-
-		if (netif_rx_queue_lease_get_owner(&tmp, &tmp_q_idx)) {
-			if (tmp->netdev_ops != &netkit_netdev_ops)
-				continue;
-			/* A single phys device can have multiple queues leased
-			 * to one netkit device. We can only queue that netkit
-			 * device once to the list_kill. Queues of that phys
-			 * device can be leased with different individual netkit
-			 * devices, hence we batch via list_kill.
-			 */
-			if (unregister_netdevice_queued(tmp))
-				continue;
-			netkit_del_link(tmp, &list_kill);
-		}
-	}
-	netdev_unlock_ops(dev);
-	unregister_netdevice_many(&list_kill);
-}
-
-static int netkit_notifier(struct notifier_block *this,
-			   unsigned long event, void *ptr)
-{
-	struct net_device *dev = netdev_notifier_info_to_dev(ptr);
-
-	if (event == NETDEV_UNREGISTER)
-		netkit_check_lease_unregister(dev);
-	return NOTIFY_DONE;
-}
-
 static size_t netkit_get_size(const struct net_device *dev)
 {
 	return nla_total_size(sizeof(u32)) + /* IFLA_NETKIT_POLICY */
@@ -1165,7 +931,6 @@ static size_t netkit_get_size(const struct net_device *dev)
 	       nla_total_size(sizeof(u8))  + /* IFLA_NETKIT_PRIMARY */
 	       nla_total_size(sizeof(u16)) + /* IFLA_NETKIT_HEADROOM */
 	       nla_total_size(sizeof(u16)) + /* IFLA_NETKIT_TAILROOM */
-	       nla_total_size(sizeof(u32)) + /* IFLA_NETKIT_PAIRING */
 	       0;
 }
 
@@ -1186,8 +951,6 @@ static int netkit_fill_info(struct sk_buff *skb, const struct net_device *dev)
 		return -EMSGSIZE;
 	if (nla_put_u16(skb, IFLA_NETKIT_TAILROOM, dev->needed_tailroom))
 		return -EMSGSIZE;
-	if (nla_put_u32(skb, IFLA_NETKIT_PAIRING, nk->pair))
-		return -EMSGSIZE;
 
 	if (peer) {
 		nk = netkit_priv(peer);
@@ -1209,15 +972,13 @@ static const struct nla_policy netkit_policy[IFLA_NETKIT_MAX + 1] = {
 	[IFLA_NETKIT_TAILROOM]		= { .type = NLA_U16 },
 	[IFLA_NETKIT_SCRUB]		= NLA_POLICY_MAX(NLA_U32, NETKIT_SCRUB_DEFAULT),
 	[IFLA_NETKIT_PEER_SCRUB]	= NLA_POLICY_MAX(NLA_U32, NETKIT_SCRUB_DEFAULT),
-	[IFLA_NETKIT_PAIRING]		= NLA_POLICY_MAX(NLA_U32, NETKIT_DEVICE_SINGLE),
 	[IFLA_NETKIT_PRIMARY]		= { .type = NLA_REJECT,
 					    .reject_message = "Primary attribute is read-only" },
 };
 
 static struct rtnl_link_ops netkit_link_ops = {
-	.kind		= NETKIT_DRV_NAME,
+	.kind		= DRV_NAME,
 	.priv_size	= sizeof(struct netkit),
-	.alloc		= netkit_alloc,
 	.setup		= netkit_setup,
 	.newlink	= netkit_new_link,
 	.dellink	= netkit_del_link,
@@ -1231,39 +992,26 @@ static struct rtnl_link_ops netkit_link_ops = {
 	.maxtype	= IFLA_NETKIT_MAX,
 };
 
-static struct notifier_block netkit_netdev_notifier = {
-	.notifier_call	= netkit_notifier,
-};
-
-static __init int netkit_mod_init(void)
+static __init int netkit_init(void)
 {
-	int ret;
-
 	BUILD_BUG_ON((int)NETKIT_NEXT != (int)TCX_NEXT ||
 		     (int)NETKIT_PASS != (int)TCX_PASS ||
 		     (int)NETKIT_DROP != (int)TCX_DROP ||
 		     (int)NETKIT_REDIRECT != (int)TCX_REDIRECT);
 
-	ret = rtnl_link_register(&netkit_link_ops);
-	if (ret)
-		return ret;
-	ret = register_netdevice_notifier(&netkit_netdev_notifier);
-	if (ret)
-		rtnl_link_unregister(&netkit_link_ops);
-	return ret;
+	return rtnl_link_register(&netkit_link_ops);
 }
 
-static __exit void netkit_mod_exit(void)
+static __exit void netkit_exit(void)
 {
-	unregister_netdevice_notifier(&netkit_netdev_notifier);
 	rtnl_link_unregister(&netkit_link_ops);
 }
 
-module_init(netkit_mod_init);
-module_exit(netkit_mod_exit);
+module_init(netkit_init);
+module_exit(netkit_exit);
 
 MODULE_DESCRIPTION("BPF-programmable network device");
 MODULE_AUTHOR("Daniel Borkmann <daniel@iogearbox.net>");
 MODULE_AUTHOR("Nikolay Aleksandrov <razor@blackwall.org>");
 MODULE_LICENSE("GPL");
-MODULE_ALIAS_RTNL_LINK(NETKIT_DRV_NAME);
+MODULE_ALIAS_RTNL_LINK(DRV_NAME);
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 4d146c000e21..d99b0fbc1942 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -3400,17 +3400,11 @@ static inline int dev_direct_xmit(struct sk_buff *skb, u16 queue_id)
 int register_netdevice(struct net_device *dev);
 void unregister_netdevice_queue(struct net_device *dev, struct list_head *head);
 void unregister_netdevice_many(struct list_head *head);
-
 static inline void unregister_netdevice(struct net_device *dev)
 {
 	unregister_netdevice_queue(dev, NULL);
 }
 
-static inline bool unregister_netdevice_queued(const struct net_device *dev)
-{
-	return !list_empty(&dev->unreg_list);
-}
-
 int netdev_refcnt_read(const struct net_device *dev);
 void free_netdev(struct net_device *dev);
 
diff --git a/include/net/netdev_queues.h b/include/net/netdev_queues.h
index 81dc7cb2360c..b55d3b9cb9c2 100644
--- a/include/net/netdev_queues.h
+++ b/include/net/netdev_queues.h
@@ -130,11 +130,6 @@ void netdev_stat_queue_sum(struct net_device *netdev,
  * @ndo_queue_get_dma_dev: Get dma device for zero-copy operations to be used
  *			   for this queue. Return NULL on error.
  *
- * @ndo_queue_create: Create a new RX queue which can be leased to another queue.
- *		      Ops on this queue are redirected to the leased queue e.g.
- *		      when opening a memory provider. Return the new queue id on
- *		      success. Return negative error code on failure.
- *
  * Note that @ndo_queue_mem_alloc and @ndo_queue_mem_free may be called while
  * the interface is closed. @ndo_queue_start and @ndo_queue_stop will only
  * be called for an interface which is open.
@@ -154,12 +149,9 @@ struct netdev_queue_mgmt_ops {
 						  int idx);
 	struct device *		(*ndo_queue_get_dma_dev)(struct net_device *dev,
 							 int idx);
-	int			(*ndo_queue_create)(struct net_device *dev);
 };
 
-bool netif_rxq_has_unreadable_mp(struct net_device *dev, unsigned int rxq_idx);
-bool netif_rxq_has_mp(struct net_device *dev, unsigned int rxq_idx);
-bool netif_rxq_is_leased(struct net_device *dev, unsigned int rxq_idx);
+bool netif_rxq_has_unreadable_mp(struct net_device *dev, int idx);
 
 /**
  * DOC: Lockless queue stopping / waking helpers.
@@ -348,10 +340,5 @@ static inline unsigned int netif_xmit_timeout_ms(struct netdev_queue *txq)
 	})
 
 struct device *netdev_queue_get_dma_dev(struct net_device *dev, int idx);
-bool netdev_can_create_queue(const struct net_device *dev,
-			     struct netlink_ext_ack *extack);
-bool netdev_can_lease_queue(const struct net_device *dev,
-			    struct netlink_ext_ack *extack);
-bool netdev_queue_busy(struct net_device *dev, int idx,
-		       struct netlink_ext_ack *extack);
-#endif /* _LINUX_NET_QUEUES_H */
+
+#endif
diff --git a/include/net/netdev_rx_queue.h b/include/net/netdev_rx_queue.h
index 508d11afaecb..8cdcd138b33f 100644
--- a/include/net/netdev_rx_queue.h
+++ b/include/net/netdev_rx_queue.h
@@ -28,8 +28,6 @@ struct netdev_rx_queue {
 #endif
 	struct napi_struct		*napi;
 	struct pp_memory_provider_params mp_params;
-	struct netdev_rx_queue		*lease;
-	netdevice_tracker		lease_tracker;
 } ____cacheline_aligned_in_smp;
 
 /*
@@ -59,22 +57,5 @@ get_netdev_rx_queue_index(struct netdev_rx_queue *queue)
 }
 
 int netdev_rx_queue_restart(struct net_device *dev, unsigned int rxq);
-void netdev_rx_queue_lease(struct netdev_rx_queue *rxq_dst,
-			   struct netdev_rx_queue *rxq_src);
-void netdev_rx_queue_unlease(struct netdev_rx_queue *rxq_dst,
-			     struct netdev_rx_queue *rxq_src);
-bool netif_rx_queue_lease_get_owner(struct net_device **dev, unsigned int *rxq);
 
-enum netif_lease_dir {
-	NETIF_VIRT_TO_PHYS,
-	NETIF_PHYS_TO_VIRT,
-};
-
-struct netdev_rx_queue *
-__netif_get_rx_queue_lease(struct net_device **dev, unsigned int *rxq,
-			   enum netif_lease_dir dir);
-struct netdev_rx_queue *
-netif_get_rx_queue_lease_locked(struct net_device **dev, unsigned int *rxq);
-void netif_put_rx_queue_lease_locked(struct net_device *orig_dev,
-				     struct net_device *dev);
-#endif /* _LINUX_NETDEV_RX_QUEUE_H */
+#endif
diff --git a/include/net/page_pool/memory_provider.h b/include/net/page_pool/memory_provider.h
index b6f811c3416b..ada4f968960a 100644
--- a/include/net/page_pool/memory_provider.h
+++ b/include/net/page_pool/memory_provider.h
@@ -23,12 +23,12 @@ bool net_mp_niov_set_dma_addr(struct net_iov *niov, dma_addr_t addr);
 void net_mp_niov_set_page_pool(struct page_pool *pool, struct net_iov *niov);
 void net_mp_niov_clear_page_pool(struct net_iov *niov);
 
-int net_mp_open_rxq(struct net_device *dev, unsigned int rxq_idx,
+int net_mp_open_rxq(struct net_device *dev, unsigned ifq_idx,
 		    struct pp_memory_provider_params *p);
 int __net_mp_open_rxq(struct net_device *dev, unsigned int rxq_idx,
 		      const struct pp_memory_provider_params *p,
 		      struct netlink_ext_ack *extack);
-void net_mp_close_rxq(struct net_device *dev, unsigned int rxq_idx,
+void net_mp_close_rxq(struct net_device *dev, unsigned ifq_idx,
 		      struct pp_memory_provider_params *old_p);
 void __net_mp_close_rxq(struct net_device *dev, unsigned int rxq_idx,
 			const struct pp_memory_provider_params *old_p);
diff --git a/include/net/xdp_sock_drv.h b/include/net/xdp_sock_drv.h
index c07cfb431eac..242e34f771cc 100644
--- a/include/net/xdp_sock_drv.h
+++ b/include/net/xdp_sock_drv.h
@@ -28,7 +28,7 @@ void xsk_tx_completed(struct xsk_buff_pool *pool, u32 nb_entries);
 bool xsk_tx_peek_desc(struct xsk_buff_pool *pool, struct xdp_desc *desc);
 u32 xsk_tx_peek_release_desc_batch(struct xsk_buff_pool *pool, u32 max);
 void xsk_tx_release(struct xsk_buff_pool *pool);
-struct xsk_buff_pool *xsk_get_pool_from_qid(const struct net_device *dev,
+struct xsk_buff_pool *xsk_get_pool_from_qid(struct net_device *dev,
 					    u16 queue_id);
 void xsk_set_rx_need_wakeup(struct xsk_buff_pool *pool);
 void xsk_set_tx_need_wakeup(struct xsk_buff_pool *pool);
diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h
index bbd565757298..3b491d96e52e 100644
--- a/include/uapi/linux/if_link.h
+++ b/include/uapi/linux/if_link.h
@@ -1296,11 +1296,6 @@ enum netkit_mode {
 	NETKIT_L3,
 };
 
-enum netkit_pairing {
-	NETKIT_DEVICE_PAIR,
-	NETKIT_DEVICE_SINGLE,
-};
-
 /* NETKIT_SCRUB_NONE leaves clearing skb->{mark,priority} up to
  * the BPF program if attached. This also means the latter can
  * consume the two fields if they were populated earlier.
@@ -1325,7 +1320,6 @@ enum {
 	IFLA_NETKIT_PEER_SCRUB,
 	IFLA_NETKIT_HEADROOM,
 	IFLA_NETKIT_TAILROOM,
-	IFLA_NETKIT_PAIRING,
 	__IFLA_NETKIT_MAX,
 };
 #define IFLA_NETKIT_MAX	(__IFLA_NETKIT_MAX - 1)
diff --git a/include/uapi/linux/netdev.h b/include/uapi/linux/netdev.h
index 7df1056a35fd..e0b579a1df4f 100644
--- a/include/uapi/linux/netdev.h
+++ b/include/uapi/linux/netdev.h
@@ -160,7 +160,6 @@ enum {
 	NETDEV_A_QUEUE_DMABUF,
 	NETDEV_A_QUEUE_IO_URING,
 	NETDEV_A_QUEUE_XSK,
-	NETDEV_A_QUEUE_LEASE,
 
 	__NETDEV_A_QUEUE_MAX,
 	NETDEV_A_QUEUE_MAX = (__NETDEV_A_QUEUE_MAX - 1)
@@ -203,15 +202,6 @@ enum {
 	NETDEV_A_QSTATS_MAX = (__NETDEV_A_QSTATS_MAX - 1)
 };
 
-enum {
-	NETDEV_A_LEASE_IFINDEX = 1,
-	NETDEV_A_LEASE_QUEUE,
-	NETDEV_A_LEASE_NETNS_ID,
-
-	__NETDEV_A_LEASE_MAX,
-	NETDEV_A_LEASE_MAX = (__NETDEV_A_LEASE_MAX - 1)
-};
-
 enum {
 	NETDEV_A_DMABUF_IFINDEX = 1,
 	NETDEV_A_DMABUF_QUEUES,
@@ -238,7 +228,6 @@ enum {
 	NETDEV_CMD_BIND_RX,
 	NETDEV_CMD_NAPI_SET,
 	NETDEV_CMD_BIND_TX,
-	NETDEV_CMD_QUEUE_CREATE,
 
 	__NETDEV_CMD_MAX,
 	NETDEV_CMD_MAX = (__NETDEV_CMD_MAX - 1)
diff --git a/net/core/dev.c b/net/core/dev.c
index 13a3de63a825..2661b68f5be3 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -1114,13 +1114,6 @@ netdev_get_by_index_lock_ops_compat(struct net *net, int ifindex)
 	return __netdev_put_lock_ops_compat(dev, net);
 }
 
-struct net_device *
-netdev_put_lock(struct net_device *dev, netdevice_tracker *tracker)
-{
-	netdev_tracker_free(dev, tracker);
-	return __netdev_put_lock(dev, dev_net(dev));
-}
-
 struct net_device *
 netdev_xa_find_lock(struct net *net, struct net_device *dev,
 		    unsigned long *index)
diff --git a/net/core/dev.h b/net/core/dev.h
index 9bcb76b325d0..da18536cbd35 100644
--- a/net/core/dev.h
+++ b/net/core/dev.h
@@ -30,8 +30,6 @@ netdev_napi_by_id_lock(struct net *net, unsigned int napi_id);
 struct net_device *dev_get_by_napi_id(unsigned int napi_id);
 
 struct net_device *__netdev_put_lock(struct net_device *dev, struct net *net);
-struct net_device *netdev_put_lock(struct net_device *dev,
-				   netdevice_tracker *tracker);
 struct net_device *
 netdev_xa_find_lock(struct net *net, struct net_device *dev,
 		    unsigned long *index);
diff --git a/net/core/netdev-genl-gen.c b/net/core/netdev-genl-gen.c
index 52ba99c019e7..ba673e81716f 100644
--- a/net/core/netdev-genl-gen.c
+++ b/net/core/netdev-genl-gen.c
@@ -28,12 +28,6 @@ static const struct netlink_range_validation netdev_a_napi_defer_hard_irqs_range
 };
 
 /* Common nested types */
-const struct nla_policy netdev_lease_nl_policy[NETDEV_A_LEASE_NETNS_ID + 1] = {
-	[NETDEV_A_LEASE_IFINDEX] = NLA_POLICY_MIN(NLA_U32, 1),
-	[NETDEV_A_LEASE_QUEUE] = NLA_POLICY_NESTED(netdev_queue_id_nl_policy),
-	[NETDEV_A_LEASE_NETNS_ID] = { .type = NLA_S32, },
-};
-
 const struct nla_policy netdev_page_pool_info_nl_policy[NETDEV_A_PAGE_POOL_IFINDEX + 1] = {
 	[NETDEV_A_PAGE_POOL_ID] = NLA_POLICY_FULL_RANGE(NLA_UINT, &netdev_a_page_pool_id_range),
 	[NETDEV_A_PAGE_POOL_IFINDEX] = NLA_POLICY_FULL_RANGE(NLA_U32, &netdev_a_page_pool_ifindex_range),
@@ -113,13 +107,6 @@ static const struct nla_policy netdev_bind_tx_nl_policy[NETDEV_A_DMABUF_FD + 1]
 	[NETDEV_A_DMABUF_FD] = { .type = NLA_U32, },
 };
 
-/* NETDEV_CMD_QUEUE_CREATE - do */
-static const struct nla_policy netdev_queue_create_nl_policy[NETDEV_A_QUEUE_LEASE + 1] = {
-	[NETDEV_A_QUEUE_IFINDEX] = NLA_POLICY_MIN(NLA_U32, 1),
-	[NETDEV_A_QUEUE_TYPE] = NLA_POLICY_MAX(NLA_U32, 1),
-	[NETDEV_A_QUEUE_LEASE] = NLA_POLICY_NESTED(netdev_lease_nl_policy),
-};
-
 /* Ops table for netdev */
 static const struct genl_split_ops netdev_nl_ops[] = {
 	{
@@ -218,13 +205,6 @@ static const struct genl_split_ops netdev_nl_ops[] = {
 		.maxattr	= NETDEV_A_DMABUF_FD,
 		.flags		= GENL_CMD_CAP_DO,
 	},
-	{
-		.cmd		= NETDEV_CMD_QUEUE_CREATE,
-		.doit		= netdev_nl_queue_create_doit,
-		.policy		= netdev_queue_create_nl_policy,
-		.maxattr	= NETDEV_A_QUEUE_LEASE,
-		.flags		= GENL_ADMIN_PERM | GENL_CMD_CAP_DO,
-	},
 };
 
 static const struct genl_multicast_group netdev_nl_mcgrps[] = {
diff --git a/net/core/netdev-genl-gen.h b/net/core/netdev-genl-gen.h
index d71b435d72c1..cffc08517a41 100644
--- a/net/core/netdev-genl-gen.h
+++ b/net/core/netdev-genl-gen.h
@@ -14,7 +14,6 @@
 #include <net/netdev_netlink.h>
 
 /* Common nested types */
-extern const struct nla_policy netdev_lease_nl_policy[NETDEV_A_LEASE_NETNS_ID + 1];
 extern const struct nla_policy netdev_page_pool_info_nl_policy[NETDEV_A_PAGE_POOL_IFINDEX + 1];
 extern const struct nla_policy netdev_queue_id_nl_policy[NETDEV_A_QUEUE_TYPE + 1];
 
@@ -37,7 +36,6 @@ int netdev_nl_qstats_get_dumpit(struct sk_buff *skb,
 int netdev_nl_bind_rx_doit(struct sk_buff *skb, struct genl_info *info);
 int netdev_nl_napi_set_doit(struct sk_buff *skb, struct genl_info *info);
 int netdev_nl_bind_tx_doit(struct sk_buff *skb, struct genl_info *info);
-int netdev_nl_queue_create_doit(struct sk_buff *skb, struct genl_info *info);
 
 enum {
 	NETDEV_NLGRP_MGMT,
diff --git a/net/core/netdev-genl.c b/net/core/netdev-genl.c
index 51c830f88f10..470fabbeacd9 100644
--- a/net/core/netdev-genl.c
+++ b/net/core/netdev-genl.c
@@ -391,11 +391,8 @@ netdev_nl_queue_fill_one(struct sk_buff *rsp, struct net_device *netdev,
 			 u32 q_idx, u32 q_type, const struct genl_info *info)
 {
 	struct pp_memory_provider_params *params;
-	struct net_device *orig_netdev = netdev;
-	struct nlattr *nest_lease, *nest_queue;
 	struct netdev_rx_queue *rxq;
 	struct netdev_queue *txq;
-	u32 lease_q_idx = q_idx;
 	void *hdr;
 
 	hdr = genlmsg_iput(rsp, info);
@@ -413,37 +410,6 @@ netdev_nl_queue_fill_one(struct sk_buff *rsp, struct net_device *netdev,
 		if (nla_put_napi_id(rsp, rxq->napi))
 			goto nla_put_failure;
 
-		if (netif_rx_queue_lease_get_owner(&netdev, &lease_q_idx)) {
-			struct net *net, *peer_net;
-
-			nest_lease = nla_nest_start(rsp, NETDEV_A_QUEUE_LEASE);
-			if (!nest_lease)
-				goto nla_put_failure;
-			nest_queue = nla_nest_start(rsp, NETDEV_A_LEASE_QUEUE);
-			if (!nest_queue)
-				goto nla_put_failure;
-			if (nla_put_u32(rsp, NETDEV_A_QUEUE_ID, lease_q_idx))
-				goto nla_put_failure;
-			if (nla_put_u32(rsp, NETDEV_A_QUEUE_TYPE, q_type))
-				goto nla_put_failure;
-			nla_nest_end(rsp, nest_queue);
-			if (nla_put_u32(rsp, NETDEV_A_LEASE_IFINDEX,
-					READ_ONCE(netdev->ifindex)))
-				goto nla_put_failure;
-			rcu_read_lock();
-			peer_net = dev_net_rcu(netdev);
-			net = dev_net_rcu(orig_netdev);
-			if (!net_eq(net, peer_net)) {
-				s32 id = peernet2id_alloc(net, peer_net, GFP_ATOMIC);
-
-				if (nla_put_s32(rsp, NETDEV_A_LEASE_NETNS_ID, id))
-					goto nla_put_failure_unlock;
-			}
-			rcu_read_unlock();
-			nla_nest_end(rsp, nest_lease);
-			netdev = orig_netdev;
-		}
-
 		params = &rxq->mp_params;
 		if (params->mp_ops &&
 		    params->mp_ops->nl_fill(params->mp_priv, rsp, rxq))
@@ -471,8 +437,6 @@ netdev_nl_queue_fill_one(struct sk_buff *rsp, struct net_device *netdev,
 
 	return 0;
 
-nla_put_failure_unlock:
-	rcu_read_unlock();
 nla_put_failure:
 	genlmsg_cancel(rsp, hdr);
 	return -EMSGSIZE;
@@ -1156,155 +1120,6 @@ err_genlmsg_free:
 	return err;
 }
 
-int netdev_nl_queue_create_doit(struct sk_buff *skb, struct genl_info *info)
-{
-	const int qmaxtype = ARRAY_SIZE(netdev_queue_id_nl_policy) - 1;
-	const int lmaxtype = ARRAY_SIZE(netdev_lease_nl_policy) - 1;
-	int err, ifindex, ifindex_lease, queue_id, queue_id_lease;
-	struct nlattr *qtb[ARRAY_SIZE(netdev_queue_id_nl_policy)];
-	struct nlattr *ltb[ARRAY_SIZE(netdev_lease_nl_policy)];
-	struct netdev_rx_queue *rxq, *rxq_lease;
-	struct net_device *dev, *dev_lease;
-	netdevice_tracker dev_tracker;
-	struct nlattr *nest;
-	struct sk_buff *rsp;
-	void *hdr;
-
-	if (GENL_REQ_ATTR_CHECK(info, NETDEV_A_QUEUE_IFINDEX) ||
-	    GENL_REQ_ATTR_CHECK(info, NETDEV_A_QUEUE_TYPE) ||
-	    GENL_REQ_ATTR_CHECK(info, NETDEV_A_QUEUE_LEASE))
-		return -EINVAL;
-	if (nla_get_u32(info->attrs[NETDEV_A_QUEUE_TYPE]) !=
-	    NETDEV_QUEUE_TYPE_RX) {
-		NL_SET_BAD_ATTR(info->extack, info->attrs[NETDEV_A_QUEUE_TYPE]);
-		return -EINVAL;
-	}
-
-	ifindex = nla_get_u32(info->attrs[NETDEV_A_QUEUE_IFINDEX]);
-
-	nest = info->attrs[NETDEV_A_QUEUE_LEASE];
-	err = nla_parse_nested(ltb, lmaxtype, nest,
-			       netdev_lease_nl_policy, info->extack);
-	if (err < 0)
-		return err;
-	if (NL_REQ_ATTR_CHECK(info->extack, nest, ltb, NETDEV_A_LEASE_IFINDEX) ||
-	    NL_REQ_ATTR_CHECK(info->extack, nest, ltb, NETDEV_A_LEASE_QUEUE))
-		return -EINVAL;
-	if (ltb[NETDEV_A_LEASE_NETNS_ID]) {
-		NL_SET_BAD_ATTR(info->extack, ltb[NETDEV_A_LEASE_NETNS_ID]);
-		return -EINVAL;
-	}
-
-	ifindex_lease = nla_get_u32(ltb[NETDEV_A_LEASE_IFINDEX]);
-
-	nest = ltb[NETDEV_A_LEASE_QUEUE];
-	err = nla_parse_nested(qtb, qmaxtype, nest,
-			       netdev_queue_id_nl_policy, info->extack);
-	if (err < 0)
-		return err;
-	if (NL_REQ_ATTR_CHECK(info->extack, nest, qtb, NETDEV_A_QUEUE_ID) ||
-	    NL_REQ_ATTR_CHECK(info->extack, nest, qtb, NETDEV_A_QUEUE_TYPE))
-		return -EINVAL;
-	if (nla_get_u32(qtb[NETDEV_A_QUEUE_TYPE]) != NETDEV_QUEUE_TYPE_RX) {
-		NL_SET_BAD_ATTR(info->extack, qtb[NETDEV_A_QUEUE_TYPE]);
-		return -EINVAL;
-	}
-	if (ifindex == ifindex_lease) {
-		NL_SET_ERR_MSG(info->extack,
-			       "Lease ifindex cannot be the same as queue creation ifindex");
-		return -EINVAL;
-	}
-
-	queue_id_lease = nla_get_u32(qtb[NETDEV_A_QUEUE_ID]);
-
-	rsp = genlmsg_new(GENLMSG_DEFAULT_SIZE, GFP_KERNEL);
-	if (!rsp)
-		return -ENOMEM;
-
-	hdr = genlmsg_iput(rsp, info);
-	if (!hdr) {
-		err = -EMSGSIZE;
-		goto err_genlmsg_free;
-	}
-
-	/* Locking order is always from the virtual to the physical device
-	 * since this is also the same order when applications open the
-	 * memory provider later on.
-	 */
-	dev = netdev_get_by_index_lock(genl_info_net(info), ifindex);
-	if (!dev) {
-		err = -ENODEV;
-		goto err_genlmsg_free;
-	}
-	if (!netdev_can_create_queue(dev, info->extack)) {
-		err = -EINVAL;
-		goto err_unlock_dev;
-	}
-
-	dev_lease = netdev_get_by_index(genl_info_net(info), ifindex_lease,
-					&dev_tracker, GFP_KERNEL);
-	if (!dev_lease) {
-		err = -ENODEV;
-		goto err_unlock_dev;
-	}
-	if (!netdev_can_lease_queue(dev_lease, info->extack)) {
-		netdev_put(dev_lease, &dev_tracker);
-		err = -EINVAL;
-		goto err_unlock_dev;
-	}
-
-	dev_lease = netdev_put_lock(dev_lease, &dev_tracker);
-	if (!dev_lease) {
-		err = -ENODEV;
-		goto err_unlock_dev;
-	}
-	if (queue_id_lease >= dev_lease->real_num_rx_queues) {
-		err = -ERANGE;
-		NL_SET_BAD_ATTR(info->extack, qtb[NETDEV_A_QUEUE_ID]);
-		goto err_unlock_dev_lease;
-	}
-	if (netdev_queue_busy(dev_lease, queue_id_lease, info->extack)) {
-		err = -EBUSY;
-		goto err_unlock_dev_lease;
-	}
-
-	rxq_lease = __netif_get_rx_queue(dev_lease, queue_id_lease);
-	rxq = __netif_get_rx_queue(dev, dev->real_num_rx_queues - 1);
-
-	if (rxq->lease && rxq->lease->dev != dev_lease) {
-		err = -EOPNOTSUPP;
-		NL_SET_ERR_MSG(info->extack,
-			       "Leasing multiple queues from different devices not supported");
-		goto err_unlock_dev_lease;
-	}
-
-	err = queue_id = dev->queue_mgmt_ops->ndo_queue_create(dev);
-	if (err < 0) {
-		NL_SET_ERR_MSG(info->extack,
-			       "Device is unable to create a new queue");
-		goto err_unlock_dev_lease;
-	}
-
-	rxq = __netif_get_rx_queue(dev, queue_id);
-	netdev_rx_queue_lease(rxq, rxq_lease);
-
-	nla_put_u32(rsp, NETDEV_A_QUEUE_ID, queue_id);
-	genlmsg_end(rsp, hdr);
-
-	netdev_unlock(dev_lease);
-	netdev_unlock(dev);
-
-	return genlmsg_reply(rsp, info);
-
-err_unlock_dev_lease:
-	netdev_unlock(dev_lease);
-err_unlock_dev:
-	netdev_unlock(dev);
-err_genlmsg_free:
-	nlmsg_free(rsp);
-	return err;
-}
-
 void netdev_nl_sock_priv_init(struct netdev_nl_sock *priv)
 {
 	INIT_LIST_HEAD(&priv->bindings);
diff --git a/net/core/netdev_queues.c b/net/core/netdev_queues.c
index 97acf6440829..251f27a8307f 100644
--- a/net/core/netdev_queues.c
+++ b/net/core/netdev_queues.c
@@ -1,37 +1,22 @@
 // SPDX-License-Identifier: GPL-2.0-or-later
 
 #include <net/netdev_queues.h>
-#include <net/netdev_rx_queue.h>
-#include <net/xdp_sock_drv.h>
 
 /**
  * netdev_queue_get_dma_dev() - get dma device for zero-copy operations
  * @dev:	net_device
  * @idx:	queue index
  *
- * Get dma device for zero-copy operations to be used for this queue. If the
- * queue is leased to a physical queue, we retrieve the latter's dma device.
+ * Get dma device for zero-copy operations to be used for this queue.
  * When such device is not available or valid, the function will return NULL.
  *
  * Return: Device or NULL on error
  */
 struct device *netdev_queue_get_dma_dev(struct net_device *dev, int idx)
 {
-	const struct netdev_queue_mgmt_ops *queue_ops;
+	const struct netdev_queue_mgmt_ops *queue_ops = dev->queue_mgmt_ops;
 	struct device *dma_dev;
 
-	if (idx < dev->real_num_rx_queues) {
-		struct netdev_rx_queue *rxq = __netif_get_rx_queue(dev, idx);
-
-		if (rxq->lease) {
-			rxq = rxq->lease;
-			dev = rxq->dev;
-			idx = get_netdev_rx_queue_index(rxq);
-		}
-	}
-
-	queue_ops = dev->queue_mgmt_ops;
-
 	if (queue_ops && queue_ops->ndo_queue_get_dma_dev)
 		dma_dev = queue_ops->ndo_queue_get_dma_dev(dev, idx);
 	else
@@ -40,58 +25,3 @@ struct device *netdev_queue_get_dma_dev(struct net_device *dev, int idx)
 	return dma_dev && dma_dev->dma_mask ? dma_dev : NULL;
 }
 
-bool netdev_can_create_queue(const struct net_device *dev,
-			     struct netlink_ext_ack *extack)
-{
-	if (dev->dev.parent) {
-		NL_SET_ERR_MSG(extack, "Device is not a virtual device");
-		return false;
-	}
-	if (!dev->queue_mgmt_ops ||
-	    !dev->queue_mgmt_ops->ndo_queue_create) {
-		NL_SET_ERR_MSG(extack, "Device does not support queue creation");
-		return false;
-	}
-	if (dev->real_num_rx_queues < 1 ||
-	    dev->real_num_tx_queues < 1) {
-		NL_SET_ERR_MSG(extack, "Device must have at least one real queue");
-		return false;
-	}
-	return true;
-}
-
-bool netdev_can_lease_queue(const struct net_device *dev,
-			    struct netlink_ext_ack *extack)
-{
-	if (!dev->dev.parent) {
-		NL_SET_ERR_MSG(extack, "Lease device is a virtual device");
-		return false;
-	}
-	if (!netif_device_present(dev)) {
-		NL_SET_ERR_MSG(extack, "Lease device has been removed from the system");
-		return false;
-	}
-	if (!dev->queue_mgmt_ops) {
-		NL_SET_ERR_MSG(extack, "Lease device does not support queue management operations");
-		return false;
-	}
-	return true;
-}
-
-bool netdev_queue_busy(struct net_device *dev, int idx,
-		       struct netlink_ext_ack *extack)
-{
-	if (netif_rxq_is_leased(dev, idx)) {
-		NL_SET_ERR_MSG(extack, "Lease device queue is already leased");
-		return true;
-	}
-	if (xsk_get_pool_from_qid(dev, idx)) {
-		NL_SET_ERR_MSG(extack, "Lease device queue in use by AF_XDP");
-		return true;
-	}
-	if (netif_rxq_has_mp(dev, idx)) {
-		NL_SET_ERR_MSG(extack, "Lease device queue in use by memory provider");
-		return true;
-	}
-	return false;
-}
diff --git a/net/core/netdev_rx_queue.c b/net/core/netdev_rx_queue.c
index 75c7a68cb90d..c7d9341b7630 100644
--- a/net/core/netdev_rx_queue.c
+++ b/net/core/netdev_rx_queue.c
@@ -9,120 +9,14 @@
 
 #include "page_pool_priv.h"
 
-void netdev_rx_queue_lease(struct netdev_rx_queue *rxq_dst,
-			   struct netdev_rx_queue *rxq_src)
-{
-	netdev_assert_locked(rxq_src->dev);
-	netdev_assert_locked(rxq_dst->dev);
-
-	netdev_hold(rxq_src->dev, &rxq_src->lease_tracker, GFP_KERNEL);
-
-	WRITE_ONCE(rxq_src->lease, rxq_dst);
-	WRITE_ONCE(rxq_dst->lease, rxq_src);
-}
-
-void netdev_rx_queue_unlease(struct netdev_rx_queue *rxq_dst,
-			     struct netdev_rx_queue *rxq_src)
-{
-	netdev_assert_locked(rxq_dst->dev);
-	netdev_assert_locked(rxq_src->dev);
-
-	WRITE_ONCE(rxq_src->lease, NULL);
-	WRITE_ONCE(rxq_dst->lease, NULL);
-
-	netdev_put(rxq_src->dev, &rxq_src->lease_tracker);
-}
-
-bool netif_rxq_is_leased(struct net_device *dev, unsigned int rxq_idx)
-{
-	if (rxq_idx < dev->real_num_rx_queues)
-		return READ_ONCE(__netif_get_rx_queue(dev, rxq_idx)->lease);
-	return false;
-}
-
-static bool netif_lease_dir_ok(const struct net_device *dev,
-			       enum netif_lease_dir dir)
-{
-	if (dir == NETIF_VIRT_TO_PHYS && !dev->dev.parent)
-		return true;
-	if (dir == NETIF_PHYS_TO_VIRT && dev->dev.parent)
-		return true;
-	return false;
-}
-
-struct netdev_rx_queue *
-__netif_get_rx_queue_lease(struct net_device **dev, unsigned int *rxq_idx,
-			   enum netif_lease_dir dir)
-{
-	struct net_device *orig_dev = *dev;
-	struct netdev_rx_queue *rxq = __netif_get_rx_queue(orig_dev, *rxq_idx);
-
-	if (rxq->lease) {
-		if (!netif_lease_dir_ok(orig_dev, dir))
-			return NULL;
-		rxq = rxq->lease;
-		*rxq_idx = get_netdev_rx_queue_index(rxq);
-		*dev = rxq->dev;
-	}
-	return rxq;
-}
-
-struct netdev_rx_queue *
-netif_get_rx_queue_lease_locked(struct net_device **dev, unsigned int *rxq_idx)
-{
-	struct net_device *orig_dev = *dev;
-	struct netdev_rx_queue *rxq;
-
-	/* Locking order is always from the virtual to the physical device
-	 * see netdev_nl_queue_create_doit().
-	 */
-	netdev_ops_assert_locked(orig_dev);
-	rxq = __netif_get_rx_queue_lease(dev, rxq_idx, NETIF_VIRT_TO_PHYS);
-	if (rxq && orig_dev != *dev)
-		netdev_lock(*dev);
-	return rxq;
-}
-
-void netif_put_rx_queue_lease_locked(struct net_device *orig_dev,
-				     struct net_device *dev)
-{
-	if (orig_dev != dev)
-		netdev_unlock(dev);
-}
-
-bool netif_rx_queue_lease_get_owner(struct net_device **dev,
-				    unsigned int *rxq_idx)
-{
-	struct net_device *orig_dev = *dev;
-	struct netdev_rx_queue *rxq;
-
-	/* The physical device needs to be locked. If there is indeed a lease,
-	 * then the virtual device holds a reference on the physical device
-	 * and the lease stays active until the virtual device is torn down.
-	 * When queues get {un,}leased both devices are always locked.
-	 */
-	netdev_ops_assert_locked(orig_dev);
-	rxq = __netif_get_rx_queue_lease(dev, rxq_idx, NETIF_PHYS_TO_VIRT);
-	if (rxq && orig_dev != *dev)
-		return true;
-	return false;
-}
-
 /* See also page_pool_is_unreadable() */
-bool netif_rxq_has_unreadable_mp(struct net_device *dev, unsigned int rxq_idx)
+bool netif_rxq_has_unreadable_mp(struct net_device *dev, int idx)
 {
-	if (rxq_idx < dev->real_num_rx_queues)
-		return __netif_get_rx_queue(dev, rxq_idx)->mp_params.mp_ops;
-	return false;
-}
-EXPORT_SYMBOL(netif_rxq_has_unreadable_mp);
+	struct netdev_rx_queue *rxq = __netif_get_rx_queue(dev, idx);
 
-bool netif_rxq_has_mp(struct net_device *dev, unsigned int rxq_idx)
-{
-	if (rxq_idx < dev->real_num_rx_queues)
-		return __netif_get_rx_queue(dev, rxq_idx)->mp_params.mp_priv;
-	return false;
+	return !!rxq->mp_params.mp_ops;
 }
+EXPORT_SYMBOL(netif_rxq_has_unreadable_mp);
 
 int netdev_rx_queue_restart(struct net_device *dev, unsigned int rxq_idx)
 {
@@ -206,63 +100,49 @@ int __net_mp_open_rxq(struct net_device *dev, unsigned int rxq_idx,
 		      const struct pp_memory_provider_params *p,
 		      struct netlink_ext_ack *extack)
 {
-	struct net_device *orig_dev = dev;
 	struct netdev_rx_queue *rxq;
 	int ret;
 
 	if (!netdev_need_ops_lock(dev))
 		return -EOPNOTSUPP;
+
 	if (rxq_idx >= dev->real_num_rx_queues) {
 		NL_SET_ERR_MSG(extack, "rx queue index out of range");
 		return -ERANGE;
 	}
-
 	rxq_idx = array_index_nospec(rxq_idx, dev->real_num_rx_queues);
-	rxq = netif_get_rx_queue_lease_locked(&dev, &rxq_idx);
-	if (!rxq) {
-		NL_SET_ERR_MSG(extack, "rx queue peered to a virtual netdev");
-		return -EBUSY;
-	}
-	if (!dev->dev.parent) {
-		NL_SET_ERR_MSG(extack, "rx queue is mapped to a virtual netdev");
-		ret = -EBUSY;
-		goto out;
-	}
+
 	if (dev->cfg->hds_config != ETHTOOL_TCP_DATA_SPLIT_ENABLED) {
 		NL_SET_ERR_MSG(extack, "tcp-data-split is disabled");
-		ret = -EINVAL;
-		goto out;
+		return -EINVAL;
 	}
 	if (dev->cfg->hds_thresh) {
 		NL_SET_ERR_MSG(extack, "hds-thresh is not zero");
-		ret = -EINVAL;
-		goto out;
+		return -EINVAL;
 	}
 	if (dev_xdp_prog_count(dev)) {
 		NL_SET_ERR_MSG(extack, "unable to custom memory provider to device with XDP program attached");
-		ret = -EEXIST;
-		goto out;
+		return -EEXIST;
 	}
+
+	rxq = __netif_get_rx_queue(dev, rxq_idx);
 	if (rxq->mp_params.mp_ops) {
 		NL_SET_ERR_MSG(extack, "designated queue already memory provider bound");
-		ret = -EEXIST;
-		goto out;
+		return -EEXIST;
 	}
 #ifdef CONFIG_XDP_SOCKETS
 	if (rxq->pool) {
 		NL_SET_ERR_MSG(extack, "designated queue already in use by AF_XDP");
-		ret = -EBUSY;
-		goto out;
+		return -EBUSY;
 	}
 #endif
+
 	rxq->mp_params = *p;
 	ret = netdev_rx_queue_restart(dev, rxq_idx);
 	if (ret) {
 		rxq->mp_params.mp_ops = NULL;
 		rxq->mp_params.mp_priv = NULL;
 	}
-out:
-	netif_put_rx_queue_lease_locked(orig_dev, dev);
 	return ret;
 }
 
@@ -277,43 +157,38 @@ int net_mp_open_rxq(struct net_device *dev, unsigned int rxq_idx,
 	return ret;
 }
 
-void __net_mp_close_rxq(struct net_device *dev, unsigned int rxq_idx,
+void __net_mp_close_rxq(struct net_device *dev, unsigned int ifq_idx,
 			const struct pp_memory_provider_params *old_p)
 {
-	struct net_device *orig_dev = dev;
 	struct netdev_rx_queue *rxq;
 	int err;
 
-	if (WARN_ON_ONCE(rxq_idx >= dev->real_num_rx_queues))
+	if (WARN_ON_ONCE(ifq_idx >= dev->real_num_rx_queues))
 		return;
 
-	rxq = netif_get_rx_queue_lease_locked(&dev, &rxq_idx);
-	if (WARN_ON_ONCE(!rxq))
-		return;
+	rxq = __netif_get_rx_queue(dev, ifq_idx);
 
 	/* Callers holding a netdev ref may get here after we already
 	 * went thru shutdown via dev_memory_provider_uninstall().
 	 */
 	if (dev->reg_state > NETREG_REGISTERED &&
 	    !rxq->mp_params.mp_ops)
-		goto out;
+		return;
 
 	if (WARN_ON_ONCE(rxq->mp_params.mp_ops != old_p->mp_ops ||
 			 rxq->mp_params.mp_priv != old_p->mp_priv))
-		goto out;
+		return;
 
 	rxq->mp_params.mp_ops = NULL;
 	rxq->mp_params.mp_priv = NULL;
-	err = netdev_rx_queue_restart(dev, rxq_idx);
+	err = netdev_rx_queue_restart(dev, ifq_idx);
 	WARN_ON(err && err != -ENETDOWN);
-out:
-	netif_put_rx_queue_lease_locked(orig_dev, dev);
 }
 
-void net_mp_close_rxq(struct net_device *dev, unsigned int rxq_idx,
+void net_mp_close_rxq(struct net_device *dev, unsigned ifq_idx,
 		      struct pp_memory_provider_params *old_p)
 {
 	netdev_lock(dev);
-	__net_mp_close_rxq(dev, rxq_idx, old_p);
+	__net_mp_close_rxq(dev, ifq_idx, old_p);
 	netdev_unlock(dev);
 }
diff --git a/net/ethtool/channels.c b/net/ethtool/channels.c
index 797d2a08c515..ca4f80282448 100644
--- a/net/ethtool/channels.c
+++ b/net/ethtool/channels.c
@@ -1,6 +1,6 @@
 // SPDX-License-Identifier: GPL-2.0-only
 
-#include <net/netdev_queues.h>
+#include <net/xdp_sock_drv.h>
 
 #include "netlink.h"
 #include "common.h"
@@ -169,16 +169,14 @@ ethnl_set_channels(struct ethnl_req_info *req_info, struct genl_info *info)
 	if (ret)
 		return ret;
 
-	/* ensure channels are not busy at the moment */
+	/* Disabling channels, query zero-copy AF_XDP sockets */
 	from_channel = channels.combined_count +
 		       min(channels.rx_count, channels.tx_count);
-	for (i = from_channel; i < old_total; i++) {
-		if (netdev_queue_busy(dev, i, NULL)) {
-			GENL_SET_ERR_MSG(info,
-					 "requested channel counts are too low due to busy queues (AF_XDP or queue leasing)");
+	for (i = from_channel; i < old_total; i++)
+		if (xsk_get_pool_from_qid(dev, i)) {
+			GENL_SET_ERR_MSG(info, "requested channel counts are too low for existing zerocopy AF_XDP sockets");
 			return -EINVAL;
 		}
-	}
 
 	ret = dev->ethtool_ops->set_channels(dev, &channels);
 	return ret < 0 ? ret : 1;
diff --git a/net/ethtool/ioctl.c b/net/ethtool/ioctl.c
index 02a3454234d6..9431e305b233 100644
--- a/net/ethtool/ioctl.c
+++ b/net/ethtool/ioctl.c
@@ -27,13 +27,12 @@
 #include <linux/net.h>
 #include <linux/pm_runtime.h>
 #include <linux/utsname.h>
-#include <linux/ethtool_netlink.h>
 #include <net/devlink.h>
 #include <net/ipv6.h>
+#include <net/xdp_sock_drv.h>
 #include <net/flow_offload.h>
 #include <net/netdev_lock.h>
-#include <net/netdev_queues.h>
-
+#include <linux/ethtool_netlink.h>
 #include "common.h"
 
 /* State held across locks and calls for commands which have devlink fallback */
@@ -2283,12 +2282,12 @@ static noinline_for_stack int ethtool_set_channels(struct net_device *dev,
 	if (ret)
 		return ret;
 
-	/* Disabling channels, query busy queues (AF_XDP, queue leasing) */
+	/* Disabling channels, query zero-copy AF_XDP sockets */
 	from_channel = channels.combined_count +
 		min(channels.rx_count, channels.tx_count);
 	to_channel = curr.combined_count + max(curr.rx_count, curr.tx_count);
 	for (i = from_channel; i < to_channel; i++)
-		if (netdev_queue_busy(dev, i, NULL))
+		if (xsk_get_pool_from_qid(dev, i))
 			return -EINVAL;
 
 	ret = dev->ethtool_ops->set_channels(dev, &channels);
diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c
index 92f791433725..3b46bc635c43 100644
--- a/net/xdp/xsk.c
+++ b/net/xdp/xsk.c
@@ -23,8 +23,6 @@
 #include <linux/netdevice.h>
 #include <linux/rculist.h>
 #include <linux/vmalloc.h>
-
-#include <net/netdev_queues.h>
 #include <net/xdp_sock_drv.h>
 #include <net/busy_poll.h>
 #include <net/netdev_lock.h>
@@ -105,7 +103,7 @@ bool xsk_uses_need_wakeup(struct xsk_buff_pool *pool)
 }
 EXPORT_SYMBOL(xsk_uses_need_wakeup);
 
-struct xsk_buff_pool *xsk_get_pool_from_qid(const struct net_device *dev,
+struct xsk_buff_pool *xsk_get_pool_from_qid(struct net_device *dev,
 					    u16 queue_id)
 {
 	if (queue_id < dev->real_num_rx_queues)
@@ -119,18 +117,10 @@ EXPORT_SYMBOL(xsk_get_pool_from_qid);
 
 void xsk_clear_pool_at_qid(struct net_device *dev, u16 queue_id)
 {
-	struct net_device *orig_dev = dev;
-	unsigned int id = queue_id;
-
-	if (id < dev->real_num_rx_queues)
-		WARN_ON_ONCE(!netif_get_rx_queue_lease_locked(&dev, &id));
-
-	if (id < dev->real_num_rx_queues)
-		dev->_rx[id].pool = NULL;
-	if (id < dev->real_num_tx_queues)
-		dev->_tx[id].pool = NULL;
-
-	netif_put_rx_queue_lease_locked(orig_dev, dev);
+	if (queue_id < dev->num_rx_queues)
+		dev->_rx[queue_id].pool = NULL;
+	if (queue_id < dev->num_tx_queues)
+		dev->_tx[queue_id].pool = NULL;
 }
 
 /* The buffer pool is stored both in the _rx struct and the _tx struct as we do
@@ -140,29 +130,17 @@ void xsk_clear_pool_at_qid(struct net_device *dev, u16 queue_id)
 int xsk_reg_pool_at_qid(struct net_device *dev, struct xsk_buff_pool *pool,
 			u16 queue_id)
 {
-	struct net_device *orig_dev = dev;
-	unsigned int id = queue_id;
-	int ret = 0;
-
-	if (id >= max(dev->real_num_rx_queues,
-		      dev->real_num_tx_queues))
+	if (queue_id >= max_t(unsigned int,
+			      dev->real_num_rx_queues,
+			      dev->real_num_tx_queues))
 		return -EINVAL;
-	if (id < dev->real_num_rx_queues) {
-		if (!netif_get_rx_queue_lease_locked(&dev, &id))
-			return -EBUSY;
-		if (xsk_get_pool_from_qid(dev, id)) {
-			ret = -EBUSY;
-			goto out;
-		}
-	}
 
-	if (id < dev->real_num_rx_queues)
-		dev->_rx[id].pool = pool;
-	if (id < dev->real_num_tx_queues)
-		dev->_tx[id].pool = pool;
-out:
-	netif_put_rx_queue_lease_locked(orig_dev, dev);
-	return ret;
+	if (queue_id < dev->real_num_rx_queues)
+		dev->_rx[queue_id].pool = pool;
+	if (queue_id < dev->real_num_tx_queues)
+		dev->_tx[queue_id].pool = pool;
+
+	return 0;
 }
 
 static int __xsk_rcv_zc(struct xdp_sock *xs, struct xdp_buff_xsk *xskb, u32 len,
@@ -346,37 +324,14 @@ static bool xsk_is_bound(struct xdp_sock *xs)
 	return false;
 }
 
-static bool xsk_dev_queue_valid(const struct xdp_sock *xs,
-				const struct xdp_rxq_info *info)
-{
-	struct net_device *dev = xs->dev;
-	u32 queue_index = xs->queue_id;
-	struct netdev_rx_queue *rxq;
-
-	if (info->dev == dev &&
-	    info->queue_index == queue_index)
-		return true;
-
-	if (queue_index < dev->real_num_rx_queues) {
-		rxq = READ_ONCE(__netif_get_rx_queue(dev, queue_index)->lease);
-		if (!rxq)
-			return false;
-
-		dev = rxq->dev;
-		queue_index = get_netdev_rx_queue_index(rxq);
-
-		return info->dev == dev &&
-		       info->queue_index == queue_index;
-	}
-	return false;
-}
-
 static int xsk_rcv_check(struct xdp_sock *xs, struct xdp_buff *xdp, u32 len)
 {
 	if (!xsk_is_bound(xs))
 		return -ENXIO;
-	if (!xsk_dev_queue_valid(xs, xdp->rxq))
+
+	if (xs->dev != xdp->rxq->dev || xs->queue_id != xdp->rxq->queue_index)
 		return -EINVAL;
+
 	if (len > xsk_pool_get_rx_frame_size(xs->pool) && !xs->sg) {
 		xs->rx_dropped++;
 		return -ENOSPC;
diff --git a/tools/include/uapi/linux/netdev.h b/tools/include/uapi/linux/netdev.h
index 7df1056a35fd..e0b579a1df4f 100644
--- a/tools/include/uapi/linux/netdev.h
+++ b/tools/include/uapi/linux/netdev.h
@@ -160,7 +160,6 @@ enum {
 	NETDEV_A_QUEUE_DMABUF,
 	NETDEV_A_QUEUE_IO_URING,
 	NETDEV_A_QUEUE_XSK,
-	NETDEV_A_QUEUE_LEASE,
 
 	__NETDEV_A_QUEUE_MAX,
 	NETDEV_A_QUEUE_MAX = (__NETDEV_A_QUEUE_MAX - 1)
@@ -203,15 +202,6 @@ enum {
 	NETDEV_A_QSTATS_MAX = (__NETDEV_A_QSTATS_MAX - 1)
 };
 
-enum {
-	NETDEV_A_LEASE_IFINDEX = 1,
-	NETDEV_A_LEASE_QUEUE,
-	NETDEV_A_LEASE_NETNS_ID,
-
-	__NETDEV_A_LEASE_MAX,
-	NETDEV_A_LEASE_MAX = (__NETDEV_A_LEASE_MAX - 1)
-};
-
 enum {
 	NETDEV_A_DMABUF_IFINDEX = 1,
 	NETDEV_A_DMABUF_QUEUES,
@@ -238,7 +228,6 @@ enum {
 	NETDEV_CMD_BIND_RX,
 	NETDEV_CMD_NAPI_SET,
 	NETDEV_CMD_BIND_TX,
-	NETDEV_CMD_QUEUE_CREATE,
 
 	__NETDEV_CMD_MAX,
 	NETDEV_CMD_MAX = (__NETDEV_CMD_MAX - 1)
diff --git a/tools/testing/selftests/drivers/net/README.rst b/tools/testing/selftests/drivers/net/README.rst
index b94e81c2e030..eb838ae94844 100644
--- a/tools/testing/selftests/drivers/net/README.rst
+++ b/tools/testing/selftests/drivers/net/README.rst
@@ -62,13 +62,6 @@ LOCAL_V4, LOCAL_V6, REMOTE_V4, REMOTE_V6
 
 Local and remote endpoint IP addresses.
 
-LOCAL_PREFIX_V4, LOCAL_PREFIX_V6
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-Local IP prefix/subnet which can be used to allocate extra IP addresses (for
-network name spaces behind macvlan, veth, netkit devices). DUT must be
-reachable using these addresses from the endpoint.
-
 REMOTE_TYPE
 ~~~~~~~~~~~
 
diff --git a/tools/testing/selftests/drivers/net/hw/Makefile b/tools/testing/selftests/drivers/net/hw/Makefile
index 39ad86d693b3..9c163ba6feee 100644
--- a/tools/testing/selftests/drivers/net/hw/Makefile
+++ b/tools/testing/selftests/drivers/net/hw/Makefile
@@ -32,8 +32,6 @@ TEST_PROGS = \
 	irq.py \
 	loopback.sh \
 	nic_timestamp.py \
-	nk_netns.py \
-	nk_qlease.py \
 	pp_alloc_fail.py \
 	rss_api.py \
 	rss_ctx.py \
diff --git a/tools/testing/selftests/drivers/net/hw/lib/py/__init__.py b/tools/testing/selftests/drivers/net/hw/lib/py/__init__.py
index 022008249313..d5d247eca6b7 100644
--- a/tools/testing/selftests/drivers/net/hw/lib/py/__init__.py
+++ b/tools/testing/selftests/drivers/net/hw/lib/py/__init__.py
@@ -3,7 +3,6 @@
 """
 Driver test environment (hardware-only tests).
 NetDrvEnv and NetDrvEpEnv are the main environment classes.
-NetDrvContEnv extends NetDrvEpEnv with netkit container support.
 Former is for local host only tests, latter creates / connects
 to a remote endpoint. See NIPA wiki for more information about
 running and writing driver tests.
@@ -30,7 +29,7 @@ try:
     from net.lib.py import ksft_eq, ksft_ge, ksft_in, ksft_is, ksft_lt, \
         ksft_ne, ksft_not_in, ksft_raises, ksft_true, ksft_gt, ksft_not_none
     from drivers.net.lib.py import GenerateTraffic, Remote, Iperf3Runner
-    from drivers.net.lib.py import NetDrvEnv, NetDrvEpEnv, NetDrvContEnv
+    from drivers.net.lib.py import NetDrvEnv, NetDrvEpEnv
 
     __all__ = ["NetNS", "NetNSEnter", "NetdevSimDev",
                "EthtoolFamily", "NetdevFamily", "NetshaperFamily",
@@ -45,8 +44,8 @@ try:
                "ksft_eq", "ksft_ge", "ksft_in", "ksft_is", "ksft_lt",
                "ksft_ne", "ksft_not_in", "ksft_raises", "ksft_true", "ksft_gt",
                "ksft_not_none", "ksft_not_none",
-               "NetDrvEnv", "NetDrvEpEnv", "NetDrvContEnv", "GenerateTraffic",
-               "Remote", "Iperf3Runner"]
+               "NetDrvEnv", "NetDrvEpEnv", "GenerateTraffic", "Remote",
+               "Iperf3Runner"]
 except ModuleNotFoundError as e:
     print("Failed importing `net` library from kernel sources")
     print(str(e))
diff --git a/tools/testing/selftests/drivers/net/hw/nk_forward.bpf.c b/tools/testing/selftests/drivers/net/hw/nk_forward.bpf.c
deleted file mode 100644
index 86ebfc1445b6..000000000000
--- a/tools/testing/selftests/drivers/net/hw/nk_forward.bpf.c
+++ /dev/null
@@ -1,49 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-#include <linux/bpf.h>
-#include <linux/pkt_cls.h>
-#include <linux/if_ether.h>
-#include <linux/ipv6.h>
-#include <linux/in6.h>
-#include <bpf/bpf_endian.h>
-#include <bpf/bpf_helpers.h>
-
-#define TC_ACT_OK 0
-#define ETH_P_IPV6 0x86DD
-
-#define ctx_ptr(field)		((void *)(long)(field))
-
-#define v6_p64_equal(a, b)	(a.s6_addr32[0] == b.s6_addr32[0] && \
-				 a.s6_addr32[1] == b.s6_addr32[1])
-
-volatile __u32 netkit_ifindex;
-volatile __u8 ipv6_prefix[16];
-
-SEC("tc/ingress")
-int tc_redirect_peer(struct __sk_buff *skb)
-{
-	void *data_end = ctx_ptr(skb->data_end);
-	void *data = ctx_ptr(skb->data);
-	struct in6_addr *peer_addr;
-	struct ipv6hdr *ip6h;
-	struct ethhdr *eth;
-
-	peer_addr = (struct in6_addr *)ipv6_prefix;
-
-	if (skb->protocol != bpf_htons(ETH_P_IPV6))
-		return TC_ACT_OK;
-
-	eth = data;
-	if ((void *)(eth + 1) > data_end)
-		return TC_ACT_OK;
-
-	ip6h = data + sizeof(struct ethhdr);
-	if ((void *)(ip6h + 1) > data_end)
-		return TC_ACT_OK;
-
-	if (!v6_p64_equal(ip6h->daddr, (*peer_addr)))
-		return TC_ACT_OK;
-
-	return bpf_redirect_peer(netkit_ifindex, 0);
-}
-
-char __license[] SEC("license") = "GPL";
diff --git a/tools/testing/selftests/drivers/net/hw/nk_netns.py b/tools/testing/selftests/drivers/net/hw/nk_netns.py
deleted file mode 100755
index afa8638195d8..000000000000
--- a/tools/testing/selftests/drivers/net/hw/nk_netns.py
+++ /dev/null
@@ -1,23 +0,0 @@
-#!/usr/bin/env python3
-# SPDX-License-Identifier: GPL-2.0
-
-from lib.py import ksft_run, ksft_exit
-from lib.py import NetDrvContEnv
-from lib.py import cmd
-
-
-def test_ping(cfg) -> None:
-    cfg.require_ipver("6")
-
-    cmd(f"ping -c 1 -W5 {cfg.nk_guest_ipv6}", host=cfg.remote)
-    cmd(f"ping -c 1 -W5 {cfg.remote_addr_v['6']}", ns=cfg.netns)
-
-
-def main() -> None:
-    with NetDrvContEnv(__file__) as cfg:
-        ksft_run([test_ping], args=(cfg,))
-    ksft_exit()
-
-
-if __name__ == "__main__":
-    main()
diff --git a/tools/testing/selftests/drivers/net/hw/nk_qlease.py b/tools/testing/selftests/drivers/net/hw/nk_qlease.py
deleted file mode 100755
index 738a46d2d20c..000000000000
--- a/tools/testing/selftests/drivers/net/hw/nk_qlease.py
+++ /dev/null
@@ -1,55 +0,0 @@
-#!/usr/bin/env python3
-# SPDX-License-Identifier: GPL-2.0
-
-import re
-from os import path
-from lib.py import ksft_run, ksft_exit
-from lib.py import NetDrvContEnv
-from lib.py import bkg, cmd, defer, ethtool, rand_port, wait_port_listen
-
-
-def create_rss_ctx(cfg):
-    output = ethtool(f"-X {cfg.ifname} context new start {cfg.src_queue} equal 1").stdout
-    values = re.search(r'New RSS context is (\d+)', output).group(1)
-    return int(values)
-
-
-def set_flow_rule(cfg):
-    output = ethtool(f"-N {cfg.ifname} flow-type tcp6 dst-port {cfg.port} action {cfg.src_queue}").stdout
-    values = re.search(r'ID (\d+)', output).group(1)
-    return int(values)
-
-
-def set_flow_rule_rss(cfg, rss_ctx_id):
-    output = ethtool(f"-N {cfg.ifname} flow-type tcp6 dst-port {cfg.port} context {rss_ctx_id}").stdout
-    values = re.search(r'ID (\d+)', output).group(1)
-    return int(values)
-
-
-def test_iou_zcrx(cfg) -> None:
-    cfg.require_ipver('6')
-
-    ethtool(f"-X {cfg.ifname} equal {cfg.src_queue}")
-    defer(ethtool, f"-X {cfg.ifname} default")
-
-    flow_rule_id = set_flow_rule(cfg)
-    defer(ethtool, f"-N {cfg.ifname} delete {flow_rule_id}")
-
-    rx_cmd = f"ip netns exec {cfg.netns.name} {cfg.bin_local} -s -p {cfg.port} -i {cfg._nk_guest_ifname} -q {cfg.nk_queue}"
-    tx_cmd = f"{cfg.bin_remote} -c -h {cfg.nk_guest_ipv6} -p {cfg.port} -l 12840"
-    with bkg(rx_cmd, exit_wait=True):
-        wait_port_listen(cfg.port, proto="tcp", ns=cfg.netns)
-        cmd(tx_cmd, host=cfg.remote)
-
-
-def main() -> None:
-    with NetDrvContEnv(__file__, lease=True) as cfg:
-        cfg.bin_local = path.abspath(path.dirname(__file__) + "/../../../drivers/net/hw/iou-zcrx")
-        cfg.bin_remote = cfg.remote.deploy(cfg.bin_local)
-        cfg.port = rand_port()
-        ksft_run([test_iou_zcrx], args=(cfg,))
-    ksft_exit()
-
-
-if __name__ == "__main__":
-    main()
diff --git a/tools/testing/selftests/drivers/net/lib/py/__init__.py b/tools/testing/selftests/drivers/net/lib/py/__init__.py
index be3a8a936882..8b75faa9af6d 100644
--- a/tools/testing/selftests/drivers/net/lib/py/__init__.py
+++ b/tools/testing/selftests/drivers/net/lib/py/__init__.py
@@ -3,7 +3,6 @@
 """
 Driver test environment.
 NetDrvEnv and NetDrvEpEnv are the main environment classes.
-NetDrvContEnv extends NetDrvEpEnv with netkit container support.
 Former is for local host only tests, latter creates / connects
 to a remote endpoint. See NIPA wiki for more information about
 running and writing driver tests.
@@ -44,12 +43,12 @@ try:
                "ksft_ne", "ksft_not_in", "ksft_raises", "ksft_true", "ksft_gt",
                "ksft_not_none", "ksft_not_none"]
 
-    from .env import NetDrvEnv, NetDrvEpEnv, NetDrvContEnv
+    from .env import NetDrvEnv, NetDrvEpEnv
     from .load import GenerateTraffic, Iperf3Runner
     from .remote import Remote
 
-    __all__ += ["NetDrvEnv", "NetDrvEpEnv", "NetDrvContEnv", "GenerateTraffic",
-                "Remote", "Iperf3Runner"]
+    __all__ += ["NetDrvEnv", "NetDrvEpEnv", "GenerateTraffic", "Remote",
+                "Iperf3Runner"]
 except ModuleNotFoundError as e:
     print("Failed importing `net` library from kernel sources")
     print(str(e))
diff --git a/tools/testing/selftests/drivers/net/lib/py/env.py b/tools/testing/selftests/drivers/net/lib/py/env.py
index 7066d78395c6..41cc248ac848 100644
--- a/tools/testing/selftests/drivers/net/lib/py/env.py
+++ b/tools/testing/selftests/drivers/net/lib/py/env.py
@@ -1,17 +1,13 @@
 # SPDX-License-Identifier: GPL-2.0
 
-import ipaddress
 import os
-import re
 import time
 from pathlib import Path
 from lib.py import KsftSkipEx, KsftXfailEx
 from lib.py import ksft_setup, wait_file
 from lib.py import cmd, ethtool, ip, CmdExitFailure
 from lib.py import NetNS, NetdevSimDev
-from lib.py import NetdevFamily, EthtoolFamily
 from .remote import Remote
-from . import bpftool
 
 
 class NetDrvEnvBase:
@@ -293,156 +289,3 @@ class NetDrvEpEnv(NetDrvEnvBase):
                 data.get('stats-block-usecs', 0) / 1000 / 1000
 
         time.sleep(self._stats_settle_time)
-
-
-class NetDrvContEnv(NetDrvEpEnv):
-    """
-    Class for an environment with a netkit pair setup for forwarding traffic
-    between the physical interface and a network namespace.
-    """
-
-    def __init__(self, src_path, lease=False, **kwargs):
-        super().__init__(src_path, **kwargs)
-
-        self.require_ipver("6")
-        local_prefix = self.env.get("LOCAL_PREFIX_V6")
-        if not local_prefix:
-            raise KsftSkipEx("LOCAL_PREFIX_V6 required")
-
-        self.netdevnl = NetdevFamily()
-        self.ethnl = EthtoolFamily()
-
-        local_prefix = local_prefix.rstrip("/64").rstrip("::").rstrip(":")
-        self.ipv6_prefix = f"{local_prefix}::"
-        self.nk_host_ipv6 = f"{local_prefix}::2:1"
-        self.nk_guest_ipv6 = f"{local_prefix}::2:2"
-
-        self.netns = None
-        self._nk_host_ifname = None
-        self._nk_guest_ifname = None
-        self._tc_attached = False
-        self._bpf_prog_pref = None
-        self._bpf_prog_id = None
-        self._leased = False
-
-        nk_rxqueues = 1
-        if lease:
-            nk_rxqueues = 2
-        ip(f"link add type netkit mode l2 forward peer forward numrxqueues {nk_rxqueues}")
-
-        all_links = ip("-d link show", json=True)
-        netkit_links = [link for link in all_links
-                        if link.get('linkinfo', {}).get('info_kind') == 'netkit'
-                        and 'UP' not in link.get('flags', [])]
-
-        if len(netkit_links) != 2:
-            raise KsftSkipEx("Failed to create netkit pair")
-
-        netkit_links.sort(key=lambda x: x['ifindex'])
-        self._nk_host_ifname = netkit_links[1]['ifname']
-        self._nk_guest_ifname = netkit_links[0]['ifname']
-        self.nk_host_ifindex = netkit_links[1]['ifindex']
-        self.nk_guest_ifindex = netkit_links[0]['ifindex']
-
-        if lease:
-            self._lease_queues()
-
-        self._setup_ns()
-        self._attach_bpf()
-
-    def __del__(self):
-        if self._tc_attached:
-            cmd(f"tc filter del dev {self.ifname} ingress pref {self._bpf_prog_pref}")
-            self._tc_attached = False
-
-        if self._nk_host_ifname:
-            cmd(f"ip link del dev {self._nk_host_ifname}")
-            self._nk_host_ifname = None
-            self._nk_guest_ifname = None
-
-        if self.netns:
-            del self.netns
-            self.netns = None
-
-        if self._leased:
-            self.ethnl.rings_set({'header': {'dev-index': self.ifindex},
-                                  'tcp-data-split': 'unknown',
-                                  'hds-thresh': self._hds_thresh,
-                                  'rx': self._rx_rings})
-            self._leased = False
-
-        super().__del__()
-
-    def _lease_queues(self):
-        channels = self.ethnl.channels_get({'header': {'dev-index': self.ifindex}})
-        channels = channels['combined-count']
-        if channels < 2:
-            raise KsftSkipEx('Test requires NETIF with at least 2 combined channels')
-
-        rings = self.ethnl.rings_get({'header': {'dev-index': self.ifindex}})
-        self._rx_rings = rings['rx']
-        self._hds_thresh = rings.get('hds-thresh', 0)
-        self.ethnl.rings_set({'header': {'dev-index': self.ifindex},
-                            'tcp-data-split': 'enabled',
-                            'hds-thresh': 0,
-                            'rx': 64})
-        self.src_queue = channels - 1
-        bind_result = self.netdevnl.queue_create(
-            {
-                "ifindex": self.nk_guest_ifindex,
-                "type": "rx",
-                "lease": {
-                    "ifindex": self.ifindex,
-                    "queue": {"id": self.src_queue, "type": "rx"},
-                },
-            }
-        )
-        self.nk_queue = bind_result['id']
-        self._leased = True
-
-    def _setup_ns(self):
-        self.netns = NetNS()
-        ip(f"link set dev {self._nk_guest_ifname} netns {self.netns.name}")
-        ip(f"link set dev {self._nk_host_ifname} up")
-        ip(f"-6 addr add fe80::1/64 dev {self._nk_host_ifname} nodad")
-        ip(f"-6 route add {self.nk_guest_ipv6}/128 via fe80::2 dev {self._nk_host_ifname}")
-
-        ip("link set lo up", ns=self.netns)
-        ip(f"link set dev {self._nk_guest_ifname} up", ns=self.netns)
-        ip(f"-6 addr add fe80::2/64 dev {self._nk_guest_ifname}", ns=self.netns)
-        ip(f"-6 addr add {self.nk_guest_ipv6}/64 dev {self._nk_guest_ifname} nodad", ns=self.netns)
-        ip(f"-6 route add default via fe80::1 dev {self._nk_guest_ifname}", ns=self.netns)
-
-    def _attach_bpf(self):
-        bpf_obj = self.test_dir / "nk_forward.bpf.o"
-        if not bpf_obj.exists():
-            raise KsftSkipEx("BPF prog not found")
-
-        cmd(f"tc filter add dev {self.ifname} ingress bpf obj {bpf_obj} sec tc/ingress direct-action")
-        self._tc_attached = True
-
-        tc_info = cmd(f"tc filter show dev {self.ifname} ingress").stdout
-        match = re.search(r'pref (\d+).*nk_forward\.bpf.*id (\d+)', tc_info)
-        if not match:
-            raise Exception("Failed to get BPF prog ID")
-        self._bpf_prog_pref = int(match.group(1))
-        self._bpf_prog_id = int(match.group(2))
-
-        prog_info = bpftool(f"prog show id {self._bpf_prog_id}", json=True)
-        map_ids = prog_info.get("map_ids", [])
-
-        bss_map_id = None
-        for map_id in map_ids:
-            map_info = bpftool(f"map show id {map_id}", json=True)
-            if map_info.get("name").endswith("bss"):
-                bss_map_id = map_id
-
-        if bss_map_id is None:
-            raise Exception("Failed to find .bss map")
-
-        ipv6_addr = ipaddress.IPv6Address(self.ipv6_prefix)
-        ipv6_bytes = ipv6_addr.packed
-        ifindex_bytes = self.nk_host_ifindex.to_bytes(4, byteorder='little')
-        value = ipv6_bytes + ifindex_bytes
-        value_hex = ' '.join(f'{b:02x}' for b in value)
-        bpftool(f"map update id {bss_map_id} key hex 00 00 00 00 value hex {value_hex}")
-- 
cgit v1.2.3


From 64dd89ae01f2708a508e028c28b7906e4702a9a7 Mon Sep 17 00:00:00 2001
From: Johannes Weiner <hannes@cmpxchg.org>
Date: Mon, 15 Dec 2025 12:57:53 -0500
Subject: mm/block/fs: remove laptop_mode

Laptop mode was introduced to save battery by delaying and consolidating
writes, thereby maximizing the time rotating hard drives wouldn't have to
spin.

Luckily, rotating hard drives, with their high spin-up times and power
draw, are a thing of the past for battery-powered devices.  Reclaim has
also since changed to not write single filesystem pages anymore, and
regular filesystem writeback is lumpy by design.

The juice doesn't appear worth the squeeze anymore.  The footprint of the
feature is small, but nevertheless it's a complicating factor in mm,
block, and filesystems.  Developers don't think about it, and it likely
hasn't been tested with new reclaim and writeback changes in years.

Let's sunset it.  Keep the sysctl with a deprecation warning around for a
few more cycles, but remove all functionality behind it.

[akpm@linux-foundation.org: fix Documentation/admin-guide/laptops/index.rst]
Link: https://lkml.kernel.org/r/20251216185201.GH905277@cmpxchg.org
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Suggested-by: Christoph Hellwig <hch@infradead.org>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Acked-by: Jens Axboe <axboe@kernel.dk>
Reviewed-by: Shakeel Butt <shakeel.butt@linux.dev>
Acked-by: Michal Hocko <mhocko@suse.com>
Cc: Deepanshu Kartikey <kartikey406@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 Documentation/admin-guide/laptops/index.rst       |   1 -
 Documentation/admin-guide/laptops/laptop-mode.rst | 770 ----------------------
 Documentation/admin-guide/sysctl/vm.rst           |   8 -
 block/blk-mq.c                                    |   3 -
 fs/ext4/inode.c                                   |   3 +-
 fs/sync.c                                         |   2 -
 fs/xfs/xfs_super.c                                |   9 -
 include/linux/backing-dev-defs.h                  |   3 -
 include/linux/writeback.h                         |   4 -
 include/trace/events/writeback.h                  |   1 -
 include/uapi/linux/sysctl.h                       |   2 +-
 mm/backing-dev.c                                  |   3 -
 mm/page-writeback.c                               |  74 +--
 mm/vmscan.c                                       |  30 +-
 14 files changed, 25 insertions(+), 888 deletions(-)
 delete mode 100644 Documentation/admin-guide/laptops/laptop-mode.rst

(limited to 'include/uapi/linux')

diff --git a/Documentation/admin-guide/laptops/index.rst b/Documentation/admin-guide/laptops/index.rst
index 6432c251dc95..c0b911d05c59 100644
--- a/Documentation/admin-guide/laptops/index.rst
+++ b/Documentation/admin-guide/laptops/index.rst
@@ -10,7 +10,6 @@ Laptop Drivers
    alienware-wmi
    asus-laptop
    disk-shock-protection
-   laptop-mode
    lg-laptop
    samsung-galaxybook
    sony-laptop
diff --git a/Documentation/admin-guide/laptops/laptop-mode.rst b/Documentation/admin-guide/laptops/laptop-mode.rst
deleted file mode 100644
index 66eb9cd918b5..000000000000
--- a/Documentation/admin-guide/laptops/laptop-mode.rst
+++ /dev/null
@@ -1,770 +0,0 @@
-===============================================
-How to conserve battery power using laptop-mode
-===============================================
-
-Document Author: Bart Samwel (bart@samwel.tk)
-
-Date created: January 2, 2004
-
-Last modified: December 06, 2004
-
-Introduction
-------------
-
-Laptop mode is used to minimize the time that the hard disk needs to be spun up,
-to conserve battery power on laptops. It has been reported to cause significant
-power savings.
-
-.. Contents
-
-   * Introduction
-   * Installation
-   * Caveats
-   * The Details
-   * Tips & Tricks
-   * Control script
-   * ACPI integration
-   * Monitoring tool
-
-
-Installation
-------------
-
-To use laptop mode, you don't need to set any kernel configuration options
-or anything. Simply install all the files included in this document, and
-laptop mode will automatically be started when you're on battery. For
-your convenience, a tarball containing an installer can be downloaded at:
-
-	http://www.samwel.tk/laptop_mode/laptop_mode/
-
-To configure laptop mode, you need to edit the configuration file, which is
-located in /etc/default/laptop-mode on Debian-based systems, or in
-/etc/sysconfig/laptop-mode on other systems.
-
-Unfortunately, automatic enabling of laptop mode does not work for
-laptops that don't have ACPI. On those laptops, you need to start laptop
-mode manually. To start laptop mode, run "laptop_mode start", and to
-stop it, run "laptop_mode stop". (Note: The laptop mode tools package now
-has experimental support for APM, you might want to try that first.)
-
-
-Caveats
--------
-
-* The downside of laptop mode is that you have a chance of losing up to 10
-  minutes of work. If you cannot afford this, don't use it! The supplied ACPI
-  scripts automatically turn off laptop mode when the battery almost runs out,
-  so that you won't lose any data at the end of your battery life.
-
-* Most desktop hard drives have a very limited lifetime measured in spindown
-  cycles, typically about 50.000 times (it's usually listed on the spec sheet).
-  Check your drive's rating, and don't wear down your drive's lifetime if you
-  don't need to.
-
-* If you mount some of your ext3 filesystems with the -n option, then
-  the control script will not be able to remount them correctly. You must set
-  DO_REMOUNTS=0 in the control script, otherwise it will remount them with the
-  wrong options -- or it will fail because it cannot write to /etc/mtab.
-
-* If you have your filesystems listed as type "auto" in fstab, like I did, then
-  the control script will not recognize them as filesystems that need remounting.
-  You must list the filesystems with their true type instead.
-
-* It has been reported that some versions of the mutt mail client use file access
-  times to determine whether a folder contains new mail. If you use mutt and
-  experience this, you must disable the noatime remounting by setting the option
-  DO_REMOUNT_NOATIME to 0 in the configuration file.
-
-
-The Details
------------
-
-Laptop mode is controlled by the knob /proc/sys/vm/laptop_mode. This knob is
-present for all kernels that have the laptop mode patch, regardless of any
-configuration options. When the knob is set, any physical disk I/O (that might
-have caused the hard disk to spin up) causes Linux to flush all dirty blocks. The
-result of this is that after a disk has spun down, it will not be spun up
-anymore to write dirty blocks, because those blocks had already been written
-immediately after the most recent read operation. The value of the laptop_mode
-knob determines the time between the occurrence of disk I/O and when the flush
-is triggered. A sensible value for the knob is 5 seconds. Setting the knob to
-0 disables laptop mode.
-
-To increase the effectiveness of the laptop_mode strategy, the laptop_mode
-control script increases dirty_expire_centisecs and dirty_writeback_centisecs in
-/proc/sys/vm to about 10 minutes (by default), which means that pages that are
-dirtied are not forced to be written to disk as often. The control script also
-changes the dirty background ratio, so that background writeback of dirty pages
-is not done anymore. Combined with a higher commit value (also 10 minutes) for
-ext3 filesystem (also done automatically by the control script),
-this results in concentration of disk activity in a small time interval which
-occurs only once every 10 minutes, or whenever the disk is forced to spin up by
-a cache miss. The disk can then be spun down in the periods of inactivity.
-
-
-Configuration
--------------
-
-The laptop mode configuration file is located in /etc/default/laptop-mode on
-Debian-based systems, or in /etc/sysconfig/laptop-mode on other systems. It
-contains the following options:
-
-MAX_AGE:
-
-Maximum time, in seconds, of hard drive spindown time that you are
-comfortable with. Worst case, it's possible that you could lose this
-amount of work if your battery fails while you're in laptop mode.
-
-MINIMUM_BATTERY_MINUTES:
-
-Automatically disable laptop mode if the remaining number of minutes of
-battery power is less than this value. Default is 10 minutes.
-
-AC_HD/BATT_HD:
-
-The idle timeout that should be set on your hard drive when laptop mode
-is active (BATT_HD) and when it is not active (AC_HD). The defaults are
-20 seconds (value 4) for BATT_HD  and 2 hours (value 244) for AC_HD. The
-possible values are those listed in the manual page for "hdparm" for the
-"-S" option.
-
-HD:
-
-The devices for which the spindown timeout should be adjusted by laptop mode.
-Default is /dev/hda. If you specify multiple devices, separate them by a space.
-
-READAHEAD:
-
-Disk readahead, in 512-byte sectors, while laptop mode is active. A large
-readahead can prevent disk accesses for things like executable pages (which are
-loaded on demand while the application executes) and sequentially accessed data
-(MP3s).
-
-DO_REMOUNTS:
-
-The control script automatically remounts any mounted journaled filesystems
-with appropriate commit interval options. When this option is set to 0, this
-feature is disabled.
-
-DO_REMOUNT_NOATIME:
-
-When remounting, should the filesystems be remounted with the noatime option?
-Normally, this is set to "1" (enabled), but there may be programs that require
-access time recording.
-
-DIRTY_RATIO:
-
-The percentage of memory that is allowed to contain "dirty" or unsaved data
-before a writeback is forced, while laptop mode is active. Corresponds to
-the /proc/sys/vm/dirty_ratio sysctl.
-
-DIRTY_BACKGROUND_RATIO:
-
-The percentage of memory that is allowed to contain "dirty" or unsaved data
-after a forced writeback is done due to an exceeding of DIRTY_RATIO. Set
-this nice and low. This corresponds to the /proc/sys/vm/dirty_background_ratio
-sysctl.
-
-Note that the behaviour of dirty_background_ratio is quite different
-when laptop mode is active and when it isn't. When laptop mode is inactive,
-dirty_background_ratio is the threshold percentage at which background writeouts
-start taking place. When laptop mode is active, however, background writeouts
-are disabled, and the dirty_background_ratio only determines how much writeback
-is done when dirty_ratio is reached.
-
-DO_CPU:
-
-Enable CPU frequency scaling when in laptop mode. (Requires CPUFreq to be setup.
-See Documentation/admin-guide/pm/cpufreq.rst for more info. Disabled by default.)
-
-CPU_MAXFREQ:
-
-When on battery, what is the maximum CPU speed that the system should use? Legal
-values are "slowest" for the slowest speed that your CPU is able to operate at,
-or a value listed in /sys/devices/system/cpu/cpu0/cpufreq/scaling_available_frequencies.
-
-
-Tips & Tricks
--------------
-
-* Bartek Kania reports getting up to 50 minutes of extra battery life (on top
-  of his regular 3 to 3.5 hours) using a spindown time of 5 seconds (BATT_HD=1).
-
-* You can spin down the disk while playing MP3, by setting disk readahead
-  to 8MB (READAHEAD=16384). Effectively, the disk will read a complete MP3 at
-  once, and will then spin down while the MP3 is playing. (Thanks to Bartek
-  Kania.)
-
-* Drew Scott Daniels observed: "I don't know why, but when I decrease the number
-  of colours that my display uses it consumes less battery power. I've seen
-  this on powerbooks too. I hope that this is a piece of information that
-  might be useful to the Laptop Mode patch or its users."
-
-* In syslog.conf, you can prefix entries with a dash `-` to omit syncing the
-  file after every logging. When you're using laptop-mode and your disk doesn't
-  spin down, this is a likely culprit.
-
-* Richard Atterer observed that laptop mode does not work well with noflushd
-  (http://noflushd.sourceforge.net/), it seems that noflushd prevents laptop-mode
-  from doing its thing.
-
-* If you're worried about your data, you might want to consider using a USB
-  memory stick or something like that as a "working area". (Be aware though
-  that flash memory can only handle a limited number of writes, and overuse
-  may wear out your memory stick pretty quickly. Do _not_ use journalling
-  filesystems on flash memory sticks.)
-
-
-Configuration file for control and ACPI battery scripts
--------------------------------------------------------
-
-This allows the tunables to be changed for the scripts via an external
-configuration file
-
-It should be installed as /etc/default/laptop-mode on Debian, and as
-/etc/sysconfig/laptop-mode on Red Hat, SUSE, Mandrake, and other work-alikes.
-
-Config file::
-
-  # Maximum time, in seconds, of hard drive spindown time that you are
-  # comfortable with. Worst case, it's possible that you could lose this
-  # amount of work if your battery fails you while in laptop mode.
-  #MAX_AGE=600
-
-  # Automatically disable laptop mode when the number of minutes of battery
-  # that you have left goes below this threshold.
-  MINIMUM_BATTERY_MINUTES=10
-
-  # Read-ahead, in 512-byte sectors. You can spin down the disk while playing MP3/OGG
-  # by setting the disk readahead to 8MB (READAHEAD=16384). Effectively, the disk
-  # will read a complete MP3 at once, and will then spin down while the MP3/OGG is
-  # playing.
-  #READAHEAD=4096
-
-  # Shall we remount journaled fs. with appropriate commit interval? (1=yes)
-  #DO_REMOUNTS=1
-
-  # And shall we add the "noatime" option to that as well? (1=yes)
-  #DO_REMOUNT_NOATIME=1
-
-  # Dirty synchronous ratio.  At this percentage of dirty pages the process
-  # which
-  # calls write() does its own writeback
-  #DIRTY_RATIO=40
-
-  #
-  # Allowed dirty background ratio, in percent.  Once DIRTY_RATIO has been
-  # exceeded, the kernel will wake flusher threads which will then reduce the
-  # amount of dirty memory to dirty_background_ratio.  Set this nice and low,
-  # so once some writeout has commenced, we do a lot of it.
-  #
-  #DIRTY_BACKGROUND_RATIO=5
-
-  # kernel default dirty buffer age
-  #DEF_AGE=30
-  #DEF_UPDATE=5
-  #DEF_DIRTY_BACKGROUND_RATIO=10
-  #DEF_DIRTY_RATIO=40
-  #DEF_XFS_AGE_BUFFER=15
-  #DEF_XFS_SYNC_INTERVAL=30
-  #DEF_XFS_BUFD_INTERVAL=1
-
-  # This must be adjusted manually to the value of HZ in the running kernel
-  # on 2.4, until the XFS people change their 2.4 external interfaces to work in
-  # centisecs. This can be automated, but it's a work in progress that still
-  # needs# some fixes. On 2.6 kernels, XFS uses USER_HZ instead of HZ for
-  # external interfaces, and that is currently always set to 100. So you don't
-  # need to change this on 2.6.
-  #XFS_HZ=100
-
-  # Should the maximum CPU frequency be adjusted down while on battery?
-  # Requires CPUFreq to be setup.
-  # See Documentation/admin-guide/pm/cpufreq.rst for more info
-  #DO_CPU=0
-
-  # When on battery what is the maximum CPU speed that the system should
-  # use? Legal values are "slowest" for the slowest speed that your
-  # CPU is able to operate at, or a value listed in:
-  # /sys/devices/system/cpu/cpu0/cpufreq/scaling_available_frequencies
-  # Only applicable if DO_CPU=1.
-  #CPU_MAXFREQ=slowest
-
-  # Idle timeout for your hard drive (man hdparm for valid values, -S option)
-  # Default is 2 hours on AC (AC_HD=244) and 20 seconds for battery (BATT_HD=4).
-  #AC_HD=244
-  #BATT_HD=4
-
-  # The drives for which to adjust the idle timeout. Separate them by a space,
-  # e.g. HD="/dev/hda /dev/hdb".
-  #HD="/dev/hda"
-
-  # Set the spindown timeout on a hard drive?
-  #DO_HD=1
-
-
-Control script
---------------
-
-Please note that this control script works for the Linux 2.4 and 2.6 series (thanks
-to Kiko Piris).
-
-Control script::
-
-  #!/bin/bash
-
-  # start or stop laptop_mode, best run by a power management daemon when
-  # ac gets connected/disconnected from a laptop
-  #
-  # install as /sbin/laptop_mode
-  #
-  # Contributors to this script:   Kiko Piris
-  #				 Bart Samwel
-  #				 Micha Feigin
-  #				 Andrew Morton
-  #				 Herve Eychenne
-  #				 Dax Kelson
-  #
-  # Original Linux 2.4 version by: Jens Axboe
-
-  #############################################################################
-
-  # Source config
-  if [ -f /etc/default/laptop-mode ] ; then
-	# Debian
-	. /etc/default/laptop-mode
-  elif [ -f /etc/sysconfig/laptop-mode ] ; then
-	# Others
-          . /etc/sysconfig/laptop-mode
-  fi
-
-  # Don't raise an error if the config file is incomplete
-  # set defaults instead:
-
-  # Maximum time, in seconds, of hard drive spindown time that you are
-  # comfortable with. Worst case, it's possible that you could lose this
-  # amount of work if your battery fails you while in laptop mode.
-  MAX_AGE=${MAX_AGE:-'600'}
-
-  # Read-ahead, in kilobytes
-  READAHEAD=${READAHEAD:-'4096'}
-
-  # Shall we remount journaled fs. with appropriate commit interval? (1=yes)
-  DO_REMOUNTS=${DO_REMOUNTS:-'1'}
-
-  # And shall we add the "noatime" option to that as well? (1=yes)
-  DO_REMOUNT_NOATIME=${DO_REMOUNT_NOATIME:-'1'}
-
-  # Shall we adjust the idle timeout on a hard drive?
-  DO_HD=${DO_HD:-'1'}
-
-  # Adjust idle timeout on which hard drive?
-  HD="${HD:-'/dev/hda'}"
-
-  # spindown time for HD (hdparm -S values)
-  AC_HD=${AC_HD:-'244'}
-  BATT_HD=${BATT_HD:-'4'}
-
-  # Dirty synchronous ratio.  At this percentage of dirty pages the process which
-  # calls write() does its own writeback
-  DIRTY_RATIO=${DIRTY_RATIO:-'40'}
-
-  # cpu frequency scaling
-  # See Documentation/admin-guide/pm/cpufreq.rst for more info
-  DO_CPU=${CPU_MANAGE:-'0'}
-  CPU_MAXFREQ=${CPU_MAXFREQ:-'slowest'}
-
-  #
-  # Allowed dirty background ratio, in percent.  Once DIRTY_RATIO has been
-  # exceeded, the kernel will wake flusher threads which will then reduce the
-  # amount of dirty memory to dirty_background_ratio.  Set this nice and low,
-  # so once some writeout has commenced, we do a lot of it.
-  #
-  DIRTY_BACKGROUND_RATIO=${DIRTY_BACKGROUND_RATIO:-'5'}
-
-  # kernel default dirty buffer age
-  DEF_AGE=${DEF_AGE:-'30'}
-  DEF_UPDATE=${DEF_UPDATE:-'5'}
-  DEF_DIRTY_BACKGROUND_RATIO=${DEF_DIRTY_BACKGROUND_RATIO:-'10'}
-  DEF_DIRTY_RATIO=${DEF_DIRTY_RATIO:-'40'}
-  DEF_XFS_AGE_BUFFER=${DEF_XFS_AGE_BUFFER:-'15'}
-  DEF_XFS_SYNC_INTERVAL=${DEF_XFS_SYNC_INTERVAL:-'30'}
-  DEF_XFS_BUFD_INTERVAL=${DEF_XFS_BUFD_INTERVAL:-'1'}
-
-  # This must be adjusted manually to the value of HZ in the running kernel
-  # on 2.4, until the XFS people change their 2.4 external interfaces to work in
-  # centisecs. This can be automated, but it's a work in progress that still needs
-  # some fixes. On 2.6 kernels, XFS uses USER_HZ instead of HZ for external
-  # interfaces, and that is currently always set to 100. So you don't need to
-  # change this on 2.6.
-  XFS_HZ=${XFS_HZ:-'100'}
-
-  #############################################################################
-
-  KLEVEL="$(uname -r |
-               {
-	       IFS='.' read a b c
-	       echo $a.$b
-	     }
-  )"
-  case "$KLEVEL" in
-	"2.4"|"2.6")
-		;;
-	*)
-		echo "Unhandled kernel version: $KLEVEL ('uname -r' = '$(uname -r)')" >&2
-		exit 1
-		;;
-  esac
-
-  if [ ! -e /proc/sys/vm/laptop_mode ] ; then
-	echo "Kernel is not patched with laptop_mode patch." >&2
-	exit 1
-  fi
-
-  if [ ! -w /proc/sys/vm/laptop_mode ] ; then
-	echo "You do not have enough privileges to enable laptop_mode." >&2
-	exit 1
-  fi
-
-  # Remove an option (the first parameter) of the form option=<number> from
-  # a mount options string (the rest of the parameters).
-  parse_mount_opts () {
-	OPT="$1"
-	shift
-	echo ",$*," | sed		\
-	 -e 's/,'"$OPT"'=[0-9]*,/,/g'	\
-	 -e 's/,,*/,/g'			\
-	 -e 's/^,//'			\
-	 -e 's/,$//'
-  }
-
-  # Remove an option (the first parameter) without any arguments from
-  # a mount option string (the rest of the parameters).
-  parse_nonumber_mount_opts () {
-	OPT="$1"
-	shift
-	echo ",$*," | sed		\
-	 -e 's/,'"$OPT"',/,/g'		\
-	 -e 's/,,*/,/g'			\
-	 -e 's/^,//'			\
-	 -e 's/,$//'
-  }
-
-  # Find out the state of a yes/no option (e.g. "atime"/"noatime") in
-  # fstab for a given filesystem, and use this state to replace the
-  # value of the option in another mount options string. The device
-  # is the first argument, the option name the second, and the default
-  # value the third. The remainder is the mount options string.
-  #
-  # Example:
-  # parse_yesno_opts_wfstab /dev/hda1 atime atime defaults,noatime
-  #
-  # If fstab contains, say, "rw" for this filesystem, then the result
-  # will be "defaults,atime".
-  parse_yesno_opts_wfstab () {
-	L_DEV="$1"
-	OPT="$2"
-	DEF_OPT="$3"
-	shift 3
-	L_OPTS="$*"
-	PARSEDOPTS1="$(parse_nonumber_mount_opts $OPT $L_OPTS)"
-	PARSEDOPTS1="$(parse_nonumber_mount_opts no$OPT $PARSEDOPTS1)"
-	# Watch for a default atime in fstab
-	FSTAB_OPTS="$(awk '$1 == "'$L_DEV'" { print $4 }' /etc/fstab)"
-	if echo "$FSTAB_OPTS" | grep "$OPT" > /dev/null ; then
-		# option specified in fstab: extract the value and use it
-		if echo "$FSTAB_OPTS" | grep "no$OPT" > /dev/null ; then
-			echo "$PARSEDOPTS1,no$OPT"
-		else
-			# no$OPT not found -- so we must have $OPT.
-			echo "$PARSEDOPTS1,$OPT"
-		fi
-	else
-		# option not specified in fstab -- choose the default.
-		echo "$PARSEDOPTS1,$DEF_OPT"
-	fi
-  }
-
-  # Find out the state of a numbered option (e.g. "commit=NNN") in
-  # fstab for a given filesystem, and use this state to replace the
-  # value of the option in another mount options string. The device
-  # is the first argument, and the option name the second. The
-  # remainder is the mount options string in which the replacement
-  # must be done.
-  #
-  # Example:
-  # parse_mount_opts_wfstab /dev/hda1 commit defaults,commit=7
-  #
-  # If fstab contains, say, "commit=3,rw" for this filesystem, then the
-  # result will be "rw,commit=3".
-  parse_mount_opts_wfstab () {
-	L_DEV="$1"
-	OPT="$2"
-	shift 2
-	L_OPTS="$*"
-	PARSEDOPTS1="$(parse_mount_opts $OPT $L_OPTS)"
-	# Watch for a default commit in fstab
-	FSTAB_OPTS="$(awk '$1 == "'$L_DEV'" { print $4 }' /etc/fstab)"
-	if echo "$FSTAB_OPTS" | grep "$OPT=" > /dev/null ; then
-		# option specified in fstab: extract the value, and use it
-		echo -n "$PARSEDOPTS1,$OPT="
-		echo ",$FSTAB_OPTS," | sed \
-		 -e 's/.*,'"$OPT"'=//'	\
-		 -e 's/,.*//'
-	else
-		# option not specified in fstab: set it to 0
-		echo "$PARSEDOPTS1,$OPT=0"
-	fi
-  }
-
-  deduce_fstype () {
-	MP="$1"
-	# My root filesystem unfortunately has
-	# type "unknown" in /etc/mtab. If we encounter
-	# "unknown", we try to get the type from fstab.
-	cat /etc/fstab |
-	grep -v '^#' |
-	while read FSTAB_DEV FSTAB_MP FSTAB_FST FSTAB_OPTS FSTAB_DUMP FSTAB_DUMP ; do
-		if [ "$FSTAB_MP" = "$MP" ]; then
-			echo $FSTAB_FST
-			exit 0
-		fi
-	done
-  }
-
-  if [ $DO_REMOUNT_NOATIME -eq 1 ] ; then
-	NOATIME_OPT=",noatime"
-  fi
-
-  case "$1" in
-	start)
-		AGE=$((100*$MAX_AGE))
-		XFS_AGE=$(($XFS_HZ*$MAX_AGE))
-		echo -n "Starting laptop_mode"
-
-		if [ -d /proc/sys/vm/pagebuf ] ; then
-			# (For 2.4 and early 2.6.)
-			# This only needs to be set, not reset -- it is only used when
-			# laptop mode is enabled.
-			echo $XFS_AGE > /proc/sys/vm/pagebuf/lm_flush_age
-			echo $XFS_AGE > /proc/sys/fs/xfs/lm_sync_interval
-		elif [ -f /proc/sys/fs/xfs/lm_age_buffer ] ; then
-			# (A couple of early 2.6 laptop mode patches had these.)
-			# The same goes for these.
-			echo $XFS_AGE > /proc/sys/fs/xfs/lm_age_buffer
-			echo $XFS_AGE > /proc/sys/fs/xfs/lm_sync_interval
-		elif [ -f /proc/sys/fs/xfs/age_buffer ] ; then
-			# (2.6.6)
-			# But not for these -- they are also used in normal
-			# operation.
-			echo $XFS_AGE > /proc/sys/fs/xfs/age_buffer
-			echo $XFS_AGE > /proc/sys/fs/xfs/sync_interval
-		elif [ -f /proc/sys/fs/xfs/age_buffer_centisecs ] ; then
-			# (2.6.7 upwards)
-			# And not for these either. These are in centisecs,
-			# not USER_HZ, so we have to use $AGE, not $XFS_AGE.
-			echo $AGE > /proc/sys/fs/xfs/age_buffer_centisecs
-			echo $AGE > /proc/sys/fs/xfs/xfssyncd_centisecs
-			echo 3000 > /proc/sys/fs/xfs/xfsbufd_centisecs
-		fi
-
-		case "$KLEVEL" in
-			"2.4")
-				echo 1					> /proc/sys/vm/laptop_mode
-				echo "30 500 0 0 $AGE $AGE 60 20 0"	> /proc/sys/vm/bdflush
-				;;
-			"2.6")
-				echo 5					> /proc/sys/vm/laptop_mode
-				echo "$AGE"				> /proc/sys/vm/dirty_writeback_centisecs
-				echo "$AGE"				> /proc/sys/vm/dirty_expire_centisecs
-				echo "$DIRTY_RATIO"			> /proc/sys/vm/dirty_ratio
-				echo "$DIRTY_BACKGROUND_RATIO"		> /proc/sys/vm/dirty_background_ratio
-				;;
-		esac
-		if [ $DO_REMOUNTS -eq 1 ]; then
-			cat /etc/mtab | while read DEV MP FST OPTS DUMP PASS ; do
-				PARSEDOPTS="$(parse_mount_opts "$OPTS")"
-				if [ "$FST" = 'unknown' ]; then
-					FST=$(deduce_fstype $MP)
-				fi
-				case "$FST" in
-					"ext3")
-						PARSEDOPTS="$(parse_mount_opts commit "$OPTS")"
-						mount $DEV -t $FST $MP -o remount,$PARSEDOPTS,commit=$MAX_AGE$NOATIME_OPT
-						;;
-					"xfs")
-						mount $DEV -t $FST $MP -o remount,$OPTS$NOATIME_OPT
-						;;
-				esac
-				if [ -b $DEV ] ; then
-					blockdev --setra $(($READAHEAD * 2)) $DEV
-				fi
-			done
-		fi
-		if [ $DO_HD -eq 1 ] ; then
-			for THISHD in $HD ; do
-				/sbin/hdparm -S $BATT_HD $THISHD > /dev/null 2>&1
-				/sbin/hdparm -B 1 $THISHD > /dev/null 2>&1
-			done
-		fi
-		if [ $DO_CPU -eq 1 -a -e /sys/devices/system/cpu/cpu0/cpufreq/cpuinfo_min_freq ]; then
-			if [ $CPU_MAXFREQ = 'slowest' ]; then
-				CPU_MAXFREQ=`cat /sys/devices/system/cpu/cpu0/cpufreq/cpuinfo_min_freq`
-			fi
-			echo $CPU_MAXFREQ > /sys/devices/system/cpu/cpu0/cpufreq/scaling_max_freq
-		fi
-		echo "."
-		;;
-	stop)
-		U_AGE=$((100*$DEF_UPDATE))
-		B_AGE=$((100*$DEF_AGE))
-		echo -n "Stopping laptop_mode"
-		echo 0 > /proc/sys/vm/laptop_mode
-		if [ -f /proc/sys/fs/xfs/age_buffer -a ! -f /proc/sys/fs/xfs/lm_age_buffer ] ; then
-			# These need to be restored, if there are no lm_*.
-			echo $(($XFS_HZ*$DEF_XFS_AGE_BUFFER))	 	> /proc/sys/fs/xfs/age_buffer
-			echo $(($XFS_HZ*$DEF_XFS_SYNC_INTERVAL)) 	> /proc/sys/fs/xfs/sync_interval
-		elif [ -f /proc/sys/fs/xfs/age_buffer_centisecs ] ; then
-			# These need to be restored as well.
-			echo $((100*$DEF_XFS_AGE_BUFFER))	> /proc/sys/fs/xfs/age_buffer_centisecs
-			echo $((100*$DEF_XFS_SYNC_INTERVAL))	> /proc/sys/fs/xfs/xfssyncd_centisecs
-			echo $((100*$DEF_XFS_BUFD_INTERVAL))	> /proc/sys/fs/xfs/xfsbufd_centisecs
-		fi
-		case "$KLEVEL" in
-			"2.4")
-				echo "30 500 0 0 $U_AGE $B_AGE 60 20 0"	> /proc/sys/vm/bdflush
-				;;
-			"2.6")
-				echo "$U_AGE"				> /proc/sys/vm/dirty_writeback_centisecs
-				echo "$B_AGE"				> /proc/sys/vm/dirty_expire_centisecs
-				echo "$DEF_DIRTY_RATIO"			> /proc/sys/vm/dirty_ratio
-				echo "$DEF_DIRTY_BACKGROUND_RATIO"	> /proc/sys/vm/dirty_background_ratio
-				;;
-		esac
-		if [ $DO_REMOUNTS -eq 1 ] ; then
-			cat /etc/mtab | while read DEV MP FST OPTS DUMP PASS ; do
-				# Reset commit and atime options to defaults.
-				if [ "$FST" = 'unknown' ]; then
-					FST=$(deduce_fstype $MP)
-				fi
-				case "$FST" in
-					"ext3")
-						PARSEDOPTS="$(parse_mount_opts_wfstab $DEV commit $OPTS)"
-						PARSEDOPTS="$(parse_yesno_opts_wfstab $DEV atime atime $PARSEDOPTS)"
-						mount $DEV -t $FST $MP -o remount,$PARSEDOPTS
-						;;
-					"xfs")
-						PARSEDOPTS="$(parse_yesno_opts_wfstab $DEV atime atime $OPTS)"
-						mount $DEV -t $FST $MP -o remount,$PARSEDOPTS
-						;;
-				esac
-				if [ -b $DEV ] ; then
-					blockdev --setra 256 $DEV
-				fi
-			done
-		fi
-		if [ $DO_HD -eq 1 ] ; then
-			for THISHD in $HD ; do
-				/sbin/hdparm -S $AC_HD $THISHD > /dev/null 2>&1
-				/sbin/hdparm -B 255 $THISHD > /dev/null 2>&1
-			done
-		fi
-		if [ $DO_CPU -eq 1 -a -e /sys/devices/system/cpu/cpu0/cpufreq/cpuinfo_min_freq ]; then
-			echo `cat /sys/devices/system/cpu/cpu0/cpufreq/cpuinfo_max_freq` > /sys/devices/system/cpu/cpu0/cpufreq/scaling_max_freq
-		fi
-		echo "."
-		;;
-	*)
-		echo "Usage: $0 {start|stop}" 2>&1
-		exit 1
-		;;
-
-  esac
-
-  exit 0
-
-
-ACPI integration
-----------------
-
-Dax Kelson submitted this so that the ACPI acpid daemon will
-kick off the laptop_mode script and run hdparm. The part that
-automatically disables laptop mode when the battery is low was
-written by Jan Topinski.
-
-/etc/acpi/events/ac_adapter::
-
-	event=ac_adapter
-	action=/etc/acpi/actions/ac.sh %e
-
-/etc/acpi/events/battery::
-
-	event=battery.*
-	action=/etc/acpi/actions/battery.sh %e
-
-/etc/acpi/actions/ac.sh::
-
-  #!/bin/bash
-
-  # ac on/offline event handler
-
-  status=`awk '/^state: / { print $2 }' /proc/acpi/ac_adapter/$2/state`
-
-  case $status in
-          "on-line")
-                  /sbin/laptop_mode stop
-                  exit 0
-          ;;
-          "off-line")
-                  /sbin/laptop_mode start
-                  exit 0
-          ;;
-  esac
-
-
-/etc/acpi/actions/battery.sh::
-
-  #! /bin/bash
-
-  # Automatically disable laptop mode when the battery almost runs out.
-
-  BATT_INFO=/proc/acpi/battery/$2/state
-
-  if [[ -f /proc/sys/vm/laptop_mode ]]
-  then
-     LM=`cat /proc/sys/vm/laptop_mode`
-     if [[ $LM -gt 0 ]]
-     then
-       if [[ -f $BATT_INFO ]]
-       then
-          # Source the config file only now that we know we need
-          if [ -f /etc/default/laptop-mode ] ; then
-                  # Debian
-                  . /etc/default/laptop-mode
-          elif [ -f /etc/sysconfig/laptop-mode ] ; then
-                  # Others
-                  . /etc/sysconfig/laptop-mode
-          fi
-          MINIMUM_BATTERY_MINUTES=${MINIMUM_BATTERY_MINUTES:-'10'}
-
-          ACTION="`cat $BATT_INFO | grep charging | cut -c 26-`"
-          if [[ ACTION -eq "discharging" ]]
-          then
-             PRESENT_RATE=`cat $BATT_INFO | grep "present rate:" | sed  "s/.* \([0-9][0-9]* \).*/\1/" `
-             REMAINING=`cat $BATT_INFO | grep "remaining capacity:" | sed  "s/.* \([0-9][0-9]* \).*/\1/" `
-          fi
-          if (($REMAINING * 60 / $PRESENT_RATE < $MINIMUM_BATTERY_MINUTES))
-          then
-             /sbin/laptop_mode stop
-          fi
-       else
-         logger -p daemon.warning "You are using laptop mode and your battery interface $BATT_INFO is missing. This may lead to loss of data when the battery runs out. Check kernel ACPI support and /proc/acpi/battery folder, and edit /etc/acpi/battery.sh to set BATT_INFO to the correct path."
-       fi
-     fi
-  fi
-
-
-Monitoring tool
----------------
-
-Bartek Kania submitted this, it can be used to measure how much time your disk
-spends spun up/down.  See tools/laptop/dslm/dslm.c
diff --git a/Documentation/admin-guide/sysctl/vm.rst b/Documentation/admin-guide/sysctl/vm.rst
index 245bf6394935..ca6ebeb5171c 100644
--- a/Documentation/admin-guide/sysctl/vm.rst
+++ b/Documentation/admin-guide/sysctl/vm.rst
@@ -41,7 +41,6 @@ Currently, these files are in /proc/sys/vm:
 - extfrag_threshold
 - highmem_is_dirtyable
 - hugetlb_shm_group
-- laptop_mode
 - legacy_va_layout
 - lowmem_reserve_ratio
 - max_map_count
@@ -363,13 +362,6 @@ hugetlb_shm_group contains group id that is allowed to create SysV
 shared memory segment using hugetlb page.
 
 
-laptop_mode
-===========
-
-laptop_mode is a knob that controls "laptop mode". All the things that are
-controlled by this knob are discussed in Documentation/admin-guide/laptops/laptop-mode.rst.
-
-
 legacy_va_layout
 ================
 
diff --git a/block/blk-mq.c b/block/blk-mq.c
index a29d8ac9d3e3..4bae7c4c664e 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -811,9 +811,6 @@ void blk_mq_free_request(struct request *rq)
 
 	blk_mq_finish_request(rq);
 
-	if (unlikely(laptop_mode && !blk_rq_is_passthrough(rq)))
-		laptop_io_completion(q->disk->bdi);
-
 	rq_qos_done(q, rq);
 
 	WRITE_ONCE(rq->state, MQ_RQ_IDLE);
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 0c466ccbed69..15eb463d5a9b 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -3305,8 +3305,7 @@ int ext4_alloc_da_blocks(struct inode *inode)
 	/*
 	 * We do something simple for now.  The filemap_flush() will
 	 * also start triggering a write of the data blocks, which is
-	 * not strictly speaking necessary (and for users of
-	 * laptop_mode, not even desirable).  However, to do otherwise
+	 * not strictly speaking necessary.  However, to do otherwise
 	 * would require replicating code paths in:
 	 *
 	 * ext4_writepages() ->
diff --git a/fs/sync.c b/fs/sync.c
index 431fc5f5be06..6330150792f6 100644
--- a/fs/sync.c
+++ b/fs/sync.c
@@ -104,8 +104,6 @@ void ksys_sync(void)
 	iterate_supers(sync_fs_one_sb, &wait);
 	sync_bdevs(false);
 	sync_bdevs(true);
-	if (unlikely(laptop_mode))
-		laptop_sync_completion();
 }
 
 SYSCALL_DEFINE0(sync)
diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
index bc71aa9dcee8..a2014fb1bc66 100644
--- a/fs/xfs/xfs_super.c
+++ b/fs/xfs/xfs_super.c
@@ -845,15 +845,6 @@ xfs_fs_sync_fs(
 	if (error)
 		return error;
 
-	if (laptop_mode) {
-		/*
-		 * The disk must be active because we're syncing.
-		 * We schedule log work now (now that the disk is
-		 * active) instead of later (when it might not be).
-		 */
-		flush_delayed_work(&mp->m_log->l_work);
-	}
-
 	/*
 	 * If we are called with page faults frozen out, it means we are about
 	 * to freeze the transaction subsystem. Take the opportunity to shut
diff --git a/include/linux/backing-dev-defs.h b/include/linux/backing-dev-defs.h
index 0217c1073735..c88fd4d37d1f 100644
--- a/include/linux/backing-dev-defs.h
+++ b/include/linux/backing-dev-defs.h
@@ -46,7 +46,6 @@ enum wb_reason {
 	WB_REASON_VMSCAN,
 	WB_REASON_SYNC,
 	WB_REASON_PERIODIC,
-	WB_REASON_LAPTOP_TIMER,
 	WB_REASON_FS_FREE_SPACE,
 	/*
 	 * There is no bdi forker thread any more and works are done
@@ -204,8 +203,6 @@ struct backing_dev_info {
 	char dev_name[64];
 	struct device *owner;
 
-	struct timer_list laptop_mode_wb_timer;
-
 #ifdef CONFIG_DEBUG_FS
 	struct dentry *debug_dir;
 #endif
diff --git a/include/linux/writeback.h b/include/linux/writeback.h
index f48e8ccffe81..e530112c4b3a 100644
--- a/include/linux/writeback.h
+++ b/include/linux/writeback.h
@@ -328,9 +328,6 @@ struct dirty_throttle_control {
 	bool			dirty_exceeded;
 };
 
-void laptop_io_completion(struct backing_dev_info *info);
-void laptop_sync_completion(void);
-void laptop_mode_timer_fn(struct timer_list *t);
 bool node_dirty_ok(struct pglist_data *pgdat);
 int wb_domain_init(struct wb_domain *dom, gfp_t gfp);
 #ifdef CONFIG_CGROUP_WRITEBACK
@@ -342,7 +339,6 @@ extern struct wb_domain global_wb_domain;
 /* These are exported to sysctl. */
 extern unsigned int dirty_writeback_interval;
 extern unsigned int dirty_expire_interval;
-extern int laptop_mode;
 
 void global_dirty_limits(unsigned long *pbackground, unsigned long *pdirty);
 unsigned long wb_calc_thresh(struct bdi_writeback *wb, unsigned long thresh);
diff --git a/include/trace/events/writeback.h b/include/trace/events/writeback.h
index 311a341e6fe4..b6f94e97788a 100644
--- a/include/trace/events/writeback.h
+++ b/include/trace/events/writeback.h
@@ -42,7 +42,6 @@
 	EM( WB_REASON_VMSCAN,			"vmscan")		\
 	EM( WB_REASON_SYNC,			"sync")			\
 	EM( WB_REASON_PERIODIC,			"periodic")		\
-	EM( WB_REASON_LAPTOP_TIMER,		"laptop_timer")		\
 	EM( WB_REASON_FS_FREE_SPACE,		"fs_free_space")	\
 	EM( WB_REASON_FORKER_THREAD,		"forker_thread")	\
 	EMe(WB_REASON_FOREIGN_FLUSH,		"foreign_flush")
diff --git a/include/uapi/linux/sysctl.h b/include/uapi/linux/sysctl.h
index 63d1464cb71c..6ea9ea8413fa 100644
--- a/include/uapi/linux/sysctl.h
+++ b/include/uapi/linux/sysctl.h
@@ -183,7 +183,7 @@ enum
 	VM_LOWMEM_RESERVE_RATIO=20,/* reservation ratio for lower memory zones */
 	VM_MIN_FREE_KBYTES=21,	/* Minimum free kilobytes to maintain */
 	VM_MAX_MAP_COUNT=22,	/* int: Maximum number of mmaps/address-space */
-	VM_LAPTOP_MODE=23,	/* vm laptop mode */
+
 	VM_BLOCK_DUMP=24,	/* block dump mode */
 	VM_HUGETLB_GROUP=25,	/* permitted hugetlb group */
 	VM_VFS_CACHE_PRESSURE=26, /* dcache/icache reclaim pressure */
diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index c5740c6d37a2..a0e26d1b717f 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -1034,7 +1034,6 @@ struct backing_dev_info *bdi_alloc(int node_id)
 	bdi->capabilities = BDI_CAP_WRITEBACK;
 	bdi->ra_pages = VM_READAHEAD_PAGES;
 	bdi->io_pages = VM_READAHEAD_PAGES;
-	timer_setup(&bdi->laptop_mode_wb_timer, laptop_mode_timer_fn, 0);
 	return bdi;
 }
 EXPORT_SYMBOL(bdi_alloc);
@@ -1156,8 +1155,6 @@ static void bdi_remove_from_list(struct backing_dev_info *bdi)
 
 void bdi_unregister(struct backing_dev_info *bdi)
 {
-	timer_delete_sync(&bdi->laptop_mode_wb_timer);
-
 	/* make sure nobody finds us on the bdi_list anymore */
 	bdi_remove_from_list(bdi);
 	wb_shutdown(&bdi->wb);
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index ccdeb0e84d39..601a5e048d12 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -109,14 +109,6 @@ EXPORT_SYMBOL_GPL(dirty_writeback_interval);
  */
 unsigned int dirty_expire_interval = 30 * 100; /* centiseconds */
 
-/*
- * Flag that puts the machine in "laptop mode". Doubles as a timeout in jiffies:
- * a full sync is triggered after this time elapses without any disk activity.
- */
-int laptop_mode;
-
-EXPORT_SYMBOL(laptop_mode);
-
 /* End of sysctl-exported parameters */
 
 struct wb_domain global_wb_domain;
@@ -1843,17 +1835,7 @@ static int balance_dirty_pages(struct bdi_writeback *wb,
 			balance_domain_limits(mdtc, strictlimit);
 		}
 
-		/*
-		 * In laptop mode, we wait until hitting the higher threshold
-		 * before starting background writeout, and then write out all
-		 * the way down to the lower threshold.  So slow writers cause
-		 * minimal disk activity.
-		 *
-		 * In normal mode, we start background writeout at the lower
-		 * background_thresh, to keep the amount of dirty memory low.
-		 */
-		if (!laptop_mode && nr_dirty > gdtc->bg_thresh &&
-		    !writeback_in_progress(wb))
+		if (nr_dirty > gdtc->bg_thresh && !writeback_in_progress(wb))
 			wb_start_background_writeback(wb);
 
 		/*
@@ -1876,10 +1858,6 @@ free_running:
 			break;
 		}
 
-		/* Start writeback even when in laptop mode */
-		if (unlikely(!writeback_in_progress(wb)))
-			wb_start_background_writeback(wb);
-
 		mem_cgroup_flush_foreign(wb);
 
 		/*
@@ -2198,41 +2176,6 @@ static int dirty_writeback_centisecs_handler(const struct ctl_table *table, int
 }
 #endif
 
-void laptop_mode_timer_fn(struct timer_list *t)
-{
-	struct backing_dev_info *backing_dev_info =
-		timer_container_of(backing_dev_info, t, laptop_mode_wb_timer);
-
-	wakeup_flusher_threads_bdi(backing_dev_info, WB_REASON_LAPTOP_TIMER);
-}
-
-/*
- * We've spun up the disk and we're in laptop mode: schedule writeback
- * of all dirty data a few seconds from now.  If the flush is already scheduled
- * then push it back - the user is still using the disk.
- */
-void laptop_io_completion(struct backing_dev_info *info)
-{
-	mod_timer(&info->laptop_mode_wb_timer, jiffies + laptop_mode);
-}
-
-/*
- * We're in laptop mode and we've just synced. The sync's writes will have
- * caused another writeback to be scheduled by laptop_io_completion.
- * Nothing needs to be written back anymore, so we unschedule the writeback.
- */
-void laptop_sync_completion(void)
-{
-	struct backing_dev_info *bdi;
-
-	rcu_read_lock();
-
-	list_for_each_entry_rcu(bdi, &bdi_list, bdi_list)
-		timer_delete(&bdi->laptop_mode_wb_timer);
-
-	rcu_read_unlock();
-}
-
 /*
  * If ratelimit_pages is too high then we can get into dirty-data overload
  * if a large number of processes all perform writes at the same time.
@@ -2263,6 +2206,19 @@ static int page_writeback_cpu_online(unsigned int cpu)
 
 #ifdef CONFIG_SYSCTL
 
+static int laptop_mode;
+static int laptop_mode_handler(const struct ctl_table *table, int write,
+			       void *buffer, size_t *lenp, loff_t *ppos)
+{
+	int ret = proc_dointvec_jiffies(table, write, buffer, lenp, ppos);
+
+	if (!ret && write)
+		pr_warn("%s: vm.laptop_mode is deprecated. Ignoring setting.\n",
+			current->comm);
+
+	return ret;
+}
+
 /* this is needed for the proc_doulongvec_minmax of vm_dirty_bytes */
 static const unsigned long dirty_bytes_min = 2 * PAGE_SIZE;
 
@@ -2332,7 +2288,7 @@ static const struct ctl_table vm_page_writeback_sysctls[] = {
 		.data		= &laptop_mode,
 		.maxlen		= sizeof(laptop_mode),
 		.mode		= 0644,
-		.proc_handler	= proc_dointvec_jiffies,
+		.proc_handler	= laptop_mode_handler,
 	},
 };
 #endif
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 1c87945fa761..fc5691afb998 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -104,13 +104,13 @@ struct scan_control {
 	unsigned int force_deactivate:1;
 	unsigned int skipped_deactivate:1;
 
-	/* Writepage batching in laptop mode; RECLAIM_WRITE */
+	/* zone_reclaim_mode, boost reclaim */
 	unsigned int may_writepage:1;
 
-	/* Can mapped folios be reclaimed? */
+	/* zone_reclaim_mode */
 	unsigned int may_unmap:1;
 
-	/* Can folios be swapped as part of reclaim? */
+	/* zone_reclaim_mode, boost reclaim, cgroup restrictions */
 	unsigned int may_swap:1;
 
 	/* Not allow cache_trim_mode to be turned on as part of reclaim? */
@@ -6365,13 +6365,6 @@ retry:
 
 		if (sc->compaction_ready)
 			break;
-
-		/*
-		 * If we're getting trouble reclaiming, start doing
-		 * writepage even in laptop mode.
-		 */
-		if (sc->priority < DEF_PRIORITY - 2)
-			sc->may_writepage = 1;
 	} while (--sc->priority >= 0);
 
 	last_pgdat = NULL;
@@ -6580,7 +6573,7 @@ unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
 		.order = order,
 		.nodemask = nodemask,
 		.priority = DEF_PRIORITY,
-		.may_writepage = !laptop_mode,
+		.may_writepage = 1,
 		.may_unmap = 1,
 		.may_swap = 1,
 	};
@@ -6624,7 +6617,7 @@ unsigned long mem_cgroup_shrink_node(struct mem_cgroup *memcg,
 	struct scan_control sc = {
 		.nr_to_reclaim = SWAP_CLUSTER_MAX,
 		.target_mem_cgroup = memcg,
-		.may_writepage = !laptop_mode,
+		.may_writepage = 1,
 		.may_unmap = 1,
 		.reclaim_idx = MAX_NR_ZONES - 1,
 		.may_swap = !noswap,
@@ -6670,7 +6663,7 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg,
 		.reclaim_idx = MAX_NR_ZONES - 1,
 		.target_mem_cgroup = memcg,
 		.priority = DEF_PRIORITY,
-		.may_writepage = !laptop_mode,
+		.may_writepage = 1,
 		.may_unmap = 1,
 		.may_swap = !!(reclaim_options & MEMCG_RECLAIM_MAY_SWAP),
 		.proactive = !!(reclaim_options & MEMCG_RECLAIM_PROACTIVE),
@@ -7051,7 +7044,7 @@ restart:
 		 * from reclaim context. If no pages are reclaimed, the
 		 * reclaim will be aborted.
 		 */
-		sc.may_writepage = !laptop_mode && !nr_boost_reclaim;
+		sc.may_writepage = !nr_boost_reclaim;
 		sc.may_swap = !nr_boost_reclaim;
 
 		/*
@@ -7061,13 +7054,6 @@ restart:
 		 */
 		kswapd_age_node(pgdat, &sc);
 
-		/*
-		 * If we're getting trouble reclaiming, start doing writepage
-		 * even in laptop mode.
-		 */
-		if (sc.priority < DEF_PRIORITY - 2)
-			sc.may_writepage = 1;
-
 		/* Call soft limit reclaim before calling shrink_node. */
 		sc.nr_scanned = 0;
 		nr_soft_scanned = 0;
@@ -7799,7 +7785,7 @@ int user_proactive_reclaim(char *buf,
 				.reclaim_idx = gfp_zone(gfp_mask),
 				.proactive_swappiness = swappiness == -1 ? NULL : &swappiness,
 				.priority = DEF_PRIORITY,
-				.may_writepage = !laptop_mode,
+				.may_writepage = 1,
 				.nr_to_reclaim = max(batch_size, SWAP_CLUSTER_MAX),
 				.may_unmap = 1,
 				.may_swap = 1,
-- 
cgit v1.2.3


From fa05705107a40131a8335ad37817153709261738 Mon Sep 17 00:00:00 2001
From: Detlev Casanova <detlev.casanova@collabora.com>
Date: Fri, 9 Jan 2026 11:15:18 -0500
Subject: media: v4l2-ctrls: Add hevc_ext_sps_[ls]t_rps controls

The vdpu381 decoder found on newer Rockchip SoCs needs the information
about the long term and short term ref pic sets from the SPS.

So far, it wasn't included in the v4l2 API, so add it with new dynamic
sized controls.

Each element of the hevc_ext_sps_lt_rps array contains the long term ref
pic set at that index.
Each element of the hevc_ext_sps_st_rps contains the short term ref pic
set at that index, as the raw data.
It is the role of the drivers to calculate the reference sets values.

Reviewed-by: Nicolas Dufresne <nicolas.dufresne@collabora.com>
Signed-off-by: Detlev Casanova <detlev.casanova@collabora.com>
Signed-off-by: Nicolas Dufresne <nicolas.dufresne@collabora.com>
Signed-off-by: Hans Verkuil <hverkuil+cisco@kernel.org>
---
 drivers/media/v4l2-core/v4l2-ctrls-core.c | 28 ++++++++++++++
 drivers/media/v4l2-core/v4l2-ctrls-defs.c | 10 +++++
 include/uapi/linux/v4l2-controls.h        | 61 +++++++++++++++++++++++++++++++
 include/uapi/linux/videodev2.h            |  2 +
 4 files changed, 101 insertions(+)

(limited to 'include/uapi/linux')

diff --git a/drivers/media/v4l2-core/v4l2-ctrls-core.c b/drivers/media/v4l2-core/v4l2-ctrls-core.c
index 1d4b5859f0e2..79a157975f70 100644
--- a/drivers/media/v4l2-core/v4l2-ctrls-core.c
+++ b/drivers/media/v4l2-core/v4l2-ctrls-core.c
@@ -424,6 +424,12 @@ void v4l2_ctrl_type_op_log(const struct v4l2_ctrl *ctrl)
 	case V4L2_CTRL_TYPE_HEVC_SLICE_PARAMS:
 		pr_cont("HEVC_SLICE_PARAMS");
 		break;
+	case V4L2_CTRL_TYPE_HEVC_EXT_SPS_ST_RPS:
+		pr_cont("HEVC_EXT_SPS_ST_RPS");
+		break;
+	case V4L2_CTRL_TYPE_HEVC_EXT_SPS_LT_RPS:
+		pr_cont("HEVC_EXT_SPS_LT_RPS");
+		break;
 	case V4L2_CTRL_TYPE_HEVC_SCALING_MATRIX:
 		pr_cont("HEVC_SCALING_MATRIX");
 		break;
@@ -961,6 +967,8 @@ static int std_validate_compound(const struct v4l2_ctrl *ctrl, u32 idx,
 	struct v4l2_ctrl_h264_pred_weights *p_h264_pred_weights;
 	struct v4l2_ctrl_h264_slice_params *p_h264_slice_params;
 	struct v4l2_ctrl_h264_decode_params *p_h264_dec_params;
+	struct v4l2_ctrl_hevc_ext_sps_lt_rps *p_hevc_lt_rps;
+	struct v4l2_ctrl_hevc_ext_sps_st_rps *p_hevc_st_rps;
 	struct v4l2_ctrl_hevc_sps *p_hevc_sps;
 	struct v4l2_ctrl_hevc_pps *p_hevc_pps;
 	struct v4l2_ctrl_hdr10_mastering_display *p_hdr10_mastering;
@@ -1254,6 +1262,20 @@ static int std_validate_compound(const struct v4l2_ctrl *ctrl, u32 idx,
 	case V4L2_CTRL_TYPE_HEVC_SLICE_PARAMS:
 		break;
 
+	case V4L2_CTRL_TYPE_HEVC_EXT_SPS_ST_RPS:
+		p_hevc_st_rps = p;
+
+		if (p_hevc_st_rps->flags & ~V4L2_HEVC_EXT_SPS_ST_RPS_FLAG_INTER_REF_PIC_SET_PRED)
+			return -EINVAL;
+		break;
+
+	case V4L2_CTRL_TYPE_HEVC_EXT_SPS_LT_RPS:
+		p_hevc_lt_rps = p;
+
+		if (p_hevc_lt_rps->flags & ~V4L2_HEVC_EXT_SPS_LT_RPS_FLAG_USED_LT)
+			return -EINVAL;
+		break;
+
 	case V4L2_CTRL_TYPE_HDR10_CLL_INFO:
 		break;
 
@@ -2006,6 +2028,12 @@ static struct v4l2_ctrl *v4l2_ctrl_new(struct v4l2_ctrl_handler *hdl,
 	case V4L2_CTRL_TYPE_HEVC_SLICE_PARAMS:
 		elem_size = sizeof(struct v4l2_ctrl_hevc_slice_params);
 		break;
+	case V4L2_CTRL_TYPE_HEVC_EXT_SPS_ST_RPS:
+		elem_size = sizeof(struct v4l2_ctrl_hevc_ext_sps_st_rps);
+		break;
+	case V4L2_CTRL_TYPE_HEVC_EXT_SPS_LT_RPS:
+		elem_size = sizeof(struct v4l2_ctrl_hevc_ext_sps_lt_rps);
+		break;
 	case V4L2_CTRL_TYPE_HEVC_SCALING_MATRIX:
 		elem_size = sizeof(struct v4l2_ctrl_hevc_scaling_matrix);
 		break;
diff --git a/drivers/media/v4l2-core/v4l2-ctrls-defs.c b/drivers/media/v4l2-core/v4l2-ctrls-defs.c
index 765aeeec84fe..551426c4cd01 100644
--- a/drivers/media/v4l2-core/v4l2-ctrls-defs.c
+++ b/drivers/media/v4l2-core/v4l2-ctrls-defs.c
@@ -1235,6 +1235,8 @@ const char *v4l2_ctrl_get_name(u32 id)
 	case V4L2_CID_STATELESS_HEVC_DECODE_MODE:		return "HEVC Decode Mode";
 	case V4L2_CID_STATELESS_HEVC_START_CODE:		return "HEVC Start Code";
 	case V4L2_CID_STATELESS_HEVC_ENTRY_POINT_OFFSETS:	return "HEVC Entry Point Offsets";
+	case V4L2_CID_STATELESS_HEVC_EXT_SPS_ST_RPS:		return "HEVC Short Term Ref Sets";
+	case V4L2_CID_STATELESS_HEVC_EXT_SPS_LT_RPS:		return "HEVC Long Term Ref Sets";
 	case V4L2_CID_STATELESS_AV1_SEQUENCE:			return "AV1 Sequence Parameters";
 	case V4L2_CID_STATELESS_AV1_TILE_GROUP_ENTRY:		return "AV1 Tile Group Entry";
 	case V4L2_CID_STATELESS_AV1_FRAME:			return "AV1 Frame Parameters";
@@ -1581,6 +1583,14 @@ void v4l2_ctrl_fill(u32 id, const char **name, enum v4l2_ctrl_type *type,
 		*type = V4L2_CTRL_TYPE_U32;
 		*flags |= V4L2_CTRL_FLAG_DYNAMIC_ARRAY;
 		break;
+	case V4L2_CID_STATELESS_HEVC_EXT_SPS_ST_RPS:
+		*type = V4L2_CTRL_TYPE_HEVC_EXT_SPS_ST_RPS;
+		*flags |= V4L2_CTRL_FLAG_DYNAMIC_ARRAY;
+		break;
+	case V4L2_CID_STATELESS_HEVC_EXT_SPS_LT_RPS:
+		*type = V4L2_CTRL_TYPE_HEVC_EXT_SPS_LT_RPS;
+		*flags |= V4L2_CTRL_FLAG_DYNAMIC_ARRAY;
+		break;
 	case V4L2_CID_STATELESS_VP9_COMPRESSED_HDR:
 		*type = V4L2_CTRL_TYPE_VP9_COMPRESSED_HDR;
 		break;
diff --git a/include/uapi/linux/v4l2-controls.h b/include/uapi/linux/v4l2-controls.h
index 572622e4535e..68dd0c4e47b2 100644
--- a/include/uapi/linux/v4l2-controls.h
+++ b/include/uapi/linux/v4l2-controls.h
@@ -2101,6 +2101,8 @@ struct v4l2_ctrl_mpeg2_quantisation {
 #define V4L2_CID_STATELESS_HEVC_DECODE_MODE	(V4L2_CID_CODEC_STATELESS_BASE + 405)
 #define V4L2_CID_STATELESS_HEVC_START_CODE	(V4L2_CID_CODEC_STATELESS_BASE + 406)
 #define V4L2_CID_STATELESS_HEVC_ENTRY_POINT_OFFSETS (V4L2_CID_CODEC_STATELESS_BASE + 407)
+#define V4L2_CID_STATELESS_HEVC_EXT_SPS_ST_RPS  (V4L2_CID_CODEC_STATELESS_BASE + 408)
+#define V4L2_CID_STATELESS_HEVC_EXT_SPS_LT_RPS  (V4L2_CID_CODEC_STATELESS_BASE + 409)
 
 enum v4l2_stateless_hevc_decode_mode {
 	V4L2_STATELESS_HEVC_DECODE_MODE_SLICE_BASED,
@@ -2556,6 +2558,65 @@ struct v4l2_ctrl_hevc_scaling_matrix {
 	__u8	scaling_list_dc_coef_32x32[2];
 };
 
+#define V4L2_HEVC_EXT_SPS_ST_RPS_FLAG_INTER_REF_PIC_SET_PRED	0x1
+
+/*
+ * struct v4l2_ctrl_hevc_ext_sps_st_rps - HEVC short term RPS parameters
+ *
+ * Dynamic size 1-dimension array for short term RPS. The number of elements
+ * is v4l2_ctrl_hevc_sps::num_short_term_ref_pic_sets. It can contain up to 65 elements.
+ *
+ * @delta_idx_minus1: Specifies the delta compared to the index. See details in section 7.4.8
+ *                    "Short-term reference picture set semantics" of the specification.
+ * @delta_rps_sign: Sign of the delta as specified in section 7.4.8 "Short-term reference picture
+ *                  set semantics" of the specification.
+ * @abs_delta_rps_minus1: Absolute delta RPS as specified in section 7.4.8 "Short-term reference
+ *                        picture set semantics" of the specification.
+ * @num_negative_pics: Number of short-term RPS entries that have picture order count values less
+ *                     than the picture order count value of the current picture.
+ * @num_positive_pics: Number of short-term RPS entries that have picture order count values
+ *                     greater than the picture order count value of the current picture.
+ * @used_by_curr_pic: Bit j specifies if short-term RPS j is used by the current picture.
+ * @use_delta_flag: Bit j equal to 1 specifies that the j-th entry in the source candidate
+ *                  short-term RPS is included in this candidate short-term RPS.
+ * @delta_poc_s0_minus1: Specifies the negative picture order count delta for the i-th entry in
+ *                       the short-term RPS. See details in section 7.4.8 "Short-term reference
+ *                       picture set semantics" of the specification.
+ * @delta_poc_s1_minus1: Specifies the positive picture order count delta for the i-th entry in
+ *                       the short-term RPS. See details in section 7.4.8 "Short-term reference
+ *                       picture set semantics" of the specification.
+ * @flags: See V4L2_HEVC_EXT_SPS_ST_RPS_FLAG_{}
+ */
+struct v4l2_ctrl_hevc_ext_sps_st_rps {
+	__u8	delta_idx_minus1;
+	__u8	delta_rps_sign;
+	__u8	num_negative_pics;
+	__u8	num_positive_pics;
+	__u32	used_by_curr_pic;
+	__u32	use_delta_flag;
+	__u16	abs_delta_rps_minus1;
+	__u16	delta_poc_s0_minus1[16];
+	__u16	delta_poc_s1_minus1[16];
+	__u16	flags;
+};
+
+#define V4L2_HEVC_EXT_SPS_LT_RPS_FLAG_USED_LT		0x1
+
+/*
+ * struct v4l2_ctrl_hevc_ext_sps_lt_rps - HEVC long term RPS parameters
+ *
+ * Dynamic size 1-dimension array for long term RPS. The number of elements
+ * is v4l2_ctrl_hevc_sps::num_long_term_ref_pics_sps. It can contain up to 65 elements.
+ *
+ * @lt_ref_pic_poc_lsb_sps: picture order count modulo MaxPicOrderCntLsb of the i-th candidate
+ *                          long-term reference picture.
+ * @flags: See V4L2_HEVC_EXT_SPS_LT_RPS_FLAG_{}
+ */
+struct v4l2_ctrl_hevc_ext_sps_lt_rps {
+	__u16	lt_ref_pic_poc_lsb_sps;
+	__u16	flags;
+};
+
 /* Stateless VP9 controls */
 
 #define V4L2_VP9_LOOP_FILTER_FLAG_DELTA_ENABLED	0x1
diff --git a/include/uapi/linux/videodev2.h b/include/uapi/linux/videodev2.h
index 848e86617d5c..eda4492e40dc 100644
--- a/include/uapi/linux/videodev2.h
+++ b/include/uapi/linux/videodev2.h
@@ -1986,6 +1986,8 @@ enum v4l2_ctrl_type {
 	V4L2_CTRL_TYPE_HEVC_SLICE_PARAMS	= 0x0272,
 	V4L2_CTRL_TYPE_HEVC_SCALING_MATRIX	= 0x0273,
 	V4L2_CTRL_TYPE_HEVC_DECODE_PARAMS	= 0x0274,
+	V4L2_CTRL_TYPE_HEVC_EXT_SPS_ST_RPS	= 0x0275,
+	V4L2_CTRL_TYPE_HEVC_EXT_SPS_LT_RPS	= 0x0276,
 
 	V4L2_CTRL_TYPE_AV1_SEQUENCE	    = 0x280,
 	V4L2_CTRL_TYPE_AV1_TILE_GROUP_ENTRY = 0x281,
-- 
cgit v1.2.3


From f5f2bad67a45cd1ef6f5b727da104694a81b3666 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Wed, 21 Jan 2026 08:31:49 +0100
Subject: block: make the new blkzoned UAPI constants discoverable

The Linux 6.19 merge window added the new BLKREPORTZONESV2 ioctl, and
with it the new BLK_ZONE_REP_CACHED and BLK_ZONE_COND_ACTIVE constants.

The two constants are defined as part of enums, which makes it very
painful for userspace to discover if they are present in the installed
system headers.

Use the #define to the same name trick to make them trivially
discoverable using CPP directives.

Fixes: 0bf0e2e46668 ("block: track zone conditions")
Fixes: b30ffcdc0c15 ("block: introduce BLKREPORTZONESV2 ioctl")
Reported-by: Andrey Albershteyn <aalbersh@redhat.com>
Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/uapi/linux/blkzoned.h | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/blkzoned.h b/include/uapi/linux/blkzoned.h
index e33f02703350..663836120966 100644
--- a/include/uapi/linux/blkzoned.h
+++ b/include/uapi/linux/blkzoned.h
@@ -81,7 +81,8 @@ enum blk_zone_cond {
 	BLK_ZONE_COND_FULL	= 0xE,
 	BLK_ZONE_COND_OFFLINE	= 0xF,
 
-	BLK_ZONE_COND_ACTIVE	= 0xFF,
+	BLK_ZONE_COND_ACTIVE	= 0xFF, /* added in Linux 6.19 */
+#define BLK_ZONE_COND_ACTIVE	BLK_ZONE_COND_ACTIVE
 };
 
 /**
@@ -100,7 +101,8 @@ enum blk_zone_report_flags {
 	BLK_ZONE_REP_CAPACITY	= (1U << 0),
 
 	/* Input flags */
-	BLK_ZONE_REP_CACHED	= (1U << 31),
+	BLK_ZONE_REP_CACHED	= (1U << 31), /* added in Linux 6.19 */
+#define BLK_ZONE_REP_CACHED	BLK_ZONE_REP_CACHED
 };
 
 /**
-- 
cgit v1.2.3


From d7a5da7a0f7fa7ff081140c4f6f971db98882703 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Mon, 15 Dec 2025 17:52:04 +0100
Subject: rseq: Add fields and constants for time slice extension

Aside from a Kconfig knob, add the following items:

   - Two flag bits for the rseq user space ABI, which allow user space to
     query the availability and enablement without a syscall.

   - A new member to the user space ABI struct rseq, which is going to be
     used to communicate request and grant between kernel and user space.

   - A rseq state struct to hold the kernel state of this mechanism

   - Documentation of the new mechanism

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://patch.msgid.link/20251215155708.669472597@linutronix.de
---
 Documentation/userspace-api/index.rst |   1 +
 Documentation/userspace-api/rseq.rst  | 135 ++++++++++++++++++++++++++++++++++
 include/linux/rseq_types.h            |  28 ++++++-
 include/uapi/linux/rseq.h             |  38 ++++++++++
 init/Kconfig                          |  12 +++
 kernel/rseq.c                         |   7 ++
 6 files changed, 220 insertions(+), 1 deletion(-)
 create mode 100644 Documentation/userspace-api/rseq.rst

(limited to 'include/uapi/linux')

diff --git a/Documentation/userspace-api/index.rst b/Documentation/userspace-api/index.rst
index 8a61ac4c1bf1..fa0fe8ada68e 100644
--- a/Documentation/userspace-api/index.rst
+++ b/Documentation/userspace-api/index.rst
@@ -21,6 +21,7 @@ System calls
    ebpf/index
    ioctl/index
    mseal
+   rseq
 
 Security-related interfaces
 ===========================
diff --git a/Documentation/userspace-api/rseq.rst b/Documentation/userspace-api/rseq.rst
new file mode 100644
index 000000000000..e1fdb0d5ce69
--- /dev/null
+++ b/Documentation/userspace-api/rseq.rst
@@ -0,0 +1,135 @@
+=====================
+Restartable Sequences
+=====================
+
+Restartable Sequences allow registering a per-thread userspace memory area
+to be used as an ABI between kernel and userspace for three purposes:
+
+ * userspace restartable sequences
+
+ * quick access to read the current CPU number, node ID from userspace
+
+ * scheduler time slice extensions
+
+Restartable sequences (per-cpu atomics)
+---------------------------------------
+
+Restartable sequences allow userspace to perform update operations on
+per-cpu data without requiring heavyweight atomic operations. The actual
+ABI is unfortunately only available in the code and selftests.
+
+Quick access to CPU number, node ID
+-----------------------------------
+
+Allows implementing per-CPU data efficiently. Documentation is in code and
+selftests. :(
+
+Scheduler time slice extensions
+-------------------------------
+
+This allows a thread to request a time slice extension when it enters a
+critical section to avoid contention on a resource when the thread is
+scheduled out inside of the critical section.
+
+The prerequisites for this functionality are:
+
+    * Enabled in Kconfig
+
+    * Enabled at boot time (default is enabled)
+
+    * A rseq userspace pointer has been registered for the thread
+
+The thread has to enable the functionality via prctl(2)::
+
+    prctl(PR_RSEQ_SLICE_EXTENSION, PR_RSEQ_SLICE_EXTENSION_SET,
+          PR_RSEQ_SLICE_EXT_ENABLE, 0, 0);
+
+prctl() returns 0 on success or otherwise with the following error codes:
+
+========= ==============================================================
+Errorcode Meaning
+========= ==============================================================
+EINVAL	  Functionality not available or invalid function arguments.
+          Note: arg4 and arg5 must be zero
+ENOTSUPP  Functionality was disabled on the kernel command line
+ENXIO	  Available, but no rseq user struct registered
+========= ==============================================================
+
+The state can also be queried via prctl(2)::
+
+  prctl(PR_RSEQ_SLICE_EXTENSION, PR_RSEQ_SLICE_EXTENSION_GET, 0, 0, 0);
+
+prctl() returns ``PR_RSEQ_SLICE_EXT_ENABLE`` when it is enabled or 0 if
+disabled. Otherwise it returns with the following error codes:
+
+========= ==============================================================
+Errorcode Meaning
+========= ==============================================================
+EINVAL	  Functionality not available or invalid function arguments.
+          Note: arg3 and arg4 and arg5 must be zero
+========= ==============================================================
+
+The availability and status are also exposed via the rseq ABI struct flags
+field via the ``RSEQ_CS_FLAG_SLICE_EXT_AVAILABLE_BIT`` and the
+``RSEQ_CS_FLAG_SLICE_EXT_ENABLED_BIT``. These bits are read-only for user
+space and only for informational purposes.
+
+If the mechanism was enabled via prctl(), the thread can request a time
+slice extension by setting rseq::slice_ctrl::request to 1. If the thread is
+interrupted and the interrupt results in a reschedule request in the
+kernel, then the kernel can grant a time slice extension and return to
+userspace instead of scheduling out. The length of the extension is
+determined by the ``rseq_slice_extension_nsec`` sysctl.
+
+The kernel indicates the grant by clearing rseq::slice_ctrl::request and
+setting rseq::slice_ctrl::granted to 1. If there is a reschedule of the
+thread after granting the extension, the kernel clears the granted bit to
+indicate that to userspace.
+
+If the request bit is still set when leaving the critical section,
+userspace can clear it and continue.
+
+If the granted bit is set, then userspace invokes rseq_slice_yield(2) when
+leaving the critical section to relinquish the CPU. The kernel enforces
+this by arming a timer to prevent misbehaving userspace from abusing this
+mechanism.
+
+If both the request bit and the granted bit are false when leaving the
+critical section, then this indicates that a grant was revoked and no
+further action is required by userspace.
+
+The required code flow is as follows::
+
+    rseq->slice_ctrl.request = 1;
+    barrier();  // Prevent compiler reordering
+    critical_section();
+    barrier();  // Prevent compiler reordering
+    rseq->slice_ctrl.request = 0;
+    if (rseq->slice_ctrl.granted)
+        rseq_slice_yield();
+
+As all of this is strictly CPU local, there are no atomicity requirements.
+Checking the granted state is racy, but that cannot be avoided at all::
+
+    if (rseq->slice_ctrl.granted)
+      -> Interrupt results in schedule and grant revocation
+        rseq_slice_yield();
+
+So there is no point in pretending that this might be solved by an atomic
+operation.
+
+If the thread issues a syscall other than rseq_slice_yield(2) within the
+granted timeslice extension, the grant is also revoked and the CPU is
+relinquished immediately when entering the kernel. This is required as
+syscalls might consume arbitrary CPU time until they reach a scheduling
+point when the preemption model is either NONE or VOLUNTARY and therefore
+might exceed the grant by far.
+
+The preferred solution for user space is to use rseq_slice_yield(2) which
+is side effect free. The support for arbitrary syscalls is required to
+support onion layer architectured applications, where the code handling the
+critical section and requesting the time slice extension has no control
+over the code within the critical section.
+
+The kernel enforces flag consistency and terminates the thread with SIGSEGV
+if it detects a violation.
diff --git a/include/linux/rseq_types.h b/include/linux/rseq_types.h
index 332dc14b81c9..67e40c059b1b 100644
--- a/include/linux/rseq_types.h
+++ b/include/linux/rseq_types.h
@@ -72,13 +72,36 @@ struct rseq_ids {
 	};
 };
 
+/**
+ * union rseq_slice_state - Status information for rseq time slice extension
+ * @state:	Compound to access the overall state
+ * @enabled:	Time slice extension is enabled for the task
+ * @granted:	Time slice extension was granted to the task
+ */
+union rseq_slice_state {
+	u16			state;
+	struct {
+		u8		enabled;
+		u8		granted;
+	};
+};
+
+/**
+ * struct rseq_slice - Status information for rseq time slice extension
+ * @state:	Time slice extension state
+ */
+struct rseq_slice {
+	union rseq_slice_state	state;
+};
+
 /**
  * struct rseq_data - Storage for all rseq related data
  * @usrptr:	Pointer to the registered user space RSEQ memory
  * @len:	Length of the RSEQ region
- * @sig:	Signature of critial section abort IPs
+ * @sig:	Signature of critical section abort IPs
  * @event:	Storage for event management
  * @ids:	Storage for cached CPU ID and MM CID
+ * @slice:	Storage for time slice extension data
  */
 struct rseq_data {
 	struct rseq __user		*usrptr;
@@ -86,6 +109,9 @@ struct rseq_data {
 	u32				sig;
 	struct rseq_event		event;
 	struct rseq_ids			ids;
+#ifdef CONFIG_RSEQ_SLICE_EXTENSION
+	struct rseq_slice		slice;
+#endif
 };
 
 #else /* CONFIG_RSEQ */
diff --git a/include/uapi/linux/rseq.h b/include/uapi/linux/rseq.h
index 1b76d508400c..6afc219d1545 100644
--- a/include/uapi/linux/rseq.h
+++ b/include/uapi/linux/rseq.h
@@ -23,9 +23,15 @@ enum rseq_flags {
 };
 
 enum rseq_cs_flags_bit {
+	/* Historical and unsupported bits */
 	RSEQ_CS_FLAG_NO_RESTART_ON_PREEMPT_BIT	= 0,
 	RSEQ_CS_FLAG_NO_RESTART_ON_SIGNAL_BIT	= 1,
 	RSEQ_CS_FLAG_NO_RESTART_ON_MIGRATE_BIT	= 2,
+	/* (3) Intentional gap to put new bits into a separate byte */
+
+	/* User read only feature flags */
+	RSEQ_CS_FLAG_SLICE_EXT_AVAILABLE_BIT	= 4,
+	RSEQ_CS_FLAG_SLICE_EXT_ENABLED_BIT	= 5,
 };
 
 enum rseq_cs_flags {
@@ -35,6 +41,11 @@ enum rseq_cs_flags {
 		(1U << RSEQ_CS_FLAG_NO_RESTART_ON_SIGNAL_BIT),
 	RSEQ_CS_FLAG_NO_RESTART_ON_MIGRATE	=
 		(1U << RSEQ_CS_FLAG_NO_RESTART_ON_MIGRATE_BIT),
+
+	RSEQ_CS_FLAG_SLICE_EXT_AVAILABLE	=
+		(1U << RSEQ_CS_FLAG_SLICE_EXT_AVAILABLE_BIT),
+	RSEQ_CS_FLAG_SLICE_EXT_ENABLED		=
+		(1U << RSEQ_CS_FLAG_SLICE_EXT_ENABLED_BIT),
 };
 
 /*
@@ -53,6 +64,27 @@ struct rseq_cs {
 	__u64 abort_ip;
 } __attribute__((aligned(4 * sizeof(__u64))));
 
+/**
+ * struct rseq_slice_ctrl - Time slice extension control structure
+ * @all:	Compound value
+ * @request:	Request for a time slice extension
+ * @granted:	Granted time slice extension
+ *
+ * @request is set by user space and can be cleared by user space or kernel
+ * space.  @granted is set and cleared by the kernel and must only be read
+ * by user space.
+ */
+struct rseq_slice_ctrl {
+	union {
+		__u32		all;
+		struct {
+			__u8	request;
+			__u8	granted;
+			__u16	__reserved;
+		};
+	};
+};
+
 /*
  * struct rseq is aligned on 4 * 8 bytes to ensure it is always
  * contained within a single cache-line.
@@ -141,6 +173,12 @@ struct rseq {
 	 */
 	__u32 mm_cid;
 
+	/*
+	 * Time slice extension control structure. CPU local updates from
+	 * kernel and user space.
+	 */
+	struct rseq_slice_ctrl slice_ctrl;
+
 	/*
 	 * Flexible array member at end of structure, after last feature field.
 	 */
diff --git a/init/Kconfig b/init/Kconfig
index fa79feb8fe57..00c6fbb66a5a 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -1938,6 +1938,18 @@ config RSEQ
 
 	  If unsure, say Y.
 
+config RSEQ_SLICE_EXTENSION
+	bool "Enable rseq-based time slice extension mechanism"
+	depends on RSEQ && HIGH_RES_TIMERS && GENERIC_ENTRY && HAVE_GENERIC_TIF_BITS
+	help
+	  Allows userspace to request a limited time slice extension when
+	  returning from an interrupt to user space via the RSEQ shared
+	  data ABI. If granted, that allows to complete a critical section,
+	  so that other threads are not stuck on a conflicted resource,
+	  while the task is scheduled out.
+
+	  If unsure, say N.
+
 config RSEQ_STATS
 	default n
 	bool "Enable lightweight statistics of restartable sequences" if EXPERT
diff --git a/kernel/rseq.c b/kernel/rseq.c
index 395d8b002350..07c324d5a201 100644
--- a/kernel/rseq.c
+++ b/kernel/rseq.c
@@ -389,6 +389,8 @@ static bool rseq_reset_ids(void)
  */
 SYSCALL_DEFINE4(rseq, struct rseq __user *, rseq, u32, rseq_len, int, flags, u32, sig)
 {
+	u32 rseqfl = 0;
+
 	if (flags & RSEQ_FLAG_UNREGISTER) {
 		if (flags & ~RSEQ_FLAG_UNREGISTER)
 			return -EINVAL;
@@ -440,6 +442,9 @@ SYSCALL_DEFINE4(rseq, struct rseq __user *, rseq, u32, rseq_len, int, flags, u32
 	if (!access_ok(rseq, rseq_len))
 		return -EFAULT;
 
+	if (IS_ENABLED(CONFIG_RSEQ_SLICE_EXTENSION))
+		rseqfl |= RSEQ_CS_FLAG_SLICE_EXT_AVAILABLE;
+
 	scoped_user_write_access(rseq, efault) {
 		/*
 		 * If the rseq_cs pointer is non-NULL on registration, clear it to
@@ -449,11 +454,13 @@ SYSCALL_DEFINE4(rseq, struct rseq __user *, rseq, u32, rseq_len, int, flags, u32
 		 * clearing the fields. Don't bother reading it, just reset it.
 		 */
 		unsafe_put_user(0UL, &rseq->rseq_cs, efault);
+		unsafe_put_user(rseqfl, &rseq->flags, efault);
 		/* Initialize IDs in user space */
 		unsafe_put_user(RSEQ_CPU_ID_UNINITIALIZED, &rseq->cpu_id_start, efault);
 		unsafe_put_user(RSEQ_CPU_ID_UNINITIALIZED, &rseq->cpu_id, efault);
 		unsafe_put_user(0U, &rseq->node_id, efault);
 		unsafe_put_user(0U, &rseq->mm_cid, efault);
+		unsafe_put_user(0U, &rseq->slice_ctrl.all, efault);
 	}
 
 	/*
-- 
cgit v1.2.3


From 28621ec2d46c6adf7d33a6facbd83e2fa566bd34 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Mon, 15 Dec 2025 17:52:12 +0100
Subject: rseq: Add prctl() to enable time slice extensions

Implement a prctl() so that tasks can enable the time slice extension
mechanism. This fails, when time slice extensions are disabled at compile
time or on the kernel command line and when no rseq pointer is registered
in the kernel.

That allows to implement a single trivial check in the exit to user mode
hotpath, to decide whether the whole mechanism needs to be invoked.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://patch.msgid.link/20251215155708.858717691@linutronix.de
---
 include/linux/rseq.h       |  9 ++++++++
 include/uapi/linux/prctl.h | 10 +++++++++
 kernel/rseq.c              | 52 ++++++++++++++++++++++++++++++++++++++++++++++
 kernel/sys.c               |  6 ++++++
 4 files changed, 77 insertions(+)

(limited to 'include/uapi/linux')

diff --git a/include/linux/rseq.h b/include/linux/rseq.h
index 2266f4dc77b6..3c194a02ad0a 100644
--- a/include/linux/rseq.h
+++ b/include/linux/rseq.h
@@ -163,4 +163,13 @@ void rseq_syscall(struct pt_regs *regs);
 static inline void rseq_syscall(struct pt_regs *regs) { }
 #endif /* !CONFIG_DEBUG_RSEQ */
 
+#ifdef CONFIG_RSEQ_SLICE_EXTENSION
+int rseq_slice_extension_prctl(unsigned long arg2, unsigned long arg3);
+#else /* CONFIG_RSEQ_SLICE_EXTENSION */
+static inline int rseq_slice_extension_prctl(unsigned long arg2, unsigned long arg3)
+{
+	return -ENOTSUPP;
+}
+#endif /* !CONFIG_RSEQ_SLICE_EXTENSION */
+
 #endif /* _LINUX_RSEQ_H */
diff --git a/include/uapi/linux/prctl.h b/include/uapi/linux/prctl.h
index 51c4e8c82b1e..79944b7ae50a 100644
--- a/include/uapi/linux/prctl.h
+++ b/include/uapi/linux/prctl.h
@@ -386,4 +386,14 @@ struct prctl_mm_map {
 # define PR_FUTEX_HASH_SET_SLOTS	1
 # define PR_FUTEX_HASH_GET_SLOTS	2
 
+/* RSEQ time slice extensions */
+#define PR_RSEQ_SLICE_EXTENSION			79
+# define PR_RSEQ_SLICE_EXTENSION_GET		1
+# define PR_RSEQ_SLICE_EXTENSION_SET		2
+/*
+ * Bits for PR_RSEQ_SLICE_EXTENSION_GET/SET
+ * PR_RSEQ_SLICE_EXT_ENABLE:	Enable
+ */
+# define PR_RSEQ_SLICE_EXT_ENABLE		0x01
+
 #endif /* _LINUX_PRCTL_H */
diff --git a/kernel/rseq.c b/kernel/rseq.c
index 415d75b6df2c..09848bb14ec2 100644
--- a/kernel/rseq.c
+++ b/kernel/rseq.c
@@ -71,6 +71,7 @@
 #define RSEQ_BUILD_SLOW_PATH
 
 #include <linux/debugfs.h>
+#include <linux/prctl.h>
 #include <linux/ratelimit.h>
 #include <linux/rseq_entry.h>
 #include <linux/sched.h>
@@ -501,6 +502,57 @@ efault:
 #ifdef CONFIG_RSEQ_SLICE_EXTENSION
 DEFINE_STATIC_KEY_TRUE(rseq_slice_extension_key);
 
+int rseq_slice_extension_prctl(unsigned long arg2, unsigned long arg3)
+{
+	switch (arg2) {
+	case PR_RSEQ_SLICE_EXTENSION_GET:
+		if (arg3)
+			return -EINVAL;
+		return current->rseq.slice.state.enabled ? PR_RSEQ_SLICE_EXT_ENABLE : 0;
+
+	case PR_RSEQ_SLICE_EXTENSION_SET: {
+		u32 rflags, valid = RSEQ_CS_FLAG_SLICE_EXT_AVAILABLE;
+		bool enable = !!(arg3 & PR_RSEQ_SLICE_EXT_ENABLE);
+
+		if (arg3 & ~PR_RSEQ_SLICE_EXT_ENABLE)
+			return -EINVAL;
+		if (!rseq_slice_extension_enabled())
+			return -ENOTSUPP;
+		if (!current->rseq.usrptr)
+			return -ENXIO;
+
+		/* No change? */
+		if (enable == !!current->rseq.slice.state.enabled)
+			return 0;
+
+		if (get_user(rflags, &current->rseq.usrptr->flags))
+			goto die;
+
+		if (current->rseq.slice.state.enabled)
+			valid |= RSEQ_CS_FLAG_SLICE_EXT_ENABLED;
+
+		if ((rflags & valid) != valid)
+			goto die;
+
+		rflags &= ~RSEQ_CS_FLAG_SLICE_EXT_ENABLED;
+		rflags |= RSEQ_CS_FLAG_SLICE_EXT_AVAILABLE;
+		if (enable)
+			rflags |= RSEQ_CS_FLAG_SLICE_EXT_ENABLED;
+
+		if (put_user(rflags, &current->rseq.usrptr->flags))
+			goto die;
+
+		current->rseq.slice.state.enabled = enable;
+		return 0;
+	}
+	default:
+		return -EINVAL;
+	}
+die:
+	force_sig(SIGSEGV);
+	return -EFAULT;
+}
+
 static int __init rseq_slice_cmdline(char *str)
 {
 	bool on;
diff --git a/kernel/sys.c b/kernel/sys.c
index 8b58eece4e58..af71987df81c 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -53,6 +53,7 @@
 #include <linux/time_namespace.h>
 #include <linux/binfmts.h>
 #include <linux/futex.h>
+#include <linux/rseq.h>
 
 #include <linux/sched.h>
 #include <linux/sched/autogroup.h>
@@ -2868,6 +2869,11 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
 	case PR_FUTEX_HASH:
 		error = futex_hash_prctl(arg2, arg3, arg4);
 		break;
+	case PR_RSEQ_SLICE_EXTENSION:
+		if (arg4 || arg5)
+			return -EINVAL;
+		error = rseq_slice_extension_prctl(arg2, arg3);
+		break;
 	default:
 		trace_task_prctl_unknown(option, arg2, arg3, arg4, arg5);
 		error = -EINVAL;
-- 
cgit v1.2.3


From d6200245c75e832af2087bc60ba2e6641a90eee9 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Mon, 19 Jan 2026 11:23:57 +0100
Subject: rseq: Allow registering RSEQ with slice extension

Since glibc cares about the number of syscalls required to initialize a new
thread, allow initializing rseq with slice extension on. This avoids having to
do another prctl().

Requested-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://patch.msgid.link/20260121143207.814193010@infradead.org
---
 include/uapi/linux/rseq.h |  3 ++-
 kernel/rseq.c             | 12 ++++++++++--
 2 files changed, 12 insertions(+), 3 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/rseq.h b/include/uapi/linux/rseq.h
index 6afc219d1545..863c4a00a66b 100644
--- a/include/uapi/linux/rseq.h
+++ b/include/uapi/linux/rseq.h
@@ -19,7 +19,8 @@ enum rseq_cpu_id_state {
 };
 
 enum rseq_flags {
-	RSEQ_FLAG_UNREGISTER = (1 << 0),
+	RSEQ_FLAG_UNREGISTER			= (1 << 0),
+	RSEQ_FLAG_SLICE_EXT_DEFAULT_ON		= (1 << 1),
 };
 
 enum rseq_cs_flags_bit {
diff --git a/kernel/rseq.c b/kernel/rseq.c
index 275d70114107..1c5490a172a8 100644
--- a/kernel/rseq.c
+++ b/kernel/rseq.c
@@ -424,7 +424,7 @@ SYSCALL_DEFINE4(rseq, struct rseq __user *, rseq, u32, rseq_len, int, flags, u32
 		return 0;
 	}
 
-	if (unlikely(flags))
+	if (unlikely(flags & ~(RSEQ_FLAG_SLICE_EXT_DEFAULT_ON)))
 		return -EINVAL;
 
 	if (current->rseq.usrptr) {
@@ -459,8 +459,12 @@ SYSCALL_DEFINE4(rseq, struct rseq __user *, rseq, u32, rseq_len, int, flags, u32
 	if (!access_ok(rseq, rseq_len))
 		return -EFAULT;
 
-	if (IS_ENABLED(CONFIG_RSEQ_SLICE_EXTENSION))
+	if (IS_ENABLED(CONFIG_RSEQ_SLICE_EXTENSION)) {
 		rseqfl |= RSEQ_CS_FLAG_SLICE_EXT_AVAILABLE;
+		if (rseq_slice_extension_enabled() &&
+		    (flags & RSEQ_FLAG_SLICE_EXT_DEFAULT_ON))
+			rseqfl |= RSEQ_CS_FLAG_SLICE_EXT_ENABLED;
+	}
 
 	scoped_user_write_access(rseq, efault) {
 		/*
@@ -488,6 +492,10 @@ SYSCALL_DEFINE4(rseq, struct rseq __user *, rseq, u32, rseq_len, int, flags, u32
 	current->rseq.len = rseq_len;
 	current->rseq.sig = sig;
 
+#ifdef CONFIG_RSEQ_SLICE_EXTENSION
+	current->rseq.slice.state.enabled = !!(rseqfl & RSEQ_CS_FLAG_SLICE_EXT_ENABLED);
+#endif
+
 	/*
 	 * If rseq was previously inactive, and has just been
 	 * registered, ensure the cpu_id_start and cpu_id fields
-- 
cgit v1.2.3


From f174a9ffcd48d78a45d560c02ce4071ded036b53 Mon Sep 17 00:00:00 2001
From: Marc Zyngier <maz@kernel.org>
Date: Mon, 19 Jan 2026 10:29:22 +0800
Subject: KVM: arm64: Add exit to userspace on {LD,ST}64B* outside of memslots

The main use of {LD,ST}64B* is to talk to a device, which is hopefully
directly assigned to the guest and requires no additional handling.

However, this does not preclude a VMM from exposing a virtual device
to the guest, and to allow 64 byte accesses as part of the programming
interface. A direct consequence of this is that we need to be able
to forward such access to userspace.

Given that such a contraption is very unlikely to ever exist, we choose
to offer a limited service: userspace gets (as part of a new exit reason)
the ESR, the IPA, and that's it. It is fully expected to handle the full
semantics of the instructions, deal with ACCDATA, the return values and
increment PC. Much fun.

A canonical implementation can also simply inject an abort and be done
with it. Frankly, don't try to do anything else unless you have time
to waste.

Acked-by: Arnd Bergmann <arnd@arndb.de>
Acked-by: Oliver Upton <oupton@kernel.org>
Signed-off-by: Marc Zyngier <maz@kernel.org>
Signed-off-by: Yicong Yang <yangyicong@hisilicon.com>
Signed-off-by: Zhou Wang <wangzhou1@hisilicon.com>
Signed-off-by: Will Deacon <will@kernel.org>
---
 arch/arm64/kvm/mmio.c    | 27 ++++++++++++++++++++++++++-
 include/uapi/linux/kvm.h |  3 ++-
 2 files changed, 28 insertions(+), 2 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/arch/arm64/kvm/mmio.c b/arch/arm64/kvm/mmio.c
index 54f9358c9e0e..e2285ed8c91d 100644
--- a/arch/arm64/kvm/mmio.c
+++ b/arch/arm64/kvm/mmio.c
@@ -159,6 +159,9 @@ int io_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa)
 	bool is_write;
 	int len;
 	u8 data_buf[8];
+	u64 esr;
+
+	esr = kvm_vcpu_get_esr(vcpu);
 
 	/*
 	 * No valid syndrome? Ask userspace for help if it has
@@ -168,7 +171,7 @@ int io_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa)
 	 * though, so directly deliver an exception to the guest.
 	 */
 	if (!kvm_vcpu_dabt_isvalid(vcpu)) {
-		trace_kvm_mmio_nisv(*vcpu_pc(vcpu), kvm_vcpu_get_esr(vcpu),
+		trace_kvm_mmio_nisv(*vcpu_pc(vcpu), esr,
 				    kvm_vcpu_get_hfar(vcpu), fault_ipa);
 
 		if (vcpu_is_protected(vcpu))
@@ -185,6 +188,28 @@ int io_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa)
 		return -ENOSYS;
 	}
 
+	/*
+	 * When (DFSC == 0b00xxxx || DFSC == 0b10101x) && DFSC != 0b0000xx
+	 * ESR_EL2[12:11] describe the Load/Store Type. This allows us to
+	 * punt the LD64B/ST64B/ST64BV/ST64BV0 instructions to userspace,
+	 * which will have to provide a full emulation of these 4
+	 * instructions.  No, we don't expect this do be fast.
+	 *
+	 * We rely on traps being set if the corresponding features are not
+	 * enabled, so if we get here, userspace has promised us to handle
+	 * it already.
+	 */
+	switch (kvm_vcpu_trap_get_fault(vcpu)) {
+	case 0b000100 ... 0b001111:
+	case 0b101010 ... 0b101011:
+		if (FIELD_GET(GENMASK(12, 11), esr)) {
+			run->exit_reason = KVM_EXIT_ARM_LDST64B;
+			run->arm_nisv.esr_iss = esr & ~(u64)ESR_ELx_FSC;
+			run->arm_nisv.fault_ipa = fault_ipa;
+			return 0;
+		}
+	}
+
 	/*
 	 * Prepare MMIO operation. First decode the syndrome data we get
 	 * from the CPU. Then try if some in-kernel emulation feels
diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
index dddb781b0507..88cca0e22ece 100644
--- a/include/uapi/linux/kvm.h
+++ b/include/uapi/linux/kvm.h
@@ -180,6 +180,7 @@ struct kvm_xen_exit {
 #define KVM_EXIT_MEMORY_FAULT     39
 #define KVM_EXIT_TDX              40
 #define KVM_EXIT_ARM_SEA          41
+#define KVM_EXIT_ARM_LDST64B      42
 
 /* For KVM_EXIT_INTERNAL_ERROR */
 /* Emulate instruction failed. */
@@ -402,7 +403,7 @@ struct kvm_run {
 		} eoi;
 		/* KVM_EXIT_HYPERV */
 		struct kvm_hyperv_exit hyperv;
-		/* KVM_EXIT_ARM_NISV */
+		/* KVM_EXIT_ARM_NISV / KVM_EXIT_ARM_LDST64B */
 		struct {
 			__u64 esr_iss;
 			__u64 fault_ipa;
-- 
cgit v1.2.3


From 0f7afd80d81b739c4a9a6e4e24109ba1030c9c56 Mon Sep 17 00:00:00 2001
From: Terry Bowman <terry.bowman@amd.com>
Date: Wed, 14 Jan 2026 12:20:22 -0600
Subject: PCI: Move CXL DVSEC definitions into uapi/linux/pci_regs.h

The CXL DVSECs are currently defined in cxl/core/cxlpci.h. These are not
accessible to other subsystems. Move these to uapi/linux/pci_regs.h.

The CXL DVSEC definitions will be renamed and reformatted to fit better
with existing defines.

Signed-off-by: Terry Bowman <terry.bowman@amd.com>
Reviewed-by: Dave Jiang <dave.jiang@intel.com>
Reviewed-by: Jonathan Cameron <jonathan.cameron@huawei.com>
Reviewed-by: Dan Williams <dan.j.williams@intel.com>
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
Acked-by: Bjorn Helgaas <bhelgaas@google.com>
Link: https://patch.msgid.link/20260114182055.46029-2-terry.bowman@amd.com
Signed-off-by: Dave Jiang <dave.jiang@intel.com>
---
 drivers/cxl/cxlpci.h          | 53 -----------------------------------
 include/uapi/linux/pci_regs.h | 64 +++++++++++++++++++++++++++++++++++++++----
 2 files changed, 59 insertions(+), 58 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/drivers/cxl/cxlpci.h b/drivers/cxl/cxlpci.h
index 1d526bea8431..cdb7cf3dbcb4 100644
--- a/drivers/cxl/cxlpci.h
+++ b/drivers/cxl/cxlpci.h
@@ -7,59 +7,6 @@
 
 #define CXL_MEMORY_PROGIF	0x10
 
-/*
- * See section 8.1 Configuration Space Registers in the CXL 2.0
- * Specification. Names are taken straight from the specification with "CXL" and
- * "DVSEC" redundancies removed. When obvious, abbreviations may be used.
- */
-#define PCI_DVSEC_HEADER1_LENGTH_MASK	GENMASK(31, 20)
-
-/* CXL 2.0 8.1.3: PCIe DVSEC for CXL Device */
-#define CXL_DVSEC_PCIE_DEVICE					0
-#define   CXL_DVSEC_CAP_OFFSET		0xA
-#define     CXL_DVSEC_MEM_CAPABLE	BIT(2)
-#define     CXL_DVSEC_HDM_COUNT_MASK	GENMASK(5, 4)
-#define   CXL_DVSEC_CTRL_OFFSET		0xC
-#define     CXL_DVSEC_MEM_ENABLE	BIT(2)
-#define   CXL_DVSEC_RANGE_SIZE_HIGH(i)	(0x18 + (i * 0x10))
-#define   CXL_DVSEC_RANGE_SIZE_LOW(i)	(0x1C + (i * 0x10))
-#define     CXL_DVSEC_MEM_INFO_VALID	BIT(0)
-#define     CXL_DVSEC_MEM_ACTIVE	BIT(1)
-#define     CXL_DVSEC_MEM_SIZE_LOW_MASK	GENMASK(31, 28)
-#define   CXL_DVSEC_RANGE_BASE_HIGH(i)	(0x20 + (i * 0x10))
-#define   CXL_DVSEC_RANGE_BASE_LOW(i)	(0x24 + (i * 0x10))
-#define     CXL_DVSEC_MEM_BASE_LOW_MASK	GENMASK(31, 28)
-
-#define CXL_DVSEC_RANGE_MAX		2
-
-/* CXL 2.0 8.1.4: Non-CXL Function Map DVSEC */
-#define CXL_DVSEC_FUNCTION_MAP					2
-
-/* CXL 2.0 8.1.5: CXL 2.0 Extensions DVSEC for Ports */
-#define CXL_DVSEC_PORT_EXTENSIONS				3
-
-/* CXL 2.0 8.1.6: GPF DVSEC for CXL Port */
-#define CXL_DVSEC_PORT_GPF					4
-#define   CXL_DVSEC_PORT_GPF_PHASE_1_CONTROL_OFFSET		0x0C
-#define     CXL_DVSEC_PORT_GPF_PHASE_1_TMO_BASE_MASK		GENMASK(3, 0)
-#define     CXL_DVSEC_PORT_GPF_PHASE_1_TMO_SCALE_MASK		GENMASK(11, 8)
-#define   CXL_DVSEC_PORT_GPF_PHASE_2_CONTROL_OFFSET		0xE
-#define     CXL_DVSEC_PORT_GPF_PHASE_2_TMO_BASE_MASK		GENMASK(3, 0)
-#define     CXL_DVSEC_PORT_GPF_PHASE_2_TMO_SCALE_MASK		GENMASK(11, 8)
-
-/* CXL 2.0 8.1.7: GPF DVSEC for CXL Device */
-#define CXL_DVSEC_DEVICE_GPF					5
-
-/* CXL 2.0 8.1.8: PCIe DVSEC for Flex Bus Port */
-#define CXL_DVSEC_PCIE_FLEXBUS_PORT				7
-
-/* CXL 2.0 8.1.9: Register Locator DVSEC */
-#define CXL_DVSEC_REG_LOCATOR					8
-#define   CXL_DVSEC_REG_LOCATOR_BLOCK1_OFFSET			0xC
-#define     CXL_DVSEC_REG_LOCATOR_BIR_MASK			GENMASK(2, 0)
-#define	    CXL_DVSEC_REG_LOCATOR_BLOCK_ID_MASK			GENMASK(15, 8)
-#define     CXL_DVSEC_REG_LOCATOR_BLOCK_OFF_LOW_MASK		GENMASK(31, 16)
-
 /*
  * NOTE: Currently all the functions which are enabled for CXL require their
  * vectors to be in the first 16.  Use this as the default max.
diff --git a/include/uapi/linux/pci_regs.h b/include/uapi/linux/pci_regs.h
index 3add74ae2594..6c4b6f19b18e 100644
--- a/include/uapi/linux/pci_regs.h
+++ b/include/uapi/linux/pci_regs.h
@@ -1253,11 +1253,6 @@
 #define PCI_DEV3_STA		0x0c	/* Device 3 Status Register */
 #define  PCI_DEV3_STA_SEGMENT	0x8	/* Segment Captured (end-to-end flit-mode detected) */
 
-/* Compute Express Link (CXL r3.1, sec 8.1.5) */
-#define PCI_DVSEC_CXL_PORT				3
-#define PCI_DVSEC_CXL_PORT_CTL				0x0c
-#define PCI_DVSEC_CXL_PORT_CTL_UNMASK_SBR		0x00000001
-
 /* Integrity and Data Encryption Extended Capability */
 #define PCI_IDE_CAP			0x04
 #define  PCI_IDE_CAP_LINK		0x1  /* Link IDE Stream Supported */
@@ -1338,4 +1333,63 @@
 #define  PCI_IDE_SEL_ADDR_3(x)		(28 + (x) * PCI_IDE_SEL_ADDR_BLOCK_SIZE)
 #define PCI_IDE_SEL_BLOCK_SIZE(nr_assoc)  (20 + PCI_IDE_SEL_ADDR_BLOCK_SIZE * (nr_assoc))
 
+/* Compute Express Link (CXL r3.1, sec 8.1.5) */
+#define PCI_DVSEC_CXL_PORT				3
+#define PCI_DVSEC_CXL_PORT_CTL				0x0c
+#define PCI_DVSEC_CXL_PORT_CTL_UNMASK_SBR		0x00000001
+
+/*
+ * Compute Express Link (CXL r3.2, sec 8.1)
+ *
+ * Note that CXL DVSEC IDs 3 and 7 are to be ignored when the CXL link state
+ * is "disconnected" (CXL r3.2, sec 9.12.3). Re-enumerate these
+ * registers on downstream link-up events.
+ */
+#define PCI_DVSEC_HEADER1_LENGTH_MASK  __GENMASK(31, 20)
+
+/* CXL 3.2 8.1.3: PCIe DVSEC for CXL Device */
+#define CXL_DVSEC_PCIE_DEVICE				0
+#define  CXL_DVSEC_CAP_OFFSET				0xA
+#define   CXL_DVSEC_MEM_CAPABLE				_BITUL(2)
+#define   CXL_DVSEC_HDM_COUNT_MASK			__GENMASK(5, 4)
+#define  CXL_DVSEC_CTRL_OFFSET				0xC
+#define   CXL_DVSEC_MEM_ENABLE				_BITUL(2)
+#define  CXL_DVSEC_RANGE_SIZE_HIGH(i)			(0x18 + (i * 0x10))
+#define  CXL_DVSEC_RANGE_SIZE_LOW(i)			(0x1C + (i * 0x10))
+#define   CXL_DVSEC_MEM_INFO_VALID			_BITUL(0)
+#define   CXL_DVSEC_MEM_ACTIVE				_BITUL(1)
+#define   CXL_DVSEC_MEM_SIZE_LOW_MASK			__GENMASK(31, 28)
+#define  CXL_DVSEC_RANGE_BASE_HIGH(i)			(0x20 + (i * 0x10))
+#define  CXL_DVSEC_RANGE_BASE_LOW(i)			(0x24 + (i * 0x10))
+#define   CXL_DVSEC_MEM_BASE_LOW_MASK			__GENMASK(31, 28)
+
+#define CXL_DVSEC_RANGE_MAX				2
+
+/* CXL 3.2 8.1.4: Non-CXL Function Map DVSEC */
+#define CXL_DVSEC_FUNCTION_MAP				2
+
+/* CXL 3.2 8.1.5: Extensions DVSEC for Ports */
+#define CXL_DVSEC_PORT					3
+#define   CXL_DVSEC_PORT_CTL				0x0c
+#define    CXL_DVSEC_PORT_CTL_UNMASK_SBR		0x00000001
+
+/* CXL 3.2 8.1.6: GPF DVSEC for CXL Port */
+#define CXL_DVSEC_PORT_GPF				4
+#define  CXL_DVSEC_PORT_GPF_PHASE_1_CONTROL_OFFSET	0x0C
+#define   CXL_DVSEC_PORT_GPF_PHASE_1_TMO_BASE_MASK	__GENMASK(3, 0)
+#define   CXL_DVSEC_PORT_GPF_PHASE_1_TMO_SCALE_MASK	__GENMASK(11, 8)
+#define  CXL_DVSEC_PORT_GPF_PHASE_2_CONTROL_OFFSET	0xE
+#define   CXL_DVSEC_PORT_GPF_PHASE_2_TMO_BASE_MASK	__GENMASK(3, 0)
+#define   CXL_DVSEC_PORT_GPF_PHASE_2_TMO_SCALE_MASK	__GENMASK(11, 8)
+
+/* CXL 3.2 8.1.7: GPF DVSEC for CXL Device */
+#define CXL_DVSEC_DEVICE_GPF				5
+
+/* CXL 3.2 8.1.9: Register Locator DVSEC */
+#define CXL_DVSEC_REG_LOCATOR				8
+#define  CXL_DVSEC_REG_LOCATOR_BLOCK1_OFFSET		0xC
+#define   CXL_DVSEC_REG_LOCATOR_BIR_MASK		__GENMASK(2, 0)
+#define   CXL_DVSEC_REG_LOCATOR_BLOCK_ID_MASK		__GENMASK(15, 8)
+#define   CXL_DVSEC_REG_LOCATOR_BLOCK_OFF_LOW_MASK	__GENMASK(31, 16)
+
 #endif /* LINUX_PCI_REGS_H */
-- 
cgit v1.2.3


From 6612bd9ff0b1001cff5f5d79db6ce44427d2e99c Mon Sep 17 00:00:00 2001
From: Terry Bowman <terry.bowman@amd.com>
Date: Wed, 14 Jan 2026 12:20:23 -0600
Subject: PCI: Update CXL DVSEC definitions

CXL DVSEC definitions were recently moved into uapi/pci_regs.h, but the
newly added macros do not follow the file's existing naming conventions.
The current format uses CXL_DVSEC_XYZ, while the new CXL entries must
instead use the PCI_DVSEC_CXL_XYZ prefix to match the conventions already
established in pci_regs.h.

The new CXL DVSEC macros also introduce _MASK and _OFFSET suffixes, which
are not used anywhere else in the file. These suffixes lengthen the
identifiers and reduce readability. Remove _MASK and _OFFSET from the
recently added definitions.

Additionally, remove PCI_DVSEC_HEADER1_LENGTH_MASK, as it duplicates the existing
PCI_DVSEC_HEADER1_LEN() macro.

Update all existing references to use the new macro names.

Finally, update the inline documentation to reference the latest revision
of the CXL specification.

Signed-off-by: Terry Bowman <terry.bowman@amd.com>
Reviewed-by: Dan Williams <dan.j.williams@intel.com>
Acked-by: Bjorn Helgaas <bhelgaas@google.com>
Link: https://patch.msgid.link/20260114182055.46029-3-terry.bowman@amd.com
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
Signed-off-by: Dave Jiang <dave.jiang@intel.com>
---
 drivers/cxl/core/pci.c        | 58 +++++++++++++-------------
 drivers/cxl/core/regs.c       | 14 +++----
 drivers/cxl/pci.c             |  2 +-
 include/uapi/linux/pci_regs.h | 94 ++++++++++++++++++++-----------------------
 4 files changed, 81 insertions(+), 87 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/drivers/cxl/core/pci.c b/drivers/cxl/core/pci.c
index 5b023a0178a4..077b386e0c8d 100644
--- a/drivers/cxl/core/pci.c
+++ b/drivers/cxl/core/pci.c
@@ -86,12 +86,12 @@ static int cxl_dvsec_mem_range_valid(struct cxl_dev_state *cxlds, int id)
 	i = 1;
 	do {
 		rc = pci_read_config_dword(pdev,
-					   d + CXL_DVSEC_RANGE_SIZE_LOW(id),
+					   d + PCI_DVSEC_CXL_RANGE_SIZE_LOW(id),
 					   &temp);
 		if (rc)
 			return rc;
 
-		valid = FIELD_GET(CXL_DVSEC_MEM_INFO_VALID, temp);
+		valid = FIELD_GET(PCI_DVSEC_CXL_MEM_INFO_VALID, temp);
 		if (valid)
 			break;
 		msleep(1000);
@@ -121,11 +121,11 @@ static int cxl_dvsec_mem_range_active(struct cxl_dev_state *cxlds, int id)
 	/* Check MEM ACTIVE bit, up to 60s timeout by default */
 	for (i = media_ready_timeout; i; i--) {
 		rc = pci_read_config_dword(
-			pdev, d + CXL_DVSEC_RANGE_SIZE_LOW(id), &temp);
+			pdev, d + PCI_DVSEC_CXL_RANGE_SIZE_LOW(id), &temp);
 		if (rc)
 			return rc;
 
-		active = FIELD_GET(CXL_DVSEC_MEM_ACTIVE, temp);
+		active = FIELD_GET(PCI_DVSEC_CXL_MEM_ACTIVE, temp);
 		if (active)
 			break;
 		msleep(1000);
@@ -154,11 +154,11 @@ int cxl_await_media_ready(struct cxl_dev_state *cxlds)
 	u16 cap;
 
 	rc = pci_read_config_word(pdev,
-				  d + CXL_DVSEC_CAP_OFFSET, &cap);
+				  d + PCI_DVSEC_CXL_CAP, &cap);
 	if (rc)
 		return rc;
 
-	hdm_count = FIELD_GET(CXL_DVSEC_HDM_COUNT_MASK, cap);
+	hdm_count = FIELD_GET(PCI_DVSEC_CXL_HDM_COUNT, cap);
 	for (i = 0; i < hdm_count; i++) {
 		rc = cxl_dvsec_mem_range_valid(cxlds, i);
 		if (rc)
@@ -186,16 +186,16 @@ static int cxl_set_mem_enable(struct cxl_dev_state *cxlds, u16 val)
 	u16 ctrl;
 	int rc;
 
-	rc = pci_read_config_word(pdev, d + CXL_DVSEC_CTRL_OFFSET, &ctrl);
+	rc = pci_read_config_word(pdev, d + PCI_DVSEC_CXL_CTRL, &ctrl);
 	if (rc < 0)
 		return rc;
 
-	if ((ctrl & CXL_DVSEC_MEM_ENABLE) == val)
+	if ((ctrl & PCI_DVSEC_CXL_MEM_ENABLE) == val)
 		return 1;
-	ctrl &= ~CXL_DVSEC_MEM_ENABLE;
+	ctrl &= ~PCI_DVSEC_CXL_MEM_ENABLE;
 	ctrl |= val;
 
-	rc = pci_write_config_word(pdev, d + CXL_DVSEC_CTRL_OFFSET, ctrl);
+	rc = pci_write_config_word(pdev, d + PCI_DVSEC_CXL_CTRL, ctrl);
 	if (rc < 0)
 		return rc;
 
@@ -211,7 +211,7 @@ static int devm_cxl_enable_mem(struct device *host, struct cxl_dev_state *cxlds)
 {
 	int rc;
 
-	rc = cxl_set_mem_enable(cxlds, CXL_DVSEC_MEM_ENABLE);
+	rc = cxl_set_mem_enable(cxlds, PCI_DVSEC_CXL_MEM_ENABLE);
 	if (rc < 0)
 		return rc;
 	if (rc > 0)
@@ -273,11 +273,11 @@ int cxl_dvsec_rr_decode(struct cxl_dev_state *cxlds,
 		return -ENXIO;
 	}
 
-	rc = pci_read_config_word(pdev, d + CXL_DVSEC_CAP_OFFSET, &cap);
+	rc = pci_read_config_word(pdev, d + PCI_DVSEC_CXL_CAP, &cap);
 	if (rc)
 		return rc;
 
-	if (!(cap & CXL_DVSEC_MEM_CAPABLE)) {
+	if (!(cap & PCI_DVSEC_CXL_MEM_CAPABLE)) {
 		dev_dbg(dev, "Not MEM Capable\n");
 		return -ENXIO;
 	}
@@ -288,7 +288,7 @@ int cxl_dvsec_rr_decode(struct cxl_dev_state *cxlds,
 	 * driver is for a spec defined class code which must be CXL.mem
 	 * capable, there is no point in continuing to enable CXL.mem.
 	 */
-	hdm_count = FIELD_GET(CXL_DVSEC_HDM_COUNT_MASK, cap);
+	hdm_count = FIELD_GET(PCI_DVSEC_CXL_HDM_COUNT, cap);
 	if (!hdm_count || hdm_count > 2)
 		return -EINVAL;
 
@@ -297,11 +297,11 @@ int cxl_dvsec_rr_decode(struct cxl_dev_state *cxlds,
 	 * disabled, and they will remain moot after the HDM Decoder
 	 * capability is enabled.
 	 */
-	rc = pci_read_config_word(pdev, d + CXL_DVSEC_CTRL_OFFSET, &ctrl);
+	rc = pci_read_config_word(pdev, d + PCI_DVSEC_CXL_CTRL, &ctrl);
 	if (rc)
 		return rc;
 
-	info->mem_enabled = FIELD_GET(CXL_DVSEC_MEM_ENABLE, ctrl);
+	info->mem_enabled = FIELD_GET(PCI_DVSEC_CXL_MEM_ENABLE, ctrl);
 	if (!info->mem_enabled)
 		return 0;
 
@@ -314,35 +314,35 @@ int cxl_dvsec_rr_decode(struct cxl_dev_state *cxlds,
 			return rc;
 
 		rc = pci_read_config_dword(
-			pdev, d + CXL_DVSEC_RANGE_SIZE_HIGH(i), &temp);
+			pdev, d + PCI_DVSEC_CXL_RANGE_SIZE_HIGH(i), &temp);
 		if (rc)
 			return rc;
 
 		size = (u64)temp << 32;
 
 		rc = pci_read_config_dword(
-			pdev, d + CXL_DVSEC_RANGE_SIZE_LOW(i), &temp);
+			pdev, d + PCI_DVSEC_CXL_RANGE_SIZE_LOW(i), &temp);
 		if (rc)
 			return rc;
 
-		size |= temp & CXL_DVSEC_MEM_SIZE_LOW_MASK;
+		size |= temp & PCI_DVSEC_CXL_MEM_SIZE_LOW;
 		if (!size) {
 			continue;
 		}
 
 		rc = pci_read_config_dword(
-			pdev, d + CXL_DVSEC_RANGE_BASE_HIGH(i), &temp);
+			pdev, d + PCI_DVSEC_CXL_RANGE_BASE_HIGH(i), &temp);
 		if (rc)
 			return rc;
 
 		base = (u64)temp << 32;
 
 		rc = pci_read_config_dword(
-			pdev, d + CXL_DVSEC_RANGE_BASE_LOW(i), &temp);
+			pdev, d + PCI_DVSEC_CXL_RANGE_BASE_LOW(i), &temp);
 		if (rc)
 			return rc;
 
-		base |= temp & CXL_DVSEC_MEM_BASE_LOW_MASK;
+		base |= temp & PCI_DVSEC_CXL_MEM_BASE_LOW;
 
 		info->dvsec_range[ranges++] = (struct range) {
 			.start = base,
@@ -1068,7 +1068,7 @@ u16 cxl_gpf_get_dvsec(struct device *dev)
 		is_port = false;
 
 	dvsec = pci_find_dvsec_capability(pdev, PCI_VENDOR_ID_CXL,
-			is_port ? CXL_DVSEC_PORT_GPF : CXL_DVSEC_DEVICE_GPF);
+			is_port ? PCI_DVSEC_CXL_PORT_GPF : PCI_DVSEC_CXL_DEVICE_GPF);
 	if (!dvsec)
 		dev_warn(dev, "%s GPF DVSEC not present\n",
 			 is_port ? "Port" : "Device");
@@ -1084,14 +1084,14 @@ static int update_gpf_port_dvsec(struct pci_dev *pdev, int dvsec, int phase)
 
 	switch (phase) {
 	case 1:
-		offset = CXL_DVSEC_PORT_GPF_PHASE_1_CONTROL_OFFSET;
-		base = CXL_DVSEC_PORT_GPF_PHASE_1_TMO_BASE_MASK;
-		scale = CXL_DVSEC_PORT_GPF_PHASE_1_TMO_SCALE_MASK;
+		offset = PCI_DVSEC_CXL_PORT_GPF_PHASE_1_CONTROL;
+		base = PCI_DVSEC_CXL_PORT_GPF_PHASE_1_TMO_BASE;
+		scale = PCI_DVSEC_CXL_PORT_GPF_PHASE_1_TMO_SCALE;
 		break;
 	case 2:
-		offset = CXL_DVSEC_PORT_GPF_PHASE_2_CONTROL_OFFSET;
-		base = CXL_DVSEC_PORT_GPF_PHASE_2_TMO_BASE_MASK;
-		scale = CXL_DVSEC_PORT_GPF_PHASE_2_TMO_SCALE_MASK;
+		offset = PCI_DVSEC_CXL_PORT_GPF_PHASE_2_CONTROL;
+		base = PCI_DVSEC_CXL_PORT_GPF_PHASE_2_TMO_BASE;
+		scale = PCI_DVSEC_CXL_PORT_GPF_PHASE_2_TMO_SCALE;
 		break;
 	default:
 		return -EINVAL;
diff --git a/drivers/cxl/core/regs.c b/drivers/cxl/core/regs.c
index 5ca7b0eed568..a010b3214342 100644
--- a/drivers/cxl/core/regs.c
+++ b/drivers/cxl/core/regs.c
@@ -271,10 +271,10 @@ EXPORT_SYMBOL_NS_GPL(cxl_map_device_regs, "CXL");
 static bool cxl_decode_regblock(struct pci_dev *pdev, u32 reg_lo, u32 reg_hi,
 				struct cxl_register_map *map)
 {
-	u8 reg_type = FIELD_GET(CXL_DVSEC_REG_LOCATOR_BLOCK_ID_MASK, reg_lo);
-	int bar = FIELD_GET(CXL_DVSEC_REG_LOCATOR_BIR_MASK, reg_lo);
+	u8 reg_type = FIELD_GET(PCI_DVSEC_CXL_REG_LOCATOR_BLOCK_ID, reg_lo);
+	int bar = FIELD_GET(PCI_DVSEC_CXL_REG_LOCATOR_BIR, reg_lo);
 	u64 offset = ((u64)reg_hi << 32) |
-		     (reg_lo & CXL_DVSEC_REG_LOCATOR_BLOCK_OFF_LOW_MASK);
+		     (reg_lo & PCI_DVSEC_CXL_REG_LOCATOR_BLOCK_OFF_LOW);
 
 	if (offset > pci_resource_len(pdev, bar)) {
 		dev_warn(&pdev->dev,
@@ -311,15 +311,15 @@ static int __cxl_find_regblock_instance(struct pci_dev *pdev, enum cxl_regloc_ty
 	};
 
 	regloc = pci_find_dvsec_capability(pdev, PCI_VENDOR_ID_CXL,
-					   CXL_DVSEC_REG_LOCATOR);
+					   PCI_DVSEC_CXL_REG_LOCATOR);
 	if (!regloc)
 		return -ENXIO;
 
 	pci_read_config_dword(pdev, regloc + PCI_DVSEC_HEADER1, &regloc_size);
-	regloc_size = FIELD_GET(PCI_DVSEC_HEADER1_LENGTH_MASK, regloc_size);
+	regloc_size = PCI_DVSEC_HEADER1_LEN(regloc_size);
 
-	regloc += CXL_DVSEC_REG_LOCATOR_BLOCK1_OFFSET;
-	regblocks = (regloc_size - CXL_DVSEC_REG_LOCATOR_BLOCK1_OFFSET) / 8;
+	regloc += PCI_DVSEC_CXL_REG_LOCATOR_BLOCK1;
+	regblocks = (regloc_size - PCI_DVSEC_CXL_REG_LOCATOR_BLOCK1) / 8;
 
 	for (i = 0; i < regblocks; i++, regloc += 8) {
 		u32 reg_lo, reg_hi;
diff --git a/drivers/cxl/pci.c b/drivers/cxl/pci.c
index 0be4e508affe..b7f694bda913 100644
--- a/drivers/cxl/pci.c
+++ b/drivers/cxl/pci.c
@@ -933,7 +933,7 @@ static int cxl_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id)
 	cxlds->rcd = is_cxl_restricted(pdev);
 	cxlds->serial = pci_get_dsn(pdev);
 	cxlds->cxl_dvsec = pci_find_dvsec_capability(
-		pdev, PCI_VENDOR_ID_CXL, CXL_DVSEC_PCIE_DEVICE);
+		pdev, PCI_VENDOR_ID_CXL, PCI_DVSEC_CXL_DEVICE);
 	if (!cxlds->cxl_dvsec)
 		dev_warn(&pdev->dev,
 			 "Device DVSEC not present, skip CXL.mem init\n");
diff --git a/include/uapi/linux/pci_regs.h b/include/uapi/linux/pci_regs.h
index 6c4b6f19b18e..662582bdccf0 100644
--- a/include/uapi/linux/pci_regs.h
+++ b/include/uapi/linux/pci_regs.h
@@ -1333,63 +1333,57 @@
 #define  PCI_IDE_SEL_ADDR_3(x)		(28 + (x) * PCI_IDE_SEL_ADDR_BLOCK_SIZE)
 #define PCI_IDE_SEL_BLOCK_SIZE(nr_assoc)  (20 + PCI_IDE_SEL_ADDR_BLOCK_SIZE * (nr_assoc))
 
-/* Compute Express Link (CXL r3.1, sec 8.1.5) */
-#define PCI_DVSEC_CXL_PORT				3
-#define PCI_DVSEC_CXL_PORT_CTL				0x0c
-#define PCI_DVSEC_CXL_PORT_CTL_UNMASK_SBR		0x00000001
-
 /*
- * Compute Express Link (CXL r3.2, sec 8.1)
+ * Compute Express Link (CXL r4.0, sec 8.1)
  *
  * Note that CXL DVSEC id 3 and 7 to be ignored when the CXL link state
- * is "disconnected" (CXL r3.2, sec 9.12.3). Re-enumerate these
+ * is "disconnected" (CXL r4.0, sec 9.12.3). Re-enumerate these
  * registers on downstream link-up events.
  */
-#define PCI_DVSEC_HEADER1_LENGTH_MASK  __GENMASK(31, 20)
-
-/* CXL 3.2 8.1.3: PCIe DVSEC for CXL Device */
-#define CXL_DVSEC_PCIE_DEVICE				0
-#define  CXL_DVSEC_CAP_OFFSET				0xA
-#define   CXL_DVSEC_MEM_CAPABLE				_BITUL(2)
-#define   CXL_DVSEC_HDM_COUNT_MASK			__GENMASK(5, 4)
-#define  CXL_DVSEC_CTRL_OFFSET				0xC
-#define   CXL_DVSEC_MEM_ENABLE				_BITUL(2)
-#define  CXL_DVSEC_RANGE_SIZE_HIGH(i)			(0x18 + (i * 0x10))
-#define  CXL_DVSEC_RANGE_SIZE_LOW(i)			(0x1C + (i * 0x10))
-#define   CXL_DVSEC_MEM_INFO_VALID			_BITUL(0)
-#define   CXL_DVSEC_MEM_ACTIVE				_BITUL(1)
-#define   CXL_DVSEC_MEM_SIZE_LOW_MASK			__GENMASK(31, 28)
-#define  CXL_DVSEC_RANGE_BASE_HIGH(i)			(0x20 + (i * 0x10))
-#define  CXL_DVSEC_RANGE_BASE_LOW(i)			(0x24 + (i * 0x10))
-#define   CXL_DVSEC_MEM_BASE_LOW_MASK			__GENMASK(31, 28)
+
+/* CXL r4.0, 8.1.3: PCIe DVSEC for CXL Device */
+#define PCI_DVSEC_CXL_DEVICE				0
+#define  PCI_DVSEC_CXL_CAP				0xA
+#define   PCI_DVSEC_CXL_MEM_CAPABLE			_BITUL(2)
+#define   PCI_DVSEC_CXL_HDM_COUNT			__GENMASK(5, 4)
+#define  PCI_DVSEC_CXL_CTRL				0xC
+#define   PCI_DVSEC_CXL_MEM_ENABLE			_BITUL(2)
+#define  PCI_DVSEC_CXL_RANGE_SIZE_HIGH(i)		(0x18 + (i * 0x10))
+#define  PCI_DVSEC_CXL_RANGE_SIZE_LOW(i)		(0x1C + (i * 0x10))
+#define   PCI_DVSEC_CXL_MEM_INFO_VALID			_BITUL(0)
+#define   PCI_DVSEC_CXL_MEM_ACTIVE			_BITUL(1)
+#define   PCI_DVSEC_CXL_MEM_SIZE_LOW			__GENMASK(31, 28)
+#define  PCI_DVSEC_CXL_RANGE_BASE_HIGH(i)		(0x20 + (i * 0x10))
+#define  PCI_DVSEC_CXL_RANGE_BASE_LOW(i)		(0x24 + (i * 0x10))
+#define   PCI_DVSEC_CXL_MEM_BASE_LOW			__GENMASK(31, 28)
 
 #define CXL_DVSEC_RANGE_MAX				2
 
-/* CXL 3.2 8.1.4: Non-CXL Function Map DVSEC */
-#define CXL_DVSEC_FUNCTION_MAP				2
-
-/* CXL 3.2 8.1.5: Extensions DVSEC for Ports */
-#define CXL_DVSEC_PORT					3
-#define   CXL_DVSEC_PORT_CTL				0x0c
-#define    CXL_DVSEC_PORT_CTL_UNMASK_SBR		0x00000001
-
-/* CXL 3.2 8.1.6: GPF DVSEC for CXL Port */
-#define CXL_DVSEC_PORT_GPF				4
-#define  CXL_DVSEC_PORT_GPF_PHASE_1_CONTROL_OFFSET	0x0C
-#define   CXL_DVSEC_PORT_GPF_PHASE_1_TMO_BASE_MASK	__GENMASK(3, 0)
-#define   CXL_DVSEC_PORT_GPF_PHASE_1_TMO_SCALE_MASK	__GENMASK(11, 8)
-#define  CXL_DVSEC_PORT_GPF_PHASE_2_CONTROL_OFFSET	0xE
-#define   CXL_DVSEC_PORT_GPF_PHASE_2_TMO_BASE_MASK	__GENMASK(3, 0)
-#define   CXL_DVSEC_PORT_GPF_PHASE_2_TMO_SCALE_MASK	__GENMASK(11, 8)
-
-/* CXL 3.2 8.1.7: GPF DVSEC for CXL Device */
-#define CXL_DVSEC_DEVICE_GPF				5
-
-/* CXL 3.2 8.1.9: Register Locator DVSEC */
-#define CXL_DVSEC_REG_LOCATOR				8
-#define  CXL_DVSEC_REG_LOCATOR_BLOCK1_OFFSET		0xC
-#define   CXL_DVSEC_REG_LOCATOR_BIR_MASK		__GENMASK(2, 0)
-#define   CXL_DVSEC_REG_LOCATOR_BLOCK_ID_MASK		__GENMASK(15, 8)
-#define   CXL_DVSEC_REG_LOCATOR_BLOCK_OFF_LOW_MASK	__GENMASK(31, 16)
+/* CXL r4.0, 8.1.4: Non-CXL Function Map DVSEC */
+#define PCI_DVSEC_CXL_FUNCTION_MAP			2
+
+/* CXL r4.0, 8.1.5: Extensions DVSEC for Ports */
+#define PCI_DVSEC_CXL_PORT				3
+#define  PCI_DVSEC_CXL_PORT_CTL				0x0c
+#define   PCI_DVSEC_CXL_PORT_CTL_UNMASK_SBR		0x00000001
+
+/* CXL r4.0, 8.1.6: GPF DVSEC for CXL Port */
+#define PCI_DVSEC_CXL_PORT_GPF				4
+#define  PCI_DVSEC_CXL_PORT_GPF_PHASE_1_CONTROL		0x0C
+#define   PCI_DVSEC_CXL_PORT_GPF_PHASE_1_TMO_BASE	__GENMASK(3, 0)
+#define   PCI_DVSEC_CXL_PORT_GPF_PHASE_1_TMO_SCALE	__GENMASK(11, 8)
+#define  PCI_DVSEC_CXL_PORT_GPF_PHASE_2_CONTROL		0xE
+#define   PCI_DVSEC_CXL_PORT_GPF_PHASE_2_TMO_BASE	__GENMASK(3, 0)
+#define   PCI_DVSEC_CXL_PORT_GPF_PHASE_2_TMO_SCALE	__GENMASK(11, 8)
+
+/* CXL r4.0, 8.1.7: GPF DVSEC for CXL Device */
+#define PCI_DVSEC_CXL_DEVICE_GPF			5
+
+/* CXL r4.0, 8.1.9: Register Locator DVSEC */
+#define PCI_DVSEC_CXL_REG_LOCATOR			8
+#define  PCI_DVSEC_CXL_REG_LOCATOR_BLOCK1		0xC
+#define   PCI_DVSEC_CXL_REG_LOCATOR_BIR			__GENMASK(2, 0)
+#define   PCI_DVSEC_CXL_REG_LOCATOR_BLOCK_ID		__GENMASK(15, 8)
+#define   PCI_DVSEC_CXL_REG_LOCATOR_BLOCK_OFF_LOW	__GENMASK(31, 16)
 
 #endif /* LINUX_PCI_REGS_H */
-- 
cgit v1.2.3


From 7c29ba02210c6e4570cdce53813a1ae68fb6d049 Mon Sep 17 00:00:00 2001
From: Terry Bowman <terry.bowman@amd.com>
Date: Wed, 14 Jan 2026 12:20:24 -0600
Subject: PCI: Introduce pcie_is_cxl()

CXL is a protocol that runs on top of PCIe electricals. Its error model
also runs on top of the PCIe AER error model by standardizing "internal"
errors as "CXL" errors. Linux has historically ignored internal errors.

CXL protocol error handling is then a task of enhancing the PCIe AER
core to understand that PCIe ports (upstream and downstream) and
endpoints may throw internal errors that represent standard CXL protocol
errors.

The proposed method to make that determination is to teach 'struct
pci_dev' to cache when its link has trained the CXL.mem and/or CXL.cache
protocols and then treat all internal errors as CXL errors. A design
goal is to not burden the PCIe AER core with CXL knowledge beyond just
enough to forward error notifications to the CXL RAS core. The forwarded
notification looks up a 'struct cxl_port' or 'struct cxl_dport'
companion device to the PCI device.

Introduce set_pcie_cxl() with logic checking for CXL.mem or CXL.cache
status in the CXL Flex Bus DVSEC status register. The CXL Flex Bus DVSEC
presence is used because it is required for all the CXL PCIe devices.[1]

[1] CXL 3.1 Spec, 8.1.1 PCIe Designated Vendor-Specific Extended
    Capability (DVSEC) ID Assignment, Table 8-2

Signed-off-by: Terry Bowman <terry.bowman@amd.com>
Reviewed-by: Ira Weiny <ira.weiny@intel.com>
Reviewed-by: Kuppuswamy Sathyanarayanan <sathyanarayanan.kuppuswamy@linux.intel.com>
Reviewed-by: Dave Jiang <dave.jiang@intel.com>
Reviewed-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
Reviewed-by: Alejandro Lucero <alucerop@amd.com>
Reviewed-by: Ben Cheatham <benjamin.cheatham@amd.com>
Reviewed-by: Dan Williams <dan.j.williams@intel.com>
Acked-by: Bjorn Helgaas <bhelgaas@google.com>
Link: https://patch.msgid.link/20260114182055.46029-4-terry.bowman@amd.com
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
Signed-off-by: Dave Jiang <dave.jiang@intel.com>
---
 drivers/pci/probe.c           | 31 +++++++++++++++++++++++++++++++
 include/linux/pci.h           |  6 ++++++
 include/uapi/linux/pci_regs.h |  6 ++++++
 3 files changed, 43 insertions(+)

(limited to 'include/uapi/linux')

diff --git a/drivers/pci/probe.c b/drivers/pci/probe.c
index 41183aed8f5d..bd7ce41d0c7a 100644
--- a/drivers/pci/probe.c
+++ b/drivers/pci/probe.c
@@ -1735,6 +1735,35 @@ static void set_pcie_thunderbolt(struct pci_dev *dev)
 		dev->is_thunderbolt = 1;
 }
 
+static void set_pcie_cxl(struct pci_dev *dev)
+{
+	struct pci_dev *bridge;
+	u16 dvsec, cap;
+
+	if (!pci_is_pcie(dev))
+		return;
+
+	/*
+	 * Update parent's CXL state because alternate protocol training
+	 * may have changed
+	 */
+	bridge = pci_upstream_bridge(dev);
+	if (bridge)
+		set_pcie_cxl(bridge);
+
+	dvsec = pci_find_dvsec_capability(dev, PCI_VENDOR_ID_CXL,
+					  PCI_DVSEC_CXL_FLEXBUS_PORT);
+	if (!dvsec)
+		return;
+
+	pci_read_config_word(dev, dvsec + PCI_DVSEC_CXL_FLEXBUS_PORT_STATUS,
+			     &cap);
+
+	dev->is_cxl = FIELD_GET(PCI_DVSEC_CXL_FLEXBUS_PORT_STATUS_CACHE, cap) ||
+		FIELD_GET(PCI_DVSEC_CXL_FLEXBUS_PORT_STATUS_MEM, cap);
+
+}
+
 static void set_pcie_untrusted(struct pci_dev *dev)
 {
 	struct pci_dev *parent = pci_upstream_bridge(dev);
@@ -2065,6 +2094,8 @@ int pci_setup_device(struct pci_dev *dev)
 	/* Need to have dev->cfg_size ready */
 	set_pcie_thunderbolt(dev);
 
+	set_pcie_cxl(dev);
+
 	set_pcie_untrusted(dev);
 
 	if (pci_is_pcie(dev))
diff --git a/include/linux/pci.h b/include/linux/pci.h
index 864775651c6f..f8e8b3df794d 100644
--- a/include/linux/pci.h
+++ b/include/linux/pci.h
@@ -463,6 +463,7 @@ struct pci_dev {
 	unsigned int	is_pciehp:1;
 	unsigned int	shpc_managed:1;		/* SHPC owned by shpchp */
 	unsigned int	is_thunderbolt:1;	/* Thunderbolt controller */
+	unsigned int	is_cxl:1;               /* Compute Express Link (CXL) */
 	/*
 	 * Devices marked being untrusted are the ones that can potentially
 	 * execute DMA attacks and similar. They are typically connected
@@ -791,6 +792,11 @@ static inline bool pci_is_display(struct pci_dev *pdev)
 	return (pdev->class >> 16) == PCI_BASE_CLASS_DISPLAY;
 }
 
+static inline bool pcie_is_cxl(struct pci_dev *pci_dev)
+{
+	return pci_dev->is_cxl;
+}
+
 #define for_each_pci_bridge(dev, bus)				\
 	list_for_each_entry(dev, &bus->devices, bus_list)	\
 		if (!pci_is_bridge(dev)) {} else
diff --git a/include/uapi/linux/pci_regs.h b/include/uapi/linux/pci_regs.h
index 662582bdccf0..b6622fd60fd9 100644
--- a/include/uapi/linux/pci_regs.h
+++ b/include/uapi/linux/pci_regs.h
@@ -1379,6 +1379,12 @@
 /* CXL r4.0, 8.1.7: GPF DVSEC for CXL Device */
 #define PCI_DVSEC_CXL_DEVICE_GPF			5
 
+/* CXL r4.0, 8.1.8: Flex Bus DVSEC */
+#define PCI_DVSEC_CXL_FLEXBUS_PORT			7
+#define  PCI_DVSEC_CXL_FLEXBUS_PORT_STATUS		0xE
+#define   PCI_DVSEC_CXL_FLEXBUS_PORT_STATUS_CACHE	_BITUL(0)
+#define   PCI_DVSEC_CXL_FLEXBUS_PORT_STATUS_MEM		_BITUL(2)
+
 /* CXL r4.0, 8.1.9: Register Locator DVSEC */
 #define PCI_DVSEC_CXL_REG_LOCATOR			8
 #define  PCI_DVSEC_CXL_REG_LOCATOR_BLOCK1		0xC
-- 
cgit v1.2.3


From 5247c034a67f5a93cc1faa15e9867eec5b22f38a Mon Sep 17 00:00:00 2001
From: Pavel Begunkov <asml.silence@gmail.com>
Date: Tue, 20 Jan 2026 20:47:40 +0000
Subject: io_uring: introduce non-circular SQ

Outside of SQPOLL, normally SQ entries are consumed by the time the
submission syscall returns. For those cases we don't need a circular
buffer and the head/tail tracking, instead the kernel can assume that
entries always start from the beginning of the SQ at index 0. This patch
introduces a setup flag doing exactly that. It's simpler and helps
to keep SQEs hot in cache.

The feature is optional and enabled by setting IORING_SETUP_SQ_REWIND.
The flag is rejected if passed together with SQPOLL as it'd require
waiting for SQ before each submission. It also requires
IORING_SETUP_NO_SQARRAY; SQ array mode could be supported, but it's
unlikely there will be users, so leave more room for future optimisations.

Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/uapi/linux/io_uring.h | 12 ++++++++++++
 io_uring/io_uring.c           | 29 ++++++++++++++++++++++-------
 io_uring/io_uring.h           |  3 ++-
 3 files changed, 36 insertions(+), 8 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h
index b5b23c0d5283..475094c7a668 100644
--- a/include/uapi/linux/io_uring.h
+++ b/include/uapi/linux/io_uring.h
@@ -237,6 +237,18 @@ enum io_uring_sqe_flags_bit {
  */
 #define IORING_SETUP_SQE_MIXED		(1U << 19)
 
+/*
+ * When set, io_uring ignores SQ head and tail and fetches SQEs to submit
+ * starting from index 0 instead of from the index stored in the head pointer.
+ * IOW, the user should place all SQEs at the beginning of the SQ memory
+ * before issuing a submission syscall.
+ *
+ * It requires IORING_SETUP_NO_SQARRAY and is incompatible with
+ * IORING_SETUP_SQPOLL. The user must also never change the SQ head and tail
+ * values and keep them set to 0. Any other value is undefined behaviour.
+ */
+#define IORING_SETUP_SQ_REWIND		(1U << 20)
+
 enum io_uring_op {
 	IORING_OP_NOP,
 	IORING_OP_READV,
diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
index a50459238bee..0f88ec74e55d 100644
--- a/io_uring/io_uring.c
+++ b/io_uring/io_uring.c
@@ -1945,12 +1945,16 @@ static void io_commit_sqring(struct io_ring_ctx *ctx)
 {
 	struct io_rings *rings = ctx->rings;
 
-	/*
-	 * Ensure any loads from the SQEs are done at this point,
-	 * since once we write the new head, the application could
-	 * write new data to them.
-	 */
-	smp_store_release(&rings->sq.head, ctx->cached_sq_head);
+	if (ctx->flags & IORING_SETUP_SQ_REWIND) {
+		ctx->cached_sq_head = 0;
+	} else {
+		/*
+		 * Ensure any loads from the SQEs are done at this point,
+		 * since once we write the new head, the application could
+		 * write new data to them.
+		 */
+		smp_store_release(&rings->sq.head, ctx->cached_sq_head);
+	}
 }
 
 /*
@@ -1996,10 +2000,15 @@ static bool io_get_sqe(struct io_ring_ctx *ctx, const struct io_uring_sqe **sqe)
 int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr)
 	__must_hold(&ctx->uring_lock)
 {
-	unsigned int entries = io_sqring_entries(ctx);
+	unsigned int entries;
 	unsigned int left;
 	int ret;
 
+	if (ctx->flags & IORING_SETUP_SQ_REWIND)
+		entries = ctx->sq_entries;
+	else
+		entries = io_sqring_entries(ctx);
+
 	entries = min(nr, entries);
 	if (unlikely(!entries))
 		return 0;
@@ -2728,6 +2737,12 @@ static int io_uring_sanitise_params(struct io_uring_params *p)
 	if (flags & ~IORING_SETUP_FLAGS)
 		return -EINVAL;
 
+	if (flags & IORING_SETUP_SQ_REWIND) {
+		if ((flags & IORING_SETUP_SQPOLL) ||
+		    !(flags & IORING_SETUP_NO_SQARRAY))
+		return -EINVAL;
+	}
+
 	/* There is no way to mmap rings without a real fd */
 	if ((flags & IORING_SETUP_REGISTERED_FD_ONLY) &&
 	    !(flags & IORING_SETUP_NO_MMAP))
diff --git a/io_uring/io_uring.h b/io_uring/io_uring.h
index 29b8f90fdabf..acdc39b9f8d6 100644
--- a/io_uring/io_uring.h
+++ b/io_uring/io_uring.h
@@ -69,7 +69,8 @@ struct io_ctx_config {
 			IORING_SETUP_NO_SQARRAY |\
 			IORING_SETUP_HYBRID_IOPOLL |\
 			IORING_SETUP_CQE_MIXED |\
-			IORING_SETUP_SQE_MIXED)
+			IORING_SETUP_SQE_MIXED |\
+			IORING_SETUP_SQ_REWIND)
 
 #define IORING_ENTER_FLAGS (IORING_ENTER_GETEVENTS |\
 			IORING_ENTER_SQ_WAKEUP |\
-- 
cgit v1.2.3


From e86f89ab24f5ec595879a01eebb5df84f5ed6d2b Mon Sep 17 00:00:00 2001
From: Ming Lei <ming.lei@redhat.com>
Date: Fri, 16 Jan 2026 22:18:36 +0800
Subject: ublk: add new batch command UBLK_U_IO_PREP_IO_CMDS &
 UBLK_U_IO_COMMIT_IO_CMDS

Add new command UBLK_U_IO_PREP_IO_CMDS, which is the batch version of
UBLK_IO_FETCH_REQ.

Add new command UBLK_U_IO_COMMIT_IO_CMDS, which only commits io command
results; it is also a batch command.

The new command header type is `struct ublk_batch_io`.

This patch doesn't actually implement these commands yet, just validates the
SQE fields.

Reviewed-by: Caleb Sander Mateos <csander@purestorage.com>
Signed-off-by: Ming Lei <ming.lei@redhat.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 drivers/block/ublk_drv.c      | 87 ++++++++++++++++++++++++++++++++++++++++++-
 include/uapi/linux/ublk_cmd.h | 49 ++++++++++++++++++++++++
 2 files changed, 135 insertions(+), 1 deletion(-)

(limited to 'include/uapi/linux')

diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c
index 0f9fcd16258b..22c7296d90f3 100644
--- a/drivers/block/ublk_drv.c
+++ b/drivers/block/ublk_drv.c
@@ -91,6 +91,11 @@
 	 UBLK_PARAM_TYPE_DMA_ALIGN | UBLK_PARAM_TYPE_SEGMENT | \
 	 UBLK_PARAM_TYPE_INTEGRITY)
 
+#define UBLK_BATCH_F_ALL  \
+	(UBLK_BATCH_F_HAS_ZONE_LBA | \
+	 UBLK_BATCH_F_HAS_BUF_ADDR | \
+	 UBLK_BATCH_F_AUTO_BUF_REG_FALLBACK)
+
 struct ublk_uring_cmd_pdu {
 	/*
 	 * Store requests in same batch temporarily for queuing them to
@@ -114,6 +119,13 @@ struct ublk_uring_cmd_pdu {
 	u16 tag;
 };
 
+struct ublk_batch_io_data {
+	struct ublk_device *ub;
+	struct io_uring_cmd *cmd;
+	struct ublk_batch_io header;
+	unsigned int issue_flags;
+};
+
 /*
  * io command is active: sqe cmd is received, and its cqe isn't done
  *
@@ -2687,10 +2699,83 @@ static int ublk_ch_uring_cmd(struct io_uring_cmd *cmd, unsigned int issue_flags)
 	return ublk_ch_uring_cmd_local(cmd, issue_flags);
 }
 
+static int ublk_check_batch_cmd_flags(const struct ublk_batch_io *uc)
+{
+	unsigned elem_bytes = sizeof(struct ublk_elem_header);
+
+	if (uc->flags & ~UBLK_BATCH_F_ALL)
+		return -EINVAL;
+
+	/* UBLK_BATCH_F_AUTO_BUF_REG_FALLBACK requires buffer index */
+	if ((uc->flags & UBLK_BATCH_F_AUTO_BUF_REG_FALLBACK) &&
+			(uc->flags & UBLK_BATCH_F_HAS_BUF_ADDR))
+		return -EINVAL;
+
+	elem_bytes += (uc->flags & UBLK_BATCH_F_HAS_ZONE_LBA ? sizeof(u64) : 0) +
+		(uc->flags & UBLK_BATCH_F_HAS_BUF_ADDR ? sizeof(u64) : 0);
+	if (uc->elem_bytes != elem_bytes)
+		return -EINVAL;
+	return 0;
+}
+
+static int ublk_check_batch_cmd(const struct ublk_batch_io_data *data)
+{
+
+	const struct ublk_batch_io *uc = &data->header;
+
+	if (uc->nr_elem > data->ub->dev_info.queue_depth)
+		return -E2BIG;
+
+	if ((uc->flags & UBLK_BATCH_F_HAS_ZONE_LBA) &&
+			!ublk_dev_is_zoned(data->ub))
+		return -EINVAL;
+
+	if ((uc->flags & UBLK_BATCH_F_HAS_BUF_ADDR) &&
+			!ublk_dev_need_map_io(data->ub))
+		return -EINVAL;
+
+	if ((uc->flags & UBLK_BATCH_F_AUTO_BUF_REG_FALLBACK) &&
+			!ublk_dev_support_auto_buf_reg(data->ub))
+		return -EINVAL;
+
+	return ublk_check_batch_cmd_flags(uc);
+}
+
 static int ublk_ch_batch_io_uring_cmd(struct io_uring_cmd *cmd,
 				       unsigned int issue_flags)
 {
-	return -EOPNOTSUPP;
+	const struct ublk_batch_io *uc = io_uring_sqe_cmd(cmd->sqe);
+	struct ublk_device *ub = cmd->file->private_data;
+	struct ublk_batch_io_data data = {
+		.ub  = ub,
+		.cmd = cmd,
+		.header = (struct ublk_batch_io) {
+			.q_id = READ_ONCE(uc->q_id),
+			.flags = READ_ONCE(uc->flags),
+			.nr_elem = READ_ONCE(uc->nr_elem),
+			.elem_bytes = READ_ONCE(uc->elem_bytes),
+		},
+		.issue_flags = issue_flags,
+	};
+	u32 cmd_op = cmd->cmd_op;
+	int ret = -EINVAL;
+
+	if (data.header.q_id >= ub->dev_info.nr_hw_queues)
+		goto out;
+
+	switch (cmd_op) {
+	case UBLK_U_IO_PREP_IO_CMDS:
+	case UBLK_U_IO_COMMIT_IO_CMDS:
+		ret = ublk_check_batch_cmd(&data);
+		if (ret)
+			goto out;
+		ret = -EOPNOTSUPP;
+		break;
+	default:
+		ret = -EOPNOTSUPP;
+	}
+out:
+	return ret;
 }
 
 static inline bool ublk_check_ubuf_dir(const struct request *req,
diff --git a/include/uapi/linux/ublk_cmd.h b/include/uapi/linux/ublk_cmd.h
index 90f47da4f435..0cc58e19d401 100644
--- a/include/uapi/linux/ublk_cmd.h
+++ b/include/uapi/linux/ublk_cmd.h
@@ -103,6 +103,10 @@
 	_IOWR('u', 0x23, struct ublksrv_io_cmd)
 #define	UBLK_U_IO_UNREGISTER_IO_BUF	\
 	_IOWR('u', 0x24, struct ublksrv_io_cmd)
+#define	UBLK_U_IO_PREP_IO_CMDS	\
+	_IOWR('u', 0x25, struct ublk_batch_io)
+#define	UBLK_U_IO_COMMIT_IO_CMDS	\
+	_IOWR('u', 0x26, struct ublk_batch_io)
 
 /* only ABORT means that no re-fetch */
 #define UBLK_IO_RES_OK			0
@@ -544,6 +548,51 @@ struct ublksrv_io_cmd {
 	};
 };
 
+struct ublk_elem_header {
+	__u16 tag;	/* IO tag */
+
+	/*
+	 * Buffer index for incoming io command, only valid iff
+	 * UBLK_F_AUTO_BUF_REG is set
+	 */
+	__u16 buf_index;
+	__s32 result;	/* I/O completion result (commit only) */
+};
+
+/*
+ * uring_cmd buffer structure for batch commands
+ *
+ * The buffer holds multiple elements, whose count is specified by
+ * `nr_elem`. Each element buffer is organized in the following order:
+ *
+ * struct ublk_elem_buffer {
+ * 	// Mandatory fields (8 bytes)
+ * 	struct ublk_elem_header header;
+ *
+ * 	// Optional fields (8 bytes each, included based on flags)
+ *
+ * 	// Buffer address (if UBLK_BATCH_F_HAS_BUF_ADDR) for copying data
+ * 	// between ublk request and ublk server buffer
+ * 	__u64 buf_addr;
+ *
+ * 	// returned Zone append LBA (if UBLK_BATCH_F_HAS_ZONE_LBA)
+ * 	__u64 zone_lba;
+ * }
+ *
+ * Used for `UBLK_U_IO_PREP_IO_CMDS` and `UBLK_U_IO_COMMIT_IO_CMDS`
+ */
+struct ublk_batch_io {
+	__u16  q_id;
+#define UBLK_BATCH_F_HAS_ZONE_LBA	(1 << 0)
+#define UBLK_BATCH_F_HAS_BUF_ADDR 	(1 << 1)
+#define UBLK_BATCH_F_AUTO_BUF_REG_FALLBACK	(1 << 2)
+	__u16	flags;
+	__u16	nr_elem;
+	__u8	elem_bytes;
+	__u8	reserved;
+	__u64   reserved2;
+};
+
 struct ublk_param_basic {
 #define UBLK_ATTR_READ_ONLY            (1 << 0)
 #define UBLK_ATTR_ROTATIONAL           (1 << 1)
-- 
cgit v1.2.3


From b256795b3606e9a67c725dde8eaae91dd9d21de4 Mon Sep 17 00:00:00 2001
From: Ming Lei <ming.lei@redhat.com>
Date: Fri, 16 Jan 2026 22:18:37 +0800
Subject: ublk: handle UBLK_U_IO_PREP_IO_CMDS

This commit implements the handling of the UBLK_U_IO_PREP_IO_CMDS command,
which allows userspace to prepare a batch of I/O requests.

The core of this change is the `ublk_walk_cmd_buf` function, which iterates
over the elements in the uring_cmd fixed buffer. For each element, it parses
the I/O details, finds the corresponding `ublk_io` structure, and prepares it
for future dispatch.

Add a per-io lock to serialize concurrent delivery and committing.

Reviewed-by: Caleb Sander Mateos <csander@purestorage.com>
Signed-off-by: Ming Lei <ming.lei@redhat.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 drivers/block/ublk_drv.c      | 191 +++++++++++++++++++++++++++++++++++++++++-
 include/uapi/linux/ublk_cmd.h |   5 ++
 2 files changed, 195 insertions(+), 1 deletion(-)

(limited to 'include/uapi/linux')

diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c
index 22c7296d90f3..a3840b3f1081 100644
--- a/drivers/block/ublk_drv.c
+++ b/drivers/block/ublk_drv.c
@@ -208,6 +208,7 @@ struct ublk_io {
 	unsigned task_registered_buffers;
 
 	void *buf_ctx_handle;
+	spinlock_t lock;
 } ____cacheline_aligned_in_smp;
 
 struct ublk_queue {
@@ -280,6 +281,16 @@ static inline bool ublk_dev_support_batch_io(const struct ublk_device *ub)
 	return false;
 }
 
+static inline void ublk_io_lock(struct ublk_io *io)
+{
+	spin_lock(&io->lock);
+}
+
+static inline void ublk_io_unlock(struct ublk_io *io)
+{
+	spin_unlock(&io->lock);
+}
+
 static inline struct ublksrv_io_desc *
 ublk_get_iod(const struct ublk_queue *ubq, unsigned tag)
 {
@@ -2699,6 +2710,171 @@ static int ublk_ch_uring_cmd(struct io_uring_cmd *cmd, unsigned int issue_flags)
 	return ublk_ch_uring_cmd_local(cmd, issue_flags);
 }
 
+static inline __u64 ublk_batch_buf_addr(const struct ublk_batch_io *uc,
+					const struct ublk_elem_header *elem)
+{
+	const void *buf = elem;
+
+	if (uc->flags & UBLK_BATCH_F_HAS_BUF_ADDR)
+		return *(const __u64 *)(buf + sizeof(*elem));
+	return 0;
+}
+
+static struct ublk_auto_buf_reg
+ublk_batch_auto_buf_reg(const struct ublk_batch_io *uc,
+			const struct ublk_elem_header *elem)
+{
+	struct ublk_auto_buf_reg reg = {
+		.index = elem->buf_index,
+		.flags = (uc->flags & UBLK_BATCH_F_AUTO_BUF_REG_FALLBACK) ?
+			UBLK_AUTO_BUF_REG_FALLBACK : 0,
+	};
+
+	return reg;
+}
+
+/*
+ * 48 can hold any type of buffer element(8, 16 and 24 bytes) because
+ * it is the least common multiple(LCM) of 8, 16 and 24
+ */
+#define UBLK_CMD_BATCH_TMP_BUF_SZ  (48 * 10)
+struct ublk_batch_io_iter {
+	void __user *uaddr;
+	unsigned done, total;
+	unsigned char elem_bytes;
+	/* copy to this buffer from user space */
+	unsigned char buf[UBLK_CMD_BATCH_TMP_BUF_SZ];
+};
+
+static inline int
+__ublk_walk_cmd_buf(struct ublk_queue *ubq,
+		    struct ublk_batch_io_iter *iter,
+		    const struct ublk_batch_io_data *data,
+		    unsigned bytes,
+		    int (*cb)(struct ublk_queue *q,
+			    const struct ublk_batch_io_data *data,
+			    const struct ublk_elem_header *elem))
+{
+	unsigned int i;
+	int ret = 0;
+
+	for (i = 0; i < bytes; i += iter->elem_bytes) {
+		const struct ublk_elem_header *elem =
+			(const struct ublk_elem_header *)&iter->buf[i];
+
+		if (unlikely(elem->tag >= data->ub->dev_info.queue_depth)) {
+			ret = -EINVAL;
+			break;
+		}
+
+		ret = cb(ubq, data, elem);
+		if (unlikely(ret))
+			break;
+	}
+
+	iter->done += i;
+	return ret;
+}
+
+static int ublk_walk_cmd_buf(struct ublk_batch_io_iter *iter,
+			     const struct ublk_batch_io_data *data,
+			     int (*cb)(struct ublk_queue *q,
+				     const struct ublk_batch_io_data *data,
+				     const struct ublk_elem_header *elem))
+{
+	struct ublk_queue *ubq = ublk_get_queue(data->ub, data->header.q_id);
+	int ret = 0;
+
+	while (iter->done < iter->total) {
+		unsigned int len = min(sizeof(iter->buf), iter->total - iter->done);
+
+		if (copy_from_user(iter->buf, iter->uaddr + iter->done, len)) {
+			pr_warn("ublk%d: read batch cmd buffer failed\n",
+					data->ub->dev_info.dev_id);
+			return -EFAULT;
+		}
+
+		ret = __ublk_walk_cmd_buf(ubq, iter, data, len, cb);
+		if (ret)
+			return ret;
+	}
+	return 0;
+}
+
+static int ublk_batch_unprep_io(struct ublk_queue *ubq,
+				const struct ublk_batch_io_data *data,
+				const struct ublk_elem_header *elem)
+{
+	struct ublk_io *io = &ubq->ios[elem->tag];
+
+	data->ub->nr_io_ready--;
+	ublk_io_lock(io);
+	io->flags = 0;
+	ublk_io_unlock(io);
+	return 0;
+}
+
+static void ublk_batch_revert_prep_cmd(struct ublk_batch_io_iter *iter,
+				       const struct ublk_batch_io_data *data)
+{
+	int ret;
+
+	/* Re-process only what we've already processed, starting from beginning */
+	iter->total = iter->done;
+	iter->done = 0;
+
+	ret = ublk_walk_cmd_buf(iter, data, ublk_batch_unprep_io);
+	WARN_ON_ONCE(ret);
+}
+
+static int ublk_batch_prep_io(struct ublk_queue *ubq,
+			      const struct ublk_batch_io_data *data,
+			      const struct ublk_elem_header *elem)
+{
+	struct ublk_io *io = &ubq->ios[elem->tag];
+	const struct ublk_batch_io *uc = &data->header;
+	union ublk_io_buf buf = { 0 };
+	int ret;
+
+	if (ublk_dev_support_auto_buf_reg(data->ub))
+		buf.auto_reg = ublk_batch_auto_buf_reg(uc, elem);
+	else if (ublk_dev_need_map_io(data->ub)) {
+		buf.addr = ublk_batch_buf_addr(uc, elem);
+
+		ret = ublk_check_fetch_buf(data->ub, buf.addr);
+		if (ret)
+			return ret;
+	}
+
+	ublk_io_lock(io);
+	ret = __ublk_fetch(data->cmd, data->ub, io);
+	if (!ret)
+		io->buf = buf;
+	ublk_io_unlock(io);
+
+	return ret;
+}
+
+static int ublk_handle_batch_prep_cmd(const struct ublk_batch_io_data *data)
+{
+	const struct ublk_batch_io *uc = &data->header;
+	struct io_uring_cmd *cmd = data->cmd;
+	struct ublk_batch_io_iter iter = {
+		.uaddr = u64_to_user_ptr(READ_ONCE(cmd->sqe->addr)),
+		.total = uc->nr_elem * uc->elem_bytes,
+		.elem_bytes = uc->elem_bytes,
+	};
+	int ret;
+
+	mutex_lock(&data->ub->mutex);
+	ret = ublk_walk_cmd_buf(&iter, data, ublk_batch_prep_io);
+
+	if (ret && iter.done)
+		ublk_batch_revert_prep_cmd(&iter, data);
+	mutex_unlock(&data->ub->mutex);
+	return ret;
+}
+
 static int ublk_check_batch_cmd_flags(const struct ublk_batch_io *uc)
 {
 	unsigned elem_bytes = sizeof(struct ublk_elem_header);
@@ -2765,6 +2941,11 @@ static int ublk_ch_batch_io_uring_cmd(struct io_uring_cmd *cmd,
 
 	switch (cmd_op) {
 	case UBLK_U_IO_PREP_IO_CMDS:
+		ret = ublk_check_batch_cmd(&data);
+		if (ret)
+			goto out;
+		ret = ublk_handle_batch_prep_cmd(&data);
+		break;
 	case UBLK_U_IO_COMMIT_IO_CMDS:
 		ret = ublk_check_batch_cmd(&data);
 		if (ret)
@@ -2952,7 +3133,7 @@ static int ublk_init_queue(struct ublk_device *ub, int q_id)
 	struct ublk_queue *ubq;
 	struct page *page;
 	int numa_node;
-	int size;
+	int size, i;
 
 	/* Determine NUMA node based on queue's CPU affinity */
 	numa_node = ublk_get_queue_numa_node(ub, q_id);
@@ -2977,6 +3158,9 @@ static int ublk_init_queue(struct ublk_device *ub, int q_id)
 	}
 	ubq->io_cmd_buf = page_address(page);
 
+	for (i = 0; i < ubq->q_depth; i++)
+		spin_lock_init(&ubq->ios[i].lock);
+
 	ub->queues[q_id] = ubq;
 	ubq->dev = ub;
 	return 0;
@@ -3220,6 +3404,11 @@ static int ublk_ctrl_start_dev(struct ublk_device *ub,
 		return -EINVAL;
 
 	mutex_lock(&ub->mutex);
+	/* device may become not ready in case of F_BATCH */
+	if (!ublk_dev_ready(ub)) {
+		ret = -EINVAL;
+		goto out_unlock;
+	}
 	if (ub->dev_info.state == UBLK_S_DEV_LIVE ||
 	    test_bit(UB_STATE_USED, &ub->state)) {
 		ret = -EEXIST;
diff --git a/include/uapi/linux/ublk_cmd.h b/include/uapi/linux/ublk_cmd.h
index 0cc58e19d401..1a3d4d33c1d1 100644
--- a/include/uapi/linux/ublk_cmd.h
+++ b/include/uapi/linux/ublk_cmd.h
@@ -103,6 +103,11 @@
 	_IOWR('u', 0x23, struct ublksrv_io_cmd)
 #define	UBLK_U_IO_UNREGISTER_IO_BUF	\
 	_IOWR('u', 0x24, struct ublksrv_io_cmd)
+
+/*
+ * Return 0 if the command is run successfully, otherwise a failure code
+ * is returned
+ */
 #define	UBLK_U_IO_PREP_IO_CMDS	\
 	_IOWR('u', 0x25, struct ublk_batch_io)
 #define	UBLK_U_IO_COMMIT_IO_CMDS	\
-- 
cgit v1.2.3


From 1e500e106d5a82280db59dba06f0108085beba65 Mon Sep 17 00:00:00 2001
From: Ming Lei <ming.lei@redhat.com>
Date: Fri, 16 Jan 2026 22:18:38 +0800
Subject: ublk: handle UBLK_U_IO_COMMIT_IO_CMDS

Handle UBLK_U_IO_COMMIT_IO_CMDS by walking the uring_cmd fixed buffer:

- read each element into one temp buffer in batch style

- parse and apply each element for committing io result

Reviewed-by: Caleb Sander Mateos <csander@purestorage.com>
Signed-off-by: Ming Lei <ming.lei@redhat.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 drivers/block/ublk_drv.c      | 103 +++++++++++++++++++++++++++++++++++++++++-
 include/uapi/linux/ublk_cmd.h |   8 ++++
 2 files changed, 109 insertions(+), 2 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c
index a3840b3f1081..162b46c74f16 100644
--- a/drivers/block/ublk_drv.c
+++ b/drivers/block/ublk_drv.c
@@ -2267,7 +2267,7 @@ static inline int ublk_set_auto_buf_reg(struct ublk_io *io, struct io_uring_cmd
 	return 0;
 }
 
-static int ublk_handle_auto_buf_reg(struct ublk_io *io,
+static void ublk_clear_auto_buf_reg(struct ublk_io *io,
 				    struct io_uring_cmd *cmd,
 				    u16 *buf_idx)
 {
@@ -2287,7 +2287,13 @@ static int ublk_handle_auto_buf_reg(struct ublk_io *io,
 		if (io->buf_ctx_handle == io_uring_cmd_ctx_handle(cmd))
 			*buf_idx = io->buf.auto_reg.index;
 	}
+}
 
+static int ublk_handle_auto_buf_reg(struct ublk_io *io,
+				    struct io_uring_cmd *cmd,
+				    u16 *buf_idx)
+{
+	ublk_clear_auto_buf_reg(io, cmd, buf_idx);
 	return ublk_set_auto_buf_reg(io, cmd);
 }
 
@@ -2720,6 +2726,17 @@ static inline __u64 ublk_batch_buf_addr(const struct ublk_batch_io *uc,
 	return 0;
 }
 
+static inline __u64 ublk_batch_zone_lba(const struct ublk_batch_io *uc,
+					const struct ublk_elem_header *elem)
+{
+	const void *buf = elem;
+
+	if (uc->flags & UBLK_BATCH_F_HAS_ZONE_LBA)
+		return *(const __u64 *)(buf + sizeof(*elem) +
+				8 * !!(uc->flags & UBLK_BATCH_F_HAS_BUF_ADDR));
+	return -1;
+}
+
 static struct ublk_auto_buf_reg
 ublk_batch_auto_buf_reg(const struct ublk_batch_io *uc,
 			const struct ublk_elem_header *elem)
@@ -2875,6 +2892,84 @@ static int ublk_handle_batch_prep_cmd(const struct ublk_batch_io_data *data)
 	return ret;
 }
 
+static int ublk_batch_commit_io_check(const struct ublk_queue *ubq,
+				      struct ublk_io *io,
+				      union ublk_io_buf *buf)
+{
+	if (!(io->flags & UBLK_IO_FLAG_OWNED_BY_SRV))
+		return -EBUSY;
+
+	/* BATCH_IO doesn't support UBLK_F_NEED_GET_DATA */
+	if (ublk_need_map_io(ubq) && !buf->addr)
+		return -EINVAL;
+	return 0;
+}
+
+static int ublk_batch_commit_io(struct ublk_queue *ubq,
+				const struct ublk_batch_io_data *data,
+				const struct ublk_elem_header *elem)
+{
+	struct ublk_io *io = &ubq->ios[elem->tag];
+	const struct ublk_batch_io *uc = &data->header;
+	u16 buf_idx = UBLK_INVALID_BUF_IDX;
+	union ublk_io_buf buf = { 0 };
+	struct request *req = NULL;
+	bool auto_reg = false;
+	bool compl = false;
+	int ret;
+
+	if (ublk_dev_support_auto_buf_reg(data->ub)) {
+		buf.auto_reg = ublk_batch_auto_buf_reg(uc, elem);
+		auto_reg = true;
+	} else if (ublk_dev_need_map_io(data->ub))
+		buf.addr = ublk_batch_buf_addr(uc, elem);
+
+	ublk_io_lock(io);
+	ret = ublk_batch_commit_io_check(ubq, io, &buf);
+	if (!ret) {
+		io->res = elem->result;
+		io->buf = buf;
+		req = ublk_fill_io_cmd(io, data->cmd);
+
+		if (auto_reg)
+			ublk_clear_auto_buf_reg(io, data->cmd, &buf_idx);
+		compl = ublk_need_complete_req(data->ub, io);
+	}
+	ublk_io_unlock(io);
+
+	if (unlikely(ret)) {
+		pr_warn_ratelimited("%s: dev %u queue %u io %u: commit failure %d\n",
+			__func__, data->ub->dev_info.dev_id, ubq->q_id,
+			elem->tag, ret);
+		return ret;
+	}
+
+	/* can't touch 'ublk_io' any more */
+	if (buf_idx != UBLK_INVALID_BUF_IDX)
+		io_buffer_unregister_bvec(data->cmd, buf_idx, data->issue_flags);
+	if (req_op(req) == REQ_OP_ZONE_APPEND)
+		req->__sector = ublk_batch_zone_lba(uc, elem);
+	if (compl)
+		__ublk_complete_rq(req, io, ublk_dev_need_map_io(data->ub));
+	return 0;
+}
+
+static int ublk_handle_batch_commit_cmd(const struct ublk_batch_io_data *data)
+{
+	const struct ublk_batch_io *uc = &data->header;
+	struct io_uring_cmd *cmd = data->cmd;
+	struct ublk_batch_io_iter iter = {
+		.uaddr = u64_to_user_ptr(READ_ONCE(cmd->sqe->addr)),
+		.total = uc->nr_elem * uc->elem_bytes,
+		.elem_bytes = uc->elem_bytes,
+	};
+	int ret;
+
+	ret = ublk_walk_cmd_buf(&iter, data, ublk_batch_commit_io);
+
+	return iter.done == 0 ? ret : iter.done;
+}
+
 static int ublk_check_batch_cmd_flags(const struct ublk_batch_io *uc)
 {
 	unsigned elem_bytes = sizeof(struct ublk_elem_header);
@@ -2950,7 +3045,7 @@ static int ublk_ch_batch_io_uring_cmd(struct io_uring_cmd *cmd,
 		ret = ublk_check_batch_cmd(&data);
 		if (ret)
 			goto out;
-		ret = -EOPNOTSUPP;
+		ret = ublk_handle_batch_commit_cmd(&data);
 		break;
 	default:
 		ret = -EOPNOTSUPP;
@@ -3659,6 +3754,10 @@ static int ublk_ctrl_add_dev(const struct ublksrv_ctrl_cmd *header)
 				UBLK_F_AUTO_BUF_REG))
 		ub->dev_info.flags &= ~UBLK_F_NEED_GET_DATA;
 
+	/* UBLK_F_BATCH_IO doesn't support GET_DATA */
+	if (ublk_dev_support_batch_io(ub))
+		ub->dev_info.flags &= ~UBLK_F_NEED_GET_DATA;
+
 	/*
 	 * Zoned storage support requires reuse `ublksrv_io_cmd->addr` for
 	 * returning write_append_lba, which is only allowed in case of
diff --git a/include/uapi/linux/ublk_cmd.h b/include/uapi/linux/ublk_cmd.h
index 1a3d4d33c1d1..3894d676dd02 100644
--- a/include/uapi/linux/ublk_cmd.h
+++ b/include/uapi/linux/ublk_cmd.h
@@ -110,6 +110,14 @@
  */
 #define	UBLK_U_IO_PREP_IO_CMDS	\
 	_IOWR('u', 0x25, struct ublk_batch_io)
+/*
+ * If a failure code is returned, nothing in the command buffer is handled.
+ * Otherwise, the returned value is the number of bytes in the command buffer
+ * that were actually handled; the number of handled IOs can be calculated
+ * from `elem_bytes` per IO. IOs in the remaining bytes are not committed, so
+ * userspace has to check the return value to deal with partial commits
+ * correctly.
+ */
 #define	UBLK_U_IO_COMMIT_IO_CMDS	\
 	_IOWR('u', 0x26, struct ublk_batch_io)
 
-- 
cgit v1.2.3


From a4d88375539920b7401ead59d2f944ac23c668ea Mon Sep 17 00:00:00 2001
From: Ming Lei <ming.lei@redhat.com>
Date: Fri, 16 Jan 2026 22:18:41 +0800
Subject: ublk: add UBLK_U_IO_FETCH_IO_CMDS for batch I/O processing

Add UBLK_U_IO_FETCH_IO_CMDS command to enable efficient batch processing
of I/O requests. This multishot uring_cmd allows the ublk server to fetch
multiple I/O commands in a single operation, significantly reducing
submission overhead compared to individual FETCH_REQ* commands.

Key Design Features:

1. Multishot Operation: One UBLK_U_IO_FETCH_IO_CMDS can fetch many I/O
   commands, with the batch size limited by the provided buffer length.

2. Dynamic Load Balancing: Multiple fetch commands can be submitted
   simultaneously, but only one is active at any time. This enables
   efficient load distribution across multiple server task contexts.

3. Implicit State Management: The implementation uses three key variables
   to track state:
   - evts_fifo: Queue of request tags awaiting processing
   - fcmd_head: List of available fetch commands
   - active_fcmd: Currently active fetch command (NULL = none active)

   States are derived implicitly:
   - IDLE: No fetch commands available
   - READY: Fetch commands available, none active
   - ACTIVE: One fetch command processing events

4. Lockless Reader Optimization: The active fetch command can read from
   evts_fifo without locking (single reader guarantee), while writers
   (ublk_queue_rq/ublk_queue_rqs) use evts_lock protection. The memory
   barrier pairing plays a key role in the single lockless reader
   optimization.

Implementation Details:

- ublk_queue_rq() and ublk_queue_rqs() save request tags to evts_fifo
- __ublk_acquire_fcmd() selects an available fetch command when
  events arrive and no command is currently active
- ublk_batch_dispatch() moves tags from evts_fifo to the fetch command's
  buffer and posts completion via io_uring_mshot_cmd_post_cqe()
- State transitions are coordinated via evts_lock to maintain consistency

Reviewed-by: Caleb Sander Mateos <csander@purestorage.com>
Signed-off-by: Ming Lei <ming.lei@redhat.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 drivers/block/ublk_drv.c      | 394 +++++++++++++++++++++++++++++++++++++++++-
 include/uapi/linux/ublk_cmd.h |   7 +
 2 files changed, 393 insertions(+), 8 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c
index 1b5721c7a536..0a0210f9d417 100644
--- a/drivers/block/ublk_drv.c
+++ b/drivers/block/ublk_drv.c
@@ -99,6 +99,7 @@
 
 /* ublk batch fetch uring_cmd */
 struct ublk_batch_fetch_cmd {
+	struct list_head node;
 	struct io_uring_cmd *cmd;
 	unsigned short buf_group;
 };
@@ -123,7 +124,10 @@ struct ublk_uring_cmd_pdu {
 	 */
 	struct ublk_queue *ubq;
 
-	u16 tag;
+	union {
+		u16 tag;
+		struct ublk_batch_fetch_cmd *fcmd; /* batch io only */
+	};
 };
 
 struct ublk_batch_io_data {
@@ -245,10 +249,37 @@ struct ublk_queue {
 	 * Make sure just one reader for fetching request from task work
 	 * function to ublk server, so no need to grab the lock in reader
 	 * side.
+	 *
+	 * Batch I/O State Management:
+	 *
+	 * The batch I/O system uses implicit state management based on the
+	 * combination of three key variables below.
+	 *
+	 * - IDLE: list_empty(&fcmd_head) && !active_fcmd
+	 *   No fetch commands available, events queue in evts_fifo
+	 *
+	 * - READY: !list_empty(&fcmd_head) && !active_fcmd
+	 *   Fetch commands available but none processing events
+	 *
+	 * - ACTIVE: active_fcmd
+	 *   One fetch command actively processing events from evts_fifo
+	 *
+	 * Key Invariants:
+	 * - At most one active_fcmd at any time (single reader)
+	 * - active_fcmd is always from fcmd_head list when non-NULL
+	 * - evts_fifo can be read locklessly by the single active reader
+	 * - All state transitions require evts_lock protection
+	 * - Multiple writers to evts_fifo require lock protection
 	 */
 	struct {
 		DECLARE_KFIFO_PTR(evts_fifo, unsigned short);
 		spinlock_t evts_lock;
+
+		/* List of fetch commands available to process events */
+		struct list_head fcmd_head;
+
+		/* Currently active fetch command (NULL = none active) */
+		struct ublk_batch_fetch_cmd  *active_fcmd;
 	}____cacheline_aligned_in_smp;
 
 	struct ublk_io ios[] __counted_by(q_depth);
@@ -303,12 +334,20 @@ static void ublk_abort_queue(struct ublk_device *ub, struct ublk_queue *ubq);
 static inline struct request *__ublk_check_and_get_req(struct ublk_device *ub,
 		u16 q_id, u16 tag, struct ublk_io *io);
 static inline unsigned int ublk_req_build_flags(struct request *req);
+static void ublk_batch_dispatch(struct ublk_queue *ubq,
+				const struct ublk_batch_io_data *data,
+				struct ublk_batch_fetch_cmd *fcmd);
 
 static inline bool ublk_dev_support_batch_io(const struct ublk_device *ub)
 {
 	return false;
 }
 
+static inline bool ublk_support_batch_io(const struct ublk_queue *ubq)
+{
+	return false;
+}
+
 static inline void ublk_io_lock(struct ublk_io *io)
 {
 	spin_lock(&io->lock);
@@ -664,13 +703,45 @@ static wait_queue_head_t ublk_idr_wq;	/* wait until one idr is freed */
 
 static DEFINE_MUTEX(ublk_ctl_mutex);
 
+static struct ublk_batch_fetch_cmd *
+ublk_batch_alloc_fcmd(struct io_uring_cmd *cmd)
+{
+	struct ublk_batch_fetch_cmd *fcmd = kzalloc(sizeof(*fcmd), GFP_NOIO);
+
+	if (fcmd) {
+		fcmd->cmd = cmd;
+		fcmd->buf_group = READ_ONCE(cmd->sqe->buf_index);
+	}
+	return fcmd;
+}
+
+static void ublk_batch_free_fcmd(struct ublk_batch_fetch_cmd *fcmd)
+{
+	kfree(fcmd);
+}
+
+static void __ublk_release_fcmd(struct ublk_queue *ubq)
+{
+	WRITE_ONCE(ubq->active_fcmd, NULL);
+}
 
-static void ublk_batch_deinit_fetch_buf(const struct ublk_batch_io_data *data,
+/*
+ * Nothing can move on, so clear ->active_fcmd, and the caller should stop
+ * dispatching
+ */
+static void ublk_batch_deinit_fetch_buf(struct ublk_queue *ubq,
+					const struct ublk_batch_io_data *data,
 					struct ublk_batch_fetch_cmd *fcmd,
 					int res)
 {
+	spin_lock(&ubq->evts_lock);
+	list_del(&fcmd->node);
+	WARN_ON_ONCE(fcmd != ubq->active_fcmd);
+	__ublk_release_fcmd(ubq);
+	spin_unlock(&ubq->evts_lock);
+
 	io_uring_cmd_done(fcmd->cmd, res, data->issue_flags);
-	fcmd->cmd = NULL;
+	ublk_batch_free_fcmd(fcmd);
 }
 
 static int ublk_batch_fetch_post_cqe(struct ublk_batch_fetch_cmd *fcmd,
@@ -1637,6 +1708,8 @@ static int __ublk_batch_dispatch(struct ublk_queue *ubq,
 	bool needs_filter;
 	int ret;
 
+	WARN_ON_ONCE(data->cmd != fcmd->cmd);
+
 	sel = io_uring_cmd_buffer_select(fcmd->cmd, fcmd->buf_group, &len,
 					 data->issue_flags);
 	if (sel.val < 0)
@@ -1700,21 +1773,93 @@ static int __ublk_batch_dispatch(struct ublk_queue *ubq,
 	return ret;
 }
 
-static __maybe_unused void
+static struct ublk_batch_fetch_cmd *__ublk_acquire_fcmd(
+		struct ublk_queue *ubq)
+{
+	struct ublk_batch_fetch_cmd *fcmd;
+
+	lockdep_assert_held(&ubq->evts_lock);
+
+	/*
+	 * Order updating ubq->evts_fifo and checking ubq->active_fcmd.
+	 *
+	 * The pair is the smp_mb() in ublk_batch_dispatch().
+	 *
+	 * If ubq->active_fcmd is observed as non-NULL, the newly added tags
+	 * are visible in ublk_batch_dispatch() thanks to the barrier pairing.
+	 */
+	smp_mb();
+	if (READ_ONCE(ubq->active_fcmd)) {
+		fcmd = NULL;
+	} else {
+		fcmd = list_first_entry_or_null(&ubq->fcmd_head,
+				struct ublk_batch_fetch_cmd, node);
+		WRITE_ONCE(ubq->active_fcmd, fcmd);
+	}
+	return fcmd;
+}
+
+static void ublk_batch_tw_cb(struct io_tw_req tw_req, io_tw_token_t tw)
+{
+	unsigned int issue_flags = IO_URING_CMD_TASK_WORK_ISSUE_FLAGS;
+	struct io_uring_cmd *cmd = io_uring_cmd_from_tw(tw_req);
+	struct ublk_uring_cmd_pdu *pdu = ublk_get_uring_cmd_pdu(cmd);
+	struct ublk_batch_fetch_cmd *fcmd = pdu->fcmd;
+	struct ublk_batch_io_data data = {
+		.ub = pdu->ubq->dev,
+		.cmd = fcmd->cmd,
+		.issue_flags = issue_flags,
+	};
+
+	WARN_ON_ONCE(pdu->ubq->active_fcmd != fcmd);
+
+	ublk_batch_dispatch(pdu->ubq, &data, fcmd);
+}
+
+static void
 ublk_batch_dispatch(struct ublk_queue *ubq,
 		    const struct ublk_batch_io_data *data,
 		    struct ublk_batch_fetch_cmd *fcmd)
 {
+	struct ublk_batch_fetch_cmd *new_fcmd;
+	unsigned tried = 0;
 	int ret = 0;
 
+again:
 	while (!ublk_io_evts_empty(ubq)) {
 		ret = __ublk_batch_dispatch(ubq, data, fcmd);
 		if (ret <= 0)
 			break;
 	}
 
-	if (ret < 0)
-		ublk_batch_deinit_fetch_buf(data, fcmd, ret);
+	if (ret < 0) {
+		ublk_batch_deinit_fetch_buf(ubq, data, fcmd, ret);
+		return;
+	}
+
+	__ublk_release_fcmd(ubq);
+	/*
+	 * Order clearing ubq->active_fcmd from __ublk_release_fcmd() and
+	 * checking ubq->evts_fifo.
+	 *
+	 * The pair is the smp_mb() in __ublk_acquire_fcmd().
+	 */
+	smp_mb();
+	if (likely(ublk_io_evts_empty(ubq)))
+		return;
+
+	spin_lock(&ubq->evts_lock);
+	new_fcmd = __ublk_acquire_fcmd(ubq);
+	spin_unlock(&ubq->evts_lock);
+
+	if (!new_fcmd)
+		return;
+
+	/* Avoid lockup by handling at most 32 batches before rescheduling */
+	if (new_fcmd == fcmd && tried++ < 32)
+		goto again;
+
+	io_uring_cmd_complete_in_task(new_fcmd->cmd, ublk_batch_tw_cb);
 }
 
 static void ublk_cmd_tw_cb(struct io_tw_req tw_req, io_tw_token_t tw)
@@ -1726,6 +1871,21 @@ static void ublk_cmd_tw_cb(struct io_tw_req tw_req, io_tw_token_t tw)
 	ublk_dispatch_req(ubq, pdu->req);
 }
 
+static void ublk_batch_queue_cmd(struct ublk_queue *ubq, struct request *rq, bool last)
+{
+	unsigned short tag = rq->tag;
+	struct ublk_batch_fetch_cmd *fcmd = NULL;
+
+	spin_lock(&ubq->evts_lock);
+	kfifo_put(&ubq->evts_fifo, tag);
+	if (last)
+		fcmd = __ublk_acquire_fcmd(ubq);
+	spin_unlock(&ubq->evts_lock);
+
+	if (fcmd)
+		io_uring_cmd_complete_in_task(fcmd->cmd, ublk_batch_tw_cb);
+}
+
 static void ublk_queue_cmd(struct ublk_queue *ubq, struct request *rq)
 {
 	struct io_uring_cmd *cmd = ubq->ios[rq->tag].cmd;
@@ -1836,7 +1996,10 @@ static blk_status_t ublk_queue_rq(struct blk_mq_hw_ctx *hctx,
 		return BLK_STS_OK;
 	}
 
-	ublk_queue_cmd(ubq, rq);
+	if (ublk_support_batch_io(ubq))
+		ublk_batch_queue_cmd(ubq, rq, bd->last);
+	else
+		ublk_queue_cmd(ubq, rq);
 	return BLK_STS_OK;
 }
 
@@ -1848,6 +2011,19 @@ static inline bool ublk_belong_to_same_batch(const struct ublk_io *io,
 		(io->task == io2->task);
 }
 
+static void ublk_commit_rqs(struct blk_mq_hw_ctx *hctx)
+{
+	struct ublk_queue *ubq = hctx->driver_data;
+	struct ublk_batch_fetch_cmd *fcmd;
+
+	spin_lock(&ubq->evts_lock);
+	fcmd = __ublk_acquire_fcmd(ubq);
+	spin_unlock(&ubq->evts_lock);
+
+	if (fcmd)
+		io_uring_cmd_complete_in_task(fcmd->cmd, ublk_batch_tw_cb);
+}
+
 static void ublk_queue_rqs(struct rq_list *rqlist)
 {
 	struct rq_list requeue_list = { };
@@ -1876,6 +2052,57 @@ static void ublk_queue_rqs(struct rq_list *rqlist)
 	*rqlist = requeue_list;
 }
 
+static void ublk_batch_queue_cmd_list(struct ublk_queue *ubq, struct rq_list *l)
+{
+	unsigned short tags[MAX_NR_TAG];
+	struct ublk_batch_fetch_cmd *fcmd;
+	struct request *rq;
+	unsigned cnt = 0;
+
+	spin_lock(&ubq->evts_lock);
+	rq_list_for_each(l, rq) {
+		tags[cnt++] = (unsigned short)rq->tag;
+		if (cnt >= MAX_NR_TAG) {
+			kfifo_in(&ubq->evts_fifo, tags, cnt);
+			cnt = 0;
+		}
+	}
+	if (cnt)
+		kfifo_in(&ubq->evts_fifo, tags, cnt);
+	fcmd = __ublk_acquire_fcmd(ubq);
+	spin_unlock(&ubq->evts_lock);
+
+	rq_list_init(l);
+	if (fcmd)
+		io_uring_cmd_complete_in_task(fcmd->cmd, ublk_batch_tw_cb);
+}
+
+static void ublk_batch_queue_rqs(struct rq_list *rqlist)
+{
+	struct rq_list requeue_list = { };
+	struct rq_list submit_list = { };
+	struct ublk_queue *ubq = NULL;
+	struct request *req;
+
+	while ((req = rq_list_pop(rqlist))) {
+		struct ublk_queue *this_q = req->mq_hctx->driver_data;
+
+		if (ublk_prep_req(this_q, req, true) != BLK_STS_OK) {
+			rq_list_add_tail(&requeue_list, req);
+			continue;
+		}
+
+		if (ubq && this_q != ubq && !rq_list_empty(&submit_list))
+			ublk_batch_queue_cmd_list(ubq, &submit_list);
+		ubq = this_q;
+		rq_list_add_tail(&submit_list, req);
+	}
+
+	if (!rq_list_empty(&submit_list))
+		ublk_batch_queue_cmd_list(ubq, &submit_list);
+	*rqlist = requeue_list;
+}
+
 static int ublk_init_hctx(struct blk_mq_hw_ctx *hctx, void *driver_data,
 		unsigned int hctx_idx)
 {
@@ -1893,6 +2120,14 @@ static const struct blk_mq_ops ublk_mq_ops = {
 	.timeout	= ublk_timeout,
 };
 
+static const struct blk_mq_ops ublk_batch_mq_ops = {
+	.commit_rqs	= ublk_commit_rqs,
+	.queue_rq       = ublk_queue_rq,
+	.queue_rqs      = ublk_batch_queue_rqs,
+	.init_hctx	= ublk_init_hctx,
+	.timeout	= ublk_timeout,
+};
+
 static void ublk_queue_reinit(struct ublk_device *ub, struct ublk_queue *ubq)
 {
 	int i;
@@ -2290,6 +2525,56 @@ static void ublk_cancel_cmd(struct ublk_queue *ubq, unsigned tag,
 		io_uring_cmd_done(io->cmd, UBLK_IO_RES_ABORT, issue_flags);
 }
 
+static void ublk_batch_cancel_cmd(struct ublk_queue *ubq,
+				  struct ublk_batch_fetch_cmd *fcmd,
+				  unsigned int issue_flags)
+{
+	bool done;
+
+	spin_lock(&ubq->evts_lock);
+	done = (READ_ONCE(ubq->active_fcmd) != fcmd);
+	if (done)
+		list_del(&fcmd->node);
+	spin_unlock(&ubq->evts_lock);
+
+	if (done) {
+		io_uring_cmd_done(fcmd->cmd, UBLK_IO_RES_ABORT, issue_flags);
+		ublk_batch_free_fcmd(fcmd);
+	}
+}
+
+static void ublk_batch_cancel_queue(struct ublk_queue *ubq)
+{
+	struct ublk_batch_fetch_cmd *fcmd;
+	LIST_HEAD(fcmd_list);
+
+	spin_lock(&ubq->evts_lock);
+	ubq->force_abort = true;
+	list_splice_init(&ubq->fcmd_head, &fcmd_list);
+	fcmd = READ_ONCE(ubq->active_fcmd);
+	if (fcmd)
+		list_move(&fcmd->node, &ubq->fcmd_head);
+	spin_unlock(&ubq->evts_lock);
+
+	while (!list_empty(&fcmd_list)) {
+		fcmd = list_first_entry(&fcmd_list,
+				struct ublk_batch_fetch_cmd, node);
+		ublk_batch_cancel_cmd(ubq, fcmd, IO_URING_F_UNLOCKED);
+	}
+}
+
+static void ublk_batch_cancel_fn(struct io_uring_cmd *cmd,
+				 unsigned int issue_flags)
+{
+	struct ublk_uring_cmd_pdu *pdu = ublk_get_uring_cmd_pdu(cmd);
+	struct ublk_batch_fetch_cmd *fcmd = pdu->fcmd;
+	struct ublk_queue *ubq = pdu->ubq;
+
+	ublk_start_cancel(ubq->dev);
+
+	ublk_batch_cancel_cmd(ubq, fcmd, issue_flags);
+}
+
 /*
  * The ublk char device won't be closed when calling cancel fn, so both
  * ublk device and queue are guaranteed to be live
@@ -2341,6 +2626,11 @@ static void ublk_cancel_queue(struct ublk_queue *ubq)
 {
 	int i;
 
+	if (ublk_support_batch_io(ubq)) {
+		ublk_batch_cancel_queue(ubq);
+		return;
+	}
+
 	for (i = 0; i < ubq->q_depth; i++)
 		ublk_cancel_cmd(ubq, i, IO_URING_F_UNLOCKED);
 }
@@ -3246,6 +3536,79 @@ static int ublk_check_batch_cmd(const struct ublk_batch_io_data *data)
 	return ublk_check_batch_cmd_flags(uc);
 }
 
+static int ublk_batch_attach(struct ublk_queue *ubq,
+			     struct ublk_batch_io_data *data,
+			     struct ublk_batch_fetch_cmd *fcmd)
+{
+	struct ublk_batch_fetch_cmd *new_fcmd = NULL;
+	bool free = false;
+	struct ublk_uring_cmd_pdu *pdu = ublk_get_uring_cmd_pdu(data->cmd);
+
+	spin_lock(&ubq->evts_lock);
+	if (unlikely(ubq->force_abort || ubq->canceling)) {
+		free = true;
+	} else {
+		list_add_tail(&fcmd->node, &ubq->fcmd_head);
+		new_fcmd = __ublk_acquire_fcmd(ubq);
+	}
+	spin_unlock(&ubq->evts_lock);
+
+	if (unlikely(free)) {
+		ublk_batch_free_fcmd(fcmd);
+		return -ENODEV;
+	}
+
+	pdu->ubq = ubq;
+	pdu->fcmd = fcmd;
+	io_uring_cmd_mark_cancelable(fcmd->cmd, data->issue_flags);
+
+	if (!new_fcmd)
+		goto out;
+
+	/*
+	 * If the two fetch commands originate from the same io_ring_ctx,
+	 * run batch dispatch directly. Otherwise, schedule task work to
+	 * do it.
+	 */
+	if (io_uring_cmd_ctx_handle(new_fcmd->cmd) ==
+			io_uring_cmd_ctx_handle(fcmd->cmd)) {
+		data->cmd = new_fcmd->cmd;
+		ublk_batch_dispatch(ubq, data, new_fcmd);
+	} else {
+		io_uring_cmd_complete_in_task(new_fcmd->cmd,
+				ublk_batch_tw_cb);
+	}
+out:
+	return -EIOCBQUEUED;
+}
+
+static int ublk_handle_batch_fetch_cmd(struct ublk_batch_io_data *data)
+{
+	struct ublk_queue *ubq = ublk_get_queue(data->ub, data->header.q_id);
+	struct ublk_batch_fetch_cmd *fcmd = ublk_batch_alloc_fcmd(data->cmd);
+
+	if (!fcmd)
+		return -ENOMEM;
+
+	return ublk_batch_attach(ubq, data, fcmd);
+}
+
+static int ublk_validate_batch_fetch_cmd(struct ublk_batch_io_data *data)
+{
+	const struct ublk_batch_io *uc = &data->header;
+
+	if (!(data->cmd->flags & IORING_URING_CMD_MULTISHOT))
+		return -EINVAL;
+
+	if (uc->elem_bytes != sizeof(__u16))
+		return -EINVAL;
+
+	if (uc->flags != 0)
+		return -EINVAL;
+
+	return 0;
+}
+
 static int ublk_ch_batch_io_uring_cmd(struct io_uring_cmd *cmd,
 				       unsigned int issue_flags)
 {
@@ -3265,6 +3628,11 @@ static int ublk_ch_batch_io_uring_cmd(struct io_uring_cmd *cmd,
 	u32 cmd_op = cmd->cmd_op;
 	int ret = -EINVAL;
 
+	if (unlikely(issue_flags & IO_URING_F_CANCEL)) {
+		ublk_batch_cancel_fn(cmd, issue_flags);
+		return 0;
+	}
+
 	if (data.header.q_id >= ub->dev_info.nr_hw_queues)
 		goto out;
 
@@ -3281,6 +3649,12 @@ static int ublk_ch_batch_io_uring_cmd(struct io_uring_cmd *cmd,
 			goto out;
 		ret = ublk_handle_batch_commit_cmd(&data);
 		break;
+	case UBLK_U_IO_FETCH_IO_CMDS:
+		ret = ublk_validate_batch_fetch_cmd(&data);
+		if (ret)
+			goto out;
+		ret = ublk_handle_batch_fetch_cmd(&data);
+		break;
 	default:
 		ret = -EOPNOTSUPP;
 	}
@@ -3503,6 +3877,7 @@ static int ublk_init_queue(struct ublk_device *ub, int q_id)
 		ret = ublk_io_evts_init(ubq, ubq->q_depth, numa_node);
 		if (ret)
 			goto fail;
+		INIT_LIST_HEAD(&ubq->fcmd_head);
 	}
 	ub->queues[q_id] = ubq;
 	ubq->dev = ub;
@@ -3625,7 +4000,10 @@ static void ublk_align_max_io_size(struct ublk_device *ub)
 
 static int ublk_add_tag_set(struct ublk_device *ub)
 {
-	ub->tag_set.ops = &ublk_mq_ops;
+	if (ublk_dev_support_batch_io(ub))
+		ub->tag_set.ops = &ublk_batch_mq_ops;
+	else
+		ub->tag_set.ops = &ublk_mq_ops;
 	ub->tag_set.nr_hw_queues = ub->dev_info.nr_hw_queues;
 	ub->tag_set.queue_depth = ub->dev_info.queue_depth;
 	ub->tag_set.numa_node = NUMA_NO_NODE;
diff --git a/include/uapi/linux/ublk_cmd.h b/include/uapi/linux/ublk_cmd.h
index 3894d676dd02..70d8ebbf4326 100644
--- a/include/uapi/linux/ublk_cmd.h
+++ b/include/uapi/linux/ublk_cmd.h
@@ -121,6 +121,13 @@
 #define	UBLK_U_IO_COMMIT_IO_CMDS	\
 	_IOWR('u', 0x26, struct ublk_batch_io)
 
+/*
+ * Fetch io commands to provided buffer in multishot style,
+ * `IORING_URING_CMD_MULTISHOT` is required for this command.
+ */
+#define	UBLK_U_IO_FETCH_IO_CMDS 	\
+	_IOWR('u', 0x27, struct ublk_batch_io)
+
 /* only ABORT means that no re-fetch */
 #define UBLK_IO_RES_OK			0
 #define UBLK_IO_RES_NEED_GET_DATA	1
-- 
cgit v1.2.3


From e2723e6ce6025026b6d79d9a00048386a69e00c3 Mon Sep 17 00:00:00 2001
From: Ming Lei <ming.lei@redhat.com>
Date: Fri, 16 Jan 2026 22:18:44 +0800
Subject: ublk: add new feature UBLK_F_BATCH_IO

Add new feature UBLK_F_BATCH_IO which replaces the following two
per-io commands:

	- UBLK_U_IO_FETCH_REQ

	- UBLK_U_IO_COMMIT_AND_FETCH_REQ

with three per-queue batch io uring_cmd:

	- UBLK_U_IO_PREP_IO_CMDS

	- UBLK_U_IO_COMMIT_IO_CMDS

	- UBLK_U_IO_FETCH_IO_CMDS

Then ublk can deliver batch io commands to the ublk server in a single
multishot uring_cmd, and can also prepare & commit multiple commands
in batch style via a single uring_cmd, so communication cost is
reduced a lot.

This feature also no longer restricts the task context for any supported
command, so any allowed uring_cmd can be issued in any task context.
The ublk server implementation becomes much easier.

Meanwhile, load balancing becomes much easier to support with this feature.
The command `UBLK_U_IO_FETCH_IO_CMDS` can be issued from multiple task
contexts, so each task can adjust this command's buffer length or the
number of inflight commands to control how much load is handled by the
current task.

Later, a priority parameter will be added to command `UBLK_U_IO_FETCH_IO_CMDS`
to improve load balancing support.

UBLK_U_IO_NEED_GET_DATA isn't supported in batch io yet, but it may be
enabled in future via its batch pair.

Reviewed-by: Caleb Sander Mateos <csander@purestorage.com>
Signed-off-by: Ming Lei <ming.lei@redhat.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 drivers/block/ublk_drv.c      | 60 +++++++++++++++++++++++++++++++++++++------
 include/uapi/linux/ublk_cmd.h | 15 +++++++++++
 2 files changed, 67 insertions(+), 8 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c
index 564cf44c238f..bec34b5ab5ab 100644
--- a/drivers/block/ublk_drv.c
+++ b/drivers/block/ublk_drv.c
@@ -79,7 +79,8 @@
 		| UBLK_F_PER_IO_DAEMON \
 		| UBLK_F_BUF_REG_OFF_DAEMON \
 		| (IS_ENABLED(CONFIG_BLK_DEV_INTEGRITY) ? UBLK_F_INTEGRITY : 0) \
-		| UBLK_F_SAFE_STOP_DEV)
+		| UBLK_F_SAFE_STOP_DEV \
+		| UBLK_F_BATCH_IO)
 
 #define UBLK_F_ALL_RECOVERY_FLAGS (UBLK_F_USER_RECOVERY \
 		| UBLK_F_USER_RECOVERY_REISSUE \
@@ -340,12 +341,12 @@ static void ublk_batch_dispatch(struct ublk_queue *ubq,
 
 static inline bool ublk_dev_support_batch_io(const struct ublk_device *ub)
 {
-	return false;
+	return ub->dev_info.flags & UBLK_F_BATCH_IO;
 }
 
 static inline bool ublk_support_batch_io(const struct ublk_queue *ubq)
 {
-	return false;
+	return ubq->flags & UBLK_F_BATCH_IO;
 }
 
 static inline void ublk_io_lock(struct ublk_io *io)
@@ -3573,9 +3574,11 @@ static int ublk_check_batch_cmd_flags(const struct ublk_batch_io *uc)
 
 static int ublk_check_batch_cmd(const struct ublk_batch_io_data *data)
 {
-
 	const struct ublk_batch_io *uc = &data->header;
 
+	if (uc->q_id >= data->ub->dev_info.nr_hw_queues)
+		return -EINVAL;
+
 	if (uc->nr_elem > data->ub->dev_info.queue_depth)
 		return -E2BIG;
 
@@ -3655,6 +3658,9 @@ static int ublk_validate_batch_fetch_cmd(struct ublk_batch_io_data *data)
 {
 	const struct ublk_batch_io *uc = &data->header;
 
+	if (uc->q_id >= data->ub->dev_info.nr_hw_queues)
+		return -EINVAL;
+
 	if (!(data->cmd->flags & IORING_URING_CMD_MULTISHOT))
 		return -EINVAL;
 
@@ -3667,6 +3673,35 @@ static int ublk_validate_batch_fetch_cmd(struct ublk_batch_io_data *data)
 	return 0;
 }
 
+static int ublk_handle_non_batch_cmd(struct io_uring_cmd *cmd,
+				     unsigned int issue_flags)
+{
+	const struct ublksrv_io_cmd *ub_cmd = io_uring_sqe_cmd(cmd->sqe);
+	struct ublk_device *ub = cmd->file->private_data;
+	unsigned tag = READ_ONCE(ub_cmd->tag);
+	unsigned q_id = READ_ONCE(ub_cmd->q_id);
+	unsigned index = READ_ONCE(ub_cmd->addr);
+	struct ublk_queue *ubq;
+	struct ublk_io *io;
+
+	if (cmd->cmd_op == UBLK_U_IO_UNREGISTER_IO_BUF)
+		return ublk_unregister_io_buf(cmd, ub, index, issue_flags);
+
+	if (q_id >= ub->dev_info.nr_hw_queues)
+		return -EINVAL;
+
+	if (tag >= ub->dev_info.queue_depth)
+		return -EINVAL;
+
+	if (cmd->cmd_op != UBLK_U_IO_REGISTER_IO_BUF)
+		return -EOPNOTSUPP;
+
+	ubq = ublk_get_queue(ub, q_id);
+	io = &ubq->ios[tag];
+	return ublk_register_io_buf(cmd, ub, q_id, tag, io, index,
+			issue_flags);
+}
+
 static int ublk_ch_batch_io_uring_cmd(struct io_uring_cmd *cmd,
 				       unsigned int issue_flags)
 {
@@ -3691,9 +3726,6 @@ static int ublk_ch_batch_io_uring_cmd(struct io_uring_cmd *cmd,
 		return 0;
 	}
 
-	if (data.header.q_id >= ub->dev_info.nr_hw_queues)
-		goto out;
-
 	switch (cmd_op) {
 	case UBLK_U_IO_PREP_IO_CMDS:
 		ret = ublk_check_batch_cmd(&data);
@@ -3714,7 +3746,8 @@ static int ublk_ch_batch_io_uring_cmd(struct io_uring_cmd *cmd,
 		ret = ublk_handle_batch_fetch_cmd(&data);
 		break;
 	default:
-		ret = -EOPNOTSUPP;
+		ret = ublk_handle_non_batch_cmd(cmd, issue_flags);
+		break;
 	}
 out:
 	return ret;
@@ -4437,6 +4470,10 @@ static int ublk_ctrl_add_dev(const struct ublksrv_ctrl_cmd *header)
 		UBLK_F_BUF_REG_OFF_DAEMON |
 		UBLK_F_SAFE_STOP_DEV;
 
+	/* So far, UBLK_F_PER_IO_DAEMON won't be exposed for BATCH_IO */
+	if (ublk_dev_support_batch_io(ub))
+		ub->dev_info.flags &= ~UBLK_F_PER_IO_DAEMON;
+
 	/* GET_DATA isn't needed any more with USER_COPY or ZERO COPY */
 	if (ub->dev_info.flags & (UBLK_F_USER_COPY | UBLK_F_SUPPORT_ZERO_COPY |
 				UBLK_F_AUTO_BUF_REG))
@@ -4820,6 +4857,13 @@ static int ublk_wait_for_idle_io(struct ublk_device *ub,
 	unsigned int elapsed = 0;
 	int ret;
 
+	/*
+	 * For UBLK_F_BATCH_IO ublk server can get notified with existing
+	 * or new fetch command, so needn't wait any more
+	 */
+	if (ublk_dev_support_batch_io(ub))
+		return 0;
+
 	while (elapsed < timeout_ms && !signal_pending(current)) {
 		unsigned int queues_cancelable = 0;
 		int i;
diff --git a/include/uapi/linux/ublk_cmd.h b/include/uapi/linux/ublk_cmd.h
index 70d8ebbf4326..743d31491387 100644
--- a/include/uapi/linux/ublk_cmd.h
+++ b/include/uapi/linux/ublk_cmd.h
@@ -340,6 +340,21 @@
  */
 #define UBLK_F_BUF_REG_OFF_DAEMON (1ULL << 14)
 
+/*
+ * Support the following commands for delivering & committing io command
+ * in batch.
+ *
+ * 	- UBLK_U_IO_PREP_IO_CMDS
+ * 	- UBLK_U_IO_COMMIT_IO_CMDS
+ * 	- UBLK_U_IO_FETCH_IO_CMDS
+ * 	- UBLK_U_IO_REGISTER_IO_BUF
+ * 	- UBLK_U_IO_UNREGISTER_IO_BUF
+ *
+ * The existing UBLK_U_IO_FETCH_REQ, UBLK_U_IO_COMMIT_AND_FETCH_REQ and
+ * UBLK_U_IO_NEED_GET_DATA uring_cmd are not supported for this feature.
+ */
+#define UBLK_F_BATCH_IO		(1ULL << 15)
+
 /*
  * ublk device supports requests with integrity/metadata buffer.
  * Requires UBLK_F_USER_COPY.
-- 
cgit v1.2.3


From fa9893fadbc245e179cb17f3c371c67471b5a8a8 Mon Sep 17 00:00:00 2001
From: Michael Roth <michael.roth@amd.com>
Date: Fri, 9 Jan 2026 17:17:32 -0600
Subject: KVM: Introduce KVM_EXIT_SNP_REQ_CERTS for SNP certificate-fetching

For SEV-SNP, the host can optionally provide a certificate table to the
guest when it issues an attestation request to firmware (see GHCB 2.0
specification regarding "SNP Extended Guest Requests"). This certificate
table can then be used to verify the endorsement key used by firmware to
sign the attestation report.

While it is possible for guests to obtain the certificates through other
means, handling it via the host provides more flexibility in being able
to keep the certificate data in sync with the endorsement key throughout
host-side operations that might result in the endorsement key
changing.

In the case of KVM, userspace will be responsible for fetching the
certificate table and keeping it in sync with any modifications to the
endorsement key by other userspace management tools. Define a new
KVM_EXIT_SNP_REQ_CERTS event where userspace is provided with the GPA of
the buffer the guest has provided as part of the attestation request so
that userspace can write the certificate data into it while relying on
filesystem-based locking to keep the certificates up-to-date relative to
the endorsement keys installed/utilized by firmware at the time the
certificates are fetched.

[Melody: Update the documentation scheme about how file locking is
         expected to happen.]

Reviewed-by: Liam Merwick <liam.merwick@oracle.com>
Tested-by: Liam Merwick <liam.merwick@oracle.com>
Tested-by: Dionna Glaze <dionnaglaze@google.com>
Signed-off-by: Michael Roth <michael.roth@amd.com>
Signed-off-by: Melody Wang <huibo.wang@amd.com>
Signed-off-by: Michael Roth <michael.roth@amd.com>
Link: https://patch.msgid.link/20260109231732.1160759-2-michael.roth@amd.com
Signed-off-by: Sean Christopherson <seanjc@google.com>
---
 Documentation/virt/kvm/api.rst | 44 ++++++++++++++++++++++++++++++
 arch/x86/kvm/svm/sev.c         | 62 ++++++++++++++++++++++++++++++++++++++----
 arch/x86/kvm/svm/svm.h         |  1 +
 include/uapi/linux/kvm.h       |  9 ++++++
 4 files changed, 110 insertions(+), 6 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst
index 01a3abef8abb..428d7d9cb4d6 100644
--- a/Documentation/virt/kvm/api.rst
+++ b/Documentation/virt/kvm/api.rst
@@ -7353,6 +7353,50 @@ Please note that the kernel is allowed to use the kvm_run structure as the
 primary storage for certain register types. Therefore, the kernel may use the
 values in kvm_run even if the corresponding bit in kvm_dirty_regs is not set.
 
+::
+
+		/* KVM_EXIT_SNP_REQ_CERTS */
+		struct kvm_exit_snp_req_certs {
+			__u64 gpa;
+			__u64 npages;
+			__u64 ret;
+		};
+
+KVM_EXIT_SNP_REQ_CERTS indicates an SEV-SNP guest with certificate-fetching
+enabled (see KVM_SEV_SNP_ENABLE_REQ_CERTS) has generated an Extended Guest
+Request NAE #VMGEXIT (SNP_GUEST_REQUEST) with message type MSG_REPORT_REQ,
+i.e. has requested an attestation report from firmware, and would like the
+certificate data corresponding to the attestation report signature to be
+provided by the hypervisor as part of the request.
+
+To allow for userspace to provide the certificate, the 'gpa' and 'npages'
+are forwarded verbatim from the guest request (the RAX and RBX GHCB fields
+respectively).  'ret' is not an "output" from KVM, and is always '0' on
+exit.  KVM verifies the 'gpa' is 4KiB aligned prior to exiting to userspace,
+but otherwise the information from the guest isn't validated.
+
+Upon the next KVM_RUN, e.g. after userspace has serviced the request (or not),
+KVM will complete the #VMGEXIT, using the 'ret' field to determine whether to
+signal success or failure to the guest, and on failure, what reason code will
+be communicated via SW_EXITINFO2.  If 'ret' is set to an unsupported value (see
+the table below), KVM_RUN will fail with -EINVAL.  For a 'ret' of 'ENOSPC', KVM
+also consumes the 'npages' field, i.e. userspace can use the field to inform
+the guest of the number of pages needed to hold all the certificate data.
+
+The supported 'ret' values and their respective SW_EXITINFO2 encodings:
+
+  ======     =============================================================
+  0          0x0, i.e. success.  KVM will emit an SNP_GUEST_REQUEST command
+             to SNP firmware.
+  ENOSPC     0x0000000100000000, i.e. not enough guest pages to hold the
+             certificate table and certificate data.  KVM will also set the
+             RBX field in the GHCB to 'npages'.
+  EAGAIN     0x0000000200000000, i.e. the host is busy and the guest should
+             retry the request.
+  EIO        0xffffffff00000000, for all other errors (this return code is
+             a KVM-defined hypervisor value, as allowed by the GHCB)
+  ======     =============================================================
+
 
 .. _cap_enable:
 
diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c
index f67525007089..9e6a78e448f2 100644
--- a/arch/x86/kvm/svm/sev.c
+++ b/arch/x86/kvm/svm/sev.c
@@ -41,6 +41,16 @@
 
 #define GHCB_HV_FT_SUPPORTED	(GHCB_HV_FT_SNP | GHCB_HV_FT_SNP_AP_CREATION)
 
+/*
+ * The GHCB spec essentially states that all non-zero error codes other than
+ * those explicitly defined above should be treated as an error by the guest.
+ * Define a generic error to cover that case, and choose a value that is not
+ * likely to overlap with new explicit error codes should more be added to
+ * the GHCB spec later. KVM will use this to report generic errors when
+ * handling SNP guest requests.
+ */
+#define SNP_GUEST_VMM_ERR_GENERIC       (~0U)
+
 /* enable/disable SEV support */
 static bool sev_enabled = true;
 module_param_named(sev, sev_enabled, bool, 0444);
@@ -4139,6 +4149,36 @@ out_unlock:
 	return ret;
 }
 
+static int snp_req_certs_err(struct vcpu_svm *svm, u32 vmm_error)
+{
+	ghcb_set_sw_exit_info_2(svm->sev_es.ghcb, SNP_GUEST_ERR(vmm_error, 0));
+
+	return 1; /* resume guest */
+}
+
+static int snp_complete_req_certs(struct kvm_vcpu *vcpu)
+{
+	struct vcpu_svm *svm = to_svm(vcpu);
+	struct vmcb_control_area *control = &svm->vmcb->control;
+
+	switch (READ_ONCE(vcpu->run->snp_req_certs.ret)) {
+	case 0:
+		return snp_handle_guest_req(svm, control->exit_info_1,
+					    control->exit_info_2);
+	case ENOSPC:
+		vcpu->arch.regs[VCPU_REGS_RBX] = vcpu->run->snp_req_certs.npages;
+		return snp_req_certs_err(svm, SNP_GUEST_VMM_ERR_INVALID_LEN);
+	case EAGAIN:
+		return snp_req_certs_err(svm, SNP_GUEST_VMM_ERR_BUSY);
+	case EIO:
+		return snp_req_certs_err(svm, SNP_GUEST_VMM_ERR_GENERIC);
+	default:
+		break;
+	}
+
+	return -EINVAL;
+}
+
 static int snp_handle_ext_guest_req(struct vcpu_svm *svm, gpa_t req_gpa, gpa_t resp_gpa)
 {
 	struct kvm *kvm = svm->vcpu.kvm;
@@ -4154,14 +4194,15 @@ static int snp_handle_ext_guest_req(struct vcpu_svm *svm, gpa_t req_gpa, gpa_t r
 	/*
 	 * As per GHCB spec, requests of type MSG_REPORT_REQ also allow for
 	 * additional certificate data to be provided alongside the attestation
-	 * report via the guest-provided data pages indicated by RAX/RBX. The
-	 * certificate data is optional and requires additional KVM enablement
-	 * to provide an interface for userspace to provide it, but KVM still
-	 * needs to be able to handle extended guest requests either way. So
-	 * provide a stub implementation that will always return an empty
-	 * certificate table in the guest-provided data pages.
+	 * report via the guest-provided data pages indicated by RAX/RBX. If
+	 * userspace enables KVM_EXIT_SNP_REQ_CERTS, then exit to userspace
+	 * to give userspace an opportunity to provide the certificate data
+	 * before issuing/completing the attestation request. Otherwise, return
+	 * an empty certificate table in the guest-provided data pages and
+	 * handle the attestation request immediately.
 	 */
 	if (msg_type == SNP_MSG_REPORT_REQ) {
+		struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
 		struct kvm_vcpu *vcpu = &svm->vcpu;
 		u64 data_npages;
 		gpa_t data_gpa;
@@ -4175,6 +4216,15 @@ static int snp_handle_ext_guest_req(struct vcpu_svm *svm, gpa_t req_gpa, gpa_t r
 		if (!PAGE_ALIGNED(data_gpa))
 			goto request_invalid;
 
+		if (sev->snp_certs_enabled) {
+			vcpu->run->exit_reason = KVM_EXIT_SNP_REQ_CERTS;
+			vcpu->run->snp_req_certs.gpa = data_gpa;
+			vcpu->run->snp_req_certs.npages = data_npages;
+			vcpu->run->snp_req_certs.ret = 0;
+			vcpu->arch.complete_userspace_io = snp_complete_req_certs;
+			return 0;
+		}
+
 		/*
 		 * As per GHCB spec (see "SNP Extended Guest Request"), the
 		 * certificate table is terminated by 24-bytes of zeroes.
diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h
index 338fc4f5cc4c..ebd7b36b1ceb 100644
--- a/arch/x86/kvm/svm/svm.h
+++ b/arch/x86/kvm/svm/svm.h
@@ -115,6 +115,7 @@ struct kvm_sev_info {
 	void *guest_resp_buf;   /* Bounce buffer for SNP Guest Request output */
 	struct mutex guest_req_mutex; /* Must acquire before using bounce buffers */
 	cpumask_var_t have_run_cpus; /* CPUs that have done VMRUN for this VM. */
+	bool snp_certs_enabled;	/* SNP certificate-fetching support. */
 };
 
 struct kvm_svm {
diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
index dddb781b0507..8cd107cdcf0b 100644
--- a/include/uapi/linux/kvm.h
+++ b/include/uapi/linux/kvm.h
@@ -135,6 +135,12 @@ struct kvm_xen_exit {
 	} u;
 };
 
+struct kvm_exit_snp_req_certs {
+	__u64 gpa;
+	__u64 npages;
+	__u64 ret;
+};
+
 #define KVM_S390_GET_SKEYS_NONE   1
 #define KVM_S390_SKEYS_MAX        1048576
 
@@ -180,6 +186,7 @@ struct kvm_xen_exit {
 #define KVM_EXIT_MEMORY_FAULT     39
 #define KVM_EXIT_TDX              40
 #define KVM_EXIT_ARM_SEA          41
+#define KVM_EXIT_SNP_REQ_CERTS    42
 
 /* For KVM_EXIT_INTERNAL_ERROR */
 /* Emulate instruction failed. */
@@ -482,6 +489,8 @@ struct kvm_run {
 			__u64 gva;
 			__u64 gpa;
 		} arm_sea;
+		/* KVM_EXIT_SNP_REQ_CERTS */
+		struct kvm_exit_snp_req_certs snp_req_certs;
 		/* Fix the size of the union. */
 		char padding[256];
 	};
-- 
cgit v1.2.3


From ba1b8c97b9a0414432382a11f144a8597f6f597e Mon Sep 17 00:00:00 2001
From: Paolo Abeni <pabeni@redhat.com>
Date: Wed, 21 Jan 2026 17:11:30 +0100
Subject: geneve: add netlink support for GRO hint

Allow configuring and dumping the new device option, and cache its value
into the geneve socket itself.
The new option is not tied to any code yet.

Signed-off-by: Paolo Abeni <pabeni@redhat.com>
Link: https://patch.msgid.link/2295d4e4d1e919a3189425141bbc71c7850a2de0.1769011015.git.pabeni@redhat.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 Documentation/netlink/specs/rt-link.yaml |  3 +++
 drivers/net/geneve.c                     | 29 +++++++++++++++++++++++++----
 include/uapi/linux/if_link.h             |  1 +
 3 files changed, 29 insertions(+), 4 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/Documentation/netlink/specs/rt-link.yaml b/Documentation/netlink/specs/rt-link.yaml
index 6beeb6ee5adf..df4b56beb818 100644
--- a/Documentation/netlink/specs/rt-link.yaml
+++ b/Documentation/netlink/specs/rt-link.yaml
@@ -1914,6 +1914,9 @@ attribute-sets:
         name: port-range
         type: binary
         struct: ifla-geneve-port-range
+      -
+        name: gro-hint
+        type: flag
   -
     name: linkinfo-hsr-attrs
     name-prefix: ifla-hsr-
diff --git a/drivers/net/geneve.c b/drivers/net/geneve.c
index 64ea4b970376..8719ad66837e 100644
--- a/drivers/net/geneve.c
+++ b/drivers/net/geneve.c
@@ -56,6 +56,7 @@ struct geneve_config {
 	bool			collect_md;
 	bool			use_udp6_rx_checksums;
 	bool			ttl_inherit;
+	bool			gro_hint;
 	enum ifla_geneve_df	df;
 	bool			inner_proto_inherit;
 	u16			port_min;
@@ -84,6 +85,7 @@ struct geneve_dev {
 
 struct geneve_sock {
 	bool			collect_md;
+	bool			gro_hint;
 	struct list_head	list;
 	struct socket		*sock;
 	struct rcu_head		rcu;
@@ -659,13 +661,15 @@ static void geneve_sock_release(struct geneve_dev *geneve)
 
 static struct geneve_sock *geneve_find_sock(struct geneve_net *gn,
 					    sa_family_t family,
-					    __be16 dst_port)
+					    __be16 dst_port,
+					    bool gro_hint)
 {
 	struct geneve_sock *gs;
 
 	list_for_each_entry(gs, &gn->sock_list, list) {
 		if (inet_sk(gs->sock->sk)->inet_sport == dst_port &&
-		    geneve_get_sk_family(gs) == family) {
+		    geneve_get_sk_family(gs) == family &&
+		    gs->gro_hint == gro_hint) {
 			return gs;
 		}
 	}
@@ -676,12 +680,14 @@ static int geneve_sock_add(struct geneve_dev *geneve, bool ipv6)
 {
 	struct net *net = geneve->net;
 	struct geneve_net *gn = net_generic(net, geneve_net_id);
+	bool gro_hint = geneve->cfg.gro_hint;
 	struct geneve_dev_node *node;
 	struct geneve_sock *gs;
 	__u8 vni[3];
 	__u32 hash;
 
-	gs = geneve_find_sock(gn, ipv6 ? AF_INET6 : AF_INET, geneve->cfg.info.key.tp_dst);
+	gs = geneve_find_sock(gn, ipv6 ? AF_INET6 : AF_INET,
+			      geneve->cfg.info.key.tp_dst, gro_hint);
 	if (gs) {
 		gs->refcnt++;
 		goto out;
@@ -694,6 +700,7 @@ static int geneve_sock_add(struct geneve_dev *geneve, bool ipv6)
 
 out:
 	gs->collect_md = geneve->cfg.collect_md;
+	gs->gro_hint = gro_hint;
 #if IS_ENABLED(CONFIG_IPV6)
 	if (ipv6) {
 		rcu_assign_pointer(geneve->sock6, gs);
@@ -1257,6 +1264,7 @@ static const struct nla_policy geneve_policy[IFLA_GENEVE_MAX + 1] = {
 	[IFLA_GENEVE_DF]		= { .type = NLA_U8 },
 	[IFLA_GENEVE_INNER_PROTO_INHERIT]	= { .type = NLA_FLAG },
 	[IFLA_GENEVE_PORT_RANGE]	= NLA_POLICY_EXACT_LEN(sizeof(struct ifla_geneve_port_range)),
+	[IFLA_GENEVE_GRO_HINT]		= { .type = NLA_FLAG },
 };
 
 static int geneve_validate(struct nlattr *tb[], struct nlattr *data[],
@@ -1607,10 +1615,18 @@ static int geneve_nl2info(struct nlattr *tb[], struct nlattr *data[],
 		cfg->inner_proto_inherit = true;
 	}
 
+	if (data[IFLA_GENEVE_GRO_HINT]) {
+		if (changelink) {
+			attrtype = IFLA_GENEVE_GRO_HINT;
+			goto change_notsup;
+		}
+		cfg->gro_hint = true;
+	}
+
 	return 0;
 change_notsup:
 	NL_SET_ERR_MSG_ATTR(extack, data[attrtype],
-			    "Changing VNI, Port, endpoint IP address family, external, inner_proto_inherit, and UDP checksum attributes are not supported");
+			    "Changing VNI, Port, endpoint IP address family, external, inner_proto_inherit, gro_hint and UDP checksum attributes are not supported");
 	return -EOPNOTSUPP;
 }
 
@@ -1793,6 +1809,7 @@ static size_t geneve_get_size(const struct net_device *dev)
 		nla_total_size(sizeof(__u8)) + /* IFLA_GENEVE_TTL_INHERIT */
 		nla_total_size(0) +	 /* IFLA_GENEVE_INNER_PROTO_INHERIT */
 		nla_total_size(sizeof(struct ifla_geneve_port_range)) + /* IFLA_GENEVE_PORT_RANGE */
+		nla_total_size(0) +	 /* IFLA_GENEVE_GRO_HINT */
 		0;
 }
 
@@ -1865,6 +1882,10 @@ static int geneve_fill_info(struct sk_buff *skb, const struct net_device *dev)
 	if (nla_put(skb, IFLA_GENEVE_PORT_RANGE, sizeof(ports), &ports))
 		goto nla_put_failure;
 
+	if (geneve->cfg.gro_hint &&
+	    nla_put_flag(skb, IFLA_GENEVE_GRO_HINT))
+		goto nla_put_failure;
+
 	return 0;
 
 nla_put_failure:
diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h
index 3b491d96e52e..e9b5f79e1ee1 100644
--- a/include/uapi/linux/if_link.h
+++ b/include/uapi/linux/if_link.h
@@ -1443,6 +1443,7 @@ enum {
 	IFLA_GENEVE_DF,
 	IFLA_GENEVE_INNER_PROTO_INHERIT,
 	IFLA_GENEVE_PORT_RANGE,
+	IFLA_GENEVE_GRO_HINT,
 	__IFLA_GENEVE_MAX
 };
 #define IFLA_GENEVE_MAX	(__IFLA_GENEVE_MAX - 1)
-- 
cgit v1.2.3


From 795663b4d160ba652959f1a46381c5e8b1342a53 Mon Sep 17 00:00:00 2001
From: Pavel Begunkov <asml.silence@gmail.com>
Date: Sat, 24 Jan 2026 10:36:17 +0000
Subject: io_uring/zcrx: implement large rx buffer support

There are network cards that support receive buffers larger than 4K, and
that can be vastly beneficial for performance, and benchmarks for this
patch showed up to 30% CPU util improvement for 32K vs 4K buffers.

Allow zcrx users to specify the size in struct
io_uring_zcrx_ifq_reg::rx_buf_len. If set to zero, zcrx will use a
default value. zcrx will check and fail if the memory backing the area
can't be split into physically contiguous chunks of the required size.
That is more restrictive than necessary, as only the dma addresses need
to be contiguous, but relaxing it is beyond this series.

Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
[axboe: kill duplicate netdev_queues.h include]
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/uapi/linux/io_uring.h |  2 +-
 io_uring/zcrx.c               | 38 +++++++++++++++++++++++++++++++++-----
 2 files changed, 34 insertions(+), 6 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h
index b5b23c0d5283..3184f7e7f1f2 100644
--- a/include/uapi/linux/io_uring.h
+++ b/include/uapi/linux/io_uring.h
@@ -1082,7 +1082,7 @@ struct io_uring_zcrx_ifq_reg {
 
 	struct io_uring_zcrx_offsets offsets;
 	__u32	zcrx_id;
-	__u32	__resv2;
+	__u32	rx_buf_len;
 	__u64	__resv[3];
 };
 
diff --git a/io_uring/zcrx.c b/io_uring/zcrx.c
index b99cf2c6670a..8a9df72bc094 100644
--- a/io_uring/zcrx.c
+++ b/io_uring/zcrx.c
@@ -55,6 +55,18 @@ static inline struct page *io_zcrx_iov_page(const struct net_iov *niov)
 	return area->mem.pages[net_iov_idx(niov) << niov_pages_shift];
 }
 
+static int io_area_max_shift(struct io_zcrx_mem *mem)
+{
+	struct sg_table *sgt = mem->sgt;
+	struct scatterlist *sg;
+	unsigned shift = -1U;
+	unsigned i;
+
+	for_each_sgtable_dma_sg(sgt, sg, i)
+		shift = min(shift, __ffs(sg->length));
+	return shift;
+}
+
 static int io_populate_area_dma(struct io_zcrx_ifq *ifq,
 				struct io_zcrx_area *area)
 {
@@ -416,12 +428,21 @@ static int io_zcrx_append_area(struct io_zcrx_ifq *ifq,
 }
 
 static int io_zcrx_create_area(struct io_zcrx_ifq *ifq,
-			       struct io_uring_zcrx_area_reg *area_reg)
+			       struct io_uring_zcrx_area_reg *area_reg,
+			       struct io_uring_zcrx_ifq_reg *reg)
 {
+	int buf_size_shift = PAGE_SHIFT;
 	struct io_zcrx_area *area;
 	unsigned nr_iovs;
 	int i, ret;
 
+	if (reg->rx_buf_len) {
+		if (!is_power_of_2(reg->rx_buf_len) ||
+		     reg->rx_buf_len < PAGE_SIZE)
+			return -EINVAL;
+		buf_size_shift = ilog2(reg->rx_buf_len);
+	}
+
 	ret = -ENOMEM;
 	area = kzalloc(sizeof(*area), GFP_KERNEL);
 	if (!area)
@@ -432,7 +453,12 @@ static int io_zcrx_create_area(struct io_zcrx_ifq *ifq,
 	if (ret)
 		goto err;
 
-	ifq->niov_shift = PAGE_SHIFT;
+	if (buf_size_shift > io_area_max_shift(&area->mem)) {
+		ret = -ERANGE;
+		goto err;
+	}
+
+	ifq->niov_shift = buf_size_shift;
 	nr_iovs = area->mem.size >> ifq->niov_shift;
 	area->nia.num_niovs = nr_iovs;
 
@@ -742,8 +768,7 @@ int io_register_zcrx_ifq(struct io_ring_ctx *ctx,
 		return -EINVAL;
 	if (copy_from_user(&reg, arg, sizeof(reg)))
 		return -EFAULT;
-	if (!mem_is_zero(&reg.__resv, sizeof(reg.__resv)) ||
-	    reg.__resv2 || reg.zcrx_id)
+	if (!mem_is_zero(&reg.__resv, sizeof(reg.__resv)) || reg.zcrx_id)
 		return -EINVAL;
 	if (reg.flags & ZCRX_REG_IMPORT)
 		return import_zcrx(ctx, arg, &reg);
@@ -800,10 +825,11 @@ int io_register_zcrx_ifq(struct io_ring_ctx *ctx,
 	}
 	get_device(ifq->dev);
 
-	ret = io_zcrx_create_area(ifq, &area);
+	ret = io_zcrx_create_area(ifq, &area, &reg);
 	if (ret)
 		goto netdev_put_unlock;
 
+	mp_param.rx_page_size = 1U << ifq->niov_shift;
 	mp_param.mp_ops = &io_uring_pp_zc_ops;
 	mp_param.mp_priv = ifq;
 	ret = __net_mp_open_rxq(ifq->netdev, reg.if_rxq, &mp_param, NULL);
@@ -821,6 +847,8 @@ int io_register_zcrx_ifq(struct io_ring_ctx *ctx,
 			goto err;
 	}
 
+	reg.rx_buf_len = 1U << ifq->niov_shift;
+
 	if (copy_to_user(arg, &reg, sizeof(reg)) ||
 	    copy_to_user(u64_to_user_ptr(reg.region_ptr), &rd, sizeof(rd)) ||
 	    copy_to_user(u64_to_user_ptr(reg.area_ptr), &area, sizeof(area))) {
-- 
cgit v1.2.3


From 2d419c44658f75e7655794341a95c0687830f3df Mon Sep 17 00:00:00 2001
From: Menglong Dong <menglong8.dong@gmail.com>
Date: Sat, 24 Jan 2026 14:19:56 +0800
Subject: bpf: add fsession support

The fsession is similar to the kprobe session. It allows attaching
a single BPF program to both the entry and the exit of the target
functions.

Introduce struct bpf_fsession_link, which allows adding the link to
both the fentry and fexit progs_hlist of the trampoline.

Signed-off-by: Menglong Dong <dongml2@chinatelecom.cn>
Co-developed-by: Leon Hwang <leon.hwang@linux.dev>
Signed-off-by: Leon Hwang <leon.hwang@linux.dev>
Link: https://lore.kernel.org/r/20260124062008.8657-2-dongml2@chinatelecom.cn
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/bpf.h                                | 19 ++++++++
 include/uapi/linux/bpf.h                           |  1 +
 kernel/bpf/btf.c                                   |  2 +
 kernel/bpf/syscall.c                               | 18 +++++++-
 kernel/bpf/trampoline.c                            | 53 ++++++++++++++++++----
 kernel/bpf/verifier.c                              | 12 +++--
 net/bpf/test_run.c                                 |  1 +
 net/core/bpf_sk_storage.c                          |  1 +
 tools/include/uapi/linux/bpf.h                     |  1 +
 .../selftests/bpf/prog_tests/tracing_failure.c     |  2 +-
 10 files changed, 97 insertions(+), 13 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 5936f8e2996f..41228b0add52 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -1309,6 +1309,7 @@ enum bpf_tramp_prog_type {
 	BPF_TRAMP_MODIFY_RETURN,
 	BPF_TRAMP_MAX,
 	BPF_TRAMP_REPLACE, /* more than MAX */
+	BPF_TRAMP_FSESSION,
 };
 
 struct bpf_tramp_image {
@@ -1875,6 +1876,11 @@ struct bpf_tracing_link {
 	struct bpf_prog *tgt_prog;
 };
 
+struct bpf_fsession_link {
+	struct bpf_tracing_link link;
+	struct bpf_tramp_link fexit;
+};
+
 struct bpf_raw_tp_link {
 	struct bpf_link link;
 	struct bpf_raw_event_map *btp;
@@ -2169,6 +2175,19 @@ static inline void bpf_struct_ops_desc_release(struct bpf_struct_ops_desc *st_op
 
 #endif
 
+static inline int bpf_fsession_cnt(struct bpf_tramp_links *links)
+{
+	struct bpf_tramp_links fentries = links[BPF_TRAMP_FENTRY];
+	int cnt = 0;
+
+	for (int i = 0; i < links[BPF_TRAMP_FENTRY].nr_links; i++) {
+		if (fentries.links[i]->link.prog->expected_attach_type == BPF_TRACE_FSESSION)
+			cnt++;
+	}
+
+	return cnt;
+}
+
 int bpf_prog_ctx_arg_info_init(struct bpf_prog *prog,
 			       const struct bpf_ctx_arg_aux *info, u32 cnt);
 
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 2a2ade4be60f..44e7dbc278e3 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -1145,6 +1145,7 @@ enum bpf_attach_type {
 	BPF_NETKIT_PEER,
 	BPF_TRACE_KPROBE_SESSION,
 	BPF_TRACE_UPROBE_SESSION,
+	BPF_TRACE_FSESSION,
 	__MAX_BPF_ATTACH_TYPE
 };
 
diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c
index d10b3404260f..8959f3bc1e92 100644
--- a/kernel/bpf/btf.c
+++ b/kernel/bpf/btf.c
@@ -6219,6 +6219,7 @@ static int btf_validate_prog_ctx_type(struct bpf_verifier_log *log, const struct
 		case BPF_TRACE_FENTRY:
 		case BPF_TRACE_FEXIT:
 		case BPF_MODIFY_RETURN:
+		case BPF_TRACE_FSESSION:
 			/* allow u64* as ctx */
 			if (btf_is_int(t) && t->size == 8)
 				return 0;
@@ -6820,6 +6821,7 @@ bool btf_ctx_access(int off, int size, enum bpf_access_type type,
 			fallthrough;
 		case BPF_LSM_CGROUP:
 		case BPF_TRACE_FEXIT:
+		case BPF_TRACE_FSESSION:
 			/* When LSM programs are attached to void LSM hooks
 			 * they use FEXIT trampolines and when attached to
 			 * int LSM hooks, they use MODIFY_RETURN trampolines.
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 3c5c03d43f5f..b9184545c3fd 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -3577,6 +3577,7 @@ static int bpf_tracing_prog_attach(struct bpf_prog *prog,
 	case BPF_PROG_TYPE_TRACING:
 		if (prog->expected_attach_type != BPF_TRACE_FENTRY &&
 		    prog->expected_attach_type != BPF_TRACE_FEXIT &&
+		    prog->expected_attach_type != BPF_TRACE_FSESSION &&
 		    prog->expected_attach_type != BPF_MODIFY_RETURN) {
 			err = -EINVAL;
 			goto out_put_prog;
@@ -3626,7 +3627,21 @@ static int bpf_tracing_prog_attach(struct bpf_prog *prog,
 		key = bpf_trampoline_compute_key(tgt_prog, NULL, btf_id);
 	}
 
-	link = kzalloc(sizeof(*link), GFP_USER);
+	if (prog->expected_attach_type == BPF_TRACE_FSESSION) {
+		struct bpf_fsession_link *fslink;
+
+		fslink = kzalloc(sizeof(*fslink), GFP_USER);
+		if (fslink) {
+			bpf_link_init(&fslink->fexit.link, BPF_LINK_TYPE_TRACING,
+				      &bpf_tracing_link_lops, prog, attach_type);
+			fslink->fexit.cookie = bpf_cookie;
+			link = &fslink->link;
+		} else {
+			link = NULL;
+		}
+	} else {
+		link = kzalloc(sizeof(*link), GFP_USER);
+	}
 	if (!link) {
 		err = -ENOMEM;
 		goto out_put_prog;
@@ -4350,6 +4365,7 @@ attach_type_to_prog_type(enum bpf_attach_type attach_type)
 	case BPF_TRACE_RAW_TP:
 	case BPF_TRACE_FENTRY:
 	case BPF_TRACE_FEXIT:
+	case BPF_TRACE_FSESSION:
 	case BPF_MODIFY_RETURN:
 		return BPF_PROG_TYPE_TRACING;
 	case BPF_LSM_MAC:
diff --git a/kernel/bpf/trampoline.c b/kernel/bpf/trampoline.c
index 2a125d063e62..edf9da43762d 100644
--- a/kernel/bpf/trampoline.c
+++ b/kernel/bpf/trampoline.c
@@ -109,10 +109,17 @@ bool bpf_prog_has_trampoline(const struct bpf_prog *prog)
 	enum bpf_attach_type eatype = prog->expected_attach_type;
 	enum bpf_prog_type ptype = prog->type;
 
-	return (ptype == BPF_PROG_TYPE_TRACING &&
-		(eatype == BPF_TRACE_FENTRY || eatype == BPF_TRACE_FEXIT ||
-		 eatype == BPF_MODIFY_RETURN)) ||
-		(ptype == BPF_PROG_TYPE_LSM && eatype == BPF_LSM_MAC);
+	switch (ptype) {
+	case BPF_PROG_TYPE_TRACING:
+		if (eatype == BPF_TRACE_FENTRY || eatype == BPF_TRACE_FEXIT ||
+		    eatype == BPF_MODIFY_RETURN || eatype == BPF_TRACE_FSESSION)
+			return true;
+		return false;
+	case BPF_PROG_TYPE_LSM:
+		return eatype == BPF_LSM_MAC;
+	default:
+		return false;
+	}
 }
 
 void bpf_image_ksym_init(void *data, unsigned int size, struct bpf_ksym *ksym)
@@ -559,6 +566,8 @@ static enum bpf_tramp_prog_type bpf_attach_type_to_tramp(struct bpf_prog *prog)
 		return BPF_TRAMP_MODIFY_RETURN;
 	case BPF_TRACE_FEXIT:
 		return BPF_TRAMP_FEXIT;
+	case BPF_TRACE_FSESSION:
+		return BPF_TRAMP_FSESSION;
 	case BPF_LSM_MAC:
 		if (!prog->aux->attach_func_proto->type)
 			/* The function returns void, we cannot modify its
@@ -594,8 +603,10 @@ static int __bpf_trampoline_link_prog(struct bpf_tramp_link *link,
 				      struct bpf_trampoline *tr,
 				      struct bpf_prog *tgt_prog)
 {
+	struct bpf_fsession_link *fslink = NULL;
 	enum bpf_tramp_prog_type kind;
 	struct bpf_tramp_link *link_exiting;
+	struct hlist_head *prog_list;
 	int err = 0;
 	int cnt = 0, i;
 
@@ -621,24 +632,43 @@ static int __bpf_trampoline_link_prog(struct bpf_tramp_link *link,
 					  BPF_MOD_JUMP, NULL,
 					  link->link.prog->bpf_func);
 	}
+	if (kind == BPF_TRAMP_FSESSION) {
+		prog_list = &tr->progs_hlist[BPF_TRAMP_FENTRY];
+		cnt++;
+	} else {
+		prog_list = &tr->progs_hlist[kind];
+	}
 	if (cnt >= BPF_MAX_TRAMP_LINKS)
 		return -E2BIG;
 	if (!hlist_unhashed(&link->tramp_hlist))
 		/* prog already linked */
 		return -EBUSY;
-	hlist_for_each_entry(link_exiting, &tr->progs_hlist[kind], tramp_hlist) {
+	hlist_for_each_entry(link_exiting, prog_list, tramp_hlist) {
 		if (link_exiting->link.prog != link->link.prog)
 			continue;
 		/* prog already linked */
 		return -EBUSY;
 	}
 
-	hlist_add_head(&link->tramp_hlist, &tr->progs_hlist[kind]);
-	tr->progs_cnt[kind]++;
+	hlist_add_head(&link->tramp_hlist, prog_list);
+	if (kind == BPF_TRAMP_FSESSION) {
+		tr->progs_cnt[BPF_TRAMP_FENTRY]++;
+		fslink = container_of(link, struct bpf_fsession_link, link.link);
+		hlist_add_head(&fslink->fexit.tramp_hlist, &tr->progs_hlist[BPF_TRAMP_FEXIT]);
+		tr->progs_cnt[BPF_TRAMP_FEXIT]++;
+	} else {
+		tr->progs_cnt[kind]++;
+	}
 	err = bpf_trampoline_update(tr, true /* lock_direct_mutex */);
 	if (err) {
 		hlist_del_init(&link->tramp_hlist);
-		tr->progs_cnt[kind]--;
+		if (kind == BPF_TRAMP_FSESSION) {
+			tr->progs_cnt[BPF_TRAMP_FENTRY]--;
+			hlist_del_init(&fslink->fexit.tramp_hlist);
+			tr->progs_cnt[BPF_TRAMP_FEXIT]--;
+		} else {
+			tr->progs_cnt[kind]--;
+		}
 	}
 	return err;
 }
@@ -672,6 +702,13 @@ static int __bpf_trampoline_unlink_prog(struct bpf_tramp_link *link,
 		guard(mutex)(&tgt_prog->aux->ext_mutex);
 		tgt_prog->aux->is_extended = false;
 		return err;
+	} else if (kind == BPF_TRAMP_FSESSION) {
+		struct bpf_fsession_link *fslink =
+			container_of(link, struct bpf_fsession_link, link.link);
+
+		hlist_del_init(&fslink->fexit.tramp_hlist);
+		tr->progs_cnt[BPF_TRAMP_FEXIT]--;
+		kind = BPF_TRAMP_FENTRY;
 	}
 	hlist_del_init(&link->tramp_hlist);
 	tr->progs_cnt[kind]--;
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index c7f5234d5fd2..41bbed6418b5 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -17848,6 +17848,7 @@ static int check_return_code(struct bpf_verifier_env *env, int regno, const char
 		switch (env->prog->expected_attach_type) {
 		case BPF_TRACE_FENTRY:
 		case BPF_TRACE_FEXIT:
+		case BPF_TRACE_FSESSION:
 			range = retval_range(0, 0);
 			break;
 		case BPF_TRACE_RAW_TP:
@@ -23774,6 +23775,7 @@ patch_map_ops_generic:
 		if (prog_type == BPF_PROG_TYPE_TRACING &&
 		    insn->imm == BPF_FUNC_get_func_ret) {
 			if (eatype == BPF_TRACE_FEXIT ||
+			    eatype == BPF_TRACE_FSESSION ||
 			    eatype == BPF_MODIFY_RETURN) {
 				/* Load nr_args from ctx - 8 */
 				insn_buf[0] = BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_1, -8);
@@ -24725,7 +24727,8 @@ int bpf_check_attach_target(struct bpf_verifier_log *log,
 		if (tgt_prog->type == BPF_PROG_TYPE_TRACING &&
 		    prog_extension &&
 		    (tgt_prog->expected_attach_type == BPF_TRACE_FENTRY ||
-		     tgt_prog->expected_attach_type == BPF_TRACE_FEXIT)) {
+		     tgt_prog->expected_attach_type == BPF_TRACE_FEXIT ||
+		     tgt_prog->expected_attach_type == BPF_TRACE_FSESSION)) {
 			/* Program extensions can extend all program types
 			 * except fentry/fexit. The reason is the following.
 			 * The fentry/fexit programs are used for performance
@@ -24740,7 +24743,7 @@ int bpf_check_attach_target(struct bpf_verifier_log *log,
 			 * beyond reasonable stack size. Hence extending fentry
 			 * is not allowed.
 			 */
-			bpf_log(log, "Cannot extend fentry/fexit\n");
+			bpf_log(log, "Cannot extend fentry/fexit/fsession\n");
 			return -EINVAL;
 		}
 	} else {
@@ -24824,6 +24827,7 @@ int bpf_check_attach_target(struct bpf_verifier_log *log,
 	case BPF_LSM_CGROUP:
 	case BPF_TRACE_FENTRY:
 	case BPF_TRACE_FEXIT:
+	case BPF_TRACE_FSESSION:
 		if (!btf_type_is_func(t)) {
 			bpf_log(log, "attach_btf_id %u is not a function\n",
 				btf_id);
@@ -24990,6 +24994,7 @@ static bool can_be_sleepable(struct bpf_prog *prog)
 		case BPF_TRACE_FEXIT:
 		case BPF_MODIFY_RETURN:
 		case BPF_TRACE_ITER:
+		case BPF_TRACE_FSESSION:
 			return true;
 		default:
 			return false;
@@ -25071,9 +25076,10 @@ static int check_attach_btf_id(struct bpf_verifier_env *env)
 			tgt_info.tgt_name);
 		return -EINVAL;
 	} else if ((prog->expected_attach_type == BPF_TRACE_FEXIT ||
+		   prog->expected_attach_type == BPF_TRACE_FSESSION ||
 		   prog->expected_attach_type == BPF_MODIFY_RETURN) &&
 		   btf_id_set_contains(&noreturn_deny, btf_id)) {
-		verbose(env, "Attaching fexit/fmod_ret to __noreturn function '%s' is rejected.\n",
+		verbose(env, "Attaching fexit/fsession/fmod_ret to __noreturn function '%s' is rejected.\n",
 			tgt_info.tgt_name);
 		return -EINVAL;
 	}
diff --git a/net/bpf/test_run.c b/net/bpf/test_run.c
index 26cfcfdc45eb..178c4738e63b 100644
--- a/net/bpf/test_run.c
+++ b/net/bpf/test_run.c
@@ -685,6 +685,7 @@ int bpf_prog_test_run_tracing(struct bpf_prog *prog,
 	switch (prog->expected_attach_type) {
 	case BPF_TRACE_FENTRY:
 	case BPF_TRACE_FEXIT:
+	case BPF_TRACE_FSESSION:
 		if (bpf_fentry_test1(1) != 2 ||
 		    bpf_fentry_test2(2, 3) != 5 ||
 		    bpf_fentry_test3(4, 5, 6) != 15 ||
diff --git a/net/core/bpf_sk_storage.c b/net/core/bpf_sk_storage.c
index 850dd736ccd1..de111818f3a0 100644
--- a/net/core/bpf_sk_storage.c
+++ b/net/core/bpf_sk_storage.c
@@ -365,6 +365,7 @@ static bool bpf_sk_storage_tracing_allowed(const struct bpf_prog *prog)
 		return true;
 	case BPF_TRACE_FENTRY:
 	case BPF_TRACE_FEXIT:
+	case BPF_TRACE_FSESSION:
 		return !!strncmp(prog->aux->attach_func_name, "bpf_sk_storage",
 				 strlen("bpf_sk_storage"));
 	default:
diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index b816bc53d2e1..3ca7d76e05f0 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -1145,6 +1145,7 @@ enum bpf_attach_type {
 	BPF_NETKIT_PEER,
 	BPF_TRACE_KPROBE_SESSION,
 	BPF_TRACE_UPROBE_SESSION,
+	BPF_TRACE_FSESSION,
 	__MAX_BPF_ATTACH_TYPE
 };
 
diff --git a/tools/testing/selftests/bpf/prog_tests/tracing_failure.c b/tools/testing/selftests/bpf/prog_tests/tracing_failure.c
index 10e231965589..f9f9e1cb87bf 100644
--- a/tools/testing/selftests/bpf/prog_tests/tracing_failure.c
+++ b/tools/testing/selftests/bpf/prog_tests/tracing_failure.c
@@ -73,7 +73,7 @@ static void test_tracing_deny(void)
 static void test_fexit_noreturns(void)
 {
 	test_tracing_fail_prog("fexit_noreturns",
-			       "Attaching fexit/fmod_ret to __noreturn function 'do_exit' is rejected.");
+			       "Attaching fexit/fsession/fmod_ret to __noreturn function 'do_exit' is rejected.");
 }
 
 void test_tracing_failure(void)
-- 
cgit v1.2.3


From 0ac903d1bfdce8ff40657c2b7d996947b72b6645 Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Tue, 9 Dec 2025 19:28:50 -0500
Subject: NFS: NFSERR_INVAL is not defined by NFSv2

A documenting comment in include/uapi/linux/nfs.h claims incorrectly
that NFSv2 defines NFSERR_INVAL. There is no such definition in either
RFC 1094 or https://pubs.opengroup.org/onlinepubs/9629799/chap7.htm

NFS3ERR_INVAL is introduced in RFC 1813.

NFSD returns NFSERR_INVAL for PROC_GETACL, which has no
specification (yet).

However, nfsd_map_status() maps nfserr_symlink and nfserr_wrong_type
to nfserr_inval, which does not align with RFC 1094. This logic was
introduced only recently by commit 438f81e0e92a ("nfsd: move error
choice for incorrect object types to version-specific code."). Given
that we have no INVAL or SERVERFAULT status in NFSv2, probably the
only choice is NFSERR_IO.

Fixes: 438f81e0e92a ("nfsd: move error choice for incorrect object types to version-specific code.")
Reviewed-by: NeilBrown <neil@brown.name>
Reviewed-by: Jeff Layton <jlayton@kernel.org>
Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
---
 fs/nfsd/nfs2acl.c        | 2 +-
 fs/nfsd/nfsproc.c        | 2 +-
 include/uapi/linux/nfs.h | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/fs/nfsd/nfs2acl.c b/fs/nfsd/nfs2acl.c
index 5fb202acb0fd..0ac538c76180 100644
--- a/fs/nfsd/nfs2acl.c
+++ b/fs/nfsd/nfs2acl.c
@@ -45,7 +45,7 @@ static __be32 nfsacld_proc_getacl(struct svc_rqst *rqstp)
 	inode = d_inode(fh->fh_dentry);
 
 	if (argp->mask & ~NFS_ACL_MASK) {
-		resp->status = nfserr_inval;
+		resp->status = nfserr_io;
 		goto out;
 	}
 	resp->mask = argp->mask;
diff --git a/fs/nfsd/nfsproc.c b/fs/nfsd/nfsproc.c
index 481e789a7697..8873033d1e82 100644
--- a/fs/nfsd/nfsproc.c
+++ b/fs/nfsd/nfsproc.c
@@ -33,7 +33,7 @@ static __be32 nfsd_map_status(__be32 status)
 		break;
 	case nfserr_symlink:
 	case nfserr_wrong_type:
-		status = nfserr_inval;
+		status = nfserr_io;
 		break;
 	}
 	return status;
diff --git a/include/uapi/linux/nfs.h b/include/uapi/linux/nfs.h
index 71c7196d3281..e629c4953534 100644
--- a/include/uapi/linux/nfs.h
+++ b/include/uapi/linux/nfs.h
@@ -55,7 +55,7 @@
 	NFSERR_NODEV = 19,		/* v2 v3 v4 */
 	NFSERR_NOTDIR = 20,		/* v2 v3 v4 */
 	NFSERR_ISDIR = 21,		/* v2 v3 v4 */
-	NFSERR_INVAL = 22,		/* v2 v3 v4 */
+	NFSERR_INVAL = 22,		/*    v3 v4 */
 	NFSERR_FBIG = 27,		/* v2 v3 v4 */
 	NFSERR_NOSPC = 28,		/* v2 v3 v4 */
 	NFSERR_ROFS = 30,		/* v2 v3 v4 */
-- 
cgit v1.2.3


From 1965bbb8f3c72e5f1972b5eeb6f19a36664a676d Mon Sep 17 00:00:00 2001
From: Thomas Weißschuh <thomas.weissschuh@linutronix.de>
Date: Mon, 22 Dec 2025 08:55:10 +0100
Subject: ipc/shm: uapi: remove dependency on libc
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Using libc types and headers from the UAPI headers is problematic as it
introduces a dependency on a full C toolchain.  shm.h does not even use
any symbols from the libc header as the usage of getpagesize() was removed
a decade ago in commit 060028bac94b ("ipc/shm.c: increase the defaults for
SHMALL, SHMMAX").

Drop the unnecessary inclusion.

Link: https://lkml.kernel.org/r/20251222-uapi-shm-v1-1-270bb7f75d97@linutronix.de
Signed-off-by: Thomas Weißschuh <thomas.weissschuh@linutronix.de>
Cc: Arnd Bergmann <arnd@arndb.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/uapi/linux/shm.h | 3 ---
 1 file changed, 3 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/shm.h b/include/uapi/linux/shm.h
index 8d1f17a4e08e..7269f9f402e3 100644
--- a/include/uapi/linux/shm.h
+++ b/include/uapi/linux/shm.h
@@ -5,9 +5,6 @@
 #include <linux/ipc.h>
 #include <linux/errno.h>
 #include <asm-generic/hugetlb_encode.h>
-#ifndef __KERNEL__
-#include <unistd.h>
-#endif
 
 /*
  * SHMMNI, SHMMAX and SHMALL are default upper limits which can be
-- 
cgit v1.2.3


From 3d702678f57edc524f73a7865382ae304269f590 Mon Sep 17 00:00:00 2001
From: Jinjiang Tu <tujinjiang@huawei.com>
Date: Tue, 23 Dec 2025 19:05:23 +0800
Subject: mm/mempolicy: fix mpol_rebind_nodemask() for MPOL_F_NUMA_BALANCING

commit bda420b98505 ("numa balancing: migrate on fault among multiple
bound nodes") adds new flag MPOL_F_NUMA_BALANCING to enable NUMA balancing
for MPOL_BIND memory policy.

When the cpuset of tasks changes, the mempolicy of the task is rebound by
mpol_rebind_nodemask().  When MPOL_F_STATIC_NODES and
MPOL_F_RELATIVE_NODES are both not set, the behaviour of rebinding should
be the same whether MPOL_F_NUMA_BALANCING is set or not.  So, when an
application calls set_mempolicy() with MPOL_F_NUMA_BALANCING set but both
MPOL_F_STATIC_NODES and MPOL_F_RELATIVE_NODES cleared,
mempolicy.w.cpuset_mems_allowed should be set to
cpuset_current_mems_allowed nodemask.  However, in the current implementation,
mpol_store_user_nodemask() wrongly returns true, causing
mempolicy->w.user_nodemask to be incorrectly set to the user-specified
nodemask.  Later, when the cpuset of the application changes,
mpol_rebind_nodemask() ends up rebinding based on the user-specified
nodemask rather than the cpuset_mems_allowed nodemask as intended.

I can reproduce with the following steps in qemu with 4 NUMA nodes:
1. echo '+cpuset' > /sys/fs/cgroup/cgroup.subtree_control
2. mkdir /sys/fs/cgroup/test
3. ./reproducer &
4. cat /proc/$pid/numa_maps, the task is bound to NUMA 1
5. echo $pid > /sys/fs/cgroup/test/cgroup.procs
6. cat /proc/$pid/numa_maps, the task is bound to NUMA 0 now.

The reproducer code:

int main()
{
        struct bitmask *bmp;
        int ret;

        bmp = numa_parse_nodestring("1");
        ret = set_mempolicy(MPOL_BIND | MPOL_F_NUMA_BALANCING,
                bmp->maskp, bmp->size + 1);
        if (ret < 0) {
                perror("Failed to call set_mempolicy");
                exit(-1);
        }

        while (1);
        return 0;
}

If I call set_mempolicy() without MPOL_F_NUMA_BALANCING in the reproducer
code, the task is still bound to NUMA 1 after step 5.

To fix this, only set mempolicy->w.user_nodemask to the user-specified
nodemask if MPOL_F_STATIC_NODES or MPOL_F_RELATIVE_NODES is present.

Link: https://lkml.kernel.org/r/20260120011018.1256654-1-tujinjiang@huawei.com
Link: https://lkml.kernel.org/r/20251223110523.1161421-1-tujinjiang@huawei.com
Fixes: bda420b98505 ("numa balancing: migrate on fault among multiple bound nodes")
Signed-off-by: Jinjiang Tu <tujinjiang@huawei.com>
Reviewed-by: Gregory Price <gourry@gourry.net>
Reviewed-by: Huang Ying <ying.huang@linux.alibaba.com>
Acked-by: David Hildenbrand (Red Hat) <david@kernel.org>
Cc: Alistair Popple <apopple@nvidia.com>
Cc: Byungchul Park <byungchul@sk.com>
Cc: Joshua Hahn <joshua.hahnjy@gmail.com>
Cc: Kefeng Wang <wangkefeng.wang@huawei.com>
Cc: Mathew Brost <matthew.brost@intel.com>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Rakie Kim <rakie.kim@sk.com>
Cc: Zi Yan <ziy@nvidia.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/uapi/linux/mempolicy.h | 3 +++
 mm/mempolicy.c                 | 2 +-
 2 files changed, 4 insertions(+), 1 deletion(-)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/mempolicy.h b/include/uapi/linux/mempolicy.h
index 8fbbe613611a..6c962d866e86 100644
--- a/include/uapi/linux/mempolicy.h
+++ b/include/uapi/linux/mempolicy.h
@@ -39,6 +39,9 @@ enum {
 #define MPOL_MODE_FLAGS							\
 	(MPOL_F_STATIC_NODES | MPOL_F_RELATIVE_NODES | MPOL_F_NUMA_BALANCING)
 
+/* Whether the nodemask is specified by users */
+#define MPOL_USER_NODEMASK_FLAGS (MPOL_F_STATIC_NODES | MPOL_F_RELATIVE_NODES)
+
 /* Flags for get_mempolicy */
 #define MPOL_F_NODE	(1<<0)	/* return next IL mode instead of node mask */
 #define MPOL_F_ADDR	(1<<1)	/* look up vma using address */
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 68a98ba57882..76da50425712 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -365,7 +365,7 @@ static const struct mempolicy_operations {
 
 static inline int mpol_store_user_nodemask(const struct mempolicy *pol)
 {
-	return pol->flags & MPOL_MODE_FLAGS;
+	return pol->flags & MPOL_USER_NODEMASK_FLAGS;
 }
 
 static void mpol_relative_nodemask(nodemask_t *ret, const nodemask_t *orig,
-- 
cgit v1.2.3


From 86c6b6e4d187652d718915e15cf126f98e24e955 Mon Sep 17 00:00:00 2001
From: Avraham Stern <avraham.stern@intel.com>
Date: Sun, 11 Jan 2026 19:03:48 +0200
Subject: wifi: nl80211/cfg80211: add new FTM capabilities

Add new capabilities to the PMSR FTM capabilities list. The new
capabilities include 6 GHz support, supported number of spatial streams
and supported number of LTF repetitions.

Signed-off-by: Avraham Stern <avraham.stern@intel.com>
Tested-by: Miriam Rachel Korenblit <miriam.rachel.korenblit@intel.com>
Signed-off-by: Miri Korenblit <miriam.rachel.korenblit@intel.com>
Link: https://patch.msgid.link/20260111190221.bf43785c18f6.Ic98cf9790ddee84bf88e5720b93c46c23af3c96c@changeid
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/net/cfg80211.h       | 20 +++++++++++++++++++-
 include/uapi/linux/nl80211.h | 29 +++++++++++++++++++++++++++++
 net/wireless/nl80211.c       | 23 +++++++++++++++++++++++
 3 files changed, 71 insertions(+), 1 deletion(-)

(limited to 'include/uapi/linux')

diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h
index 6d8e35a0dde4..8153b6aaa998 100644
--- a/include/net/cfg80211.h
+++ b/include/net/cfg80211.h
@@ -5643,6 +5643,17 @@ cfg80211_get_iftype_ext_capa(struct wiphy *wiphy, enum nl80211_iftype type);
  *	not limited)
  * @ftm.trigger_based: trigger based ranging measurement is supported
  * @ftm.non_trigger_based: non trigger based ranging measurement is supported
+ * @ftm.support_6ghz: supports ranging in 6 GHz band
+ * @ftm.max_tx_ltf_rep: maximum number of TX LTF repetitions supported (0 means
+ *	only one LTF, no repetitions)
+ * @ftm.max_rx_ltf_rep: maximum number of RX LTF repetitions supported (0 means
+ *	only one LTF, no repetitions)
+ * @ftm.max_tx_sts: maximum number of TX STS supported (zero based)
+ * @ftm.max_rx_sts: maximum number of RX STS supported (zero based)
+ * @ftm.max_total_ltf_tx: maximum total number of LTFs that can be transmitted
+ *	(0 means unknown)
+ * @ftm.max_total_ltf_rx: maximum total number of LTFs that can be received
+ *	(0 means unknown)
  */
 struct cfg80211_pmsr_capabilities {
 	unsigned int max_peers;
@@ -5660,7 +5671,14 @@ struct cfg80211_pmsr_capabilities {
 		   request_lci:1,
 		   request_civicloc:1,
 		   trigger_based:1,
-		   non_trigger_based:1;
+		   non_trigger_based:1,
+		   support_6ghz:1;
+		u8 max_tx_ltf_rep;
+		u8 max_rx_ltf_rep;
+		u8 max_tx_sts;
+		u8 max_rx_sts;
+		u8 max_total_ltf_tx;
+		u8 max_total_ltf_rx;
 	} ftm;
 };
 
diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h
index b0f050e36fa4..200703c8b2c1 100644
--- a/include/uapi/linux/nl80211.h
+++ b/include/uapi/linux/nl80211.h
@@ -7790,6 +7790,28 @@ enum nl80211_peer_measurement_attrs {
  *	trigger based ranging measurement is supported
  * @NL80211_PMSR_FTM_CAPA_ATTR_NON_TRIGGER_BASED: flag attribute indicating
  *	if non-trigger-based ranging measurement is supported
+ * @NL80211_PMSR_FTM_CAPA_ATTR_6GHZ_SUPPORT: flag attribute indicating if
+ *	ranging on the 6 GHz band is supported
+ * @NL80211_PMSR_FTM_CAPA_ATTR_MAX_TX_LTF_REP: u32 attribute indicating
+ *	the maximum number of LTF repetitions the device can transmit in the
+ *	preamble of the ranging NDP (zero means only one LTF, no repetitions)
+ * @NL80211_PMSR_FTM_CAPA_ATTR_MAX_RX_LTF_REP: u32 attribute indicating
+ *	the maximum number of LTF repetitions the device can receive in the
+ *	preamble of the ranging NDP (zero means only one LTF, no repetitions)
+ * @NL80211_PMSR_FTM_CAPA_ATTR_MAX_TX_STS: u32 attribute indicating
+ *	the maximum number of space-time streams supported for ranging NDP TX
+ *	(zero-based)
+ * @NL80211_PMSR_FTM_CAPA_ATTR_MAX_RX_STS: u32 attribute indicating
+ *	the maximum number of space-time streams supported for ranging NDP RX
+ *	(zero-based)
+ * @NL80211_PMSR_FTM_CAPA_ATTR_MAX_TOTAL_LTF_TX: u32 attribute indicating the
+ *	maximum total number of LTFs the device can transmit. The total number
+ *	of LTFs is (number of LTF repetitions) * (number of space-time streams).
+ *	This limits the allowed combinations of LTF repetitions and STS.
+ * @NL80211_PMSR_FTM_CAPA_ATTR_MAX_TOTAL_LTF_RX: u32 attribute indicating the
+ *	maximum total number of LTFs the device can receive. The total number
+ *	of LTFs is (number of LTF repetitions) * (number of space-time streams).
+ *	This limits the allowed combinations of LTF repetitions and STS.
  *
  * @NUM_NL80211_PMSR_FTM_CAPA_ATTR: internal
  * @NL80211_PMSR_FTM_CAPA_ATTR_MAX: highest attribute number
@@ -7807,6 +7829,13 @@ enum nl80211_peer_measurement_ftm_capa {
 	NL80211_PMSR_FTM_CAPA_ATTR_MAX_FTMS_PER_BURST,
 	NL80211_PMSR_FTM_CAPA_ATTR_TRIGGER_BASED,
 	NL80211_PMSR_FTM_CAPA_ATTR_NON_TRIGGER_BASED,
+	NL80211_PMSR_FTM_CAPA_ATTR_6GHZ_SUPPORT,
+	NL80211_PMSR_FTM_CAPA_ATTR_MAX_TX_LTF_REP,
+	NL80211_PMSR_FTM_CAPA_ATTR_MAX_RX_LTF_REP,
+	NL80211_PMSR_FTM_CAPA_ATTR_MAX_TX_STS,
+	NL80211_PMSR_FTM_CAPA_ATTR_MAX_RX_STS,
+	NL80211_PMSR_FTM_CAPA_ATTR_MAX_TOTAL_LTF_TX,
+	NL80211_PMSR_FTM_CAPA_ATTR_MAX_TOTAL_LTF_RX,
 
 	/* keep last */
 	NUM_NL80211_PMSR_FTM_CAPA_ATTR,
diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c
index 56cc5ed33ea3..74ea922a5e8a 100644
--- a/net/wireless/nl80211.c
+++ b/net/wireless/nl80211.c
@@ -2313,6 +2313,29 @@ nl80211_send_pmsr_ftm_capa(const struct cfg80211_pmsr_capabilities *cap,
 	if (cap->ftm.non_trigger_based &&
 	    nla_put_flag(msg, NL80211_PMSR_FTM_CAPA_ATTR_NON_TRIGGER_BASED))
 		return -ENOBUFS;
+	if (cap->ftm.support_6ghz &&
+	    nla_put_flag(msg, NL80211_PMSR_FTM_CAPA_ATTR_6GHZ_SUPPORT))
+		return -ENOBUFS;
+	if (nla_put_u32(msg, NL80211_PMSR_FTM_CAPA_ATTR_MAX_TX_LTF_REP,
+			cap->ftm.max_tx_ltf_rep))
+		return -ENOBUFS;
+	if (nla_put_u32(msg, NL80211_PMSR_FTM_CAPA_ATTR_MAX_RX_LTF_REP,
+			cap->ftm.max_rx_ltf_rep))
+		return -ENOBUFS;
+	if (nla_put_u32(msg, NL80211_PMSR_FTM_CAPA_ATTR_MAX_TX_STS,
+			cap->ftm.max_tx_sts))
+		return -ENOBUFS;
+	if (nla_put_u32(msg, NL80211_PMSR_FTM_CAPA_ATTR_MAX_RX_STS,
+			cap->ftm.max_rx_sts))
+		return -ENOBUFS;
+	if (cap->ftm.max_total_ltf_tx > 0 &&
+	    nla_put_u32(msg, NL80211_PMSR_FTM_CAPA_ATTR_MAX_TOTAL_LTF_TX,
+			cap->ftm.max_total_ltf_tx))
+		return -ENOBUFS;
+	if (cap->ftm.max_total_ltf_rx > 0 &&
+	    nla_put_u32(msg, NL80211_PMSR_FTM_CAPA_ATTR_MAX_TOTAL_LTF_RX,
+			cap->ftm.max_total_ltf_rx))
+		return -ENOBUFS;
 
 	nla_nest_end(msg, ftm);
 	return 0;
-- 
cgit v1.2.3


From 853ce6943c385be2f6cccf371080e592f2e08b0f Mon Sep 17 00:00:00 2001
From: Avraham Stern <avraham.stern@intel.com>
Date: Sun, 11 Jan 2026 19:03:49 +0200
Subject: wifi: nl80211/cfg80211: clarify periodic FTM parameters for non-EDCA
 based ranging

Periodic FTM request attributes are defined based on the periodic
parameters used in EDCA-based ranging negotiation. However, non-EDCA
based ranging (trigger-based/non-trigger-based) does not include
periodic parameters in the negotiation protocol, even though upper
layers may still request periodic measurements.

Clarify the semantics of periodic ranging attributes when used with
non-EDCA based ranging.

Signed-off-by: Avraham Stern <avraham.stern@intel.com>
Reviewed-by: Johannes Berg <johannes.berg@intel.com>
Signed-off-by: Miri Korenblit <miriam.rachel.korenblit@intel.com>
Link: https://patch.msgid.link/20260111190221.b89cb3f68e1a.I7a9d8c6d1c66c77f1b43120a841101c96c3f19ad@changeid
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/net/cfg80211.h       |  4 +++-
 include/uapi/linux/nl80211.h |  7 +++++--
 net/wireless/pmsr.c          | 11 ++++++-----
 3 files changed, 14 insertions(+), 8 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h
index 8153b6aaa998..8a81adbf3723 100644
--- a/include/net/cfg80211.h
+++ b/include/net/cfg80211.h
@@ -4295,7 +4295,9 @@ struct cfg80211_pmsr_result {
  * @burst_period: burst period to use
  * @asap: indicates to use ASAP mode
  * @num_bursts_exp: number of bursts exponent
- * @burst_duration: burst duration
+ * @burst_duration: burst duration. If @trigger_based or @non_trigger_based is
+ *	set, this is the burst duration in milliseconds, and zero means the
+ *	device should pick an appropriate value based on @ftms_per_burst.
  * @ftms_per_burst: number of FTMs per burst
  * @ftmr_retries: number of retries for FTM request
  * @request_lci: request LCI information
diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h
index 200703c8b2c1..71219445f5c7 100644
--- a/include/uapi/linux/nl80211.h
+++ b/include/uapi/linux/nl80211.h
@@ -7851,12 +7851,15 @@ enum nl80211_peer_measurement_ftm_capa {
  *	&enum nl80211_preamble), optional for DMG (u32)
  * @NL80211_PMSR_FTM_REQ_ATTR_NUM_BURSTS_EXP: number of bursts exponent as in
  *	802.11-2016 9.4.2.168 "Fine Timing Measurement Parameters element"
- *	(u8, 0-15, optional with default 15 i.e. "no preference")
+ *	(u8, 0-15, optional with default 15 i.e. "no preference". No limit for
+ *	 non-EDCA ranging)
  * @NL80211_PMSR_FTM_REQ_ATTR_BURST_PERIOD: interval between bursts in units
  *	of 100ms (u16, optional with default 0)
  * @NL80211_PMSR_FTM_REQ_ATTR_BURST_DURATION: burst duration, as in 802.11-2016
  *	Table 9-257 "Burst Duration field encoding" (u8, 0-15, optional with
- *	default 15 i.e. "no preference")
+ *	default 15 i.e. "no preference"). For non-EDCA ranging, this is the
+ *	burst duration in milliseconds (optional with default 0, i.e. let the
+ *	device decide).
  * @NL80211_PMSR_FTM_REQ_ATTR_FTMS_PER_BURST: number of successful FTM frames
  *	requested per burst
  *	(u8, 0-31, optional with default 0 i.e. "no preference")
diff --git a/net/wireless/pmsr.c b/net/wireless/pmsr.c
index a117f5093ca2..795683a81303 100644
--- a/net/wireless/pmsr.c
+++ b/net/wireless/pmsr.c
@@ -85,11 +85,6 @@ static int pmsr_parse_ftm(struct cfg80211_registered_device *rdev,
 		return -EINVAL;
 	}
 
-	out->ftm.burst_duration = 15;
-	if (tb[NL80211_PMSR_FTM_REQ_ATTR_BURST_DURATION])
-		out->ftm.burst_duration =
-			nla_get_u8(tb[NL80211_PMSR_FTM_REQ_ATTR_BURST_DURATION]);
-
 	out->ftm.ftms_per_burst = 0;
 	if (tb[NL80211_PMSR_FTM_REQ_ATTR_FTMS_PER_BURST])
 		out->ftm.ftms_per_burst =
@@ -164,6 +159,12 @@ static int pmsr_parse_ftm(struct cfg80211_registered_device *rdev,
 		return -EINVAL;
 	}
 
+	if (tb[NL80211_PMSR_FTM_REQ_ATTR_BURST_DURATION])
+		out->ftm.burst_duration =
+			nla_get_u8(tb[NL80211_PMSR_FTM_REQ_ATTR_BURST_DURATION]);
+	else if (!out->ftm.non_trigger_based && !out->ftm.trigger_based)
+		out->ftm.burst_duration = 15;
+
 	out->ftm.lmr_feedback =
 		!!tb[NL80211_PMSR_FTM_REQ_ATTR_LMR_FEEDBACK];
 	if (!out->ftm.trigger_based && !out->ftm.non_trigger_based &&
-- 
cgit v1.2.3


From cfd46d1c6f4bf232c5630b1cf5c8b317d38101c5 Mon Sep 17 00:00:00 2001
From: Avraham Stern <avraham.stern@intel.com>
Date: Sun, 11 Jan 2026 19:03:50 +0200
Subject: wifi: nl80211/cfg80211: add negotiated burst period to FTM result

The FTM result includes some of the periodic measurement negotiated
parameters (like the burst duration and number of bursts), but it
doesn't include the burst period. Add it to the FTM result
notification.

Signed-off-by: Avraham Stern <avraham.stern@intel.com>
Reviewed-by: Johannes Berg <johannes.berg@intel.com>
Signed-off-by: Miri Korenblit <miriam.rachel.korenblit@intel.com>
Link: https://patch.msgid.link/20260111190221.e0778f86edef.I3c98c1933eb639963bc3ffdef81a8788b59f2188@changeid
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/net/cfg80211.h       | 2 ++
 include/uapi/linux/nl80211.h | 3 +++
 net/wireless/pmsr.c          | 1 +
 3 files changed, 6 insertions(+)

(limited to 'include/uapi/linux')

diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h
index 8a81adbf3723..535fd95b0d83 100644
--- a/include/net/cfg80211.h
+++ b/include/net/cfg80211.h
@@ -4192,6 +4192,7 @@ struct cfg80211_ftm_responder_stats {
  * @num_bursts_exp: actual number of bursts exponent negotiated
  * @burst_duration: actual burst duration negotiated
  * @ftms_per_burst: actual FTMs per burst negotiated
+ * @burst_period: actual burst period negotiated in units of 100ms
  * @lci_len: length of LCI information (if present)
  * @civicloc_len: length of civic location information (if present)
  * @lci: LCI data (may be %NULL)
@@ -4233,6 +4234,7 @@ struct cfg80211_pmsr_ftm_result {
 	u8 num_bursts_exp;
 	u8 burst_duration;
 	u8 ftms_per_burst;
+	u16 burst_period;
 	s32 rssi_avg;
 	s32 rssi_spread;
 	struct rate_info tx_rate, rx_rate;
diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h
index 71219445f5c7..8910b709bfb1 100644
--- a/include/uapi/linux/nl80211.h
+++ b/include/uapi/linux/nl80211.h
@@ -7992,6 +7992,8 @@ enum nl80211_peer_measurement_ftm_failure_reasons {
  *	9.4.2.22.1) starting with the Measurement Token, with Measurement
  *	Type 11.
  * @NL80211_PMSR_FTM_RESP_ATTR_PAD: ignore, for u64/s64 padding only
+ * @NL80211_PMSR_FTM_RESP_ATTR_BURST_PERIOD: actual burst period used by
+ *	the responder (similar to request, u16)
  *
  * @NUM_NL80211_PMSR_FTM_RESP_ATTR: internal
  * @NL80211_PMSR_FTM_RESP_ATTR_MAX: highest attribute number
@@ -8020,6 +8022,7 @@ enum nl80211_peer_measurement_ftm_resp {
 	NL80211_PMSR_FTM_RESP_ATTR_LCI,
 	NL80211_PMSR_FTM_RESP_ATTR_CIVICLOC,
 	NL80211_PMSR_FTM_RESP_ATTR_PAD,
+	NL80211_PMSR_FTM_RESP_ATTR_BURST_PERIOD,
 
 	/* keep last */
 	NUM_NL80211_PMSR_FTM_RESP_ATTR,
diff --git a/net/wireless/pmsr.c b/net/wireless/pmsr.c
index 795683a81303..d5077d320098 100644
--- a/net/wireless/pmsr.c
+++ b/net/wireless/pmsr.c
@@ -454,6 +454,7 @@ static int nl80211_pmsr_send_ftm_res(struct sk_buff *msg,
 	PUT(u8, NUM_BURSTS_EXP, num_bursts_exp);
 	PUT(u8, BURST_DURATION, burst_duration);
 	PUT(u8, FTMS_PER_BURST, ftms_per_burst);
+	PUT(u16, BURST_PERIOD, burst_period);
 	PUTOPT(s32, RSSI_AVG, rssi_avg);
 	PUTOPT(s32, RSSI_SPREAD, rssi_spread);
 	if (res->ftm.tx_rate_valid &&
-- 
cgit v1.2.3


From 853800c746d38486673ef67f461b660a01d52716 Mon Sep 17 00:00:00 2001
From: Avraham Stern <avraham.stern@intel.com>
Date: Sun, 11 Jan 2026 19:03:51 +0200
Subject: wifi: nl80211/cfg80211: support operating as RSTA in PMSR FTM request

Add an option to operate as the RSTA in an FTM measurement request.
When requested, the device will dwell on the requested channel until
the peer starts the FTM negotiation. This option is only valid for
trigger-based/non trigger-based measurement with LMR feedback which
will allow the RSTA to receive the results of the measurement.

Signed-off-by: Avraham Stern <avraham.stern@intel.com>
Reviewed-by: Johannes Berg <johannes.berg@intel.com>
Signed-off-by: Miri Korenblit <miriam.rachel.korenblit@intel.com>
Link: https://patch.msgid.link/20260111190221.1f95fc0afab4.Iae2d32783b8e7c4a29089fec0f4c6bce94d303cc@changeid
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/net/cfg80211.h       |  7 ++++++-
 include/uapi/linux/nl80211.h | 12 ++++++++++++
 net/wireless/nl80211.c       |  4 ++++
 net/wireless/pmsr.c          | 15 +++++++++++++++
 4 files changed, 37 insertions(+), 1 deletion(-)

(limited to 'include/uapi/linux')

diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h
index 535fd95b0d83..ac7df439bd24 100644
--- a/include/net/cfg80211.h
+++ b/include/net/cfg80211.h
@@ -4312,6 +4312,8 @@ struct cfg80211_pmsr_result {
  *		 EDCA based ranging will be used.
  * @lmr_feedback: negotiate for I2R LMR feedback. Only valid if either
  *		 @trigger_based or @non_trigger_based is set.
+ * @rsta: Operate as the RSTA in the measurement. Only valid if @lmr_feedback
+ *	and either @trigger_based or @non_trigger_based is set.
  * @bss_color: the bss color of the responder. Optional. Set to zero to
  *	indicate the driver should set the BSS color. Only valid if
  *	@non_trigger_based or @trigger_based is set.
@@ -4327,7 +4329,8 @@ struct cfg80211_pmsr_ftm_request_peer {
 	   request_civicloc:1,
 	   trigger_based:1,
 	   non_trigger_based:1,
-	   lmr_feedback:1;
+	   lmr_feedback:1,
+	   rsta:1;
 	u8 num_bursts_exp;
 	u8 burst_duration;
 	u8 ftms_per_burst;
@@ -5658,6 +5661,7 @@ cfg80211_get_iftype_ext_capa(struct wiphy *wiphy, enum nl80211_iftype type);
  *	(0 means unknown)
  * @ftm.max_total_ltf_rx: maximum total number of LTFs that can be received
  *	(0 means unknown)
+ * @ftm.support_rsta: supports operating as RSTA in PMSR FTM request
  */
 struct cfg80211_pmsr_capabilities {
 	unsigned int max_peers;
@@ -5683,6 +5687,7 @@ struct cfg80211_pmsr_capabilities {
 		u8 max_rx_sts;
 		u8 max_total_ltf_tx;
 		u8 max_total_ltf_rx;
+		u8 support_rsta:1;
 	} ftm;
 };
 
diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h
index 8910b709bfb1..54ddbd9a5459 100644
--- a/include/uapi/linux/nl80211.h
+++ b/include/uapi/linux/nl80211.h
@@ -7812,6 +7812,8 @@ enum nl80211_peer_measurement_attrs {
  *	maximum total number of LTFs the device can receive. The total number
  *	of LTFs is (number of LTF repetitions) * (number of space-time streams).
  *	This limits the allowed combinations of LTF repetitions and STS.
+ * @NL80211_PMSR_FTM_CAPA_ATTR_RSTA_SUPPORT: flag attribute indicating the
+ *	device supports operating as the RSTA in PMSR FTM request
  *
  * @NUM_NL80211_PMSR_FTM_CAPA_ATTR: internal
  * @NL80211_PMSR_FTM_CAPA_ATTR_MAX: highest attribute number
@@ -7836,6 +7838,7 @@ enum nl80211_peer_measurement_ftm_capa {
 	NL80211_PMSR_FTM_CAPA_ATTR_MAX_RX_STS,
 	NL80211_PMSR_FTM_CAPA_ATTR_MAX_TOTAL_LTF_TX,
 	NL80211_PMSR_FTM_CAPA_ATTR_MAX_TOTAL_LTF_RX,
+	NL80211_PMSR_FTM_CAPA_ATTR_RSTA_SUPPORT,
 
 	/* keep last */
 	NUM_NL80211_PMSR_FTM_CAPA_ATTR,
@@ -7888,6 +7891,14 @@ enum nl80211_peer_measurement_ftm_capa {
  * @NL80211_PMSR_FTM_REQ_ATTR_BSS_COLOR: optional. The BSS color of the
  *	responder. Only valid if %NL80211_PMSR_FTM_REQ_ATTR_NON_TRIGGER_BASED
  *	or %NL80211_PMSR_FTM_REQ_ATTR_TRIGGER_BASED is set.
+ * @NL80211_PMSR_FTM_REQ_ATTR_RSTA: optional. Request to perform the measurement
+ *	as the RSTA (flag). When set, the device is expected to dwell on the
+ *	channel specified in %NL80211_PMSR_PEER_ATTR_CHAN until it receives the
+ *	FTM request from the peer or the timeout specified by
+ *	%NL80211_ATTR_TIMEOUT has expired.
+ *	Only valid if %NL80211_PMSR_FTM_REQ_ATTR_LMR_FEEDBACK is set (so the
+ *	RSTA will have the measurement results to report back in the FTM
+ *	response).
  *
  * @NUM_NL80211_PMSR_FTM_REQ_ATTR: internal
  * @NL80211_PMSR_FTM_REQ_ATTR_MAX: highest attribute number
@@ -7908,6 +7919,7 @@ enum nl80211_peer_measurement_ftm_req {
 	NL80211_PMSR_FTM_REQ_ATTR_NON_TRIGGER_BASED,
 	NL80211_PMSR_FTM_REQ_ATTR_LMR_FEEDBACK,
 	NL80211_PMSR_FTM_REQ_ATTR_BSS_COLOR,
+	NL80211_PMSR_FTM_REQ_ATTR_RSTA,
 
 	/* keep last */
 	NUM_NL80211_PMSR_FTM_REQ_ATTR,
diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c
index 74ea922a5e8a..85e30fda4c46 100644
--- a/net/wireless/nl80211.c
+++ b/net/wireless/nl80211.c
@@ -361,6 +361,7 @@ nl80211_pmsr_ftm_req_attr_policy[NL80211_PMSR_FTM_REQ_ATTR_MAX + 1] = {
 	[NL80211_PMSR_FTM_REQ_ATTR_NON_TRIGGER_BASED] = { .type = NLA_FLAG },
 	[NL80211_PMSR_FTM_REQ_ATTR_LMR_FEEDBACK] = { .type = NLA_FLAG },
 	[NL80211_PMSR_FTM_REQ_ATTR_BSS_COLOR] = { .type = NLA_U8 },
+	[NL80211_PMSR_FTM_REQ_ATTR_RSTA] = { .type = NLA_FLAG },
 };
 
 static const struct nla_policy
@@ -2336,6 +2337,9 @@ nl80211_send_pmsr_ftm_capa(const struct cfg80211_pmsr_capabilities *cap,
 	    nla_put_u32(msg, NL80211_PMSR_FTM_CAPA_ATTR_MAX_TOTAL_LTF_RX,
 			cap->ftm.max_total_ltf_rx))
 		return -ENOBUFS;
+	if (cap->ftm.support_rsta &&
+	    nla_put_flag(msg, NL80211_PMSR_FTM_CAPA_ATTR_RSTA_SUPPORT))
+		return -ENOBUFS;
 
 	nla_nest_end(msg, ftm);
 	return 0;
diff --git a/net/wireless/pmsr.c b/net/wireless/pmsr.c
index d5077d320098..60e1e31c2185 100644
--- a/net/wireless/pmsr.c
+++ b/net/wireless/pmsr.c
@@ -187,6 +187,21 @@ static int pmsr_parse_ftm(struct cfg80211_registered_device *rdev,
 			nla_get_u8(tb[NL80211_PMSR_FTM_REQ_ATTR_BSS_COLOR]);
 	}
 
+	out->ftm.rsta = !!tb[NL80211_PMSR_FTM_REQ_ATTR_RSTA];
+	if (out->ftm.rsta && !capa->ftm.support_rsta) {
+		NL_SET_ERR_MSG_ATTR(info->extack,
+				    tb[NL80211_PMSR_FTM_REQ_ATTR_RSTA],
+				    "FTM: RSTA not supported by device");
+		return -EOPNOTSUPP;
+	}
+
+	if (out->ftm.rsta && !out->ftm.lmr_feedback) {
+		NL_SET_ERR_MSG_ATTR(info->extack,
+				    tb[NL80211_PMSR_FTM_REQ_ATTR_RSTA],
+				    "FTM: RSTA set without LMR feedback");
+		return -EINVAL;
+	}
+
 	return 0;
 }
 
-- 
cgit v1.2.3


From 752b807028e63f1473b84eb1350e131eca5e5249 Mon Sep 17 00:00:00 2001
From: Matt Bobrowski <mattbobrowski@google.com>
Date: Tue, 27 Jan 2026 08:51:10 +0000
Subject: bpf: add new BPF_CGROUP_ITER_CHILDREN control option

Currently, the BPF cgroup iterator supports walking descendants in
either pre-order (BPF_CGROUP_ITER_DESCENDANTS_PRE) or post-order
(BPF_CGROUP_ITER_DESCENDANTS_POST). These modes perform an exhaustive
depth-first search (DFS) of the hierarchy. In scenarios where a BPF
program may need to inspect only the direct children of a given parent
cgroup, a full DFS is unnecessarily expensive.

This patch introduces a new BPF cgroup iterator control option,
BPF_CGROUP_ITER_CHILDREN. This control option restricts the traversal
to the immediate children of a specified parent cgroup, allowing for
more targeted and efficient iteration, particularly when exhaustive
depth-first search (DFS) traversal is not required.

Signed-off-by: Matt Bobrowski <mattbobrowski@google.com>
Link: https://lore.kernel.org/r/20260127085112.3608687-1-mattbobrowski@google.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/uapi/linux/bpf.h       |  8 ++++++++
 kernel/bpf/cgroup_iter.c       | 26 +++++++++++++++++++++-----
 tools/include/uapi/linux/bpf.h |  8 ++++++++
 3 files changed, 37 insertions(+), 5 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 44e7dbc278e3..c8d400b7680a 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -119,6 +119,14 @@ enum bpf_cgroup_iter_order {
 	BPF_CGROUP_ITER_DESCENDANTS_PRE,	/* walk descendants in pre-order. */
 	BPF_CGROUP_ITER_DESCENDANTS_POST,	/* walk descendants in post-order. */
 	BPF_CGROUP_ITER_ANCESTORS_UP,		/* walk ancestors upward. */
+	/*
+	 * Walks the immediate children of the specified parent
+	 * cgroup_subsys_state. Unlike BPF_CGROUP_ITER_DESCENDANTS_PRE,
+	 * BPF_CGROUP_ITER_DESCENDANTS_POST, and BPF_CGROUP_ITER_ANCESTORS_UP
+	 * the iterator does not include the specified parent as one of the
+	 * returned iterator elements.
+	 */
+	BPF_CGROUP_ITER_CHILDREN,
 };
 
 union bpf_iter_link_info {
diff --git a/kernel/bpf/cgroup_iter.c b/kernel/bpf/cgroup_iter.c
index f04a468cf6a7..fd51fe3d92cc 100644
--- a/kernel/bpf/cgroup_iter.c
+++ b/kernel/bpf/cgroup_iter.c
@@ -8,12 +8,13 @@
 
 #include "../cgroup/cgroup-internal.h"  /* cgroup_mutex and cgroup_is_dead */
 
-/* cgroup_iter provides four modes of traversal to the cgroup hierarchy.
+/* cgroup_iter provides five modes of traversal to the cgroup hierarchy.
  *
  *  1. Walk the descendants of a cgroup in pre-order.
  *  2. Walk the descendants of a cgroup in post-order.
  *  3. Walk the ancestors of a cgroup.
  *  4. Show the given cgroup only.
+ *  5. Walk the children of a given parent cgroup.
  *
  * For walking descendants, cgroup_iter can walk in either pre-order or
  * post-order. For walking ancestors, the iter walks up from a cgroup to
@@ -78,6 +79,8 @@ static void *cgroup_iter_seq_start(struct seq_file *seq, loff_t *pos)
 		return css_next_descendant_pre(NULL, p->start_css);
 	else if (p->order == BPF_CGROUP_ITER_DESCENDANTS_POST)
 		return css_next_descendant_post(NULL, p->start_css);
+	else if (p->order == BPF_CGROUP_ITER_CHILDREN)
+		return css_next_child(NULL, p->start_css);
 	else /* BPF_CGROUP_ITER_SELF_ONLY and BPF_CGROUP_ITER_ANCESTORS_UP */
 		return p->start_css;
 }
@@ -113,6 +116,8 @@ static void *cgroup_iter_seq_next(struct seq_file *seq, void *v, loff_t *pos)
 		return css_next_descendant_post(curr, p->start_css);
 	else if (p->order == BPF_CGROUP_ITER_ANCESTORS_UP)
 		return curr->parent;
+	else if (p->order == BPF_CGROUP_ITER_CHILDREN)
+		return css_next_child(curr, p->start_css);
 	else  /* BPF_CGROUP_ITER_SELF_ONLY */
 		return NULL;
 }
@@ -200,11 +205,16 @@ static int bpf_iter_attach_cgroup(struct bpf_prog *prog,
 	int order = linfo->cgroup.order;
 	struct cgroup *cgrp;
 
-	if (order != BPF_CGROUP_ITER_DESCENDANTS_PRE &&
-	    order != BPF_CGROUP_ITER_DESCENDANTS_POST &&
-	    order != BPF_CGROUP_ITER_ANCESTORS_UP &&
-	    order != BPF_CGROUP_ITER_SELF_ONLY)
+	switch (order) {
+	case BPF_CGROUP_ITER_DESCENDANTS_PRE:
+	case BPF_CGROUP_ITER_DESCENDANTS_POST:
+	case BPF_CGROUP_ITER_ANCESTORS_UP:
+	case BPF_CGROUP_ITER_SELF_ONLY:
+	case BPF_CGROUP_ITER_CHILDREN:
+		break;
+	default:
 		return -EINVAL;
+	}
 
 	if (fd && id)
 		return -EINVAL;
@@ -257,6 +267,8 @@ show_order:
 		seq_puts(seq, "order: descendants_post\n");
 	else if (aux->cgroup.order == BPF_CGROUP_ITER_ANCESTORS_UP)
 		seq_puts(seq, "order: ancestors_up\n");
+	else if (aux->cgroup.order == BPF_CGROUP_ITER_CHILDREN)
+		seq_puts(seq, "order: children\n");
 	else /* BPF_CGROUP_ITER_SELF_ONLY */
 		seq_puts(seq, "order: self_only\n");
 }
@@ -320,6 +332,7 @@ __bpf_kfunc int bpf_iter_css_new(struct bpf_iter_css *it,
 	case BPF_CGROUP_ITER_DESCENDANTS_PRE:
 	case BPF_CGROUP_ITER_DESCENDANTS_POST:
 	case BPF_CGROUP_ITER_ANCESTORS_UP:
+	case BPF_CGROUP_ITER_CHILDREN:
 		break;
 	default:
 		return -EINVAL;
@@ -345,6 +358,9 @@ __bpf_kfunc struct cgroup_subsys_state *bpf_iter_css_next(struct bpf_iter_css *i
 	case BPF_CGROUP_ITER_DESCENDANTS_POST:
 		kit->pos = css_next_descendant_post(kit->pos, kit->start);
 		break;
+	case BPF_CGROUP_ITER_CHILDREN:
+		kit->pos = css_next_child(kit->pos, kit->start);
+		break;
 	case BPF_CGROUP_ITER_ANCESTORS_UP:
 		kit->pos = kit->pos ? kit->pos->parent : kit->start;
 	}
diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index 3ca7d76e05f0..5e38b4887de6 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -119,6 +119,14 @@ enum bpf_cgroup_iter_order {
 	BPF_CGROUP_ITER_DESCENDANTS_PRE,	/* walk descendants in pre-order. */
 	BPF_CGROUP_ITER_DESCENDANTS_POST,	/* walk descendants in post-order. */
 	BPF_CGROUP_ITER_ANCESTORS_UP,		/* walk ancestors upward. */
+	/*
+	 * Walks the immediate children of the specified parent
+	 * cgroup_subsys_state. Unlike BPF_CGROUP_ITER_DESCENDANTS_PRE,
+	 * BPF_CGROUP_ITER_DESCENDANTS_POST, and BPF_CGROUP_ITER_ANCESTORS_UP
+	 * the iterator does not include the specified parent as one of the
+	 * returned iterator elements.
+	 */
+	BPF_CGROUP_ITER_CHILDREN,
 };
 
 union bpf_iter_link_info {
-- 
cgit v1.2.3


From d42eb05e60fea31de49897d63a1d73f933303bd4 Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Thu, 15 Jan 2026 08:24:02 -0700
Subject: io_uring: add support for BPF filtering for opcode restrictions

Add support for loading classic BPF programs with io_uring to provide
fine-grained filtering of SQE operations. Unlike
IORING_REGISTER_RESTRICTIONS which only allows bitmap-based allow/deny
of opcodes, BPF filters can inspect request attributes and make dynamic
decisions.

The filter is registered via IORING_REGISTER_BPF_FILTER with a struct
io_uring_bpf:

struct io_uring_bpf_filter {
	__u32	opcode;		/* io_uring opcode to filter */
	__u32	flags;
	__u32	filter_len;	/* number of BPF instructions */
	__u32	resv;
	__u64	filter_ptr;	/* pointer to BPF filter */
	__u64	resv2[5];
};

enum {
	IO_URING_BPF_CMD_FILTER	= 1,
};

struct io_uring_bpf {
	__u16	cmd_type;	/* IO_URING_BPF_* values */
	__u16	cmd_flags;	/* none so far */
	__u32	resv;
	union {
		struct io_uring_bpf_filter	filter;
	};
};

and the filters get supplied a struct io_uring_bpf_ctx:

struct io_uring_bpf_ctx {
	__u64	user_data;
	__u8	opcode;
	__u8	sqe_flags;
	__u8	pdu_size;
	__u8	pad[5];
};

where it's possible to filter on opcode and sqe_flags, with pdu_size
indicating how much extra data is being passed in beyond the pad field.
This will be used for specific finer grained filtering inside an opcode.
An example of that for sockets is in one of the following patches.
Anything the opcode supports can end up in this struct, populated by
the opcode itself, and hence can be filtered for.

Filters have the following semantics:
  - Return 1 to allow the request
  - Return 0 to deny the request with -EACCES
  - Multiple filters can be stacked per opcode. All filters must
    return 1 for the opcode to be allowed.
  - Filters are evaluated in reverse registration order (most recent first)

The implementation uses classic BPF (cBPF) rather than eBPF, as
that's required for containers, and since they can be used by any
user in the system.

Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/io_uring_types.h           |   9 +
 include/uapi/linux/io_uring.h            |   3 +
 include/uapi/linux/io_uring/bpf_filter.h |  50 +++++
 io_uring/Kconfig                         |   5 +
 io_uring/Makefile                        |   1 +
 io_uring/bpf_filter.c                    | 321 +++++++++++++++++++++++++++++++
 io_uring/bpf_filter.h                    |  42 ++++
 io_uring/io_uring.c                      |   8 +
 io_uring/register.c                      |   8 +
 9 files changed, 447 insertions(+)
 create mode 100644 include/uapi/linux/io_uring/bpf_filter.h
 create mode 100644 io_uring/bpf_filter.c
 create mode 100644 io_uring/bpf_filter.h

(limited to 'include/uapi/linux')

diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h
index dc6bd6940a0d..74bf98362876 100644
--- a/include/linux/io_uring_types.h
+++ b/include/linux/io_uring_types.h
@@ -219,9 +219,18 @@ struct io_rings {
 	struct io_uring_cqe	cqes[] ____cacheline_aligned_in_smp;
 };
 
+struct io_bpf_filter;
+struct io_bpf_filters {
+	refcount_t refs;	/* ref for ->bpf_filters */
+	spinlock_t lock;	/* protects ->bpf_filters modifications */
+	struct io_bpf_filter __rcu **filters;
+	struct rcu_head rcu_head;
+};
+
 struct io_restriction {
 	DECLARE_BITMAP(register_op, IORING_REGISTER_LAST);
 	DECLARE_BITMAP(sqe_op, IORING_OP_LAST);
+	struct io_bpf_filters *bpf_filters;
 	u8 sqe_flags_allowed;
 	u8 sqe_flags_required;
 	/* IORING_OP_* restrictions exist */
diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h
index b5b23c0d5283..94669b77fee8 100644
--- a/include/uapi/linux/io_uring.h
+++ b/include/uapi/linux/io_uring.h
@@ -700,6 +700,9 @@ enum io_uring_register_op {
 	/* auxiliary zcrx configuration, see enum zcrx_ctrl_op */
 	IORING_REGISTER_ZCRX_CTRL		= 36,
 
+	/* register bpf filtering programs */
+	IORING_REGISTER_BPF_FILTER		= 37,
+
 	/* this goes last */
 	IORING_REGISTER_LAST,
 
diff --git a/include/uapi/linux/io_uring/bpf_filter.h b/include/uapi/linux/io_uring/bpf_filter.h
new file mode 100644
index 000000000000..2d4d0e5743e4
--- /dev/null
+++ b/include/uapi/linux/io_uring/bpf_filter.h
@@ -0,0 +1,50 @@
+/* SPDX-License-Identifier: (GPL-2.0 WITH Linux-syscall-note) OR MIT */
+/*
+ * Header file for the io_uring BPF filters.
+ */
+#ifndef LINUX_IO_URING_BPF_FILTER_H
+#define LINUX_IO_URING_BPF_FILTER_H
+
+#include <linux/types.h>
+
+/*
+ * Struct passed to filters.
+ */
+struct io_uring_bpf_ctx {
+	__u64	user_data;
+	__u8	opcode;
+	__u8	sqe_flags;
+	__u8	pdu_size;	/* size of aux data for filter */
+	__u8	pad[5];
+};
+
+enum {
+	/*
+	 * If set, any currently unset opcode will have a deny filter attached
+	 */
+	IO_URING_BPF_FILTER_DENY_REST	= 1,
+};
+
+struct io_uring_bpf_filter {
+	__u32	opcode;		/* io_uring opcode to filter */
+	__u32	flags;
+	__u32	filter_len;	/* number of BPF instructions */
+	__u32	resv;
+	__u64	filter_ptr;	/* pointer to BPF filter */
+	__u64	resv2[5];
+};
+
+enum {
+	IO_URING_BPF_CMD_FILTER	= 1,
+};
+
+struct io_uring_bpf {
+	__u16	cmd_type;	/* IO_URING_BPF_* values */
+	__u16	cmd_flags;	/* none so far */
+	__u32	resv;
+	union {
+		struct io_uring_bpf_filter	filter;
+	};
+};
+
+#endif
diff --git a/io_uring/Kconfig b/io_uring/Kconfig
index 4b949c42c0bf..a7ae23cf1035 100644
--- a/io_uring/Kconfig
+++ b/io_uring/Kconfig
@@ -9,3 +9,8 @@ config IO_URING_ZCRX
 	depends on PAGE_POOL
 	depends on INET
 	depends on NET_RX_BUSY_POLL
+
+config IO_URING_BPF
+	def_bool y
+	depends on BPF
+	depends on NET
diff --git a/io_uring/Makefile b/io_uring/Makefile
index bf9eff88427a..931f9156132a 100644
--- a/io_uring/Makefile
+++ b/io_uring/Makefile
@@ -24,3 +24,4 @@ obj-$(CONFIG_NET_RX_BUSY_POLL)	+= napi.o
 obj-$(CONFIG_NET) += net.o cmd_net.o
 obj-$(CONFIG_PROC_FS) += fdinfo.o
 obj-$(CONFIG_IO_URING_MOCK_FILE) += mock_file.o
+obj-$(CONFIG_IO_URING_BPF) += bpf_filter.o
diff --git a/io_uring/bpf_filter.c b/io_uring/bpf_filter.c
new file mode 100644
index 000000000000..5207226d72ea
--- /dev/null
+++ b/io_uring/bpf_filter.c
@@ -0,0 +1,321 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * BPF filter support for io_uring. Supports SQE opcodes for now.
+ */
+#include <linux/kernel.h>
+#include <linux/errno.h>
+#include <linux/io_uring.h>
+#include <linux/filter.h>
+#include <linux/bpf.h>
+#include <uapi/linux/io_uring.h>
+
+#include "io_uring.h"
+#include "bpf_filter.h"
+#include "net.h"
+
+struct io_bpf_filter {
+	struct bpf_prog		*prog;
+	struct io_bpf_filter	*next;
+};
+
+/* Deny if this is set as the filter */
+static const struct io_bpf_filter dummy_filter;
+
+static void io_uring_populate_bpf_ctx(struct io_uring_bpf_ctx *bctx,
+				      struct io_kiocb *req)
+{
+	bctx->opcode = req->opcode;
+	bctx->sqe_flags = (__force int) req->flags & SQE_VALID_FLAGS;
+	bctx->user_data = req->cqe.user_data;
+	/* clear residual, anything from pdu_size and below */
+	memset((void *) bctx + offsetof(struct io_uring_bpf_ctx, pdu_size), 0,
+		sizeof(*bctx) - offsetof(struct io_uring_bpf_ctx, pdu_size));
+}
+
+/*
+ * Run registered filters for a given opcode. For filters, a return of 0 denies
+ * execution of the request, a return of 1 allows it. If any filter for an
+ * opcode returns 0, filter processing is stopped and the request is denied;
+ * any remaining filters for that opcode are not run.
+ *
+ * __io_uring_run_bpf_filters() returns 0 on success, allow running the
+ * request, and -EACCES when a request is denied.
+ */
+int __io_uring_run_bpf_filters(struct io_restriction *res, struct io_kiocb *req)
+{
+	struct io_bpf_filter *filter;
+	struct io_uring_bpf_ctx bpf_ctx;
+	int ret;
+
+	/* Fast check for existence of filters outside of RCU */
+	if (!rcu_access_pointer(res->bpf_filters->filters[req->opcode]))
+		return 0;
+
+	/*
+	 * req->opcode has already been validated to be within the range
+	 * of what we expect, io_init_req() does this.
+	 */
+	guard(rcu)();
+	filter = rcu_dereference(res->bpf_filters->filters[req->opcode]);
+	if (!filter)
+		return 0;
+	else if (filter == &dummy_filter)
+		return -EACCES;
+
+	io_uring_populate_bpf_ctx(&bpf_ctx, req);
+
+	/*
+	 * Iterate registered filters. The opcode is allowed IFF all filters
+	 * return 1. If any filter returns denied, opcode will be denied.
+	 */
+	do {
+		if (filter == &dummy_filter)
+			return -EACCES;
+		ret = bpf_prog_run(filter->prog, &bpf_ctx);
+		if (!ret)
+			return -EACCES;
+		filter = filter->next;
+	} while (filter);
+
+	return 0;
+}
+
+static void io_free_bpf_filters(struct rcu_head *head)
+{
+	struct io_bpf_filter __rcu **filter;
+	struct io_bpf_filters *filters;
+	int i;
+
+	filters = container_of(head, struct io_bpf_filters, rcu_head);
+	scoped_guard(spinlock, &filters->lock) {
+		filter = filters->filters;
+		if (!filter)
+			return;
+	}
+
+	for (i = 0; i < IORING_OP_LAST; i++) {
+		struct io_bpf_filter *f;
+
+		rcu_read_lock();
+		f = rcu_dereference(filter[i]);
+		while (f) {
+			struct io_bpf_filter *next = f->next;
+
+			/*
+			 * Even if stacked, dummy filter will always be last
+			 * as it can only get installed into an empty spot.
+			 */
+			if (f == &dummy_filter)
+				break;
+			bpf_prog_destroy(f->prog);
+			kfree(f);
+			f = next;
+		}
+		rcu_read_unlock();
+	}
+	kfree(filters->filters);
+	kfree(filters);
+}
+
+static void __io_put_bpf_filters(struct io_bpf_filters *filters)
+{
+	if (refcount_dec_and_test(&filters->refs))
+		call_rcu(&filters->rcu_head, io_free_bpf_filters);
+}
+
+void io_put_bpf_filters(struct io_restriction *res)
+{
+	if (res->bpf_filters)
+		__io_put_bpf_filters(res->bpf_filters);
+}
+
+static struct io_bpf_filters *io_new_bpf_filters(void)
+{
+	struct io_bpf_filters *filters __free(kfree) = NULL;
+
+	filters = kzalloc(sizeof(*filters), GFP_KERNEL_ACCOUNT);
+	if (!filters)
+		return ERR_PTR(-ENOMEM);
+
+	filters->filters = kcalloc(IORING_OP_LAST,
+				   sizeof(struct io_bpf_filter *),
+				   GFP_KERNEL_ACCOUNT);
+	if (!filters->filters)
+		return ERR_PTR(-ENOMEM);
+
+	refcount_set(&filters->refs, 1);
+	spin_lock_init(&filters->lock);
+	return no_free_ptr(filters);
+}
+
+/*
+ * Validate classic BPF filter instructions. Only allow a safe subset of
+ * operations - no packet data access, just context field loads and basic
+ * ALU/jump operations.
+ */
+static int io_uring_check_cbpf_filter(struct sock_filter *filter,
+				      unsigned int flen)
+{
+	int pc;
+
+	for (pc = 0; pc < flen; pc++) {
+		struct sock_filter *ftest = &filter[pc];
+		u16 code = ftest->code;
+		u32 k = ftest->k;
+
+		switch (code) {
+		case BPF_LD | BPF_W | BPF_ABS:
+			ftest->code = BPF_LDX | BPF_W | BPF_ABS;
+			/* 32-bit aligned and not out of bounds. */
+			if (k >= sizeof(struct io_uring_bpf_ctx) || k & 3)
+				return -EINVAL;
+			continue;
+		case BPF_LD | BPF_W | BPF_LEN:
+			ftest->code = BPF_LD | BPF_IMM;
+			ftest->k = sizeof(struct io_uring_bpf_ctx);
+			continue;
+		case BPF_LDX | BPF_W | BPF_LEN:
+			ftest->code = BPF_LDX | BPF_IMM;
+			ftest->k = sizeof(struct io_uring_bpf_ctx);
+			continue;
+		/* Explicitly include allowed calls. */
+		case BPF_RET | BPF_K:
+		case BPF_RET | BPF_A:
+		case BPF_ALU | BPF_ADD | BPF_K:
+		case BPF_ALU | BPF_ADD | BPF_X:
+		case BPF_ALU | BPF_SUB | BPF_K:
+		case BPF_ALU | BPF_SUB | BPF_X:
+		case BPF_ALU | BPF_MUL | BPF_K:
+		case BPF_ALU | BPF_MUL | BPF_X:
+		case BPF_ALU | BPF_DIV | BPF_K:
+		case BPF_ALU | BPF_DIV | BPF_X:
+		case BPF_ALU | BPF_AND | BPF_K:
+		case BPF_ALU | BPF_AND | BPF_X:
+		case BPF_ALU | BPF_OR | BPF_K:
+		case BPF_ALU | BPF_OR | BPF_X:
+		case BPF_ALU | BPF_XOR | BPF_K:
+		case BPF_ALU | BPF_XOR | BPF_X:
+		case BPF_ALU | BPF_LSH | BPF_K:
+		case BPF_ALU | BPF_LSH | BPF_X:
+		case BPF_ALU | BPF_RSH | BPF_K:
+		case BPF_ALU | BPF_RSH | BPF_X:
+		case BPF_ALU | BPF_NEG:
+		case BPF_LD | BPF_IMM:
+		case BPF_LDX | BPF_IMM:
+		case BPF_MISC | BPF_TAX:
+		case BPF_MISC | BPF_TXA:
+		case BPF_LD | BPF_MEM:
+		case BPF_LDX | BPF_MEM:
+		case BPF_ST:
+		case BPF_STX:
+		case BPF_JMP | BPF_JA:
+		case BPF_JMP | BPF_JEQ | BPF_K:
+		case BPF_JMP | BPF_JEQ | BPF_X:
+		case BPF_JMP | BPF_JGE | BPF_K:
+		case BPF_JMP | BPF_JGE | BPF_X:
+		case BPF_JMP | BPF_JGT | BPF_K:
+		case BPF_JMP | BPF_JGT | BPF_X:
+		case BPF_JMP | BPF_JSET | BPF_K:
+		case BPF_JMP | BPF_JSET | BPF_X:
+			continue;
+		default:
+			return -EINVAL;
+		}
+	}
+	return 0;
+}
+
+#define IO_URING_BPF_FILTER_FLAGS	IO_URING_BPF_FILTER_DENY_REST
+
+int io_register_bpf_filter(struct io_restriction *res,
+			   struct io_uring_bpf __user *arg)
+{
+	struct io_bpf_filter *filter, *old_filter;
+	struct io_bpf_filters *filters;
+	struct io_uring_bpf reg;
+	struct bpf_prog *prog;
+	struct sock_fprog fprog;
+	int ret;
+
+	if (copy_from_user(&reg, arg, sizeof(reg)))
+		return -EFAULT;
+	if (reg.cmd_type != IO_URING_BPF_CMD_FILTER)
+		return -EINVAL;
+	if (reg.cmd_flags || reg.resv)
+		return -EINVAL;
+
+	if (reg.filter.opcode >= IORING_OP_LAST)
+		return -EINVAL;
+	if (reg.filter.flags & ~IO_URING_BPF_FILTER_FLAGS)
+		return -EINVAL;
+	if (reg.filter.resv)
+		return -EINVAL;
+	if (!mem_is_zero(reg.filter.resv2, sizeof(reg.filter.resv2)))
+		return -EINVAL;
+	if (!reg.filter.filter_len || reg.filter.filter_len > BPF_MAXINSNS)
+		return -EINVAL;
+
+	fprog.len = reg.filter.filter_len;
+	fprog.filter = u64_to_user_ptr(reg.filter.filter_ptr);
+
+	ret = bpf_prog_create_from_user(&prog, &fprog,
+					io_uring_check_cbpf_filter, false);
+	if (ret)
+		return ret;
+
+	/*
+	 * No existing filters, allocate set.
+	 */
+	filters = res->bpf_filters;
+	if (!filters) {
+		filters = io_new_bpf_filters();
+		if (IS_ERR(filters)) {
+			ret = PTR_ERR(filters);
+			goto err_prog;
+		}
+	}
+
+	filter = kzalloc(sizeof(*filter), GFP_KERNEL_ACCOUNT);
+	if (!filter) {
+		ret = -ENOMEM;
+		goto err;
+	}
+	filter->prog = prog;
+	res->bpf_filters = filters;
+
+	/*
+	 * Insert filter - if the current opcode already has a filter
+	 * attached, add to the set.
+	 */
+	rcu_read_lock();
+	spin_lock_bh(&filters->lock);
+	old_filter = rcu_dereference(filters->filters[reg.filter.opcode]);
+	if (old_filter)
+		filter->next = old_filter;
+	rcu_assign_pointer(filters->filters[reg.filter.opcode], filter);
+
+	/*
+	 * If IO_URING_BPF_FILTER_DENY_REST is set, fill any unregistered
+	 * opcode with the dummy filter. That will cause them to be denied.
+	 */
+	if (reg.filter.flags & IO_URING_BPF_FILTER_DENY_REST) {
+		for (int i = 0; i < IORING_OP_LAST; i++) {
+			if (i == reg.filter.opcode)
+				continue;
+			old_filter = rcu_dereference(filters->filters[i]);
+			if (old_filter)
+				continue;
+			rcu_assign_pointer(filters->filters[i], &dummy_filter);
+		}
+	}
+
+	spin_unlock_bh(&filters->lock);
+	rcu_read_unlock();
+	return 0;
+err:
+	if (filters != res->bpf_filters)
+		__io_put_bpf_filters(filters);
+err_prog:
+	bpf_prog_destroy(prog);
+	return ret;
+}
diff --git a/io_uring/bpf_filter.h b/io_uring/bpf_filter.h
new file mode 100644
index 000000000000..27eae9705473
--- /dev/null
+++ b/io_uring/bpf_filter.h
@@ -0,0 +1,42 @@
+// SPDX-License-Identifier: GPL-2.0
+#ifndef IO_URING_BPF_FILTER_H
+#define IO_URING_BPF_FILTER_H
+
+#include <uapi/linux/io_uring/bpf_filter.h>
+
+#ifdef CONFIG_IO_URING_BPF
+
+int __io_uring_run_bpf_filters(struct io_restriction *res, struct io_kiocb *req);
+
+int io_register_bpf_filter(struct io_restriction *res,
+			   struct io_uring_bpf __user *arg);
+
+void io_put_bpf_filters(struct io_restriction *res);
+
+static inline int io_uring_run_bpf_filters(struct io_restriction *res,
+					   struct io_kiocb *req)
+{
+	if (res->bpf_filters)
+		return __io_uring_run_bpf_filters(res, req);
+
+	return 0;
+}
+
+#else
+
+static inline int io_register_bpf_filter(struct io_restriction *res,
+					 struct io_uring_bpf __user *arg)
+{
+	return -EINVAL;
+}
+static inline int io_uring_run_bpf_filters(struct io_restriction *res,
+					   struct io_kiocb *req)
+{
+	return 0;
+}
+static inline void io_put_bpf_filters(struct io_restriction *res)
+{
+}
+#endif /* CONFIG_IO_URING_BPF */
+
+#endif
diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
index a50459238bee..9b9794dfc27a 100644
--- a/io_uring/io_uring.c
+++ b/io_uring/io_uring.c
@@ -94,6 +94,7 @@
 #include "alloc_cache.h"
 #include "eventfd.h"
 #include "wait.h"
+#include "bpf_filter.h"
 
 #define SQE_COMMON_FLAGS (IOSQE_FIXED_FILE | IOSQE_IO_LINK | \
 			  IOSQE_IO_HARDLINK | IOSQE_ASYNC)
@@ -1874,6 +1875,12 @@ static inline int io_submit_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req,
 	if (unlikely(ret))
 		return io_submit_fail_init(sqe, req, ret);
 
+	if (unlikely(ctx->restrictions.bpf_filters)) {
+		ret = io_uring_run_bpf_filters(&ctx->restrictions, req);
+		if (ret)
+			return io_submit_fail_init(sqe, req, ret);
+	}
+
 	trace_io_uring_submit_req(req);
 
 	/*
@@ -2161,6 +2168,7 @@ static __cold void io_ring_ctx_free(struct io_ring_ctx *ctx)
 	percpu_ref_exit(&ctx->refs);
 	free_uid(ctx->user);
 	io_req_caches_free(ctx);
+	io_put_bpf_filters(&ctx->restrictions);
 
 	WARN_ON_ONCE(ctx->nr_req_allocated);
 
diff --git a/io_uring/register.c b/io_uring/register.c
index 8551f13920dc..30957c2cb5eb 100644
--- a/io_uring/register.c
+++ b/io_uring/register.c
@@ -33,6 +33,7 @@
 #include "memmap.h"
 #include "zcrx.h"
 #include "query.h"
+#include "bpf_filter.h"
 
 #define IORING_MAX_RESTRICTIONS	(IORING_RESTRICTION_LAST + \
 				 IORING_REGISTER_LAST + IORING_OP_LAST)
@@ -830,6 +831,13 @@ static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode,
 	case IORING_REGISTER_ZCRX_CTRL:
 		ret = io_zcrx_ctrl(ctx, arg, nr_args);
 		break;
+	case IORING_REGISTER_BPF_FILTER:
+		ret = -EINVAL;
+
+		if (nr_args != 1)
+			break;
+		ret = io_register_bpf_filter(&ctx->restrictions, arg);
+		break;
 	default:
 		ret = -EINVAL;
 		break;
-- 
cgit v1.2.3


From cff1c26b4223820431129696b45525e5928e6409 Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Fri, 16 Jan 2026 14:50:05 -0700
Subject: io_uring/net: allow filtering on IORING_OP_SOCKET data

Example population method for the BPF based opcode filtering. This
exposes the socket family, type, and protocol to a registered BPF
filter. This in turn enables the filter to make decisions based on
what was passed in to the IORING_OP_SOCKET request type.

Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/uapi/linux/io_uring/bpf_filter.h |  7 +++++++
 io_uring/bpf_filter.c                    | 11 +++++++++++
 io_uring/net.c                           |  9 +++++++++
 io_uring/net.h                           |  6 ++++++
 4 files changed, 33 insertions(+)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/io_uring/bpf_filter.h b/include/uapi/linux/io_uring/bpf_filter.h
index 2d4d0e5743e4..4dbc89bbbf10 100644
--- a/include/uapi/linux/io_uring/bpf_filter.h
+++ b/include/uapi/linux/io_uring/bpf_filter.h
@@ -16,6 +16,13 @@ struct io_uring_bpf_ctx {
 	__u8	sqe_flags;
 	__u8	pdu_size;	/* size of aux data for filter */
 	__u8	pad[5];
+	union {
+		struct {
+			__u32	family;
+			__u32	type;
+			__u32	protocol;
+		} socket;
+	};
 };
 
 enum {
diff --git a/io_uring/bpf_filter.c b/io_uring/bpf_filter.c
index 5207226d72ea..889fa915fa54 100644
--- a/io_uring/bpf_filter.c
+++ b/io_uring/bpf_filter.c
@@ -30,6 +30,17 @@ static void io_uring_populate_bpf_ctx(struct io_uring_bpf_ctx *bctx,
 	/* clear residual, anything from pdu_size and below */
 	memset((void *) bctx + offsetof(struct io_uring_bpf_ctx, pdu_size), 0,
 		sizeof(*bctx) - offsetof(struct io_uring_bpf_ctx, pdu_size));
+
+	/*
+	 * Opcodes can provide a handler for populating more data into bctx,
+	 * for filters to use.
+	 */
+	switch (req->opcode) {
+	case IORING_OP_SOCKET:
+		bctx->pdu_size = sizeof(bctx->socket);
+		io_socket_bpf_populate(bctx, req);
+		break;
+	}
 }
 
 /*
diff --git a/io_uring/net.c b/io_uring/net.c
index 519ea055b761..4fcba36bd0bb 100644
--- a/io_uring/net.c
+++ b/io_uring/net.c
@@ -1699,6 +1699,15 @@ retry:
 	return IOU_COMPLETE;
 }
 
+void io_socket_bpf_populate(struct io_uring_bpf_ctx *bctx, struct io_kiocb *req)
+{
+	struct io_socket *sock = io_kiocb_to_cmd(req, struct io_socket);
+
+	bctx->socket.family = sock->domain;
+	bctx->socket.type = sock->type;
+	bctx->socket.protocol = sock->protocol;
+}
+
 int io_socket_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 {
 	struct io_socket *sock = io_kiocb_to_cmd(req, struct io_socket);
diff --git a/io_uring/net.h b/io_uring/net.h
index 43e5ce5416b7..a862960a3bb9 100644
--- a/io_uring/net.h
+++ b/io_uring/net.h
@@ -3,6 +3,7 @@
 #include <linux/net.h>
 #include <linux/uio.h>
 #include <linux/io_uring_types.h>
+#include <uapi/linux/io_uring/bpf_filter.h>
 
 struct io_async_msghdr {
 #if defined(CONFIG_NET)
@@ -44,6 +45,7 @@ int io_accept(struct io_kiocb *req, unsigned int issue_flags);
 
 int io_socket_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe);
 int io_socket(struct io_kiocb *req, unsigned int issue_flags);
+void io_socket_bpf_populate(struct io_uring_bpf_ctx *bctx, struct io_kiocb *req);
 
 int io_connect_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe);
 int io_connect(struct io_kiocb *req, unsigned int issue_flags);
@@ -64,4 +66,8 @@ void io_netmsg_cache_free(const void *entry);
 static inline void io_netmsg_cache_free(const void *entry)
 {
 }
+static inline void io_socket_bpf_populate(struct io_uring_bpf_ctx *bctx,
+					  struct io_kiocb *req)
+{
+}
 #endif
-- 
cgit v1.2.3


From 8768770cf5d76d177fa2200e6957a372e61e06b5 Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Mon, 19 Jan 2026 15:59:54 -0700
Subject: io_uring/bpf_filter: allow filtering on contents of struct open_how

This adds custom filtering for IORING_OP_OPENAT and IORING_OP_OPENAT2,
where the open_how flags, mode, and resolve can be checked by filters.

Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/uapi/linux/io_uring/bpf_filter.h | 5 +++++
 io_uring/bpf_filter.c                    | 6 ++++++
 io_uring/openclose.c                     | 9 +++++++++
 io_uring/openclose.h                     | 3 +++
 4 files changed, 23 insertions(+)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/io_uring/bpf_filter.h b/include/uapi/linux/io_uring/bpf_filter.h
index 4dbc89bbbf10..220351b81bc0 100644
--- a/include/uapi/linux/io_uring/bpf_filter.h
+++ b/include/uapi/linux/io_uring/bpf_filter.h
@@ -22,6 +22,11 @@ struct io_uring_bpf_ctx {
 			__u32	type;
 			__u32	protocol;
 		} socket;
+		struct {
+			__u64	flags;
+			__u64	mode;
+			__u64	resolve;
+		} open;
 	};
 };
 
diff --git a/io_uring/bpf_filter.c b/io_uring/bpf_filter.c
index 889fa915fa54..ff723ec44828 100644
--- a/io_uring/bpf_filter.c
+++ b/io_uring/bpf_filter.c
@@ -12,6 +12,7 @@
 #include "io_uring.h"
 #include "bpf_filter.h"
 #include "net.h"
+#include "openclose.h"
 
 struct io_bpf_filter {
 	struct bpf_prog		*prog;
@@ -40,6 +41,11 @@ static void io_uring_populate_bpf_ctx(struct io_uring_bpf_ctx *bctx,
 		bctx->pdu_size = sizeof(bctx->socket);
 		io_socket_bpf_populate(bctx, req);
 		break;
+	case IORING_OP_OPENAT:
+	case IORING_OP_OPENAT2:
+		bctx->pdu_size = sizeof(bctx->open);
+		io_openat_bpf_populate(bctx, req);
+		break;
 	}
 }
 
diff --git a/io_uring/openclose.c b/io_uring/openclose.c
index 15dde9bd6ff6..31c687adf873 100644
--- a/io_uring/openclose.c
+++ b/io_uring/openclose.c
@@ -85,6 +85,15 @@ static int __io_openat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe
 	return 0;
 }
 
+void io_openat_bpf_populate(struct io_uring_bpf_ctx *bctx, struct io_kiocb *req)
+{
+	struct io_open *open = io_kiocb_to_cmd(req, struct io_open);
+
+	bctx->open.flags = open->how.flags;
+	bctx->open.mode = open->how.mode;
+	bctx->open.resolve = open->how.resolve;
+}
+
 int io_openat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 {
 	struct io_open *open = io_kiocb_to_cmd(req, struct io_open);
diff --git a/io_uring/openclose.h b/io_uring/openclose.h
index 4ca2a9935abc..566739920658 100644
--- a/io_uring/openclose.h
+++ b/io_uring/openclose.h
@@ -1,11 +1,14 @@
 // SPDX-License-Identifier: GPL-2.0
 
+#include "bpf_filter.h"
+
 int __io_close_fixed(struct io_ring_ctx *ctx, unsigned int issue_flags,
 		     unsigned int offset);
 
 int io_openat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe);
 int io_openat(struct io_kiocb *req, unsigned int issue_flags);
 void io_open_cleanup(struct io_kiocb *req);
+void io_openat_bpf_populate(struct io_uring_bpf_ctx *bctx, struct io_kiocb *req);
 
 int io_openat2_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe);
 int io_openat2(struct io_kiocb *req, unsigned int issue_flags);
-- 
cgit v1.2.3


From cad3337bb6c3a2ba2307d6a9061e752e15681d2b Mon Sep 17 00:00:00 2001
From: Ilpo Järvinen <ilpo.jarvinen@linux.intel.com>
Date: Fri, 19 Dec 2025 19:40:33 +0200
Subject: PCI: Add dword #defines for Bus Number + Secondary Latency Timer
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

uapi/linux/pci_regs.h defines Primary/Secondary/Subordinate Bus Numbers
and Secondary Latency Timer (PCIe r7.0, sec. 7.5.1.3) as byte register
offsets, but in practice the code may read/write the entire dword. In the
absence of #defines to handle the dword fields, the code ends up using
literals, which are not as easy to read.

Add dword field masks for the Bus Number and Secondary Latency Timer
fields and use them in probe.c.

Signed-off-by: Ilpo Järvinen <ilpo.jarvinen@linux.intel.com>
[bhelgaas: squash new #defines and uses together]
Signed-off-by: Bjorn Helgaas <bhelgaas@google.com>
Link: https://patch.msgid.link/20251219174036.16738-21-ilpo.jarvinen@linux.intel.com
Link: https://patch.msgid.link/20251219174036.16738-22-ilpo.jarvinen@linux.intel.com
---
 drivers/pci/probe.c           | 25 +++++++++++++------------
 include/uapi/linux/pci_regs.h |  5 +++++
 2 files changed, 18 insertions(+), 12 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/drivers/pci/probe.c b/drivers/pci/probe.c
index ed4d26833640..53ec1879fb99 100644
--- a/drivers/pci/probe.c
+++ b/drivers/pci/probe.c
@@ -524,8 +524,8 @@ static void pci_read_bridge_windows(struct pci_dev *bridge)
 
 	pci_read_config_dword(bridge, PCI_PRIMARY_BUS, &buses);
 	res.flags = IORESOURCE_BUS;
-	res.start = (buses >> 8) & 0xff;
-	res.end = (buses >> 16) & 0xff;
+	res.start = FIELD_GET(PCI_SECONDARY_BUS_MASK, buses);
+	res.end = FIELD_GET(PCI_SUBORDINATE_BUS_MASK, buses);
 	pci_info(bridge, "PCI bridge to %pR%s\n", &res,
 		 bridge->transparent ? " (subtractive decode)" : "");
 
@@ -1393,9 +1393,9 @@ static int pci_scan_bridge_extend(struct pci_bus *bus, struct pci_dev *dev,
 	pm_runtime_get_sync(&dev->dev);
 
 	pci_read_config_dword(dev, PCI_PRIMARY_BUS, &buses);
-	primary = buses & 0xFF;
-	secondary = (buses >> 8) & 0xFF;
-	subordinate = (buses >> 16) & 0xFF;
+	primary = FIELD_GET(PCI_PRIMARY_BUS_MASK, buses);
+	secondary = FIELD_GET(PCI_SECONDARY_BUS_MASK, buses);
+	subordinate = FIELD_GET(PCI_SUBORDINATE_BUS_MASK, buses);
 
 	pci_dbg(dev, "scanning [bus %02x-%02x] behind bridge, pass %d\n",
 		secondary, subordinate, pass);
@@ -1476,7 +1476,7 @@ static int pci_scan_bridge_extend(struct pci_bus *bus, struct pci_dev *dev,
 				 * ranges.
 				 */
 				pci_write_config_dword(dev, PCI_PRIMARY_BUS,
-						       buses & ~0xffffff);
+						       buses & PCI_SEC_LATENCY_TIMER_MASK);
 			goto out;
 		}
 
@@ -1507,18 +1507,19 @@ static int pci_scan_bridge_extend(struct pci_bus *bus, struct pci_dev *dev,
 		if (available_buses)
 			available_buses--;
 
-		buses = (buses & 0xff000000)
-		      | ((unsigned int)(child->primary)     <<  0)
-		      | ((unsigned int)(child->busn_res.start)   <<  8)
-		      | ((unsigned int)(child->busn_res.end) << 16);
+		buses = (buses & PCI_SEC_LATENCY_TIMER_MASK) |
+			FIELD_PREP(PCI_PRIMARY_BUS_MASK, child->primary) |
+			FIELD_PREP(PCI_SECONDARY_BUS_MASK, child->busn_res.start) |
+			FIELD_PREP(PCI_SUBORDINATE_BUS_MASK, child->busn_res.end);
 
 		/*
 		 * yenta.c forces a secondary latency timer of 176.
 		 * Copy that behaviour here.
 		 */
 		if (is_cardbus) {
-			buses &= ~0xff000000;
-			buses |= CARDBUS_LATENCY_TIMER << 24;
+			buses &= ~PCI_SEC_LATENCY_TIMER_MASK;
+			buses |= FIELD_PREP(PCI_SEC_LATENCY_TIMER_MASK,
+					    CARDBUS_LATENCY_TIMER);
 		}
 
 		/* We need to blast all three values with a single write */
diff --git a/include/uapi/linux/pci_regs.h b/include/uapi/linux/pci_regs.h
index 3add74ae2594..8be55ece2a21 100644
--- a/include/uapi/linux/pci_regs.h
+++ b/include/uapi/linux/pci_regs.h
@@ -132,6 +132,11 @@
 #define PCI_SECONDARY_BUS	0x19	/* Secondary bus number */
 #define PCI_SUBORDINATE_BUS	0x1a	/* Highest bus number behind the bridge */
 #define PCI_SEC_LATENCY_TIMER	0x1b	/* Latency timer for secondary interface */
+/* Masks for dword-sized processing of Bus Number and Sec Latency Timer fields */
+#define  PCI_PRIMARY_BUS_MASK		0x000000ff
+#define  PCI_SECONDARY_BUS_MASK		0x0000ff00
+#define  PCI_SUBORDINATE_BUS_MASK	0x00ff0000
+#define  PCI_SEC_LATENCY_TIMER_MASK	0xff000000
 #define PCI_IO_BASE		0x1c	/* I/O range behind the bridge */
 #define PCI_IO_LIMIT		0x1d
 #define  PCI_IO_RANGE_TYPE_MASK	0x0fUL	/* I/O bridging type */
-- 
cgit v1.2.3


From e698127eb7249d6c70fa8f2bdba469c0e54f2e2b Mon Sep 17 00:00:00 2001
From: Jonathan Kim <jonathan.kim@amd.com>
Date: Wed, 23 Jul 2025 10:07:28 -0400
Subject: drm/amdkfd: add extended capabilities to device snapshot

Add additional capabilities reporting.

Signed-off-by: Jonathan Kim <jonathan.kim@amd.com>
Reviewed-by: James Zhu <james.zhu@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
---
 drivers/gpu/drm/amd/amdkfd/kfd_debug.c | 1 +
 include/uapi/linux/kfd_ioctl.h         | 2 ++
 2 files changed, 3 insertions(+)

(limited to 'include/uapi/linux')

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_debug.c b/drivers/gpu/drm/amd/amdkfd/kfd_debug.c
index 27176b2dc714..8f8a0975f1a7 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_debug.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_debug.c
@@ -1108,6 +1108,7 @@ int kfd_dbg_trap_device_snapshot(struct kfd_process *target,
 		device_info.num_xcc = NUM_XCC(pdd->dev->xcc_mask);
 		device_info.capability = topo_dev->node_props.capability;
 		device_info.debug_prop = topo_dev->node_props.debug_prop;
+		device_info.capability2 = topo_dev->node_props.capability2;
 
 		if (exception_clear_mask)
 			pdd->exception_status &= ~exception_clear_mask;
diff --git a/include/uapi/linux/kfd_ioctl.h b/include/uapi/linux/kfd_ioctl.h
index 047bcb1cc078..e72359370857 100644
--- a/include/uapi/linux/kfd_ioctl.h
+++ b/include/uapi/linux/kfd_ioctl.h
@@ -149,6 +149,8 @@ struct kfd_dbg_device_info_entry {
 	__u32 num_xcc;
 	__u32 capability;
 	__u32 debug_prop;
+	__u32 capability2;
+	__u32 pad;
 };
 
 /* For kfd_ioctl_set_memory_policy_args.default_policy and alternate_policy */
-- 
cgit v1.2.3


From d8316b837c2ca5f92e781fa1575095c0132ae3c1 Mon Sep 17 00:00:00 2001
From: Jeff Layton <jlayton@kernel.org>
Date: Tue, 6 Jan 2026 13:59:50 -0500
Subject: nfsd: add controls to set the minimum number of threads per pool

Add a new "min_threads" variable to the nfsd_net, along with the
corresponding netlink interface, to set that value from userland.
Pass that value to svc_set_pool_threads() and svc_set_num_threads().

Signed-off-by: Jeff Layton <jlayton@kernel.org>
Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
---
 Documentation/netlink/specs/nfsd.yaml |  5 +++++
 fs/nfsd/netlink.c                     |  5 +++--
 fs/nfsd/netns.h                       |  6 ++++++
 fs/nfsd/nfsctl.c                      |  6 ++++++
 fs/nfsd/nfssvc.c                      |  4 ++--
 fs/nfsd/trace.h                       | 19 +++++++++++++++++++
 include/uapi/linux/nfsd_netlink.h     |  1 +
 7 files changed, 42 insertions(+), 4 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/Documentation/netlink/specs/nfsd.yaml b/Documentation/netlink/specs/nfsd.yaml
index 100363029e82..badb2fe57c98 100644
--- a/Documentation/netlink/specs/nfsd.yaml
+++ b/Documentation/netlink/specs/nfsd.yaml
@@ -78,6 +78,9 @@ attribute-sets:
       -
         name: scope
         type: string
+      -
+        name: min-threads
+        type: u32
   -
     name: version
     attributes:
@@ -159,6 +162,7 @@ operations:
             - gracetime
             - leasetime
             - scope
+            - min-threads
     -
       name: threads-get
       doc: get the number of running threads
@@ -170,6 +174,7 @@ operations:
             - gracetime
             - leasetime
             - scope
+            - min-threads
     -
       name: version-set
       doc: set nfs enabled versions
diff --git a/fs/nfsd/netlink.c b/fs/nfsd/netlink.c
index ac51a44e1065..887525964451 100644
--- a/fs/nfsd/netlink.c
+++ b/fs/nfsd/netlink.c
@@ -24,11 +24,12 @@ const struct nla_policy nfsd_version_nl_policy[NFSD_A_VERSION_ENABLED + 1] = {
 };
 
 /* NFSD_CMD_THREADS_SET - do */
-static const struct nla_policy nfsd_threads_set_nl_policy[NFSD_A_SERVER_SCOPE + 1] = {
+static const struct nla_policy nfsd_threads_set_nl_policy[NFSD_A_SERVER_MIN_THREADS + 1] = {
 	[NFSD_A_SERVER_THREADS] = { .type = NLA_U32, },
 	[NFSD_A_SERVER_GRACETIME] = { .type = NLA_U32, },
 	[NFSD_A_SERVER_LEASETIME] = { .type = NLA_U32, },
 	[NFSD_A_SERVER_SCOPE] = { .type = NLA_NUL_STRING, },
+	[NFSD_A_SERVER_MIN_THREADS] = { .type = NLA_U32, },
 };
 
 /* NFSD_CMD_VERSION_SET - do */
@@ -57,7 +58,7 @@ static const struct genl_split_ops nfsd_nl_ops[] = {
 		.cmd		= NFSD_CMD_THREADS_SET,
 		.doit		= nfsd_nl_threads_set_doit,
 		.policy		= nfsd_threads_set_nl_policy,
-		.maxattr	= NFSD_A_SERVER_SCOPE,
+		.maxattr	= NFSD_A_SERVER_MIN_THREADS,
 		.flags		= GENL_ADMIN_PERM | GENL_CMD_CAP_DO,
 	},
 	{
diff --git a/fs/nfsd/netns.h b/fs/nfsd/netns.h
index d83c68872c4c..9fa600602658 100644
--- a/fs/nfsd/netns.h
+++ b/fs/nfsd/netns.h
@@ -129,6 +129,12 @@ struct nfsd_net {
 	seqlock_t writeverf_lock;
 	unsigned char writeverf[8];
 
+	/*
+	 * Minimum number of threads to run per pool.  If 0 then the
+	 * min == max requested number of threads.
+	 */
+	unsigned int min_threads;
+
 	u32 clientid_base;
 	u32 clientid_counter;
 	u32 clverifier_counter;
diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c
index 084fc517e9e1..7a58e54760be 100644
--- a/fs/nfsd/nfsctl.c
+++ b/fs/nfsd/nfsctl.c
@@ -1642,6 +1642,10 @@ int nfsd_nl_threads_set_doit(struct sk_buff *skb, struct genl_info *info)
 			scope = nla_data(attr);
 	}
 
+	attr = info->attrs[NFSD_A_SERVER_MIN_THREADS];
+	if (attr)
+		nn->min_threads = nla_get_u32(attr);
+
 	ret = nfsd_svc(nrpools, nthreads, net, get_current_cred(), scope);
 	if (ret > 0)
 		ret = 0;
@@ -1681,6 +1685,8 @@ int nfsd_nl_threads_get_doit(struct sk_buff *skb, struct genl_info *info)
 			  nn->nfsd4_grace) ||
 	      nla_put_u32(skb, NFSD_A_SERVER_LEASETIME,
 			  nn->nfsd4_lease) ||
+	      nla_put_u32(skb, NFSD_A_SERVER_MIN_THREADS,
+			  nn->min_threads) ||
 	      nla_put_string(skb, NFSD_A_SERVER_SCOPE,
 			  nn->nfsd_name);
 	if (err)
diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c
index 1e2570e3c754..0887ee601d3c 100644
--- a/fs/nfsd/nfssvc.c
+++ b/fs/nfsd/nfssvc.c
@@ -690,7 +690,7 @@ int nfsd_set_nrthreads(int n, int *nthreads, struct net *net)
 
 	/* Special case: When n == 1, distribute threads equally among pools. */
 	if (n == 1)
-		return svc_set_num_threads(nn->nfsd_serv, 0, nthreads[0]);
+		return svc_set_num_threads(nn->nfsd_serv, nn->min_threads, nthreads[0]);
 
 	if (n > nn->nfsd_serv->sv_nrpools)
 		n = nn->nfsd_serv->sv_nrpools;
@@ -718,7 +718,7 @@ int nfsd_set_nrthreads(int n, int *nthreads, struct net *net)
 	for (i = 0; i < n; i++) {
 		err = svc_set_pool_threads(nn->nfsd_serv,
 					   &nn->nfsd_serv->sv_pools[i],
-					   0, nthreads[i]);
+					   nn->min_threads, nthreads[i]);
 		if (err)
 			goto out;
 	}
diff --git a/fs/nfsd/trace.h b/fs/nfsd/trace.h
index 8885fd9bead9..d1d0b0dd0545 100644
--- a/fs/nfsd/trace.h
+++ b/fs/nfsd/trace.h
@@ -2164,6 +2164,25 @@ TRACE_EVENT(nfsd_ctl_maxblksize,
 	)
 );
 
+TRACE_EVENT(nfsd_ctl_minthreads,
+	TP_PROTO(
+		const struct net *net,
+		int minthreads
+	),
+	TP_ARGS(net, minthreads),
+	TP_STRUCT__entry(
+		__field(unsigned int, netns_ino)
+		__field(int, minthreads)
+	),
+	TP_fast_assign(
+		__entry->netns_ino = net->ns.inum;
+		__entry->minthreads = minthreads
+	),
+	TP_printk("minthreads=%d",
+		__entry->minthreads
+	)
+);
+
 TRACE_EVENT(nfsd_ctl_time,
 	TP_PROTO(
 		const struct net *net,
diff --git a/include/uapi/linux/nfsd_netlink.h b/include/uapi/linux/nfsd_netlink.h
index e157e2009ea8..e9efbc9e63d8 100644
--- a/include/uapi/linux/nfsd_netlink.h
+++ b/include/uapi/linux/nfsd_netlink.h
@@ -35,6 +35,7 @@ enum {
 	NFSD_A_SERVER_GRACETIME,
 	NFSD_A_SERVER_LEASETIME,
 	NFSD_A_SERVER_SCOPE,
+	NFSD_A_SERVER_MIN_THREADS,
 
 	__NFSD_A_SERVER_MAX,
 	NFSD_A_SERVER_MAX = (__NFSD_A_SERVER_MAX - 1)
-- 
cgit v1.2.3


From a006ed4ecd4905b69402980ad7d4e5f31bf44953 Mon Sep 17 00:00:00 2001
From: Eugenio Pérez <eperezma@redhat.com>
Date: Mon, 19 Jan 2026 15:32:55 +0100
Subject: vduse: add v1 API definition
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This allows the kernel to detect whether the userspace VDUSE device
supports the VQ group and ASID features.  VDUSE devices that don't set
the V1 API will not receive the new messages, and the vdpa device will
be created with only one vq group and ASID.

The next patches implement the new feature incrementally, only enabling
the VDUSE device to set the V1 API version by the end of the series.

Acked-by: Jason Wang <jasowang@redhat.com>
Reviewed-by: Xie Yongji <xieyongji@bytedance.com>
Signed-off-by: Eugenio Pérez <eperezma@redhat.com>
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
Message-Id: <20260119143306.1818855-3-eperezma@redhat.com>
---
 include/uapi/linux/vduse.h | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/vduse.h b/include/uapi/linux/vduse.h
index 10ad71aa00d6..ccb92a1efce0 100644
--- a/include/uapi/linux/vduse.h
+++ b/include/uapi/linux/vduse.h
@@ -10,6 +10,10 @@
 
 #define VDUSE_API_VERSION	0
 
+/* VQ groups and ASID support */
+
+#define VDUSE_API_VERSION_1	1
+
 /*
  * Get the version of VDUSE API that kernel supported (VDUSE_API_VERSION).
  * This is used for future extension.
-- 
cgit v1.2.3


From 9350a09afd086771b0612c7b7c9583e8a1568135 Mon Sep 17 00:00:00 2001
From: Eugenio Pérez <eperezma@redhat.com>
Date: Mon, 19 Jan 2026 15:32:56 +0100
Subject: vduse: add vq group support
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This allows separating the different virtqueues into groups that share
the same address space.  Ask the VDUSE device for the group of each vq
at the beginning, as they are needed for the DMA API.

Allocate 3 vq groups, as net is the device that needs the most groups:
* Dataplane (guest passthrough)
* CVQ
* Shadowed vrings.

Future versions of the series can include dynamic allocation of the
groups array so VDUSE can declare more groups.

Acked-by: Jason Wang <jasowang@redhat.com>
Reviewed-by: Xie Yongji <xieyongji@bytedance.com>
Signed-off-by: Eugenio Pérez <eperezma@redhat.com>
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
Message-Id: <20260119143306.1818855-4-eperezma@redhat.com>
---
 drivers/vdpa/vdpa_user/vduse_dev.c | 47 ++++++++++++++++++++++++++++++++++----
 include/uapi/linux/vduse.h         | 12 +++++++---
 2 files changed, 51 insertions(+), 8 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/drivers/vdpa/vdpa_user/vduse_dev.c b/drivers/vdpa/vdpa_user/vduse_dev.c
index ae357d014564..5bffc25a266e 100644
--- a/drivers/vdpa/vdpa_user/vduse_dev.c
+++ b/drivers/vdpa/vdpa_user/vduse_dev.c
@@ -39,6 +39,7 @@
 #define DRV_LICENSE  "GPL v2"
 
 #define VDUSE_DEV_MAX (1U << MINORBITS)
+#define VDUSE_DEV_MAX_GROUPS 0xffff
 #define VDUSE_MAX_BOUNCE_SIZE (1024 * 1024 * 1024)
 #define VDUSE_MIN_BOUNCE_SIZE (1024 * 1024)
 #define VDUSE_BOUNCE_SIZE (64 * 1024 * 1024)
@@ -58,6 +59,7 @@ struct vduse_virtqueue {
 	struct vdpa_vq_state state;
 	bool ready;
 	bool kicked;
+	u32 group;
 	spinlock_t kick_lock;
 	spinlock_t irq_lock;
 	struct eventfd_ctx *kickfd;
@@ -114,6 +116,7 @@ struct vduse_dev {
 	u8 status;
 	u32 vq_num;
 	u32 vq_align;
+	u32 ngroups;
 	struct vduse_umem *umem;
 	struct mutex mem_lock;
 	unsigned int bounce_size;
@@ -592,6 +595,16 @@ static int vduse_vdpa_set_vq_state(struct vdpa_device *vdpa, u16 idx,
 	return 0;
 }
 
+static u32 vduse_get_vq_group(struct vdpa_device *vdpa, u16 idx)
+{
+	struct vduse_dev *dev = vdpa_to_vduse(vdpa);
+
+	if (dev->api_version < VDUSE_API_VERSION_1)
+		return 0;
+
+	return dev->vqs[idx]->group;
+}
+
 static int vduse_vdpa_get_vq_state(struct vdpa_device *vdpa, u16 idx,
 				struct vdpa_vq_state *state)
 {
@@ -789,6 +802,7 @@ static const struct vdpa_config_ops vduse_vdpa_config_ops = {
 	.set_vq_cb		= vduse_vdpa_set_vq_cb,
 	.set_vq_num             = vduse_vdpa_set_vq_num,
 	.get_vq_size		= vduse_vdpa_get_vq_size,
+	.get_vq_group		= vduse_get_vq_group,
 	.set_vq_ready		= vduse_vdpa_set_vq_ready,
 	.get_vq_ready		= vduse_vdpa_get_vq_ready,
 	.set_vq_state		= vduse_vdpa_set_vq_state,
@@ -1252,12 +1266,24 @@ static long vduse_dev_ioctl(struct file *file, unsigned int cmd,
 		if (config.index >= dev->vq_num)
 			break;
 
-		if (!is_mem_zero((const char *)config.reserved,
-				 sizeof(config.reserved)))
+		if (dev->api_version < VDUSE_API_VERSION_1) {
+			if (config.group)
+				break;
+		} else {
+			if (config.group >= dev->ngroups)
+				break;
+			if (dev->status & VIRTIO_CONFIG_S_DRIVER_OK)
+				break;
+		}
+
+		if (config.reserved1 ||
+		    !is_mem_zero((const char *)config.reserved2,
+				 sizeof(config.reserved2)))
 			break;
 
 		index = array_index_nospec(config.index, dev->vq_num);
 		dev->vqs[index]->num_max = config.max_size;
+		dev->vqs[index]->group = config.group;
 		ret = 0;
 		break;
 	}
@@ -1737,12 +1763,20 @@ static bool features_is_valid(struct vduse_dev_config *config)
 	return true;
 }
 
-static bool vduse_validate_config(struct vduse_dev_config *config)
+static bool vduse_validate_config(struct vduse_dev_config *config,
+				  u64 api_version)
 {
 	if (!is_mem_zero((const char *)config->reserved,
 			 sizeof(config->reserved)))
 		return false;
 
+	if (api_version < VDUSE_API_VERSION_1 && config->ngroups)
+		return false;
+
+	if (api_version >= VDUSE_API_VERSION_1 &&
+	    (!config->ngroups || config->ngroups > VDUSE_DEV_MAX_GROUPS))
+		return false;
+
 	if (config->vq_align > PAGE_SIZE)
 		return false;
 
@@ -1858,6 +1892,9 @@ static int vduse_create_dev(struct vduse_dev_config *config,
 	dev->device_features = config->features;
 	dev->device_id = config->device_id;
 	dev->vendor_id = config->vendor_id;
+	dev->ngroups = (dev->api_version < VDUSE_API_VERSION_1)
+		       ? 1
+		       : config->ngroups;
 	dev->name = kstrdup(config->name, GFP_KERNEL);
 	if (!dev->name)
 		goto err_str;
@@ -1936,7 +1973,7 @@ static long vduse_ioctl(struct file *file, unsigned int cmd,
 			break;
 
 		ret = -EINVAL;
-		if (vduse_validate_config(&config) == false)
+		if (!vduse_validate_config(&config, control->api_version))
 			break;
 
 		buf = vmemdup_user(argp + size, config.config_size);
@@ -2017,7 +2054,7 @@ static int vduse_dev_init_vdpa(struct vduse_dev *dev, const char *name)
 
 	vdev = vdpa_alloc_device(struct vduse_vdpa, vdpa, dev->dev,
 				 &vduse_vdpa_config_ops, &vduse_map_ops,
-				 1, 1, name, true);
+				 dev->ngroups, 1, name, true);
 	if (IS_ERR(vdev))
 		return PTR_ERR(vdev);
 
diff --git a/include/uapi/linux/vduse.h b/include/uapi/linux/vduse.h
index ccb92a1efce0..a3d51cf6df3a 100644
--- a/include/uapi/linux/vduse.h
+++ b/include/uapi/linux/vduse.h
@@ -31,6 +31,7 @@
  * @features: virtio features
  * @vq_num: the number of virtqueues
  * @vq_align: the allocation alignment of virtqueue's metadata
+ * @ngroups: number of vq groups that VDUSE device declares
  * @reserved: for future use, needs to be initialized to zero
  * @config_size: the size of the configuration space
  * @config: the buffer of the configuration space
@@ -45,7 +46,8 @@ struct vduse_dev_config {
 	__u64 features;
 	__u32 vq_num;
 	__u32 vq_align;
-	__u32 reserved[13];
+	__u32 ngroups; /* if VDUSE_API_VERSION >= 1 */
+	__u32 reserved[12];
 	__u32 config_size;
 	__u8 config[];
 };
@@ -122,14 +124,18 @@ struct vduse_config_data {
  * struct vduse_vq_config - basic configuration of a virtqueue
  * @index: virtqueue index
  * @max_size: the max size of virtqueue
- * @reserved: for future use, needs to be initialized to zero
+ * @reserved1: for future use, needs to be initialized to zero
+ * @group: virtqueue group
+ * @reserved2: for future use, needs to be initialized to zero
  *
  * Structure used by VDUSE_VQ_SETUP ioctl to setup a virtqueue.
  */
 struct vduse_vq_config {
 	__u32 index;
 	__u16 max_size;
-	__u16 reserved[13];
+	__u16 reserved1;
+	__u32 group;
+	__u16 reserved2[10];
 };
 
 /*
-- 
cgit v1.2.3


From 079212f6877e5d07308c8998a8fbc7539ca3f8f3 Mon Sep 17 00:00:00 2001
From: Eugenio Pérez <eperezma@redhat.com>
Date: Mon, 19 Jan 2026 15:33:04 +0100
Subject: vduse: add vq group asid support
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Add support for assigning Address Space Identifiers (ASIDs) to each VQ
group.  This enables mapping each group into a distinct memory space.

The vq group to ASID association is now protected by a rwlock.  However,
the mutex domain_lock keeps protecting the domains of all ASIDs, as some
operations, like the ones related to the bounce buffer size, still
require locking all the ASIDs.

Acked-by: Jason Wang <jasowang@redhat.com>
Signed-off-by: Eugenio Pérez <eperezma@redhat.com>
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
Message-Id: <20260119143306.1818855-12-eperezma@redhat.com>
---
 drivers/vdpa/vdpa_user/vduse_dev.c | 385 ++++++++++++++++++++++++-------------
 include/uapi/linux/vduse.h         |  66 ++++++-
 2 files changed, 315 insertions(+), 136 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/drivers/vdpa/vdpa_user/vduse_dev.c b/drivers/vdpa/vdpa_user/vduse_dev.c
index d658f3e1cebf..2727c0c26003 100644
--- a/drivers/vdpa/vdpa_user/vduse_dev.c
+++ b/drivers/vdpa/vdpa_user/vduse_dev.c
@@ -9,6 +9,7 @@
  */
 
 #include "linux/virtio_net.h"
+#include <linux/cleanup.h>
 #include <linux/init.h>
 #include <linux/module.h>
 #include <linux/cdev.h>
@@ -41,6 +42,7 @@
 
 #define VDUSE_DEV_MAX (1U << MINORBITS)
 #define VDUSE_DEV_MAX_GROUPS 0xffff
+#define VDUSE_DEV_MAX_AS 0xffff
 #define VDUSE_MAX_BOUNCE_SIZE (1024 * 1024 * 1024)
 #define VDUSE_MIN_BOUNCE_SIZE (1024 * 1024)
 #define VDUSE_BOUNCE_SIZE (64 * 1024 * 1024)
@@ -86,7 +88,15 @@ struct vduse_umem {
 	struct mm_struct *mm;
 };
 
+struct vduse_as {
+	struct vduse_iova_domain *domain;
+	struct vduse_umem *umem;
+	struct mutex mem_lock;
+};
+
 struct vduse_vq_group {
+	rwlock_t as_lock;
+	struct vduse_as *as; /* Protected by as_lock */
 	struct vduse_dev *dev;
 };
 
@@ -94,7 +104,7 @@ struct vduse_dev {
 	struct vduse_vdpa *vdev;
 	struct device *dev;
 	struct vduse_virtqueue **vqs;
-	struct vduse_iova_domain *domain;
+	struct vduse_as *as;
 	char *name;
 	struct mutex lock;
 	spinlock_t msg_lock;
@@ -122,9 +132,8 @@ struct vduse_dev {
 	u32 vq_num;
 	u32 vq_align;
 	u32 ngroups;
-	struct vduse_umem *umem;
+	u32 nas;
 	struct vduse_vq_group *groups;
-	struct mutex mem_lock;
 	unsigned int bounce_size;
 	struct mutex domain_lock;
 };
@@ -314,7 +323,7 @@ static int vduse_dev_set_status(struct vduse_dev *dev, u8 status)
 	return vduse_dev_msg_sync(dev, &msg);
 }
 
-static int vduse_dev_update_iotlb(struct vduse_dev *dev,
+static int vduse_dev_update_iotlb(struct vduse_dev *dev, u32 asid,
 				  u64 start, u64 last)
 {
 	struct vduse_dev_msg msg = { 0 };
@@ -323,8 +332,14 @@ static int vduse_dev_update_iotlb(struct vduse_dev *dev,
 		return -EINVAL;
 
 	msg.req.type = VDUSE_UPDATE_IOTLB;
-	msg.req.iova.start = start;
-	msg.req.iova.last = last;
+	if (dev->api_version < VDUSE_API_VERSION_1) {
+		msg.req.iova.start = start;
+		msg.req.iova.last = last;
+	} else {
+		msg.req.iova_v2.start = start;
+		msg.req.iova_v2.last = last;
+		msg.req.iova_v2.asid = asid;
+	}
 
 	return vduse_dev_msg_sync(dev, &msg);
 }
@@ -439,11 +454,14 @@ static __poll_t vduse_dev_poll(struct file *file, poll_table *wait)
 static void vduse_dev_reset(struct vduse_dev *dev)
 {
 	int i;
-	struct vduse_iova_domain *domain = dev->domain;
 
 	/* The coherent mappings are handled in vduse_dev_free_coherent() */
-	if (domain && domain->bounce_map)
-		vduse_domain_reset_bounce_map(domain);
+	for (i = 0; i < dev->nas; i++) {
+		struct vduse_iova_domain *domain = dev->as[i].domain;
+
+		if (domain && domain->bounce_map)
+			vduse_domain_reset_bounce_map(domain);
+	}
 
 	down_write(&dev->rwsem);
 
@@ -622,6 +640,42 @@ static union virtio_map vduse_get_vq_map(struct vdpa_device *vdpa, u16 idx)
 	return ret;
 }
 
+DEFINE_GUARD(vq_group_as_read_lock, struct vduse_vq_group *,
+	if (_T->dev->nas > 1)
+		read_lock(&_T->as_lock),
+	if (_T->dev->nas > 1)
+		read_unlock(&_T->as_lock))
+
+DEFINE_GUARD(vq_group_as_write_lock, struct vduse_vq_group *,
+	if (_T->dev->nas > 1)
+		write_lock(&_T->as_lock),
+	if (_T->dev->nas > 1)
+		write_unlock(&_T->as_lock))
+
+static int vduse_set_group_asid(struct vdpa_device *vdpa, unsigned int group,
+				unsigned int asid)
+{
+	struct vduse_dev *dev = vdpa_to_vduse(vdpa);
+	struct vduse_dev_msg msg = { 0 };
+	int r;
+
+	if (dev->api_version < VDUSE_API_VERSION_1)
+		return -EINVAL;
+
+	msg.req.type = VDUSE_SET_VQ_GROUP_ASID;
+	msg.req.vq_group_asid.group = group;
+	msg.req.vq_group_asid.asid = asid;
+
+	r = vduse_dev_msg_sync(dev, &msg);
+	if (r < 0)
+		return r;
+
+	guard(vq_group_as_write_lock)(&dev->groups[group]);
+	dev->groups[group].as = &dev->as[asid];
+
+	return 0;
+}
+
 static int vduse_vdpa_get_vq_state(struct vdpa_device *vdpa, u16 idx,
 				struct vdpa_vq_state *state)
 {
@@ -793,13 +847,13 @@ static int vduse_vdpa_set_map(struct vdpa_device *vdpa,
 	struct vduse_dev *dev = vdpa_to_vduse(vdpa);
 	int ret;
 
-	ret = vduse_domain_set_map(dev->domain, iotlb);
+	ret = vduse_domain_set_map(dev->as[asid].domain, iotlb);
 	if (ret)
 		return ret;
 
-	ret = vduse_dev_update_iotlb(dev, 0ULL, ULLONG_MAX);
+	ret = vduse_dev_update_iotlb(dev, asid, 0ULL, ULLONG_MAX);
 	if (ret) {
-		vduse_domain_clear_map(dev->domain, iotlb);
+		vduse_domain_clear_map(dev->as[asid].domain, iotlb);
 		return ret;
 	}
 
@@ -842,6 +896,7 @@ static const struct vdpa_config_ops vduse_vdpa_config_ops = {
 	.get_vq_affinity	= vduse_vdpa_get_vq_affinity,
 	.reset			= vduse_vdpa_reset,
 	.set_map		= vduse_vdpa_set_map,
+	.set_group_asid		= vduse_set_group_asid,
 	.get_vq_map		= vduse_get_vq_map,
 	.free			= vduse_vdpa_free,
 };
@@ -850,15 +905,13 @@ static void vduse_dev_sync_single_for_device(union virtio_map token,
 					     dma_addr_t dma_addr, size_t size,
 					     enum dma_data_direction dir)
 {
-	struct vduse_dev *vdev;
 	struct vduse_iova_domain *domain;
 
 	if (!token.group)
 		return;
 
-	vdev = token.group->dev;
-	domain = vdev->domain;
-
+	guard(vq_group_as_read_lock)(token.group);
+	domain = token.group->as->domain;
 	vduse_domain_sync_single_for_device(domain, dma_addr, size, dir);
 }
 
@@ -866,15 +919,13 @@ static void vduse_dev_sync_single_for_cpu(union virtio_map token,
 					     dma_addr_t dma_addr, size_t size,
 					     enum dma_data_direction dir)
 {
-	struct vduse_dev *vdev;
 	struct vduse_iova_domain *domain;
 
 	if (!token.group)
 		return;
 
-	vdev = token.group->dev;
-	domain = vdev->domain;
-
+	guard(vq_group_as_read_lock)(token.group);
+	domain = token.group->as->domain;
 	vduse_domain_sync_single_for_cpu(domain, dma_addr, size, dir);
 }
 
@@ -883,15 +934,13 @@ static dma_addr_t vduse_dev_map_page(union virtio_map token, struct page *page,
 				     enum dma_data_direction dir,
 				     unsigned long attrs)
 {
-	struct vduse_dev *vdev;
 	struct vduse_iova_domain *domain;
 
 	if (!token.group)
 		return DMA_MAPPING_ERROR;
 
-	vdev = token.group->dev;
-	domain = vdev->domain;
-
+	guard(vq_group_as_read_lock)(token.group);
+	domain = token.group->as->domain;
 	return vduse_domain_map_page(domain, page, offset, size, dir, attrs);
 }
 
@@ -899,23 +948,19 @@ static void vduse_dev_unmap_page(union virtio_map token, dma_addr_t dma_addr,
 				 size_t size, enum dma_data_direction dir,
 				 unsigned long attrs)
 {
-	struct vduse_dev *vdev;
 	struct vduse_iova_domain *domain;
 
 	if (!token.group)
 		return;
 
-	vdev = token.group->dev;
-	domain = vdev->domain;
-
-	return vduse_domain_unmap_page(domain, dma_addr, size, dir, attrs);
+	guard(vq_group_as_read_lock)(token.group);
+	domain = token.group->as->domain;
+	vduse_domain_unmap_page(domain, dma_addr, size, dir, attrs);
 }
 
 static void *vduse_dev_alloc_coherent(union virtio_map token, size_t size,
 				      dma_addr_t *dma_addr, gfp_t flag)
 {
-	struct vduse_dev *vdev;
-	struct vduse_iova_domain *domain;
 	void *addr;
 
 	*dma_addr = DMA_MAPPING_ERROR;
@@ -926,11 +971,15 @@ static void *vduse_dev_alloc_coherent(union virtio_map token, size_t size,
 	if (!addr)
 		return NULL;
 
-	vdev = token.group->dev;
-	domain = vdev->domain;
-	*dma_addr = vduse_domain_alloc_coherent(domain, size, addr);
-	if (*dma_addr == DMA_MAPPING_ERROR)
-		goto err;
+	{
+		struct vduse_iova_domain *domain;
+
+		guard(vq_group_as_read_lock)(token.group);
+		domain = token.group->as->domain;
+		*dma_addr = vduse_domain_alloc_coherent(domain, size, addr);
+		if (*dma_addr == DMA_MAPPING_ERROR)
+			goto err;
+	}
 
 	return addr;
 
@@ -943,31 +992,27 @@ static void vduse_dev_free_coherent(union virtio_map token, size_t size,
 				    void *vaddr, dma_addr_t dma_addr,
 				    unsigned long attrs)
 {
-	struct vduse_dev *vdev;
-	struct vduse_iova_domain *domain;
-
 	if (!token.group)
 		return;
 
-	vdev = token.group->dev;
-	domain = vdev->domain;
+	{
+		struct vduse_iova_domain *domain;
+
+		guard(vq_group_as_read_lock)(token.group);
+		domain = token.group->as->domain;
+		vduse_domain_free_coherent(domain, size, dma_addr, attrs);
+	}
 
-	vduse_domain_free_coherent(domain, size, dma_addr, attrs);
 	free_pages_exact(vaddr, size);
 }
 
 static bool vduse_dev_need_sync(union virtio_map token, dma_addr_t dma_addr)
 {
-	struct vduse_dev *vdev;
-	struct vduse_iova_domain *domain;
-
 	if (!token.group)
 		return false;
 
-	vdev = token.group->dev;
-	domain = vdev->domain;
-
-	return dma_addr < domain->bounce_size;
+	guard(vq_group_as_read_lock)(token.group);
+	return dma_addr < token.group->as->domain->bounce_size;
 }
 
 static int vduse_dev_mapping_error(union virtio_map token, dma_addr_t dma_addr)
@@ -979,16 +1024,11 @@ static int vduse_dev_mapping_error(union virtio_map token, dma_addr_t dma_addr)
 
 static size_t vduse_dev_max_mapping_size(union virtio_map token)
 {
-	struct vduse_dev *vdev;
-	struct vduse_iova_domain *domain;
-
 	if (!token.group)
 		return 0;
 
-	vdev = token.group->dev;
-	domain = vdev->domain;
-
-	return domain->bounce_size;
+	guard(vq_group_as_read_lock)(token.group);
+	return token.group->as->domain->bounce_size;
 }
 
 static const struct virtio_map_ops vduse_map_ops = {
@@ -1128,39 +1168,40 @@ unlock:
 	return ret;
 }
 
-static int vduse_dev_dereg_umem(struct vduse_dev *dev,
+static int vduse_dev_dereg_umem(struct vduse_dev *dev, u32 asid,
 				u64 iova, u64 size)
 {
 	int ret;
 
-	mutex_lock(&dev->mem_lock);
+	mutex_lock(&dev->as[asid].mem_lock);
 	ret = -ENOENT;
-	if (!dev->umem)
+	if (!dev->as[asid].umem)
 		goto unlock;
 
 	ret = -EINVAL;
-	if (!dev->domain)
+	if (!dev->as[asid].domain)
 		goto unlock;
 
-	if (dev->umem->iova != iova || size != dev->domain->bounce_size)
+	if (dev->as[asid].umem->iova != iova ||
+	    size != dev->as[asid].domain->bounce_size)
 		goto unlock;
 
-	vduse_domain_remove_user_bounce_pages(dev->domain);
-	unpin_user_pages_dirty_lock(dev->umem->pages,
-				    dev->umem->npages, true);
-	atomic64_sub(dev->umem->npages, &dev->umem->mm->pinned_vm);
-	mmdrop(dev->umem->mm);
-	vfree(dev->umem->pages);
-	kfree(dev->umem);
-	dev->umem = NULL;
+	vduse_domain_remove_user_bounce_pages(dev->as[asid].domain);
+	unpin_user_pages_dirty_lock(dev->as[asid].umem->pages,
+				    dev->as[asid].umem->npages, true);
+	atomic64_sub(dev->as[asid].umem->npages, &dev->as[asid].umem->mm->pinned_vm);
+	mmdrop(dev->as[asid].umem->mm);
+	vfree(dev->as[asid].umem->pages);
+	kfree(dev->as[asid].umem);
+	dev->as[asid].umem = NULL;
 	ret = 0;
 unlock:
-	mutex_unlock(&dev->mem_lock);
+	mutex_unlock(&dev->as[asid].mem_lock);
 	return ret;
 }
 
 static int vduse_dev_reg_umem(struct vduse_dev *dev,
-			      u64 iova, u64 uaddr, u64 size)
+			      u32 asid, u64 iova, u64 uaddr, u64 size)
 {
 	struct page **page_list = NULL;
 	struct vduse_umem *umem = NULL;
@@ -1168,14 +1209,14 @@ static int vduse_dev_reg_umem(struct vduse_dev *dev,
 	unsigned long npages, lock_limit;
 	int ret;
 
-	if (!dev->domain || !dev->domain->bounce_map ||
-	    size != dev->domain->bounce_size ||
+	if (!dev->as[asid].domain || !dev->as[asid].domain->bounce_map ||
+	    size != dev->as[asid].domain->bounce_size ||
 	    iova != 0 || uaddr & ~PAGE_MASK)
 		return -EINVAL;
 
-	mutex_lock(&dev->mem_lock);
+	mutex_lock(&dev->as[asid].mem_lock);
 	ret = -EEXIST;
-	if (dev->umem)
+	if (dev->as[asid].umem)
 		goto unlock;
 
 	ret = -ENOMEM;
@@ -1199,7 +1240,7 @@ static int vduse_dev_reg_umem(struct vduse_dev *dev,
 		goto out;
 	}
 
-	ret = vduse_domain_add_user_bounce_pages(dev->domain,
+	ret = vduse_domain_add_user_bounce_pages(dev->as[asid].domain,
 						 page_list, pinned);
 	if (ret)
 		goto out;
@@ -1212,7 +1253,7 @@ static int vduse_dev_reg_umem(struct vduse_dev *dev,
 	umem->mm = current->mm;
 	mmgrab(current->mm);
 
-	dev->umem = umem;
+	dev->as[asid].umem = umem;
 out:
 	if (ret && pinned > 0)
 		unpin_user_pages(page_list, pinned);
@@ -1223,7 +1264,7 @@ unlock:
 		vfree(page_list);
 		kfree(umem);
 	}
-	mutex_unlock(&dev->mem_lock);
+	mutex_unlock(&dev->as[asid].mem_lock);
 	return ret;
 }
 
@@ -1244,44 +1285,47 @@ static void vduse_vq_update_effective_cpu(struct vduse_virtqueue *vq)
 }
 
 static int vduse_dev_iotlb_entry(struct vduse_dev *dev,
-				 struct vduse_iotlb_entry *entry,
+				 struct vduse_iotlb_entry_v2 *entry,
 				 struct file **f, uint64_t *capability)
 {
+	u32 asid;
 	int r = -EINVAL;
 	struct vhost_iotlb_map *map;
 
-	if (entry->start > entry->last)
+	if (entry->v1.start > entry->v1.last || entry->asid >= dev->nas)
 		return -EINVAL;
 
+	asid = array_index_nospec(entry->asid, dev->nas);
 	mutex_lock(&dev->domain_lock);
-	if (!dev->domain)
+
+	if (!dev->as[asid].domain)
 		goto out;
 
-	spin_lock(&dev->domain->iotlb_lock);
-	map = vhost_iotlb_itree_first(dev->domain->iotlb, entry->start,
-				      entry->last);
+	spin_lock(&dev->as[asid].domain->iotlb_lock);
+	map = vhost_iotlb_itree_first(dev->as[asid].domain->iotlb,
+				      entry->v1.start, entry->v1.last);
 	if (map) {
 		if (f) {
 			const struct vdpa_map_file *map_file;
 
 			map_file = (struct vdpa_map_file *)map->opaque;
-			entry->offset = map_file->offset;
+			entry->v1.offset = map_file->offset;
 			*f = get_file(map_file->file);
 		}
-		entry->start = map->start;
-		entry->last = map->last;
-		entry->perm = map->perm;
+		entry->v1.start = map->start;
+		entry->v1.last = map->last;
+		entry->v1.perm = map->perm;
 		if (capability) {
 			*capability = 0;
 
-			if (dev->domain->bounce_map && map->start == 0 &&
-			    map->last == dev->domain->bounce_size - 1)
+			if (dev->as[asid].domain->bounce_map && map->start == 0 &&
+			    map->last == dev->as[asid].domain->bounce_size - 1)
 				*capability |= VDUSE_IOVA_CAP_UMEM;
 		}
 
 		r = 0;
 	}
-	spin_unlock(&dev->domain->iotlb_lock);
+	spin_unlock(&dev->as[asid].domain->iotlb_lock);
 
 out:
 	mutex_unlock(&dev->domain_lock);
@@ -1299,12 +1343,29 @@ static long vduse_dev_ioctl(struct file *file, unsigned int cmd,
 		return -EPERM;
 
 	switch (cmd) {
-	case VDUSE_IOTLB_GET_FD: {
-		struct vduse_iotlb_entry entry;
+	case VDUSE_IOTLB_GET_FD:
+	case VDUSE_IOTLB_GET_FD2: {
+		struct vduse_iotlb_entry_v2 entry = {0};
 		struct file *f = NULL;
 
+		ret = -ENOIOCTLCMD;
+		if (dev->api_version < VDUSE_API_VERSION_1 &&
+		    cmd == VDUSE_IOTLB_GET_FD2)
+			break;
+
 		ret = -EFAULT;
-		if (copy_from_user(&entry, argp, sizeof(entry)))
+		if (cmd == VDUSE_IOTLB_GET_FD2) {
+			if (copy_from_user(&entry, argp, sizeof(entry)))
+				break;
+		} else {
+			if (copy_from_user(&entry.v1, argp,
+					   sizeof(entry.v1)))
+				break;
+		}
+
+		ret = -EINVAL;
+		if (!is_mem_zero((const char *)entry.reserved,
+				 sizeof(entry.reserved)))
 			break;
 
 		ret = vduse_dev_iotlb_entry(dev, &entry, &f, NULL);
@@ -1315,12 +1376,19 @@ static long vduse_dev_ioctl(struct file *file, unsigned int cmd,
 		if (!f)
 			break;
 
-		ret = -EFAULT;
-		if (copy_to_user(argp, &entry, sizeof(entry))) {
+		if (cmd == VDUSE_IOTLB_GET_FD2)
+			ret = copy_to_user(argp, &entry,
+					   sizeof(entry));
+		else
+			ret = copy_to_user(argp, &entry.v1,
+					   sizeof(entry.v1));
+
+		if (ret) {
+			ret = -EFAULT;
 			fput(f);
 			break;
 		}
-		ret = receive_fd(f, NULL, perm_to_file_flags(entry.perm));
+		ret = receive_fd(f, NULL, perm_to_file_flags(entry.v1.perm));
 		fput(f);
 		break;
 	}
@@ -1465,6 +1533,7 @@ static long vduse_dev_ioctl(struct file *file, unsigned int cmd,
 	}
 	case VDUSE_IOTLB_REG_UMEM: {
 		struct vduse_iova_umem umem;
+		u32 asid;
 
 		ret = -EFAULT;
 		if (copy_from_user(&umem, argp, sizeof(umem)))
@@ -1472,17 +1541,21 @@ static long vduse_dev_ioctl(struct file *file, unsigned int cmd,
 
 		ret = -EINVAL;
 		if (!is_mem_zero((const char *)umem.reserved,
-				 sizeof(umem.reserved)))
+				 sizeof(umem.reserved)) ||
+		    (dev->api_version < VDUSE_API_VERSION_1 &&
+		     umem.asid != 0) || umem.asid >= dev->nas)
 			break;
 
 		mutex_lock(&dev->domain_lock);
-		ret = vduse_dev_reg_umem(dev, umem.iova,
+		asid = array_index_nospec(umem.asid, dev->nas);
+		ret = vduse_dev_reg_umem(dev, asid, umem.iova,
 					 umem.uaddr, umem.size);
 		mutex_unlock(&dev->domain_lock);
 		break;
 	}
 	case VDUSE_IOTLB_DEREG_UMEM: {
 		struct vduse_iova_umem umem;
+		u32 asid;
 
 		ret = -EFAULT;
 		if (copy_from_user(&umem, argp, sizeof(umem)))
@@ -1490,17 +1563,22 @@ static long vduse_dev_ioctl(struct file *file, unsigned int cmd,
 
 		ret = -EINVAL;
 		if (!is_mem_zero((const char *)umem.reserved,
-				 sizeof(umem.reserved)))
+				 sizeof(umem.reserved)) ||
+		    (dev->api_version < VDUSE_API_VERSION_1 &&
+		     umem.asid != 0) ||
+		     umem.asid >= dev->nas)
 			break;
+
 		mutex_lock(&dev->domain_lock);
-		ret = vduse_dev_dereg_umem(dev, umem.iova,
+		asid = array_index_nospec(umem.asid, dev->nas);
+		ret = vduse_dev_dereg_umem(dev, asid, umem.iova,
 					   umem.size);
 		mutex_unlock(&dev->domain_lock);
 		break;
 	}
 	case VDUSE_IOTLB_GET_INFO: {
 		struct vduse_iova_info info;
-		struct vduse_iotlb_entry entry;
+		struct vduse_iotlb_entry_v2 entry;
 
 		ret = -EFAULT;
 		if (copy_from_user(&info, argp, sizeof(info)))
@@ -1510,15 +1588,23 @@ static long vduse_dev_ioctl(struct file *file, unsigned int cmd,
 				 sizeof(info.reserved)))
 			break;
 
-		entry.start = info.start;
-		entry.last = info.last;
+		if (dev->api_version < VDUSE_API_VERSION_1) {
+			if (info.asid)
+				break;
+		} else if (info.asid >= dev->nas)
+			break;
+
+		entry.v1.start = info.start;
+		entry.v1.last = info.last;
+		entry.asid = info.asid;
 		ret = vduse_dev_iotlb_entry(dev, &entry, NULL,
 					    &info.capability);
 		if (ret < 0)
 			break;
 
-		info.start = entry.start;
-		info.last = entry.last;
+		info.start = entry.v1.start;
+		info.last = entry.v1.last;
+		info.asid = entry.asid;
 
 		ret = -EFAULT;
 		if (copy_to_user(argp, &info, sizeof(info)))
@@ -1540,8 +1626,10 @@ static int vduse_dev_release(struct inode *inode, struct file *file)
 	struct vduse_dev *dev = file->private_data;
 
 	mutex_lock(&dev->domain_lock);
-	if (dev->domain)
-		vduse_dev_dereg_umem(dev, 0, dev->domain->bounce_size);
+	for (int i = 0; i < dev->nas; i++)
+		if (dev->as[i].domain)
+			vduse_dev_dereg_umem(dev, i, 0,
+					     dev->as[i].domain->bounce_size);
 	mutex_unlock(&dev->domain_lock);
 	spin_lock(&dev->msg_lock);
 	/* Make sure the inflight messages can processed after reconncection */
@@ -1760,7 +1848,6 @@ static struct vduse_dev *vduse_dev_create(void)
 		return NULL;
 
 	mutex_init(&dev->lock);
-	mutex_init(&dev->mem_lock);
 	mutex_init(&dev->domain_lock);
 	spin_lock_init(&dev->msg_lock);
 	INIT_LIST_HEAD(&dev->send_list);
@@ -1811,8 +1898,11 @@ static int vduse_destroy_dev(char *name)
 	idr_remove(&vduse_idr, dev->minor);
 	kvfree(dev->config);
 	vduse_dev_deinit_vqs(dev);
-	if (dev->domain)
-		vduse_domain_destroy(dev->domain);
+	for (int i = 0; i < dev->nas; i++) {
+		if (dev->as[i].domain)
+			vduse_domain_destroy(dev->as[i].domain);
+	}
+	kfree(dev->as);
 	kfree(dev->name);
 	kfree(dev->groups);
 	vduse_dev_destroy(dev);
@@ -1859,12 +1949,17 @@ static bool vduse_validate_config(struct vduse_dev_config *config,
 			 sizeof(config->reserved)))
 		return false;
 
-	if (api_version < VDUSE_API_VERSION_1 && config->ngroups)
+	if (api_version < VDUSE_API_VERSION_1 &&
+	    (config->ngroups || config->nas))
 		return false;
 
-	if (api_version >= VDUSE_API_VERSION_1 &&
-	    (!config->ngroups || config->ngroups > VDUSE_DEV_MAX_GROUPS))
-		return false;
+	if (api_version >= VDUSE_API_VERSION_1) {
+		if (!config->ngroups || config->ngroups > VDUSE_DEV_MAX_GROUPS)
+			return false;
+
+		if (!config->nas || config->nas > VDUSE_DEV_MAX_AS)
+			return false;
+	}
 
 	if (config->vq_align > PAGE_SIZE)
 		return false;
@@ -1929,7 +2024,8 @@ static ssize_t bounce_size_store(struct device *device,
 
 	ret = -EPERM;
 	mutex_lock(&dev->domain_lock);
-	if (dev->domain)
+	/* Assuming that if the first domain is allocated, all are allocated */
+	if (dev->as[0].domain)
 		goto unlock;
 
 	ret = kstrtouint(buf, 10, &bounce_size);
@@ -1981,6 +2077,14 @@ static int vduse_create_dev(struct vduse_dev_config *config,
 	dev->device_features = config->features;
 	dev->device_id = config->device_id;
 	dev->vendor_id = config->vendor_id;
+
+	dev->nas = (dev->api_version < VDUSE_API_VERSION_1) ? 1 : config->nas;
+	dev->as = kcalloc(dev->nas, sizeof(dev->as[0]), GFP_KERNEL);
+	if (!dev->as)
+		goto err_as;
+	for (int i = 0; i < dev->nas; i++)
+		mutex_init(&dev->as[i].mem_lock);
+
 	dev->ngroups = (dev->api_version < VDUSE_API_VERSION_1)
 		       ? 1
 		       : config->ngroups;
@@ -1988,8 +2092,11 @@ static int vduse_create_dev(struct vduse_dev_config *config,
 			      GFP_KERNEL);
 	if (!dev->groups)
 		goto err_vq_groups;
-	for (u32 i = 0; i < dev->ngroups; ++i)
+	for (u32 i = 0; i < dev->ngroups; ++i) {
 		dev->groups[i].dev = dev;
+		rwlock_init(&dev->groups[i].as_lock);
+		dev->groups[i].as = &dev->as[0];
+	}
 
 	dev->name = kstrdup(config->name, GFP_KERNEL);
 	if (!dev->name)
@@ -2029,6 +2136,8 @@ err_idr:
 err_str:
 	kfree(dev->groups);
 err_vq_groups:
+	kfree(dev->as);
+err_as:
 	vduse_dev_destroy(dev);
 err:
 	return ret;
@@ -2152,7 +2261,7 @@ static int vduse_dev_init_vdpa(struct vduse_dev *dev, const char *name)
 
 	vdev = vdpa_alloc_device(struct vduse_vdpa, vdpa, dev->dev,
 				 &vduse_vdpa_config_ops, &vduse_map_ops,
-				 dev->ngroups, 1, name, true);
+				 dev->ngroups, dev->nas, name, true);
 	if (IS_ERR(vdev))
 		return PTR_ERR(vdev);
 
@@ -2167,7 +2276,8 @@ static int vdpa_dev_add(struct vdpa_mgmt_dev *mdev, const char *name,
 			const struct vdpa_dev_set_config *config)
 {
 	struct vduse_dev *dev;
-	int ret;
+	size_t domain_bounce_size;
+	int ret, i;
 
 	mutex_lock(&vduse_lock);
 	dev = vduse_find_dev(name);
@@ -2181,29 +2291,38 @@ static int vdpa_dev_add(struct vdpa_mgmt_dev *mdev, const char *name,
 		return ret;
 
 	mutex_lock(&dev->domain_lock);
-	if (!dev->domain)
-		dev->domain = vduse_domain_create(VDUSE_IOVA_SIZE - 1,
-						  dev->bounce_size);
-	mutex_unlock(&dev->domain_lock);
-	if (!dev->domain) {
-		ret = -ENOMEM;
-		goto domain_err;
+	ret = 0;
+
+	domain_bounce_size = dev->bounce_size / dev->nas;
+	for (i = 0; i < dev->nas; ++i) {
+		dev->as[i].domain = vduse_domain_create(VDUSE_IOVA_SIZE - 1,
+							domain_bounce_size);
+		if (!dev->as[i].domain) {
+			ret = -ENOMEM;
+			goto err;
+		}
 	}
 
+	mutex_unlock(&dev->domain_lock);
+
 	ret = _vdpa_register_device(&dev->vdev->vdpa, dev->vq_num);
-	if (ret) {
-		goto register_err;
-	}
+	if (ret)
+		goto err_register;
 
 	return 0;
 
-register_err:
+err_register:
 	mutex_lock(&dev->domain_lock);
-	vduse_domain_destroy(dev->domain);
-	dev->domain = NULL;
+
+err:
+	for (int j = 0; j < i; j++) {
+		if (dev->as[j].domain) {
+			vduse_domain_destroy(dev->as[j].domain);
+			dev->as[j].domain = NULL;
+		}
+	}
 	mutex_unlock(&dev->domain_lock);
 
-domain_err:
 	put_device(&dev->vdev->vdpa.dev);
 
 	return ret;
diff --git a/include/uapi/linux/vduse.h b/include/uapi/linux/vduse.h
index a3d51cf6df3a..68b4287f9fac 100644
--- a/include/uapi/linux/vduse.h
+++ b/include/uapi/linux/vduse.h
@@ -32,6 +32,7 @@
  * @vq_num: the number of virtqueues
  * @vq_align: the allocation alignment of virtqueue's metadata
  * @ngroups: number of vq groups that VDUSE device declares
+ * @nas: number of address spaces that VDUSE device declares
  * @reserved: for future use, needs to be initialized to zero
  * @config_size: the size of the configuration space
  * @config: the buffer of the configuration space
@@ -47,7 +48,8 @@ struct vduse_dev_config {
 	__u32 vq_num;
 	__u32 vq_align;
 	__u32 ngroups; /* if VDUSE_API_VERSION >= 1 */
-	__u32 reserved[12];
+	__u32 nas; /* if VDUSE_API_VERSION >= 1 */
+	__u32 reserved[11];
 	__u32 config_size;
 	__u8 config[];
 };
@@ -166,6 +168,16 @@ struct vduse_vq_state_packed {
 	__u16 last_used_idx;
 };
 
+/**
+ * struct vduse_vq_group_asid - virtqueue group ASID
+ * @group: Index of the virtqueue group
+ * @asid: Address space ID of the group
+ */
+struct vduse_vq_group_asid {
+	__u32 group;
+	__u32 asid;
+};
+
 /**
  * struct vduse_vq_info - information of a virtqueue
  * @index: virtqueue index
@@ -225,6 +237,7 @@ struct vduse_vq_eventfd {
  * @uaddr: start address of userspace memory, it must be aligned to page size
  * @iova: start of the IOVA region
  * @size: size of the IOVA region
+ * @asid: Address space ID of the IOVA region
  * @reserved: for future use, needs to be initialized to zero
  *
  * Structure used by VDUSE_IOTLB_REG_UMEM and VDUSE_IOTLB_DEREG_UMEM
@@ -234,7 +247,8 @@ struct vduse_iova_umem {
 	__u64 uaddr;
 	__u64 iova;
 	__u64 size;
-	__u64 reserved[3];
+	__u32 asid;
+	__u32 reserved[5];
 };
 
 /* Register userspace memory for IOVA regions */
@@ -248,6 +262,7 @@ struct vduse_iova_umem {
  * @start: start of the IOVA region
  * @last: last of the IOVA region
  * @capability: capability of the IOVA region
+ * @asid: Address space ID of the IOVA region, only if device API version >= 1
  * @reserved: for future use, needs to be initialized to zero
  *
  * Structure used by VDUSE_IOTLB_GET_INFO ioctl to get information of
@@ -258,7 +273,8 @@ struct vduse_iova_info {
 	__u64 last;
 #define VDUSE_IOVA_CAP_UMEM (1 << 0)
 	__u64 capability;
-	__u64 reserved[3];
+	__u32 asid; /* Only if device API version >= 1 */
+	__u32 reserved[5];
 };
 
 /*
@@ -267,6 +283,28 @@ struct vduse_iova_info {
  */
 #define VDUSE_IOTLB_GET_INFO	_IOWR(VDUSE_BASE, 0x1a, struct vduse_iova_info)
 
+/**
+ * struct vduse_iotlb_entry_v2 - entry of IOTLB to describe one IOVA region
+ *
+ * @v1: the original vduse_iotlb_entry
+ * @asid: address space ID of the IOVA region
+ * @reserved: for future use, needs to be initialized to zero
+ *
+ * Structure used by VDUSE_IOTLB_GET_FD2 ioctl to find an overlapped IOVA region.
+ */
+struct vduse_iotlb_entry_v2 {
+	struct vduse_iotlb_entry v1;
+	__u32 asid;
+	__u32 reserved[12];
+};
+
+/*
+ * Same as VDUSE_IOTLB_GET_FD but with a vduse_iotlb_entry_v2 argument that
+ * supports extra fields.
+ */
+#define VDUSE_IOTLB_GET_FD2	_IOWR(VDUSE_BASE, 0x1b, struct vduse_iotlb_entry_v2)
+
+
 /* The control messages definition for read(2)/write(2) on /dev/vduse/$NAME */
 
 /**
@@ -275,11 +313,14 @@ struct vduse_iova_info {
  * @VDUSE_SET_STATUS: set the device status
  * @VDUSE_UPDATE_IOTLB: Notify userspace to update the memory mapping for
  *                      specified IOVA range via VDUSE_IOTLB_GET_FD ioctl
+ * @VDUSE_SET_VQ_GROUP_ASID: Notify userspace to update the address space of a
+ *                           virtqueue group.
  */
 enum vduse_req_type {
 	VDUSE_GET_VQ_STATE,
 	VDUSE_SET_STATUS,
 	VDUSE_UPDATE_IOTLB,
+	VDUSE_SET_VQ_GROUP_ASID,
 };
 
 /**
@@ -314,6 +355,18 @@ struct vduse_iova_range {
 	__u64 last;
 };
 
+/**
+ * struct vduse_iova_range_v2 - IOVA range [start, last] if API_VERSION >= 1
+ * @start: start of the IOVA range
+ * @last: last of the IOVA range
+ * @asid: address space ID of the IOVA range
+ */
+struct vduse_iova_range_v2 {
+	__u64 start;
+	__u64 last;
+	__u32 asid;
+};
+
 /**
  * struct vduse_dev_request - control request
  * @type: request type
@@ -322,6 +375,8 @@ struct vduse_iova_range {
  * @vq_state: virtqueue state, only index field is available
  * @s: device status
  * @iova: IOVA range for updating
+ * @iova_v2: IOVA range for updating if API_VERSION >= 1
+ * @vq_group_asid: ASID of a virtqueue group
  * @padding: padding
  *
  * Structure used by read(2) on /dev/vduse/$NAME.
@@ -334,6 +389,11 @@ struct vduse_dev_request {
 		struct vduse_vq_state vq_state;
 		struct vduse_dev_status s;
 		struct vduse_iova_range iova;
+		/* The following members, except padding, exist only if the
+		 * VDUSE API version >= 1
+		 */
+		struct vduse_iova_range_v2 iova_v2;
+		struct vduse_vq_group_asid vq_group_asid;
 		__u32 padding[32];
 	};
 };
-- 
cgit v1.2.3


From 5ca243f6e3c30b979a54a96b96df355dda2b4d0f Mon Sep 17 00:00:00 2001
From: Deepak Gupta <debug@rivosinc.com>
Date: Sun, 25 Jan 2026 21:09:54 -0700
Subject: prctl: add arch-agnostic prctl()s for indirect branch tracking

Three architectures (x86, aarch64, riscv) have support for indirect
branch tracking feature in a very similar fashion. On a very high
level, indirect branch tracking is a CPU feature where CPU tracks
branches which use a memory operand to transfer control. As part of
this tracking, during an indirect branch, the CPU expects a landing
pad instruction on the target PC, and if not found, the CPU raises
some fault (architecture-dependent).

x86 landing pad instr - 'ENDBRANCH'
aarch64 landing pad instr - 'BTI'
riscv landing pad instr - 'lpad'

Given that three major architectures have support for indirect branch
tracking, this patch creates architecture-agnostic 'prctls' to allow
userspace to control this feature.  They are:
 - PR_GET_INDIR_BR_LP_STATUS: Get the current configured status for indirect
   branch tracking.
 - PR_SET_INDIR_BR_LP_STATUS: Set the configuration for indirect branch
   tracking.
   The following status options are allowed:
       - PR_INDIR_BR_LP_ENABLE: Enables indirect branch tracking on user
         thread.
       - PR_INDIR_BR_LP_DISABLE: Disables indirect branch tracking on user
         thread.
 - PR_LOCK_INDIR_BR_LP_STATUS: Locks configured status for indirect branch
   tracking for user thread.

Reviewed-by: Mark Brown <broonie@kernel.org>
Reviewed-by: Zong Li <zong.li@sifive.com>
Signed-off-by: Deepak Gupta <debug@rivosinc.com>
Tested-by: Andreas Korb <andreas.korb@aisec.fraunhofer.de> # QEMU, custom CVA6
Tested-by: Valentin Haudiquet <valentin.haudiquet@canonical.com>
Link: https://patch.msgid.link/20251112-v5_user_cfi_series-v23-13-b55691eacf4f@rivosinc.com
[pjw@kernel.org: cleaned up patch description, code comments]
Signed-off-by: Paul Walmsley <pjw@kernel.org>
---
 include/linux/cpu.h        |  4 ++++
 include/uapi/linux/prctl.h | 27 +++++++++++++++++++++++++++
 kernel/sys.c               | 30 ++++++++++++++++++++++++++++++
 3 files changed, 61 insertions(+)

(limited to 'include/uapi/linux')

diff --git a/include/linux/cpu.h b/include/linux/cpu.h
index 487b3bf2e1ea..8239cd95a005 100644
--- a/include/linux/cpu.h
+++ b/include/linux/cpu.h
@@ -229,4 +229,8 @@ static inline bool cpu_attack_vector_mitigated(enum cpu_attack_vectors v)
 #define smt_mitigations SMT_MITIGATIONS_OFF
 #endif
 
+int arch_get_indir_br_lp_status(struct task_struct *t, unsigned long __user *status);
+int arch_set_indir_br_lp_status(struct task_struct *t, unsigned long status);
+int arch_lock_indir_br_lp_status(struct task_struct *t, unsigned long status);
+
 #endif /* _LINUX_CPU_H_ */
diff --git a/include/uapi/linux/prctl.h b/include/uapi/linux/prctl.h
index 51c4e8c82b1e..f57098fb0ba8 100644
--- a/include/uapi/linux/prctl.h
+++ b/include/uapi/linux/prctl.h
@@ -386,4 +386,31 @@ struct prctl_mm_map {
 # define PR_FUTEX_HASH_SET_SLOTS	1
 # define PR_FUTEX_HASH_GET_SLOTS	2
 
+/*
+ * Get the current indirect branch tracking configuration for the current
+ * thread. This will be the value configured via PR_SET_INDIR_BR_LP_STATUS.
+ */
+#define PR_GET_INDIR_BR_LP_STATUS      79
+
+/*
+ * Set the indirect branch tracking configuration. PR_INDIR_BR_LP_ENABLE will
+ * enable cpu feature for user thread, to track all indirect branches and ensure
+ * they land on arch defined landing pad instruction.
+ * x86 - If enabled, an indirect branch must land on an ENDBRANCH instruction.
+ * aarch64 - If enabled, an indirect branch must land on a BTI instruction.
+ * riscv - If enabled, an indirect branch must land on an lpad instruction.
+ * PR_INDIR_BR_LP_DISABLE will disable the feature for the user thread, and
+ * indirect branches will no longer be tracked by the cpu to land on an arch
+ * defined landing pad instruction.
+ */
+#define PR_SET_INDIR_BR_LP_STATUS      80
+# define PR_INDIR_BR_LP_ENABLE		   (1UL << 0)
+
+/*
+ * Prevent further changes to the specified indirect branch tracking
+ * configuration.  All bits may be locked via this call, including
+ * undefined bits.
+ */
+#define PR_LOCK_INDIR_BR_LP_STATUS      81
+
 #endif /* _LINUX_PRCTL_H */
diff --git a/kernel/sys.c b/kernel/sys.c
index 8b58eece4e58..9071422c1609 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -2388,6 +2388,21 @@ int __weak arch_lock_shadow_stack_status(struct task_struct *t, unsigned long st
 	return -EINVAL;
 }
 
+int __weak arch_get_indir_br_lp_status(struct task_struct *t, unsigned long __user *status)
+{
+	return -EINVAL;
+}
+
+int __weak arch_set_indir_br_lp_status(struct task_struct *t, unsigned long status)
+{
+	return -EINVAL;
+}
+
+int __weak arch_lock_indir_br_lp_status(struct task_struct *t, unsigned long status)
+{
+	return -EINVAL;
+}
+
 #define PR_IO_FLUSHER (PF_MEMALLOC_NOIO | PF_LOCAL_THROTTLE)
 
 static int prctl_set_vma(unsigned long opt, unsigned long addr,
@@ -2868,6 +2883,21 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
 	case PR_FUTEX_HASH:
 		error = futex_hash_prctl(arg2, arg3, arg4);
 		break;
+	case PR_GET_INDIR_BR_LP_STATUS:
+		if (arg3 || arg4 || arg5)
+			return -EINVAL;
+		error = arch_get_indir_br_lp_status(me, (unsigned long __user *)arg2);
+		break;
+	case PR_SET_INDIR_BR_LP_STATUS:
+		if (arg3 || arg4 || arg5)
+			return -EINVAL;
+		error = arch_set_indir_br_lp_status(me, arg2);
+		break;
+	case PR_LOCK_INDIR_BR_LP_STATUS:
+		if (arg3 || arg4 || arg5)
+			return -EINVAL;
+		error = arch_lock_indir_br_lp_status(me, arg2);
+		break;
 	default:
 		trace_task_prctl_unknown(option, arg2, arg3, arg4, arg5);
 		error = -EINVAL;
-- 
cgit v1.2.3


From 2af7c9cf021c5dabe880b68e5cc22c618060d954 Mon Sep 17 00:00:00 2001
From: Deepak Gupta <debug@rivosinc.com>
Date: Sun, 25 Jan 2026 21:09:55 -0700
Subject: riscv/ptrace: expose riscv CFI status and state via ptrace and in
 core files

Expose a new register type NT_RISCV_USER_CFI for risc-v CFI status and
state. Intentionally, both landing pad and shadow stack status and
state are rolled into the CFI state. Creating two different
NT_RISCV_USER_XXX would not be useful and would waste a note
type. Enabling, disabling and locking the CFI feature is not allowed
via ptrace set interface. However, setting 'elp' state or setting
shadow stack pointer are allowed via the ptrace set interface.  It is
expected that 'gdb' might need to fixup 'elp' state or 'shadow stack'
pointer.

Signed-off-by: Deepak Gupta <debug@rivosinc.com>
Tested-by: Andreas Korb <andreas.korb@aisec.fraunhofer.de> # QEMU, custom CVA6
Tested-by: Valentin Haudiquet <valentin.haudiquet@canonical.com>
Link: https://patch.msgid.link/20251112-v5_user_cfi_series-v23-19-b55691eacf4f@rivosinc.com
[pjw@kernel.org: updated to apply; cleaned patch description and comments; addressed checkpatch issues]
Signed-off-by: Paul Walmsley <pjw@kernel.org>
---
 arch/riscv/include/uapi/asm/ptrace.h | 30 ++++++++++++
 arch/riscv/kernel/ptrace.c           | 95 ++++++++++++++++++++++++++++++++++++
 include/uapi/linux/elf.h             |  2 +
 3 files changed, 127 insertions(+)

(limited to 'include/uapi/linux')

diff --git a/arch/riscv/include/uapi/asm/ptrace.h b/arch/riscv/include/uapi/asm/ptrace.h
index 261bfe70f60a..18988a5f1a63 100644
--- a/arch/riscv/include/uapi/asm/ptrace.h
+++ b/arch/riscv/include/uapi/asm/ptrace.h
@@ -131,6 +131,36 @@ struct __sc_riscv_cfi_state {
 	unsigned long ss_ptr;   /* shadow stack pointer */
 };
 
+#define PTRACE_CFI_LP_EN_BIT	0
+#define PTRACE_CFI_LP_LOCK_BIT	1
+#define PTRACE_CFI_ELP_BIT	2
+#define PTRACE_CFI_SS_EN_BIT	3
+#define PTRACE_CFI_SS_LOCK_BIT	4
+#define PTRACE_CFI_SS_PTR_BIT	5
+
+#define PTRACE_CFI_LP_EN_STATE		BIT(PTRACE_CFI_LP_EN_BIT)
+#define PTRACE_CFI_LP_LOCK_STATE	BIT(PTRACE_CFI_LP_LOCK_BIT)
+#define PTRACE_CFI_ELP_STATE		BIT(PTRACE_CFI_ELP_BIT)
+#define PTRACE_CFI_SS_EN_STATE		BIT(PTRACE_CFI_SS_EN_BIT)
+#define PTRACE_CFI_SS_LOCK_STATE	BIT(PTRACE_CFI_SS_LOCK_BIT)
+#define PTRACE_CFI_SS_PTR_STATE		BIT(PTRACE_CFI_SS_PTR_BIT)
+
+#define PRACE_CFI_STATE_INVALID_MASK	~(PTRACE_CFI_LP_EN_STATE | \
+					  PTRACE_CFI_LP_LOCK_STATE | \
+					  PTRACE_CFI_ELP_STATE | \
+					  PTRACE_CFI_SS_EN_STATE | \
+					  PTRACE_CFI_SS_LOCK_STATE | \
+					  PTRACE_CFI_SS_PTR_STATE)
+
+struct __cfi_status {
+	__u64 cfi_state;
+};
+
+struct user_cfi_state {
+	struct __cfi_status	cfi_status;
+	__u64 shstk_ptr;
+};
+
 #endif /* __ASSEMBLER__ */
 
 #endif /* _UAPI_ASM_RISCV_PTRACE_H */
diff --git a/arch/riscv/kernel/ptrace.c b/arch/riscv/kernel/ptrace.c
index e6272d74572f..57e257d459e8 100644
--- a/arch/riscv/kernel/ptrace.c
+++ b/arch/riscv/kernel/ptrace.c
@@ -19,6 +19,7 @@
 #include <linux/regset.h>
 #include <linux/sched.h>
 #include <linux/sched/task_stack.h>
+#include <asm/usercfi.h>
 
 enum riscv_regset {
 	REGSET_X,
@@ -31,6 +32,9 @@ enum riscv_regset {
 #ifdef CONFIG_RISCV_ISA_SUPM
 	REGSET_TAGGED_ADDR_CTRL,
 #endif
+#ifdef CONFIG_RISCV_USER_CFI
+	REGSET_CFI,
+#endif
 };
 
 static int riscv_gpr_get(struct task_struct *target,
@@ -195,6 +199,87 @@ static int tagged_addr_ctrl_set(struct task_struct *target,
 }
 #endif
 
+#ifdef CONFIG_RISCV_USER_CFI
+static int riscv_cfi_get(struct task_struct *target,
+			 const struct user_regset *regset,
+			 struct membuf to)
+{
+	struct user_cfi_state user_cfi;
+	struct pt_regs *regs;
+
+	memset(&user_cfi, 0, sizeof(user_cfi));
+	regs = task_pt_regs(target);
+
+	if (is_indir_lp_enabled(target)) {
+		user_cfi.cfi_status.cfi_state |= PTRACE_CFI_LP_EN_STATE;
+		user_cfi.cfi_status.cfi_state |= is_indir_lp_locked(target) ?
+						 PTRACE_CFI_LP_LOCK_STATE : 0;
+		user_cfi.cfi_status.cfi_state |= (regs->status & SR_ELP) ?
+						PTRACE_CFI_ELP_STATE : 0;
+	}
+
+	if (is_shstk_enabled(target)) {
+		user_cfi.cfi_status.cfi_state |= (PTRACE_CFI_SS_EN_STATE |
+						  PTRACE_CFI_SS_PTR_STATE);
+		user_cfi.cfi_status.cfi_state |= is_shstk_locked(target) ?
+						 PTRACE_CFI_SS_LOCK_STATE : 0;
+		user_cfi.shstk_ptr = get_active_shstk(target);
+	}
+
+	return membuf_write(&to, &user_cfi, sizeof(user_cfi));
+}
+
+/*
+ * Does it make sense to allow enable / disable of cfi via ptrace?
+ * We don't allow enable / disable / locking control via ptrace for now.
+ * Setting the shadow stack pointer is allowed. GDB might use it to unwind or
+ * some other fixup. Similarly gdb might want to suppress elp and may want
+ * to reset elp state.
+ */
+static int riscv_cfi_set(struct task_struct *target,
+			 const struct user_regset *regset,
+			 unsigned int pos, unsigned int count,
+			 const void *kbuf, const void __user *ubuf)
+{
+	int ret;
+	struct user_cfi_state user_cfi;
+	struct pt_regs *regs;
+
+	regs = task_pt_regs(target);
+
+	ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, &user_cfi, 0, -1);
+	if (ret)
+		return ret;
+
+	/*
+	 * Enabling or locking shadow stack or landing pad is not allowed, and
+	 * there is no disabling of shadow stack or landing pad via ptrace.
+	 * Reserved bits must be zero so that they can be defined in the future.
+	 */
+	if ((user_cfi.cfi_status.cfi_state &
+	     (PTRACE_CFI_LP_EN_STATE | PTRACE_CFI_LP_LOCK_STATE |
+	      PTRACE_CFI_SS_EN_STATE | PTRACE_CFI_SS_LOCK_STATE)) ||
+	     (user_cfi.cfi_status.cfi_state & PRACE_CFI_STATE_INVALID_MASK))
+		return -EINVAL;
+
+	/* If lpad is enabled on target and ptrace requests to set / clear elp, do that */
+	if (is_indir_lp_enabled(target)) {
+		if (user_cfi.cfi_status.cfi_state &
+		    PTRACE_CFI_ELP_STATE) /* set elp state */
+			regs->status |= SR_ELP;
+		else
+			regs->status &= ~SR_ELP; /* clear elp state */
+	}
+
+	/* If shadow stack enabled on target, set new shadow stack pointer */
+	if (is_shstk_enabled(target) &&
+	    (user_cfi.cfi_status.cfi_state & PTRACE_CFI_SS_PTR_STATE))
+		set_active_shstk(target, user_cfi.shstk_ptr);
+
+	return 0;
+}
+#endif
+
 static struct user_regset riscv_user_regset[] __ro_after_init = {
 	[REGSET_X] = {
 		USER_REGSET_NOTE_TYPE(PRSTATUS),
@@ -234,6 +319,16 @@ static struct user_regset riscv_user_regset[] __ro_after_init = {
 		.set = tagged_addr_ctrl_set,
 	},
 #endif
+#ifdef CONFIG_RISCV_USER_CFI
+	[REGSET_CFI] = {
+		.core_note_type = NT_RISCV_USER_CFI,
+		.align = sizeof(__u64),
+		.n = sizeof(struct user_cfi_state) / sizeof(__u64),
+		.size = sizeof(__u64),
+		.regset_get = riscv_cfi_get,
+		.set = riscv_cfi_set,
+	},
+#endif
 };
 
 static const struct user_regset_view riscv_user_native_view = {
diff --git a/include/uapi/linux/elf.h b/include/uapi/linux/elf.h
index 819ded2d39de..ee30dcd80901 100644
--- a/include/uapi/linux/elf.h
+++ b/include/uapi/linux/elf.h
@@ -545,6 +545,8 @@ typedef struct elf64_shdr {
 #define NT_RISCV_VECTOR	0x901		/* RISC-V vector registers */
 #define NN_RISCV_TAGGED_ADDR_CTRL "LINUX"
 #define NT_RISCV_TAGGED_ADDR_CTRL 0x902	/* RISC-V tagged address control (prctl()) */
+#define NN_RISCV_USER_CFI	"LINUX"
+#define NT_RISCV_USER_CFI	0x903		/* RISC-V shadow stack state */
 #define NN_LOONGARCH_CPUCFG	"LINUX"
 #define NT_LOONGARCH_CPUCFG	0xa00	/* LoongArch CPU config registers */
 #define NN_LOONGARCH_CSR	"LINUX"
-- 
cgit v1.2.3


From 0e6b7eae1fded85f94a357d6132f07d64c614cfa Mon Sep 17 00:00:00 2001
From: Andrey Albershteyn <aalbersh@kernel.org>
Date: Mon, 26 Jan 2026 12:56:57 +0100
Subject: fs: add FS_XFLAG_VERITY for fs-verity files

fs-verity introduced an inode flag for inodes that have fs-verity
enabled. This patch adds the FS_XFLAG_VERITY file attribute, which can
be retrieved with the FS_IOC_FSGETXATTR ioctl() and the file_getattr()
syscall.

This flag is read-only and cannot be set with the corresponding set
ioctl() or file_setattr(). FS_IOC_SETFLAGS requires the file to be
opened for writing, which is not allowed for verity files.
FS_IOC_FSSETXATTR and file_setattr() clear this flag from the user
input.

As this is now a common flag for both flag interfaces (flags/xflags),
add it to the overlapping flags list to exclude it from overwrite.

Signed-off-by: Andrey Albershteyn <aalbersh@kernel.org>
Link: https://patch.msgid.link/20260126115658.27656-2-aalbersh@kernel.org
Reviewed-by: Darrick J. Wong <djwong@kernel.org>
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 Documentation/filesystems/fsverity.rst | 16 ++++++++++++++++
 fs/file_attr.c                         |  4 ++++
 include/linux/fileattr.h               |  6 +++---
 include/uapi/linux/fs.h                |  1 +
 4 files changed, 24 insertions(+), 3 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/Documentation/filesystems/fsverity.rst b/Documentation/filesystems/fsverity.rst
index 412cf11e3298..22b49b295d1f 100644
--- a/Documentation/filesystems/fsverity.rst
+++ b/Documentation/filesystems/fsverity.rst
@@ -341,6 +341,22 @@ the file has fs-verity enabled.  This can perform better than
 FS_IOC_GETFLAGS and FS_IOC_MEASURE_VERITY because it doesn't require
 opening the file, and opening verity files can be expensive.
 
+FS_IOC_FSGETXATTR
+-----------------
+
+Since Linux v7.0, the FS_IOC_FSGETXATTR ioctl sets FS_XFLAG_VERITY (0x00020000)
+in the returned flags when the file has verity enabled. Note that this attribute
+cannot be set with FS_IOC_FSSETXATTR as enabling verity requires input
+parameters. See FS_IOC_ENABLE_VERITY.
+
+file_getattr
+------------
+
+Since Linux v7.0, the file_getattr() syscall sets FS_XFLAG_VERITY (0x00020000)
+in the returned flags when the file has verity enabled. Note that this attribute
+cannot be set with file_setattr() as enabling verity requires input parameters.
+See FS_IOC_ENABLE_VERITY.
+
 .. _accessing_verity_files:
 
 Accessing verity files
diff --git a/fs/file_attr.c b/fs/file_attr.c
index f3704881c126..dfde87401817 100644
--- a/fs/file_attr.c
+++ b/fs/file_attr.c
@@ -36,6 +36,8 @@ void fileattr_fill_xflags(struct file_kattr *fa, u32 xflags)
 		fa->flags |= FS_DAX_FL;
 	if (fa->fsx_xflags & FS_XFLAG_PROJINHERIT)
 		fa->flags |= FS_PROJINHERIT_FL;
+	if (fa->fsx_xflags & FS_XFLAG_VERITY)
+		fa->flags |= FS_VERITY_FL;
 }
 EXPORT_SYMBOL(fileattr_fill_xflags);
 
@@ -66,6 +68,8 @@ void fileattr_fill_flags(struct file_kattr *fa, u32 flags)
 		fa->fsx_xflags |= FS_XFLAG_DAX;
 	if (fa->flags & FS_PROJINHERIT_FL)
 		fa->fsx_xflags |= FS_XFLAG_PROJINHERIT;
+	if (fa->flags & FS_VERITY_FL)
+		fa->fsx_xflags |= FS_XFLAG_VERITY;
 }
 EXPORT_SYMBOL(fileattr_fill_flags);
 
diff --git a/include/linux/fileattr.h b/include/linux/fileattr.h
index f89dcfad3f8f..3780904a63a6 100644
--- a/include/linux/fileattr.h
+++ b/include/linux/fileattr.h
@@ -7,16 +7,16 @@
 #define FS_COMMON_FL \
 	(FS_SYNC_FL | FS_IMMUTABLE_FL | FS_APPEND_FL | \
 	 FS_NODUMP_FL |	FS_NOATIME_FL | FS_DAX_FL | \
-	 FS_PROJINHERIT_FL)
+	 FS_PROJINHERIT_FL | FS_VERITY_FL)
 
 #define FS_XFLAG_COMMON \
 	(FS_XFLAG_SYNC | FS_XFLAG_IMMUTABLE | FS_XFLAG_APPEND | \
 	 FS_XFLAG_NODUMP | FS_XFLAG_NOATIME | FS_XFLAG_DAX | \
-	 FS_XFLAG_PROJINHERIT)
+	 FS_XFLAG_PROJINHERIT | FS_XFLAG_VERITY)
 
 /* Read-only inode flags */
 #define FS_XFLAG_RDONLY_MASK \
-	(FS_XFLAG_PREALLOC | FS_XFLAG_HASATTR)
+	(FS_XFLAG_PREALLOC | FS_XFLAG_HASATTR | FS_XFLAG_VERITY)
 
 /* Flags to indicate valid value of fsx_ fields */
 #define FS_XFLAG_VALUES_MASK \
diff --git a/include/uapi/linux/fs.h b/include/uapi/linux/fs.h
index 66ca526cf786..70b2b661f42c 100644
--- a/include/uapi/linux/fs.h
+++ b/include/uapi/linux/fs.h
@@ -253,6 +253,7 @@ struct file_attr {
 #define FS_XFLAG_FILESTREAM	0x00004000	/* use filestream allocator */
 #define FS_XFLAG_DAX		0x00008000	/* use DAX for IO */
 #define FS_XFLAG_COWEXTSIZE	0x00010000	/* CoW extent size allocator hint */
+#define FS_XFLAG_VERITY		0x00020000	/* fs-verity enabled */
 #define FS_XFLAG_HASATTR	0x80000000	/* no DIFLAG for this	*/
 
 /* the read-only stuff doesn't really belong here, but any other place is
-- 
cgit v1.2.3


From 8cf82bb558517503a81f8e3c49914c0836360aa6 Mon Sep 17 00:00:00 2001
From: Koichiro Den <den@valinux.co.jp>
Date: Sat, 24 Jan 2026 23:50:11 +0900
Subject: misc: pci_endpoint_test: Add BAR subrange mapping test case

Add a new PCITEST_BAR_SUBRANGE ioctl to exercise EPC BAR subrange
mapping end-to-end.

The test programs a simple 2-subrange layout on the endpoint (via
pci-epf-test) and verifies that:
  - the endpoint-provided per-subrange signature bytes are observed at
    the expected PCIe BAR offsets, and
  - writes to each subrange are routed to the correct backing region
    (i.e. the submap order is applied rather than accidentally working
    due to an identity mapping).

Return -EOPNOTSUPP when the endpoint does not advertise subrange
mapping, -ENODATA when the BAR is disabled, and -EBUSY when the BAR is
reserved for the test register space.

Signed-off-by: Koichiro Den <den@valinux.co.jp>
Signed-off-by: Manivannan Sadhasivam <mani@kernel.org>
Signed-off-by: Bjorn Helgaas <bhelgaas@google.com>
Link: https://patch.msgid.link/20260124145012.2794108-8-den@valinux.co.jp
---
 drivers/misc/pci_endpoint_test.c | 203 ++++++++++++++++++++++++++++++++++++++-
 include/uapi/linux/pcitest.h     |   1 +
 2 files changed, 203 insertions(+), 1 deletion(-)

(limited to 'include/uapi/linux')

diff --git a/drivers/misc/pci_endpoint_test.c b/drivers/misc/pci_endpoint_test.c
index 1c0fd185114f..74ab5b5b9011 100644
--- a/drivers/misc/pci_endpoint_test.c
+++ b/drivers/misc/pci_endpoint_test.c
@@ -39,6 +39,8 @@
 #define COMMAND_COPY				BIT(5)
 #define COMMAND_ENABLE_DOORBELL			BIT(6)
 #define COMMAND_DISABLE_DOORBELL		BIT(7)
+#define COMMAND_BAR_SUBRANGE_SETUP		BIT(8)
+#define COMMAND_BAR_SUBRANGE_CLEAR		BIT(9)
 
 #define PCI_ENDPOINT_TEST_STATUS		0x8
 #define STATUS_READ_SUCCESS			BIT(0)
@@ -55,6 +57,10 @@
 #define STATUS_DOORBELL_ENABLE_FAIL		BIT(11)
 #define STATUS_DOORBELL_DISABLE_SUCCESS		BIT(12)
 #define STATUS_DOORBELL_DISABLE_FAIL		BIT(13)
+#define STATUS_BAR_SUBRANGE_SETUP_SUCCESS	BIT(14)
+#define STATUS_BAR_SUBRANGE_SETUP_FAIL		BIT(15)
+#define STATUS_BAR_SUBRANGE_CLEAR_SUCCESS	BIT(16)
+#define STATUS_BAR_SUBRANGE_CLEAR_FAIL		BIT(17)
 
 #define PCI_ENDPOINT_TEST_LOWER_SRC_ADDR	0x0c
 #define PCI_ENDPOINT_TEST_UPPER_SRC_ADDR	0x10
@@ -77,6 +83,7 @@
 #define CAP_MSI					BIT(1)
 #define CAP_MSIX				BIT(2)
 #define CAP_INTX				BIT(3)
+#define CAP_SUBRANGE_MAPPING			BIT(4)
 
 #define PCI_ENDPOINT_TEST_DB_BAR		0x34
 #define PCI_ENDPOINT_TEST_DB_OFFSET		0x38
@@ -100,6 +107,8 @@
 
 #define PCI_DEVICE_ID_ROCKCHIP_RK3588		0x3588
 
+#define PCI_ENDPOINT_TEST_BAR_SUBRANGE_NSUB	2
+
 static DEFINE_IDA(pci_endpoint_test_ida);
 
 #define to_endpoint_test(priv) container_of((priv), struct pci_endpoint_test, \
@@ -414,6 +423,193 @@ static int pci_endpoint_test_bars(struct pci_endpoint_test *test)
 	return 0;
 }
 
+static u8 pci_endpoint_test_subrange_sig_byte(enum pci_barno barno,
+					      unsigned int subno)
+{
+	return 0x50 + (barno * 8) + subno;
+}
+
+static u8 pci_endpoint_test_subrange_test_byte(enum pci_barno barno,
+					       unsigned int subno)
+{
+	return 0xa0 + (barno * 8) + subno;
+}
+
+static int pci_endpoint_test_bar_subrange_cmd(struct pci_endpoint_test *test,
+					      enum pci_barno barno, u32 command,
+					      u32 ok_bit, u32 fail_bit)
+{
+	struct pci_dev *pdev = test->pdev;
+	struct device *dev = &pdev->dev;
+	int irq_type = test->irq_type;
+	u32 status;
+
+	if (irq_type < PCITEST_IRQ_TYPE_INTX ||
+	    irq_type > PCITEST_IRQ_TYPE_MSIX) {
+		dev_err(dev, "Invalid IRQ type\n");
+		return -EINVAL;
+	}
+
+	reinit_completion(&test->irq_raised);
+
+	pci_endpoint_test_writel(test, PCI_ENDPOINT_TEST_STATUS, 0);
+	pci_endpoint_test_writel(test, PCI_ENDPOINT_TEST_IRQ_TYPE, irq_type);
+	pci_endpoint_test_writel(test, PCI_ENDPOINT_TEST_IRQ_NUMBER, 1);
+	/* Reuse SIZE as a command parameter: bar number. */
+	pci_endpoint_test_writel(test, PCI_ENDPOINT_TEST_SIZE, barno);
+	pci_endpoint_test_writel(test, PCI_ENDPOINT_TEST_COMMAND, command);
+
+	if (!wait_for_completion_timeout(&test->irq_raised,
+					 msecs_to_jiffies(1000)))
+		return -ETIMEDOUT;
+
+	status = pci_endpoint_test_readl(test, PCI_ENDPOINT_TEST_STATUS);
+	if (status & fail_bit)
+		return -EIO;
+
+	if (!(status & ok_bit))
+		return -EIO;
+
+	return 0;
+}
+
+static int pci_endpoint_test_bar_subrange_setup(struct pci_endpoint_test *test,
+						enum pci_barno barno)
+{
+	return pci_endpoint_test_bar_subrange_cmd(test, barno,
+			COMMAND_BAR_SUBRANGE_SETUP,
+			STATUS_BAR_SUBRANGE_SETUP_SUCCESS,
+			STATUS_BAR_SUBRANGE_SETUP_FAIL);
+}
+
+static int pci_endpoint_test_bar_subrange_clear(struct pci_endpoint_test *test,
+						enum pci_barno barno)
+{
+	return pci_endpoint_test_bar_subrange_cmd(test, barno,
+			COMMAND_BAR_SUBRANGE_CLEAR,
+			STATUS_BAR_SUBRANGE_CLEAR_SUCCESS,
+			STATUS_BAR_SUBRANGE_CLEAR_FAIL);
+}
+
+static int pci_endpoint_test_bar_subrange(struct pci_endpoint_test *test,
+					  enum pci_barno barno)
+{
+	u32 nsub = PCI_ENDPOINT_TEST_BAR_SUBRANGE_NSUB;
+	struct device *dev = &test->pdev->dev;
+	size_t sub_size, buf_size;
+	resource_size_t bar_size;
+	void __iomem *bar_addr;
+	void *read_buf = NULL;
+	int ret, clear_ret;
+	size_t off, chunk;
+	u32 i, exp, val;
+	u8 pattern;
+
+	if (!(test->ep_caps & CAP_SUBRANGE_MAPPING))
+		return -EOPNOTSUPP;
+
+	/*
+	 * The test register BAR is not safe to reprogram and write/read
+	 * over its full size. BAR_TEST already special-cases it to a tiny
+	 * range. For subrange mapping tests, let's simply skip it.
+	 */
+	if (barno == test->test_reg_bar)
+		return -EBUSY;
+
+	bar_size = pci_resource_len(test->pdev, barno);
+	if (!bar_size)
+		return -ENODATA;
+
+	bar_addr = test->bar[barno];
+	if (!bar_addr)
+		return -ENOMEM;
+
+	ret = pci_endpoint_test_bar_subrange_setup(test, barno);
+	if (ret)
+		return ret;
+
+	if (bar_size % nsub || bar_size / nsub > SIZE_MAX) {
+		ret = -EINVAL;
+		goto out_clear;
+	}
+
+	sub_size = bar_size / nsub;
+	if (sub_size < sizeof(u32)) {
+		ret = -ENOSPC;
+		goto out_clear;
+	}
+
+	/* Limit the temporary buffer size */
+	buf_size = min_t(size_t, sub_size, SZ_1M);
+
+	read_buf = kmalloc(buf_size, GFP_KERNEL);
+	if (!read_buf) {
+		ret = -ENOMEM;
+		goto out_clear;
+	}
+
+	/*
+	 * Step 1: verify EP-provided signature per subrange. This detects
+	 * whether the EP actually applied the submap order.
+	 */
+	for (i = 0; i < nsub; i++) {
+		exp = (u32)pci_endpoint_test_subrange_sig_byte(barno, i) *
+			0x01010101U;
+		val = ioread32(bar_addr + (i * sub_size));
+		if (val != exp) {
+			dev_err(dev,
+				"BAR%d subrange%u signature mismatch @%#zx: exp %#08x got %#08x\n",
+				barno, i, (size_t)i * sub_size, exp, val);
+			ret = -EIO;
+			goto out_clear;
+		}
+		val = ioread32(bar_addr + (i * sub_size) + sub_size - sizeof(u32));
+		if (val != exp) {
+			dev_err(dev,
+				"BAR%d subrange%u signature mismatch @%#zx: exp %#08x got %#08x\n",
+				barno, i,
+				((size_t)i * sub_size) + sub_size - sizeof(u32),
+				exp, val);
+			ret = -EIO;
+			goto out_clear;
+		}
+	}
+
+	/* Step 2: write unique pattern per subrange (write all first). */
+	for (i = 0; i < nsub; i++) {
+		pattern = pci_endpoint_test_subrange_test_byte(barno, i);
+		memset_io(bar_addr + (i * sub_size), pattern, sub_size);
+	}
+
+	/* Step 3: read back and verify (read all after all writes). */
+	for (i = 0; i < nsub; i++) {
+		pattern = pci_endpoint_test_subrange_test_byte(barno, i);
+		for (off = 0; off < sub_size; off += chunk) {
+			void *bad;
+
+			chunk = min_t(size_t, buf_size, sub_size - off);
+			memcpy_fromio(read_buf, bar_addr + (i * sub_size) + off,
+				      chunk);
+			bad = memchr_inv(read_buf, pattern, chunk);
+			if (bad) {
+				size_t bad_off = (u8 *)bad - (u8 *)read_buf;
+
+				dev_err(dev,
+					"BAR%d subrange%u data mismatch @%#zx (pattern %#02x)\n",
+					barno, i, (size_t)i * sub_size + off + bad_off,
+					pattern);
+				ret = -EIO;
+				goto out_clear;
+			}
+		}
+	}
+
+out_clear:
+	kfree(read_buf);
+	clear_ret = pci_endpoint_test_bar_subrange_clear(test, barno);
+	return ret ?: clear_ret;
+}
+
 static int pci_endpoint_test_intx_irq(struct pci_endpoint_test *test)
 {
 	u32 val;
@@ -936,12 +1132,17 @@ static long pci_endpoint_test_ioctl(struct file *file, unsigned int cmd,
 
 	switch (cmd) {
 	case PCITEST_BAR:
+	case PCITEST_BAR_SUBRANGE:
 		bar = arg;
 		if (bar <= NO_BAR || bar > BAR_5)
 			goto ret;
 		if (is_am654_pci_dev(pdev) && bar == BAR_0)
 			goto ret;
-		ret = pci_endpoint_test_bar(test, bar);
+
+		if (cmd == PCITEST_BAR)
+			ret = pci_endpoint_test_bar(test, bar);
+		else
+			ret = pci_endpoint_test_bar_subrange(test, bar);
 		break;
 	case PCITEST_BARS:
 		ret = pci_endpoint_test_bars(test);
diff --git a/include/uapi/linux/pcitest.h b/include/uapi/linux/pcitest.h
index d6023a45a9d0..710f8842223f 100644
--- a/include/uapi/linux/pcitest.h
+++ b/include/uapi/linux/pcitest.h
@@ -22,6 +22,7 @@
 #define PCITEST_GET_IRQTYPE	_IO('P', 0x9)
 #define PCITEST_BARS		_IO('P', 0xa)
 #define PCITEST_DOORBELL	_IO('P', 0xb)
+#define PCITEST_BAR_SUBRANGE	_IO('P', 0xc)
 #define PCITEST_CLEAR_IRQ	_IO('P', 0x10)
 
 #define PCITEST_IRQ_TYPE_UNDEFINED	-1
-- 
cgit v1.2.3


From bc443c253fcdd2636e2a29fde3f749d39d479cf0 Mon Sep 17 00:00:00 2001
From: Ivan Vecera <ivecera@redhat.com>
Date: Mon, 26 Jan 2026 17:22:51 +0100
Subject: dpll: expose fractional frequency offset in ppt

Currently, the dpll subsystem exports the fractional frequency offset
(FFO) in parts per million (ppm). This granularity is insufficient for
high-precision synchronization scenarios which often require parts per
trillion (ppt) resolution.

Add a new netlink attribute DPLL_A_PIN_FRACTIONAL_FREQUENCY_OFFSET_PPT
to expose the FFO in ppt.

Update the dpll netlink core to expect the driver-provided FFO value
to be in ppt. To maintain backward compatibility with existing userspace
tools, populate the legacy DPLL_A_PIN_FRACTIONAL_FREQUENCY_OFFSET
attribute by dividing the new ppt value by 1,000,000.

Update the zl3073x and mlx5 drivers to provide the FFO value in ppt:
- zl3073x: adjust the fixed-point calculation to produce ppt (10^12)
  instead of ppm (10^6).
- mlx5: scale the existing ppm value by 1,000,000.

Signed-off-by: Ivan Vecera <ivecera@redhat.com>
Reviewed-by: Arkadiusz Kubalewski <arkadiusz.kubalewski@intel.com>
Reviewed-by: Jiri Pirko <jiri@nvidia.com>
Link: https://patch.msgid.link/20260126162253.27890-1-ivecera@redhat.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 Documentation/netlink/specs/dpll.yaml          | 11 +++++++++++
 drivers/dpll/dpll_netlink.c                    | 10 +++++++++-
 drivers/dpll/zl3073x/core.c                    |  7 +++++--
 drivers/net/ethernet/mellanox/mlx5/core/dpll.c |  2 +-
 include/uapi/linux/dpll.h                      |  1 +
 5 files changed, 27 insertions(+), 4 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/Documentation/netlink/specs/dpll.yaml b/Documentation/netlink/specs/dpll.yaml
index b55afa77eac4..3dd48a32f783 100644
--- a/Documentation/netlink/specs/dpll.yaml
+++ b/Documentation/netlink/specs/dpll.yaml
@@ -446,6 +446,16 @@ attribute-sets:
         doc: |
           Granularity of phase adjustment, in picoseconds. The value of
           phase adjustment must be a multiple of this granularity.
+      -
+        name: fractional-frequency-offset-ppt
+        type: sint
+        doc: |
+          The FFO (Fractional Frequency Offset) of the pin with respect to
+          the nominal frequency.
+          Value = (frequency_measured - frequency_nominal) / frequency_nominal
+          Value is in PPT (parts per trillion, 10^-12).
+          Note: This attribute provides higher resolution than the standard
+          fractional-frequency-offset (which is in PPM).
 
   -
     name: pin-parent-device
@@ -628,6 +638,7 @@ operations:
             - phase-adjust-max
             - phase-adjust
             - fractional-frequency-offset
+            - fractional-frequency-offset-ppt
             - esync-frequency
             - esync-frequency-supported
             - esync-pulse
diff --git a/drivers/dpll/dpll_netlink.c b/drivers/dpll/dpll_netlink.c
index 499bca460b1e..904199ddd178 100644
--- a/drivers/dpll/dpll_netlink.c
+++ b/drivers/dpll/dpll_netlink.c
@@ -389,7 +389,15 @@ static int dpll_msg_add_ffo(struct sk_buff *msg, struct dpll_pin *pin,
 			return 0;
 		return ret;
 	}
-	return nla_put_sint(msg, DPLL_A_PIN_FRACTIONAL_FREQUENCY_OFFSET, ffo);
+	/* Put the FFO value in PPM to preserve compatibility with older
+	 * programs.
+	 */
+	ret = nla_put_sint(msg, DPLL_A_PIN_FRACTIONAL_FREQUENCY_OFFSET,
+			   div_s64(ffo, 1000000));
+	if (ret)
+		return -EMSGSIZE;
+	return nla_put_sint(msg, DPLL_A_PIN_FRACTIONAL_FREQUENCY_OFFSET_PPT,
+			    ffo);
 }
 
 static int
diff --git a/drivers/dpll/zl3073x/core.c b/drivers/dpll/zl3073x/core.c
index 383e2397dd03..63bd97181b9e 100644
--- a/drivers/dpll/zl3073x/core.c
+++ b/drivers/dpll/zl3073x/core.c
@@ -710,8 +710,11 @@ zl3073x_ref_ffo_update(struct zl3073x_dev *zldev)
 		if (rc)
 			return rc;
 
-		/* Convert to ppm -> ffo = (10^6 * value) / 2^32 */
-		zldev->ref[i].ffo = mul_s64_u64_shr(value, 1000000, 32);
+		/* Convert to ppt
+		 * ffo = (10^12 * value) / 2^32
+		 * ffo = ( 5^12 * value) / 2^20
+		 */
+		zldev->ref[i].ffo = mul_s64_u64_shr(value, 244140625, 20);
 	}
 
 	return 0;
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/dpll.c b/drivers/net/ethernet/mellanox/mlx5/core/dpll.c
index 1e5522a19483..3ea8a1766ae2 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/dpll.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/dpll.c
@@ -136,7 +136,7 @@ mlx5_dpll_pin_ffo_get(struct mlx5_dpll_synce_status *synce_status,
 {
 	if (!synce_status->oper_freq_measure)
 		return -ENODATA;
-	*ffo = synce_status->frequency_diff;
+	*ffo = 1000000LL * synce_status->frequency_diff;
 	return 0;
 }
 
diff --git a/include/uapi/linux/dpll.h b/include/uapi/linux/dpll.h
index b7ff9c44f9aa..de0005f28e5c 100644
--- a/include/uapi/linux/dpll.h
+++ b/include/uapi/linux/dpll.h
@@ -253,6 +253,7 @@ enum dpll_a_pin {
 	DPLL_A_PIN_ESYNC_PULSE,
 	DPLL_A_PIN_REFERENCE_SYNC,
 	DPLL_A_PIN_PHASE_ADJUST_GRAN,
+	DPLL_A_PIN_FRACTIONAL_FREQUENCY_OFFSET_PPT,
 
 	__DPLL_A_PIN_MAX,
 	DPLL_A_PIN_MAX = (__DPLL_A_PIN_MAX - 1)
-- 
cgit v1.2.3


From 8443e2087e7002fa25984faad6bbf5f63b280645 Mon Sep 17 00:00:00 2001
From: Ming Lei <ming.lei@redhat.com>
Date: Fri, 30 Jan 2026 00:19:51 +0800
Subject: ublk: add UBLK_F_NO_AUTO_PART_SCAN feature flag

Add a new feature flag UBLK_F_NO_AUTO_PART_SCAN to allow users to suppress
automatic partition scanning when starting a ublk device.

This is useful for some cases in which users don't want to scan
partitions.

Users can still manually trigger partition scanning later when appropriate
using standard tools (e.g., partprobe, blockdev --rereadpt).

Reported-by: Yoav Cohen <yoav@nvidia.com>
Link: https://lore.kernel.org/linux-block/DM4PR12MB63280C5637917C071C2F0D65A9A8A@DM4PR12MB6328.namprd12.prod.outlook.com/
Signed-off-by: Ming Lei <ming.lei@redhat.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 drivers/block/ublk_drv.c      | 14 ++++++++++----
 include/uapi/linux/ublk_cmd.h |  3 +++
 2 files changed, 13 insertions(+), 4 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c
index 92bd2351e3ad..4fe754e7d1e8 100644
--- a/drivers/block/ublk_drv.c
+++ b/drivers/block/ublk_drv.c
@@ -80,7 +80,8 @@
 		| UBLK_F_BUF_REG_OFF_DAEMON \
 		| (IS_ENABLED(CONFIG_BLK_DEV_INTEGRITY) ? UBLK_F_INTEGRITY : 0) \
 		| UBLK_F_SAFE_STOP_DEV \
-		| UBLK_F_BATCH_IO)
+		| UBLK_F_BATCH_IO \
+		| UBLK_F_NO_AUTO_PART_SCAN)
 
 #define UBLK_F_ALL_RECOVERY_FLAGS (UBLK_F_USER_RECOVERY \
 		| UBLK_F_USER_RECOVERY_REISSUE \
@@ -4430,9 +4431,14 @@ static int ublk_ctrl_start_dev(struct ublk_device *ub,
 
 	set_bit(UB_STATE_USED, &ub->state);
 
-	/* Schedule async partition scan for trusted daemons */
-	if (!ub->unprivileged_daemons)
-		schedule_work(&ub->partition_scan_work);
+	/* Skip partition scan if disabled by user */
+	if (ub->dev_info.flags & UBLK_F_NO_AUTO_PART_SCAN) {
+		clear_bit(GD_SUPPRESS_PART_SCAN, &disk->state);
+	} else {
+		/* Schedule async partition scan for trusted daemons */
+		if (!ub->unprivileged_daemons)
+			schedule_work(&ub->partition_scan_work);
+	}
 
 out_put_cdev:
 	if (ret) {
diff --git a/include/uapi/linux/ublk_cmd.h b/include/uapi/linux/ublk_cmd.h
index 743d31491387..a88876756805 100644
--- a/include/uapi/linux/ublk_cmd.h
+++ b/include/uapi/linux/ublk_cmd.h
@@ -367,6 +367,9 @@
  */
 #define UBLK_F_SAFE_STOP_DEV	(1ULL << 17)
 
+/* Disable automatic partition scanning when device is started */
+#define UBLK_F_NO_AUTO_PART_SCAN (1ULL << 18)
+
 /* device state */
 #define UBLK_S_DEV_DEAD	0
 #define UBLK_S_DEV_LIVE	1
-- 
cgit v1.2.3


From 503efe850c7463a1e59df133b84461ef53c0361f Mon Sep 17 00:00:00 2001
From: Wang Yaxin <wang.yaxin@zte.com.cn>
Date: Mon, 19 Jan 2026 10:02:41 +0800
Subject: delayacct: add timestamp of delay max

Problem
=======
Commit 658eb5ab916d ("delayacct: add delay max to record delay peak")
introduced the delay max for getdelays, which records abnormal latency
peaks and helps us understand the magnitude of such delays.  However, the
peak latency value alone is insufficient for effective root cause
analysis.  Without the precise timestamp of when the peak occurred, we
still lack the critical context needed to correlate it with other system
events.

Solution
========
To address this, we need to additionally record a precise timestamp when
the maximum latency occurs.  By correlating this timestamp with system
logs and monitoring metrics, we can identify processes with abnormal
resource usage at the same moment, which can help us to pinpoint root
causes.

Use Case
========
bash-4.4# ./getdelays -d -t 227
print delayacct stats ON
TGID    227
CPU         count     real total  virtual total    delay total  delay average      delay max      delay min      delay max timestamp
               46      188000000      192348334        4098012          0.089ms     0.429260ms     0.051205ms    2026-01-15T15:06:58
IO          count    delay total  delay average      delay max      delay min      delay max timestamp
                0              0          0.000ms     0.000000ms     0.000000ms                    N/A
SWAP        count    delay total  delay average      delay max      delay min      delay max timestamp
                0              0          0.000ms     0.000000ms     0.000000ms                    N/A
RECLAIM     count    delay total  delay average      delay max      delay min      delay max timestamp
                0              0          0.000ms     0.000000ms     0.000000ms                    N/A
THRASHING   count    delay total  delay average      delay max      delay min      delay max timestamp
                0              0          0.000ms     0.000000ms     0.000000ms                    N/A
COMPACT     count    delay total  delay average      delay max      delay min      delay max timestamp
                0              0          0.000ms     0.000000ms     0.000000ms                    N/A
WPCOPY      count    delay total  delay average      delay max      delay min      delay max timestamp
              182       19413338          0.107ms     0.547353ms     0.022462ms    2026-01-15T15:05:24
IRQ         count    delay total  delay average      delay max      delay min      delay max timestamp
                0              0          0.000ms     0.000000ms     0.000000ms                    N/A

Link: https://lkml.kernel.org/r/20260119100241520gWubW8-5QfhSf9gjqcc_E@zte.com.cn
Signed-off-by: Wang Yaxin <wang.yaxin@zte.com.cn>
Cc: Fan Yu <fan.yu9@zte.com.cn>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: xu xin <xu.xin16@zte.com.cn>
Cc: Yang Yang <yang.yang29@zte.com.cn>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 Documentation/accounting/delay-accounting.rst |  32 ++---
 include/linux/delayacct.h                     |   8 ++
 include/linux/sched.h                         |   5 +
 include/uapi/linux/taskstats.h                |  22 +++-
 kernel/delayacct.c                            |  31 +++--
 kernel/sched/stats.h                          |   8 +-
 tools/accounting/getdelays.c                  | 172 ++++++++++++++++++++++----
 7 files changed, 223 insertions(+), 55 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/Documentation/accounting/delay-accounting.rst b/Documentation/accounting/delay-accounting.rst
index 86d7902a657f..e209c46241b0 100644
--- a/Documentation/accounting/delay-accounting.rst
+++ b/Documentation/accounting/delay-accounting.rst
@@ -107,22 +107,22 @@ Get sum and peak of delays, since system boot, for all pids with tgid 242::
 	TGID    242
 
 
-	CPU         count     real total  virtual total    delay total  delay average      delay max      delay min
-	               39      156000000      156576579        2111069          0.054ms     0.212296ms     0.031307ms
-	IO          count    delay total  delay average      delay max      delay min
-	                0              0          0.000ms     0.000000ms     0.000000ms
-	SWAP        count    delay total  delay average      delay max      delay min
-	                0              0          0.000ms     0.000000ms     0.000000ms
-	RECLAIM     count    delay total  delay average      delay max      delay min
-	                0              0          0.000ms     0.000000ms     0.000000ms
-	THRASHING   count    delay total  delay average      delay max      delay min
-	                0              0          0.000ms     0.000000ms     0.000000ms
-	COMPACT     count    delay total  delay average      delay max      delay min
-	                0              0          0.000ms     0.000000ms     0.000000ms
-	WPCOPY      count    delay total  delay average      delay max      delay min
-	              156       11215873          0.072ms     0.207403ms     0.033913ms
-	IRQ         count    delay total  delay average      delay max      delay min
-	                0              0          0.000ms     0.000000ms     0.000000ms
+	CPU         count     real total  virtual total    delay total  delay average      delay max      delay min      delay max timestamp
+	               46      188000000      192348334        4098012          0.089ms     0.429260ms     0.051205ms    2026-01-15T15:06:58
+	IO          count    delay total  delay average      delay max      delay min      delay max timestamp
+	                0              0          0.000ms     0.000000ms     0.000000ms                    N/A
+	SWAP        count    delay total  delay average      delay max      delay min      delay max timestamp
+	                0              0          0.000ms     0.000000ms     0.000000ms                    N/A
+	RECLAIM     count    delay total  delay average      delay max      delay min      delay max timestamp
+	                0              0          0.000ms     0.000000ms     0.000000ms                    N/A
+	THRASHING   count    delay total  delay average      delay max      delay min      delay max timestamp
+	                0              0          0.000ms     0.000000ms     0.000000ms                    N/A
+	COMPACT     count    delay total  delay average      delay max      delay min      delay max timestamp
+	                0              0          0.000ms     0.000000ms     0.000000ms                    N/A
+	WPCOPY      count    delay total  delay average      delay max      delay min      delay max timestamp
+	              182       19413338          0.107ms     0.547353ms     0.022462ms    2026-01-15T15:05:24
+	IRQ         count    delay total  delay average      delay max      delay min      delay max timestamp
+	                0              0          0.000ms     0.000000ms     0.000000ms                    N/A
 
 Get IO accounting for pid 1, it works only with -p::
 
diff --git a/include/linux/delayacct.h b/include/linux/delayacct.h
index 800dcc360db2..ecb06f16d22c 100644
--- a/include/linux/delayacct.h
+++ b/include/linux/delayacct.h
@@ -69,6 +69,14 @@ struct task_delay_info {
 	u32 compact_count;	/* total count of memory compact */
 	u32 wpcopy_count;	/* total count of write-protect copy */
 	u32 irq_count;	/* total count of IRQ/SOFTIRQ */
+
+	struct timespec64 blkio_delay_max_ts;
+	struct timespec64 swapin_delay_max_ts;
+	struct timespec64 freepages_delay_max_ts;
+	struct timespec64 thrashing_delay_max_ts;
+	struct timespec64 compact_delay_max_ts;
+	struct timespec64 wpcopy_delay_max_ts;
+	struct timespec64 irq_delay_max_ts;
 };
 #endif
 
diff --git a/include/linux/sched.h b/include/linux/sched.h
index da0133524d08..1d22b6229b95 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -49,6 +49,7 @@
 #include <linux/tracepoint-defs.h>
 #include <linux/unwind_deferred_types.h>
 #include <asm/kmap_size.h>
+#include <linux/time64.h>
 #ifndef COMPILE_OFFSETS
 #include <generated/rq-offsets.h>
 #endif
@@ -86,6 +87,7 @@ struct signal_struct;
 struct task_delay_info;
 struct task_group;
 struct task_struct;
+struct timespec64;
 struct user_event_mm;
 
 #include <linux/sched/ext.h>
@@ -435,6 +437,9 @@ struct sched_info {
 	/* When were we last queued to run? */
 	unsigned long long		last_queued;
 
+	/* Timestamp of max time spent waiting on a runqueue: */
+	struct timespec64		max_run_delay_ts;
+
 #endif /* CONFIG_SCHED_INFO */
 };
 
diff --git a/include/uapi/linux/taskstats.h b/include/uapi/linux/taskstats.h
index 5929030d4e8b..1b31e8e14d2f 100644
--- a/include/uapi/linux/taskstats.h
+++ b/include/uapi/linux/taskstats.h
@@ -18,6 +18,16 @@
 #define _LINUX_TASKSTATS_H
 
 #include <linux/types.h>
+#ifdef __KERNEL__
+#include <linux/time64.h>
+#else
+#ifndef _LINUX_TIME64_H
+struct timespec64 {
+	__s64   tv_sec;         /* seconds */
+	long    tv_nsec;        /* nanoseconds */
+};
+#endif
+#endif
 
 /* Format for per-task data returned to userland when
  *	- a task exits
@@ -34,7 +44,7 @@
  */
 
 
-#define TASKSTATS_VERSION	16
+#define TASKSTATS_VERSION	17
 #define TS_COMM_LEN		32	/* should be >= TASK_COMM_LEN
 					 * in linux/sched.h */
 
@@ -230,6 +240,16 @@ struct taskstats {
 
 	__u64	irq_delay_max;
 	__u64	irq_delay_min;
+
+	/*v17: delay max timestamp record*/
+	struct timespec64 cpu_delay_max_ts;
+	struct timespec64 blkio_delay_max_ts;
+	struct timespec64 swapin_delay_max_ts;
+	struct timespec64 freepages_delay_max_ts;
+	struct timespec64 thrashing_delay_max_ts;
+	struct timespec64 compact_delay_max_ts;
+	struct timespec64 wpcopy_delay_max_ts;
+	struct timespec64 irq_delay_max_ts;
 };
 
 
diff --git a/kernel/delayacct.c b/kernel/delayacct.c
index 30e7912ebb0d..d58ffc63bcba 100644
--- a/kernel/delayacct.c
+++ b/kernel/delayacct.c
@@ -18,6 +18,7 @@
 do { \
 	d->type##_delay_max = tsk->delays->type##_delay_max; \
 	d->type##_delay_min = tsk->delays->type##_delay_min; \
+	d->type##_delay_max_ts = tsk->delays->type##_delay_max_ts; \
 	tmp = d->type##_delay_total + tsk->delays->type##_delay; \
 	d->type##_delay_total = (tmp < d->type##_delay_total) ? 0 : tmp; \
 	d->type##_count += tsk->delays->type##_count; \
@@ -104,7 +105,8 @@ void __delayacct_tsk_init(struct task_struct *tsk)
  * Finish delay accounting for a statistic using its timestamps (@start),
  * accumulator (@total) and @count
  */
-static void delayacct_end(raw_spinlock_t *lock, u64 *start, u64 *total, u32 *count, u64 *max, u64 *min)
+static void delayacct_end(raw_spinlock_t *lock, u64 *start, u64 *total, u32 *count,
+							 u64 *max, u64 *min, struct timespec64 *ts)
 {
 	s64 ns = local_clock() - *start;
 	unsigned long flags;
@@ -113,8 +115,10 @@ static void delayacct_end(raw_spinlock_t *lock, u64 *start, u64 *total, u32 *cou
 		raw_spin_lock_irqsave(lock, flags);
 		*total += ns;
 		(*count)++;
-		if (ns > *max)
+		if (ns > *max) {
 			*max = ns;
+			ktime_get_real_ts64(ts);
+		}
 		if (*min == 0 || ns < *min)
 			*min = ns;
 		raw_spin_unlock_irqrestore(lock, flags);
@@ -137,7 +141,8 @@ void __delayacct_blkio_end(struct task_struct *p)
 		      &p->delays->blkio_delay,
 		      &p->delays->blkio_count,
 		      &p->delays->blkio_delay_max,
-		      &p->delays->blkio_delay_min);
+		      &p->delays->blkio_delay_min,
+		      &p->delays->blkio_delay_max_ts);
 }
 
 int delayacct_add_tsk(struct taskstats *d, struct task_struct *tsk)
@@ -170,6 +175,7 @@ int delayacct_add_tsk(struct taskstats *d, struct task_struct *tsk)
 
 	d->cpu_delay_max = tsk->sched_info.max_run_delay;
 	d->cpu_delay_min = tsk->sched_info.min_run_delay;
+	d->cpu_delay_max_ts = tsk->sched_info.max_run_delay_ts;
 	tmp = (s64)d->cpu_delay_total + t2;
 	d->cpu_delay_total = (tmp < (s64)d->cpu_delay_total) ? 0 : tmp;
 	tmp = (s64)d->cpu_run_virtual_total + t3;
@@ -217,7 +223,8 @@ void __delayacct_freepages_end(void)
 		      &current->delays->freepages_delay,
 		      &current->delays->freepages_count,
 		      &current->delays->freepages_delay_max,
-		      &current->delays->freepages_delay_min);
+		      &current->delays->freepages_delay_min,
+		      &current->delays->freepages_delay_max_ts);
 }
 
 void __delayacct_thrashing_start(bool *in_thrashing)
@@ -241,7 +248,8 @@ void __delayacct_thrashing_end(bool *in_thrashing)
 		      &current->delays->thrashing_delay,
 		      &current->delays->thrashing_count,
 		      &current->delays->thrashing_delay_max,
-		      &current->delays->thrashing_delay_min);
+		      &current->delays->thrashing_delay_min,
+		      &current->delays->thrashing_delay_max_ts);
 }
 
 void __delayacct_swapin_start(void)
@@ -256,7 +264,8 @@ void __delayacct_swapin_end(void)
 		      &current->delays->swapin_delay,
 		      &current->delays->swapin_count,
 		      &current->delays->swapin_delay_max,
-		      &current->delays->swapin_delay_min);
+		      &current->delays->swapin_delay_min,
+		      &current->delays->swapin_delay_max_ts);
 }
 
 void __delayacct_compact_start(void)
@@ -271,7 +280,8 @@ void __delayacct_compact_end(void)
 		      &current->delays->compact_delay,
 		      &current->delays->compact_count,
 		      &current->delays->compact_delay_max,
-		      &current->delays->compact_delay_min);
+		      &current->delays->compact_delay_min,
+		      &current->delays->compact_delay_max_ts);
 }
 
 void __delayacct_wpcopy_start(void)
@@ -286,7 +296,8 @@ void __delayacct_wpcopy_end(void)
 		      &current->delays->wpcopy_delay,
 		      &current->delays->wpcopy_count,
 		      &current->delays->wpcopy_delay_max,
-		      &current->delays->wpcopy_delay_min);
+		      &current->delays->wpcopy_delay_min,
+		      &current->delays->wpcopy_delay_max_ts);
 }
 
 void __delayacct_irq(struct task_struct *task, u32 delta)
@@ -296,8 +307,10 @@ void __delayacct_irq(struct task_struct *task, u32 delta)
 	raw_spin_lock_irqsave(&task->delays->lock, flags);
 	task->delays->irq_delay += delta;
 	task->delays->irq_count++;
-	if (delta > task->delays->irq_delay_max)
+	if (delta > task->delays->irq_delay_max) {
 		task->delays->irq_delay_max = delta;
+		ktime_get_real_ts64(&task->delays->irq_delay_max_ts);
+	}
 	if (delta && (!task->delays->irq_delay_min || delta < task->delays->irq_delay_min))
 		task->delays->irq_delay_min = delta;
 	raw_spin_unlock_irqrestore(&task->delays->lock, flags);
diff --git a/kernel/sched/stats.h b/kernel/sched/stats.h
index c903f1a42891..a612cf253c87 100644
--- a/kernel/sched/stats.h
+++ b/kernel/sched/stats.h
@@ -253,8 +253,10 @@ static inline void sched_info_dequeue(struct rq *rq, struct task_struct *t)
 	delta = rq_clock(rq) - t->sched_info.last_queued;
 	t->sched_info.last_queued = 0;
 	t->sched_info.run_delay += delta;
-	if (delta > t->sched_info.max_run_delay)
+	if (delta > t->sched_info.max_run_delay) {
 		t->sched_info.max_run_delay = delta;
+		ktime_get_real_ts64(&t->sched_info.max_run_delay_ts);
+	}
 	if (delta && (!t->sched_info.min_run_delay || delta < t->sched_info.min_run_delay))
 		t->sched_info.min_run_delay = delta;
 	rq_sched_info_dequeue(rq, delta);
@@ -278,8 +280,10 @@ static void sched_info_arrive(struct rq *rq, struct task_struct *t)
 	t->sched_info.run_delay += delta;
 	t->sched_info.last_arrival = now;
 	t->sched_info.pcount++;
-	if (delta > t->sched_info.max_run_delay)
+	if (delta > t->sched_info.max_run_delay) {
 		t->sched_info.max_run_delay = delta;
+		ktime_get_real_ts64(&t->sched_info.max_run_delay_ts);
+	}
 	if (delta && (!t->sched_info.min_run_delay || delta < t->sched_info.min_run_delay))
 		t->sched_info.min_run_delay = delta;
 
diff --git a/tools/accounting/getdelays.c b/tools/accounting/getdelays.c
index 21cb3c3d1331..64796c0223be 100644
--- a/tools/accounting/getdelays.c
+++ b/tools/accounting/getdelays.c
@@ -24,6 +24,7 @@
 #include <sys/socket.h>
 #include <sys/wait.h>
 #include <signal.h>
+#include <time.h>
 
 #include <linux/genetlink.h>
 #include <linux/taskstats.h>
@@ -194,6 +195,37 @@ static int get_family_id(int sd)
 #define average_ms(t, c) (t / 1000000ULL / (c ? c : 1))
 #define delay_ms(t) (t / 1000000ULL)
 
+/*
+ * Format a timespec64 as a local-time string "YYYY-MM-DDTHH:MM:SS"
+ * (tv_nsec is not printed). Returns "N/A" if the timestamp is zero
+ * or cannot be converted. The result points to a static buffer that
+ * is overwritten by the next call, so use it before calling again.
+ */
+static const char *format_timespec64(const struct timespec64 *ts)
+{
+	static char buffer[32];
+	time_t time_sec = (time_t)ts->tv_sec;
+	struct tm tm_info;
+
+	/* A zero timestamp is treated as "not set" */
+	if (ts->tv_sec == 0 && ts->tv_nsec == 0)
+		return "N/A";
+
+	/* Convert to broken-down local time; give up if that fails */
+	if (localtime_r(&time_sec, &tm_info) == NULL)
+		return "N/A";
+
+	snprintf(buffer, sizeof(buffer), "%04d-%02d-%02dT%02d:%02d:%02d",
+		tm_info.tm_year + 1900,
+		tm_info.tm_mon + 1,
+		tm_info.tm_mday,
+		tm_info.tm_hour,
+		tm_info.tm_min,
+		tm_info.tm_sec);
+
+	return buffer;
+}
+
 /*
  * Version compatibility note:
  * Field availability depends on taskstats version (t->version),
@@ -205,13 +237,28 @@ static int get_family_id(int sd)
  * version >= 13  - supports WPCOPY statistics
  * version >= 14  - supports IRQ statistics
  * version >= 16  - supports *_max and *_min delay statistics
+ * version >= 17  - supports delay max timestamp statistics
  *
  * Always verify version before accessing version-dependent fields
  * to maintain backward compatibility.
  */
 #define PRINT_CPU_DELAY(version, t) \
 	do { \
-		if (version >= 16) { \
+		if (version >= 17) { \
+			printf("%-10s%15s%15s%15s%15s%15s%15s%15s%25s\n", \
+				"CPU", "count", "real total", "virtual total", \
+				"delay total", "delay average", "delay max", \
+				"delay min", "delay max timestamp"); \
+			printf("          %15llu%15llu%15llu%15llu%15.3fms%13.6fms%13.6fms%23s\n", \
+				(unsigned long long)(t)->cpu_count, \
+				(unsigned long long)(t)->cpu_run_real_total, \
+				(unsigned long long)(t)->cpu_run_virtual_total, \
+				(unsigned long long)(t)->cpu_delay_total, \
+				average_ms((double)(t)->cpu_delay_total, (t)->cpu_count), \
+				delay_ms((double)(t)->cpu_delay_max), \
+				delay_ms((double)(t)->cpu_delay_min), \
+				format_timespec64(&(t)->cpu_delay_max_ts)); \
+		} else if (version >= 16) { \
 			printf("%-10s%15s%15s%15s%15s%15s%15s%15s\n", \
 				"CPU", "count", "real total", "virtual total", \
 				"delay total", "delay average", "delay max", "delay min"); \
@@ -257,44 +304,115 @@ static int get_family_id(int sd)
 		} \
 	} while (0)
 
+#define PRINT_FILED_DELAY_WITH_TS(name, version, t, count, total, max, min, max_ts) \
+	do { \
+		if (version >= 17) { \
+			printf("%-10s%15s%15s%15s%15s%15s%25s\n", \
+				name, "count", "delay total", "delay average", \
+				"delay max", "delay min", "delay max timestamp"); \
+			printf("          %15llu%15llu%15.3fms%13.6fms%13.6fms%23s\n", \
+				(unsigned long long)(t)->count, \
+				(unsigned long long)(t)->total, \
+				average_ms((double)(t)->total, (t)->count), \
+				delay_ms((double)(t)->max), \
+				delay_ms((double)(t)->min), \
+				format_timespec64(&(t)->max_ts)); \
+		} else if (version >= 16) { \
+			printf("%-10s%15s%15s%15s%15s%15s\n", \
+				name, "count", "delay total", "delay average", \
+				"delay max", "delay min"); \
+			printf("          %15llu%15llu%15.3fms%13.6fms%13.6fms\n", \
+				(unsigned long long)(t)->count, \
+				(unsigned long long)(t)->total, \
+				average_ms((double)(t)->total, (t)->count), \
+				delay_ms((double)(t)->max), \
+				delay_ms((double)(t)->min)); \
+		} else { \
+			printf("%-10s%15s%15s%15s\n", \
+				name, "count", "delay total", "delay average"); \
+			printf("          %15llu%15llu%15.3fms\n", \
+				(unsigned long long)(t)->count, \
+				(unsigned long long)(t)->total, \
+				average_ms((double)(t)->total, (t)->count)); \
+		} \
+	} while (0)
+
 static void print_delayacct(struct taskstats *t)
 {
 	printf("\n\n");
 
 	PRINT_CPU_DELAY(t->version, t);
 
-	PRINT_FILED_DELAY("IO", t->version, t,
-		blkio_count, blkio_delay_total,
-		blkio_delay_max, blkio_delay_min);
+	/* Use new macro with timestamp support for version >= 17 */
+	if (t->version >= 17) {
+		PRINT_FILED_DELAY_WITH_TS("IO", t->version, t,
+			blkio_count, blkio_delay_total,
+			blkio_delay_max, blkio_delay_min, blkio_delay_max_ts);
 
-	PRINT_FILED_DELAY("SWAP", t->version, t,
-		swapin_count, swapin_delay_total,
-		swapin_delay_max, swapin_delay_min);
+		PRINT_FILED_DELAY_WITH_TS("SWAP", t->version, t,
+			swapin_count, swapin_delay_total,
+			swapin_delay_max, swapin_delay_min, swapin_delay_max_ts);
 
-	PRINT_FILED_DELAY("RECLAIM", t->version, t,
-		freepages_count, freepages_delay_total,
-		freepages_delay_max, freepages_delay_min);
+		PRINT_FILED_DELAY_WITH_TS("RECLAIM", t->version, t,
+			freepages_count, freepages_delay_total,
+			freepages_delay_max, freepages_delay_min, freepages_delay_max_ts);
 
-	PRINT_FILED_DELAY("THRASHING", t->version, t,
-		thrashing_count, thrashing_delay_total,
-		thrashing_delay_max, thrashing_delay_min);
+		PRINT_FILED_DELAY_WITH_TS("THRASHING", t->version, t,
+			thrashing_count, thrashing_delay_total,
+			thrashing_delay_max, thrashing_delay_min, thrashing_delay_max_ts);
 
-	if (t->version >= 11) {
-		PRINT_FILED_DELAY("COMPACT", t->version, t,
-			compact_count, compact_delay_total,
-			compact_delay_max, compact_delay_min);
-	}
+		if (t->version >= 11) {
+			PRINT_FILED_DELAY_WITH_TS("COMPACT", t->version, t,
+				compact_count, compact_delay_total,
+				compact_delay_max, compact_delay_min, compact_delay_max_ts);
+		}
 
-	if (t->version >= 13) {
-		PRINT_FILED_DELAY("WPCOPY", t->version, t,
-			wpcopy_count, wpcopy_delay_total,
-			wpcopy_delay_max, wpcopy_delay_min);
-	}
+		if (t->version >= 13) {
+			PRINT_FILED_DELAY_WITH_TS("WPCOPY", t->version, t,
+				wpcopy_count, wpcopy_delay_total,
+				wpcopy_delay_max, wpcopy_delay_min, wpcopy_delay_max_ts);
+		}
 
-	if (t->version >= 14) {
-		PRINT_FILED_DELAY("IRQ", t->version, t,
-			irq_count, irq_delay_total,
-			irq_delay_max, irq_delay_min);
+		if (t->version >= 14) {
+			PRINT_FILED_DELAY_WITH_TS("IRQ", t->version, t,
+				irq_count, irq_delay_total,
+				irq_delay_max, irq_delay_min, irq_delay_max_ts);
+		}
+	} else {
+		/* Use original macro for older versions */
+		PRINT_FILED_DELAY("IO", t->version, t,
+			blkio_count, blkio_delay_total,
+			blkio_delay_max, blkio_delay_min);
+
+		PRINT_FILED_DELAY("SWAP", t->version, t,
+			swapin_count, swapin_delay_total,
+			swapin_delay_max, swapin_delay_min);
+
+		PRINT_FILED_DELAY("RECLAIM", t->version, t,
+			freepages_count, freepages_delay_total,
+			freepages_delay_max, freepages_delay_min);
+
+		PRINT_FILED_DELAY("THRASHING", t->version, t,
+			thrashing_count, thrashing_delay_total,
+			thrashing_delay_max, thrashing_delay_min);
+
+		if (t->version >= 11) {
+			PRINT_FILED_DELAY("COMPACT", t->version, t,
+				compact_count, compact_delay_total,
+				compact_delay_max, compact_delay_min);
+		}
+
+		if (t->version >= 13) {
+			PRINT_FILED_DELAY("WPCOPY", t->version, t,
+				wpcopy_count, wpcopy_delay_total,
+				wpcopy_delay_max, wpcopy_delay_min);
+		}
+
+		if (t->version >= 14) {
+			PRINT_FILED_DELAY("IRQ", t->version, t,
+				irq_count, irq_delay_total,
+				irq_delay_max, irq_delay_min);
+		}
 	}
 }
 
-- 
cgit v1.2.3


From 072e6f7f416f5d17be71000b31fb108651ad360d Mon Sep 17 00:00:00 2001
From: Johannes Berg <johannes.berg@intel.com>
Date: Fri, 30 Jan 2026 16:21:06 +0100
Subject: wifi: cfg80211: add initial UHR support

Add initial support for making UHR connections (or suppressing
that), adding UHR capable stations on the AP side, encoding
and decoding UHR MCSes (except rate calculation for the new
MCSes 17, 19, 20 and 23) as well as regulatory support.

Link: https://patch.msgid.link/20260130164259.54cc12fbb307.I26126bebd83c7ab17e99827489f946ceabb3521f@changeid
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/net/cfg80211.h       |  58 ++++++++++++++++++++++--
 include/uapi/linux/nl80211.h |  30 +++++++++++++
 net/wireless/nl80211.c       | 102 +++++++++++++++++++++++++++++++++++++++++--
 net/wireless/reg.c           |   4 +-
 net/wireless/util.c          | 101 ++++++++++++++++++++++++++++++++----------
 5 files changed, 265 insertions(+), 30 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h
index 7911ed58abbb..fc01de19c798 100644
--- a/include/net/cfg80211.h
+++ b/include/net/cfg80211.h
@@ -7,7 +7,7 @@
  * Copyright 2006-2010	Johannes Berg <johannes@sipsolutions.net>
  * Copyright 2013-2014 Intel Mobile Communications GmbH
  * Copyright 2015-2017	Intel Deutschland GmbH
- * Copyright (C) 2018-2025 Intel Corporation
+ * Copyright (C) 2018-2026 Intel Corporation
  */
 
 #include <linux/ethtool.h>
@@ -126,6 +126,7 @@ struct wiphy;
  * @IEEE80211_CHAN_NO_4MHZ: 4 MHz bandwidth is not permitted on this channel.
  * @IEEE80211_CHAN_NO_8MHZ: 8 MHz bandwidth is not permitted on this channel.
  * @IEEE80211_CHAN_NO_16MHZ: 16 MHz bandwidth is not permitted on this channel.
+ * @IEEE80211_CHAN_NO_UHR: UHR operation is not permitted on this channel.
  */
 enum ieee80211_channel_flags {
 	IEEE80211_CHAN_DISABLED			= BIT(0),
@@ -143,6 +144,7 @@ enum ieee80211_channel_flags {
 	IEEE80211_CHAN_NO_10MHZ			= BIT(12),
 	IEEE80211_CHAN_NO_HE			= BIT(13),
 	/* can use free bits here */
+	IEEE80211_CHAN_NO_UHR			= BIT(18),
 	IEEE80211_CHAN_NO_320MHZ		= BIT(19),
 	IEEE80211_CHAN_NO_EHT			= BIT(20),
 	IEEE80211_CHAN_DFS_CONCURRENT		= BIT(21),
@@ -429,6 +431,18 @@ struct ieee80211_sta_eht_cap {
 	u8 eht_ppe_thres[IEEE80211_EHT_PPE_THRES_MAX_LEN];
 };
 
+/**
+ * struct ieee80211_sta_uhr_cap - STA's UHR capabilities
+ * @has_uhr: true iff UHR is supported and data is valid
+ * @mac: fixed MAC capabilities
+ * @phy: fixed PHY capabilities
+ */
+struct ieee80211_sta_uhr_cap {
+	bool has_uhr;
+	struct ieee80211_uhr_cap_mac mac;
+	struct ieee80211_uhr_cap_phy phy;
+};
+
 /* sparse defines __CHECKER__; see Documentation/dev-tools/sparse.rst */
 #ifdef __CHECKER__
 /*
@@ -454,6 +468,7 @@ struct ieee80211_sta_eht_cap {
  * @he_6ghz_capa: HE 6 GHz capabilities, must be filled in for a
  *	6 GHz band channel (and 0 may be valid value).
  * @eht_cap: STA's EHT capabilities
+ * @uhr_cap: STA's UHR capabilities
  * @vendor_elems: vendor element(s) to advertise
  * @vendor_elems.data: vendor element(s) data
  * @vendor_elems.len: vendor element(s) length
@@ -463,6 +478,7 @@ struct ieee80211_sband_iftype_data {
 	struct ieee80211_sta_he_cap he_cap;
 	struct ieee80211_he_6ghz_capa he_6ghz_capa;
 	struct ieee80211_sta_eht_cap eht_cap;
+	struct ieee80211_sta_uhr_cap uhr_cap;
 	struct {
 		const u8 *data;
 		unsigned int len;
@@ -704,6 +720,26 @@ ieee80211_get_eht_iftype_cap(const struct ieee80211_supported_band *sband,
 	return NULL;
 }
 
+/**
+ * ieee80211_get_uhr_iftype_cap - return UHR capabilities for an sband's iftype
+ * @sband: the sband to search for the iftype on
+ * @iftype: enum nl80211_iftype
+ *
+ * Return: pointer to the struct ieee80211_sta_uhr_cap, or NULL if none found
+ */
+static inline const struct ieee80211_sta_uhr_cap *
+ieee80211_get_uhr_iftype_cap(const struct ieee80211_supported_band *sband,
+			     enum nl80211_iftype iftype)
+{
+	const struct ieee80211_sband_iftype_data *data =
+		ieee80211_get_sband_iftype_data(sband, iftype);
+
+	if (data && data->uhr_cap.has_uhr)
+		return &data->uhr_cap;
+
+	return NULL;
+}
+
 /**
  * wiphy_read_of_freq_limits - read frequency limits from device tree
  *
@@ -1486,6 +1522,7 @@ struct cfg80211_s1g_short_beacon {
  * @he_cap: HE capabilities (or %NULL if HE isn't enabled)
  * @eht_cap: EHT capabilities (or %NULL if EHT isn't enabled)
  * @eht_oper: EHT operation IE (or %NULL if EHT isn't enabled)
+ * @uhr_oper: UHR operation (or %NULL if UHR isn't enabled)
  * @ht_required: stations must support HT
  * @vht_required: stations must support VHT
  * @twt_responder: Enable Target Wait Time
@@ -1525,6 +1562,7 @@ struct cfg80211_ap_settings {
 	const struct ieee80211_he_operation *he_oper;
 	const struct ieee80211_eht_cap_elem *eht_cap;
 	const struct ieee80211_eht_operation *eht_oper;
+	const struct ieee80211_uhr_operation *uhr_oper;
 	bool ht_required, vht_required, he_required, sae_h2e_required;
 	bool twt_responder;
 	u32 flags;
@@ -1698,6 +1736,8 @@ struct sta_txpwr {
  * @eht_capa: EHT capabilities of station
  * @eht_capa_len: the length of the EHT capabilities
  * @s1g_capa: S1G capabilities of station
+ * @uhr_capa: UHR capabilities of the station
+ * @uhr_capa_len: the length of the UHR capabilities
  */
 struct link_station_parameters {
 	const u8 *mld_mac;
@@ -1717,6 +1757,8 @@ struct link_station_parameters {
 	const struct ieee80211_eht_cap_elem *eht_capa;
 	u8 eht_capa_len;
 	const struct ieee80211_s1g_cap *s1g_capa;
+	const struct ieee80211_uhr_cap *uhr_capa;
+	u8 uhr_capa_len;
 };
 
 /**
@@ -1898,6 +1940,11 @@ int cfg80211_check_station_change(struct wiphy *wiphy,
  * @RATE_INFO_FLAGS_EXTENDED_SC_DMG: 60GHz extended SC MCS
  * @RATE_INFO_FLAGS_EHT_MCS: EHT MCS information
  * @RATE_INFO_FLAGS_S1G_MCS: MCS field filled with S1G MCS
+ * @RATE_INFO_FLAGS_UHR_MCS: UHR MCS information
+ * @RATE_INFO_FLAGS_UHR_ELR_MCS: UHR ELR MCS was used
+ *	(set together with @RATE_INFO_FLAGS_UHR_MCS)
+ * @RATE_INFO_FLAGS_UHR_IM: UHR Interference Mitigation
+ *	was used
  */
 enum rate_info_flags {
 	RATE_INFO_FLAGS_MCS			= BIT(0),
@@ -1909,6 +1956,9 @@ enum rate_info_flags {
 	RATE_INFO_FLAGS_EXTENDED_SC_DMG		= BIT(6),
 	RATE_INFO_FLAGS_EHT_MCS			= BIT(7),
 	RATE_INFO_FLAGS_S1G_MCS			= BIT(8),
+	RATE_INFO_FLAGS_UHR_MCS			= BIT(9),
+	RATE_INFO_FLAGS_UHR_ELR_MCS		= BIT(10),
+	RATE_INFO_FLAGS_UHR_IM			= BIT(11),
 };
 
 /**
@@ -1924,7 +1974,7 @@ enum rate_info_flags {
  * @RATE_INFO_BW_160: 160 MHz bandwidth
  * @RATE_INFO_BW_HE_RU: bandwidth determined by HE RU allocation
  * @RATE_INFO_BW_320: 320 MHz bandwidth
- * @RATE_INFO_BW_EHT_RU: bandwidth determined by EHT RU allocation
+ * @RATE_INFO_BW_EHT_RU: bandwidth determined by EHT/UHR RU allocation
  * @RATE_INFO_BW_1: 1 MHz bandwidth
  * @RATE_INFO_BW_2: 2 MHz bandwidth
  * @RATE_INFO_BW_4: 4 MHz bandwidth
@@ -1955,7 +2005,7 @@ enum rate_info_bw {
  *
  * @flags: bitflag of flags from &enum rate_info_flags
  * @legacy: bitrate in 100kbit/s for 802.11abg
- * @mcs: mcs index if struct describes an HT/VHT/HE/EHT/S1G rate
+ * @mcs: mcs index if struct describes an HT/VHT/HE/EHT/S1G/UHR rate
  * @nss: number of streams (VHT & HE only)
  * @bw: bandwidth (from &enum rate_info_bw)
  * @he_gi: HE guard interval (from &enum nl80211_he_gi)
@@ -3262,6 +3312,7 @@ struct cfg80211_ml_reconf_req {
  *	Drivers shall disable MLO features for the current association if this
  *	flag is not set.
  * @ASSOC_REQ_SPP_AMSDU: SPP A-MSDUs will be used on this connection (if any)
+ * @ASSOC_REQ_DISABLE_UHR: Disable UHR
  */
 enum cfg80211_assoc_req_flags {
 	ASSOC_REQ_DISABLE_HT			= BIT(0),
@@ -3272,6 +3323,7 @@ enum cfg80211_assoc_req_flags {
 	ASSOC_REQ_DISABLE_EHT			= BIT(5),
 	CONNECT_REQ_MLO_SUPPORT			= BIT(6),
 	ASSOC_REQ_SPP_AMSDU			= BIT(7),
+	ASSOC_REQ_DISABLE_UHR			= BIT(8),
 };
 
 /**
diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h
index 706a98686068..b63f71850906 100644
--- a/include/uapi/linux/nl80211.h
+++ b/include/uapi/linux/nl80211.h
@@ -2977,6 +2977,13 @@ enum nl80211_commands {
  * @NL80211_ATTR_EPP_PEER: A flag attribute to indicate if the peer is an EPP
  *	STA. Used with %NL80211_CMD_NEW_STA and %NL80211_CMD_ADD_LINK_STA
  *
+ * @NL80211_ATTR_UHR_CAPABILITY: UHR Capability information element (from
+ *	association request when used with NL80211_CMD_NEW_STATION). Can be set
+ *	only if HE/EHT are also available.
+ * @NL80211_ATTR_DISABLE_UHR: Force UHR capable interfaces to disable
+ *	this feature during association. This is a flag attribute.
+ *	Currently only supported in mac80211 drivers.
+ *
  * @NUM_NL80211_ATTR: total number of nl80211_attrs available
  * @NL80211_ATTR_MAX: highest attribute number currently defined
  * @__NL80211_ATTR_AFTER_LAST: internal use
@@ -3547,6 +3554,9 @@ enum nl80211_attrs {
 
 	NL80211_ATTR_EPP_PEER,
 
+	NL80211_ATTR_UHR_CAPABILITY,
+	NL80211_ATTR_DISABLE_UHR,
+
 	/* add attributes here, update the policy in nl80211.c */
 
 	__NL80211_ATTR_AFTER_LAST,
@@ -3899,6 +3909,12 @@ enum nl80211_eht_ru_alloc {
  * @NL80211_RATE_INFO_4_MHZ_WIDTH: 4 MHz S1G rate
  * @NL80211_RATE_INFO_8_MHZ_WIDTH: 8 MHz S1G rate
  * @NL80211_RATE_INFO_16_MHZ_WIDTH: 16 MHz S1G rate
+ * @NL80211_RATE_INFO_UHR_MCS: UHR MCS index (u8, 0-15, 17, 19, 20, 23)
+ *	Note that the other EHT attributes (such as @NL80211_RATE_INFO_EHT_NSS)
+ *	are used in conjunction with this where applicable
+ * @NL80211_RATE_INFO_UHR_ELR: UHR ELR flag, which restricts NSS to 1,
+ *	MCS to 0 or 1, and GI to %NL80211_RATE_INFO_EHT_GI_1_6.
+ * @NL80211_RATE_INFO_UHR_IM: UHR Interference Mitigation flag
  * @__NL80211_RATE_INFO_AFTER_LAST: internal use
  */
 enum nl80211_rate_info {
@@ -3932,6 +3948,9 @@ enum nl80211_rate_info {
 	NL80211_RATE_INFO_4_MHZ_WIDTH,
 	NL80211_RATE_INFO_8_MHZ_WIDTH,
 	NL80211_RATE_INFO_16_MHZ_WIDTH,
+	NL80211_RATE_INFO_UHR_MCS,
+	NL80211_RATE_INFO_UHR_ELR,
+	NL80211_RATE_INFO_UHR_IM,
 
 	/* keep last */
 	__NL80211_RATE_INFO_AFTER_LAST,
@@ -4254,6 +4273,10 @@ enum nl80211_mpath_info {
  *	capabilities element
  * @NL80211_BAND_IFTYPE_ATTR_EHT_CAP_PPE: EHT PPE thresholds information as
  *	defined in EHT capabilities element
+ * @NL80211_BAND_IFTYPE_ATTR_UHR_CAP_MAC: UHR MAC capabilities as in UHR
+ *	capabilities element
+ * @NL80211_BAND_IFTYPE_ATTR_UHR_CAP_PHY: UHR PHY capabilities as in UHR
+ *	capabilities element
  * @__NL80211_BAND_IFTYPE_ATTR_AFTER_LAST: internal use
  * @NL80211_BAND_IFTYPE_ATTR_MAX: highest band attribute currently defined
  */
@@ -4271,6 +4294,8 @@ enum nl80211_band_iftype_attr {
 	NL80211_BAND_IFTYPE_ATTR_EHT_CAP_PHY,
 	NL80211_BAND_IFTYPE_ATTR_EHT_CAP_MCS_SET,
 	NL80211_BAND_IFTYPE_ATTR_EHT_CAP_PPE,
+	NL80211_BAND_IFTYPE_ATTR_UHR_CAP_MAC,
+	NL80211_BAND_IFTYPE_ATTR_UHR_CAP_PHY,
 
 	/* keep last */
 	__NL80211_BAND_IFTYPE_ATTR_AFTER_LAST,
@@ -4453,6 +4478,8 @@ enum nl80211_wmm_rule {
  * @NL80211_FREQUENCY_ATTR_S1G_NO_PRIMARY: Channel is not permitted for use
  *	as a primary channel. Does not prevent the channel from existing
  *	as a non-primary subchannel. Only applicable to S1G channels.
+ * @NL80211_FREQUENCY_ATTR_NO_UHR: UHR operation is not allowed on this channel
+ *	in current regulatory domain.
  * @NL80211_FREQUENCY_ATTR_MAX: highest frequency attribute number
  *	currently defined
  * @__NL80211_FREQUENCY_ATTR_AFTER_LAST: internal use
@@ -4502,6 +4529,7 @@ enum nl80211_frequency_attr {
 	NL80211_FREQUENCY_ATTR_NO_8MHZ,
 	NL80211_FREQUENCY_ATTR_NO_16MHZ,
 	NL80211_FREQUENCY_ATTR_S1G_NO_PRIMARY,
+	NL80211_FREQUENCY_ATTR_NO_UHR,
 
 	/* keep last */
 	__NL80211_FREQUENCY_ATTR_AFTER_LAST,
@@ -4715,6 +4743,7 @@ enum nl80211_sched_scan_match_attr {
  *	despite NO_IR configuration.
  * @NL80211_RRF_ALLOW_20MHZ_ACTIVITY: Allow activity in 20 MHz bandwidth,
  *	despite NO_IR configuration.
+ * @NL80211_RRF_NO_UHR: UHR operation not allowed
  */
 enum nl80211_reg_rule_flags {
 	NL80211_RRF_NO_OFDM                 = 1 << 0,
@@ -4741,6 +4770,7 @@ enum nl80211_reg_rule_flags {
 	NL80211_RRF_NO_6GHZ_AFC_CLIENT      = 1 << 23,
 	NL80211_RRF_ALLOW_6GHZ_VLP_AP       = 1 << 24,
 	NL80211_RRF_ALLOW_20MHZ_ACTIVITY    = 1 << 25,
+	NL80211_RRF_NO_UHR		    = 1 << 26,
 };
 
 #define NL80211_RRF_PASSIVE_SCAN	NL80211_RRF_NO_IR
diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c
index 9aa83a6943a2..6e58b238a1f8 100644
--- a/net/wireless/nl80211.c
+++ b/net/wireless/nl80211.c
@@ -332,6 +332,15 @@ static int validate_nan_cluster_id(const struct nlattr *attr,
 	return 0;
 }
 
+/* nla_policy validator: map the UHR capability size check to 0 / -EINVAL. */
+static int validate_uhr_capa(const struct nlattr *attr,
+			     struct netlink_ext_ack *extack)
+{
+	if (!ieee80211_uhr_capa_size_ok(nla_data(attr), nla_len(attr), false))
+		return -EINVAL;
+	return 0;
+}
+
 /* policy for the attributes */
 static const struct nla_policy nl80211_policy[NUM_NL80211_ATTR];
 
@@ -934,6 +943,9 @@ static const struct nla_policy nl80211_policy[NUM_NL80211_ATTR] = {
 	[NL80211_ATTR_BSS_PARAM] = { .type = NLA_FLAG },
 	[NL80211_ATTR_S1G_PRIMARY_2MHZ] = { .type = NLA_FLAG },
 	[NL80211_ATTR_EPP_PEER] = { .type = NLA_FLAG },
+	[NL80211_ATTR_UHR_CAPABILITY] =
+		NLA_POLICY_VALIDATE_FN(NLA_BINARY, validate_uhr_capa, 255),
+	[NL80211_ATTR_DISABLE_UHR] = { .type = NLA_FLAG },
 };
 
 /* policy for the key attributes */
@@ -1319,6 +1331,9 @@ static int nl80211_msg_put_channel(struct sk_buff *msg, struct wiphy *wiphy,
 		if ((chan->flags & IEEE80211_CHAN_S1G_NO_PRIMARY) &&
 		    nla_put_flag(msg, NL80211_FREQUENCY_ATTR_S1G_NO_PRIMARY))
 			goto nla_put_failure;
+		if ((chan->flags & IEEE80211_CHAN_NO_UHR) &&
+		    nla_put_flag(msg, NL80211_FREQUENCY_ATTR_NO_UHR))
+			goto nla_put_failure;
 	}
 
 	if (nla_put_u32(msg, NL80211_FREQUENCY_ATTR_MAX_TX_POWER,
@@ -1954,6 +1969,7 @@ nl80211_send_iftype_data(struct sk_buff *msg,
 {
 	const struct ieee80211_sta_he_cap *he_cap = &iftdata->he_cap;
 	const struct ieee80211_sta_eht_cap *eht_cap = &iftdata->eht_cap;
+	const struct ieee80211_sta_uhr_cap *uhr_cap = &iftdata->uhr_cap;
 
 	if (nl80211_put_iftypes(msg, NL80211_BAND_IFTYPE_ATTR_IFTYPES,
 				iftdata->types_mask))
@@ -2005,6 +2021,14 @@ nl80211_send_iftype_data(struct sk_buff *msg,
 			return -ENOBUFS;
 	}
 
+	if (uhr_cap->has_uhr) {
+		if (nla_put(msg, NL80211_BAND_IFTYPE_ATTR_UHR_CAP_MAC,
+			    sizeof(uhr_cap->mac), &uhr_cap->mac) ||
+		    nla_put(msg, NL80211_BAND_IFTYPE_ATTR_UHR_CAP_PHY,
+			    sizeof(uhr_cap->phy), &uhr_cap->phy))
+			return -ENOBUFS;
+	}
+
 	if (sband->band == NL80211_BAND_6GHZ &&
 	    nla_put(msg, NL80211_BAND_IFTYPE_ATTR_HE_6GHZ_CAPA,
 		    sizeof(iftdata->he_6ghz_capa),
@@ -6462,6 +6486,17 @@ static int nl80211_calculate_ap_params(struct cfg80211_ap_settings *params)
 						cap->datalen - 1))
 			return -EINVAL;
 	}
+
+	cap = cfg80211_find_ext_elem(WLAN_EID_EXT_UHR_OPER, ies, ies_len);
+	if (cap) {
+		if (!cap->datalen)
+			return -EINVAL;
+		params->uhr_oper = (void *)(cap->data + 1);
+		if (!ieee80211_uhr_oper_size_ok((const u8 *)params->uhr_oper,
+						cap->datalen - 1, true))
+			return -EINVAL;
+	}
+
 	return 0;
 }
 
@@ -6593,6 +6628,9 @@ static int nl80211_validate_ap_phy_operation(struct cfg80211_ap_settings *params
 	    (channel->flags & IEEE80211_CHAN_NO_EHT))
 		return -EOPNOTSUPP;
 
+	if (params->uhr_oper && (channel->flags & IEEE80211_CHAN_NO_UHR))
+		return -EOPNOTSUPP;
+
 	return 0;
 }
 
@@ -7175,7 +7213,8 @@ bool nl80211_put_sta_rate(struct sk_buff *msg, struct rate_info *info, int attr)
 		break;
 	case RATE_INFO_BW_EHT_RU:
 		rate_flg = 0;
-		WARN_ON(!(info->flags & RATE_INFO_FLAGS_EHT_MCS));
+		WARN_ON(!(info->flags & RATE_INFO_FLAGS_EHT_MCS) &&
+			!(info->flags & RATE_INFO_FLAGS_UHR_MCS));
 		break;
 	}
 
@@ -7228,6 +7267,23 @@ bool nl80211_put_sta_rate(struct sk_buff *msg, struct rate_info *info, int attr)
 		    nla_put_u8(msg, NL80211_RATE_INFO_EHT_RU_ALLOC,
 			       info->eht_ru_alloc))
 			return false;
+	} else if (info->flags & RATE_INFO_FLAGS_UHR_MCS) {
+		if (nla_put_u8(msg, NL80211_RATE_INFO_UHR_MCS, info->mcs))
+			return false;
+		if (nla_put_u8(msg, NL80211_RATE_INFO_EHT_NSS, info->nss))
+			return false;
+		if (nla_put_u8(msg, NL80211_RATE_INFO_EHT_GI, info->eht_gi))
+			return false;
+		if (info->bw == RATE_INFO_BW_EHT_RU &&
+		    nla_put_u8(msg, NL80211_RATE_INFO_EHT_RU_ALLOC,
+			       info->eht_ru_alloc))
+			return false;
+		if (info->flags & RATE_INFO_FLAGS_UHR_ELR_MCS &&
+		    nla_put_flag(msg, NL80211_RATE_INFO_UHR_ELR))
+			return false;
+		if (info->flags & RATE_INFO_FLAGS_UHR_IM &&
+		    nla_put_flag(msg, NL80211_RATE_INFO_UHR_IM))
+			return false;
 	}
 
 	nla_nest_end(msg, rate);
@@ -8101,7 +8157,8 @@ int cfg80211_check_station_change(struct wiphy *wiphy,
 		if (params->ext_capab || params->link_sta_params.ht_capa ||
 		    params->link_sta_params.vht_capa ||
 		    params->link_sta_params.he_capa ||
-		    params->link_sta_params.eht_capa)
+		    params->link_sta_params.eht_capa ||
+		    params->link_sta_params.uhr_capa)
 			return -EINVAL;
 		if (params->sta_flags_mask & BIT(NL80211_STA_FLAG_SPP_AMSDU))
 			return -EINVAL;
@@ -8321,6 +8378,16 @@ static int nl80211_set_station_tdls(struct genl_info *info,
 		}
 	}
 
+	if (info->attrs[NL80211_ATTR_UHR_CAPABILITY]) {
+		if (!params->link_sta_params.eht_capa)
+			return -EINVAL;
+
+		params->link_sta_params.uhr_capa =
+			nla_data(info->attrs[NL80211_ATTR_UHR_CAPABILITY]);
+		params->link_sta_params.uhr_capa_len =
+			nla_len(info->attrs[NL80211_ATTR_UHR_CAPABILITY]);
+	}
+
 	if (info->attrs[NL80211_ATTR_S1G_CAPABILITY])
 		params->link_sta_params.s1g_capa =
 			nla_data(info->attrs[NL80211_ATTR_S1G_CAPABILITY]);
@@ -8641,6 +8708,16 @@ static int nl80211_new_station(struct sk_buff *skb, struct genl_info *info)
 		}
 	}
 
+	if (info->attrs[NL80211_ATTR_UHR_CAPABILITY]) {
+		if (!params.link_sta_params.eht_capa)
+			return -EINVAL;
+
+		params.link_sta_params.uhr_capa =
+			nla_data(info->attrs[NL80211_ATTR_UHR_CAPABILITY]);
+		params.link_sta_params.uhr_capa_len =
+			nla_len(info->attrs[NL80211_ATTR_UHR_CAPABILITY]);
+	}
+
 	if (info->attrs[NL80211_ATTR_EML_CAPABILITY]) {
 		params.eml_cap_present = true;
 		params.eml_cap =
@@ -8700,10 +8777,11 @@ static int nl80211_new_station(struct sk_buff *skb, struct genl_info *info)
 		params.link_sta_params.ht_capa = NULL;
 		params.link_sta_params.vht_capa = NULL;
 
-		/* HE and EHT require WME */
+		/* HE, EHT and UHR require WME */
 		if (params.link_sta_params.he_capa_len ||
 		    params.link_sta_params.he_6ghz_capa ||
-		    params.link_sta_params.eht_capa_len)
+		    params.link_sta_params.eht_capa_len ||
+		    params.link_sta_params.uhr_capa_len)
 			return -EINVAL;
 	}
 
@@ -12376,6 +12454,9 @@ static int nl80211_associate(struct sk_buff *skb, struct genl_info *info)
 	if (nla_get_flag(info->attrs[NL80211_ATTR_DISABLE_EHT]))
 		req.flags |= ASSOC_REQ_DISABLE_EHT;
 
+	if (nla_get_flag(info->attrs[NL80211_ATTR_DISABLE_UHR]))
+		req.flags |= ASSOC_REQ_DISABLE_UHR;
+
 	if (info->attrs[NL80211_ATTR_VHT_CAPABILITY_MASK])
 		memcpy(&req.vht_capa_mask,
 		       nla_data(info->attrs[NL80211_ATTR_VHT_CAPABILITY_MASK]),
@@ -13248,6 +13329,9 @@ static int nl80211_connect(struct sk_buff *skb, struct genl_info *info)
 	if (nla_get_flag(info->attrs[NL80211_ATTR_DISABLE_EHT]))
 		connect.flags |= ASSOC_REQ_DISABLE_EHT;
 
+	if (nla_get_flag(info->attrs[NL80211_ATTR_DISABLE_UHR]))
+		connect.flags |= ASSOC_REQ_DISABLE_UHR;
+
 	if (info->attrs[NL80211_ATTR_VHT_CAPABILITY_MASK])
 		memcpy(&connect.vht_capa_mask,
 		       nla_data(info->attrs[NL80211_ATTR_VHT_CAPABILITY_MASK]),
@@ -17680,6 +17764,16 @@ nl80211_add_mod_link_station(struct sk_buff *skb, struct genl_info *info,
 		}
 	}
 
+	if (info->attrs[NL80211_ATTR_UHR_CAPABILITY]) {
+		if (!params.eht_capa)
+			return -EINVAL;
+
+		params.uhr_capa =
+			nla_data(info->attrs[NL80211_ATTR_UHR_CAPABILITY]);
+		params.uhr_capa_len =
+			nla_len(info->attrs[NL80211_ATTR_UHR_CAPABILITY]);
+	}
+
 	if (info->attrs[NL80211_ATTR_HE_6GHZ_CAPABILITY])
 		params.he_6ghz_capa =
 			nla_data(info->attrs[NL80211_ATTR_HE_6GHZ_CAPABILITY]);
diff --git a/net/wireless/reg.c b/net/wireless/reg.c
index 6cbfa3b78311..139cb27e5a81 100644
--- a/net/wireless/reg.c
+++ b/net/wireless/reg.c
@@ -5,7 +5,7 @@
  * Copyright 2008-2011	Luis R. Rodriguez <mcgrof@qca.qualcomm.com>
  * Copyright 2013-2014  Intel Mobile Communications GmbH
  * Copyright      2017  Intel Deutschland GmbH
- * Copyright (C) 2018 - 2025 Intel Corporation
+ * Copyright (C) 2018 - 2026 Intel Corporation
  *
  * Permission to use, copy, modify, and/or distribute this software for any
  * purpose with or without fee is hereby granted, provided that the above
@@ -1605,6 +1605,8 @@ static u32 map_regdom_flags(u32 rd_flags)
 		channel_flags |= IEEE80211_CHAN_ALLOW_6GHZ_VLP_AP;
 	if (rd_flags & NL80211_RRF_ALLOW_20MHZ_ACTIVITY)
 		channel_flags |= IEEE80211_CHAN_ALLOW_20MHZ_ACTIVITY;
+	if (rd_flags & NL80211_RRF_NO_UHR)
+		channel_flags |= IEEE80211_CHAN_NO_UHR;
 	return channel_flags;
 }
 
diff --git a/net/wireless/util.c b/net/wireless/util.c
index 08c525835518..404fe604a8db 100644
--- a/net/wireless/util.c
+++ b/net/wireless/util.c
@@ -5,7 +5,7 @@
  * Copyright 2007-2009	Johannes Berg <johannes@sipsolutions.net>
  * Copyright 2013-2014  Intel Mobile Communications GmbH
  * Copyright 2017	Intel Deutschland GmbH
- * Copyright (C) 2018-2023, 2025 Intel Corporation
+ * Copyright (C) 2018-2023, 2025-2026 Intel Corporation
  */
 #include <linux/export.h>
 #include <linux/bitops.h>
@@ -1574,26 +1574,30 @@ static u32 cfg80211_calculate_bitrate_he(struct rate_info *rate)
 	return result / 10000;
 }
 
-static u32 cfg80211_calculate_bitrate_eht(struct rate_info *rate)
+static u32 _cfg80211_calculate_bitrate_eht_uhr(struct rate_info *rate)
 {
 #define SCALE 6144
-	static const u32 mcs_divisors[16] = {
-		102399, /* 16.666666... */
-		 51201, /*  8.333333... */
-		 34134, /*  5.555555... */
-		 25599, /*  4.166666... */
-		 17067, /*  2.777777... */
-		 12801, /*  2.083333... */
-		 11377, /*  1.851725... */
-		 10239, /*  1.666666... */
-		  8532, /*  1.388888... */
-		  7680, /*  1.250000... */
-		  6828, /*  1.111111... */
-		  6144, /*  1.000000... */
-		  5690, /*  0.926106... */
-		  5120, /*  0.833333... */
-		409600, /* 66.666666... */
-		204800, /* 33.333333... */
+	static const u32 mcs_divisors[] = {
+		[ 0] = 102399, /* 16.666666... */
+		[ 1] =  51201, /*  8.333333... */
+		[ 2] =  34134, /*  5.555555... */
+		[ 3] =  25599, /*  4.166666... */
+		[ 4] =  17067, /*  2.777777... */
+		[ 5] =  12801, /*  2.083333... */
+		[ 6] =  11377, /*  1.851725... */
+		[ 7] =  10239, /*  1.666666... */
+		[ 8] =   8532, /*  1.388888... */
+		[ 9] =   7680, /*  1.250000... */
+		[10] =   6828, /*  1.111111... */
+		[11] =   6144, /*  1.000000... */
+		[12] =   5690, /*  0.926106... */
+		[13] =   5120, /*  0.833333... */
+		[14] = 409600, /* 66.666666... */
+		[15] = 204800, /* 33.333333... */
+		[17] =  38400, /*  6.250180... */
+		[19] =  19200, /*  3.125090... */
+		[20] =  15360, /*  2.500000... */
+		[23] =   9600, /*  1.562545... */
 	};
 	static const u32 rates_996[3] =  { 480388888, 453700000, 408333333 };
 	static const u32 rates_484[3] =  { 229411111, 216666666, 195000000 };
@@ -1604,8 +1608,6 @@ static u32 cfg80211_calculate_bitrate_eht(struct rate_info *rate)
 	u64 tmp;
 	u32 result;
 
-	if (WARN_ON_ONCE(rate->mcs > 15))
-		return 0;
 	if (WARN_ON_ONCE(rate->eht_gi > NL80211_RATE_INFO_EHT_GI_3_2))
 		return 0;
 	if (WARN_ON_ONCE(rate->eht_ru_alloc >
@@ -1686,7 +1688,7 @@ static u32 cfg80211_calculate_bitrate_eht(struct rate_info *rate)
 		 rate->eht_ru_alloc == NL80211_RATE_INFO_EHT_RU_ALLOC_26)
 		result = rates_26[rate->eht_gi];
 	else {
-		WARN(1, "invalid EHT MCS: bw:%d, ru:%d\n",
+		WARN(1, "invalid EHT or UHR MCS: bw:%d, ru:%d\n",
 		     rate->bw, rate->eht_ru_alloc);
 		return 0;
 	}
@@ -1700,11 +1702,64 @@ static u32 cfg80211_calculate_bitrate_eht(struct rate_info *rate)
 	tmp *= rate->nss;
 	do_div(tmp, 8);
 
+	/* and handle interference mitigation - 0.9x */
+	if (rate->flags & RATE_INFO_FLAGS_UHR_IM) {
+		if (WARN(rate->nss != 1 || rate->mcs == 15,
+			 "invalid NSS or MCS for UHR IM\n"))
+			return 0;
+		tmp *= 9000;
+		do_div(tmp, 10000);
+	}
+
 	result = tmp;
 
 	return result / 10000;
 }
 
+static u32 cfg80211_calculate_bitrate_eht(struct rate_info *rate)
+{
+	if (WARN_ONCE(rate->mcs > 15, "bad EHT MCS %d\n", rate->mcs))
+		return 0;
+
+	if (WARN_ONCE(rate->flags & (RATE_INFO_FLAGS_UHR_ELR_MCS |
+				     RATE_INFO_FLAGS_UHR_IM),
+		      "bad EHT MCS flags 0x%x\n", rate->flags))
+		return 0;
+
+	return _cfg80211_calculate_bitrate_eht_uhr(rate);
+}
+
+static u32 cfg80211_calculate_bitrate_uhr(struct rate_info *rate)
+{
+	if (rate->flags & RATE_INFO_FLAGS_UHR_ELR_MCS) {
+		WARN_ONCE(rate->eht_gi != NL80211_RATE_INFO_EHT_GI_1_6,
+			  "bad UHR ELR guard interval %d\n",
+			  rate->eht_gi);
+		WARN_ONCE(rate->mcs > 1, "bad UHR ELR MCS %d\n", rate->mcs);
+		WARN_ONCE(rate->nss != 1, "bad UHR ELR NSS %d\n", rate->nss);
+		WARN_ONCE(rate->bw != RATE_INFO_BW_20,
+			  "bad UHR ELR bandwidth %d\n",
+			  rate->bw);
+		WARN_ONCE(rate->flags & RATE_INFO_FLAGS_UHR_IM,
+			  "bad UHR MCS flags 0x%x\n", rate->flags);
+		if (rate->mcs == 0)
+			return 17;
+		return 33;
+	}
+
+	switch (rate->mcs) {
+	case 0 ... 15:
+	case 17:
+	case 19:
+	case 20:
+	case 23:
+		return _cfg80211_calculate_bitrate_eht_uhr(rate);
+	}
+
+	WARN_ONCE(1, "bad UHR MCS %d\n", rate->mcs);
+	return 0;
+}
+
 static u32 cfg80211_calculate_bitrate_s1g(struct rate_info *rate)
 {
 	/* For 1, 2, 4, 8 and 16 MHz channels */
@@ -1829,6 +1884,8 @@ u32 cfg80211_calculate_bitrate(struct rate_info *rate)
 		return cfg80211_calculate_bitrate_he(rate);
 	if (rate->flags & RATE_INFO_FLAGS_EHT_MCS)
 		return cfg80211_calculate_bitrate_eht(rate);
+	if (rate->flags & RATE_INFO_FLAGS_UHR_MCS)
+		return cfg80211_calculate_bitrate_uhr(rate);
 	if (rate->flags & RATE_INFO_FLAGS_S1G_MCS)
 		return cfg80211_calculate_bitrate_s1g(rate);
 
-- 
cgit v1.2.3


From 3495064b6d65a669b409cfe1241db4f3c540251a Mon Sep 17 00:00:00 2001
From: Babis Chalios <bchalios@amazon.es>
Date: Fri, 30 Jan 2026 17:36:00 +0000
Subject: ptp: vmclock: add vm generation counter

Similar to live migration, loading a VM from some saved state (aka
snapshot) is also an event that calls for clock adjustments in the
guest. However, guests might want to take more actions as a response to
such events, e.g. discarding UUIDs, resetting network connections,
reseeding entropy pools, etc. These are actions that guests don't
typically take during live migration, so add a new field in the
vmclock_abi called vm_generation_counter which informs the guest about
such events.

The hypervisor advertises support for vm_generation_counter through the
VMCLOCK_FLAG_VM_GEN_COUNTER_PRESENT flag. Users need to check for the
presence of this bit in the vmclock_abi flags field before using the
vm_generation_counter field.

Signed-off-by: Babis Chalios <bchalios@amazon.es>
Signed-off-by: David Woodhouse <dwmw@amazon.co.uk>
Reviewed-by: David Woodhouse <dwmw@amazon.co.uk>
Tested-by: Takahiro Itazur <itazur@amazon.com>
Link: https://patch.msgid.link/20260130173704.12575-2-itazur@amazon.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/uapi/linux/vmclock-abi.h | 15 +++++++++++++++
 1 file changed, 15 insertions(+)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/vmclock-abi.h b/include/uapi/linux/vmclock-abi.h
index 2d99b29ac44a..937fe00e4f33 100644
--- a/include/uapi/linux/vmclock-abi.h
+++ b/include/uapi/linux/vmclock-abi.h
@@ -115,6 +115,12 @@ struct vmclock_abi {
 	 * bit again after the update, using the about-to-be-valid fields.
 	 */
 #define VMCLOCK_FLAG_TIME_MONOTONIC		(1 << 7)
+	/*
+	 * If the VM_GEN_COUNTER_PRESENT flag is set, the hypervisor will
+	 * bump the vm_generation_counter field every time the guest is
+	 * loaded from some save state (restored from a snapshot).
+	 */
+#define VMCLOCK_FLAG_VM_GEN_COUNTER_PRESENT     (1 << 8)
 
 	__u8 pad[2];
 	__u8 clock_status;
@@ -177,6 +183,15 @@ struct vmclock_abi {
 	__le64 time_frac_sec;		/* Units of 1/2^64 of a second */
 	__le64 time_esterror_nanosec;
 	__le64 time_maxerror_nanosec;
+
+	/*
+	 * This field changes to another non-repeating value when the guest
+	 * has been loaded from a snapshot. In addition to handling a
+	 * disruption in time (which will also be signalled through the
+	 * disruption_marker field), a guest may wish to discard UUIDs,
+	 * reset network connections, reseed entropy, etc.
+	 */
+	__le64 vm_generation_counter;
 };
 
 #endif /*  __VMCLOCK_ABI_H__ */
-- 
cgit v1.2.3


From 3b1526ddb25452385b52f2588b655f524a57070b Mon Sep 17 00:00:00 2001
From: Babis Chalios <bchalios@amazon.es>
Date: Fri, 30 Jan 2026 17:36:01 +0000
Subject: ptp: vmclock: support device notifications

Add optional support for device notifications in VMClock. When
supported, the hypervisor will send a device notification every time it
updates the seq_count to a new even value.

Moreover, add support for poll() in VMClock as a means to propagate this
notification to user space. poll() will return a POLLIN event to
listeners every time seq_count changes to a value different than the one
last seen (since open() or last read()/pread()). This means that when
poll() returns a POLLIN event, listeners need to use read() to observe
what has changed and update the reader's view of seq_count. In other
words, once poll() has returned POLLIN, all subsequent calls to poll() will
immediately return with a POLLIN event until the listener calls read().

The device advertises support for the notification mechanism by setting
the VMCLOCK_FLAG_NOTIFICATION_PRESENT flag in the vmclock_abi flags field.
If the flag is not present, the driver won't set up the ACPI notification
handler and poll() will always immediately return POLLHUP.

Signed-off-by: Babis Chalios <bchalios@amazon.es>
Signed-off-by: David Woodhouse <dwmw@amazon.co.uk>
Signed-off-by: Takahiro Itazuri <itazur@amazon.com>
Reviewed-by: David Woodhouse <dwmw@amazon.co.uk>
Tested-by: Takahiro Itazuri <itazur@amazon.com>
Link: https://patch.msgid.link/20260130173704.12575-3-itazur@amazon.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 drivers/ptp/ptp_vmclock.c        | 162 ++++++++++++++++++++++++++++++++++-----
 include/uapi/linux/vmclock-abi.h |   5 ++
 2 files changed, 148 insertions(+), 19 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/drivers/ptp/ptp_vmclock.c b/drivers/ptp/ptp_vmclock.c
index b3a83b03d9c1..f8b24f9e85cd 100644
--- a/drivers/ptp/ptp_vmclock.c
+++ b/drivers/ptp/ptp_vmclock.c
@@ -5,6 +5,9 @@
  * Copyright © 2024 Amazon.com, Inc. or its affiliates.
  */
 
+#include "linux/poll.h"
+#include "linux/types.h"
+#include "linux/wait.h"
 #include <linux/acpi.h>
 #include <linux/device.h>
 #include <linux/err.h>
@@ -39,6 +42,7 @@ struct vmclock_state {
 	struct resource res;
 	struct vmclock_abi *clk;
 	struct miscdevice miscdev;
+	wait_queue_head_t disrupt_wait;
 	struct ptp_clock_info ptp_clock_info;
 	struct ptp_clock *ptp_clock;
 	enum clocksource_ids cs_id, sys_cs_id;
@@ -357,10 +361,15 @@ static struct ptp_clock *vmclock_ptp_register(struct device *dev,
 	return ptp_clock_register(&st->ptp_clock_info, dev);
 }
 
+struct vmclock_file_state {
+	struct vmclock_state *st;
+	atomic_t seq;
+};
+
 static int vmclock_miscdev_mmap(struct file *fp, struct vm_area_struct *vma)
 {
-	struct vmclock_state *st = container_of(fp->private_data,
-						struct vmclock_state, miscdev);
+	struct vmclock_file_state *fst = fp->private_data;
+	struct vmclock_state *st = fst->st;
 
 	if ((vma->vm_flags & (VM_READ|VM_WRITE)) != VM_READ)
 		return -EROFS;
@@ -379,11 +388,11 @@ static int vmclock_miscdev_mmap(struct file *fp, struct vm_area_struct *vma)
 static ssize_t vmclock_miscdev_read(struct file *fp, char __user *buf,
 				    size_t count, loff_t *ppos)
 {
-	struct vmclock_state *st = container_of(fp->private_data,
-						struct vmclock_state, miscdev);
 	ktime_t deadline = ktime_add(ktime_get(), VMCLOCK_MAX_WAIT);
+	struct vmclock_file_state *fst = fp->private_data;
+	struct vmclock_state *st = fst->st;
+	uint32_t seq, old_seq;
 	size_t max_count;
-	uint32_t seq;
 
 	if (*ppos >= PAGE_SIZE)
 		return 0;
@@ -392,6 +401,7 @@ static ssize_t vmclock_miscdev_read(struct file *fp, char __user *buf,
 	if (count > max_count)
 		count = max_count;
 
+	old_seq = atomic_read(&fst->seq);
 	while (1) {
 		seq = le32_to_cpu(st->clk->seq_count) & ~1U;
 		/* Pairs with hypervisor wmb */
@@ -402,8 +412,16 @@ static ssize_t vmclock_miscdev_read(struct file *fp, char __user *buf,
 
 		/* Pairs with hypervisor wmb */
 		virt_rmb();
-		if (seq == le32_to_cpu(st->clk->seq_count))
-			break;
+		if (seq == le32_to_cpu(st->clk->seq_count)) {
+			/*
+			 * Either we updated fst->seq to seq (the latest version we observed)
+			 * or someone else did (old_seq == seq), so we can break.
+			 */
+			if (atomic_try_cmpxchg(&fst->seq, &old_seq, seq) ||
+			    old_seq == seq) {
+				break;
+			}
+		}
 
 		if (ktime_after(ktime_get(), deadline))
 			return -ETIMEDOUT;
@@ -413,25 +431,62 @@ static ssize_t vmclock_miscdev_read(struct file *fp, char __user *buf,
 	return count;
 }
 
+static __poll_t vmclock_miscdev_poll(struct file *fp, poll_table *wait)
+{
+	struct vmclock_file_state *fst = fp->private_data;
+	struct vmclock_state *st = fst->st;
+	uint32_t seq;
+
+	/*
+	 * Hypervisor will not send us any notifications, so fail immediately
+	 * to avoid having caller sleeping for ever.
+	 */
+	if (!(le64_to_cpu(st->clk->flags) & VMCLOCK_FLAG_NOTIFICATION_PRESENT))
+		return POLLHUP;
+
+	poll_wait(fp, &st->disrupt_wait, wait);
+
+	seq = le32_to_cpu(st->clk->seq_count);
+	if (atomic_read(&fst->seq) != seq)
+		return POLLIN | POLLRDNORM;
+
+	return 0;
+}
+
+static int vmclock_miscdev_open(struct inode *inode, struct file *fp)
+{
+	struct vmclock_state *st = container_of(fp->private_data,
+						struct vmclock_state, miscdev);
+	struct vmclock_file_state *fst = kzalloc(sizeof(*fst), GFP_KERNEL);
+
+	if (!fst)
+		return -ENOMEM;
+
+	fst->st = st;
+	atomic_set(&fst->seq, 0);
+
+	fp->private_data = fst;
+
+	return 0;
+}
+
+static int vmclock_miscdev_release(struct inode *inode, struct file *fp)
+{
+	kfree(fp->private_data);
+	return 0;
+}
+
 static const struct file_operations vmclock_miscdev_fops = {
 	.owner = THIS_MODULE,
+	.open = vmclock_miscdev_open,
+	.release = vmclock_miscdev_release,
 	.mmap = vmclock_miscdev_mmap,
 	.read = vmclock_miscdev_read,
+	.poll = vmclock_miscdev_poll,
 };
 
 /* module operations */
 
-static void vmclock_remove(void *data)
-{
-	struct vmclock_state *st = data;
-
-	if (st->ptp_clock)
-		ptp_clock_unregister(st->ptp_clock);
-
-	if (st->miscdev.minor != MISC_DYNAMIC_MINOR)
-		misc_deregister(&st->miscdev);
-}
-
 static acpi_status vmclock_acpi_resources(struct acpi_resource *ares, void *data)
 {
 	struct vmclock_state *st = data;
@@ -459,6 +514,44 @@ static acpi_status vmclock_acpi_resources(struct acpi_resource *ares, void *data
 	return AE_ERROR;
 }
 
+static void
+vmclock_acpi_notification_handler(acpi_handle __always_unused handle,
+				  u32 __always_unused event, void *dev)
+{
+	struct device *device = dev;
+	struct vmclock_state *st = device->driver_data;
+
+	wake_up_interruptible(&st->disrupt_wait);
+}
+
+static int vmclock_setup_notification(struct device *dev, struct vmclock_state *st)
+{
+	struct acpi_device *adev = ACPI_COMPANION(dev);
+	acpi_status status;
+
+	/*
+	 * This should never happen as this function is only called when
+	 * has_acpi_companion(dev) is true, but the logic is sufficiently
+	 * complex that Coverity can't see the tautology.
+	 */
+	if (!adev)
+		return -ENODEV;
+
+	/* The device does not support notifications. Nothing else to do */
+	if (!(le64_to_cpu(st->clk->flags) & VMCLOCK_FLAG_NOTIFICATION_PRESENT))
+		return 0;
+
+	status = acpi_install_notify_handler(adev->handle, ACPI_DEVICE_NOTIFY,
+					     vmclock_acpi_notification_handler,
+					     dev);
+	if (ACPI_FAILURE(status)) {
+		dev_err(dev, "failed to install notification handler");
+		return -ENODEV;
+	}
+
+	return 0;
+}
+
 static int vmclock_probe_acpi(struct device *dev, struct vmclock_state *st)
 {
 	struct acpi_device *adev = ACPI_COMPANION(dev);
@@ -482,6 +575,30 @@ static int vmclock_probe_acpi(struct device *dev, struct vmclock_state *st)
 	return 0;
 }
 
+static void vmclock_remove(void *data)
+{
+	struct device *dev = data;
+	struct vmclock_state *st = dev->driver_data;
+
+	if (!st) {
+		dev_err(dev, "%s called with NULL driver_data", __func__);
+		return;
+	}
+
+	if (has_acpi_companion(dev))
+		acpi_remove_notify_handler(ACPI_COMPANION(dev)->handle,
+					   ACPI_DEVICE_NOTIFY,
+					   vmclock_acpi_notification_handler);
+
+	if (st->ptp_clock)
+		ptp_clock_unregister(st->ptp_clock);
+
+	if (st->miscdev.minor != MISC_DYNAMIC_MINOR)
+		misc_deregister(&st->miscdev);
+
+	dev->driver_data = NULL;
+}
+
 static void vmclock_put_idx(void *data)
 {
 	struct vmclock_state *st = data;
@@ -545,7 +662,14 @@ static int vmclock_probe(struct platform_device *pdev)
 
 	st->miscdev.minor = MISC_DYNAMIC_MINOR;
 
-	ret = devm_add_action_or_reset(&pdev->dev, vmclock_remove, st);
+	init_waitqueue_head(&st->disrupt_wait);
+	dev->driver_data = st;
+
+	ret = devm_add_action_or_reset(&pdev->dev, vmclock_remove, dev);
+	if (ret)
+		return ret;
+
+	ret = vmclock_setup_notification(dev, st);
 	if (ret)
 		return ret;
 
diff --git a/include/uapi/linux/vmclock-abi.h b/include/uapi/linux/vmclock-abi.h
index 937fe00e4f33..d320623b0118 100644
--- a/include/uapi/linux/vmclock-abi.h
+++ b/include/uapi/linux/vmclock-abi.h
@@ -121,6 +121,11 @@ struct vmclock_abi {
 	 * loaded from some save state (restored from a snapshot).
 	 */
 #define VMCLOCK_FLAG_VM_GEN_COUNTER_PRESENT     (1 << 8)
+	/*
+	 * If the NOTIFICATION_PRESENT flag is set, the hypervisor will send
+	 * a notification every time it updates seq_count to a new even number.
+	 */
+#define VMCLOCK_FLAG_NOTIFICATION_PRESENT       (1 << 9)
 
 	__u8 pad[2];
 	__u8 clock_status;
-- 
cgit v1.2.3


From ef6a31d035a1000071dc4846aebd02ad081db9e4 Mon Sep 17 00:00:00 2001
From: Mark Harmstone <mark@harmstone.com>
Date: Wed, 7 Jan 2026 14:09:01 +0000
Subject: btrfs: add definitions and constants for remap-tree

Add an incompat flag for the new remap-tree feature, and the constants
and definitions needed to support it.

Reviewed-by: Boris Burkov <boris@bur.io>
Signed-off-by: Mark Harmstone <mark@harmstone.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/accessors.h            |  4 ++++
 fs/btrfs/locking.c              |  1 +
 fs/btrfs/sysfs.c                |  3 +++
 fs/btrfs/tree-checker.c         |  6 ++----
 fs/btrfs/tree-checker.h         |  5 +++++
 fs/btrfs/volumes.c              |  1 +
 include/uapi/linux/btrfs.h      |  1 +
 include/uapi/linux/btrfs_tree.h | 17 +++++++++++++++++
 8 files changed, 34 insertions(+), 4 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/fs/btrfs/accessors.h b/fs/btrfs/accessors.h
index 78721412951c..09cdd6bfddf5 100644
--- a/fs/btrfs/accessors.h
+++ b/fs/btrfs/accessors.h
@@ -1010,6 +1010,10 @@ BTRFS_SETGET_STACK_FUNCS(stack_verity_descriptor_encryption,
 BTRFS_SETGET_STACK_FUNCS(stack_verity_descriptor_size,
 			 struct btrfs_verity_descriptor_item, size, 64);
 
+BTRFS_SETGET_FUNCS(remap_address, struct btrfs_remap_item, address, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_remap_address, struct btrfs_remap_item,
+			 address, 64);
+
 /* Cast into the data area of the leaf. */
 #define btrfs_item_ptr(leaf, slot, type)				\
 	((type *)(btrfs_item_nr_offset(leaf, 0) + btrfs_item_offset(leaf, slot)))
diff --git a/fs/btrfs/locking.c b/fs/btrfs/locking.c
index 0035851d72b0..e3df5ca0b552 100644
--- a/fs/btrfs/locking.c
+++ b/fs/btrfs/locking.c
@@ -73,6 +73,7 @@ static struct btrfs_lockdep_keyset {
 	{ .id = BTRFS_FREE_SPACE_TREE_OBJECTID,	DEFINE_NAME("free-space") },
 	{ .id = BTRFS_BLOCK_GROUP_TREE_OBJECTID, DEFINE_NAME("block-group") },
 	{ .id = BTRFS_RAID_STRIPE_TREE_OBJECTID, DEFINE_NAME("raid-stripe") },
+	{ .id = BTRFS_REMAP_TREE_OBJECTID,      DEFINE_NAME("remap") },
 	{ .id = 0,				DEFINE_NAME("tree")	},
 };
 
diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c
index ebd6d1d6778b..8834a1dd499c 100644
--- a/fs/btrfs/sysfs.c
+++ b/fs/btrfs/sysfs.c
@@ -299,6 +299,8 @@ BTRFS_FEAT_ATTR_INCOMPAT(zoned, ZONED);
 BTRFS_FEAT_ATTR_INCOMPAT(extent_tree_v2, EXTENT_TREE_V2);
 /* Remove once support for raid stripe tree is feature complete. */
 BTRFS_FEAT_ATTR_INCOMPAT(raid_stripe_tree, RAID_STRIPE_TREE);
+/* Remove once support for remap tree is feature complete. */
+BTRFS_FEAT_ATTR_INCOMPAT(remap_tree, REMAP_TREE);
 #endif
 #ifdef CONFIG_FS_VERITY
 BTRFS_FEAT_ATTR_COMPAT_RO(verity, VERITY);
@@ -331,6 +333,7 @@ static struct attribute *btrfs_supported_feature_attrs[] = {
 #ifdef CONFIG_BTRFS_EXPERIMENTAL
 	BTRFS_FEAT_ATTR_PTR(extent_tree_v2),
 	BTRFS_FEAT_ATTR_PTR(raid_stripe_tree),
+	BTRFS_FEAT_ATTR_PTR(remap_tree),
 #endif
 #ifdef CONFIG_FS_VERITY
 	BTRFS_FEAT_ATTR_PTR(verity),
diff --git a/fs/btrfs/tree-checker.c b/fs/btrfs/tree-checker.c
index c21c21adf61e..aedc208a95b8 100644
--- a/fs/btrfs/tree-checker.c
+++ b/fs/btrfs/tree-checker.c
@@ -913,12 +913,10 @@ int btrfs_check_chunk_valid(const struct btrfs_fs_info *fs_info,
 			  length, btrfs_stripe_nr_to_offset(U32_MAX));
 		return -EUCLEAN;
 	}
-	if (unlikely(type & ~(BTRFS_BLOCK_GROUP_TYPE_MASK |
-			      BTRFS_BLOCK_GROUP_PROFILE_MASK))) {
+	if (unlikely(type & ~BTRFS_BLOCK_GROUP_VALID)) {
 		chunk_err(fs_info, leaf, chunk, logical,
 			  "unrecognized chunk type: 0x%llx",
-			  ~(BTRFS_BLOCK_GROUP_TYPE_MASK |
-			    BTRFS_BLOCK_GROUP_PROFILE_MASK) & type);
+			  type & ~BTRFS_BLOCK_GROUP_VALID);
 		return -EUCLEAN;
 	}
 
diff --git a/fs/btrfs/tree-checker.h b/fs/btrfs/tree-checker.h
index eb201f4ec3c7..833e2fd989eb 100644
--- a/fs/btrfs/tree-checker.h
+++ b/fs/btrfs/tree-checker.h
@@ -57,6 +57,11 @@ enum btrfs_tree_block_status {
 	BTRFS_TREE_BLOCK_WRITTEN_NOT_SET,
 };
 
+
+#define BTRFS_BLOCK_GROUP_VALID	(BTRFS_BLOCK_GROUP_TYPE_MASK | \
+				 BTRFS_BLOCK_GROUP_PROFILE_MASK | \
+				 BTRFS_BLOCK_GROUP_REMAPPED)
+
 /*
  * Exported simply for btrfs-progs which wants to have the
  * btrfs_tree_block_status return codes.
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index c4be17fcb87a..d2b7352eb7cb 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -231,6 +231,7 @@ void btrfs_describe_block_groups(u64 bg_flags, char *buf, u32 size_buf)
 	DESCRIBE_FLAG(BTRFS_BLOCK_GROUP_DATA, "data");
 	DESCRIBE_FLAG(BTRFS_BLOCK_GROUP_SYSTEM, "system");
 	DESCRIBE_FLAG(BTRFS_BLOCK_GROUP_METADATA, "metadata");
+	DESCRIBE_FLAG(BTRFS_BLOCK_GROUP_REMAPPED, "remapped");
 
 	DESCRIBE_FLAG(BTRFS_AVAIL_ALLOC_BIT_SINGLE, "single");
 	for (i = 0; i < BTRFS_NR_RAID_TYPES; i++)
diff --git a/include/uapi/linux/btrfs.h b/include/uapi/linux/btrfs.h
index e8fd92789423..9165154a274d 100644
--- a/include/uapi/linux/btrfs.h
+++ b/include/uapi/linux/btrfs.h
@@ -336,6 +336,7 @@ struct btrfs_ioctl_fs_info_args {
 #define BTRFS_FEATURE_INCOMPAT_EXTENT_TREE_V2	(1ULL << 13)
 #define BTRFS_FEATURE_INCOMPAT_RAID_STRIPE_TREE	(1ULL << 14)
 #define BTRFS_FEATURE_INCOMPAT_SIMPLE_QUOTA	(1ULL << 16)
+#define BTRFS_FEATURE_INCOMPAT_REMAP_TREE	(1ULL << 17)
 
 struct btrfs_ioctl_feature_flags {
 	__u64 compat_flags;
diff --git a/include/uapi/linux/btrfs_tree.h b/include/uapi/linux/btrfs_tree.h
index fc29d273845d..f011d34cb699 100644
--- a/include/uapi/linux/btrfs_tree.h
+++ b/include/uapi/linux/btrfs_tree.h
@@ -76,6 +76,9 @@
 /* Tracks RAID stripes in block groups. */
 #define BTRFS_RAID_STRIPE_TREE_OBJECTID 12ULL
 
+/* Holds details of remapped addresses after relocation. */
+#define BTRFS_REMAP_TREE_OBJECTID 13ULL
+
 /* device stats in the device tree */
 #define BTRFS_DEV_STATS_OBJECTID 0ULL
 
@@ -282,6 +285,10 @@
 
 #define BTRFS_RAID_STRIPE_KEY	230
 
+#define BTRFS_IDENTITY_REMAP_KEY 	234
+#define BTRFS_REMAP_KEY		 	235
+#define BTRFS_REMAP_BACKREF_KEY	 	236
+
 /*
  * Records the overall state of the qgroups.
  * There's only one instance of this key present,
@@ -1161,6 +1168,7 @@ struct btrfs_dev_replace_item {
 #define BTRFS_BLOCK_GROUP_RAID6         (1ULL << 8)
 #define BTRFS_BLOCK_GROUP_RAID1C3       (1ULL << 9)
 #define BTRFS_BLOCK_GROUP_RAID1C4       (1ULL << 10)
+#define BTRFS_BLOCK_GROUP_REMAPPED      (1ULL << 11)
 #define BTRFS_BLOCK_GROUP_RESERVED	(BTRFS_AVAIL_ALLOC_BIT_SINGLE | \
 					 BTRFS_SPACE_INFO_GLOBAL_RSV)
 
@@ -1323,4 +1331,13 @@ struct btrfs_verity_descriptor_item {
 	__u8 encryption;
 } __attribute__ ((__packed__));
 
+/*
+ * For a range identified by a BTRFS_REMAP_KEY item in the remap tree, gives
+ * the address that the start of the range will get remapped to.  This
+ * structure is also shared by BTRFS_REMAP_BACKREF_KEY.
+ */
+struct btrfs_remap_item {
+	__le64 address;
+} __attribute__ ((__packed__));
+
 #endif /* _BTRFS_CTREE_H_ */
-- 
cgit v1.2.3


From 0b4d29fa98ca1a49c4498353253f857573871ba0 Mon Sep 17 00:00:00 2001
From: Mark Harmstone <mark@harmstone.com>
Date: Wed, 7 Jan 2026 14:09:02 +0000
Subject: btrfs: add METADATA_REMAP chunk type

Add a new METADATA_REMAP chunk type, which is a metadata chunk that holds the
remap tree.

This is needed for bootstrapping purposes: the remap tree can't itself
be remapped, and must be relocated the existing way, by COWing every
leaf. The remap tree can't go in the SYSTEM chunk as space there is
limited, because a copy of the chunk item gets placed in the superblock.

The changes in fs/btrfs/volumes.h are because we're adding a new block
group type bit after the profile bits, and so can no longer rely on the
const_ilog2 trick.

The sizing of 32MB per chunk, matching the SYSTEM chunk, is an estimate;
we can adjust it later if it proves to be too big or too small.
This works out to be ~500,000 remap items, which for a 4KB block size
covers ~2GB of remapped data in the worst case and ~500TB in the best case.

Reviewed-by: Boris Burkov <boris@bur.io>
Signed-off-by: Mark Harmstone <mark@harmstone.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/block-rsv.c            |  8 ++++++++
 fs/btrfs/block-rsv.h            |  1 +
 fs/btrfs/disk-io.c              |  1 +
 fs/btrfs/fs.h                   |  2 ++
 fs/btrfs/space-info.c           | 13 ++++++++++++-
 fs/btrfs/sysfs.c                |  2 ++
 fs/btrfs/tree-checker.c         | 13 +++++++++++--
 fs/btrfs/volumes.c              |  3 +++
 fs/btrfs/volumes.h              | 10 +++++++++-
 include/uapi/linux/btrfs_tree.h |  4 +++-
 10 files changed, 52 insertions(+), 5 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/fs/btrfs/block-rsv.c b/fs/btrfs/block-rsv.c
index 96cf7a162987..e823230c09b7 100644
--- a/fs/btrfs/block-rsv.c
+++ b/fs/btrfs/block-rsv.c
@@ -419,6 +419,9 @@ void btrfs_init_root_block_rsv(struct btrfs_root *root)
 	case BTRFS_TREE_LOG_OBJECTID:
 		root->block_rsv = &fs_info->treelog_rsv;
 		break;
+	case BTRFS_REMAP_TREE_OBJECTID:
+		root->block_rsv = &fs_info->remap_block_rsv;
+		break;
 	default:
 		root->block_rsv = NULL;
 		break;
@@ -432,6 +435,9 @@ void btrfs_init_global_block_rsv(struct btrfs_fs_info *fs_info)
 	space_info = btrfs_find_space_info(fs_info, BTRFS_BLOCK_GROUP_SYSTEM);
 	fs_info->chunk_block_rsv.space_info = space_info;
 
+	space_info = btrfs_find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA_REMAP);
+	fs_info->remap_block_rsv.space_info = space_info;
+
 	space_info = btrfs_find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA);
 	fs_info->global_block_rsv.space_info = space_info;
 	fs_info->trans_block_rsv.space_info = space_info;
@@ -458,6 +464,8 @@ void btrfs_release_global_block_rsv(struct btrfs_fs_info *fs_info)
 	WARN_ON(fs_info->trans_block_rsv.reserved > 0);
 	WARN_ON(fs_info->chunk_block_rsv.size > 0);
 	WARN_ON(fs_info->chunk_block_rsv.reserved > 0);
+	WARN_ON(fs_info->remap_block_rsv.size > 0);
+	WARN_ON(fs_info->remap_block_rsv.reserved > 0);
 	WARN_ON(fs_info->delayed_block_rsv.size > 0);
 	WARN_ON(fs_info->delayed_block_rsv.reserved > 0);
 	WARN_ON(fs_info->delayed_refs_rsv.reserved > 0);
diff --git a/fs/btrfs/block-rsv.h b/fs/btrfs/block-rsv.h
index 79ae9d05cd91..8359fb96bc3c 100644
--- a/fs/btrfs/block-rsv.h
+++ b/fs/btrfs/block-rsv.h
@@ -22,6 +22,7 @@ enum btrfs_rsv_type {
 	BTRFS_BLOCK_RSV_DELALLOC,
 	BTRFS_BLOCK_RSV_TRANS,
 	BTRFS_BLOCK_RSV_CHUNK,
+	BTRFS_BLOCK_RSV_REMAP,
 	BTRFS_BLOCK_RSV_DELOPS,
 	BTRFS_BLOCK_RSV_DELREFS,
 	BTRFS_BLOCK_RSV_TREELOG,
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index faa1c2c20ecd..922e69038d81 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -2751,6 +2751,7 @@ void btrfs_init_fs_info(struct btrfs_fs_info *fs_info)
 			     BTRFS_BLOCK_RSV_GLOBAL);
 	btrfs_init_block_rsv(&fs_info->trans_block_rsv, BTRFS_BLOCK_RSV_TRANS);
 	btrfs_init_block_rsv(&fs_info->chunk_block_rsv, BTRFS_BLOCK_RSV_CHUNK);
+	btrfs_init_block_rsv(&fs_info->remap_block_rsv, BTRFS_BLOCK_RSV_REMAP);
 	btrfs_init_block_rsv(&fs_info->treelog_rsv, BTRFS_BLOCK_RSV_TREELOG);
 	btrfs_init_block_rsv(&fs_info->empty_block_rsv, BTRFS_BLOCK_RSV_EMPTY);
 	btrfs_init_block_rsv(&fs_info->delayed_block_rsv,
diff --git a/fs/btrfs/fs.h b/fs/btrfs/fs.h
index e3e5e52e97a2..195428ecfd75 100644
--- a/fs/btrfs/fs.h
+++ b/fs/btrfs/fs.h
@@ -509,6 +509,8 @@ struct btrfs_fs_info {
 	struct btrfs_block_rsv trans_block_rsv;
 	/* Block reservation for chunk tree */
 	struct btrfs_block_rsv chunk_block_rsv;
+	/* Block reservation for remap tree. */
+	struct btrfs_block_rsv remap_block_rsv;
 	/* Block reservation for delayed operations */
 	struct btrfs_block_rsv delayed_block_rsv;
 	/* Block reservation for delayed refs */
diff --git a/fs/btrfs/space-info.c b/fs/btrfs/space-info.c
index 1d76242f5e0d..2c9cf1ab232b 100644
--- a/fs/btrfs/space-info.c
+++ b/fs/btrfs/space-info.c
@@ -215,7 +215,7 @@ static u64 calc_chunk_size(const struct btrfs_fs_info *fs_info, u64 flags)
 
 	if (flags & BTRFS_BLOCK_GROUP_DATA)
 		return BTRFS_MAX_DATA_CHUNK_SIZE;
-	else if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
+	else if (flags & (BTRFS_BLOCK_GROUP_SYSTEM | BTRFS_BLOCK_GROUP_METADATA_REMAP))
 		return SZ_32M;
 
 	/* Handle BTRFS_BLOCK_GROUP_METADATA */
@@ -348,6 +348,8 @@ int btrfs_init_space_info(struct btrfs_fs_info *fs_info)
 	if (mixed) {
 		flags = BTRFS_BLOCK_GROUP_METADATA | BTRFS_BLOCK_GROUP_DATA;
 		ret = create_space_info(fs_info, flags);
+		if (ret)
+			goto out;
 	} else {
 		flags = BTRFS_BLOCK_GROUP_METADATA;
 		ret = create_space_info(fs_info, flags);
@@ -356,7 +358,15 @@ int btrfs_init_space_info(struct btrfs_fs_info *fs_info)
 
 		flags = BTRFS_BLOCK_GROUP_DATA;
 		ret = create_space_info(fs_info, flags);
+		if (ret)
+			goto out;
+	}
+
+	if (features & BTRFS_FEATURE_INCOMPAT_REMAP_TREE) {
+		flags = BTRFS_BLOCK_GROUP_METADATA_REMAP;
+		ret = create_space_info(fs_info, flags);
 	}
+
 out:
 	return ret;
 }
@@ -611,6 +621,7 @@ static void dump_global_block_rsv(struct btrfs_fs_info *fs_info)
 	DUMP_BLOCK_RSV(fs_info, global_block_rsv);
 	DUMP_BLOCK_RSV(fs_info, trans_block_rsv);
 	DUMP_BLOCK_RSV(fs_info, chunk_block_rsv);
+	DUMP_BLOCK_RSV(fs_info, remap_block_rsv);
 	DUMP_BLOCK_RSV(fs_info, delayed_block_rsv);
 	DUMP_BLOCK_RSV(fs_info, delayed_refs_rsv);
 }
diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c
index 8834a1dd499c..27bfb7b55ec4 100644
--- a/fs/btrfs/sysfs.c
+++ b/fs/btrfs/sysfs.c
@@ -1929,6 +1929,8 @@ static const char *alloc_name(struct btrfs_space_info *space_info)
 	case BTRFS_BLOCK_GROUP_SYSTEM:
 		ASSERT(space_info->subgroup_id == BTRFS_SUB_GROUP_PRIMARY);
 		return "system";
+	case BTRFS_BLOCK_GROUP_METADATA_REMAP:
+		return "metadata-remap";
 	default:
 		WARN_ON(1);
 		return "invalid-combination";
diff --git a/fs/btrfs/tree-checker.c b/fs/btrfs/tree-checker.c
index aedc208a95b8..a6c158cd8fcd 100644
--- a/fs/btrfs/tree-checker.c
+++ b/fs/btrfs/tree-checker.c
@@ -748,17 +748,26 @@ static int check_block_group_item(struct extent_buffer *leaf,
 		return -EUCLEAN;
 	}
 
+	if (unlikely(flags & BTRFS_BLOCK_GROUP_METADATA_REMAP &&
+		     !btrfs_fs_incompat(fs_info, REMAP_TREE))) {
+		block_group_err(leaf, slot,
+"invalid flags, have 0x%llx (METADATA_REMAP flag set) but no remap-tree incompat flag",
+				flags);
+		return -EUCLEAN;
+	}
+
 	type = flags & BTRFS_BLOCK_GROUP_TYPE_MASK;
 	if (unlikely(type != BTRFS_BLOCK_GROUP_DATA &&
 		     type != BTRFS_BLOCK_GROUP_METADATA &&
 		     type != BTRFS_BLOCK_GROUP_SYSTEM &&
+		     type != BTRFS_BLOCK_GROUP_METADATA_REMAP &&
 		     type != (BTRFS_BLOCK_GROUP_METADATA |
 			      BTRFS_BLOCK_GROUP_DATA))) {
 		block_group_err(leaf, slot,
-"invalid type, have 0x%llx (%lu bits set) expect either 0x%llx, 0x%llx, 0x%llx or 0x%llx",
+"invalid type, have 0x%llx (%lu bits set) expect either 0x%llx, 0x%llx, 0x%llx, 0x%llx or 0x%llx",
 			type, hweight64(type),
 			BTRFS_BLOCK_GROUP_DATA, BTRFS_BLOCK_GROUP_METADATA,
-			BTRFS_BLOCK_GROUP_SYSTEM,
+			BTRFS_BLOCK_GROUP_SYSTEM, BTRFS_BLOCK_GROUP_METADATA_REMAP,
 			BTRFS_BLOCK_GROUP_METADATA | BTRFS_BLOCK_GROUP_DATA);
 		return -EUCLEAN;
 	}
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index d2b7352eb7cb..eda6505f3ee5 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -231,6 +231,9 @@ void btrfs_describe_block_groups(u64 bg_flags, char *buf, u32 size_buf)
 	DESCRIBE_FLAG(BTRFS_BLOCK_GROUP_DATA, "data");
 	DESCRIBE_FLAG(BTRFS_BLOCK_GROUP_SYSTEM, "system");
 	DESCRIBE_FLAG(BTRFS_BLOCK_GROUP_METADATA, "metadata");
+	/* Block groups containing the remap tree. */
+	DESCRIBE_FLAG(BTRFS_BLOCK_GROUP_METADATA_REMAP, "metadata-remap");
+	/* Block group that has been remapped. */
 	DESCRIBE_FLAG(BTRFS_BLOCK_GROUP_REMAPPED, "remapped");
 
 	DESCRIBE_FLAG(BTRFS_AVAIL_ALLOC_BIT_SINGLE, "single");
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index 59347a4bb185..e4b3cb50f94a 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -58,7 +58,6 @@ static_assert(ilog2(BTRFS_STRIPE_LEN) == BTRFS_STRIPE_LEN_SHIFT);
  */
 static_assert(const_ffs(BTRFS_BLOCK_GROUP_RAID0) <
 	      const_ffs(BTRFS_BLOCK_GROUP_PROFILE_MASK & ~BTRFS_BLOCK_GROUP_RAID0));
-static_assert(ilog2(BTRFS_BLOCK_GROUP_RAID0) > ilog2(BTRFS_BLOCK_GROUP_TYPE_MASK));
 
 /* ilog2() can handle both constants and variables */
 #define BTRFS_BG_FLAG_TO_INDEX(profile)					\
@@ -80,6 +79,15 @@ enum btrfs_raid_types {
 	BTRFS_NR_RAID_TYPES
 };
 
+static_assert(BTRFS_RAID_RAID0 == 1);
+static_assert(BTRFS_RAID_RAID1 == 2);
+static_assert(BTRFS_RAID_DUP == 3);
+static_assert(BTRFS_RAID_RAID10 == 4);
+static_assert(BTRFS_RAID_RAID5 == 5);
+static_assert(BTRFS_RAID_RAID6 == 6);
+static_assert(BTRFS_RAID_RAID1C3 == 7);
+static_assert(BTRFS_RAID_RAID1C4 == 8);
+
 /*
  * Use sequence counter to get consistent device stat data on
  * 32-bit processors.
diff --git a/include/uapi/linux/btrfs_tree.h b/include/uapi/linux/btrfs_tree.h
index f011d34cb699..76578426671c 100644
--- a/include/uapi/linux/btrfs_tree.h
+++ b/include/uapi/linux/btrfs_tree.h
@@ -1169,12 +1169,14 @@ struct btrfs_dev_replace_item {
 #define BTRFS_BLOCK_GROUP_RAID1C3       (1ULL << 9)
 #define BTRFS_BLOCK_GROUP_RAID1C4       (1ULL << 10)
 #define BTRFS_BLOCK_GROUP_REMAPPED      (1ULL << 11)
+#define BTRFS_BLOCK_GROUP_METADATA_REMAP (1ULL << 12)
 #define BTRFS_BLOCK_GROUP_RESERVED	(BTRFS_AVAIL_ALLOC_BIT_SINGLE | \
 					 BTRFS_SPACE_INFO_GLOBAL_RSV)
 
 #define BTRFS_BLOCK_GROUP_TYPE_MASK	(BTRFS_BLOCK_GROUP_DATA |    \
 					 BTRFS_BLOCK_GROUP_SYSTEM |  \
-					 BTRFS_BLOCK_GROUP_METADATA)
+					 BTRFS_BLOCK_GROUP_METADATA | \
+					 BTRFS_BLOCK_GROUP_METADATA_REMAP)
 
 #define BTRFS_BLOCK_GROUP_PROFILE_MASK	(BTRFS_BLOCK_GROUP_RAID0 |   \
 					 BTRFS_BLOCK_GROUP_RAID1 |   \
-- 
cgit v1.2.3


From 7977011460cffc6f5a0cd830584c832c4aa07076 Mon Sep 17 00:00:00 2001
From: Mark Harmstone <mark@harmstone.com>
Date: Wed, 7 Jan 2026 14:09:07 +0000
Subject: btrfs: add extended version of struct block_group_item

Add a struct btrfs_block_group_item_v2, which is used in the block group
tree if the remap-tree incompat flag is set.

This adds two new fields to the block group item: `remap_bytes` and
`identity_remap_count`.

`remap_bytes` records the amount of data that's physically within this
block group, but nominally in another, remapped block group. This is
necessary because this data will need to be moved first if this block
group is itself relocated. If `remap_bytes` > 0, this is an indicator to
the relocation thread that it will need to search the remap-tree for
backrefs. A block group must also have `remap_bytes` == 0 before it can
be dropped.

`identity_remap_count` records how many identity remap items are located
in the remap tree for this block group. When relocation is begun for
this block group, this is set to the number of holes in the free-space
tree for this range. As identity remaps are converted into actual remaps
by the relocation process, this number is decreased. Once it reaches 0,
either because of relocation or because extents have been deleted, the
block group has been fully remapped and its chunk's device extents are
removed.

Reviewed-by: Boris Burkov <boris@bur.io>
Signed-off-by: Mark Harmstone <mark@harmstone.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/accessors.h            | 20 +++++++++
 fs/btrfs/block-group.c          | 92 ++++++++++++++++++++++++++++++-----------
 fs/btrfs/block-group.h          | 10 ++++-
 fs/btrfs/discard.c              |  2 +-
 fs/btrfs/tree-checker.c         | 10 ++++-
 include/uapi/linux/btrfs_tree.h |  8 ++++
 6 files changed, 114 insertions(+), 28 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/fs/btrfs/accessors.h b/fs/btrfs/accessors.h
index 09cdd6bfddf5..9797f9e8d4e5 100644
--- a/fs/btrfs/accessors.h
+++ b/fs/btrfs/accessors.h
@@ -240,6 +240,26 @@ BTRFS_SETGET_FUNCS(block_group_flags, struct btrfs_block_group_item, flags, 64);
 BTRFS_SETGET_STACK_FUNCS(stack_block_group_flags,
 			struct btrfs_block_group_item, flags, 64);
 
+/* struct btrfs_block_group_item_v2 */
+BTRFS_SETGET_STACK_FUNCS(stack_block_group_v2_used, struct btrfs_block_group_item_v2,
+			 used, 64);
+BTRFS_SETGET_FUNCS(block_group_v2_used, struct btrfs_block_group_item_v2, used, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_block_group_v2_chunk_objectid,
+			 struct btrfs_block_group_item_v2, chunk_objectid, 64);
+BTRFS_SETGET_FUNCS(block_group_v2_chunk_objectid,
+		   struct btrfs_block_group_item_v2, chunk_objectid, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_block_group_v2_flags,
+			 struct btrfs_block_group_item_v2, flags, 64);
+BTRFS_SETGET_FUNCS(block_group_v2_flags, struct btrfs_block_group_item_v2, flags, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_block_group_v2_remap_bytes,
+			 struct btrfs_block_group_item_v2, remap_bytes, 64);
+BTRFS_SETGET_FUNCS(block_group_v2_remap_bytes, struct btrfs_block_group_item_v2,
+		   remap_bytes, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_block_group_v2_identity_remap_count,
+			 struct btrfs_block_group_item_v2, identity_remap_count, 32);
+BTRFS_SETGET_FUNCS(block_group_v2_identity_remap_count, struct btrfs_block_group_item_v2,
+		   identity_remap_count, 32);
+
 /* struct btrfs_free_space_info */
 BTRFS_SETGET_FUNCS(free_space_extent_count, struct btrfs_free_space_info,
 		   extent_count, 32);
diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c
index 5709acc84297..a1ab513fa8ea 100644
--- a/fs/btrfs/block-group.c
+++ b/fs/btrfs/block-group.c
@@ -2371,7 +2371,7 @@ static int check_chunk_block_group_mappings(struct btrfs_fs_info *fs_info)
 }
 
 static int read_one_block_group(struct btrfs_fs_info *info,
-				struct btrfs_block_group_item *bgi,
+				struct btrfs_block_group_item_v2 *bgi,
 				const struct btrfs_key *key,
 				int need_clear)
 {
@@ -2386,11 +2386,15 @@ static int read_one_block_group(struct btrfs_fs_info *info,
 		return -ENOMEM;
 
 	cache->length = key->offset;
-	cache->used = btrfs_stack_block_group_used(bgi);
+	cache->used = btrfs_stack_block_group_v2_used(bgi);
 	cache->last_used = cache->used;
-	cache->flags = btrfs_stack_block_group_flags(bgi);
-	cache->global_root_id = btrfs_stack_block_group_chunk_objectid(bgi);
+	cache->flags = btrfs_stack_block_group_v2_flags(bgi);
+	cache->global_root_id = btrfs_stack_block_group_v2_chunk_objectid(bgi);
 	cache->space_info = btrfs_find_space_info(info, cache->flags);
+	cache->remap_bytes = btrfs_stack_block_group_v2_remap_bytes(bgi);
+	cache->last_remap_bytes = cache->remap_bytes;
+	cache->identity_remap_count = btrfs_stack_block_group_v2_identity_remap_count(bgi);
+	cache->last_identity_remap_count = cache->identity_remap_count;
 
 	btrfs_set_free_space_tree_thresholds(cache);
 
@@ -2455,7 +2459,7 @@ static int read_one_block_group(struct btrfs_fs_info *info,
 	} else if (cache->length == cache->used) {
 		cache->cached = BTRFS_CACHE_FINISHED;
 		btrfs_free_excluded_extents(cache);
-	} else if (cache->used == 0) {
+	} else if (cache->used == 0 && cache->remap_bytes == 0) {
 		cache->cached = BTRFS_CACHE_FINISHED;
 		ret = btrfs_add_new_free_space(cache, cache->start,
 					       cache->start + cache->length, NULL);
@@ -2475,7 +2479,7 @@ static int read_one_block_group(struct btrfs_fs_info *info,
 
 	set_avail_alloc_bits(info, cache->flags);
 	if (btrfs_chunk_writeable(info, cache->start)) {
-		if (cache->used == 0) {
+		if (cache->used == 0 && cache->remap_bytes == 0) {
 			ASSERT(list_empty(&cache->bg_list));
 			if (btrfs_test_opt(info, DISCARD_ASYNC))
 				btrfs_discard_queue_work(&info->discard_ctl, cache);
@@ -2579,9 +2583,10 @@ int btrfs_read_block_groups(struct btrfs_fs_info *info)
 		need_clear = 1;
 
 	while (1) {
-		struct btrfs_block_group_item bgi;
+		struct btrfs_block_group_item_v2 bgi;
 		struct extent_buffer *leaf;
 		int slot;
+		size_t size;
 
 		ret = find_first_block_group(info, path, &key);
 		if (ret > 0)
@@ -2592,8 +2597,16 @@ int btrfs_read_block_groups(struct btrfs_fs_info *info)
 		leaf = path->nodes[0];
 		slot = path->slots[0];
 
+		if (btrfs_fs_incompat(info, REMAP_TREE)) {
+			size = sizeof(struct btrfs_block_group_item_v2);
+		} else {
+			size = sizeof(struct btrfs_block_group_item);
+			btrfs_set_stack_block_group_v2_remap_bytes(&bgi, 0);
+			btrfs_set_stack_block_group_v2_identity_remap_count(&bgi, 0);
+		}
+
 		read_extent_buffer(leaf, &bgi, btrfs_item_ptr_offset(leaf, slot),
-				   sizeof(bgi));
+				   size);
 
 		btrfs_item_key_to_cpu(leaf, &key, slot);
 		btrfs_release_path(path);
@@ -2663,25 +2676,34 @@ static int insert_block_group_item(struct btrfs_trans_handle *trans,
 				   struct btrfs_block_group *block_group)
 {
 	struct btrfs_fs_info *fs_info = trans->fs_info;
-	struct btrfs_block_group_item bgi;
+	struct btrfs_block_group_item_v2 bgi;
 	struct btrfs_root *root = btrfs_block_group_root(fs_info);
 	struct btrfs_key key;
 	u64 old_last_used;
+	size_t size;
 	int ret;
 
 	spin_lock(&block_group->lock);
-	btrfs_set_stack_block_group_used(&bgi, block_group->used);
-	btrfs_set_stack_block_group_chunk_objectid(&bgi,
-						   block_group->global_root_id);
-	btrfs_set_stack_block_group_flags(&bgi, block_group->flags);
+	btrfs_set_stack_block_group_v2_used(&bgi, block_group->used);
+	btrfs_set_stack_block_group_v2_chunk_objectid(&bgi, block_group->global_root_id);
+	btrfs_set_stack_block_group_v2_flags(&bgi, block_group->flags);
+	btrfs_set_stack_block_group_v2_remap_bytes(&bgi, block_group->remap_bytes);
+	btrfs_set_stack_block_group_v2_identity_remap_count(&bgi, block_group->identity_remap_count);
 	old_last_used = block_group->last_used;
 	block_group->last_used = block_group->used;
+	block_group->last_remap_bytes = block_group->remap_bytes;
+	block_group->last_identity_remap_count = block_group->identity_remap_count;
 	key.objectid = block_group->start;
 	key.type = BTRFS_BLOCK_GROUP_ITEM_KEY;
 	key.offset = block_group->length;
 	spin_unlock(&block_group->lock);
 
-	ret = btrfs_insert_item(trans, root, &key, &bgi, sizeof(bgi));
+	if (btrfs_fs_incompat(fs_info, REMAP_TREE))
+		size = sizeof(struct btrfs_block_group_item_v2);
+	else
+		size = sizeof(struct btrfs_block_group_item);
+
+	ret = btrfs_insert_item(trans, root, &key, &bgi, size);
 	if (ret < 0) {
 		spin_lock(&block_group->lock);
 		block_group->last_used = old_last_used;
@@ -3132,10 +3154,12 @@ static int update_block_group_item(struct btrfs_trans_handle *trans,
 	struct btrfs_root *root = btrfs_block_group_root(fs_info);
 	unsigned long bi;
 	struct extent_buffer *leaf;
-	struct btrfs_block_group_item bgi;
+	struct btrfs_block_group_item_v2 bgi;
 	struct btrfs_key key;
-	u64 old_last_used;
-	u64 used;
+	u64 old_last_used, old_last_remap_bytes;
+	u32 old_last_identity_remap_count;
+	u64 used, remap_bytes;
+	u32 identity_remap_count;
 
 	/*
 	 * Block group items update can be triggered out of commit transaction
@@ -3145,13 +3169,21 @@ static int update_block_group_item(struct btrfs_trans_handle *trans,
 	 */
 	spin_lock(&cache->lock);
 	old_last_used = cache->last_used;
+	old_last_remap_bytes = cache->last_remap_bytes;
+	old_last_identity_remap_count = cache->last_identity_remap_count;
 	used = cache->used;
-	/* No change in used bytes, can safely skip it. */
-	if (cache->last_used == used) {
+	remap_bytes = cache->remap_bytes;
+	identity_remap_count = cache->identity_remap_count;
+	/* No change in values, can safely skip it. */
+	if (cache->last_used == used &&
+	    cache->last_remap_bytes == remap_bytes &&
+	    cache->last_identity_remap_count == identity_remap_count) {
 		spin_unlock(&cache->lock);
 		return 0;
 	}
 	cache->last_used = used;
+	cache->last_remap_bytes = remap_bytes;
+	cache->last_identity_remap_count = identity_remap_count;
 	spin_unlock(&cache->lock);
 
 	key.objectid = cache->start;
@@ -3167,11 +3199,21 @@ static int update_block_group_item(struct btrfs_trans_handle *trans,
 
 	leaf = path->nodes[0];
 	bi = btrfs_item_ptr_offset(leaf, path->slots[0]);
-	btrfs_set_stack_block_group_used(&bgi, used);
-	btrfs_set_stack_block_group_chunk_objectid(&bgi,
-						   cache->global_root_id);
-	btrfs_set_stack_block_group_flags(&bgi, cache->flags);
-	write_extent_buffer(leaf, &bgi, bi, sizeof(bgi));
+	btrfs_set_stack_block_group_v2_used(&bgi, used);
+	btrfs_set_stack_block_group_v2_chunk_objectid(&bgi, cache->global_root_id);
+	btrfs_set_stack_block_group_v2_flags(&bgi, cache->flags);
+
+	if (btrfs_fs_incompat(fs_info, REMAP_TREE)) {
+		btrfs_set_stack_block_group_v2_remap_bytes(&bgi, cache->remap_bytes);
+		btrfs_set_stack_block_group_v2_identity_remap_count(&bgi,
+						cache->identity_remap_count);
+		write_extent_buffer(leaf, &bgi, bi,
+				    sizeof(struct btrfs_block_group_item_v2));
+	} else {
+		write_extent_buffer(leaf, &bgi, bi,
+				    sizeof(struct btrfs_block_group_item));
+	}
+
 fail:
 	btrfs_release_path(path);
 	/*
@@ -3186,6 +3228,8 @@ fail:
 	if (ret < 0 && ret != -ENOENT) {
 		spin_lock(&cache->lock);
 		cache->last_used = old_last_used;
+		cache->last_remap_bytes = old_last_remap_bytes;
+		cache->last_identity_remap_count = old_last_identity_remap_count;
 		spin_unlock(&cache->lock);
 	}
 	return ret;
diff --git a/fs/btrfs/block-group.h b/fs/btrfs/block-group.h
index b0fb85a36d97..ecabb1a9fc0e 100644
--- a/fs/btrfs/block-group.h
+++ b/fs/btrfs/block-group.h
@@ -129,6 +129,8 @@ struct btrfs_block_group {
 	u64 flags;
 	u64 cache_generation;
 	u64 global_root_id;
+	u64 remap_bytes;
+	u32 identity_remap_count;
 
 	/*
 	 * The last committed used bytes of this block group, if the above @used
@@ -136,6 +138,11 @@ struct btrfs_block_group {
 	 * group item of this block group.
 	 */
 	u64 last_used;
+	/* The last committed remap_bytes value of this block group. */
+	u64 last_remap_bytes;
+	/* The last committed identity_remap_count value of this block group. */
+	u32 last_identity_remap_count;
+
 	/*
 	 * If the free space extent count exceeds this number, convert the block
 	 * group to bitmaps.
@@ -282,7 +289,8 @@ static inline bool btrfs_is_block_group_used(const struct btrfs_block_group *bg)
 {
 	lockdep_assert_held(&bg->lock);
 
-	return (bg->used > 0 || bg->reserved > 0 || bg->pinned > 0);
+	return (bg->used > 0 || bg->reserved > 0 || bg->pinned > 0 ||
+		bg->remap_bytes > 0);
 }
 
 static inline bool btrfs_is_block_group_data_only(const struct btrfs_block_group *block_group)
diff --git a/fs/btrfs/discard.c b/fs/btrfs/discard.c
index 89fe85778115..ee5f5b2788e1 100644
--- a/fs/btrfs/discard.c
+++ b/fs/btrfs/discard.c
@@ -373,7 +373,7 @@ void btrfs_discard_queue_work(struct btrfs_discard_ctl *discard_ctl,
 	if (!block_group || !btrfs_test_opt(block_group->fs_info, DISCARD_ASYNC))
 		return;
 
-	if (block_group->used == 0)
+	if (block_group->used == 0 && block_group->remap_bytes == 0)
 		add_to_discard_unused_list(discard_ctl, block_group);
 	else
 		add_to_discard_list(discard_ctl, block_group);
diff --git a/fs/btrfs/tree-checker.c b/fs/btrfs/tree-checker.c
index ead2e1e2a0bb..452394b34d01 100644
--- a/fs/btrfs/tree-checker.c
+++ b/fs/btrfs/tree-checker.c
@@ -688,6 +688,7 @@ static int check_block_group_item(struct extent_buffer *leaf,
 	u64 chunk_objectid;
 	u64 flags;
 	u64 type;
+	size_t exp_size;
 
 	/*
 	 * Here we don't really care about alignment since extent allocator can
@@ -699,10 +700,15 @@ static int check_block_group_item(struct extent_buffer *leaf,
 		return -EUCLEAN;
 	}
 
-	if (unlikely(item_size != sizeof(bgi))) {
+	if (btrfs_fs_incompat(fs_info, REMAP_TREE))
+		exp_size = sizeof(struct btrfs_block_group_item_v2);
+	else
+		exp_size = sizeof(struct btrfs_block_group_item);
+
+	if (unlikely(item_size != exp_size)) {
 		block_group_err(leaf, slot,
 			"invalid item size, have %u expect %zu",
-				item_size, sizeof(bgi));
+				item_size, exp_size);
 		return -EUCLEAN;
 	}
 
diff --git a/include/uapi/linux/btrfs_tree.h b/include/uapi/linux/btrfs_tree.h
index 76578426671c..86820a9644e8 100644
--- a/include/uapi/linux/btrfs_tree.h
+++ b/include/uapi/linux/btrfs_tree.h
@@ -1229,6 +1229,14 @@ struct btrfs_block_group_item {
 	__le64 flags;
 } __attribute__ ((__packed__));
 
+struct btrfs_block_group_item_v2 {
+	__le64 used;
+	__le64 chunk_objectid;
+	__le64 flags;
+	__le64 remap_bytes;
+	__le32 identity_remap_count;
+} __attribute__ ((__packed__));
+
 struct btrfs_free_space_info {
 	__le32 extent_count;
 	__le32 flags;
-- 
cgit v1.2.3


From 8620da16fb6be1fd9906374fa1c763a10c6918df Mon Sep 17 00:00:00 2001
From: Mark Harmstone <mark@harmstone.com>
Date: Wed, 7 Jan 2026 14:09:08 +0000
Subject: btrfs: allow mounting filesystems with remap-tree incompat flag

If we encounter a filesystem with the remap-tree incompat flag set,
validate its compatibility with the other flags, and load the remap tree
using the values that have been added to the superblock.

The remap-tree feature depends on the free-space-tree, but no-holes and
block-group-tree have been made dependencies to reduce the testing
matrix. Similarly I'm not aware of any reason why mixed-bg and zoned would be
incompatible with remap-tree, but this is blocked for the time being
until it can be fully tested.

Reviewed-by: Boris Burkov <boris@bur.io>
Signed-off-by: Mark Harmstone <mark@harmstone.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/Kconfig                |   2 +
 fs/btrfs/accessors.h            |   6 +++
 fs/btrfs/disk-io.c              | 105 +++++++++++++++++++++++++++++++++++-----
 fs/btrfs/extent-tree.c          |   2 +
 fs/btrfs/fs.h                   |   4 +-
 fs/btrfs/transaction.c          |   7 +++
 include/uapi/linux/btrfs_tree.h |   5 +-
 7 files changed, 116 insertions(+), 15 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/fs/btrfs/Kconfig b/fs/btrfs/Kconfig
index 423122786a93..ede184b6eda1 100644
--- a/fs/btrfs/Kconfig
+++ b/fs/btrfs/Kconfig
@@ -116,4 +116,6 @@ config BTRFS_EXPERIMENTAL
 
 	  - asynchronous checksum generation for data writes
 
+	  - remap-tree - logical address remapping tree
+
 	  If unsure, say N.
diff --git a/fs/btrfs/accessors.h b/fs/btrfs/accessors.h
index 9797f9e8d4e5..8938357fcb40 100644
--- a/fs/btrfs/accessors.h
+++ b/fs/btrfs/accessors.h
@@ -883,6 +883,12 @@ BTRFS_SETGET_STACK_FUNCS(super_uuid_tree_generation, struct btrfs_super_block,
 			 uuid_tree_generation, 64);
 BTRFS_SETGET_STACK_FUNCS(super_nr_global_roots, struct btrfs_super_block,
 			 nr_global_roots, 64);
+BTRFS_SETGET_STACK_FUNCS(super_remap_root, struct btrfs_super_block,
+			 remap_root, 64);
+BTRFS_SETGET_STACK_FUNCS(super_remap_root_generation, struct btrfs_super_block,
+			 remap_root_generation, 64);
+BTRFS_SETGET_STACK_FUNCS(super_remap_root_level, struct btrfs_super_block,
+			 remap_root_level, 8);
 
 /* struct btrfs_file_extent_item */
 BTRFS_SETGET_STACK_FUNCS(stack_file_extent_type, struct btrfs_file_extent_item,
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index cd46b9d85880..c69734c74c26 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -1136,6 +1136,8 @@ static struct btrfs_root *btrfs_get_global_root(struct btrfs_fs_info *fs_info,
 		return btrfs_grab_root(btrfs_global_root(fs_info, &key));
 	case BTRFS_RAID_STRIPE_TREE_OBJECTID:
 		return btrfs_grab_root(fs_info->stripe_root);
+	case BTRFS_REMAP_TREE_OBJECTID:
+		return btrfs_grab_root(fs_info->remap_root);
 	default:
 		return NULL;
 	}
@@ -1226,6 +1228,7 @@ void btrfs_free_fs_info(struct btrfs_fs_info *fs_info)
 	btrfs_put_root(fs_info->data_reloc_root);
 	btrfs_put_root(fs_info->block_group_root);
 	btrfs_put_root(fs_info->stripe_root);
+	btrfs_put_root(fs_info->remap_root);
 	btrfs_check_leaked_roots(fs_info);
 	btrfs_extent_buffer_leak_debug_check(fs_info);
 	kfree(fs_info->super_copy);
@@ -1778,6 +1781,7 @@ static void free_root_pointers(struct btrfs_fs_info *info, bool free_chunk_root)
 	free_root_extent_buffers(info->data_reloc_root);
 	free_root_extent_buffers(info->block_group_root);
 	free_root_extent_buffers(info->stripe_root);
+	free_root_extent_buffers(info->remap_root);
 	if (free_chunk_root)
 		free_root_extent_buffers(info->chunk_root);
 }
@@ -2191,21 +2195,44 @@ static int btrfs_read_roots(struct btrfs_fs_info *fs_info)
 	if (ret)
 		goto out;
 
-	/*
-	 * This tree can share blocks with some other fs tree during relocation
-	 * and we need a proper setup by btrfs_get_fs_root
-	 */
-	root = btrfs_get_fs_root(tree_root->fs_info,
-				 BTRFS_DATA_RELOC_TREE_OBJECTID, true);
-	if (IS_ERR(root)) {
-		if (!btrfs_test_opt(fs_info, IGNOREBADROOTS)) {
-			location.objectid = BTRFS_DATA_RELOC_TREE_OBJECTID;
-			ret = PTR_ERR(root);
-			goto out;
+	if (btrfs_fs_incompat(fs_info, REMAP_TREE)) {
+		/* The remap_root has already been loaded in load_important_roots(). */
+		root = fs_info->remap_root;
+
+		set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state);
+
+		root->root_key.objectid = BTRFS_REMAP_TREE_OBJECTID;
+		root->root_key.type = BTRFS_ROOT_ITEM_KEY;
+		root->root_key.offset = 0;
+
+		/* Check that data reloc tree doesn't also exist. */
+		location.objectid = BTRFS_DATA_RELOC_TREE_OBJECTID;
+		root = btrfs_read_tree_root(fs_info->tree_root, &location);
+		if (!IS_ERR(root)) {
+			btrfs_err(fs_info, "data reloc tree exists when remap-tree enabled");
+			btrfs_put_root(root);
+			return -EIO;
+		} else if (PTR_ERR(root) != -ENOENT) {
+			btrfs_warn(fs_info, "error %ld when checking for data reloc tree",
+				   PTR_ERR(root));
 		}
 	} else {
-		set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state);
-		fs_info->data_reloc_root = root;
+		/*
+		 * This tree can share blocks with some other fs tree during
+		 * relocation and we need a proper setup by btrfs_get_fs_root().
+		 */
+		root = btrfs_get_fs_root(tree_root->fs_info,
+					 BTRFS_DATA_RELOC_TREE_OBJECTID, true);
+		if (IS_ERR(root)) {
+			if (!btrfs_test_opt(fs_info, IGNOREBADROOTS)) {
+				location.objectid = BTRFS_DATA_RELOC_TREE_OBJECTID;
+				ret = PTR_ERR(root);
+				goto out;
+			}
+		} else {
+			set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state);
+			fs_info->data_reloc_root = root;
+		}
 	}
 
 	location.objectid = BTRFS_QUOTA_TREE_OBJECTID;
@@ -2445,6 +2472,35 @@ int btrfs_validate_super(const struct btrfs_fs_info *fs_info,
 		ret = -EINVAL;
 	}
 
+	if (btrfs_fs_incompat(fs_info, REMAP_TREE)) {
+		/*
+		 * Reduce test matrix for remap tree by requiring block-group-tree
+		 * and no-holes. Free-space-tree is a hard requirement.
+		 */
+		if (!btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE_VALID) ||
+		    !btrfs_fs_incompat(fs_info, NO_HOLES) ||
+		    !btrfs_fs_compat_ro(fs_info, BLOCK_GROUP_TREE)) {
+			btrfs_err(fs_info,
+"remap-tree feature requires free-space-tree, no-holes, and block-group-tree");
+			ret = -EINVAL;
+		}
+
+		if (btrfs_fs_incompat(fs_info, MIXED_GROUPS)) {
+			btrfs_err(fs_info, "remap-tree not supported with mixed-bg");
+			ret = -EINVAL;
+		}
+
+		if (btrfs_fs_incompat(fs_info, ZONED)) {
+			btrfs_err(fs_info, "remap-tree not supported with zoned devices");
+			ret = -EINVAL;
+		}
+
+		if (sectorsize > PAGE_SIZE) {
+			btrfs_err(fs_info, "remap-tree not supported when block size > page size");
+			ret = -EINVAL;
+		}
+	}
+
 	/*
 	 * Hint to catch really bogus numbers, bitflips or so, more exact checks are
 	 * done later
@@ -2603,6 +2659,18 @@ static int load_important_roots(struct btrfs_fs_info *fs_info)
 		btrfs_warn(fs_info, "couldn't read tree root");
 		return ret;
 	}
+
+	if (btrfs_fs_incompat(fs_info, REMAP_TREE)) {
+		bytenr = btrfs_super_remap_root(sb);
+		gen = btrfs_super_remap_root_generation(sb);
+		level = btrfs_super_remap_root_level(sb);
+		ret = load_super_root(fs_info->remap_root, bytenr, gen, level);
+		if (ret) {
+			btrfs_warn(fs_info, "couldn't read remap root");
+			return ret;
+		}
+	}
+
 	return 0;
 }
 
@@ -3231,6 +3299,7 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device
 	struct btrfs_fs_info *fs_info = btrfs_sb(sb);
 	struct btrfs_root *tree_root;
 	struct btrfs_root *chunk_root;
+	struct btrfs_root *remap_root;
 	int ret;
 	int level;
 
@@ -3365,6 +3434,16 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device
 	if (ret < 0)
 		goto fail_alloc;
 
+	if (btrfs_super_incompat_flags(disk_super) & BTRFS_FEATURE_INCOMPAT_REMAP_TREE) {
+		remap_root = btrfs_alloc_root(fs_info, BTRFS_REMAP_TREE_OBJECTID,
+					      GFP_KERNEL);
+		fs_info->remap_root = remap_root;
+		if (!remap_root) {
+			ret = -ENOMEM;
+			goto fail_alloc;
+		}
+	}
+
 	/*
 	 * At this point our mount options are validated, if we set ->max_inline
 	 * to something non-standard make sure we truncate it to sectorsize.
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 48a453fa3063..ce4bda1f37ad 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -2593,6 +2593,8 @@ static u64 get_alloc_profile_by_root(struct btrfs_root *root, int data)
 		flags = BTRFS_BLOCK_GROUP_DATA;
 	else if (root == fs_info->chunk_root)
 		flags = BTRFS_BLOCK_GROUP_SYSTEM;
+	else if (root == fs_info->remap_root)
+		flags = BTRFS_BLOCK_GROUP_METADATA_REMAP;
 	else
 		flags = BTRFS_BLOCK_GROUP_METADATA;
 
diff --git a/fs/btrfs/fs.h b/fs/btrfs/fs.h
index 195428ecfd75..13b0aa0b9da9 100644
--- a/fs/btrfs/fs.h
+++ b/fs/btrfs/fs.h
@@ -315,7 +315,8 @@ enum {
 #define BTRFS_FEATURE_INCOMPAT_SUPP		\
 	(BTRFS_FEATURE_INCOMPAT_SUPP_STABLE |	\
 	 BTRFS_FEATURE_INCOMPAT_RAID_STRIPE_TREE | \
-	 BTRFS_FEATURE_INCOMPAT_EXTENT_TREE_V2)
+	 BTRFS_FEATURE_INCOMPAT_EXTENT_TREE_V2 | \
+	 BTRFS_FEATURE_INCOMPAT_REMAP_TREE)
 
 #else
 
@@ -475,6 +476,7 @@ struct btrfs_fs_info {
 	struct btrfs_root *data_reloc_root;
 	struct btrfs_root *block_group_root;
 	struct btrfs_root *stripe_root;
+	struct btrfs_root *remap_root;
 
 	/* The log root tree is a directory of all the other log roots */
 	struct btrfs_root *log_root_tree;
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index e2f993b1783f..f4cc9e1a1b93 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -1967,6 +1967,13 @@ static void update_super_roots(struct btrfs_fs_info *fs_info)
 		super->cache_generation = 0;
 	if (test_bit(BTRFS_FS_UPDATE_UUID_TREE_GEN, &fs_info->flags))
 		super->uuid_tree_generation = root_item->generation;
+
+	if (btrfs_fs_incompat(fs_info, REMAP_TREE)) {
+		root_item = &fs_info->remap_root->root_item;
+		super->remap_root = root_item->bytenr;
+		super->remap_root_generation = root_item->generation;
+		super->remap_root_level = root_item->level;
+	}
 }
 
 int btrfs_transaction_blocked(struct btrfs_fs_info *info)
diff --git a/include/uapi/linux/btrfs_tree.h b/include/uapi/linux/btrfs_tree.h
index 86820a9644e8..f7843e6bb978 100644
--- a/include/uapi/linux/btrfs_tree.h
+++ b/include/uapi/linux/btrfs_tree.h
@@ -721,9 +721,12 @@ struct btrfs_super_block {
 	__u8 metadata_uuid[BTRFS_FSID_SIZE];
 
 	__u64 nr_global_roots;
+	__le64 remap_root;
+	__le64 remap_root_generation;
+	__u8 remap_root_level;
 
 	/* Future expansion */
-	__le64 reserved[27];
+	__u8 reserved[199];
 	__u8 sys_chunk_array[BTRFS_SYSTEM_CHUNK_ARRAY_SIZE];
 	struct btrfs_root_backup super_roots[BTRFS_NUM_BACKUP_ROOTS];
 
-- 
cgit v1.2.3


From 4fa4ac5e584841c0f9b01c2f7dd0c2e3caa8bca0 Mon Sep 17 00:00:00 2001
From: Chia-Yu Chang <chia-yu.chang@nokia-bell-labs.com>
Date: Sat, 31 Jan 2026 23:25:13 +0100
Subject: tcp: accecn: add tcpi_ecn_mode and tcpi_option2 in tcp_info

Add 2-bit tcpi_ecn_mode field within tcp_info to indicate which ECN
mode is negotiated: ECN_MODE_DISABLED, ECN_MODE_RFC3168, ECN_MODE_ACCECN,
or ECN_MODE_PENDING. This is done by utilizing available bits from
tcpi_accecn_opt_seen (reduced from 16 bits to 2 bits) and
tcpi_accecn_fail_mode (reduced from 16 bits to 4 bits).

Also, an extra 24-bit tcpi_options2 field is identified to represent
newer options and connection features, as all 8 bits of tcpi_options
field have been used.

Signed-off-by: Chia-Yu Chang <chia-yu.chang@nokia-bell-labs.com>
Co-developed-by: Neal Cardwell <ncardwell@google.com>
Signed-off-by: Neal Cardwell <ncardwell@google.com>
Reviewed-by: Eric Dumazet <edumazet@google.com>
Link: https://patch.msgid.link/20260131222515.8485-14-chia-yu.chang@nokia-bell-labs.com
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
---
 include/net/tcp_ecn.h    | 11 -----------
 include/uapi/linux/tcp.h | 26 +++++++++++++++++++++++---
 net/ipv4/tcp.c           |  8 ++++++++
 3 files changed, 31 insertions(+), 14 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/include/net/tcp_ecn.h b/include/net/tcp_ecn.h
index e01653bbf181..e9a933641636 100644
--- a/include/net/tcp_ecn.h
+++ b/include/net/tcp_ecn.h
@@ -67,12 +67,6 @@ static inline void tcp_ecn_withdraw_cwr(struct tcp_sock *tp)
 	tp->ecn_flags &= ~TCP_ECN_QUEUE_CWR;
 }
 
-/* tp->accecn_fail_mode */
-#define TCP_ACCECN_ACE_FAIL_SEND	BIT(0)
-#define TCP_ACCECN_ACE_FAIL_RECV	BIT(1)
-#define TCP_ACCECN_OPT_FAIL_SEND	BIT(2)
-#define TCP_ACCECN_OPT_FAIL_RECV	BIT(3)
-
 static inline bool tcp_accecn_ace_fail_send(const struct tcp_sock *tp)
 {
 	return tp->accecn_fail_mode & TCP_ACCECN_ACE_FAIL_SEND;
@@ -98,11 +92,6 @@ static inline void tcp_accecn_fail_mode_set(struct tcp_sock *tp, u8 mode)
 	tp->accecn_fail_mode |= mode;
 }
 
-#define TCP_ACCECN_OPT_NOT_SEEN		0x0
-#define TCP_ACCECN_OPT_EMPTY_SEEN	0x1
-#define TCP_ACCECN_OPT_COUNTER_SEEN	0x2
-#define TCP_ACCECN_OPT_FAIL_SEEN	0x3
-
 static inline u8 tcp_accecn_ace(const struct tcphdr *th)
 {
 	return (th->ae << 2) | (th->cwr << 1) | th->ece;
diff --git a/include/uapi/linux/tcp.h b/include/uapi/linux/tcp.h
index dce3113787a7..03772dd4d399 100644
--- a/include/uapi/linux/tcp.h
+++ b/include/uapi/linux/tcp.h
@@ -226,6 +226,24 @@ enum tcp_ca_state {
 #define TCPF_CA_Loss	(1<<TCP_CA_Loss)
 };
 
+/* Values for tcpi_ecn_mode after negotiation */
+#define TCPI_ECN_MODE_DISABLED		0x0
+#define TCPI_ECN_MODE_RFC3168		0x1
+#define TCPI_ECN_MODE_ACCECN		0x2
+#define TCPI_ECN_MODE_PENDING		0x3
+
+/* Values for accecn_opt_seen */
+#define TCP_ACCECN_OPT_NOT_SEEN		0x0
+#define TCP_ACCECN_OPT_EMPTY_SEEN	0x1
+#define TCP_ACCECN_OPT_COUNTER_SEEN	0x2
+#define TCP_ACCECN_OPT_FAIL_SEEN	0x3
+
+/* Values for accecn_fail_mode */
+#define TCP_ACCECN_ACE_FAIL_SEND	BIT(0)
+#define TCP_ACCECN_ACE_FAIL_RECV	BIT(1)
+#define TCP_ACCECN_OPT_FAIL_SEND	BIT(2)
+#define TCP_ACCECN_OPT_FAIL_RECV	BIT(3)
+
 struct tcp_info {
 	__u8	tcpi_state;
 	__u8	tcpi_ca_state;
@@ -316,15 +334,17 @@ struct tcp_info {
 					 * in milliseconds, including any
 					 * unfinished recovery.
 					 */
-	__u32	tcpi_received_ce;    /* # of CE marks received */
+	__u32	tcpi_received_ce;    /* # of CE marked segments received */
 	__u32	tcpi_delivered_e1_bytes;  /* Accurate ECN byte counters */
 	__u32	tcpi_delivered_e0_bytes;
 	__u32	tcpi_delivered_ce_bytes;
 	__u32	tcpi_received_e1_bytes;
 	__u32	tcpi_received_e0_bytes;
 	__u32	tcpi_received_ce_bytes;
-	__u16	tcpi_accecn_fail_mode;
-	__u16	tcpi_accecn_opt_seen;
+	__u32	tcpi_ecn_mode:2,
+		tcpi_accecn_opt_seen:2,
+		tcpi_accecn_fail_mode:4,
+		tcpi_options2:24;
 };
 
 /* netlink attributes types for SCM_TIMESTAMPING_OPT_STATS */
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 49f1029b5f8b..6ce03a9adb4a 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -4373,6 +4373,14 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info)
 	if (tp->rto_stamp)
 		info->tcpi_total_rto_time += tcp_clock_ms() - tp->rto_stamp;
 
+	if (tcp_ecn_disabled(tp))
+		info->tcpi_ecn_mode = TCPI_ECN_MODE_DISABLED;
+	else if (tcp_ecn_mode_rfc3168(tp))
+		info->tcpi_ecn_mode = TCPI_ECN_MODE_RFC3168;
+	else if (tcp_ecn_mode_accecn(tp))
+		info->tcpi_ecn_mode = TCPI_ECN_MODE_ACCECN;
+	else if (tcp_ecn_mode_pending(tp))
+		info->tcpi_ecn_mode = TCPI_ECN_MODE_PENDING;
 	info->tcpi_accecn_fail_mode = tp->accecn_fail_mode;
 	info->tcpi_accecn_opt_seen = tp->saw_accecn_opt;
 	info->tcpi_received_ce = tp->received_ce;
-- 
cgit v1.2.3


From 0ee4ddc1647b8b3b9e7a94d798a1774a764428c1 Mon Sep 17 00:00:00 2001
From: Claudio Imbrenda <imbrenda@linux.ibm.com>
Date: Wed, 4 Feb 2026 16:02:57 +0100
Subject: KVM: s390: Storage key manipulation IOCTL

Add a new IOCTL to allow userspace to manipulate storage keys directly.

This will make it easier to write selftests related to storage keys.

Acked-by: Heiko Carstens <hca@linux.ibm.com>
Signed-off-by: Claudio Imbrenda <imbrenda@linux.ibm.com>
---
 Documentation/virt/kvm/api.rst | 42 ++++++++++++++++++++++++++++++
 arch/s390/kvm/kvm-s390.c       | 58 ++++++++++++++++++++++++++++++++++++++++++
 include/uapi/linux/kvm.h       | 11 ++++++++
 3 files changed, 111 insertions(+)

(limited to 'include/uapi/linux')

diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst
index 01a3abef8abb..72e04dedb068 100644
--- a/Documentation/virt/kvm/api.rst
+++ b/Documentation/virt/kvm/api.rst
@@ -6517,6 +6517,40 @@ the capability to be present.
 
 `flags` must currently be zero.
 
+4.144 KVM_S390_KEYOP
+--------------------
+
+:Capability: KVM_CAP_S390_KEYOP
+:Architectures: s390
+:Type: vm ioctl
+:Parameters: struct kvm_s390_keyop (in/out)
+:Returns: 0 in case of success, < 0 on error
+
+The specified key operation is performed on the given guest address. The
+previous storage key (or the relevant part thereof) will be returned in
+`key`.
+
+::
+
+  struct kvm_s390_keyop {
+	__u64 guest_addr;
+	__u8  key;
+	__u8  operation;
+  };
+
+Currently supported values for ``operation``:
+
+KVM_S390_KEYOP_ISKE
+  Returns the storage key for the guest address ``guest_addr`` in ``key``.
+
+KVM_S390_KEYOP_RRBE
+  Resets the reference bit for the guest address ``guest_addr``, returning the
+  R and C bits of the old storage key in ``key``; the remaining fields of
+  the storage key will be set to 0.
+
+KVM_S390_KEYOP_SSKE
+  Sets the storage key for the guest address ``guest_addr`` to the key
+  specified in ``key``, returning the previous value in ``key``.
 
 .. _kvm_run:
 
@@ -9287,6 +9321,14 @@ The presence of this capability indicates that KVM_RUN will update the
 KVM_RUN_X86_GUEST_MODE bit in kvm_run.flags to indicate whether the
 vCPU was executing nested guest code when it exited.
 
+8.46 KVM_CAP_S390_KEYOP
+-----------------------
+
+:Architectures: s390
+
+The presence of this capability indicates that the KVM_S390_KEYOP ioctl is
+available.
+
 KVM exits with the register state of either the L1 or L2 guest
 depending on which executed at the time of an exit. Userspace must
 take care to differentiate between these cases.
diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c
index ac7b5f56f0b5..9f24252775dd 100644
--- a/arch/s390/kvm/kvm-s390.c
+++ b/arch/s390/kvm/kvm-s390.c
@@ -554,6 +554,37 @@ static void __kvm_s390_exit(void)
 	debug_unregister(kvm_s390_dbf_uv);
 }
 
+static int kvm_s390_keyop(struct kvm_s390_mmu_cache *mc, struct kvm *kvm, int op,
+			  unsigned long addr, union skey skey)
+{
+	union asce asce = kvm->arch.gmap->asce;
+	gfn_t gfn = gpa_to_gfn(addr);
+	int r;
+
+	guard(read_lock)(&kvm->mmu_lock);
+
+	switch (op) {
+	case KVM_S390_KEYOP_SSKE:
+		r = dat_cond_set_storage_key(mc, asce, gfn, skey, &skey, 0, 0, 0);
+		if (r >= 0)
+			return skey.skey;
+		break;
+	case KVM_S390_KEYOP_ISKE:
+		r = dat_get_storage_key(asce, gfn, &skey);
+		if (!r)
+			return skey.skey;
+		break;
+	case KVM_S390_KEYOP_RRBE:
+		r = dat_reset_reference_bit(asce, gfn);
+		if (r > 0)
+			return r << 1;
+		break;
+	default:
+		return -EINVAL;
+	}
+	return r;
+}
+
 /* Section: device related */
 long kvm_arch_dev_ioctl(struct file *filp,
 			unsigned int ioctl, unsigned long arg)
@@ -598,6 +629,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
 	case KVM_CAP_S390_DIAG318:
 	case KVM_CAP_IRQFD_RESAMPLE:
 	case KVM_CAP_S390_USER_OPEREXEC:
+	case KVM_CAP_S390_KEYOP:
 		r = 1;
 		break;
 	case KVM_CAP_SET_GUEST_DEBUG2:
@@ -2931,6 +2963,32 @@ int kvm_arch_vm_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg)
 			r = -EFAULT;
 		break;
 	}
+	case KVM_S390_KEYOP: {
+		struct kvm_s390_mmu_cache *mc;
+		struct kvm_s390_keyop kop;
+		union skey skey;
+
+		if (copy_from_user(&kop, argp, sizeof(kop))) {
+			r = -EFAULT;
+			break;
+		}
+		skey.skey = kop.key;
+
+		mc = kvm_s390_new_mmu_cache();
+		if (!mc)
+			return -ENOMEM;
+
+		r = kvm_s390_keyop(mc, kvm, kop.operation, kop.guest_addr, skey);
+		kvm_s390_free_mmu_cache(mc);
+		if (r < 0)
+			break;
+
+		kop.key = r;
+		r = 0;
+		if (copy_to_user(argp, &kop, sizeof(kop)))
+			r = -EFAULT;
+		break;
+	}
 	case KVM_S390_ZPCI_OP: {
 		struct kvm_s390_zpci_op args;
 
diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
index dddb781b0507..ab3d3d96e75f 100644
--- a/include/uapi/linux/kvm.h
+++ b/include/uapi/linux/kvm.h
@@ -974,6 +974,7 @@ struct kvm_enable_cap {
 #define KVM_CAP_GUEST_MEMFD_FLAGS 244
 #define KVM_CAP_ARM_SEA_TO_USER 245
 #define KVM_CAP_S390_USER_OPEREXEC 246
+#define KVM_CAP_S390_KEYOP 247
 
 struct kvm_irq_routing_irqchip {
 	__u32 irqchip;
@@ -1219,6 +1220,15 @@ struct kvm_vfio_spapr_tce {
 	__s32	tablefd;
 };
 
+#define KVM_S390_KEYOP_ISKE 0x01
+#define KVM_S390_KEYOP_RRBE 0x02
+#define KVM_S390_KEYOP_SSKE 0x03
+struct kvm_s390_keyop {
+	__u64 guest_addr;
+	__u8  key;
+	__u8  operation;
+};
+
 /*
  * KVM_CREATE_VCPU receives as a parameter the vcpu slot, and returns
  * a vcpu fd.
@@ -1238,6 +1248,7 @@ struct kvm_vfio_spapr_tce {
 #define KVM_S390_UCAS_MAP        _IOW(KVMIO, 0x50, struct kvm_s390_ucas_mapping)
 #define KVM_S390_UCAS_UNMAP      _IOW(KVMIO, 0x51, struct kvm_s390_ucas_mapping)
 #define KVM_S390_VCPU_FAULT	 _IOW(KVMIO, 0x52, unsigned long)
+#define KVM_S390_KEYOP           _IOWR(KVMIO, 0x53, struct kvm_s390_keyop)
 
 /* Device model IOC */
 #define KVM_CREATE_IRQCHIP        _IO(KVMIO,   0x60)
-- 
cgit v1.2.3


From f7ab71f178d56447e5efb55b65436feb68662f8f Mon Sep 17 00:00:00 2001
From: Arnd Bergmann <arnd@arndb.de>
Date: Fri, 6 Feb 2026 10:17:30 +0100
Subject: KVM: s390: Add explicit padding to struct kvm_s390_keyop

The newly added structure causes a warning about implied padding:

./usr/include/linux/kvm.h:1247:1: error: padding struct size to alignment boundary with 6 bytes [-Werror=padded]

The padding can lead to leaking kernel data and ABI incompatibilities
when used on x86. Neither of these is a problem in this specific
patch, but it's best to avoid it and use explicit padding fields
in general.

Fixes: 0ee4ddc1647b ("KVM: s390: Storage key manipulation IOCTL")
Signed-off-by: Arnd Bergmann <arnd@arndb.de>
Reviewed-by: Claudio Imbrenda <imbrenda@linux.ibm.com>
Signed-off-by: Claudio Imbrenda <imbrenda@linux.ibm.com>
---
 include/uapi/linux/kvm.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
index ab3d3d96e75f..d4250ab662fe 100644
--- a/include/uapi/linux/kvm.h
+++ b/include/uapi/linux/kvm.h
@@ -1227,6 +1227,7 @@ struct kvm_s390_keyop {
 	__u64 guest_addr;
 	__u8  key;
 	__u8  operation;
+	__u8  pad[6];
 };
 
 /*
-- 
cgit v1.2.3


From ed82f35b926b2e505c14b7006473614b8f58b4f4 Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Thu, 8 Jan 2026 10:18:31 -0700
Subject: io_uring: allow registration of per-task restrictions

Currently io_uring supports restricting operations on a per-ring basis.
To use those, the ring must be setup in a disabled state by setting
IORING_SETUP_R_DISABLED. Then restrictions can be set for the ring, and
the ring can then be enabled.

This commit adds support for IORING_REGISTER_RESTRICTIONS with ring_fd
== -1, like the other "blind" register opcodes which work on the task
rather than a specific ring. This allows registration of the same kind
of restrictions as can be done on a specific ring, but with the task
itself. Once done, any ring created will inherit these restrictions.

If a restriction filter is registered with a task, then it's inherited
on fork for its children. Children may only further restrict operations,
not extend them.

Inheriting restrictions include both the classic
IORING_REGISTER_RESTRICTIONS based restrictions, as well as the BPF
filters that have been registered with the task via
IORING_REGISTER_BPF_FILTER.

Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/io_uring_types.h |  2 +
 include/uapi/linux/io_uring.h  |  7 ++++
 io_uring/bpf_filter.c          | 86 +++++++++++++++++++++++++++++++++++++++++-
 io_uring/bpf_filter.h          |  6 +++
 io_uring/io_uring.c            | 33 ++++++++++++++++
 io_uring/io_uring.h            |  1 +
 io_uring/register.c            | 80 +++++++++++++++++++++++++++++++++++++++
 io_uring/tctx.c                | 17 +++++++++
 8 files changed, 231 insertions(+), 1 deletion(-)

(limited to 'include/uapi/linux')

diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h
index 7617df247238..510d801b9a55 100644
--- a/include/linux/io_uring_types.h
+++ b/include/linux/io_uring_types.h
@@ -231,6 +231,8 @@ struct io_restriction {
 	DECLARE_BITMAP(register_op, IORING_REGISTER_LAST);
 	DECLARE_BITMAP(sqe_op, IORING_OP_LAST);
 	struct io_bpf_filters *bpf_filters;
+	/* ->bpf_filters needs COW on modification */
+	bool bpf_filters_cow;
 	u8 sqe_flags_allowed;
 	u8 sqe_flags_required;
 	/* IORING_OP_* restrictions exist */
diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h
index 94669b77fee8..aeeffcf27fee 100644
--- a/include/uapi/linux/io_uring.h
+++ b/include/uapi/linux/io_uring.h
@@ -808,6 +808,13 @@ struct io_uring_restriction {
 	__u32 resv2[3];
 };
 
+struct io_uring_task_restriction {
+	__u16 flags;
+	__u16 nr_res;
+	__u32 resv[3];
+	__DECLARE_FLEX_ARRAY(struct io_uring_restriction, restrictions);
+};
+
 struct io_uring_clock_register {
 	__u32	clockid;
 	__u32	__resv[3];
diff --git a/io_uring/bpf_filter.c b/io_uring/bpf_filter.c
index b94944ab8442..3816883a45ed 100644
--- a/io_uring/bpf_filter.c
+++ b/io_uring/bpf_filter.c
@@ -249,13 +249,77 @@ static int io_uring_check_cbpf_filter(struct sock_filter *filter,
 	return 0;
 }
 
+void io_bpf_filter_clone(struct io_restriction *dst, struct io_restriction *src)
+{
+	if (!src->bpf_filters)
+		return;
+
+	rcu_read_lock();
+	/*
+	 * If the src filter is going away, just ignore it.
+	 */
+	if (refcount_inc_not_zero(&src->bpf_filters->refs)) {
+		dst->bpf_filters = src->bpf_filters;
+		dst->bpf_filters_cow = true;
+	}
+	rcu_read_unlock();
+}
+
+/*
+ * Allocate a new struct io_bpf_filters. Used when a filter is cloned and
+ * modifications need to be made.
+ */
+static struct io_bpf_filters *io_bpf_filter_cow(struct io_restriction *src)
+{
+	struct io_bpf_filters *filters;
+	struct io_bpf_filter *srcf;
+	int i;
+
+	filters = io_new_bpf_filters();
+	if (IS_ERR(filters))
+		return filters;
+
+	/*
+	 * Iterate filters from src and assign in destination. Grabbing
+	 * a reference is enough, we don't need to duplicate the memory.
+	 * This is safe because filters are only ever appended to the
+	 * front of the list, hence the only memory ever touched inside
+	 * a filter is the refcount.
+	 */
+	rcu_read_lock();
+	for (i = 0; i < IORING_OP_LAST; i++) {
+		srcf = rcu_dereference(src->bpf_filters->filters[i]);
+		if (!srcf) {
+			continue;
+		} else if (srcf == &dummy_filter) {
+			rcu_assign_pointer(filters->filters[i], &dummy_filter);
+			continue;
+		}
+
+		/*
+		 * Getting a ref on the first node is enough, putting the
+		 * filter and iterating nodes to free will stop on the first
+		 * one that doesn't hit zero when dropping.
+		 */
+		if (!refcount_inc_not_zero(&srcf->refs))
+			goto err;
+		rcu_assign_pointer(filters->filters[i], srcf);
+	}
+	rcu_read_unlock();
+	return filters;
+err:
+	rcu_read_unlock();
+	__io_put_bpf_filters(filters);
+	return ERR_PTR(-EBUSY);
+}
+
 #define IO_URING_BPF_FILTER_FLAGS	IO_URING_BPF_FILTER_DENY_REST
 
 int io_register_bpf_filter(struct io_restriction *res,
 			   struct io_uring_bpf __user *arg)
 {
+	struct io_bpf_filters *filters, *old_filters = NULL;
 	struct io_bpf_filter *filter, *old_filter;
-	struct io_bpf_filters *filters;
 	struct io_uring_bpf reg;
 	struct bpf_prog *prog;
 	struct sock_fprog fprog;
@@ -297,6 +361,17 @@ int io_register_bpf_filter(struct io_restriction *res,
 			ret = PTR_ERR(filters);
 			goto err_prog;
 		}
+	} else if (res->bpf_filters_cow) {
+		filters = io_bpf_filter_cow(res);
+		if (IS_ERR(filters)) {
+			ret = PTR_ERR(filters);
+			goto err_prog;
+		}
+		/*
+		 * Stash old filters, we'll put them once we know we'll
+		 * succeed. Until then, res->bpf_filters is left untouched.
+		 */
+		old_filters = res->bpf_filters;
 	}
 
 	filter = kzalloc(sizeof(*filter), GFP_KERNEL_ACCOUNT);
@@ -306,6 +381,15 @@ int io_register_bpf_filter(struct io_restriction *res,
 	}
 	refcount_set(&filter->refs, 1);
 	filter->prog = prog;
+
+	/*
+	 * Success - install the new filter set now. If we did COW, put
+	 * the old filters as we're replacing them.
+	 */
+	if (old_filters) {
+		__io_put_bpf_filters(old_filters);
+		res->bpf_filters_cow = false;
+	}
 	res->bpf_filters = filters;
 
 	/*
diff --git a/io_uring/bpf_filter.h b/io_uring/bpf_filter.h
index 9f3cdb92eb16..66a776cf25b4 100644
--- a/io_uring/bpf_filter.h
+++ b/io_uring/bpf_filter.h
@@ -13,6 +13,8 @@ int io_register_bpf_filter(struct io_restriction *res,
 
 void io_put_bpf_filters(struct io_restriction *res);
 
+void io_bpf_filter_clone(struct io_restriction *dst, struct io_restriction *src);
+
 static inline int io_uring_run_bpf_filters(struct io_bpf_filter __rcu **filters,
 					   struct io_kiocb *req)
 {
@@ -37,6 +39,10 @@ static inline int io_uring_run_bpf_filters(struct io_bpf_filter __rcu **filters,
 static inline void io_put_bpf_filters(struct io_restriction *res)
 {
 }
+static inline void io_bpf_filter_clone(struct io_restriction *dst,
+				       struct io_restriction *src)
+{
+}
 #endif /* CONFIG_IO_URING_BPF */
 
 #endif
diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
index 049454278563..e43c5283b23a 100644
--- a/io_uring/io_uring.c
+++ b/io_uring/io_uring.c
@@ -2880,6 +2880,32 @@ int io_prepare_config(struct io_ctx_config *config)
 	return 0;
 }
 
+void io_restriction_clone(struct io_restriction *dst, struct io_restriction *src)
+{
+	memcpy(&dst->register_op, &src->register_op, sizeof(dst->register_op));
+	memcpy(&dst->sqe_op, &src->sqe_op, sizeof(dst->sqe_op));
+	dst->sqe_flags_allowed = src->sqe_flags_allowed;
+	dst->sqe_flags_required = src->sqe_flags_required;
+	dst->op_registered = src->op_registered;
+	dst->reg_registered = src->reg_registered;
+
+	io_bpf_filter_clone(dst, src);
+}
+
+static void io_ctx_restriction_clone(struct io_ring_ctx *ctx,
+				     struct io_restriction *src)
+{
+	struct io_restriction *dst = &ctx->restrictions;
+
+	io_restriction_clone(dst, src);
+	if (dst->bpf_filters)
+		WRITE_ONCE(ctx->bpf_filters, dst->bpf_filters->filters);
+	if (dst->op_registered)
+		ctx->op_restricted = 1;
+	if (dst->reg_registered)
+		ctx->reg_restricted = 1;
+}
+
 static __cold int io_uring_create(struct io_ctx_config *config)
 {
 	struct io_uring_params *p = &config->p;
@@ -2940,6 +2966,13 @@ static __cold int io_uring_create(struct io_ctx_config *config)
 	else
 		ctx->notify_method = TWA_SIGNAL;
 
+	/*
+	 * If the current task has restrictions enabled, then copy them to
+	 * our newly created ring and mark it as registered.
+	 */
+	if (current->io_uring_restrict)
+		io_ctx_restriction_clone(ctx, current->io_uring_restrict);
+
 	/*
 	 * This is just grabbed for accounting purposes. When a process exits,
 	 * the mm is exited and dropped before the files, hence we need to hang
diff --git a/io_uring/io_uring.h b/io_uring/io_uring.h
index 29b8f90fdabf..a08d78c716f8 100644
--- a/io_uring/io_uring.h
+++ b/io_uring/io_uring.h
@@ -197,6 +197,7 @@ void io_task_refs_refill(struct io_uring_task *tctx);
 bool __io_alloc_req_refill(struct io_ring_ctx *ctx);
 
 void io_activate_pollwq(struct io_ring_ctx *ctx);
+void io_restriction_clone(struct io_restriction *dst, struct io_restriction *src);
 
 static inline void io_lockdep_assert_cq_locked(struct io_ring_ctx *ctx)
 {
diff --git a/io_uring/register.c b/io_uring/register.c
index 40de9b8924b9..af4815bc11d6 100644
--- a/io_uring/register.c
+++ b/io_uring/register.c
@@ -190,6 +190,82 @@ static __cold int io_register_restrictions(struct io_ring_ctx *ctx,
 	return 0;
 }
 
+static int io_register_restrictions_task(void __user *arg, unsigned int nr_args)
+{
+	struct io_uring_task_restriction __user *ures = arg;
+	struct io_uring_task_restriction tres;
+	struct io_restriction *res;
+	int ret;
+
+	/* Disallow if task already has registered restrictions */
+	if (current->io_uring_restrict)
+		return -EPERM;
+	/*
+	 * Similar to seccomp, disallow setting a filter if task_no_new_privs
+	 * is false and we're not CAP_SYS_ADMIN.
+	 */
+	if (!task_no_new_privs(current) &&
+	    !ns_capable_noaudit(current_user_ns(), CAP_SYS_ADMIN))
+		return -EACCES;
+	if (nr_args != 1)
+		return -EINVAL;
+
+	if (copy_from_user(&tres, arg, sizeof(tres)))
+		return -EFAULT;
+
+	if (tres.flags)
+		return -EINVAL;
+	if (!mem_is_zero(tres.resv, sizeof(tres.resv)))
+		return -EINVAL;
+
+	res = kzalloc(sizeof(*res), GFP_KERNEL_ACCOUNT);
+	if (!res)
+		return -ENOMEM;
+
+	ret = io_parse_restrictions(ures->restrictions, tres.nr_res, res);
+	if (ret < 0) {
+		kfree(res);
+		return ret;
+	}
+	current->io_uring_restrict = res;
+	return 0;
+}
+
+static int io_register_bpf_filter_task(void __user *arg, unsigned int nr_args)
+{
+	struct io_restriction *res;
+	int ret;
+
+	/*
+	 * Similar to seccomp, disallow setting a filter if task_no_new_privs
+	 * is false and we're not CAP_SYS_ADMIN.
+	 */
+	if (!task_no_new_privs(current) &&
+	    !ns_capable_noaudit(current_user_ns(), CAP_SYS_ADMIN))
+		return -EACCES;
+
+	if (nr_args != 1)
+		return -EINVAL;
+
+	/* If no task restrictions exist, setup a new set */
+	res = current->io_uring_restrict;
+	if (!res) {
+		res = kzalloc(sizeof(*res), GFP_KERNEL_ACCOUNT);
+		if (!res)
+			return -ENOMEM;
+	}
+
+	ret = io_register_bpf_filter(res, arg);
+	if (ret) {
+		if (res != current->io_uring_restrict)
+			kfree(res);
+		return ret;
+	}
+	if (!current->io_uring_restrict)
+		current->io_uring_restrict = res;
+	return 0;
+}
+
 static int io_register_enable_rings(struct io_ring_ctx *ctx)
 {
 	if (!(ctx->flags & IORING_SETUP_R_DISABLED))
@@ -912,6 +988,10 @@ static int io_uring_register_blind(unsigned int opcode, void __user *arg,
 		return io_uring_register_send_msg_ring(arg, nr_args);
 	case IORING_REGISTER_QUERY:
 		return io_query(arg, nr_args);
+	case IORING_REGISTER_RESTRICTIONS:
+		return io_register_restrictions_task(arg, nr_args);
+	case IORING_REGISTER_BPF_FILTER:
+		return io_register_bpf_filter_task(arg, nr_args);
 	}
 	return -EINVAL;
 }
diff --git a/io_uring/tctx.c b/io_uring/tctx.c
index d4f7698805e4..e3da31fdf16f 100644
--- a/io_uring/tctx.c
+++ b/io_uring/tctx.c
@@ -11,6 +11,7 @@
 
 #include "io_uring.h"
 #include "tctx.h"
+#include "bpf_filter.h"
 
 static struct io_wq *io_init_wq_offload(struct io_ring_ctx *ctx,
 					struct task_struct *task)
@@ -66,6 +67,11 @@ void __io_uring_free(struct task_struct *tsk)
 		kfree(tctx);
 		tsk->io_uring = NULL;
 	}
+	if (tsk->io_uring_restrict) {
+		io_put_bpf_filters(tsk->io_uring_restrict);
+		kfree(tsk->io_uring_restrict);
+		tsk->io_uring_restrict = NULL;
+	}
 }
 
 __cold int io_uring_alloc_task_context(struct task_struct *task,
@@ -356,5 +362,16 @@ int io_ringfd_unregister(struct io_ring_ctx *ctx, void __user *__arg,
 
 int __io_uring_fork(struct task_struct *tsk)
 {
+	struct io_restriction *res, *src = tsk->io_uring_restrict;
+
+	/* Don't leave it dangling on error */
+	tsk->io_uring_restrict = NULL;
+
+	res = kzalloc(sizeof(*res), GFP_KERNEL_ACCOUNT);
+	if (!res)
+		return -ENOMEM;
+
+	tsk->io_uring_restrict = res;
+	io_restriction_clone(res, src);
 	return 0;
 }
-- 
cgit v1.2.3


From 42fc7e6543f6d17d2cf9ed3e5021f103a3d11182 Mon Sep 17 00:00:00 2001
From: Günther Noack <gnoack@google.com>
Date: Thu, 27 Nov 2025 12:51:34 +0100
Subject: landlock: Multithreading support for landlock_restrict_self()
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Introduce the LANDLOCK_RESTRICT_SELF_TSYNC flag.  With this flag, a
given Landlock ruleset is applied to all threads of the calling
process, instead of only the current one.

Without this flag, multithreaded userspace programs currently resort
to using the nptl(7)/libpsx hack for multithreaded policy enforcement,
which is also used by libcap and for setuid(2).  Using this
userspace-based scheme, the threads of a process enforce the same
Landlock policy, but the resulting Landlock domains are still
separate.  The domains being separate causes multiple problems:

* When using Landlock's "scoped" access rights, the domain identity is
  used to determine whether an operation is permitted.  As a result,
  when using LANDLOCK_SCOPE_SIGNAL, signaling between sibling threads
  stops working.  This is a problem for programming languages and
  frameworks which are inherently multithreaded (e.g. Go).

* In audit logging, the domains of separate threads in a process will
  get logged with different domain IDs, even when they are based on
  the same ruleset FD, which might confuse users.

Cc: Andrew G. Morgan <morgan@kernel.org>
Cc: John Johansen <john.johansen@canonical.com>
Cc: Paul Moore <paul@paul-moore.com>
Suggested-by: Jann Horn <jannh@google.com>
Signed-off-by: Günther Noack <gnoack@google.com>
Link: https://lore.kernel.org/r/20251127115136.3064948-2-gnoack@google.com
[mic: Fix restrict_self_flags test, clean up Makefile, align comments,
reduce local variable scope, add missing includes]
Closes: https://github.com/landlock-lsm/linux/issues/2
Signed-off-by: Mickaël Salaün <mic@digikod.net>
---
 include/uapi/linux/landlock.h                |  13 +
 security/landlock/Makefile                   |  11 +-
 security/landlock/cred.h                     |  12 +
 security/landlock/limits.h                   |   2 +-
 security/landlock/syscalls.c                 |  61 +--
 security/landlock/tsync.c                    | 561 +++++++++++++++++++++++++++
 security/landlock/tsync.h                    |  16 +
 tools/testing/selftests/landlock/base_test.c |   4 +-
 8 files changed, 650 insertions(+), 30 deletions(-)
 create mode 100644 security/landlock/tsync.c
 create mode 100644 security/landlock/tsync.h

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/landlock.h b/include/uapi/linux/landlock.h
index 75fd7f5e6cc3..d5081ab4e5ef 100644
--- a/include/uapi/linux/landlock.h
+++ b/include/uapi/linux/landlock.h
@@ -117,11 +117,24 @@ struct landlock_ruleset_attr {
  *     future nested domains, not the one being created. It can also be used
  *     with a @ruleset_fd value of -1 to mute subdomain logs without creating a
  *     domain.
+ *
+ * The following flag supports policy enforcement in multithreaded processes:
+ *
+ * %LANDLOCK_RESTRICT_SELF_TSYNC
+ *     Applies the new Landlock configuration atomically to all threads of the
+ *     current process, including the Landlock domain and logging
+ *     configuration. This overrides the Landlock configuration of sibling
+ *     threads, irrespective of previously established Landlock domains and
+ *     logging configurations on these threads.
+ *
+ *     If the calling thread is running with no_new_privs, this operation
+ *     enables no_new_privs on the sibling threads as well.
  */
 /* clang-format off */
 #define LANDLOCK_RESTRICT_SELF_LOG_SAME_EXEC_OFF		(1U << 0)
 #define LANDLOCK_RESTRICT_SELF_LOG_NEW_EXEC_ON			(1U << 1)
 #define LANDLOCK_RESTRICT_SELF_LOG_SUBDOMAINS_OFF		(1U << 2)
+#define LANDLOCK_RESTRICT_SELF_TSYNC				(1U << 3)
 /* clang-format on */
 
 /**
diff --git a/security/landlock/Makefile b/security/landlock/Makefile
index 3160c2bdac1d..ffa7646d99f3 100644
--- a/security/landlock/Makefile
+++ b/security/landlock/Makefile
@@ -1,7 +1,14 @@
 obj-$(CONFIG_SECURITY_LANDLOCK) := landlock.o
 
-landlock-y := setup.o syscalls.o object.o ruleset.o \
-	cred.o task.o fs.o
+landlock-y := \
+	setup.o \
+	syscalls.o \
+	object.o \
+	ruleset.o \
+	cred.o \
+	task.o \
+	fs.o \
+	tsync.o
 
 landlock-$(CONFIG_INET) += net.o
 
diff --git a/security/landlock/cred.h b/security/landlock/cred.h
index c82fe63ec598..c10a06727eb1 100644
--- a/security/landlock/cred.h
+++ b/security/landlock/cred.h
@@ -26,6 +26,8 @@
  * This structure is packed to minimize the size of struct
  * landlock_file_security.  However, it is always aligned in the LSM cred blob,
  * see lsm_set_blob_size().
+ *
+ * When updating this, also update landlock_cred_copy() if needed.
  */
 struct landlock_cred_security {
 	/**
@@ -65,6 +67,16 @@ landlock_cred(const struct cred *cred)
 	return cred->security + landlock_blob_sizes.lbs_cred;
 }
 
+static inline void landlock_cred_copy(struct landlock_cred_security *dst,
+				      const struct landlock_cred_security *src)
+{
+	landlock_put_ruleset(dst->domain);
+
+	*dst = *src;
+
+	landlock_get_ruleset(src->domain);
+}
+
 static inline struct landlock_ruleset *landlock_get_current_domain(void)
 {
 	return landlock_cred(current_cred())->domain;
diff --git a/security/landlock/limits.h b/security/landlock/limits.h
index 65b5ff051674..eb584f47288d 100644
--- a/security/landlock/limits.h
+++ b/security/landlock/limits.h
@@ -31,7 +31,7 @@
 #define LANDLOCK_MASK_SCOPE		((LANDLOCK_LAST_SCOPE << 1) - 1)
 #define LANDLOCK_NUM_SCOPE		__const_hweight64(LANDLOCK_MASK_SCOPE)
 
-#define LANDLOCK_LAST_RESTRICT_SELF	LANDLOCK_RESTRICT_SELF_LOG_SUBDOMAINS_OFF
+#define LANDLOCK_LAST_RESTRICT_SELF	LANDLOCK_RESTRICT_SELF_TSYNC
 #define LANDLOCK_MASK_RESTRICT_SELF	((LANDLOCK_LAST_RESTRICT_SELF << 1) - 1)
 
 /* clang-format on */
diff --git a/security/landlock/syscalls.c b/security/landlock/syscalls.c
index 0116e9f93ffe..3e4e99deb7f9 100644
--- a/security/landlock/syscalls.c
+++ b/security/landlock/syscalls.c
@@ -36,6 +36,7 @@
 #include "net.h"
 #include "ruleset.h"
 #include "setup.h"
+#include "tsync.h"
 
 static bool is_initialized(void)
 {
@@ -161,7 +162,7 @@ static const struct file_operations ruleset_fops = {
  * Documentation/userspace-api/landlock.rst should be updated to reflect the
  * UAPI change.
  */
-const int landlock_abi_version = 7;
+const int landlock_abi_version = 8;
 
 /**
  * sys_landlock_create_ruleset - Create a new ruleset
@@ -454,9 +455,10 @@ SYSCALL_DEFINE4(landlock_add_rule, const int, ruleset_fd,
  *         - %LANDLOCK_RESTRICT_SELF_LOG_SAME_EXEC_OFF
  *         - %LANDLOCK_RESTRICT_SELF_LOG_NEW_EXEC_ON
  *         - %LANDLOCK_RESTRICT_SELF_LOG_SUBDOMAINS_OFF
+ *         - %LANDLOCK_RESTRICT_SELF_TSYNC
  *
- * This system call enables to enforce a Landlock ruleset on the current
- * thread.  Enforcing a ruleset requires that the task has %CAP_SYS_ADMIN in its
+ * This system call enforces a Landlock ruleset on the current thread.
+ * Enforcing a ruleset requires that the task has %CAP_SYS_ADMIN in its
  * namespace or is running with no_new_privs.  This avoids scenarios where
  * unprivileged tasks can affect the behavior of privileged children.
  *
@@ -478,8 +480,7 @@ SYSCALL_DEFINE4(landlock_add_rule, const int, ruleset_fd,
 SYSCALL_DEFINE2(landlock_restrict_self, const int, ruleset_fd, const __u32,
 		flags)
 {
-	struct landlock_ruleset *new_dom,
-		*ruleset __free(landlock_put_ruleset) = NULL;
+	struct landlock_ruleset *ruleset __free(landlock_put_ruleset) = NULL;
 	struct cred *new_cred;
 	struct landlock_cred_security *new_llcred;
 	bool __maybe_unused log_same_exec, log_new_exec, log_subdomains,
@@ -538,33 +539,43 @@ SYSCALL_DEFINE2(landlock_restrict_self, const int, ruleset_fd, const __u32,
 	 * We could optimize this case by not calling commit_creds() if this flag
 	 * was already set, but it is not worth the complexity.
 	 */
-	if (!ruleset)
-		return commit_creds(new_cred);
-
-	/*
-	 * There is no possible race condition while copying and manipulating
-	 * the current credentials because they are dedicated per thread.
-	 */
-	new_dom = landlock_merge_ruleset(new_llcred->domain, ruleset);
-	if (IS_ERR(new_dom)) {
-		abort_creds(new_cred);
-		return PTR_ERR(new_dom);
-	}
+	if (ruleset) {
+		/*
+		 * There is no possible race condition while copying and
+		 * manipulating the current credentials because they are
+		 * dedicated per thread.
+		 */
+		struct landlock_ruleset *const new_dom =
+			landlock_merge_ruleset(new_llcred->domain, ruleset);
+		if (IS_ERR(new_dom)) {
+			abort_creds(new_cred);
+			return PTR_ERR(new_dom);
+		}
 
 #ifdef CONFIG_AUDIT
-	new_dom->hierarchy->log_same_exec = log_same_exec;
-	new_dom->hierarchy->log_new_exec = log_new_exec;
-	if ((!log_same_exec && !log_new_exec) || !prev_log_subdomains)
-		new_dom->hierarchy->log_status = LANDLOCK_LOG_DISABLED;
+		new_dom->hierarchy->log_same_exec = log_same_exec;
+		new_dom->hierarchy->log_new_exec = log_new_exec;
+		if ((!log_same_exec && !log_new_exec) || !prev_log_subdomains)
+			new_dom->hierarchy->log_status = LANDLOCK_LOG_DISABLED;
 #endif /* CONFIG_AUDIT */
 
-	/* Replaces the old (prepared) domain. */
-	landlock_put_ruleset(new_llcred->domain);
-	new_llcred->domain = new_dom;
+		/* Replaces the old (prepared) domain. */
+		landlock_put_ruleset(new_llcred->domain);
+		new_llcred->domain = new_dom;
 
 #ifdef CONFIG_AUDIT
-	new_llcred->domain_exec |= BIT(new_dom->num_layers - 1);
+		new_llcred->domain_exec |= BIT(new_dom->num_layers - 1);
 #endif /* CONFIG_AUDIT */
+	}
+
+	if (flags & LANDLOCK_RESTRICT_SELF_TSYNC) {
+		const int err = landlock_restrict_sibling_threads(
+			current_cred(), new_cred);
+		if (err) {
+			abort_creds(new_cred);
+			return err;
+		}
+	}
 
 	return commit_creds(new_cred);
 }
diff --git a/security/landlock/tsync.c b/security/landlock/tsync.c
new file mode 100644
index 000000000000..0d2b9c646030
--- /dev/null
+++ b/security/landlock/tsync.c
@@ -0,0 +1,561 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Landlock - Cross-thread ruleset enforcement
+ *
+ * Copyright © 2025 Google LLC
+ */
+
+#include <linux/atomic.h>
+#include <linux/cleanup.h>
+#include <linux/completion.h>
+#include <linux/cred.h>
+#include <linux/errno.h>
+#include <linux/overflow.h>
+#include <linux/rcupdate.h>
+#include <linux/sched.h>
+#include <linux/sched/signal.h>
+#include <linux/sched/task.h>
+#include <linux/slab.h>
+#include <linux/task_work.h>
+
+#include "cred.h"
+#include "tsync.h"
+
+/*
+ * Shared state between multiple threads which are enforcing Landlock rulesets
+ * in lockstep with each other.
+ */
+struct tsync_shared_context {
+	/* The old and tentative new creds of the calling thread. */
+	const struct cred *old_cred;
+	const struct cred *new_cred;
+
+	/* True if sibling tasks need to set the no_new_privs flag. */
+	bool set_no_new_privs;
+
+	/* An error encountered in preparation step, or 0. */
+	atomic_t preparation_error;
+
+	/*
+	 * Barrier after preparation step in restrict_one_thread.
+	 * The calling thread waits for completion.
+	 *
+	 * Re-initialized on every round of looking for newly spawned threads.
+	 */
+	atomic_t num_preparing;
+	struct completion all_prepared;
+
+	/* Sibling threads wait for completion. */
+	struct completion ready_to_commit;
+
+	/*
+	 * Barrier after commit step (used by syscall impl to wait for
+	 * completion).
+	 */
+	atomic_t num_unfinished;
+	struct completion all_finished;
+};
+
+struct tsync_work {
+	struct callback_head work; /* task_work entry queued on @task */
+	struct task_struct *task; /* referenced thread; NULL if nothing queued */
+	struct tsync_shared_context *shared_ctx;
+};
+
+/*
+ * restrict_one_thread - update a thread's Landlock domain in lockstep with the
+ * other threads in the same process
+ *
+ * When this is run, the same function gets run in all other threads in the same
+ * process (except for the calling thread which called landlock_restrict_self).
+ * The concurrently running invocations of restrict_one_thread coordinate
+ * through the shared ctx object to do their work in lockstep to implement
+ * all-or-nothing semantics for enforcing the new Landlock domain.
+ *
+ * Afterwards, depending on the presence of an error, all threads either commit
+ * or abort the prepared credentials.  The commit operation can not fail any
+ * more.
+ */
+static void restrict_one_thread(struct tsync_shared_context *ctx)
+{
+	int err;
+	struct cred *cred = NULL;
+
+	if (current_cred() == ctx->old_cred) {
+		/*
+		 * Switch out old_cred with new_cred, if possible.
+		 *
+		 * In the common case, where all threads initially point to the same
+		 * struct cred, this optimization avoids creating separate redundant
+		 * credentials objects for each, which would all have the same contents.
+		 *
+		 * Note: We are intentionally dropping the const qualifier here, because
+		 * it is required by commit_creds() and abort_creds().
+		 */
+		cred = (struct cred *)get_cred(ctx->new_cred);
+	} else {
+		/* Else, prepare new creds and populate them. */
+		cred = prepare_creds();
+
+		if (!cred) {
+			atomic_set(&ctx->preparation_error, -ENOMEM);
+
+			/*
+			 * Even on error, we need to adhere to the protocol and coordinate
+			 * with concurrently running invocations.
+			 */
+			if (atomic_dec_return(&ctx->num_preparing) == 0)
+				complete_all(&ctx->all_prepared);
+
+			goto out;
+		}
+
+		landlock_cred_copy(landlock_cred(cred),
+				   landlock_cred(ctx->new_cred));
+	}
+
+	/*
+	 * Barrier: Wait until all threads are done preparing.
+	 * After this point, we can have no more failures.
+	 */
+	if (atomic_dec_return(&ctx->num_preparing) == 0)
+		complete_all(&ctx->all_prepared);
+
+	/*
+	 * Wait for signal from calling thread that it's safe to read the
+	 * preparation error now and we are ready to commit (or abort).
+	 */
+	wait_for_completion(&ctx->ready_to_commit);
+
+	/* Abort the commit if any of the other threads had an error. */
+	err = atomic_read(&ctx->preparation_error);
+	if (err) {
+		abort_creds(cred);
+		goto out;
+	}
+
+	/*
+	 * Make sure that all sibling tasks fulfill the no_new_privs prerequisite.
+	 * (This is in line with Seccomp's SECCOMP_FILTER_FLAG_TSYNC logic in
+	 * kernel/seccomp.c)
+	 */
+	if (ctx->set_no_new_privs)
+		task_set_no_new_privs(current);
+
+	commit_creds(cred);
+
+out:
+	/* Notify the calling thread once all threads are done */
+	if (atomic_dec_return(&ctx->num_unfinished) == 0)
+		complete_all(&ctx->all_finished);
+}
+
+/*
+ * restrict_one_thread_callback - task_work callback for restricting a thread
+ *
+ * Calls restrict_one_thread() with the work's struct tsync_shared_context.
+ */
+static void restrict_one_thread_callback(struct callback_head *work)
+{
+	struct tsync_work *ctx = container_of(work, struct tsync_work, work);
+
+	restrict_one_thread(ctx->shared_ctx);
+}
+
+/*
+ * struct tsync_works - a growable array of per-task contexts
+ *
+ * The zero-initialized struct represents the empty array.
+ */
+struct tsync_works {
+	struct tsync_work **works;
+	size_t size;
+	size_t capacity;
+};
+
+/*
+ * tsync_works_provide - provides a preallocated tsync_work for the given task
+ *
+ * This also stores a task pointer in the context and increments the reference
+ * count of the task.
+ *
+ * This function may fail in the case where we did not preallocate sufficient
+ * capacity.  This can legitimately happen if new threads get started after we
+ * grew the capacity.
+ *
+ * Returns:
+ *   A pointer to the preallocated context struct, with task filled in.
+ *
+ *   NULL, if we ran out of preallocated context structs.
+ */
+static struct tsync_work *tsync_works_provide(struct tsync_works *s,
+					      struct task_struct *task)
+{
+	struct tsync_work *ctx;
+
+	if (s->size >= s->capacity)
+		return NULL;
+
+	ctx = s->works[s->size];
+	s->size++;
+
+	ctx->task = get_task_struct(task);
+	return ctx;
+}
+
+/*
+ * tsync_works_grow_by - preallocates space for n more contexts in s
+ *
+ * On a successful return, the subsequent n calls to tsync_works_provide() are
+ * guaranteed to succeed.  (size + n <= capacity)
+ *
+ * Returns:
+ *   -EOVERFLOW if size + n overflows
+ *   -ENOMEM    if the (re)allocation fails; contexts which were allocated
+ *              before the failure stay usable and are counted in capacity
+ *   0          if the allocation succeeds or no reallocation was needed
+ */
+static int tsync_works_grow_by(struct tsync_works *s, size_t n, gfp_t flags)
+{
+	size_t i;
+	size_t new_capacity;
+	struct tsync_work **works;
+	struct tsync_work *work;
+
+	if (check_add_overflow(s->size, n, &new_capacity))
+		return -EOVERFLOW;
+
+	/* No need to reallocate if s already has sufficient capacity. */
+	if (new_capacity <= s->capacity)
+		return 0;
+
+	works = krealloc_array(s->works, new_capacity, sizeof(s->works[0]),
+			       flags);
+	if (!works)
+		return -ENOMEM;
+
+	s->works = works;
+
+	for (i = s->capacity; i < new_capacity; i++) {
+		work = kzalloc(sizeof(*work), flags);
+		if (!work) {
+			/*
+			 * Leave the object in a consistent state,
+			 * but return an error.
+			 */
+			s->capacity = i;
+			return -ENOMEM;
+		}
+		s->works[i] = work;
+	}
+	s->capacity = new_capacity;
+	return 0;
+}
+
+/*
+ * tsync_works_contains_task - checks for presence of task in s
+ */
+static bool tsync_works_contains_task(const struct tsync_works *s,
+				      struct task_struct *task)
+{
+	size_t i;
+
+	for (i = 0; i < s->size; i++)
+		if (s->works[i]->task == task)
+			return true;
+	return false;
+}
+
+/*
+ * tsync_works_release - frees memory held by s and drops all task references
+ *
+ * This does not free s itself, only the data structures held by it.
+ */
+static void tsync_works_release(struct tsync_works *s)
+{
+	size_t i;
+
+	for (i = 0; i < s->size; i++) {
+		if (!s->works[i]->task)
+			continue;
+
+		put_task_struct(s->works[i]->task);
+	}
+
+	for (i = 0; i < s->capacity; i++)
+		kfree(s->works[i]);
+	kfree(s->works);
+	s->works = NULL;
+	s->size = 0;
+	s->capacity = 0;
+}
+
+/*
+ * count_additional_threads - counts the sibling threads that are not in works
+ */
+static size_t count_additional_threads(const struct tsync_works *works)
+{
+	struct task_struct *thread, *caller;
+	size_t n = 0;
+
+	caller = current;
+
+	guard(rcu)();
+
+	for_each_thread(caller, thread) {
+		/* Skip current, since it is initiating the sync. */
+		if (thread == caller)
+			continue;
+
+		/* Skip exited threads. */
+		if (thread->flags & PF_EXITING)
+			continue;
+
+		/* Skip threads that we have already seen. */
+		if (tsync_works_contains_task(works, thread))
+			continue;
+
+		n++;
+	}
+	return n;
+}
+
+/*
+ * schedule_task_work - adds task_work for all eligible sibling threads
+ *                      which have not been scheduled yet
+ *
+ * For each added task_work, atomically increments shared_ctx->num_preparing and
+ * shared_ctx->num_unfinished.
+ *
+ * Returns:
+ *     true, if at least one eligible sibling thread was found
+ */
+static bool schedule_task_work(struct tsync_works *works,
+			       struct tsync_shared_context *shared_ctx)
+{
+	int err;
+	struct task_struct *thread, *caller;
+	struct tsync_work *ctx;
+	bool found_more_threads = false;
+
+	caller = current;
+
+	guard(rcu)();
+
+	for_each_thread(caller, thread) {
+		/* Skip current, since it is initiating the sync. */
+		if (thread == caller)
+			continue;
+
+		/* Skip exited threads. */
+		if (thread->flags & PF_EXITING)
+			continue;
+
+		/* Skip threads that we already looked at. */
+		if (tsync_works_contains_task(works, thread))
+			continue;
+
+		/*
+		 * We found a sibling thread that is not doing its task_work yet, and
+		 * which might spawn new threads before our task work runs, so we need
+		 * at least one more round in the outer loop.
+		 */
+		found_more_threads = true;
+
+		ctx = tsync_works_provide(works, thread);
+		if (!ctx) {
+			/*
+			 * We ran out of preallocated contexts -- we need to try again with
+			 * this thread at a later time!
+			 * found_more_threads is already true at this point.
+			 */
+			break;
+		}
+
+		ctx->shared_ctx = shared_ctx;
+
+		atomic_inc(&shared_ctx->num_preparing);
+		atomic_inc(&shared_ctx->num_unfinished);
+
+		init_task_work(&ctx->work, restrict_one_thread_callback);
+		err = task_work_add(thread, &ctx->work, TWA_SIGNAL);
+		if (err) {
+			/*
+			 * task_work_add() only fails if the task is about to exit.  We
+			 * checked that earlier, but it can happen as a race.  Resume
+			 * without setting an error, as the task is probably gone in the
+			 * next loop iteration.  For consistency, remove the task from ctx
+			 * so that it does not look like we handed it a task_work.
+			 */
+			put_task_struct(ctx->task);
+			ctx->task = NULL;
+
+			atomic_dec(&shared_ctx->num_preparing);
+			atomic_dec(&shared_ctx->num_unfinished);
+		}
+	}
+
+	return found_more_threads;
+}
+
+/*
+ * cancel_tsync_works - cancel all task works where it is possible
+ *
+ * Task works can be canceled as long as they are still queued and have not
+ * started running.  If they get canceled, we decrement num_preparing and
+ * num_unfinished in shared_ctx and mark the two completions if needed, as if
+ * the task was never scheduled.  Skips entries where task_work_add() failed.
+ */
+static void cancel_tsync_works(struct tsync_works *works,
+			       struct tsync_shared_context *shared_ctx)
+{
+	size_t i;
+
+	for (i = 0; i < works->size; i++) {
+		struct tsync_work *work = works->works[i];
+
+		/* task is NULL when task_work_add() failed: nothing queued. */
+		if (!work->task || !task_work_cancel(work->task, &work->work))
+			continue;
+
+		/* After dequeueing, act as if the task work had executed. */
+		if (atomic_dec_return(&shared_ctx->num_preparing) == 0)
+			complete_all(&shared_ctx->all_prepared);
+		if (atomic_dec_return(&shared_ctx->num_unfinished) == 0)
+			complete_all(&shared_ctx->all_finished);
+	}
+}
+
+/*
+ * landlock_restrict_sibling_threads - enables a Landlock policy for all sibling threads
+ */
+int landlock_restrict_sibling_threads(const struct cred *old_cred,
+				      const struct cred *new_cred)
+{
+	int err;
+	struct tsync_shared_context shared_ctx;
+	struct tsync_works works = {};
+	size_t newly_discovered_threads;
+	bool found_more_threads;
+
+	atomic_set(&shared_ctx.preparation_error, 0);
+	init_completion(&shared_ctx.all_prepared);
+	init_completion(&shared_ctx.ready_to_commit);
+	atomic_set(&shared_ctx.num_unfinished, 1);
+	init_completion(&shared_ctx.all_finished);
+	shared_ctx.old_cred = old_cred;
+	shared_ctx.new_cred = new_cred;
+	shared_ctx.set_no_new_privs = task_no_new_privs(current);
+
+	/*
+	 * We schedule a pseudo-signal task_work for each of the calling task's
+	 * sibling threads.  In the task work, each thread:
+	 *
+	 * 1) runs prepare_creds() and writes back the error to
+	 *    shared_ctx.preparation_error, if needed.
+	 *
+	 * 2) signals that it's done with prepare_creds() to the calling task.
+	 *    (completion "all_prepared").
+	 *
+	 * 3) waits for the completion "ready_to_commit".  This is sent by the
+	 *    calling task after ensuring that all sibling threads are done
+	 *    with the "preparation" stage.
+	 *
+	 *    After this barrier is reached, it's safe to read
+	 *    shared_ctx.preparation_error.
+	 *
+	 * 4) reads shared_ctx.preparation_error and then either does commit_creds()
+	 *    or abort_creds().
+	 *
+	 * 5) signals that it's done altogether (barrier synchronization
+	 *    "all_finished")
+	 *
+	 * Unlike seccomp, which modifies sibling tasks directly, we do not need to
+	 * acquire the cred_guard_mutex and sighand->siglock:
+	 *
+	 * - As in our case, all threads are themselves exchanging their own struct
+	 *   cred through the credentials API, no locks are needed for that.
+	 * - Our for_each_thread() loops are protected by RCU.
+	 * - We do not acquire a lock to keep the list of sibling threads stable
+	 *   between our for_each_thread loops.  If the list of available sibling
+	 *   threads changes between these for_each_thread loops, we make up for
+	 *   that by continuing to look for threads until they are all discovered
+	 *   and have entered their task_work, where they are unable to spawn new
+	 *   threads.
+	 */
+	do {
+		/* In RCU read-lock, count the threads we need. */
+		newly_discovered_threads = count_additional_threads(&works);
+
+		if (newly_discovered_threads == 0)
+			break; /* done */
+
+		err = tsync_works_grow_by(&works, newly_discovered_threads,
+					  GFP_KERNEL_ACCOUNT);
+		if (err) {
+			atomic_set(&shared_ctx.preparation_error, err);
+			break;
+		}
+
+		/*
+		 * The "all_prepared" barrier is used locally to the loop body, this use
+		 * of for_each_thread().  We can reset it on each loop iteration because
+		 * all previous loop iterations are done with it already.
+		 *
+		 * num_preparing is initialized to 1 so that the counter can not go to 0
+		 * and mark the completion as done before all task works are registered.
+		 * We decrement it at the end of the loop body.
+		 */
+		atomic_set(&shared_ctx.num_preparing, 1);
+		reinit_completion(&shared_ctx.all_prepared);
+
+		/*
+		 * In RCU read-lock, schedule task work on newly discovered sibling
+		 * tasks.
+		 */
+		found_more_threads = schedule_task_work(&works, &shared_ctx);
+
+		/*
+		 * Decrement num_preparing for current, to undo that we initialized it
+		 * to 1 a few lines above.
+		 */
+		if (atomic_dec_return(&shared_ctx.num_preparing) > 0) {
+			if (wait_for_completion_interruptible(
+				    &shared_ctx.all_prepared)) {
+				/* In case of interruption, we need to retry the system call. */
+				atomic_set(&shared_ctx.preparation_error,
+					   -ERESTARTNOINTR);
+
+				/*
+				 * Cancel task works for tasks that did not start running yet,
+				 * and decrement num_preparing and num_unfinished accordingly.
+				 */
+				cancel_tsync_works(&works, &shared_ctx);
+
+				/*
+				 * The remaining task works have started running, so waiting for
+				 * their completion will finish.
+				 */
+				wait_for_completion(&shared_ctx.all_prepared);
+			}
+		}
+	} while (found_more_threads &&
+		 !atomic_read(&shared_ctx.preparation_error));
+
+	/*
+	 * We now have all sibling threads blocking and in "prepared" state in the
+	 * task work. Ask all threads to commit.
+	 */
+	complete_all(&shared_ctx.ready_to_commit);
+
+	/*
+	 * Decrement num_unfinished for current, to undo that we initialized it to 1
+	 * at the beginning.
+	 */
+	if (atomic_dec_return(&shared_ctx.num_unfinished) > 0)
+		wait_for_completion(&shared_ctx.all_finished);
+
+	tsync_works_release(&works);
+
+	return atomic_read(&shared_ctx.preparation_error);
+}
diff --git a/security/landlock/tsync.h b/security/landlock/tsync.h
new file mode 100644
index 000000000000..ef86bb61c2f6
--- /dev/null
+++ b/security/landlock/tsync.h
@@ -0,0 +1,16 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Landlock - Cross-thread ruleset enforcement
+ *
+ * Copyright © 2025 Google LLC
+ */
+
+#ifndef _SECURITY_LANDLOCK_TSYNC_H
+#define _SECURITY_LANDLOCK_TSYNC_H
+
+#include <linux/cred.h>
+
+int landlock_restrict_sibling_threads(const struct cred *old_cred,
+				      const struct cred *new_cred);
+
+#endif /* _SECURITY_LANDLOCK_TSYNC_H */
diff --git a/tools/testing/selftests/landlock/base_test.c b/tools/testing/selftests/landlock/base_test.c
index 7b69002239d7..fdbb672009ac 100644
--- a/tools/testing/selftests/landlock/base_test.c
+++ b/tools/testing/selftests/landlock/base_test.c
@@ -76,7 +76,7 @@ TEST(abi_version)
 	const struct landlock_ruleset_attr ruleset_attr = {
 		.handled_access_fs = LANDLOCK_ACCESS_FS_READ_FILE,
 	};
-	ASSERT_EQ(7, landlock_create_ruleset(NULL, 0,
+	ASSERT_EQ(8, landlock_create_ruleset(NULL, 0,
 					     LANDLOCK_CREATE_RULESET_VERSION));
 
 	ASSERT_EQ(-1, landlock_create_ruleset(&ruleset_attr, 0,
@@ -306,7 +306,7 @@ TEST(restrict_self_fd_flags)
 
 TEST(restrict_self_flags)
 {
-	const __u32 last_flag = LANDLOCK_RESTRICT_SELF_LOG_SUBDOMAINS_OFF;
+	const __u32 last_flag = LANDLOCK_RESTRICT_SELF_TSYNC;
 
 	/* Tests invalid flag combinations. */
 
-- 
cgit v1.2.3


From bbb6f53e905ca119f99ccab8496f8921d9db9c50 Mon Sep 17 00:00:00 2001
From: Matthieu Buffet <matthieu@buffet.re>
Date: Fri, 12 Dec 2025 17:36:57 +0100
Subject: landlock: Minor reword of docs for TCP access rights
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- Move ABI requirement next to each access right to prepare adding more
  access rights;
- Mention the possibility to remove the random component of a socket's
  ephemeral port choice within the netns-wide ephemeral port range,
  since it allows choosing the "random" ephemeral port.

Signed-off-by: Matthieu Buffet <matthieu@buffet.re>
Link: https://lore.kernel.org/r/20251212163704.142301-2-matthieu@buffet.re
Signed-off-by: Mickaël Salaün <mic@digikod.net>
---
 include/uapi/linux/landlock.h | 17 +++++++++--------
 1 file changed, 9 insertions(+), 8 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/landlock.h b/include/uapi/linux/landlock.h
index d5081ab4e5ef..f88fa1f68b77 100644
--- a/include/uapi/linux/landlock.h
+++ b/include/uapi/linux/landlock.h
@@ -195,11 +195,13 @@ struct landlock_net_port_attr {
 	 * It should be noted that port 0 passed to :manpage:`bind(2)` will bind
 	 * to an available port from the ephemeral port range.  This can be
 	 * configured with the ``/proc/sys/net/ipv4/ip_local_port_range`` sysctl
-	 * (also used for IPv6).
+	 * (also used for IPv6), and within that range, on a per-socket basis
+	 * with ``setsockopt(IP_LOCAL_PORT_RANGE)``.
 	 *
-	 * A Landlock rule with port 0 and the ``LANDLOCK_ACCESS_NET_BIND_TCP``
+	 * A Landlock rule with port 0 and the %LANDLOCK_ACCESS_NET_BIND_TCP
 	 * right means that requesting to bind on port 0 is allowed and it will
-	 * automatically translate to binding on the related port range.
+	 * automatically translate to binding on a kernel-assigned ephemeral
+	 * port.
 	 */
 	__u64 port;
 };
@@ -342,13 +344,12 @@ struct landlock_net_port_attr {
  * These flags enable to restrict a sandboxed process to a set of network
  * actions.
  *
- * This is supported since Landlock ABI version 4.
- *
  * The following access rights apply to TCP port numbers:
  *
- * - %LANDLOCK_ACCESS_NET_BIND_TCP: Bind a TCP socket to a local port.
- * - %LANDLOCK_ACCESS_NET_CONNECT_TCP: Connect an active TCP socket to
- *   a remote port.
+ * - %LANDLOCK_ACCESS_NET_BIND_TCP: Bind TCP sockets to the given local
+ *   port. Support added in Landlock ABI version 4.
+ * - %LANDLOCK_ACCESS_NET_CONNECT_TCP: Connect TCP sockets to the given
+ *   remote port. Support added in Landlock ABI version 4.
  */
 /* clang-format off */
 #define LANDLOCK_ACCESS_NET_BIND_TCP			(1ULL << 0)
-- 
cgit v1.2.3


From 136f1e168f4941021565f8c10ff4bb81b1f13f2c Mon Sep 17 00:00:00 2001
From: "Matthieu Baerts (NGI0)" <matttbe@kernel.org>
Date: Thu, 5 Feb 2026 18:34:23 +0100
Subject: mptcp: fix kdoc warnings

The following warnings were visible:

  $ ./scripts/kernel-doc -Wall -none \
        net/mptcp/ include/net/mptcp.h include/uapi/linux/mptcp*.h \
        include/trace/events/mptcp.h
  Warning: net/mptcp/token.c:108 No description found for return value of 'mptcp_token_new_request'
  Warning: net/mptcp/token.c:151 No description found for return value of 'mptcp_token_new_connect'
  Warning: net/mptcp/token.c:246 No description found for return value of 'mptcp_token_get_sock'
  Warning: net/mptcp/token.c:298 No description found for return value of 'mptcp_token_iter_next'
  Warning: net/mptcp/protocol.c:4431 No description found for return value of 'mptcp_splice_read'
  Warning: include/uapi/linux/mptcp_pm.h:13 missing initial short description on line:
   * enum mptcp_event_type

Address all of them: either by using the 'Return:' keyword, or by adding
a missing initial short description.

The MPTCP CI will soon report issues with kdoc to avoid introducing new
issues and being flagged by the Netdev CI.

Reviewed-by: Geliang Tang <geliang@kernel.org>
Reviewed-by: Randy Dunlap <rdunlap@infradead.org>
Signed-off-by: Matthieu Baerts (NGI0) <matttbe@kernel.org>
Link: https://patch.msgid.link/20260205-net-mptcp-misc-fixes-6-19-rc8-v2-3-c2720ce75c34@kernel.org
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 Documentation/netlink/specs/mptcp_pm.yaml |  1 +
 include/uapi/linux/mptcp_pm.h             |  2 +-
 net/mptcp/token.c                         | 16 +++++++++-------
 3 files changed, 11 insertions(+), 8 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/Documentation/netlink/specs/mptcp_pm.yaml b/Documentation/netlink/specs/mptcp_pm.yaml
index ba30a40b9dbf..39f3facc38e5 100644
--- a/Documentation/netlink/specs/mptcp_pm.yaml
+++ b/Documentation/netlink/specs/mptcp_pm.yaml
@@ -15,6 +15,7 @@ definitions:
     type: enum
     name: event-type
     enum-name: mptcp-event-type
+    doc: Netlink MPTCP event types
     name-prefix: mptcp-event-
     entries:
       -
diff --git a/include/uapi/linux/mptcp_pm.h b/include/uapi/linux/mptcp_pm.h
index c97d060ee90b..fe9863d75350 100644
--- a/include/uapi/linux/mptcp_pm.h
+++ b/include/uapi/linux/mptcp_pm.h
@@ -11,7 +11,7 @@
 #define MPTCP_PM_VER	1
 
 /**
- * enum mptcp_event_type
+ * enum mptcp_event_type - Netlink MPTCP event types
  * @MPTCP_EVENT_UNSPEC: unused event
  * @MPTCP_EVENT_CREATED: A new MPTCP connection has been created. It is the
  *   good time to allocate memory and send ADD_ADDR if needed. Depending on the
diff --git a/net/mptcp/token.c b/net/mptcp/token.c
index 5bb924534387..f1a50f367add 100644
--- a/net/mptcp/token.c
+++ b/net/mptcp/token.c
@@ -103,7 +103,7 @@ static void mptcp_crypto_key_gen_sha(u64 *key, u32 *token, u64 *idsn)
  * It creates a unique token to identify the new mptcp connection,
  * a secret local key and the initial data sequence number (idsn).
  *
- * Returns 0 on success.
+ * Return: 0 on success.
  */
 int mptcp_token_new_request(struct request_sock *req)
 {
@@ -146,7 +146,7 @@ int mptcp_token_new_request(struct request_sock *req)
  * the computed token at a later time, this is needed to process
  * join requests.
  *
- * returns 0 on success.
+ * Return: 0 on success.
  */
 int mptcp_token_new_connect(struct sock *ssk)
 {
@@ -241,7 +241,7 @@ found:
  * This function returns the mptcp connection structure with the given token.
  * A reference count on the mptcp socket returned is taken.
  *
- * returns NULL if no connection with the given token value exists.
+ * Return: NULL if no connection with the given token value exists.
  */
 struct mptcp_sock *mptcp_token_get_sock(struct net *net, u32 token)
 {
@@ -288,11 +288,13 @@ EXPORT_SYMBOL_GPL(mptcp_token_get_sock);
  * @s_slot: start slot number
  * @s_num: start number inside the given lock
  *
- * This function returns the first mptcp connection structure found inside the
- * token container starting from the specified position, or NULL.
+ * Description:
+ *   On successful iteration, the iterator is moved to the next position and a
+ *   reference to the returned socket is acquired.
  *
- * On successful iteration, the iterator is moved to the next position and
- * a reference to the returned socket is acquired.
+ * Return:
+ *   The first mptcp connection structure found inside the token container
+ *   starting from the specified position, or NULL.
  */
 struct mptcp_sock *mptcp_token_iter_next(const struct net *net, long *s_slot,
 					 long *s_num)
-- 
cgit v1.2.3


From 90079798f1d748e97c74e23736491543577b8aee Mon Sep 17 00:00:00 2001
From: Arnd Bergmann <arnd@arndb.de>
Date: Mon, 2 Feb 2026 10:59:00 +0100
Subject: delayacct: fix uapi timespec64 definition

The custom definition of 'struct timespec64' is incompatible with both the
kernel's internal definition and the glibc type, at least on big-endian
targets that have the tv_nsec field in a different place, and the
definition clashes with any userspace that also defines a timespec64
structure.

Running the header check with -Wpadded enabled produces this output that
warns about the incorrect padding:

usr/include/linux/taskstats.h:25:1: error: padding struct size to alignment boundary with 4 bytes [-Werror=padded]

Remove the hack and instead use the regular __kernel_timespec type that is
meant to be used in uapi definitions.

Link: https://lkml.kernel.org/r/20260202095906.1344100-1-arnd@kernel.org
Fixes: 29b63f6eff0e ("delayacct: add timestamp of delay max")
Signed-off-by: Arnd Bergmann <arnd@arndb.de>
Cc: Fan Yu <fan.yu9@zte.com.cn>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: xu xin <xu.xin16@zte.com.cn>
Cc: Yang Yang <yang.yang29@zte.com.cn>
Cc: Balbir Singh <bsingharora@gmail.com>
Cc: Jiang Kun <jiang.kun2@zte.com.cn>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/uapi/linux/taskstats.h | 27 +++++++++------------------
 kernel/delayacct.c             |  6 ++++--
 2 files changed, 13 insertions(+), 20 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/taskstats.h b/include/uapi/linux/taskstats.h
index 1b31e8e14d2f..3ae25f3ce067 100644
--- a/include/uapi/linux/taskstats.h
+++ b/include/uapi/linux/taskstats.h
@@ -18,16 +18,7 @@
 #define _LINUX_TASKSTATS_H
 
 #include <linux/types.h>
-#ifdef __KERNEL__
-#include <linux/time64.h>
-#else
-#ifndef _LINUX_TIME64_H
-struct timespec64 {
-	__s64   tv_sec;         /* seconds */
-	long    tv_nsec;        /* nanoseconds */
-};
-#endif
-#endif
+#include <linux/time_types.h>
 
 /* Format for per-task data returned to userland when
  *	- a task exits
@@ -242,14 +233,14 @@ struct taskstats {
 	__u64	irq_delay_min;
 
 	/*v17: delay max timestamp record*/
-	struct timespec64 cpu_delay_max_ts;
-	struct timespec64 blkio_delay_max_ts;
-	struct timespec64 swapin_delay_max_ts;
-	struct timespec64 freepages_delay_max_ts;
-	struct timespec64 thrashing_delay_max_ts;
-	struct timespec64 compact_delay_max_ts;
-	struct timespec64 wpcopy_delay_max_ts;
-	struct timespec64 irq_delay_max_ts;
+	struct __kernel_timespec cpu_delay_max_ts;
+	struct __kernel_timespec blkio_delay_max_ts;
+	struct __kernel_timespec swapin_delay_max_ts;
+	struct __kernel_timespec freepages_delay_max_ts;
+	struct __kernel_timespec thrashing_delay_max_ts;
+	struct __kernel_timespec compact_delay_max_ts;
+	struct __kernel_timespec wpcopy_delay_max_ts;
+	struct __kernel_timespec irq_delay_max_ts;
 };
 
 
diff --git a/kernel/delayacct.c b/kernel/delayacct.c
index d58ffc63bcba..2e55c493c98b 100644
--- a/kernel/delayacct.c
+++ b/kernel/delayacct.c
@@ -18,7 +18,8 @@
 do { \
 	d->type##_delay_max = tsk->delays->type##_delay_max; \
 	d->type##_delay_min = tsk->delays->type##_delay_min; \
-	d->type##_delay_max_ts = tsk->delays->type##_delay_max_ts; \
+	d->type##_delay_max_ts.tv_sec = tsk->delays->type##_delay_max_ts.tv_sec; \
+	d->type##_delay_max_ts.tv_nsec = tsk->delays->type##_delay_max_ts.tv_nsec; \
 	tmp = d->type##_delay_total + tsk->delays->type##_delay; \
 	d->type##_delay_total = (tmp < d->type##_delay_total) ? 0 : tmp; \
 	d->type##_count += tsk->delays->type##_count; \
@@ -175,7 +176,8 @@ int delayacct_add_tsk(struct taskstats *d, struct task_struct *tsk)
 
 	d->cpu_delay_max = tsk->sched_info.max_run_delay;
 	d->cpu_delay_min = tsk->sched_info.min_run_delay;
-	d->cpu_delay_max_ts = tsk->sched_info.max_run_delay_ts;
+	d->cpu_delay_max_ts.tv_sec = tsk->sched_info.max_run_delay_ts.tv_sec;
+	d->cpu_delay_max_ts.tv_nsec = tsk->sched_info.max_run_delay_ts.tv_nsec;
 	tmp = (s64)d->cpu_delay_total + t2;
 	d->cpu_delay_total = (tmp < (s64)d->cpu_delay_total) ? 0 : tmp;
 	tmp = (s64)d->cpu_run_virtual_total + t3;
-- 
cgit v1.2.3


From ebcff9dacaf2c1418f8bc927388186d7d3674603 Mon Sep 17 00:00:00 2001
From: Arnd Bergmann <arnd@arndb.de>
Date: Mon, 2 Feb 2026 23:48:07 +0100
Subject: vduse: avoid adding implicit padding

The vduse_iova_range_v2 and vduse_iotlb_entry_v2 structures are both
defined in a way that adds implicit padding and is incompatible between
i386 and x86_64 userspace because of the different structure alignment
requirements. Building the header with -Wpadded shows these new warnings:

vduse.h:305:1: error: padding struct size to alignment boundary with 4 bytes [-Werror=padded]
vduse.h:374:1: error: padding struct size to alignment boundary with 4 bytes [-Werror=padded]

Change the amount of padding in these two structures to align them to
64 bit words and avoid those problems. Since the v1 vduse_iotlb_entry
already has an inconsistent size, do not attempt to reuse the structure
but rather list the members individually, with a fixed amount of
padding.

Fixes: 079212f6877e ("vduse: add vq group asid support")
Signed-off-by: Arnd Bergmann <arnd@arndb.de>
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
Message-Id: <20260202224835.559538-1-arnd@kernel.org>
---
 drivers/vdpa/vdpa_user/vduse_dev.c | 40 +++++++++++++-------------------------
 include/uapi/linux/vduse.h         |  9 +++++++--
 2 files changed, 21 insertions(+), 28 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/drivers/vdpa/vdpa_user/vduse_dev.c b/drivers/vdpa/vdpa_user/vduse_dev.c
index 73d1d517dc6c..405d59610f76 100644
--- a/drivers/vdpa/vdpa_user/vduse_dev.c
+++ b/drivers/vdpa/vdpa_user/vduse_dev.c
@@ -1301,7 +1301,7 @@ static int vduse_dev_iotlb_entry(struct vduse_dev *dev,
 	int r = -EINVAL;
 	struct vhost_iotlb_map *map;
 
-	if (entry->v1.start > entry->v1.last || entry->asid >= dev->nas)
+	if (entry->start > entry->last || entry->asid >= dev->nas)
 		return -EINVAL;
 
 	asid = array_index_nospec(entry->asid, dev->nas);
@@ -1312,18 +1312,18 @@ static int vduse_dev_iotlb_entry(struct vduse_dev *dev,
 
 	spin_lock(&dev->as[asid].domain->iotlb_lock);
 	map = vhost_iotlb_itree_first(dev->as[asid].domain->iotlb,
-				      entry->v1.start, entry->v1.last);
+				      entry->start, entry->last);
 	if (map) {
 		if (f) {
 			const struct vdpa_map_file *map_file;
 
 			map_file = (struct vdpa_map_file *)map->opaque;
-			entry->v1.offset = map_file->offset;
+			entry->offset = map_file->offset;
 			*f = get_file(map_file->file);
 		}
-		entry->v1.start = map->start;
-		entry->v1.last = map->last;
-		entry->v1.perm = map->perm;
+		entry->start = map->start;
+		entry->last = map->last;
+		entry->perm = map->perm;
 		if (capability) {
 			*capability = 0;
 
@@ -1363,14 +1363,8 @@ static long vduse_dev_ioctl(struct file *file, unsigned int cmd,
 			break;
 
 		ret = -EFAULT;
-		if (cmd == VDUSE_IOTLB_GET_FD2) {
-			if (copy_from_user(&entry, argp, sizeof(entry)))
-				break;
-		} else {
-			if (copy_from_user(&entry.v1, argp,
-					   sizeof(entry.v1)))
-				break;
-		}
+		if (copy_from_user(&entry, argp, _IOC_SIZE(cmd)))
+			break;
 
 		ret = -EINVAL;
 		if (!is_mem_zero((const char *)entry.reserved,
@@ -1385,19 +1379,13 @@ static long vduse_dev_ioctl(struct file *file, unsigned int cmd,
 		if (!f)
 			break;
 
-		if (cmd == VDUSE_IOTLB_GET_FD2)
-			ret = copy_to_user(argp, &entry,
-					   sizeof(entry));
-		else
-			ret = copy_to_user(argp, &entry.v1,
-					   sizeof(entry.v1));
-
+		ret = copy_to_user(argp, &entry, _IOC_SIZE(cmd));
 		if (ret) {
 			ret = -EFAULT;
 			fput(f);
 			break;
 		}
-		ret = receive_fd(f, NULL, perm_to_file_flags(entry.v1.perm));
+		ret = receive_fd(f, NULL, perm_to_file_flags(entry.perm));
 		fput(f);
 		break;
 	}
@@ -1603,16 +1591,16 @@ static long vduse_dev_ioctl(struct file *file, unsigned int cmd,
 		} else if (info.asid >= dev->nas)
 			break;
 
-		entry.v1.start = info.start;
-		entry.v1.last = info.last;
+		entry.start = info.start;
+		entry.last = info.last;
 		entry.asid = info.asid;
 		ret = vduse_dev_iotlb_entry(dev, &entry, NULL,
 					    &info.capability);
 		if (ret < 0)
 			break;
 
-		info.start = entry.v1.start;
-		info.last = entry.v1.last;
+		info.start = entry.start;
+		info.last = entry.last;
 		info.asid = entry.asid;
 
 		ret = -EFAULT;
diff --git a/include/uapi/linux/vduse.h b/include/uapi/linux/vduse.h
index 68b4287f9fac..361eea511c21 100644
--- a/include/uapi/linux/vduse.h
+++ b/include/uapi/linux/vduse.h
@@ -293,9 +293,13 @@ struct vduse_iova_info {
  * Structure used by VDUSE_IOTLB_GET_FD2 ioctl to find an overlapped IOVA region.
  */
 struct vduse_iotlb_entry_v2 {
-	struct vduse_iotlb_entry v1;
+	__u64 offset;
+	__u64 start;
+	__u64 last;
+	__u8 perm;
+	__u8 padding[7];
 	__u32 asid;
-	__u32 reserved[12];
+	__u32 reserved[11];
 };
 
 /*
@@ -365,6 +369,7 @@ struct vduse_iova_range_v2 {
 	__u64 start;
 	__u64 last;
 	__u32 asid;
+	__u32 padding;
 };
 
 /**
-- 
cgit v1.2.3


From c29214677a9fc1a3a4ee65e189afeb5fd10d676f Mon Sep 17 00:00:00 2001
From: Pavel Begunkov <asml.silence@gmail.com>
Date: Sun, 15 Feb 2026 21:34:28 +0000
Subject: io_uring/query: return support for custom rx page size

Add an ability to query if the zcrx rx page size setting is available.

Note, even when the API is supported by io_uring, the registration can
still get rejected for various reasons, e.g. when the NIC or the driver
doesn't support it, when the particular specified size is unsupported,
when the memory area doesn't satisfy all requirements, etc.

Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/uapi/linux/io_uring.h       | 8 ++++++++
 include/uapi/linux/io_uring/query.h | 3 ++-
 io_uring/query.c                    | 2 +-
 3 files changed, 11 insertions(+), 2 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h
index da5156954731..c462bdf3c42c 100644
--- a/include/uapi/linux/io_uring.h
+++ b/include/uapi/linux/io_uring.h
@@ -1090,6 +1090,14 @@ enum zcrx_reg_flags {
 	ZCRX_REG_IMPORT	= 1,
 };
 
+enum zcrx_features {
+	/*
+	 * The user can ask for the desired rx page size by passing the
+	 * value in struct io_uring_zcrx_ifq_reg::rx_buf_len.
+	 */
+	ZCRX_FEATURE_RX_PAGE_SIZE	= 1 << 0,
+};
+
 /*
  * Argument for IORING_REGISTER_ZCRX_IFQ
  */
diff --git a/include/uapi/linux/io_uring/query.h b/include/uapi/linux/io_uring/query.h
index 2456e6c5ebb5..0b6248175e26 100644
--- a/include/uapi/linux/io_uring/query.h
+++ b/include/uapi/linux/io_uring/query.h
@@ -50,7 +50,8 @@ struct io_uring_query_zcrx {
 	__u64 area_flags;
 	/* The number of supported ZCRX_CTRL_* opcodes */
 	__u32 nr_ctrl_opcodes;
-	__u32 __resv1;
+	/* Bitmask of ZCRX_FEATURE_* indicating which features are available */
+	__u32 features;
 	/* The refill ring header size */
 	__u32 rq_hdr_size;
 	/* The alignment for the header */
diff --git a/io_uring/query.c b/io_uring/query.c
index abdd6f3e1223..63cc30c9803d 100644
--- a/io_uring/query.c
+++ b/io_uring/query.c
@@ -39,7 +39,7 @@ static ssize_t io_query_zcrx(union io_query_data *data)
 	e->nr_ctrl_opcodes = __ZCRX_CTRL_LAST;
 	e->rq_hdr_size = sizeof(struct io_uring);
 	e->rq_hdr_alignment = L1_CACHE_BYTES;
-	e->__resv1 = 0;
+	e->features = ZCRX_FEATURE_RX_PAGE_SIZE;
 	e->__resv2 = 0;
 	return sizeof(*e);
 }
-- 
cgit v1.2.3


From 6b34f8edf8b807b7f87901623aa52dfa1b29ef93 Mon Sep 17 00:00:00 2001
From: Pavel Begunkov <asml.silence@gmail.com>
Date: Sun, 15 Feb 2026 21:38:09 +0000
Subject: io_uring/query: add query.h copyright notice

Add a copyright notice to io_uring's query uapi header.

Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/uapi/linux/io_uring/query.h | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/io_uring/query.h b/include/uapi/linux/io_uring/query.h
index 0b6248175e26..95500759cc13 100644
--- a/include/uapi/linux/io_uring/query.h
+++ b/include/uapi/linux/io_uring/query.h
@@ -1,6 +1,9 @@
 /* SPDX-License-Identifier: (GPL-2.0 WITH Linux-syscall-note) OR MIT */
 /*
  * Header file for the io_uring query interface.
+ *
+ * Copyright (C) 2026 Pavel Begunkov <asml.silence@gmail.com>
+ * Copyright (C) Meta Platforms, Inc.
  */
 #ifndef LINUX_IO_URING_QUERY_H
 #define LINUX_IO_URING_QUERY_H
-- 
cgit v1.2.3


From be3573124e630736d2d39650b12f5ef220b47ac1 Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Tue, 10 Feb 2026 10:00:44 -0700
Subject: io_uring/bpf_filter: pass in expected filter payload size

It's quite possible that the payloads attached to some opcodes, like
IORING_OP_OPENAT/OPENAT2 or IORING_OP_SOCKET, can change over time. For
example, on the openat/openat2 side, the
struct open_how argument is extensible, and could be extended in the
future to allow further arguments to be passed in.

Allow registration of a cBPF filter to give the size of the filter as
seen by userspace. If that filter is for an opcode that takes extra
payload data, allow it if the application payload expectation is the
same size as the kernel's. If that is the case, the kernel supports
filtering on the payload that the application expects. If the size
differs, the behavior depends on the IO_URING_BPF_FILTER_SZ_STRICT flag:

1) If IO_URING_BPF_FILTER_SZ_STRICT is set and the size expectation
   differs, fail the attempt to load the filter.

2) If IO_URING_BPF_FILTER_SZ_STRICT isn't set, allow the filter if
   the userspace pdu size is smaller than what the kernel offers.

3) Regardless of IO_URING_BPF_FILTER_SZ_STRICT, fail loading the filter
   if the userspace pdu size is bigger than what the kernel supports.

An attempt to load a filter that fails due to sizing will error with -EMSGSIZE.
For that error, the registration struct will have filter->pdu_size
populated with the pdu size that the kernel uses.

Reported-by: Christian Brauner <brauner@kernel.org>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/uapi/linux/io_uring/bpf_filter.h |  8 +++-
 io_uring/bpf_filter.c                    | 65 ++++++++++++++++++++++++--------
 2 files changed, 56 insertions(+), 17 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/io_uring/bpf_filter.h b/include/uapi/linux/io_uring/bpf_filter.h
index 220351b81bc0..1b461d792a7b 100644
--- a/include/uapi/linux/io_uring/bpf_filter.h
+++ b/include/uapi/linux/io_uring/bpf_filter.h
@@ -35,13 +35,19 @@ enum {
 	 * If set, any currently unset opcode will have a deny filter attached
 	 */
 	IO_URING_BPF_FILTER_DENY_REST	= 1,
+	/*
+	 * If set, if kernel and application don't agree on pdu_size for
+	 * the given opcode, fail the registration of the filter.
+	 */
+	IO_URING_BPF_FILTER_SZ_STRICT	= 2,
 };
 
 struct io_uring_bpf_filter {
 	__u32	opcode;		/* io_uring opcode to filter */
 	__u32	flags;
 	__u32	filter_len;	/* number of BPF instructions */
-	__u32	resv;
+	__u8	pdu_size;	/* expected pdu size for opcode */
+	__u8	resv[3];
 	__u64	filter_ptr;	/* pointer to BPF filter */
 	__u64	resv2[5];
 };
diff --git a/io_uring/bpf_filter.c b/io_uring/bpf_filter.c
index 8ac7d06de122..28a23e92ee81 100644
--- a/io_uring/bpf_filter.c
+++ b/io_uring/bpf_filter.c
@@ -308,36 +308,69 @@ err:
 	return ERR_PTR(-EBUSY);
 }
 
-#define IO_URING_BPF_FILTER_FLAGS	IO_URING_BPF_FILTER_DENY_REST
+#define IO_URING_BPF_FILTER_FLAGS	(IO_URING_BPF_FILTER_DENY_REST | \
+					 IO_URING_BPF_FILTER_SZ_STRICT)
 
-int io_register_bpf_filter(struct io_restriction *res,
-			   struct io_uring_bpf __user *arg)
+static int io_bpf_filter_import(struct io_uring_bpf *reg,
+				struct io_uring_bpf __user *arg)
 {
-	struct io_bpf_filters *filters, *old_filters = NULL;
-	struct io_bpf_filter *filter, *old_filter;
-	struct io_uring_bpf reg;
-	struct bpf_prog *prog;
-	struct sock_fprog fprog;
+	const struct io_issue_def *def;
 	int ret;
 
-	if (copy_from_user(&reg, arg, sizeof(reg)))
+	if (copy_from_user(reg, arg, sizeof(*reg)))
 		return -EFAULT;
-	if (reg.cmd_type != IO_URING_BPF_CMD_FILTER)
+	if (reg->cmd_type != IO_URING_BPF_CMD_FILTER)
 		return -EINVAL;
-	if (reg.cmd_flags || reg.resv)
+	if (reg->cmd_flags || reg->resv)
 		return -EINVAL;
 
-	if (reg.filter.opcode >= IORING_OP_LAST)
+	if (reg->filter.opcode >= IORING_OP_LAST)
 		return -EINVAL;
-	if (reg.filter.flags & ~IO_URING_BPF_FILTER_FLAGS)
+	if (reg->filter.flags & ~IO_URING_BPF_FILTER_FLAGS)
 		return -EINVAL;
-	if (reg.filter.resv)
+	if (!mem_is_zero(reg->filter.resv, sizeof(reg->filter.resv)))
 		return -EINVAL;
-	if (!mem_is_zero(reg.filter.resv2, sizeof(reg.filter.resv2)))
+	if (!mem_is_zero(reg->filter.resv2, sizeof(reg->filter.resv2)))
 		return -EINVAL;
-	if (!reg.filter.filter_len || reg.filter.filter_len > BPF_MAXINSNS)
+	if (!reg->filter.filter_len || reg->filter.filter_len > BPF_MAXINSNS)
 		return -EINVAL;
 
+	/* Verify filter size */
+	def = &io_issue_defs[array_index_nospec(reg->filter.opcode, IORING_OP_LAST)];
+
+	/* same size, always ok */
+	ret = 0;
+	if (reg->filter.pdu_size == def->filter_pdu_size)
+		;
+	/* size differs, fail in strict mode */
+	else if (reg->filter.flags & IO_URING_BPF_FILTER_SZ_STRICT)
+		ret = -EMSGSIZE;
+	/* userspace filter is bigger, always disallow */
+	else if (reg->filter.pdu_size > def->filter_pdu_size)
+		ret = -EMSGSIZE;
+
+	/* copy back kernel filter size */
+	reg->filter.pdu_size = def->filter_pdu_size;
+	if (copy_to_user(&arg->filter, &reg->filter, sizeof(reg->filter)))
+		return -EFAULT;
+
+	return ret;
+}
+
+int io_register_bpf_filter(struct io_restriction *res,
+			   struct io_uring_bpf __user *arg)
+{
+	struct io_bpf_filters *filters, *old_filters = NULL;
+	struct io_bpf_filter *filter, *old_filter;
+	struct io_uring_bpf reg;
+	struct bpf_prog *prog;
+	struct sock_fprog fprog;
+	int ret;
+
+	ret = io_bpf_filter_import(&reg, arg);
+	if (ret)
+		return ret;
+
 	fprog.len = reg.filter.filter_len;
 	fprog.filter = u64_to_user_ptr(reg.filter.filter_ptr);
 
-- 
cgit v1.2.3


From 4edd4ba71ce0df015303dba75ea9d20d1a217546 Mon Sep 17 00:00:00 2001
From: Phil Sutter <phil@nwl.cc>
Date: Sat, 14 Feb 2026 15:54:06 +0100
Subject: include: uapi: netfilter_bridge.h: Cover for musl libc

Musl defines its own struct ethhdr and thus defines __UAPI_DEF_ETHHDR to
zero. To avoid struct redefinition errors, user space is therefore
supposed to include netinet/if_ether.h before (or instead of)
linux/if_ether.h. To relieve them from this burden, include the libc
header here if not building for kernel space.

Reported-by: Alyssa Ross <hi@alyssa.is>
Suggested-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Phil Sutter <phil@nwl.cc>
Signed-off-by: Florian Westphal <fw@strlen.de>
---
 include/uapi/linux/netfilter_bridge.h | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/netfilter_bridge.h b/include/uapi/linux/netfilter_bridge.h
index f6e8d1e05c97..758de72b2764 100644
--- a/include/uapi/linux/netfilter_bridge.h
+++ b/include/uapi/linux/netfilter_bridge.h
@@ -5,6 +5,10 @@
 /* bridge-specific defines for netfilter. 
  */
 
+#ifndef __KERNEL__
+#include <netinet/if_ether.h>	/* for __UAPI_DEF_ETHHDR if defined */
+#endif
+
 #include <linux/in.h>
 #include <linux/netfilter.h>
 #include <linux/if_ether.h>
-- 
cgit v1.2.3


From a284dbc96a47891a7a595a1c81b1e2da4d309cf6 Mon Sep 17 00:00:00 2001
From: Muminul Islam <muislam@microsoft.com>
Date: Wed, 18 Feb 2026 14:47:59 +0000
Subject: mshv: Add nested virtualization creation flag

Introduce HV_PARTITION_CREATION_FLAG_NESTED_VIRTUALIZATION_CAPABLE to
indicate support for nested virtualization during partition creation.

This enables clearer configuration and capability checks for nested
virtualization scenarios.

Signed-off-by: Stanislav Kinsburskii <skinsburskii@linux.microsoft.com>
Signed-off-by: Muminul Islam <muislam@microsoft.com>
Signed-off-by: Wei Liu <wei.liu@kernel.org>
---
 drivers/hv/mshv_root_main.c | 2 ++
 include/hyperv/hvhdk.h      | 1 +
 include/uapi/linux/mshv.h   | 1 +
 3 files changed, 4 insertions(+)

(limited to 'include/uapi/linux')

diff --git a/drivers/hv/mshv_root_main.c b/drivers/hv/mshv_root_main.c
index e5d94398528e..e490f8e5a8a5 100644
--- a/drivers/hv/mshv_root_main.c
+++ b/drivers/hv/mshv_root_main.c
@@ -1947,6 +1947,8 @@ static long mshv_ioctl_process_pt_flags(void __user *user_arg, u64 *pt_flags,
 		*pt_flags |= HV_PARTITION_CREATION_FLAG_X2APIC_CAPABLE;
 	if (args.pt_flags & BIT_ULL(MSHV_PT_BIT_GPA_SUPER_PAGES))
 		*pt_flags |= HV_PARTITION_CREATION_FLAG_GPA_SUPER_PAGES_ENABLED;
+	if (args.pt_flags & BIT(MSHV_PT_BIT_NESTED_VIRTUALIZATION))
+		*pt_flags |= HV_PARTITION_CREATION_FLAG_NESTED_VIRTUALIZATION_CAPABLE;
 
 	isol_props->as_uint64 = 0;
 
diff --git a/include/hyperv/hvhdk.h b/include/hyperv/hvhdk.h
index 79d1f16a850a..f139c7c5bb2d 100644
--- a/include/hyperv/hvhdk.h
+++ b/include/hyperv/hvhdk.h
@@ -335,6 +335,7 @@ union hv_partition_isolation_properties {
 #define HV_PARTITION_ISOLATION_HOST_TYPE_RESERVED   0x2
 
 /* Note: Exo partition is enabled by default */
+#define HV_PARTITION_CREATION_FLAG_NESTED_VIRTUALIZATION_CAPABLE	BIT(1)
 #define HV_PARTITION_CREATION_FLAG_GPA_SUPER_PAGES_ENABLED		BIT(4)
 #define HV_PARTITION_CREATION_FLAG_EXO_PARTITION			BIT(8)
 #define HV_PARTITION_CREATION_FLAG_LAPIC_ENABLED			BIT(13)
diff --git a/include/uapi/linux/mshv.h b/include/uapi/linux/mshv.h
index dee3ece28ce5..7ef5dd67a232 100644
--- a/include/uapi/linux/mshv.h
+++ b/include/uapi/linux/mshv.h
@@ -27,6 +27,7 @@ enum {
 	MSHV_PT_BIT_X2APIC,
 	MSHV_PT_BIT_GPA_SUPER_PAGES,
 	MSHV_PT_BIT_CPU_AND_XSAVE_FEATURES,
+	MSHV_PT_BIT_NESTED_VIRTUALIZATION,
 	MSHV_PT_BIT_COUNT,
 };
 
-- 
cgit v1.2.3


From 8927a108a7662eb83eb667bc0c5a0633397122b1 Mon Sep 17 00:00:00 2001
From: Anatol Belski <anbelski@linux.microsoft.com>
Date: Wed, 18 Feb 2026 14:48:02 +0000
Subject: mshv: Add SMT_ENABLED_GUEST partition creation flag

Add support for HV_PARTITION_CREATION_FLAG_SMT_ENABLED_GUEST
to allow userspace VMMs to enable SMT for guest partitions.

Expose this via new MSHV_PT_BIT_SMT_ENABLED_GUEST flag in the UAPI.

Without this flag, the hypervisor schedules guest VPs incorrectly,
making SMT unusable.

Signed-off-by: Anatol Belski <anbelski@linux.microsoft.com>
Signed-off-by: Wei Liu <wei.liu@kernel.org>
---
 drivers/hv/mshv_root_main.c | 2 ++
 include/hyperv/hvhdk.h      | 1 +
 include/uapi/linux/mshv.h   | 1 +
 3 files changed, 4 insertions(+)

(limited to 'include/uapi/linux')

diff --git a/drivers/hv/mshv_root_main.c b/drivers/hv/mshv_root_main.c
index e490f8e5a8a5..192467a25f66 100644
--- a/drivers/hv/mshv_root_main.c
+++ b/drivers/hv/mshv_root_main.c
@@ -1949,6 +1949,8 @@ static long mshv_ioctl_process_pt_flags(void __user *user_arg, u64 *pt_flags,
 		*pt_flags |= HV_PARTITION_CREATION_FLAG_GPA_SUPER_PAGES_ENABLED;
 	if (args.pt_flags & BIT(MSHV_PT_BIT_NESTED_VIRTUALIZATION))
 		*pt_flags |= HV_PARTITION_CREATION_FLAG_NESTED_VIRTUALIZATION_CAPABLE;
+	if (args.pt_flags & BIT(MSHV_PT_BIT_SMT_ENABLED_GUEST))
+		*pt_flags |= HV_PARTITION_CREATION_FLAG_SMT_ENABLED_GUEST;
 
 	isol_props->as_uint64 = 0;
 
diff --git a/include/hyperv/hvhdk.h b/include/hyperv/hvhdk.h
index f139c7c5bb2d..245f3db53bf1 100644
--- a/include/hyperv/hvhdk.h
+++ b/include/hyperv/hvhdk.h
@@ -335,6 +335,7 @@ union hv_partition_isolation_properties {
 #define HV_PARTITION_ISOLATION_HOST_TYPE_RESERVED   0x2
 
 /* Note: Exo partition is enabled by default */
+#define HV_PARTITION_CREATION_FLAG_SMT_ENABLED_GUEST			BIT(0)
 #define HV_PARTITION_CREATION_FLAG_NESTED_VIRTUALIZATION_CAPABLE	BIT(1)
 #define HV_PARTITION_CREATION_FLAG_GPA_SUPER_PAGES_ENABLED		BIT(4)
 #define HV_PARTITION_CREATION_FLAG_EXO_PARTITION			BIT(8)
diff --git a/include/uapi/linux/mshv.h b/include/uapi/linux/mshv.h
index 7ef5dd67a232..e0645a34b55b 100644
--- a/include/uapi/linux/mshv.h
+++ b/include/uapi/linux/mshv.h
@@ -28,6 +28,7 @@ enum {
 	MSHV_PT_BIT_GPA_SUPER_PAGES,
 	MSHV_PT_BIT_CPU_AND_XSAVE_FEATURES,
 	MSHV_PT_BIT_NESTED_VIRTUALIZATION,
+	MSHV_PT_BIT_SMT_ENABLED_GUEST,
 	MSHV_PT_BIT_COUNT,
 };
 
-- 
cgit v1.2.3