From b5709f6d26d65f6bb9711f4b5f98469fd507cb5b Mon Sep 17 00:00:00 2001 From: Amery Hung Date: Wed, 3 Dec 2025 15:37:44 -0800 Subject: bpf: Support associating BPF program with struct_ops Add a new BPF command BPF_PROG_ASSOC_STRUCT_OPS to allow associating a BPF program with a struct_ops map. This command takes a file descriptor of a struct_ops map and a BPF program and set prog->aux->st_ops_assoc to the kdata of the struct_ops map. The command does not accept a struct_ops program nor a non-struct_ops map. Programs of a struct_ops map is automatically associated with the map during map update. If a program is shared between two struct_ops maps, prog->aux->st_ops_assoc will be poisoned to indicate that the associated struct_ops is ambiguous. The pointer, once poisoned, cannot be reset since we have lost track of associated struct_ops. For other program types, the associated struct_ops map, once set, cannot be changed later. This restriction may be lifted in the future if there is a use case. A kernel helper bpf_prog_get_assoc_struct_ops() can be used to retrieve the associated struct_ops pointer. The returned pointer, if not NULL, is guaranteed to be valid and point to a fully updated struct_ops struct. For struct_ops program reused in multiple struct_ops map, the return will be NULL. prog->aux->st_ops_assoc is protected by bumping the refcount for non-struct_ops programs and RCU for struct_ops programs. Since it would be inefficient to track programs associated with a struct_ops map, every non-struct_ops program will bump the refcount of the map to make sure st_ops_assoc stays valid. For a struct_ops program, it is protected by RCU as map_free will wait for an RCU grace period before disassociating the program with the map. The helper must be called in BPF program context or RCU read-side critical section. struct_ops implementers should note that the struct_ops returned may not be initialized nor attached yet. The struct_ops implementer will be responsible for tracking and checking the state of the associated struct_ops map if the use case expects an initialized or attached struct_ops. Signed-off-by: Amery Hung Signed-off-by: Andrii Nakryiko Acked-by: Andrii Nakryiko Acked-by: Martin KaFai Lau Link: https://lore.kernel.org/bpf/20251203233748.668365-3-ameryhung@gmail.com --- include/uapi/linux/bpf.h | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index f8d8513eda27..84ced3ed2d21 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -918,6 +918,16 @@ union bpf_iter_link_info { * Number of bytes read from the stream on success, or -1 if an * error occurred (in which case, *errno* is set appropriately). * + * BPF_PROG_ASSOC_STRUCT_OPS + * Description + * Associate a BPF program with a struct_ops map. The struct_ops + * map is identified by *map_fd* and the BPF program is + * identified by *prog_fd*. + * + * Return + * 0 on success or -1 if an error occurred (in which case, + * *errno* is set appropriately). + * * NOTES * eBPF objects (maps and programs) can be shared between processes. * @@ -974,6 +984,7 @@ enum bpf_cmd { BPF_PROG_BIND_MAP, BPF_TOKEN_CREATE, BPF_PROG_STREAM_READ_BY_FD, + BPF_PROG_ASSOC_STRUCT_OPS, __MAX_BPF_CMD, }; @@ -1894,6 +1905,12 @@ union bpf_attr { __u32 prog_fd; } prog_stream_read; + struct { + __u32 map_fd; + __u32 prog_fd; + __u32 flags; + } prog_assoc_struct_ops; + } __attribute__((aligned(8))); /* The description below is an attempt at providing documentation to eBPF -- cgit v1.2.3 From cc6b66d661fda4fb94c0099dd92b83f8de5c1bf4 Mon Sep 17 00:00:00 2001 From: Zhu Lingshan Date: Thu, 21 Aug 2025 17:22:44 +0800 Subject: amdkfd: introduce new ioctl AMDKFD_IOC_CREATE_PROCESS This commit implemetns a new ioctl AMDKFD_IOC_CREATE_PROCESS that creates a new secondary kfd_progress on the FD. To keep backward compatibility, userspace programs need to invoke this ioctl explicitly on a FD to create a secondary kfd_process which replacing its primary kfd_process. This commit bumps ioctl minor version. Signed-off-by: Zhu Lingshan Reviewed-by: Felix Kuehling Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdkfd/kfd_chardev.c | 45 ++++++++++++++++++++++++++++++++ drivers/gpu/drm/amd/amdkfd/kfd_priv.h | 1 + drivers/gpu/drm/amd/amdkfd/kfd_process.c | 3 +-- include/uapi/linux/kfd_ioctl.h | 8 ++++-- 4 files changed, 53 insertions(+), 4 deletions(-) (limited to 'include/uapi/linux') diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c index 092a2b8aaea1..041237861107 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c @@ -3150,6 +3150,48 @@ out: return r; } +/* userspace programs need to invoke this ioctl explicitly on a FD to + * create a secondary kfd_process which replacing its primary kfd_process + */ +static int kfd_ioctl_create_process(struct file *filep, struct kfd_process *p, void *data) +{ + struct kfd_process *process; + int ret; + + /* Each FD owns only one kfd_process */ + if (p->context_id != KFD_CONTEXT_ID_PRIMARY) + return -EINVAL; + + if (!filep->private_data || !p) + return -EINVAL; + + mutex_lock(&kfd_processes_mutex); + if (p != filep->private_data) { + mutex_unlock(&kfd_processes_mutex); + return -EINVAL; + } + + process = create_process(current, false); + if (IS_ERR(process)) { + mutex_unlock(&kfd_processes_mutex); + return PTR_ERR(process); + } + + filep->private_data = process; + mutex_unlock(&kfd_processes_mutex); + + ret = kfd_create_process_sysfs(process); + if (ret) + pr_warn("Failed to create sysfs entry for the kfd_process"); + + /* Each open() increases kref of the primary kfd_process, + * so we need to reduce it here when we create a new secondary process replacing it + */ + kfd_unref_process(p); + + return 0; +} + #define AMDKFD_IOCTL_DEF(ioctl, _func, _flags) \ [_IOC_NR(ioctl)] = {.cmd = ioctl, .func = _func, .flags = _flags, \ .cmd_drv = 0, .name = #ioctl} @@ -3268,6 +3310,9 @@ static const struct amdkfd_ioctl_desc amdkfd_ioctls[] = { AMDKFD_IOCTL_DEF(AMDKFD_IOC_DBG_TRAP, kfd_ioctl_set_debug_trap, 0), + + AMDKFD_IOCTL_DEF(AMDKFD_IOC_CREATE_PROCESS, + kfd_ioctl_create_process, 0), }; #define AMDKFD_CORE_IOCTL_COUNT ARRAY_SIZE(amdkfd_ioctls) diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h index 399f32689678..12f640a9370a 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h +++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h @@ -1053,6 +1053,7 @@ struct amdkfd_ioctl_desc { }; bool kfd_dev_is_large_bar(struct kfd_node *dev); +struct kfd_process *create_process(const struct task_struct *thread, bool primary); int kfd_process_create_wq(void); void kfd_process_destroy_wq(void); void kfd_cleanup_processes(void); diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process.c b/drivers/gpu/drm/amd/amdkfd/kfd_process.c index c52e56aa9316..b4982da9234b 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_process.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_process.c @@ -68,7 +68,6 @@ static struct workqueue_struct *kfd_restore_wq; static struct kfd_process *find_process(const struct task_struct *thread, bool ref); static void kfd_process_ref_release(struct kref *ref); -static struct kfd_process *create_process(const struct task_struct *thread, bool primary); static void evict_process_worker(struct work_struct *work); static void restore_process_worker(struct work_struct *work); @@ -1582,7 +1581,7 @@ void kfd_process_set_trap_debug_flag(struct qcm_process_device *qpd, * On return the kfd_process is fully operational and will be freed when the * mm is released */ -static struct kfd_process *create_process(const struct task_struct *thread, bool primary) +struct kfd_process *create_process(const struct task_struct *thread, bool primary) { struct kfd_process *process; struct mmu_notifier *mn; diff --git a/include/uapi/linux/kfd_ioctl.h b/include/uapi/linux/kfd_ioctl.h index 5d1727a6d040..84aa24c02715 100644 --- a/include/uapi/linux/kfd_ioctl.h +++ b/include/uapi/linux/kfd_ioctl.h @@ -44,9 +44,10 @@ * - 1.16 - Add contiguous VRAM allocation flag * - 1.17 - Add SDMA queue creation with target SDMA engine ID * - 1.18 - Rename pad in set_memory_policy_args to misc_process_flag + * - 1.19 - Add a new ioctl to craete secondary kfd processes */ #define KFD_IOCTL_MAJOR_VERSION 1 -#define KFD_IOCTL_MINOR_VERSION 18 +#define KFD_IOCTL_MINOR_VERSION 19 struct kfd_ioctl_get_version_args { __u32 major_version; /* from KFD */ @@ -1671,7 +1672,10 @@ struct kfd_ioctl_dbg_trap_args { #define AMDKFD_IOC_DBG_TRAP \ AMDKFD_IOWR(0x26, struct kfd_ioctl_dbg_trap_args) +#define AMDKFD_IOC_CREATE_PROCESS \ + AMDKFD_IO(0x27) + #define AMDKFD_COMMAND_START 0x01 -#define AMDKFD_COMMAND_END 0x27 +#define AMDKFD_COMMAND_END 0x28 #endif -- cgit v1.2.3 From 7a5fb05b5b18e531989aa55b10dfa4be0633207e Mon Sep 17 00:00:00 2001 From: Mario Limonciello Date: Fri, 5 Dec 2025 08:04:41 -0600 Subject: amdkfd: Bump ABI to indicate presence of Trap handler support for expert scheduling commit 0f0c8a6983db ("drm/amdkfd: Trap handler support for expert scheduling mode") introduced support for a trap handler when expert scheduling mode. However userspace needs to know whether or not a trap handler support is present. Bump the KFD IOCTL API so that userspace can key off this to decide. Suggested-by: Stella Laurenzo Fixes: 423888879412 ("drm/amdkfd: Trap handler support for expert scheduling mode") Reviewed-by: Kent Russell Signed-off-by: Mario Limonciello Signed-off-by: Alex Deucher --- include/uapi/linux/kfd_ioctl.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/kfd_ioctl.h b/include/uapi/linux/kfd_ioctl.h index 84aa24c02715..4d0c1a53f9d5 100644 --- a/include/uapi/linux/kfd_ioctl.h +++ b/include/uapi/linux/kfd_ioctl.h @@ -45,9 +45,10 @@ * - 1.17 - Add SDMA queue creation with target SDMA engine ID * - 1.18 - Rename pad in set_memory_policy_args to misc_process_flag * - 1.19 - Add a new ioctl to craete secondary kfd processes + * - 1.20 - Trap handler support for expert scheduling mode available */ #define KFD_IOCTL_MAJOR_VERSION 1 -#define KFD_IOCTL_MINOR_VERSION 19 +#define KFD_IOCTL_MINOR_VERSION 20 struct kfd_ioctl_get_version_args { __u32 major_version; /* from KFD */ -- cgit v1.2.3 From 0e5032237ee5530147fbdf33134297e1490d5ec3 Mon Sep 17 00:00:00 2001 From: Bhavik Sachdev Date: Sat, 29 Nov 2025 14:41:21 +0530 Subject: statmount: accept fd as a parameter Extend `struct mnt_id_req` to take in a fd and introduce STATMOUNT_BY_FD flag. When a valid fd is provided and STATMOUNT_BY_FD is set, statmount will return mountinfo about the mount the fd is on. This even works for "unmounted" mounts (mounts that have been umounted using umount2(mnt, MNT_DETACH)), if you have access to a file descriptor on that mount. These "umounted" mounts will have no mountpoint and no valid mount namespace. Hence, we unset the STATMOUNT_MNT_POINT and STATMOUNT_MNT_NS_ID in statmount.mask for "unmounted" mounts. In case of STATMOUNT_BY_FD, given that we already have access to an fd on the mount, accessing mount information without a capability check seems fine because of the following reasons: - All fs related information is available via fstatfs() without any capability check. - Mount information is also available via /proc/pid/mountinfo (without any capability check). - Given that we have access to a fd on the mount which tells us that we had access to the mount at some point (or someone that had access gave us the fd). So, we should be able to access mount info. Co-developed-by: Pavel Tikhomirov Signed-off-by: Pavel Tikhomirov Signed-off-by: Bhavik Sachdev Link: https://patch.msgid.link/20251129091455.757724-3-b.sachdev1904@gmail.com Acked-by: Andrei Vagin Signed-off-by: Christian Brauner --- fs/namespace.c | 102 +++++++++++++++++++++++++++++---------------- include/uapi/linux/mount.h | 10 ++++- 2 files changed, 76 insertions(+), 36 deletions(-) (limited to 'include/uapi/linux') diff --git a/fs/namespace.c b/fs/namespace.c index f6879f282dae..ec3b16fedd9f 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -5547,31 +5547,49 @@ static int grab_requested_root(struct mnt_namespace *ns, struct path *root) /* locks: namespace_shared */ static int do_statmount(struct kstatmount *s, u64 mnt_id, u64 mnt_ns_id, - struct mnt_namespace *ns) + struct file *mnt_file, struct mnt_namespace *ns) { - struct mount *m; int err; - /* Has the namespace already been emptied? */ - if (mnt_ns_id && mnt_ns_empty(ns)) - return -ENOENT; + if (mnt_file) { + WARN_ON_ONCE(ns != NULL); - s->mnt = lookup_mnt_in_ns(mnt_id, ns); - if (!s->mnt) - return -ENOENT; + s->mnt = mnt_file->f_path.mnt; + ns = real_mount(s->mnt)->mnt_ns; + if (!ns) + /* + * We can't set mount point and mnt_ns_id since we don't have a + * ns for the mount. This can happen if the mount is unmounted + * with MNT_DETACH. + */ + s->mask &= ~(STATMOUNT_MNT_POINT | STATMOUNT_MNT_NS_ID); + } else { + /* Has the namespace already been emptied? */ + if (mnt_ns_id && mnt_ns_empty(ns)) + return -ENOENT; - err = grab_requested_root(ns, &s->root); - if (err) - return err; + s->mnt = lookup_mnt_in_ns(mnt_id, ns); + if (!s->mnt) + return -ENOENT; + } - /* - * Don't trigger audit denials. We just want to determine what - * mounts to show users. - */ - m = real_mount(s->mnt); - if (!is_path_reachable(m, m->mnt.mnt_root, &s->root) && - !ns_capable_noaudit(ns->user_ns, CAP_SYS_ADMIN)) - return -EPERM; + if (ns) { + err = grab_requested_root(ns, &s->root); + if (err) + return err; + + if (!mnt_file) { + struct mount *m; + /* + * Don't trigger audit denials. We just want to determine what + * mounts to show users. + */ + m = real_mount(s->mnt); + if (!is_path_reachable(m, m->mnt.mnt_root, &s->root) && + !ns_capable_noaudit(ns->user_ns, CAP_SYS_ADMIN)) + return -EPERM; + } + } err = security_sb_statfs(s->mnt->mnt_root); if (err) @@ -5693,7 +5711,7 @@ static int prepare_kstatmount(struct kstatmount *ks, struct mnt_id_req *kreq, } static int copy_mnt_id_req(const struct mnt_id_req __user *req, - struct mnt_id_req *kreq) + struct mnt_id_req *kreq, unsigned int flags) { int ret; size_t usize; @@ -5711,11 +5729,17 @@ static int copy_mnt_id_req(const struct mnt_id_req __user *req, ret = copy_struct_from_user(kreq, sizeof(*kreq), req, usize); if (ret) return ret; - if (kreq->mnt_ns_fd != 0 && kreq->mnt_ns_id) - return -EINVAL; - /* The first valid unique mount id is MNT_UNIQUE_ID_OFFSET + 1. */ - if (kreq->mnt_id <= MNT_UNIQUE_ID_OFFSET) - return -EINVAL; + + if (flags & STATMOUNT_BY_FD) { + if (kreq->mnt_id || kreq->mnt_ns_id) + return -EINVAL; + } else { + if (kreq->mnt_ns_fd != 0 && kreq->mnt_ns_id) + return -EINVAL; + /* The first valid unique mount id is MNT_UNIQUE_ID_OFFSET + 1. */ + if (kreq->mnt_id <= MNT_UNIQUE_ID_OFFSET) + return -EINVAL; + } return 0; } @@ -5762,25 +5786,33 @@ SYSCALL_DEFINE4(statmount, const struct mnt_id_req __user *, req, { struct mnt_namespace *ns __free(mnt_ns_release) = NULL; struct kstatmount *ks __free(kfree) = NULL; + struct file *mnt_file __free(fput) = NULL; struct mnt_id_req kreq; /* We currently support retrieval of 3 strings. */ size_t seq_size = 3 * PATH_MAX; int ret; - if (flags) + if (flags & ~STATMOUNT_BY_FD) return -EINVAL; - ret = copy_mnt_id_req(req, &kreq); + ret = copy_mnt_id_req(req, &kreq, flags); if (ret) return ret; - ns = grab_requested_mnt_ns(&kreq); - if (IS_ERR(ns)) - return PTR_ERR(ns); + if (flags & STATMOUNT_BY_FD) { + mnt_file = fget_raw(kreq.mnt_fd); + if (!mnt_file) + return -EBADF; + /* do_statmount sets ns in case of STATMOUNT_BY_FD */ + } else { + ns = grab_requested_mnt_ns(&kreq); + if (IS_ERR(ns)) + return PTR_ERR(ns); - if (kreq.mnt_ns_id && (ns != current->nsproxy->mnt_ns) && - !ns_capable_noaudit(ns->user_ns, CAP_SYS_ADMIN)) - return -EPERM; + if (kreq.mnt_ns_id && (ns != current->nsproxy->mnt_ns) && + !ns_capable_noaudit(ns->user_ns, CAP_SYS_ADMIN)) + return -EPERM; + } ks = kmalloc(sizeof(*ks), GFP_KERNEL_ACCOUNT); if (!ks) @@ -5792,7 +5824,7 @@ retry: return ret; scoped_guard(namespace_shared) - ret = do_statmount(ks, kreq.mnt_id, kreq.mnt_ns_id, ns); + ret = do_statmount(ks, kreq.mnt_id, kreq.mnt_ns_id, mnt_file, ns); if (!ret) ret = copy_statmount_to_user(ks); @@ -5932,7 +5964,7 @@ SYSCALL_DEFINE4(listmount, const struct mnt_id_req __user *, req, if (!access_ok(mnt_ids, nr_mnt_ids * sizeof(*mnt_ids))) return -EFAULT; - ret = copy_mnt_id_req(req, &kreq); + ret = copy_mnt_id_req(req, &kreq, 0); if (ret) return ret; diff --git a/include/uapi/linux/mount.h b/include/uapi/linux/mount.h index 5d3f8c9e3a62..18c624405268 100644 --- a/include/uapi/linux/mount.h +++ b/include/uapi/linux/mount.h @@ -197,7 +197,10 @@ struct statmount { */ struct mnt_id_req { __u32 size; - __u32 mnt_ns_fd; + union { + __u32 mnt_ns_fd; + __u32 mnt_fd; + }; __u64 mnt_id; __u64 param; __u64 mnt_ns_id; @@ -232,4 +235,9 @@ struct mnt_id_req { #define LSMT_ROOT 0xffffffffffffffff /* root mount */ #define LISTMOUNT_REVERSE (1 << 0) /* List later mounts first */ +/* + * @flag bits for statmount(2) + */ +#define STATMOUNT_BY_FD 0x00000001U /* want mountinfo for given fd */ + #endif /* _UAPI_LINUX_MOUNT_H */ -- cgit v1.2.3 From e83f63da2ac776fbc30861e4ce8b798df6ee8a7a Mon Sep 17 00:00:00 2001 From: Jonathan Kim Date: Mon, 23 Jun 2025 14:12:58 -0400 Subject: drm/amdkfd: allow debug subscription to lds violations on gfx 1250 GFX 1250 allows the debugger to subcribe to LDS out-of-range read/write memory violations. Bump IOCTL minor version and flag KFD capabilities for enablement hint. Signed-off-by: Jonathan Kim Reviewed-by: Felix Kuehling Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdkfd/kfd_debug.c | 7 +++++++ drivers/gpu/drm/amd/amdkfd/kfd_topology.c | 4 ++++ include/uapi/linux/kfd_ioctl.h | 4 +++- include/uapi/linux/kfd_sysfs.h | 3 ++- 4 files changed, 16 insertions(+), 2 deletions(-) (limited to 'include/uapi/linux') diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_debug.c b/drivers/gpu/drm/amd/amdkfd/kfd_debug.c index ba9a09b6589a..f83e1238c1b3 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_debug.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_debug.c @@ -519,6 +519,7 @@ int kfd_dbg_trap_set_flags(struct kfd_process *target, uint32_t *flags) struct kfd_topology_device *topo_dev = kfd_topology_device_by_id(target->pdds[i]->dev->id); uint32_t caps = topo_dev->node_props.capability; + uint32_t caps2 = topo_dev->node_props.capability2; if (!(caps & HSA_CAP_TRAP_DEBUG_PRECISE_MEMORY_OPERATIONS_SUPPORTED) && (*flags & KFD_DBG_TRAP_FLAG_SINGLE_MEM_OP)) { @@ -531,6 +532,12 @@ int kfd_dbg_trap_set_flags(struct kfd_process *target, uint32_t *flags) *flags = prev_flags; return -EACCES; } + + if (!(caps2 & HSA_CAP2_TRAP_DEBUG_LDS_OUT_OF_ADDR_RANGE_SUPPORTED) && + (*flags & KFD_DBG_TRAP_FLAG_LDS_OUT_OF_ADDR_RANGE)) { + *flags = prev_flags; + return -EACCES; + } } target->dbg_flags = *flags; diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_topology.c b/drivers/gpu/drm/amd/amdkfd/kfd_topology.c index a0990dd2378c..7a402c9c1b6e 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_topology.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_topology.c @@ -2028,6 +2028,10 @@ static void kfd_topology_set_capabilities(struct kfd_topology_device *dev) if (KFD_GC_VERSION(dev->gpu) >= IP_VERSION(12, 0, 0)) dev->node_props.capability |= HSA_CAP_TRAP_DEBUG_PRECISE_ALU_OPERATIONS_SUPPORTED; + + if (KFD_GC_VERSION(dev->gpu) >= IP_VERSION(12, 1, 0)) + dev->node_props.capability2 |= + HSA_CAP2_TRAP_DEBUG_LDS_OUT_OF_ADDR_RANGE_SUPPORTED; } kfd_topology_set_dbg_firmware_support(dev); diff --git a/include/uapi/linux/kfd_ioctl.h b/include/uapi/linux/kfd_ioctl.h index 4d0c1a53f9d5..6e91875c10ba 100644 --- a/include/uapi/linux/kfd_ioctl.h +++ b/include/uapi/linux/kfd_ioctl.h @@ -46,9 +46,10 @@ * - 1.18 - Rename pad in set_memory_policy_args to misc_process_flag * - 1.19 - Add a new ioctl to craete secondary kfd processes * - 1.20 - Trap handler support for expert scheduling mode available + * - 1.21 - Debugger support to subscribe to LDS out-of-address exceptions */ #define KFD_IOCTL_MAJOR_VERSION 1 -#define KFD_IOCTL_MINOR_VERSION 20 +#define KFD_IOCTL_MINOR_VERSION 21 struct kfd_ioctl_get_version_args { __u32 major_version; /* from KFD */ @@ -947,6 +948,7 @@ enum kfd_dbg_trap_address_watch_mode { enum kfd_dbg_trap_flags { KFD_DBG_TRAP_FLAG_SINGLE_MEM_OP = 1, KFD_DBG_TRAP_FLAG_SINGLE_ALU_OP = 2, + KFD_DBG_TRAP_FLAG_LDS_OUT_OF_ADDR_RANGE = 4 }; /* Trap exceptions */ diff --git a/include/uapi/linux/kfd_sysfs.h b/include/uapi/linux/kfd_sysfs.h index 1125fe47959f..0b6ce2f3c887 100644 --- a/include/uapi/linux/kfd_sysfs.h +++ b/include/uapi/linux/kfd_sysfs.h @@ -64,7 +64,8 @@ #define HSA_CAP_RESERVED 0x000f8000 #define HSA_CAP2_PER_SDMA_QUEUE_RESET_SUPPORTED 0x00000001 -#define HSA_CAP2_RESERVED 0xfffffffe +#define HSA_CAP2_TRAP_DEBUG_LDS_OUT_OF_ADDR_RANGE_SUPPORTED 0x00000002 +#define HSA_CAP2_RESERVED 0xfffffffc /* debug_prop bits in node properties */ #define HSA_DBG_WATCH_ADDR_MASK_LO_BIT_MASK 0x0000000f -- cgit v1.2.3 From cb8fe62f87ad21f4c174aec480694c9b4b8b01c4 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Sat, 20 Dec 2025 03:04:26 +0900 Subject: nilfs2: convert nilfs_super_block to kernel-doc Eliminate 40+ kernel-doc warnings in nilfs2_ondisk.h by converting all of the struct member comments to kernel-doc comments. Fix one misnamed struct member in nilfs_direct_node. Object files before and after are the same size and content. Examples of warnings: Warning: include/uapi/linux/nilfs2_ondisk.h:202 struct member 's_rev_level' not described in 'nilfs_super_block' Warning: include/uapi/linux/nilfs2_ondisk.h:202 struct member 's_minor_rev_level' not described in 'nilfs_super_block' Warning: include/uapi/linux/nilfs2_ondisk.h:202 struct member 's_magic' not described in 'nilfs_super_block' Warning: include/uapi/linux/nilfs2_ondisk.h:202 struct member 's_bytes' not described in 'nilfs_super_block' Warning: include/uapi/linux/nilfs2_ondisk.h:202 struct member 's_flags' not described in 'nilfs_super_block' Signed-off-by: Randy Dunlap Signed-off-by: Ryusuke Konishi Signed-off-by: Viacheslav Dubeyko --- include/uapi/linux/nilfs2_ondisk.h | 163 ++++++++++++++++++++++--------------- 1 file changed, 97 insertions(+), 66 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/nilfs2_ondisk.h b/include/uapi/linux/nilfs2_ondisk.h index 3196cc44a002..b3442b16ff6a 100644 --- a/include/uapi/linux/nilfs2_ondisk.h +++ b/include/uapi/linux/nilfs2_ondisk.h @@ -133,73 +133,104 @@ struct nilfs_super_root { /** * struct nilfs_super_block - structure of super block on disk + * @s_rev_level: Revision level + * @s_minor_rev_level: minor revision level + * @s_magic: Magic signature + * @s_bytes: Bytes count of CRC calculation for + * this structure. s_reserved is excluded. + * @s_flags: flags + * @s_crc_seed: Seed value of CRC calculation + * @s_sum: Check sum of super block + * @s_log_block_size: Block size represented as follows: + * blocksize = 1 << (s_log_block_size + 10) + * @s_nsegments: Number of segments in filesystem + * @s_dev_size: block device size in bytes + * @s_first_data_block: 1st seg disk block number + * @s_blocks_per_segment: number of blocks per full segment + * @s_r_segments_percentage: Reserved segments percentage + * @s_last_cno: Last checkpoint number + * @s_last_pseg: disk block addr pseg written last + * @s_last_seq: seq. number of seg written last + * @s_free_blocks_count: Free blocks count + * @s_ctime: Creation time (execution time of newfs) + * @s_mtime: Mount time + * @s_wtime: Write time + * @s_mnt_count: Mount count + * @s_max_mnt_count: Maximal mount count + * @s_state: File system state + * @s_errors: Behaviour when detecting errors + * @s_lastcheck: time of last check + * @s_checkinterval: max. time between checks + * @s_creator_os: OS + * @s_def_resuid: Default uid for reserved blocks + * @s_def_resgid: Default gid for reserved blocks + * @s_first_ino: First non-reserved inode + * @s_inode_size: Size of an inode + * @s_dat_entry_size: Size of a dat entry + * @s_checkpoint_size: Size of a checkpoint + * @s_segment_usage_size: Size of a segment usage + * @s_uuid: 128-bit uuid for volume + * @s_volume_name: volume name + * @s_c_interval: Commit interval of segment + * @s_c_block_max: Threshold of data amount for the + * segment construction + * @s_feature_compat: Compatible feature set + * @s_feature_compat_ro: Read-only compatible feature set + * @s_feature_incompat: Incompatible feature set + * @s_reserved: padding to the end of the block */ struct nilfs_super_block { -/*00*/ __le32 s_rev_level; /* Revision level */ - __le16 s_minor_rev_level; /* minor revision level */ - __le16 s_magic; /* Magic signature */ - - __le16 s_bytes; /* - * Bytes count of CRC calculation - * for this structure. s_reserved - * is excluded. - */ - __le16 s_flags; /* flags */ - __le32 s_crc_seed; /* Seed value of CRC calculation */ -/*10*/ __le32 s_sum; /* Check sum of super block */ - - __le32 s_log_block_size; /* - * Block size represented as follows - * blocksize = - * 1 << (s_log_block_size + 10) - */ - __le64 s_nsegments; /* Number of segments in filesystem */ -/*20*/ __le64 s_dev_size; /* block device size in bytes */ - __le64 s_first_data_block; /* 1st seg disk block number */ -/*30*/ __le32 s_blocks_per_segment; /* number of blocks per full segment */ - __le32 s_r_segments_percentage; /* Reserved segments percentage */ - - __le64 s_last_cno; /* Last checkpoint number */ -/*40*/ __le64 s_last_pseg; /* disk block addr pseg written last */ - __le64 s_last_seq; /* seq. number of seg written last */ -/*50*/ __le64 s_free_blocks_count; /* Free blocks count */ - - __le64 s_ctime; /* - * Creation time (execution time of - * newfs) - */ -/*60*/ __le64 s_mtime; /* Mount time */ - __le64 s_wtime; /* Write time */ -/*70*/ __le16 s_mnt_count; /* Mount count */ - __le16 s_max_mnt_count; /* Maximal mount count */ - __le16 s_state; /* File system state */ - __le16 s_errors; /* Behaviour when detecting errors */ - __le64 s_lastcheck; /* time of last check */ - -/*80*/ __le32 s_checkinterval; /* max. time between checks */ - __le32 s_creator_os; /* OS */ - __le16 s_def_resuid; /* Default uid for reserved blocks */ - __le16 s_def_resgid; /* Default gid for reserved blocks */ - __le32 s_first_ino; /* First non-reserved inode */ - -/*90*/ __le16 s_inode_size; /* Size of an inode */ - __le16 s_dat_entry_size; /* Size of a dat entry */ - __le16 s_checkpoint_size; /* Size of a checkpoint */ - __le16 s_segment_usage_size; /* Size of a segment usage */ - -/*98*/ __u8 s_uuid[16]; /* 128-bit uuid for volume */ -/*A8*/ char s_volume_name[80] /* volume name */ - __kernel_nonstring; - -/*F8*/ __le32 s_c_interval; /* Commit interval of segment */ - __le32 s_c_block_max; /* - * Threshold of data amount for - * the segment construction - */ -/*100*/ __le64 s_feature_compat; /* Compatible feature set */ - __le64 s_feature_compat_ro; /* Read-only compatible feature set */ - __le64 s_feature_incompat; /* Incompatible feature set */ - __u32 s_reserved[186]; /* padding to the end of the block */ +/*00*/ __le32 s_rev_level; + __le16 s_minor_rev_level; + __le16 s_magic; + + __le16 s_bytes; + __le16 s_flags; + __le32 s_crc_seed; +/*10*/ __le32 s_sum; + + __le32 s_log_block_size; + __le64 s_nsegments; +/*20*/ __le64 s_dev_size; + __le64 s_first_data_block; +/*30*/ __le32 s_blocks_per_segment; + __le32 s_r_segments_percentage; + + __le64 s_last_cno; +/*40*/ __le64 s_last_pseg; + __le64 s_last_seq; +/*50*/ __le64 s_free_blocks_count; + + __le64 s_ctime; +/*60*/ __le64 s_mtime; + __le64 s_wtime; +/*70*/ __le16 s_mnt_count; + __le16 s_max_mnt_count; + __le16 s_state; + __le16 s_errors; + __le64 s_lastcheck; + +/*80*/ __le32 s_checkinterval; + __le32 s_creator_os; + __le16 s_def_resuid; + __le16 s_def_resgid; + __le32 s_first_ino; + +/*90*/ __le16 s_inode_size; + __le16 s_dat_entry_size; + __le16 s_checkpoint_size; + __le16 s_segment_usage_size; + +/*98*/ __u8 s_uuid[16]; +/*A8*/ char s_volume_name[80] __kernel_nonstring; + +/*F8*/ __le32 s_c_interval; + __le32 s_c_block_max; + +/*100*/ __le64 s_feature_compat; + __le64 s_feature_compat_ro; + __le64 s_feature_incompat; + __u32 s_reserved[186]; }; /* @@ -449,7 +480,7 @@ struct nilfs_btree_node { /** * struct nilfs_direct_node - header of built-in bmap array * @dn_flags: flags - * @dn_pad: padding + * @pad: padding */ struct nilfs_direct_node { __u8 dn_flags; -- cgit v1.2.3 From 6fd8a09f48d6fee184207f4e15e939898a3947f9 Mon Sep 17 00:00:00 2001 From: Ryusuke Konishi Date: Sat, 20 Dec 2025 03:04:27 +0900 Subject: nilfs2: fix missing struct keywords in nilfs2_api.h kernel-doc Eliminate the following kernel-doc warnings in nilfs2_api.h: Warning: include/uapi/linux/nilfs2_api.h:65 cannot understand function prototype: 'struct nilfs_suinfo' Warning: include/uapi/linux/nilfs2_api.h:101 cannot understand function prototype: 'struct nilfs_suinfo_update' This ensures that the documentation for nilfs_suinfo and nilfs_suinfo_update is correctly parsed and generated by adding the missing 'struct' keyword to their kernel-doc comments. Signed-off-by: Ryusuke Konishi Signed-off-by: Viacheslav Dubeyko --- include/uapi/linux/nilfs2_api.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/nilfs2_api.h b/include/uapi/linux/nilfs2_api.h index 8b9b89104f3d..d1b6fcde2fb8 100644 --- a/include/uapi/linux/nilfs2_api.h +++ b/include/uapi/linux/nilfs2_api.h @@ -58,7 +58,7 @@ NILFS_CPINFO_FNS(INVALID, invalid) NILFS_CPINFO_FNS(MINOR, minor) /** - * nilfs_suinfo - segment usage information + * struct nilfs_suinfo - segment usage information * @sui_lastmod: timestamp of last modification * @sui_nblocks: number of written blocks in segment * @sui_flags: segment usage flags @@ -93,7 +93,7 @@ static inline int nilfs_suinfo_clean(const struct nilfs_suinfo *si) } /** - * nilfs_suinfo_update - segment usage information update + * struct nilfs_suinfo_update - segment usage information update * @sup_segnum: segment number * @sup_flags: flags for which fields are active in sup_sui * @sup_reserved: reserved necessary for alignment -- cgit v1.2.3 From 98b9f207afa53aff2edb0e52910c4348b456b37d Mon Sep 17 00:00:00 2001 From: Thomas Weißschuh Date: Mon, 22 Dec 2025 09:04:13 +0100 Subject: dmaengine: idxd: uapi: use UAPI types MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Using libc types and headers from the UAPI headers is problematic as it introduces a dependency on a full C toolchain. Use the fixed-width integer types provided by the UAPI headers instead. Signed-off-by: Thomas Weißschuh Acked-by: Arnd Bergmann Link: https://patch.msgid.link/20251222-uapi-idxd-v1-1-baa183adb20d@linutronix.de Signed-off-by: Vinod Koul --- include/uapi/linux/idxd.h | 270 +++++++++++++++++++++++----------------------- 1 file changed, 133 insertions(+), 137 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/idxd.h b/include/uapi/linux/idxd.h index 3d1987e1bb2d..fdcc8eefb925 100644 --- a/include/uapi/linux/idxd.h +++ b/include/uapi/linux/idxd.h @@ -3,11 +3,7 @@ #ifndef _USR_IDXD_H_ #define _USR_IDXD_H_ -#ifdef __KERNEL__ #include -#else -#include -#endif /* Driver command error status */ enum idxd_scmd_stat { @@ -176,132 +172,132 @@ enum iax_completion_status { #define DSA_COMP_STATUS(status) ((status) & DSA_COMP_STATUS_MASK) struct dsa_hw_desc { - uint32_t pasid:20; - uint32_t rsvd:11; - uint32_t priv:1; - uint32_t flags:24; - uint32_t opcode:8; - uint64_t completion_addr; + __u32 pasid:20; + __u32 rsvd:11; + __u32 priv:1; + __u32 flags:24; + __u32 opcode:8; + __u64 completion_addr; union { - uint64_t src_addr; - uint64_t rdback_addr; - uint64_t pattern; - uint64_t desc_list_addr; - uint64_t pattern_lower; - uint64_t transl_fetch_addr; + __u64 src_addr; + __u64 rdback_addr; + __u64 pattern; + __u64 desc_list_addr; + __u64 pattern_lower; + __u64 transl_fetch_addr; }; union { - uint64_t dst_addr; - uint64_t rdback_addr2; - uint64_t src2_addr; - uint64_t comp_pattern; + __u64 dst_addr; + __u64 rdback_addr2; + __u64 src2_addr; + __u64 comp_pattern; }; union { - uint32_t xfer_size; - uint32_t desc_count; - uint32_t region_size; + __u32 xfer_size; + __u32 desc_count; + __u32 region_size; }; - uint16_t int_handle; - uint16_t rsvd1; + __u16 int_handle; + __u16 rsvd1; union { - uint8_t expected_res; + __u8 expected_res; /* create delta record */ struct { - uint64_t delta_addr; - uint32_t max_delta_size; - uint32_t delt_rsvd; - uint8_t expected_res_mask; + __u64 delta_addr; + __u32 max_delta_size; + __u32 delt_rsvd; + __u8 expected_res_mask; }; - uint32_t delta_rec_size; - uint64_t dest2; + __u32 delta_rec_size; + __u64 dest2; /* CRC */ struct { - uint32_t crc_seed; - uint32_t crc_rsvd; - uint64_t seed_addr; + __u32 crc_seed; + __u32 crc_rsvd; + __u64 seed_addr; }; /* DIF check or strip */ struct { - uint8_t src_dif_flags; - uint8_t dif_chk_res; - uint8_t dif_chk_flags; - uint8_t dif_chk_res2[5]; - uint32_t chk_ref_tag_seed; - uint16_t chk_app_tag_mask; - uint16_t chk_app_tag_seed; + __u8 src_dif_flags; + __u8 dif_chk_res; + __u8 dif_chk_flags; + __u8 dif_chk_res2[5]; + __u32 chk_ref_tag_seed; + __u16 chk_app_tag_mask; + __u16 chk_app_tag_seed; }; /* DIF insert */ struct { - uint8_t dif_ins_res; - uint8_t dest_dif_flag; - uint8_t dif_ins_flags; - uint8_t dif_ins_res2[13]; - uint32_t ins_ref_tag_seed; - uint16_t ins_app_tag_mask; - uint16_t ins_app_tag_seed; + __u8 dif_ins_res; + __u8 dest_dif_flag; + __u8 dif_ins_flags; + __u8 dif_ins_res2[13]; + __u32 ins_ref_tag_seed; + __u16 ins_app_tag_mask; + __u16 ins_app_tag_seed; }; /* DIF update */ struct { - uint8_t src_upd_flags; - uint8_t upd_dest_flags; - uint8_t dif_upd_flags; - uint8_t dif_upd_res[5]; - uint32_t src_ref_tag_seed; - uint16_t src_app_tag_mask; - uint16_t src_app_tag_seed; - uint32_t dest_ref_tag_seed; - uint16_t dest_app_tag_mask; - uint16_t dest_app_tag_seed; + __u8 src_upd_flags; + __u8 upd_dest_flags; + __u8 dif_upd_flags; + __u8 dif_upd_res[5]; + __u32 src_ref_tag_seed; + __u16 src_app_tag_mask; + __u16 src_app_tag_seed; + __u32 dest_ref_tag_seed; + __u16 dest_app_tag_mask; + __u16 dest_app_tag_seed; }; /* Fill */ - uint64_t pattern_upper; + __u64 pattern_upper; /* Translation fetch */ struct { - uint64_t transl_fetch_res; - uint32_t region_stride; + __u64 transl_fetch_res; + __u32 region_stride; }; /* DIX generate */ struct { - uint8_t dix_gen_res; - uint8_t dest_dif_flags; - uint8_t dif_flags; - uint8_t dix_gen_res2[13]; - uint32_t ref_tag_seed; - uint16_t app_tag_mask; - uint16_t app_tag_seed; + __u8 dix_gen_res; + __u8 dest_dif_flags; + __u8 dif_flags; + __u8 dix_gen_res2[13]; + __u32 ref_tag_seed; + __u16 app_tag_mask; + __u16 app_tag_seed; }; - uint8_t op_specific[24]; + __u8 op_specific[24]; }; } __attribute__((packed)); struct iax_hw_desc { - uint32_t pasid:20; - uint32_t rsvd:11; - uint32_t priv:1; - uint32_t flags:24; - uint32_t opcode:8; - uint64_t completion_addr; - uint64_t src1_addr; - uint64_t dst_addr; - uint32_t src1_size; - uint16_t int_handle; + __u32 pasid:20; + __u32 rsvd:11; + __u32 priv:1; + __u32 flags:24; + __u32 opcode:8; + __u64 completion_addr; + __u64 src1_addr; + __u64 dst_addr; + __u32 src1_size; + __u16 int_handle; union { - uint16_t compr_flags; - uint16_t decompr_flags; + __u16 compr_flags; + __u16 decompr_flags; }; - uint64_t src2_addr; - uint32_t max_dst_size; - uint32_t src2_size; - uint32_t filter_flags; - uint32_t num_inputs; + __u64 src2_addr; + __u32 max_dst_size; + __u32 src2_size; + __u32 filter_flags; + __u32 num_inputs; } __attribute__((packed)); struct dsa_raw_desc { - uint64_t field[8]; + __u64 field[8]; } __attribute__((packed)); /* @@ -309,91 +305,91 @@ struct dsa_raw_desc { * volatile and prevent the compiler from optimize the read. */ struct dsa_completion_record { - volatile uint8_t status; + volatile __u8 status; union { - uint8_t result; - uint8_t dif_status; + __u8 result; + __u8 dif_status; }; - uint8_t fault_info; - uint8_t rsvd; + __u8 fault_info; + __u8 rsvd; union { - uint32_t bytes_completed; - uint32_t descs_completed; + __u32 bytes_completed; + __u32 descs_completed; }; - uint64_t fault_addr; + __u64 fault_addr; union { /* common record */ struct { - uint32_t invalid_flags:24; - uint32_t rsvd2:8; + __u32 invalid_flags:24; + __u32 rsvd2:8; }; - uint32_t delta_rec_size; - uint64_t crc_val; + __u32 delta_rec_size; + __u64 crc_val; /* DIF check & strip */ struct { - uint32_t dif_chk_ref_tag; - uint16_t dif_chk_app_tag_mask; - uint16_t dif_chk_app_tag; + __u32 dif_chk_ref_tag; + __u16 dif_chk_app_tag_mask; + __u16 dif_chk_app_tag; }; /* DIF insert */ struct { - uint64_t dif_ins_res; - uint32_t dif_ins_ref_tag; - uint16_t dif_ins_app_tag_mask; - uint16_t dif_ins_app_tag; + __u64 dif_ins_res; + __u32 dif_ins_ref_tag; + __u16 dif_ins_app_tag_mask; + __u16 dif_ins_app_tag; }; /* DIF update */ struct { - uint32_t dif_upd_src_ref_tag; - uint16_t dif_upd_src_app_tag_mask; - uint16_t dif_upd_src_app_tag; - uint32_t dif_upd_dest_ref_tag; - uint16_t dif_upd_dest_app_tag_mask; - uint16_t dif_upd_dest_app_tag; + __u32 dif_upd_src_ref_tag; + __u16 dif_upd_src_app_tag_mask; + __u16 dif_upd_src_app_tag; + __u32 dif_upd_dest_ref_tag; + __u16 dif_upd_dest_app_tag_mask; + __u16 dif_upd_dest_app_tag; }; /* DIX generate */ struct { - uint64_t dix_gen_res; - uint32_t dix_ref_tag; - uint16_t dix_app_tag_mask; - uint16_t dix_app_tag; + __u64 dix_gen_res; + __u32 dix_ref_tag; + __u16 dix_app_tag_mask; + __u16 dix_app_tag; }; - uint8_t op_specific[16]; + __u8 op_specific[16]; }; } __attribute__((packed)); struct dsa_raw_completion_record { - uint64_t field[4]; + __u64 field[4]; } __attribute__((packed)); struct iax_completion_record { - volatile uint8_t status; - uint8_t error_code; - uint8_t fault_info; - uint8_t rsvd; - uint32_t bytes_completed; - uint64_t fault_addr; - uint32_t invalid_flags; - uint32_t rsvd2; - uint32_t output_size; - uint8_t output_bits; - uint8_t rsvd3; - uint16_t xor_csum; - uint32_t crc; - uint32_t min; - uint32_t max; - uint32_t sum; - uint64_t rsvd4[2]; + volatile __u8 status; + __u8 error_code; + __u8 fault_info; + __u8 rsvd; + __u32 bytes_completed; + __u64 fault_addr; + __u32 invalid_flags; + __u32 rsvd2; + __u32 output_size; + __u8 output_bits; + __u8 rsvd3; + __u16 xor_csum; + __u32 crc; + __u32 min; + __u32 max; + __u32 sum; + __u64 rsvd4[2]; } __attribute__((packed)); struct iax_raw_completion_record { - uint64_t field[8]; + __u64 field[8]; } __attribute__((packed)); #endif -- cgit v1.2.3 From 9e541b3cee70a3bbe86b176c903c23b29fe033cd Mon Sep 17 00:00:00 2001 From: Shuai Xue Date: Wed, 10 Dec 2025 21:29:05 +0800 Subject: PCI: trace: Add generic RAS tracepoint for hotplug event MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Hotplug events are critical indicators for analyzing hardware health, and surprise link downs can significantly impact system performance and reliability. Define a new TRACING_SYSTEM named "pci", add a generic RAS tracepoint for hotplug event to help health checks. Add enum pci_hotplug_event in include/uapi/linux/pci.h so applications like rasdaemon can register tracepoint event handlers for it. The following output is generated when a device is hotplugged: $ echo 1 > /sys/kernel/debug/tracing/events/pci/pci_hp_event/enable $ cat /sys/kernel/debug/tracing/trace_pipe irq/51-pciehp-88 [001] ..... 1311.177459: pci_hp_event: 0000:00:02.0 slot:10, event:CARD_PRESENT irq/51-pciehp-88 [001] ..... 1311.177566: pci_hp_event: 0000:00:02.0 slot:10, event:LINK_UP Suggested-by: Lukas Wunner Signed-off-by: Shuai Xue Signed-off-by: Bjorn Helgaas Reviewed-by: Lukas Wunner Reviewed-by: Jonathan Cameron Reviewed-by: Steven Rostedt (Google) # for trace event Reviewed-by: Ilpo Järvinen Link: https://patch.msgid.link/20251210132907.58799-2-xueshuai@linux.alibaba.com --- drivers/pci/Makefile | 3 ++ drivers/pci/hotplug/pciehp_ctrl.c | 31 +++++++++++++---- drivers/pci/trace.c | 11 ++++++ include/trace/events/pci.h | 72 +++++++++++++++++++++++++++++++++++++++ include/uapi/linux/pci.h | 7 ++++ 5 files changed, 118 insertions(+), 6 deletions(-) create mode 100644 drivers/pci/trace.c create mode 100644 include/trace/events/pci.h (limited to 'include/uapi/linux') diff --git a/drivers/pci/Makefile b/drivers/pci/Makefile index e10cfe5a280b..8c259a9a8796 100644 --- a/drivers/pci/Makefile +++ b/drivers/pci/Makefile @@ -47,3 +47,6 @@ obj-y += controller/ obj-y += switch/ subdir-ccflags-$(CONFIG_PCI_DEBUG) := -DDEBUG + +CFLAGS_trace.o := -I$(src) +obj-$(CONFIG_TRACING) += trace.o diff --git a/drivers/pci/hotplug/pciehp_ctrl.c b/drivers/pci/hotplug/pciehp_ctrl.c index bcc938d4420f..7805f697a02c 100644 --- a/drivers/pci/hotplug/pciehp_ctrl.c +++ b/drivers/pci/hotplug/pciehp_ctrl.c @@ -19,6 +19,7 @@ #include #include #include +#include #include "../pci.h" #include "pciehp.h" @@ -244,12 +245,20 @@ void pciehp_handle_presence_or_link_change(struct controller *ctrl, u32 events) case ON_STATE: ctrl->state = POWEROFF_STATE; mutex_unlock(&ctrl->state_lock); - if (events & PCI_EXP_SLTSTA_DLLSC) + if (events & PCI_EXP_SLTSTA_DLLSC) { ctrl_info(ctrl, "Slot(%s): Link Down\n", slot_name(ctrl)); - if (events & PCI_EXP_SLTSTA_PDC) + trace_pci_hp_event(pci_name(ctrl->pcie->port), + slot_name(ctrl), + PCI_HOTPLUG_LINK_DOWN); + } + if (events & PCI_EXP_SLTSTA_PDC) { ctrl_info(ctrl, "Slot(%s): Card not present\n", slot_name(ctrl)); + trace_pci_hp_event(pci_name(ctrl->pcie->port), + slot_name(ctrl), + PCI_HOTPLUG_CARD_NOT_PRESENT); + } pciehp_disable_slot(ctrl, SURPRISE_REMOVAL); break; default: @@ -269,6 +278,9 @@ void pciehp_handle_presence_or_link_change(struct controller *ctrl, u32 events) INDICATOR_NOOP); ctrl_info(ctrl, "Slot(%s): Card not present\n", slot_name(ctrl)); + trace_pci_hp_event(pci_name(ctrl->pcie->port), + slot_name(ctrl), + PCI_HOTPLUG_CARD_NOT_PRESENT); } mutex_unlock(&ctrl->state_lock); return; @@ -281,12 +293,19 @@ void pciehp_handle_presence_or_link_change(struct controller *ctrl, u32 events) case OFF_STATE: ctrl->state = POWERON_STATE; mutex_unlock(&ctrl->state_lock); - if (present) + if (present) { ctrl_info(ctrl, "Slot(%s): Card present\n", slot_name(ctrl)); - if (link_active) - ctrl_info(ctrl, "Slot(%s): Link Up\n", - slot_name(ctrl)); + trace_pci_hp_event(pci_name(ctrl->pcie->port), + slot_name(ctrl), + PCI_HOTPLUG_CARD_PRESENT); + } + if (link_active) { + ctrl_info(ctrl, "Slot(%s): Link Up\n", slot_name(ctrl)); + trace_pci_hp_event(pci_name(ctrl->pcie->port), + slot_name(ctrl), + PCI_HOTPLUG_LINK_UP); + } ctrl->request_result = pciehp_enable_slot(ctrl); break; default: diff --git a/drivers/pci/trace.c b/drivers/pci/trace.c new file mode 100644 index 000000000000..cf11abca8602 --- /dev/null +++ b/drivers/pci/trace.c @@ -0,0 +1,11 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Tracepoints for PCI system + * + * Copyright (C) 2025 Alibaba Corporation + */ + +#include + +#define CREATE_TRACE_POINTS +#include diff --git a/include/trace/events/pci.h b/include/trace/events/pci.h new file mode 100644 index 000000000000..39e512a167ee --- /dev/null +++ b/include/trace/events/pci.h @@ -0,0 +1,72 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#undef TRACE_SYSTEM +#define TRACE_SYSTEM pci + +#if !defined(_TRACE_HW_EVENT_PCI_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_HW_EVENT_PCI_H + +#include + +#define PCI_HOTPLUG_EVENT \ + EM(PCI_HOTPLUG_LINK_UP, "LINK_UP") \ + EM(PCI_HOTPLUG_LINK_DOWN, "LINK_DOWN") \ + EM(PCI_HOTPLUG_CARD_PRESENT, "CARD_PRESENT") \ + EMe(PCI_HOTPLUG_CARD_NOT_PRESENT, "CARD_NOT_PRESENT") + +/* Enums require being exported to userspace, for user tool parsing */ +#undef EM +#undef EMe +#define EM(a, b) TRACE_DEFINE_ENUM(a); +#define EMe(a, b) TRACE_DEFINE_ENUM(a); + +PCI_HOTPLUG_EVENT + +/* + * Now redefine the EM() and EMe() macros to map the enums to the strings + * that will be printed in the output. + */ +#undef EM +#undef EMe +#define EM(a, b) {a, b}, +#define EMe(a, b) {a, b} + +/* + * Note: For generic PCI hotplug events, we pass already-resolved strings + * (port_name, slot) instead of driver-specific structures like 'struct + * controller'. This is because different PCI hotplug drivers (pciehp, cpqphp, + * ibmphp, shpchp) define their own versions of 'struct controller' with + * different fields and helper functions. Using driver-specific structures would + * make the tracepoint interface non-generic and cause compatibility issues + * across different drivers. + */ +TRACE_EVENT(pci_hp_event, + + TP_PROTO(const char *port_name, + const char *slot, + const int event), + + TP_ARGS(port_name, slot, event), + + TP_STRUCT__entry( + __string( port_name, port_name ) + __string( slot, slot ) + __field( int, event ) + ), + + TP_fast_assign( + __assign_str(port_name); + __assign_str(slot); + __entry->event = event; + ), + + TP_printk("%s slot:%s, event:%s\n", + __get_str(port_name), + __get_str(slot), + __print_symbolic(__entry->event, PCI_HOTPLUG_EVENT) + ) +); + +#endif /* _TRACE_HW_EVENT_PCI_H */ + +/* This part must be outside protection */ +#include diff --git a/include/uapi/linux/pci.h b/include/uapi/linux/pci.h index a769eefc5139..4f150028965d 100644 --- a/include/uapi/linux/pci.h +++ b/include/uapi/linux/pci.h @@ -39,4 +39,11 @@ #define PCIIOC_MMAP_IS_MEM (PCIIOC_BASE | 0x02) /* Set mmap state to MEM space. */ #define PCIIOC_WRITE_COMBINE (PCIIOC_BASE | 0x03) /* Enable/disable write-combining. */ +enum pci_hotplug_event { + PCI_HOTPLUG_LINK_UP, + PCI_HOTPLUG_LINK_DOWN, + PCI_HOTPLUG_CARD_PRESENT, + PCI_HOTPLUG_CARD_NOT_PRESENT, +}; + #endif /* _UAPILINUX_PCI_H */ -- cgit v1.2.3 From 3c4629b68dbe18e454cce4b864c530268cffbeed Mon Sep 17 00:00:00 2001 From: Thomas Weißschuh Date: Mon, 22 Dec 2025 09:00:33 +0100 Subject: virtio: uapi: avoid usage of libc types MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Using libc types and headers from the UAPI headers is problematic as it introduces a dependency on a full C toolchain. On Linux 'unsigned long' works as a replacement for 'uintptr_t' and does not depend on libc. Signed-off-by: Thomas Weißschuh Acked-by: Arnd Bergmann Signed-off-by: Michael S. Tsirkin Message-Id: <20251222-uapi-virtio-v1-1-29390f87bcad@linutronix.de> --- include/uapi/linux/virtio_ring.h | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/virtio_ring.h b/include/uapi/linux/virtio_ring.h index f8c20d3de8da..3c478582a3c2 100644 --- a/include/uapi/linux/virtio_ring.h +++ b/include/uapi/linux/virtio_ring.h @@ -31,9 +31,6 @@ * SUCH DAMAGE. * * Copyright Rusty Russell IBM Corporation 2007. */ -#ifndef __KERNEL__ -#include -#endif #include #include @@ -202,7 +199,7 @@ static inline void vring_init(struct vring *vr, unsigned int num, void *p, vr->num = num; vr->desc = p; vr->avail = (struct vring_avail *)((char *)p + num * sizeof(struct vring_desc)); - vr->used = (void *)(((uintptr_t)&vr->avail->ring[num] + sizeof(__virtio16) + vr->used = (void *)(((unsigned long)&vr->avail->ring[num] + sizeof(__virtio16) + align-1) & ~(align - 1)); } -- cgit v1.2.3 From 40fc797ba18328e57ed1cb213b4b5e48f86f4c7c Mon Sep 17 00:00:00 2001 From: Carlos Llamas Date: Mon, 15 Dec 2025 18:17:09 +0000 Subject: binder: fix trivial typo in uapi header As reported by codespell: include/uapi/linux/android/binder.h:281: interupted ==> interrupted Signed-off-by: Carlos Llamas Reviewed-by: Alice Ryhl Link: https://patch.msgid.link/20251215181724.3811977-1-cmllamas@google.com Signed-off-by: Greg Kroah-Hartman --- include/uapi/linux/android/binder.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/android/binder.h b/include/uapi/linux/android/binder.h index 03ee4c7010d7..701cad36de43 100644 --- a/include/uapi/linux/android/binder.h +++ b/include/uapi/linux/android/binder.h @@ -278,7 +278,7 @@ enum { * NOTE: Two special error codes you should check for when calling * in to the driver are: * - * EINTR -- The operation has been interupted. This should be + * EINTR -- The operation has been interrupted. This should be * handled by retrying the ioctl() until a different error code * is returned. * -- cgit v1.2.3 From c51bb53d5c68041dd02f66d9b638cda33647623e Mon Sep 17 00:00:00 2001 From: David Yat Sin Date: Tue, 18 Mar 2025 19:49:55 +0000 Subject: drm/amdkfd: Add metadata ring buffer for compute Add support for separate ring-buffer for metadata packets when using compute queues. Userspace application allocate the metadata ring-buffer and the queue ring-buffer with a single allocation. The metadata ring-buffer starts after the queue ring-buffer. Signed-off-by: David Yat Sin Reviewed-by: Philip Yang Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdkfd/kfd_chardev.c | 8 ++++++++ drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v12_1.c | 21 +++++++++++++++++++++ drivers/gpu/drm/amd/amdkfd/kfd_priv.h | 3 ++- drivers/gpu/drm/amd/amdkfd/kfd_queue.c | 7 +++++-- include/uapi/linux/kfd_ioctl.h | 5 +++-- 5 files changed, 39 insertions(+), 5 deletions(-) (limited to 'include/uapi/linux') diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c index 041237861107..88621cb7d409 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c @@ -221,6 +221,11 @@ static int set_queue_properties_from_user(struct queue_properties *q_properties, pr_debug("Size lower. clamped to KFD_MIN_QUEUE_RING_SIZE"); } + if ((args->metadata_ring_size != 0) && !is_power_of_2(args->metadata_ring_size)) { + pr_err("Metadata ring size must be a power of 2 or 0\n"); + return -EINVAL; + } + if (!access_ok((const void __user *) args->read_pointer_address, sizeof(uint32_t))) { pr_err("Can't access read pointer\n"); @@ -255,6 +260,9 @@ static int set_queue_properties_from_user(struct queue_properties *q_properties, q_properties->priority = args->queue_priority; q_properties->queue_address = args->ring_base_address; q_properties->queue_size = args->ring_size; + if (args->queue_type == KFD_IOC_QUEUE_TYPE_COMPUTE_AQL) + q_properties->metadata_queue_size = args->metadata_ring_size; + q_properties->read_ptr = (void __user *)args->read_pointer_address; q_properties->write_ptr = (void __user *)args->write_pointer_address; q_properties->eop_ring_buffer_address = args->eop_buffer_address; diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v12_1.c b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v12_1.c index f1c2c9e8cf6b..a06b4e89af8a 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v12_1.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v12_1.c @@ -266,6 +266,27 @@ static void update_mqd(struct mqd_manager *mm, void *mqd, m->cp_hqd_pq_base_lo = lower_32_bits((uint64_t)q->queue_address >> 8); m->cp_hqd_pq_base_hi = upper_32_bits((uint64_t)q->queue_address >> 8); + if (q->metadata_queue_size) { + /* On GC 12.1 is 64 DWs which is 4 times size of AQL packet */ + if (q->metadata_queue_size == q->queue_size * 4) { + /* + * User application allocates main queue ring and metadata queue ring + * with a single allocation. metadata queue ring starts after main + * queue ring. + */ + m->cp_hqd_kd_base = + lower_32_bits((q->queue_address + q->queue_size) >> 8); + m->cp_hqd_kd_base_hi = + upper_32_bits((q->queue_address + q->queue_size) >> 8); + + m->cp_hqd_kd_cntl |= CP_HQD_KD_CNTL__KD_FETCHER_ENABLE_MASK; + /* KD_SIZE = 2 for metadata packet = 64 DWs */ + m->cp_hqd_kd_cntl |= 2 << CP_HQD_KD_CNTL__KD_SIZE__SHIFT; + } else { + pr_warn("Invalid metadata ring size, metadata queue will be ignored\n"); + } + } + m->cp_hqd_pq_rptr_report_addr_lo = lower_32_bits((uint64_t)q->read_ptr); m->cp_hqd_pq_rptr_report_addr_hi = upper_32_bits((uint64_t)q->read_ptr); m->cp_hqd_pq_wptr_poll_addr_lo = lower_32_bits((uint64_t)q->write_ptr); diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h index ebc637c38c04..d798baa7e52e 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h +++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h @@ -506,7 +506,8 @@ struct queue_properties { enum kfd_queue_format format; unsigned int queue_id; uint64_t queue_address; - uint64_t queue_size; + uint64_t queue_size; + uint64_t metadata_queue_size; uint32_t priority; uint32_t queue_percent; void __user *read_ptr; diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_queue.c b/drivers/gpu/drm/amd/amdkfd/kfd_queue.c index 56c97189e7f1..1b465fdb2c64 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_queue.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_queue.c @@ -247,9 +247,12 @@ int kfd_queue_acquire_buffers(struct kfd_process_device *pdd, struct queue_prope properties->format == KFD_QUEUE_FORMAT_AQL && topo_dev->node_props.gfx_target_version >= 70000 && topo_dev->node_props.gfx_target_version < 90000) - expected_queue_size = properties->queue_size / 2; + /* metadata_queue_size not supported on GFX7/GFX8 */ + expected_queue_size = + properties->queue_size / 2; else - expected_queue_size = properties->queue_size; + expected_queue_size = + properties->queue_size + properties->metadata_queue_size; vm = drm_priv_to_vm(pdd->drm_priv); err = amdgpu_bo_reserve(vm->root.bo, false); diff --git a/include/uapi/linux/kfd_ioctl.h b/include/uapi/linux/kfd_ioctl.h index 6e91875c10ba..047bcb1cc078 100644 --- a/include/uapi/linux/kfd_ioctl.h +++ b/include/uapi/linux/kfd_ioctl.h @@ -47,9 +47,10 @@ * - 1.19 - Add a new ioctl to craete secondary kfd processes * - 1.20 - Trap handler support for expert scheduling mode available * - 1.21 - Debugger support to subscribe to LDS out-of-address exceptions + * - 1.22 - Add queue creation with metadata ring base address */ #define KFD_IOCTL_MAJOR_VERSION 1 -#define KFD_IOCTL_MINOR_VERSION 21 +#define KFD_IOCTL_MINOR_VERSION 22 struct kfd_ioctl_get_version_args { __u32 major_version; /* from KFD */ @@ -87,7 +88,7 @@ struct kfd_ioctl_create_queue_args { __u32 ctx_save_restore_size; /* to KFD */ __u32 ctl_stack_size; /* to KFD */ __u32 sdma_engine_id; /* to KFD */ - __u32 pad; + __u32 metadata_ring_size; /* to KFD */ }; struct kfd_ioctl_destroy_queue_args { -- cgit v1.2.3 From 22cd0db47f4f65ebe8afc8c34ab120c47c73da2a Mon Sep 17 00:00:00 2001 From: Jacopo Mondi Date: Mon, 15 Dec 2025 13:08:12 +0100 Subject: media: uapi: mali-c55-config: Remove version identifier The Mali C55 driver uses the v4l2-isp framework, which defines its own versioning number which does not need to be defined again in each platform-specific header. Remove the definition of mali_c55_param_buffer_version enumeration from the Mali C55 uAPI header. Signed-off-by: Jacopo Mondi Reviewed-by: Daniel Scally Signed-off-by: Hans Verkuil --- include/uapi/linux/media/arm/mali-c55-config.h | 9 --------- 1 file changed, 9 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/media/arm/mali-c55-config.h b/include/uapi/linux/media/arm/mali-c55-config.h index 109082c5694f..3d335f950eeb 100644 --- a/include/uapi/linux/media/arm/mali-c55-config.h +++ b/include/uapi/linux/media/arm/mali-c55-config.h @@ -194,15 +194,6 @@ struct mali_c55_stats_buffer { __u32 reserved3[15]; } __attribute__((packed)); -/** - * enum mali_c55_param_buffer_version - Mali-C55 parameters block versioning - * - * @MALI_C55_PARAM_BUFFER_V1: First version of Mali-C55 parameters block - */ -enum mali_c55_param_buffer_version { - MALI_C55_PARAM_BUFFER_V1, -}; - /** * enum mali_c55_param_block_type - Enumeration of Mali-C55 parameter blocks * -- cgit v1.2.3 From 2b421662c7887a0649fe409155a1f101562d0fa9 Mon Sep 17 00:00:00 2001 From: Leon Hwang Date: Wed, 7 Jan 2026 10:20:16 +0800 Subject: bpf: Introduce BPF_F_CPU and BPF_F_ALL_CPUS flags Introduce BPF_F_CPU and BPF_F_ALL_CPUS flags and check them for following APIs: * 'map_lookup_elem()' * 'map_update_elem()' * 'generic_map_lookup_batch()' * 'generic_map_update_batch()' And, get the correct value size for these APIs. Acked-by: Andrii Nakryiko Signed-off-by: Leon Hwang Link: https://lore.kernel.org/r/20260107022022.12843-2-leon.hwang@linux.dev Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 23 ++++++++++++++++++++++- include/uapi/linux/bpf.h | 2 ++ kernel/bpf/syscall.c | 31 +++++++++++++++++-------------- tools/include/uapi/linux/bpf.h | 2 ++ 4 files changed, 43 insertions(+), 15 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index a63e47d2109c..108bab1bda9d 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -3915,14 +3915,35 @@ bpf_prog_update_insn_ptrs(struct bpf_prog *prog, u32 *offsets, void *image) } #endif +static inline bool bpf_map_supports_cpu_flags(enum bpf_map_type map_type) +{ + return false; +} + static inline int bpf_map_check_op_flags(struct bpf_map *map, u64 flags, u64 allowed_flags) { - if (flags & ~allowed_flags) + u32 cpu; + + if ((u32)flags & ~allowed_flags) return -EINVAL; if ((flags & BPF_F_LOCK) && !btf_record_has_field(map->record, BPF_SPIN_LOCK)) return -EINVAL; + if (!(flags & BPF_F_CPU) && flags >> 32) + return -EINVAL; + + if (flags & (BPF_F_CPU | BPF_F_ALL_CPUS)) { + if (!bpf_map_supports_cpu_flags(map->map_type)) + return -EINVAL; + if ((flags & BPF_F_CPU) && (flags & BPF_F_ALL_CPUS)) + return -EINVAL; + + cpu = flags >> 32; + if ((flags & BPF_F_CPU) && cpu >= num_possible_cpus()) + return -ERANGE; + } + return 0; } diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 84ced3ed2d21..2a2ade4be60f 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -1384,6 +1384,8 @@ enum { BPF_NOEXIST = 1, /* create new element if it didn't exist */ BPF_EXIST = 2, /* update existing element */ BPF_F_LOCK = 4, /* spin_lock-ed map_lookup/map_update */ + BPF_F_CPU = 8, /* cpu flag for percpu maps, upper 32-bit of flags is a cpu number */ + BPF_F_ALL_CPUS = 16, /* update value across all CPUs for percpu maps */ }; /* flags for BPF_MAP_CREATE command */ diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 6dd2ad2f9e81..e8cfe9d67e64 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -133,12 +133,14 @@ bool bpf_map_write_active(const struct bpf_map *map) return atomic64_read(&map->writecnt) != 0; } -static u32 bpf_map_value_size(const struct bpf_map *map) -{ - if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH || - map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH || - map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY || - map->map_type == BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE) +static u32 bpf_map_value_size(const struct bpf_map *map, u64 flags) +{ + if (flags & (BPF_F_CPU | BPF_F_ALL_CPUS)) + return map->value_size; + else if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH || + map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH || + map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY || + map->map_type == BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE) return round_up(map->value_size, 8) * num_possible_cpus(); else if (IS_FD_MAP(map)) return sizeof(u32); @@ -1729,7 +1731,7 @@ static int map_lookup_elem(union bpf_attr *attr) if (!(map_get_sys_perms(map, f) & FMODE_CAN_READ)) return -EPERM; - err = bpf_map_check_op_flags(map, attr->flags, BPF_F_LOCK); + err = bpf_map_check_op_flags(map, attr->flags, BPF_F_LOCK | BPF_F_CPU); if (err) return err; @@ -1737,7 +1739,7 @@ static int map_lookup_elem(union bpf_attr *attr) if (IS_ERR(key)) return PTR_ERR(key); - value_size = bpf_map_value_size(map); + value_size = bpf_map_value_size(map, attr->flags); err = -ENOMEM; value = kvmalloc(value_size, GFP_USER | __GFP_NOWARN); @@ -1804,7 +1806,7 @@ static int map_update_elem(union bpf_attr *attr, bpfptr_t uattr) goto err_put; } - value_size = bpf_map_value_size(map); + value_size = bpf_map_value_size(map, attr->flags); value = kvmemdup_bpfptr(uvalue, value_size); if (IS_ERR(value)) { err = PTR_ERR(value); @@ -2000,11 +2002,12 @@ int generic_map_update_batch(struct bpf_map *map, struct file *map_file, void *key, *value; int err = 0; - err = bpf_map_check_op_flags(map, attr->batch.elem_flags, BPF_F_LOCK); + err = bpf_map_check_op_flags(map, attr->batch.elem_flags, + BPF_F_LOCK | BPF_F_CPU | BPF_F_ALL_CPUS); if (err) return err; - value_size = bpf_map_value_size(map); + value_size = bpf_map_value_size(map, attr->batch.elem_flags); max_count = attr->batch.count; if (!max_count) @@ -2059,11 +2062,11 @@ int generic_map_lookup_batch(struct bpf_map *map, u32 value_size, cp, max_count; int err; - err = bpf_map_check_op_flags(map, attr->batch.elem_flags, BPF_F_LOCK); + err = bpf_map_check_op_flags(map, attr->batch.elem_flags, BPF_F_LOCK | BPF_F_CPU); if (err) return err; - value_size = bpf_map_value_size(map); + value_size = bpf_map_value_size(map, attr->batch.elem_flags); max_count = attr->batch.count; if (!max_count) @@ -2185,7 +2188,7 @@ static int map_lookup_and_delete_elem(union bpf_attr *attr) goto err_put; } - value_size = bpf_map_value_size(map); + value_size = bpf_map_value_size(map, 0); err = -ENOMEM; value = kvmalloc(value_size, GFP_USER | __GFP_NOWARN); diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 6b92b0847ec2..b816bc53d2e1 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -1384,6 +1384,8 @@ enum { BPF_NOEXIST = 1, /* create new element if it didn't exist */ BPF_EXIST = 2, /* update existing element */ BPF_F_LOCK = 4, /* spin_lock-ed map_lookup/map_update */ + BPF_F_CPU = 8, /* cpu flag for percpu maps, upper 32-bit of flags is a cpu number */ + BPF_F_ALL_CPUS = 16, /* update value across all CPUs for percpu maps */ }; /* flags for BPF_MAP_CREATE command */ -- cgit v1.2.3 From caa07a815d6ee32586beb66f67e7e3c103a02efd Mon Sep 17 00:00:00 2001 From: Changwoo Min Date: Thu, 8 Jan 2026 14:32:10 +0900 Subject: PM: EM: Rename em.yaml to dev-energymodel.yaml MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The EM YNL specification used many acronyms, including ‘em’, ‘pd’, ‘ps’, etc. While the acronyms are short and convenient, they could be confusing. So, let’s spell them out to be more specific. The following changes were made in the spec. Note that the protocol name cannot exceed GENL_NAMSIZ (16). em -> dev-energymodel pds -> perf-domains pd -> perf-domain pd-id -> perf-domain-id pd-table -> perf-table ps -> perf-state get-pds -> get-perf-domains get-pd-table -> get-perf-table pd-created -> perf-domain-created pd-updated -> perf-domain-updated pd-deleted -> perf-domain-deleted In addition. doc strings were added to the spec. based on the comments in energy_model.h. Two flag attributes (perf-state-flags and perf-domain-flags) were added for easily interpreting the bit flags. Finally, the autogenerated files and em_netlink.c were updated accordingly to reflect the name changes. Suggested-by: Donald Hunter Reviewed-by: Lukasz Luba Reviewed-by: Donald Hunter Signed-off-by: Changwoo Min Link: https://patch.msgid.link/20260108053212.642478-3-changwoo@igalia.com Signed-off-by: Rafael J. Wysocki --- Documentation/netlink/specs/dev-energymodel.yaml | 175 +++++++++++++++++++++++ Documentation/netlink/specs/em.yaml | 116 --------------- MAINTAINERS | 8 +- include/uapi/linux/dev_energymodel.h | 89 ++++++++++++ include/uapi/linux/energy_model.h | 63 -------- kernel/power/em_netlink.c | 135 ++++++++++------- kernel/power/em_netlink_autogen.c | 44 +++--- kernel/power/em_netlink_autogen.h | 20 +-- 8 files changed, 384 insertions(+), 266 deletions(-) create mode 100644 Documentation/netlink/specs/dev-energymodel.yaml delete mode 100644 Documentation/netlink/specs/em.yaml create mode 100644 include/uapi/linux/dev_energymodel.h delete mode 100644 include/uapi/linux/energy_model.h (limited to 'include/uapi/linux') diff --git a/Documentation/netlink/specs/dev-energymodel.yaml b/Documentation/netlink/specs/dev-energymodel.yaml new file mode 100644 index 000000000000..cbc4bc38f23c --- /dev/null +++ b/Documentation/netlink/specs/dev-energymodel.yaml @@ -0,0 +1,175 @@ +# SPDX-License-Identifier: ((GPL-2.0 WITH Linux-syscall-note) OR BSD-3-Clause) +# +# Copyright (c) 2025 Valve Corporation. +# +--- +name: dev-energymodel + +doc: | + Energy model netlink interface to notify its changes. + +protocol: genetlink + +uapi-header: linux/dev_energymodel.h + +definitions: + - + type: flags + name: perf-state-flags + entries: + - + name: perf-state-inefficient + doc: >- + The performance state is inefficient. There is in this perf-domain, + another performance state with a higher frequency but a lower or + equal power cost. + - + type: flags + name: perf-domain-flags + entries: + - + name: perf-domain-microwatts + doc: >- + The power values are in micro-Watts or some other scale. + - + name: perf-domain-skip-inefficiencies + doc: >- + Skip inefficient states when estimating energy consumption. + - + name: perf-domain-artificial + doc: >- + The power values are artificial and might be created by platform + missing real power information. + +attribute-sets: + - + name: perf-domains + doc: >- + Information on all the performance domains. + attributes: + - + name: perf-domain + type: nest + nested-attributes: perf-domain + multi-attr: true + - + name: perf-domain + doc: >- + Information on a single performance domains. + attributes: + - + name: pad + type: pad + - + name: perf-domain-id + type: u32 + doc: >- + A unique ID number for each performance domain. + - + name: flags + type: u64 + doc: >- + Bitmask of performance domain flags. + enum: perf-domain-flags + - + name: cpus + type: string + doc: >- + CPUs that belong to this performance domain. + - + name: perf-table + doc: >- + Performance states table. + attributes: + - + name: perf-domain-id + type: u32 + doc: >- + A unique ID number for each performance domain. + - + name: perf-state + type: nest + nested-attributes: perf-state + multi-attr: true + - + name: perf-state + doc: >- + Performance state of a performance domain. + attributes: + - + name: pad + type: pad + - + name: performance + type: u64 + doc: >- + CPU performance (capacity) at a given frequency. + - + name: frequency + type: u64 + doc: >- + The frequency in KHz, for consistency with CPUFreq. + - + name: power + type: u64 + doc: >- + The power consumed at this level (by 1 CPU or by a registered + device). It can be a total power: static and dynamic. + - + name: cost + type: u64 + doc: >- + The cost coefficient associated with this level, used during energy + calculation. Equal to: power * max_frequency / frequency. + - + name: flags + type: u64 + doc: >- + Bitmask of performance state flags. + enum: perf-state-flags + +operations: + list: + - + name: get-perf-domains + attribute-set: perf-domains + doc: Get the list of information for all performance domains. + do: + reply: + attributes: + - perf-domain + - + name: get-perf-table + attribute-set: perf-table + doc: Get the energy model table of a performance domain. + do: + request: + attributes: + - perf-domain-id + reply: + attributes: + - perf-domain-id + - perf-state + - + name: perf-domain-created + doc: A performance domain is created. + notify: get-perf-table + mcgrp: event + - + name: perf-domain-updated + doc: A performance domain is updated. + notify: get-perf-table + mcgrp: event + - + name: perf-domain-deleted + doc: A performance domain is deleted. + attribute-set: perf-table + event: + attributes: + - perf-domain-id + mcgrp: event + +mcast-groups: + list: + - + name: event diff --git a/Documentation/netlink/specs/em.yaml b/Documentation/netlink/specs/em.yaml deleted file mode 100644 index 0c595a874f08..000000000000 --- a/Documentation/netlink/specs/em.yaml +++ /dev/null @@ -1,116 +0,0 @@ -# SPDX-License-Identifier: ((GPL-2.0 WITH Linux-syscall-note) OR BSD-3-Clause) -# -# Copyright (c) 2025 Valve Corporation. -# ---- -name: em - -doc: | - Energy model netlink interface to notify its changes. - -protocol: genetlink - -uapi-header: linux/energy_model.h - -attribute-sets: - - - name: pds - attributes: - - - name: pd - type: nest - nested-attributes: pd - multi-attr: true - - - name: pd - attributes: - - - name: pad - type: pad - - - name: pd-id - type: u32 - - - name: flags - type: u64 - - - name: cpus - type: string - - - name: pd-table - attributes: - - - name: pd-id - type: u32 - - - name: ps - type: nest - nested-attributes: ps - multi-attr: true - - - name: ps - attributes: - - - name: pad - type: pad - - - name: performance - type: u64 - - - name: frequency - type: u64 - - - name: power - type: u64 - - - name: cost - type: u64 - - - name: flags - type: u64 - -operations: - list: - - - name: get-pds - attribute-set: pds - doc: Get the list of information for all performance domains. - do: - reply: - attributes: - - pd - - - name: get-pd-table - attribute-set: pd-table - doc: Get the energy model table of a performance domain. - do: - request: - attributes: - - pd-id - reply: - attributes: - - pd-id - - ps - - - name: pd-created - doc: A performance domain is created. - notify: get-pd-table - mcgrp: event - - - name: pd-updated - doc: A performance domain is updated. - notify: get-pd-table - mcgrp: event - - - name: pd-deleted - doc: A performance domain is deleted. - attribute-set: pd-table - event: - attributes: - - pd-id - mcgrp: event - -mcast-groups: - list: - - - name: event diff --git a/MAINTAINERS b/MAINTAINERS index 765ad2daa218..1e208243b28e 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -9298,12 +9298,12 @@ M: Lukasz Luba M: "Rafael J. Wysocki" L: linux-pm@vger.kernel.org S: Maintained -F: kernel/power/energy_model.c -F: include/linux/energy_model.h +F: Documentation/netlink/specs/dev-energymodel.yaml F: Documentation/power/energy-model.rst -F: Documentation/netlink/specs/em.yaml -F: include/uapi/linux/energy_model.h +F: include/linux/energy_model.h +F: include/uapi/linux/dev_energymodel.h F: kernel/power/em_netlink*.* +F: kernel/power/energy_model.c EPAPR HYPERVISOR BYTE CHANNEL DEVICE DRIVER M: Laurentiu Tudor diff --git a/include/uapi/linux/dev_energymodel.h b/include/uapi/linux/dev_energymodel.h new file mode 100644 index 000000000000..3399967e1f93 --- /dev/null +++ b/include/uapi/linux/dev_energymodel.h @@ -0,0 +1,89 @@ +/* SPDX-License-Identifier: ((GPL-2.0 WITH Linux-syscall-note) OR BSD-3-Clause) */ +/* Do not edit directly, auto-generated from: */ +/* Documentation/netlink/specs/dev-energymodel.yaml */ +/* YNL-GEN uapi header */ +/* To regenerate run: tools/net/ynl/ynl-regen.sh */ + +#ifndef _UAPI_LINUX_DEV_ENERGYMODEL_H +#define _UAPI_LINUX_DEV_ENERGYMODEL_H + +#define DEV_ENERGYMODEL_FAMILY_NAME "dev-energymodel" +#define DEV_ENERGYMODEL_FAMILY_VERSION 1 + +/** + * enum dev_energymodel_perf_state_flags + * @DEV_ENERGYMODEL_PERF_STATE_FLAGS_PERF_STATE_INEFFICIENT: The performance + * state is inefficient. There is in this perf-domain, another performance + * state with a higher frequency but a lower or equal power cost. + */ +enum dev_energymodel_perf_state_flags { + DEV_ENERGYMODEL_PERF_STATE_FLAGS_PERF_STATE_INEFFICIENT = 1, +}; + +/** + * enum dev_energymodel_perf_domain_flags + * @DEV_ENERGYMODEL_PERF_DOMAIN_FLAGS_PERF_DOMAIN_MICROWATTS: The power values + * are in micro-Watts or some other scale. + * @DEV_ENERGYMODEL_PERF_DOMAIN_FLAGS_PERF_DOMAIN_SKIP_INEFFICIENCIES: Skip + * inefficient states when estimating energy consumption. + * @DEV_ENERGYMODEL_PERF_DOMAIN_FLAGS_PERF_DOMAIN_ARTIFICIAL: The power values + * are artificial and might be created by platform missing real power + * information. + */ +enum dev_energymodel_perf_domain_flags { + DEV_ENERGYMODEL_PERF_DOMAIN_FLAGS_PERF_DOMAIN_MICROWATTS = 1, + DEV_ENERGYMODEL_PERF_DOMAIN_FLAGS_PERF_DOMAIN_SKIP_INEFFICIENCIES = 2, + DEV_ENERGYMODEL_PERF_DOMAIN_FLAGS_PERF_DOMAIN_ARTIFICIAL = 4, +}; + +enum { + DEV_ENERGYMODEL_A_PERF_DOMAINS_PERF_DOMAIN = 1, + + __DEV_ENERGYMODEL_A_PERF_DOMAINS_MAX, + DEV_ENERGYMODEL_A_PERF_DOMAINS_MAX = (__DEV_ENERGYMODEL_A_PERF_DOMAINS_MAX - 1) +}; + +enum { + DEV_ENERGYMODEL_A_PERF_DOMAIN_PAD = 1, + DEV_ENERGYMODEL_A_PERF_DOMAIN_PERF_DOMAIN_ID, + DEV_ENERGYMODEL_A_PERF_DOMAIN_FLAGS, + DEV_ENERGYMODEL_A_PERF_DOMAIN_CPUS, + + __DEV_ENERGYMODEL_A_PERF_DOMAIN_MAX, + DEV_ENERGYMODEL_A_PERF_DOMAIN_MAX = (__DEV_ENERGYMODEL_A_PERF_DOMAIN_MAX - 1) +}; + +enum { + DEV_ENERGYMODEL_A_PERF_TABLE_PERF_DOMAIN_ID = 1, + DEV_ENERGYMODEL_A_PERF_TABLE_PERF_STATE, + + __DEV_ENERGYMODEL_A_PERF_TABLE_MAX, + DEV_ENERGYMODEL_A_PERF_TABLE_MAX = (__DEV_ENERGYMODEL_A_PERF_TABLE_MAX - 1) +}; + +enum { + DEV_ENERGYMODEL_A_PERF_STATE_PAD = 1, + DEV_ENERGYMODEL_A_PERF_STATE_PERFORMANCE, + DEV_ENERGYMODEL_A_PERF_STATE_FREQUENCY, + DEV_ENERGYMODEL_A_PERF_STATE_POWER, + DEV_ENERGYMODEL_A_PERF_STATE_COST, + DEV_ENERGYMODEL_A_PERF_STATE_FLAGS, + + __DEV_ENERGYMODEL_A_PERF_STATE_MAX, + DEV_ENERGYMODEL_A_PERF_STATE_MAX = (__DEV_ENERGYMODEL_A_PERF_STATE_MAX - 1) +}; + +enum { + DEV_ENERGYMODEL_CMD_GET_PERF_DOMAINS = 1, + DEV_ENERGYMODEL_CMD_GET_PERF_TABLE, + DEV_ENERGYMODEL_CMD_PERF_DOMAIN_CREATED, + DEV_ENERGYMODEL_CMD_PERF_DOMAIN_UPDATED, + DEV_ENERGYMODEL_CMD_PERF_DOMAIN_DELETED, + + __DEV_ENERGYMODEL_CMD_MAX, + DEV_ENERGYMODEL_CMD_MAX = (__DEV_ENERGYMODEL_CMD_MAX - 1) +}; + +#define DEV_ENERGYMODEL_MCGRP_EVENT "event" + +#endif /* _UAPI_LINUX_DEV_ENERGYMODEL_H */ diff --git a/include/uapi/linux/energy_model.h b/include/uapi/linux/energy_model.h deleted file mode 100644 index 0bcad967854f..000000000000 --- a/include/uapi/linux/energy_model.h +++ /dev/null @@ -1,63 +0,0 @@ -/* SPDX-License-Identifier: ((GPL-2.0 WITH Linux-syscall-note) OR BSD-3-Clause) */ -/* Do not edit directly, auto-generated from: */ -/* Documentation/netlink/specs/em.yaml */ -/* YNL-GEN uapi header */ -/* To regenerate run: tools/net/ynl/ynl-regen.sh */ - -#ifndef _UAPI_LINUX_ENERGY_MODEL_H -#define _UAPI_LINUX_ENERGY_MODEL_H - -#define EM_FAMILY_NAME "em" -#define EM_FAMILY_VERSION 1 - -enum { - EM_A_PDS_PD = 1, - - __EM_A_PDS_MAX, - EM_A_PDS_MAX = (__EM_A_PDS_MAX - 1) -}; - -enum { - EM_A_PD_PAD = 1, - EM_A_PD_PD_ID, - EM_A_PD_FLAGS, - EM_A_PD_CPUS, - - __EM_A_PD_MAX, - EM_A_PD_MAX = (__EM_A_PD_MAX - 1) -}; - -enum { - EM_A_PD_TABLE_PD_ID = 1, - EM_A_PD_TABLE_PS, - - __EM_A_PD_TABLE_MAX, - EM_A_PD_TABLE_MAX = (__EM_A_PD_TABLE_MAX - 1) -}; - -enum { - EM_A_PS_PAD = 1, - EM_A_PS_PERFORMANCE, - EM_A_PS_FREQUENCY, - EM_A_PS_POWER, - EM_A_PS_COST, - EM_A_PS_FLAGS, - - __EM_A_PS_MAX, - EM_A_PS_MAX = (__EM_A_PS_MAX - 1) -}; - -enum { - EM_CMD_GET_PDS = 1, - EM_CMD_GET_PD_TABLE, - EM_CMD_PD_CREATED, - EM_CMD_PD_UPDATED, - EM_CMD_PD_DELETED, - - __EM_CMD_MAX, - EM_CMD_MAX = (__EM_CMD_MAX - 1) -}; - -#define EM_MCGRP_EVENT "event" - -#endif /* _UAPI_LINUX_ENERGY_MODEL_H */ diff --git a/kernel/power/em_netlink.c b/kernel/power/em_netlink.c index 4b85da138a06..6f6238c465bb 100644 --- a/kernel/power/em_netlink.c +++ b/kernel/power/em_netlink.c @@ -12,27 +12,31 @@ #include #include #include -#include +#include #include "em_netlink.h" #include "em_netlink_autogen.h" -#define EM_A_PD_CPUS_LEN 256 +#define DEV_ENERGYMODEL_A_PERF_DOMAIN_CPUS_LEN 256 /*************************** Command encoding ********************************/ static int __em_nl_get_pd_size(struct em_perf_domain *pd, void *data) { - char cpus_buf[EM_A_PD_CPUS_LEN]; + char cpus_buf[DEV_ENERGYMODEL_A_PERF_DOMAIN_CPUS_LEN]; int *tot_msg_sz = data; int msg_sz, cpus_sz; cpus_sz = snprintf(cpus_buf, sizeof(cpus_buf), "%*pb", cpumask_pr_args(to_cpumask(pd->cpus))); - msg_sz = nla_total_size(0) + /* EM_A_PDS_PD */ - nla_total_size(sizeof(u32)) + /* EM_A_PD_PD_ID */ - nla_total_size_64bit(sizeof(u64)) + /* EM_A_PD_FLAGS */ - nla_total_size(cpus_sz); /* EM_A_PD_CPUS */ + msg_sz = nla_total_size(0) + + /* DEV_ENERGYMODEL_A_PERF_DOMAINS_PERF_DOMAIN */ + nla_total_size(sizeof(u32)) + + /* DEV_ENERGYMODEL_A_PERF_DOMAIN_PERF_DOMAIN_ID */ + nla_total_size_64bit(sizeof(u64)) + + /* DEV_ENERGYMODEL_A_PERF_DOMAIN_FLAGS */ + nla_total_size(cpus_sz); + /* DEV_ENERGYMODEL_A_PERF_DOMAIN_CPUS */ *tot_msg_sz += nlmsg_total_size(genlmsg_msg_size(msg_sz)); return 0; @@ -40,23 +44,26 @@ static int __em_nl_get_pd_size(struct em_perf_domain *pd, void *data) static int __em_nl_get_pd(struct em_perf_domain *pd, void *data) { - char cpus_buf[EM_A_PD_CPUS_LEN]; + char cpus_buf[DEV_ENERGYMODEL_A_PERF_DOMAIN_CPUS_LEN]; struct sk_buff *msg = data; struct nlattr *entry; - entry = nla_nest_start(msg, EM_A_PDS_PD); + entry = nla_nest_start(msg, + DEV_ENERGYMODEL_A_PERF_DOMAINS_PERF_DOMAIN); if (!entry) goto out_cancel_nest; - if (nla_put_u32(msg, EM_A_PD_PD_ID, pd->id)) + if (nla_put_u32(msg, DEV_ENERGYMODEL_A_PERF_DOMAIN_PERF_DOMAIN_ID, + pd->id)) goto out_cancel_nest; - if (nla_put_u64_64bit(msg, EM_A_PD_FLAGS, pd->flags, EM_A_PD_PAD)) + if (nla_put_u64_64bit(msg, DEV_ENERGYMODEL_A_PERF_DOMAIN_FLAGS, + pd->flags, DEV_ENERGYMODEL_A_PERF_DOMAIN_PAD)) goto out_cancel_nest; snprintf(cpus_buf, sizeof(cpus_buf), "%*pb", cpumask_pr_args(to_cpumask(pd->cpus))); - if (nla_put_string(msg, EM_A_PD_CPUS, cpus_buf)) + if (nla_put_string(msg, DEV_ENERGYMODEL_A_PERF_DOMAIN_CPUS, cpus_buf)) goto out_cancel_nest; nla_nest_end(msg, entry); @@ -69,7 +76,8 @@ out_cancel_nest: return -EMSGSIZE; } -int em_nl_get_pds_doit(struct sk_buff *skb, struct genl_info *info) +int dev_energymodel_nl_get_perf_domains_doit(struct sk_buff *skb, + struct genl_info *info) { struct sk_buff *msg; void *hdr; @@ -82,7 +90,7 @@ int em_nl_get_pds_doit(struct sk_buff *skb, struct genl_info *info) if (!msg) return -ENOMEM; - hdr = genlmsg_put_reply(msg, info, &em_nl_family, 0, cmd); + hdr = genlmsg_put_reply(msg, info, &dev_energymodel_nl_family, 0, cmd); if (!hdr) goto out_free_msg; @@ -107,10 +115,10 @@ static struct em_perf_domain *__em_nl_get_pd_table_id(struct nlattr **attrs) struct em_perf_domain *pd; int id; - if (!attrs[EM_A_PD_TABLE_PD_ID]) + if (!attrs[DEV_ENERGYMODEL_A_PERF_TABLE_PERF_DOMAIN_ID]) return NULL; - id = nla_get_u32(attrs[EM_A_PD_TABLE_PD_ID]); + id = nla_get_u32(attrs[DEV_ENERGYMODEL_A_PERF_TABLE_PERF_DOMAIN_ID]); pd = em_perf_domain_get_by_id(id); return pd; } @@ -119,25 +127,34 @@ static int __em_nl_get_pd_table_size(const struct em_perf_domain *pd) { int id_sz, ps_sz; - id_sz = nla_total_size(sizeof(u32)); /* EM_A_PD_TABLE_PD_ID */ - ps_sz = nla_total_size(0) + /* EM_A_PD_TABLE_PS */ - nla_total_size_64bit(sizeof(u64)) + /* EM_A_PS_PERFORMANCE */ - nla_total_size_64bit(sizeof(u64)) + /* EM_A_PS_FREQUENCY */ - nla_total_size_64bit(sizeof(u64)) + /* EM_A_PS_POWER */ - nla_total_size_64bit(sizeof(u64)) + /* EM_A_PS_COST */ - nla_total_size_64bit(sizeof(u64)); /* EM_A_PS_FLAGS */ + id_sz = nla_total_size(sizeof(u32)); + /* DEV_ENERGYMODEL_A_PERF_TABLE_PERF_DOMAIN_ID */ + ps_sz = nla_total_size(0) + + /* DEV_ENERGYMODEL_A_PERF_TABLE_PERF_STATE */ + nla_total_size_64bit(sizeof(u64)) + + /* DEV_ENERGYMODEL_A_PERF_STATE_PERFORMANCE */ + nla_total_size_64bit(sizeof(u64)) + + /* DEV_ENERGYMODEL_A_PERF_STATE_FREQUENCY */ + nla_total_size_64bit(sizeof(u64)) + + /* DEV_ENERGYMODEL_A_PERF_STATE_POWER */ + nla_total_size_64bit(sizeof(u64)) + + /* DEV_ENERGYMODEL_A_PERF_STATE_COST */ + nla_total_size_64bit(sizeof(u64)); + /* DEV_ENERGYMODEL_A_PERF_STATE_FLAGS */ ps_sz *= pd->nr_perf_states; return nlmsg_total_size(genlmsg_msg_size(id_sz + ps_sz)); } -static int __em_nl_get_pd_table(struct sk_buff *msg, const struct em_perf_domain *pd) +static +int __em_nl_get_pd_table(struct sk_buff *msg, const struct em_perf_domain *pd) { struct em_perf_state *table, *ps; struct nlattr *entry; int i; - if (nla_put_u32(msg, EM_A_PD_TABLE_PD_ID, pd->id)) + if (nla_put_u32(msg, DEV_ENERGYMODEL_A_PERF_TABLE_PERF_DOMAIN_ID, + pd->id)) goto out_err; rcu_read_lock(); @@ -146,24 +163,35 @@ static int __em_nl_get_pd_table(struct sk_buff *msg, const struct em_perf_domain for (i = 0; i < pd->nr_perf_states; i++) { ps = &table[i]; - entry = nla_nest_start(msg, EM_A_PD_TABLE_PS); + entry = nla_nest_start(msg, + DEV_ENERGYMODEL_A_PERF_TABLE_PERF_STATE); if (!entry) goto out_unlock_ps; - if (nla_put_u64_64bit(msg, EM_A_PS_PERFORMANCE, - ps->performance, EM_A_PS_PAD)) + if (nla_put_u64_64bit(msg, + DEV_ENERGYMODEL_A_PERF_STATE_PERFORMANCE, + ps->performance, + DEV_ENERGYMODEL_A_PERF_STATE_PAD)) goto out_cancel_ps_nest; - if (nla_put_u64_64bit(msg, EM_A_PS_FREQUENCY, - ps->frequency, EM_A_PS_PAD)) + if (nla_put_u64_64bit(msg, + DEV_ENERGYMODEL_A_PERF_STATE_FREQUENCY, + ps->frequency, + DEV_ENERGYMODEL_A_PERF_STATE_PAD)) goto out_cancel_ps_nest; - if (nla_put_u64_64bit(msg, EM_A_PS_POWER, - ps->power, EM_A_PS_PAD)) + if (nla_put_u64_64bit(msg, + DEV_ENERGYMODEL_A_PERF_STATE_POWER, + ps->power, + DEV_ENERGYMODEL_A_PERF_STATE_PAD)) goto out_cancel_ps_nest; - if (nla_put_u64_64bit(msg, EM_A_PS_COST, - ps->cost, EM_A_PS_PAD)) + if (nla_put_u64_64bit(msg, + DEV_ENERGYMODEL_A_PERF_STATE_COST, + ps->cost, + DEV_ENERGYMODEL_A_PERF_STATE_PAD)) goto out_cancel_ps_nest; - if (nla_put_u64_64bit(msg, EM_A_PS_FLAGS, - ps->flags, EM_A_PS_PAD)) + if (nla_put_u64_64bit(msg, + DEV_ENERGYMODEL_A_PERF_STATE_FLAGS, + ps->flags, + DEV_ENERGYMODEL_A_PERF_STATE_PAD)) goto out_cancel_ps_nest; nla_nest_end(msg, entry); @@ -179,7 +207,8 @@ out_err: return -EMSGSIZE; } -int em_nl_get_pd_table_doit(struct sk_buff *skb, struct genl_info *info) +int dev_energymodel_nl_get_perf_table_doit(struct sk_buff *skb, + struct genl_info *info) { int cmd = info->genlhdr->cmd; int msg_sz, ret = -EMSGSIZE; @@ -197,7 +226,7 @@ int em_nl_get_pd_table_doit(struct sk_buff *skb, struct genl_info *info) if (!msg) return -ENOMEM; - hdr = genlmsg_put_reply(msg, info, &em_nl_family, 0, cmd); + hdr = genlmsg_put_reply(msg, info, &dev_energymodel_nl_family, 0, cmd); if (!hdr) goto out_free_msg; @@ -221,7 +250,7 @@ static void __em_notify_pd_table(const struct em_perf_domain *pd, int ntf_type) int msg_sz, ret = -EMSGSIZE; void *hdr; - if (!genl_has_listeners(&em_nl_family, &init_net, EM_NLGRP_EVENT)) + if (!genl_has_listeners(&dev_energymodel_nl_family, &init_net, DEV_ENERGYMODEL_NLGRP_EVENT)) return; msg_sz = __em_nl_get_pd_table_size(pd); @@ -230,7 +259,7 @@ static void __em_notify_pd_table(const struct em_perf_domain *pd, int ntf_type) if (!msg) return; - hdr = genlmsg_put(msg, 0, 0, &em_nl_family, 0, ntf_type); + hdr = genlmsg_put(msg, 0, 0, &dev_energymodel_nl_family, 0, ntf_type); if (!hdr) goto out_free_msg; @@ -240,28 +269,28 @@ static void __em_notify_pd_table(const struct em_perf_domain *pd, int ntf_type) genlmsg_end(msg, hdr); - genlmsg_multicast(&em_nl_family, msg, 0, EM_NLGRP_EVENT, GFP_KERNEL); + genlmsg_multicast(&dev_energymodel_nl_family, msg, 0, + DEV_ENERGYMODEL_NLGRP_EVENT, GFP_KERNEL); return; out_free_msg: nlmsg_free(msg); - return; } void em_notify_pd_created(const struct em_perf_domain *pd) { - __em_notify_pd_table(pd, EM_CMD_PD_CREATED); + __em_notify_pd_table(pd, DEV_ENERGYMODEL_CMD_PERF_DOMAIN_CREATED); } void em_notify_pd_updated(const struct em_perf_domain *pd) { - __em_notify_pd_table(pd, EM_CMD_PD_UPDATED); + __em_notify_pd_table(pd, DEV_ENERGYMODEL_CMD_PERF_DOMAIN_UPDATED); } static int __em_notify_pd_deleted_size(const struct em_perf_domain *pd) { - int id_sz = nla_total_size(sizeof(u32)); /* EM_A_PD_TABLE_PD_ID */ + int id_sz = nla_total_size(sizeof(u32)); /* DEV_ENERGYMODEL_A_PERF_TABLE_PERF_DOMAIN_ID */ return nlmsg_total_size(genlmsg_msg_size(id_sz)); } @@ -272,7 +301,8 @@ void em_notify_pd_deleted(const struct em_perf_domain *pd) void *hdr; int msg_sz; - if (!genl_has_listeners(&em_nl_family, &init_net, EM_NLGRP_EVENT)) + if (!genl_has_listeners(&dev_energymodel_nl_family, &init_net, + DEV_ENERGYMODEL_NLGRP_EVENT)) return; msg_sz = __em_notify_pd_deleted_size(pd); @@ -281,28 +311,29 @@ void em_notify_pd_deleted(const struct em_perf_domain *pd) if (!msg) return; - hdr = genlmsg_put(msg, 0, 0, &em_nl_family, 0, EM_CMD_PD_DELETED); + hdr = genlmsg_put(msg, 0, 0, &dev_energymodel_nl_family, 0, + DEV_ENERGYMODEL_CMD_PERF_DOMAIN_DELETED); if (!hdr) goto out_free_msg; - if (nla_put_u32(msg, EM_A_PD_TABLE_PD_ID, pd->id)) { + if (nla_put_u32(msg, DEV_ENERGYMODEL_A_PERF_TABLE_PERF_DOMAIN_ID, + pd->id)) goto out_free_msg; - } genlmsg_end(msg, hdr); - genlmsg_multicast(&em_nl_family, msg, 0, EM_NLGRP_EVENT, GFP_KERNEL); + genlmsg_multicast(&dev_energymodel_nl_family, msg, 0, + DEV_ENERGYMODEL_NLGRP_EVENT, GFP_KERNEL); return; out_free_msg: nlmsg_free(msg); - return; } /**************************** Initialization *********************************/ static int __init em_netlink_init(void) { - return genl_register_family(&em_nl_family); + return genl_register_family(&dev_energymodel_nl_family); } postcore_initcall(em_netlink_init); diff --git a/kernel/power/em_netlink_autogen.c b/kernel/power/em_netlink_autogen.c index ceb3b2bb6ebe..44acef0e7df2 100644 --- a/kernel/power/em_netlink_autogen.c +++ b/kernel/power/em_netlink_autogen.c @@ -1,6 +1,6 @@ // SPDX-License-Identifier: ((GPL-2.0 WITH Linux-syscall-note) OR BSD-3-Clause) /* Do not edit directly, auto-generated from: */ -/* Documentation/netlink/specs/em.yaml */ +/* Documentation/netlink/specs/dev-energymodel.yaml */ /* YNL-GEN kernel source */ /* To regenerate run: tools/net/ynl/ynl-regen.sh */ @@ -9,41 +9,41 @@ #include "em_netlink_autogen.h" -#include +#include -/* EM_CMD_GET_PD_TABLE - do */ -static const struct nla_policy em_get_pd_table_nl_policy[EM_A_PD_TABLE_PD_ID + 1] = { - [EM_A_PD_TABLE_PD_ID] = { .type = NLA_U32, }, +/* DEV_ENERGYMODEL_CMD_GET_PERF_TABLE - do */ +static const struct nla_policy dev_energymodel_get_perf_table_nl_policy[DEV_ENERGYMODEL_A_PERF_TABLE_PERF_DOMAIN_ID + 1] = { + [DEV_ENERGYMODEL_A_PERF_TABLE_PERF_DOMAIN_ID] = { .type = NLA_U32, }, }; -/* Ops table for em */ -static const struct genl_split_ops em_nl_ops[] = { +/* Ops table for dev_energymodel */ +static const struct genl_split_ops dev_energymodel_nl_ops[] = { { - .cmd = EM_CMD_GET_PDS, - .doit = em_nl_get_pds_doit, + .cmd = DEV_ENERGYMODEL_CMD_GET_PERF_DOMAINS, + .doit = dev_energymodel_nl_get_perf_domains_doit, .flags = GENL_CMD_CAP_DO, }, { - .cmd = EM_CMD_GET_PD_TABLE, - .doit = em_nl_get_pd_table_doit, - .policy = em_get_pd_table_nl_policy, - .maxattr = EM_A_PD_TABLE_PD_ID, + .cmd = DEV_ENERGYMODEL_CMD_GET_PERF_TABLE, + .doit = dev_energymodel_nl_get_perf_table_doit, + .policy = dev_energymodel_get_perf_table_nl_policy, + .maxattr = DEV_ENERGYMODEL_A_PERF_TABLE_PERF_DOMAIN_ID, .flags = GENL_CMD_CAP_DO, }, }; -static const struct genl_multicast_group em_nl_mcgrps[] = { - [EM_NLGRP_EVENT] = { "event", }, +static const struct genl_multicast_group dev_energymodel_nl_mcgrps[] = { + [DEV_ENERGYMODEL_NLGRP_EVENT] = { "event", }, }; -struct genl_family em_nl_family __ro_after_init = { - .name = EM_FAMILY_NAME, - .version = EM_FAMILY_VERSION, +struct genl_family dev_energymodel_nl_family __ro_after_init = { + .name = DEV_ENERGYMODEL_FAMILY_NAME, + .version = DEV_ENERGYMODEL_FAMILY_VERSION, .netnsok = true, .parallel_ops = true, .module = THIS_MODULE, - .split_ops = em_nl_ops, - .n_split_ops = ARRAY_SIZE(em_nl_ops), - .mcgrps = em_nl_mcgrps, - .n_mcgrps = ARRAY_SIZE(em_nl_mcgrps), + .split_ops = dev_energymodel_nl_ops, + .n_split_ops = ARRAY_SIZE(dev_energymodel_nl_ops), + .mcgrps = dev_energymodel_nl_mcgrps, + .n_mcgrps = ARRAY_SIZE(dev_energymodel_nl_mcgrps), }; diff --git a/kernel/power/em_netlink_autogen.h b/kernel/power/em_netlink_autogen.h index 140ab548103c..f7e4bddcbd53 100644 --- a/kernel/power/em_netlink_autogen.h +++ b/kernel/power/em_netlink_autogen.h @@ -1,24 +1,26 @@ /* SPDX-License-Identifier: ((GPL-2.0 WITH Linux-syscall-note) OR BSD-3-Clause) */ /* Do not edit directly, auto-generated from: */ -/* Documentation/netlink/specs/em.yaml */ +/* Documentation/netlink/specs/dev-energymodel.yaml */ /* YNL-GEN kernel header */ /* To regenerate run: tools/net/ynl/ynl-regen.sh */ -#ifndef _LINUX_EM_GEN_H -#define _LINUX_EM_GEN_H +#ifndef _LINUX_DEV_ENERGYMODEL_GEN_H +#define _LINUX_DEV_ENERGYMODEL_GEN_H #include #include -#include +#include -int em_nl_get_pds_doit(struct sk_buff *skb, struct genl_info *info); -int em_nl_get_pd_table_doit(struct sk_buff *skb, struct genl_info *info); +int dev_energymodel_nl_get_perf_domains_doit(struct sk_buff *skb, + struct genl_info *info); +int dev_energymodel_nl_get_perf_table_doit(struct sk_buff *skb, + struct genl_info *info); enum { - EM_NLGRP_EVENT, + DEV_ENERGYMODEL_NLGRP_EVENT, }; -extern struct genl_family em_nl_family; +extern struct genl_family dev_energymodel_nl_family; -#endif /* _LINUX_EM_GEN_H */ +#endif /* _LINUX_DEV_ENERGYMODEL_GEN_H */ -- cgit v1.2.3 From 380ff27af25e49e2cb2ff8fd0ecd7c95be2976ee Mon Sep 17 00:00:00 2001 From: Changwoo Min Date: Thu, 8 Jan 2026 14:32:12 +0900 Subject: PM: EM: Add dump to get-perf-domains in the EM YNL spec Add dump to get-perf-domains, so that a user can fetch either information about a specific performance domain with do or information about all performance domains with dump. Share the reply format of do and dump using perf-domain-attrs, so remove perf-domains. The YNL spec, autogenerated files, and the do implementation are updated, and the dump implementation is added. Suggested-by: Donald Hunter Reviewed-by: Lukasz Luba Reviewed-by: Donald Hunter Signed-off-by: Changwoo Min Link: https://patch.msgid.link/20260108053212.642478-5-changwoo@igalia.com Signed-off-by: Rafael J. Wysocki --- Documentation/netlink/specs/dev-energymodel.yaml | 25 +++++---- include/uapi/linux/dev_energymodel.h | 7 --- kernel/power/em_netlink.c | 68 ++++++++++++++++++------ kernel/power/em_netlink_autogen.c | 16 +++++- kernel/power/em_netlink_autogen.h | 2 + 5 files changed, 80 insertions(+), 38 deletions(-) (limited to 'include/uapi/linux') diff --git a/Documentation/netlink/specs/dev-energymodel.yaml b/Documentation/netlink/specs/dev-energymodel.yaml index af8b8f72f722..11faabfdfbe8 100644 --- a/Documentation/netlink/specs/dev-energymodel.yaml +++ b/Documentation/netlink/specs/dev-energymodel.yaml @@ -42,16 +42,6 @@ definitions: missing real power information. attribute-sets: - - - name: perf-domains - doc: >- - Information on all the performance domains. - attributes: - - - name: perf-domain - type: nest - nested-attributes: perf-domain - multi-attr: true - name: perf-domain doc: >- @@ -133,12 +123,21 @@ operations: list: - name: get-perf-domains - attribute-set: perf-domains + attribute-set: perf-domain doc: Get the list of information for all performance domains. do: - reply: + request: attributes: - - perf-domain + - perf-domain-id + reply: + attributes: &perf-domain-attrs + - pad + - perf-domain-id + - flags + - cpus + dump: + reply: + attributes: *perf-domain-attrs - name: get-perf-table attribute-set: perf-table diff --git a/include/uapi/linux/dev_energymodel.h b/include/uapi/linux/dev_energymodel.h index 3399967e1f93..355d8885c9a0 100644 --- a/include/uapi/linux/dev_energymodel.h +++ b/include/uapi/linux/dev_energymodel.h @@ -36,13 +36,6 @@ enum dev_energymodel_perf_domain_flags { DEV_ENERGYMODEL_PERF_DOMAIN_FLAGS_PERF_DOMAIN_ARTIFICIAL = 4, }; -enum { - DEV_ENERGYMODEL_A_PERF_DOMAINS_PERF_DOMAIN = 1, - - __DEV_ENERGYMODEL_A_PERF_DOMAINS_MAX, - DEV_ENERGYMODEL_A_PERF_DOMAINS_MAX = (__DEV_ENERGYMODEL_A_PERF_DOMAINS_MAX - 1) -}; - enum { DEV_ENERGYMODEL_A_PERF_DOMAIN_PAD = 1, DEV_ENERGYMODEL_A_PERF_DOMAIN_PERF_DOMAIN_ID, diff --git a/kernel/power/em_netlink.c b/kernel/power/em_netlink.c index b6edb018c65a..5a611d3950fd 100644 --- a/kernel/power/em_netlink.c +++ b/kernel/power/em_netlink.c @@ -18,6 +18,13 @@ #include "em_netlink_autogen.h" /*************************** Command encoding ********************************/ +struct dump_ctx { + int idx; + int start; + struct sk_buff *skb; + struct netlink_callback *cb; +}; + static int __em_nl_get_pd_size(struct em_perf_domain *pd, void *data) { int nr_cpus, msg_sz, cpus_sz; @@ -43,14 +50,8 @@ static int __em_nl_get_pd(struct em_perf_domain *pd, void *data) { struct sk_buff *msg = data; struct cpumask *cpumask; - struct nlattr *entry; int cpu; - entry = nla_nest_start(msg, - DEV_ENERGYMODEL_A_PERF_DOMAINS_PERF_DOMAIN); - if (!entry) - goto out_cancel_nest; - if (nla_put_u32(msg, DEV_ENERGYMODEL_A_PERF_DOMAIN_PERF_DOMAIN_ID, pd->id)) goto out_cancel_nest; @@ -66,26 +67,50 @@ static int __em_nl_get_pd(struct em_perf_domain *pd, void *data) goto out_cancel_nest; } - nla_nest_end(msg, entry); - return 0; out_cancel_nest: - nla_nest_cancel(msg, entry); - return -EMSGSIZE; } +static int __em_nl_get_pd_for_dump(struct em_perf_domain *pd, void *data) +{ + const struct genl_info *info; + struct dump_ctx *ctx = data; + void *hdr; + int ret; + + if (ctx->idx++ < ctx->start) + return 0; + + info = genl_info_dump(ctx->cb); + hdr = genlmsg_iput(ctx->skb, info); + if (!hdr) { + genlmsg_cancel(ctx->skb, hdr); + return -EMSGSIZE; + } + + ret = __em_nl_get_pd(pd, ctx->skb); + genlmsg_end(ctx->skb, hdr); + return ret; +} + int dev_energymodel_nl_get_perf_domains_doit(struct sk_buff *skb, struct genl_info *info) { + int id, ret = -EMSGSIZE, msg_sz = 0; + int cmd = info->genlhdr->cmd; + struct em_perf_domain *pd; struct sk_buff *msg; void *hdr; - int cmd = info->genlhdr->cmd; - int ret = -EMSGSIZE, msg_sz = 0; - for_each_em_perf_domain(__em_nl_get_pd_size, &msg_sz); + if (!info->attrs[DEV_ENERGYMODEL_A_PERF_DOMAIN_PERF_DOMAIN_ID]) + return -EINVAL; + id = nla_get_u32(info->attrs[DEV_ENERGYMODEL_A_PERF_DOMAIN_PERF_DOMAIN_ID]); + pd = em_perf_domain_get_by_id(id); + + __em_nl_get_pd_size(pd, &msg_sz); msg = genlmsg_new(msg_sz, GFP_KERNEL); if (!msg) return -ENOMEM; @@ -94,10 +119,9 @@ int dev_energymodel_nl_get_perf_domains_doit(struct sk_buff *skb, if (!hdr) goto out_free_msg; - ret = for_each_em_perf_domain(__em_nl_get_pd, msg); + ret = __em_nl_get_pd(pd, msg); if (ret) goto out_cancel_msg; - genlmsg_end(msg, hdr); return genlmsg_reply(msg, info); @@ -106,10 +130,22 @@ out_cancel_msg: genlmsg_cancel(msg, hdr); out_free_msg: nlmsg_free(msg); - return ret; } +int dev_energymodel_nl_get_perf_domains_dumpit(struct sk_buff *skb, + struct netlink_callback *cb) +{ + struct dump_ctx ctx = { + .idx = 0, + .start = cb->args[0], + .skb = skb, + .cb = cb, + }; + + return for_each_em_perf_domain(__em_nl_get_pd_for_dump, &ctx); +} + static struct em_perf_domain *__em_nl_get_pd_table_id(struct nlattr **attrs) { struct em_perf_domain *pd; diff --git a/kernel/power/em_netlink_autogen.c b/kernel/power/em_netlink_autogen.c index 44acef0e7df2..fedd473e4244 100644 --- a/kernel/power/em_netlink_autogen.c +++ b/kernel/power/em_netlink_autogen.c @@ -11,6 +11,11 @@ #include +/* DEV_ENERGYMODEL_CMD_GET_PERF_DOMAINS - do */ +static const struct nla_policy dev_energymodel_get_perf_domains_nl_policy[DEV_ENERGYMODEL_A_PERF_DOMAIN_PERF_DOMAIN_ID + 1] = { + [DEV_ENERGYMODEL_A_PERF_DOMAIN_PERF_DOMAIN_ID] = { .type = NLA_U32, }, +}; + /* DEV_ENERGYMODEL_CMD_GET_PERF_TABLE - do */ static const struct nla_policy dev_energymodel_get_perf_table_nl_policy[DEV_ENERGYMODEL_A_PERF_TABLE_PERF_DOMAIN_ID + 1] = { [DEV_ENERGYMODEL_A_PERF_TABLE_PERF_DOMAIN_ID] = { .type = NLA_U32, }, @@ -18,10 +23,17 @@ static const struct nla_policy dev_energymodel_get_perf_table_nl_policy[DEV_ENER /* Ops table for dev_energymodel */ static const struct genl_split_ops dev_energymodel_nl_ops[] = { + { + .cmd = DEV_ENERGYMODEL_CMD_GET_PERF_DOMAINS, + .doit = dev_energymodel_nl_get_perf_domains_doit, + .policy = dev_energymodel_get_perf_domains_nl_policy, + .maxattr = DEV_ENERGYMODEL_A_PERF_DOMAIN_PERF_DOMAIN_ID, + .flags = GENL_CMD_CAP_DO, + }, { .cmd = DEV_ENERGYMODEL_CMD_GET_PERF_DOMAINS, - .doit = dev_energymodel_nl_get_perf_domains_doit, - .flags = GENL_CMD_CAP_DO, + .dumpit = dev_energymodel_nl_get_perf_domains_dumpit, + .flags = GENL_CMD_CAP_DUMP, }, { .cmd = DEV_ENERGYMODEL_CMD_GET_PERF_TABLE, diff --git a/kernel/power/em_netlink_autogen.h b/kernel/power/em_netlink_autogen.h index f7e4bddcbd53..5caf2f7e18a5 100644 --- a/kernel/power/em_netlink_autogen.h +++ b/kernel/power/em_netlink_autogen.h @@ -14,6 +14,8 @@ int dev_energymodel_nl_get_perf_domains_doit(struct sk_buff *skb, struct genl_info *info); +int dev_energymodel_nl_get_perf_domains_dumpit(struct sk_buff *skb, + struct netlink_callback *cb); int dev_energymodel_nl_get_perf_table_doit(struct sk_buff *skb, struct genl_info *info); -- cgit v1.2.3 From c279e83953d937470f8a6e69b69f62608714f13f Mon Sep 17 00:00:00 2001 From: Nicolin Chen Date: Mon, 15 Dec 2025 13:42:19 -0800 Subject: iommu: Introduce pci_dev_reset_iommu_prepare/done() PCIe permits a device to ignore ATS invalidation TLPs while processing a reset. This creates a problem visible to the OS where an ATS invalidation command will time out. E.g. an SVA domain will have no coordination with a reset event and can racily issue ATS invalidations to a resetting device. The OS should do something to mitigate this as we do not want production systems to be reporting critical ATS failures, especially in a hypervisor environment. Broadly, OS could arrange to ignore the timeouts, block page table mutations to prevent invalidations, or disable and block ATS. The PCIe r6.0, sec 10.3.1 IMPLEMENTATION NOTE recommends SW to disable and block ATS before initiating a Function Level Reset. It also mentions that other reset methods could have the same vulnerability as well. Provide a callback from the PCI subsystem that will enclose the reset and have the iommu core temporarily change all the attached RID/PASID domains group->blocking_domain so that the IOMMU hardware would fence any incoming ATS queries. And IOMMU drivers should also synchronously stop issuing new ATS invalidations and wait for all ATS invalidations to complete. This can avoid any ATS invaliation timeouts. However, if there is a domain attachment/replacement happening during an ongoing reset, ATS routines may be re-activated between the two function calls. So, introduce a new resetting_domain in the iommu_group structure to reject any concurrent attach_dev/set_dev_pasid call during a reset for a concern of compatibility failure. Since this changes the behavior of an attach operation, update the uAPI accordingly. Note that there are two corner cases: 1. Devices in the same iommu_group Since an attachment is always per iommu_group, this means that any sibling devices in the iommu_group cannot change domain, to prevent race conditions. 2. An SR-IOV PF that is being reset while its VF is not In such case, the VF itself is already broken. So, there is no point in preventing PF from going through the iommu reset. Reviewed-by: Lu Baolu Reviewed-by: Kevin Tian Reviewed-by: Jason Gunthorpe Tested-by: Dheeraj Kumar Srivastava Signed-off-by: Nicolin Chen Signed-off-by: Joerg Roedel --- drivers/iommu/iommu.c | 173 ++++++++++++++++++++++++++++++++++++++++++++++ include/linux/iommu.h | 13 ++++ include/uapi/linux/vfio.h | 4 ++ 3 files changed, 190 insertions(+) (limited to 'include/uapi/linux') diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c index 672597100e9a..0665dedd91b2 100644 --- a/drivers/iommu/iommu.c +++ b/drivers/iommu/iommu.c @@ -61,6 +61,11 @@ struct iommu_group { int id; struct iommu_domain *default_domain; struct iommu_domain *blocking_domain; + /* + * During a group device reset, @resetting_domain points to the physical + * domain, while @domain points to the attached domain before the reset. + */ + struct iommu_domain *resetting_domain; struct iommu_domain *domain; struct list_head entry; unsigned int owner_cnt; @@ -2195,6 +2200,15 @@ int iommu_deferred_attach(struct device *dev, struct iommu_domain *domain) guard(mutex)(&dev->iommu_group->mutex); + /* + * This is a concurrent attach during a device reset. Reject it until + * pci_dev_reset_iommu_done() attaches the device to group->domain. + * + * Note that this might fail the iommu_dma_map(). But there's nothing + * more we can do here. + */ + if (dev->iommu_group->resetting_domain) + return -EBUSY; return __iommu_attach_device(domain, dev, NULL); } @@ -2253,6 +2267,17 @@ struct iommu_domain *iommu_driver_get_domain_for_dev(struct device *dev) lockdep_assert_held(&group->mutex); + /* + * Driver handles the low-level __iommu_attach_device(), including the + * one invoked by pci_dev_reset_iommu_done() re-attaching the device to + * the cached group->domain. In this case, the driver must get the old + * domain from group->resetting_domain rather than group->domain. This + * prevents it from re-attaching the device from group->domain (old) to + * group->domain (new). + */ + if (group->resetting_domain) + return group->resetting_domain; + return group->domain; } EXPORT_SYMBOL_GPL(iommu_driver_get_domain_for_dev); @@ -2409,6 +2434,13 @@ static int __iommu_group_set_domain_internal(struct iommu_group *group, if (WARN_ON(!new_domain)) return -EINVAL; + /* + * This is a concurrent attach during a device reset. Reject it until + * pci_dev_reset_iommu_done() attaches the device to group->domain. + */ + if (group->resetting_domain) + return -EBUSY; + /* * Changing the domain is done by calling attach_dev() on the new * domain. This switch does not have to be atomic and DMA can be @@ -3527,6 +3559,16 @@ int iommu_attach_device_pasid(struct iommu_domain *domain, return -EINVAL; mutex_lock(&group->mutex); + + /* + * This is a concurrent attach during a device reset. Reject it until + * pci_dev_reset_iommu_done() attaches the device to group->domain. + */ + if (group->resetting_domain) { + ret = -EBUSY; + goto out_unlock; + } + for_each_group_device(group, device) { /* * Skip PASID validation for devices without PASID support @@ -3610,6 +3652,16 @@ int iommu_replace_device_pasid(struct iommu_domain *domain, return -EINVAL; mutex_lock(&group->mutex); + + /* + * This is a concurrent attach during a device reset. Reject it until + * pci_dev_reset_iommu_done() attaches the device to group->domain. + */ + if (group->resetting_domain) { + ret = -EBUSY; + goto out_unlock; + } + entry = iommu_make_pasid_array_entry(domain, handle); curr = xa_cmpxchg(&group->pasid_array, pasid, NULL, XA_ZERO_ENTRY, GFP_KERNEL); @@ -3867,6 +3919,127 @@ err_unlock: } EXPORT_SYMBOL_NS_GPL(iommu_replace_group_handle, "IOMMUFD_INTERNAL"); +/** + * pci_dev_reset_iommu_prepare() - Block IOMMU to prepare for a PCI device reset + * @pdev: PCI device that is going to enter a reset routine + * + * The PCIe r6.0, sec 10.3.1 IMPLEMENTATION NOTE recommends to disable and block + * ATS before initiating a reset. This means that a PCIe device during the reset + * routine wants to block any IOMMU activity: translation and ATS invalidation. + * + * This function attaches the device's RID/PASID(s) the group->blocking_domain, + * setting the group->resetting_domain. This allows the IOMMU driver pausing any + * IOMMU activity while leaving the group->domain pointer intact. Later when the + * reset is finished, pci_dev_reset_iommu_done() can restore everything. + * + * Caller must use pci_dev_reset_iommu_prepare() with pci_dev_reset_iommu_done() + * before/after the core-level reset routine, to unset the resetting_domain. + * + * Return: 0 on success or negative error code if the preparation failed. + * + * These two functions are designed to be used by PCI reset functions that would + * not invoke any racy iommu_release_device(), since PCI sysfs node gets removed + * before it notifies with a BUS_NOTIFY_REMOVED_DEVICE. When using them in other + * case, callers must ensure there will be no racy iommu_release_device() call, + * which otherwise would UAF the dev->iommu_group pointer. + */ +int pci_dev_reset_iommu_prepare(struct pci_dev *pdev) +{ + struct iommu_group *group = pdev->dev.iommu_group; + unsigned long pasid; + void *entry; + int ret; + + if (!pci_ats_supported(pdev) || !dev_has_iommu(&pdev->dev)) + return 0; + + guard(mutex)(&group->mutex); + + /* Re-entry is not allowed */ + if (WARN_ON(group->resetting_domain)) + return -EBUSY; + + ret = __iommu_group_alloc_blocking_domain(group); + if (ret) + return ret; + + /* Stage RID domain at blocking_domain while retaining group->domain */ + if (group->domain != group->blocking_domain) { + ret = __iommu_attach_device(group->blocking_domain, &pdev->dev, + group->domain); + if (ret) + return ret; + } + + /* + * Stage PASID domains at blocking_domain while retaining pasid_array. + * + * The pasid_array is mostly fenced by group->mutex, except one reader + * in iommu_attach_handle_get(), so it's safe to read without xa_lock. + */ + xa_for_each_start(&group->pasid_array, pasid, entry, 1) + iommu_remove_dev_pasid(&pdev->dev, pasid, + pasid_array_entry_to_domain(entry)); + + group->resetting_domain = group->blocking_domain; + return ret; +} +EXPORT_SYMBOL_GPL(pci_dev_reset_iommu_prepare); + +/** + * pci_dev_reset_iommu_done() - Restore IOMMU after a PCI device reset is done + * @pdev: PCI device that has finished a reset routine + * + * After a PCIe device finishes a reset routine, it wants to restore its IOMMU + * IOMMU activity, including new translation as well as cache invalidation, by + * re-attaching all RID/PASID of the device's back to the domains retained in + * the core-level structure. + * + * Caller must pair it with a successful pci_dev_reset_iommu_prepare(). + * + * Note that, although unlikely, there is a risk that re-attaching domains might + * fail due to some unexpected happening like OOM. + */ +void pci_dev_reset_iommu_done(struct pci_dev *pdev) +{ + struct iommu_group *group = pdev->dev.iommu_group; + unsigned long pasid; + void *entry; + + if (!pci_ats_supported(pdev) || !dev_has_iommu(&pdev->dev)) + return; + + guard(mutex)(&group->mutex); + + /* pci_dev_reset_iommu_prepare() was bypassed for the device */ + if (!group->resetting_domain) + return; + + /* pci_dev_reset_iommu_prepare() was not successfully called */ + if (WARN_ON(!group->blocking_domain)) + return; + + /* Re-attach RID domain back to group->domain */ + if (group->domain != group->blocking_domain) { + WARN_ON(__iommu_attach_device(group->domain, &pdev->dev, + group->blocking_domain)); + } + + /* + * Re-attach PASID domains back to the domains retained in pasid_array. + * + * The pasid_array is mostly fenced by group->mutex, except one reader + * in iommu_attach_handle_get(), so it's safe to read without xa_lock. + */ + xa_for_each_start(&group->pasid_array, pasid, entry, 1) + WARN_ON(__iommu_set_group_pasid( + pasid_array_entry_to_domain(entry), group, pasid, + group->blocking_domain)); + + group->resetting_domain = NULL; +} +EXPORT_SYMBOL_GPL(pci_dev_reset_iommu_done); + #if IS_ENABLED(CONFIG_IRQ_MSI_IOMMU) /** * iommu_dma_prepare_msi() - Map the MSI page in the IOMMU domain diff --git a/include/linux/iommu.h b/include/linux/iommu.h index ff097df318b9..54b8b48c762e 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -1188,6 +1188,10 @@ void iommu_detach_device_pasid(struct iommu_domain *domain, struct device *dev, ioasid_t pasid); ioasid_t iommu_alloc_global_pasid(struct device *dev); void iommu_free_global_pasid(ioasid_t pasid); + +/* PCI device reset functions */ +int pci_dev_reset_iommu_prepare(struct pci_dev *pdev); +void pci_dev_reset_iommu_done(struct pci_dev *pdev); #else /* CONFIG_IOMMU_API */ struct iommu_ops {}; @@ -1511,6 +1515,15 @@ static inline ioasid_t iommu_alloc_global_pasid(struct device *dev) } static inline void iommu_free_global_pasid(ioasid_t pasid) {} + +static inline int pci_dev_reset_iommu_prepare(struct pci_dev *pdev) +{ + return 0; +} + +static inline void pci_dev_reset_iommu_done(struct pci_dev *pdev) +{ +} #endif /* CONFIG_IOMMU_API */ #ifdef CONFIG_IRQ_MSI_IOMMU diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h index ac2329f24141..bb7b89330d35 100644 --- a/include/uapi/linux/vfio.h +++ b/include/uapi/linux/vfio.h @@ -964,6 +964,10 @@ struct vfio_device_bind_iommufd { * hwpt corresponding to the given pt_id. * * Return: 0 on success, -errno on failure. + * + * When a device is resetting, -EBUSY will be returned to reject any concurrent + * attachment to the resetting device itself or any sibling device in the IOMMU + * group having the resetting device. */ struct vfio_device_attach_iommufd_pt { __u32 argsz; -- cgit v1.2.3 From bc87b14594e30720a5c1546c24e0f5f08d34eb40 Mon Sep 17 00:00:00 2001 From: Michael Chan Date: Thu, 8 Jan 2026 10:35:21 -0800 Subject: bnxt_en: Implement ethtool_ops -> get_link_ext_state() Map the link_down_reason from the FW to the ethtool link_ext_state when it is available. Also log it to the link down dmesg when it is available. Add 2 new link_ext_state enums to the UAPI: ETHTOOL_LINK_EXT_STATE_OTP_SPEED_VIOLATION ETHTOOL_LINK_EXT_STATE_BMC_REQUEST_DOWN to cover OTP (one-time-programmable) speed restrictions and BMC (Baseboard management controller) forcing the link down. Reviewed-by: Andy Gospodarek Reviewed-by: Kalesh AP Reviewed-by: Pavan Chebbi Signed-off-by: Michael Chan Link: https://patch.msgid.link/20260108183521.215610-7-michael.chan@broadcom.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/broadcom/bnxt/bnxt.c | 25 +++++++++++++++- drivers/net/ethernet/broadcom/bnxt/bnxt.h | 1 + drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c | 35 +++++++++++++++++++++++ include/uapi/linux/ethtool.h | 2 ++ 4 files changed, 62 insertions(+), 1 deletion(-) (limited to 'include/uapi/linux') diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c b/drivers/net/ethernet/broadcom/bnxt/bnxt.c index 9902babd82cb..cb78614d4108 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c @@ -11915,6 +11915,26 @@ static char *bnxt_report_fec(struct bnxt_link_info *link_info) } } +static char *bnxt_link_down_reason(struct bnxt_link_info *link_info) +{ + u8 reason = link_info->link_down_reason; + + /* Multiple bits can be set, we report 1 bit only in order of + * priority. + */ + if (reason & PORT_PHY_QCFG_RESP_LINK_DOWN_REASON_RF) + return "(Remote fault)"; + if (reason & PORT_PHY_QCFG_RESP_LINK_DOWN_REASON_OTP_SPEED_VIOLATION) + return "(OTP Speed limit violation)"; + if (reason & PORT_PHY_QCFG_RESP_LINK_DOWN_REASON_CABLE_REMOVED) + return "(Cable removed)"; + if (reason & PORT_PHY_QCFG_RESP_LINK_DOWN_REASON_MODULE_FAULT) + return "(Module fault)"; + if (reason & PORT_PHY_QCFG_RESP_LINK_DOWN_REASON_BMC_REQUEST) + return "(BMC request down)"; + return ""; +} + void bnxt_report_link(struct bnxt *bp) { if (BNXT_LINK_IS_UP(bp)) { @@ -11972,8 +11992,10 @@ void bnxt_report_link(struct bnxt *bp) (fec & BNXT_FEC_AUTONEG) ? "on" : "off", bnxt_report_fec(&bp->link_info)); } else { + char *str = bnxt_link_down_reason(&bp->link_info); + netif_carrier_off(bp->dev); - netdev_err(bp->dev, "NIC Link is Down\n"); + netdev_err(bp->dev, "NIC Link is Down %s\n", str); } } @@ -12173,6 +12195,7 @@ int bnxt_update_link(struct bnxt *bp, bool chng_link_state) link_info->phy_addr = resp->eee_config_phy_addr & PORT_PHY_QCFG_RESP_PHY_ADDR_MASK; link_info->module_status = resp->module_status; + link_info->link_down_reason = resp->link_down_reason; if (bp->phy_flags & BNXT_PHY_FL_EEE_CAP) { struct ethtool_keee *eee = &bp->eee; diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.h b/drivers/net/ethernet/broadcom/bnxt/bnxt.h index 3afd1d5e364a..e441a002ddef 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt.h +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.h @@ -1553,6 +1553,7 @@ struct bnxt_link_info { #define BNXT_LINK_STATE_DOWN 1 #define BNXT_LINK_STATE_UP 2 #define BNXT_LINK_IS_UP(bp) ((bp)->link_info.link_state == BNXT_LINK_STATE_UP) + u8 link_down_reason; u8 active_lanes; u8 duplex; #define BNXT_LINK_DUPLEX_HALF PORT_PHY_QCFG_RESP_DUPLEX_STATE_HALF diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c b/drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c index 4dfae7b61c76..6b15fedbb16f 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c @@ -3432,6 +3432,40 @@ static u32 bnxt_get_link(struct net_device *dev) return BNXT_LINK_IS_UP(bp); } +static int bnxt_get_link_ext_state(struct net_device *dev, + struct ethtool_link_ext_state_info *info) +{ + struct bnxt *bp = netdev_priv(dev); + u8 reason; + + if (BNXT_LINK_IS_UP(bp)) + return -ENODATA; + + reason = bp->link_info.link_down_reason; + if (reason & PORT_PHY_QCFG_RESP_LINK_DOWN_REASON_RF) { + info->link_ext_state = ETHTOOL_LINK_EXT_STATE_LINK_TRAINING_FAILURE; + info->link_training = ETHTOOL_LINK_EXT_SUBSTATE_LT_REMOTE_FAULT; + return 0; + } + if (reason & PORT_PHY_QCFG_RESP_LINK_DOWN_REASON_CABLE_REMOVED) { + info->link_ext_state = ETHTOOL_LINK_EXT_STATE_NO_CABLE; + return 0; + } + if (reason & PORT_PHY_QCFG_RESP_LINK_DOWN_REASON_OTP_SPEED_VIOLATION) { + info->link_ext_state = ETHTOOL_LINK_EXT_STATE_OTP_SPEED_VIOLATION; + return 0; + } + if (reason & PORT_PHY_QCFG_RESP_LINK_DOWN_REASON_MODULE_FAULT) { + info->link_ext_state = ETHTOOL_LINK_EXT_STATE_MODULE; + return 0; + } + if (reason & PORT_PHY_QCFG_RESP_LINK_DOWN_REASON_BMC_REQUEST) { + info->link_ext_state = ETHTOOL_LINK_EXT_STATE_BMC_REQUEST_DOWN; + return 0; + } + return -ENODATA; +} + int bnxt_hwrm_nvm_get_dev_info(struct bnxt *bp, struct hwrm_nvm_get_dev_info_output *nvm_dev_info) { @@ -5711,6 +5745,7 @@ const struct ethtool_ops bnxt_ethtool_ops = { .get_eeprom = bnxt_get_eeprom, .set_eeprom = bnxt_set_eeprom, .get_link = bnxt_get_link, + .get_link_ext_state = bnxt_get_link_ext_state, .get_link_ext_stats = bnxt_get_link_ext_stats, .get_eee = bnxt_get_eee, .set_eee = bnxt_set_eee, diff --git a/include/uapi/linux/ethtool.h b/include/uapi/linux/ethtool.h index eb7ff2602fbb..5daa8f225b67 100644 --- a/include/uapi/linux/ethtool.h +++ b/include/uapi/linux/ethtool.h @@ -603,6 +603,8 @@ enum ethtool_link_ext_state { ETHTOOL_LINK_EXT_STATE_POWER_BUDGET_EXCEEDED, ETHTOOL_LINK_EXT_STATE_OVERHEAT, ETHTOOL_LINK_EXT_STATE_MODULE, + ETHTOOL_LINK_EXT_STATE_OTP_SPEED_VIOLATION, + ETHTOOL_LINK_EXT_STATE_BMC_REQUEST_DOWN, }; /* More information in addition to ETHTOOL_LINK_EXT_STATE_AUTONEG. */ -- cgit v1.2.3 From 576ee5dfd459abe8e29bee8b204cd259e60b4e18 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Mon, 12 Jan 2026 16:47:10 +0100 Subject: fs: add immutable rootfs Currently pivot_root() doesn't work on the real rootfs because it cannot be unmounted. Userspace has to do a recursive removal of the initramfs contents manually before continuing the boot. Really all we want from the real rootfs is to serve as the parent mount for anything that is actually useful such as the tmpfs or ramfs for initramfs unpacking or the rootfs itself. There's no need for the real rootfs to actually be anything meaningful or useful. Add a immutable rootfs called "nullfs" that can be selected via the "nullfs_rootfs" kernel command line option. The kernel will mount a tmpfs/ramfs on top of it, unpack the initramfs and fire up userspace which mounts the rootfs and can then just do: chdir(rootfs); pivot_root(".", "."); umount2(".", MNT_DETACH); and be done with it. (Ofc, userspace can also choose to retain the initramfs contents by using something like pivot_root(".", "/initramfs") without unmounting it.) Technically this also means that the rootfs mount in unprivileged namespaces doesn't need to become MNT_LOCKED anymore as it's guaranteed that the immutable rootfs remains permanently empty so there cannot be anything revealed by unmounting the covering mount. In the future this will also allow us to create completely empty mount namespaces without risking to leak anything. systemd already handles this all correctly as it tries to pivot_root() first and falls back to MS_MOVE only when that fails. This goes back to various discussion in previous years and a LPC 2024 presentation about this very topic. Link: https://patch.msgid.link/20260112-work-immutable-rootfs-v2-3-88dd1c34a204@kernel.org Signed-off-by: Christian Brauner --- fs/Makefile | 2 +- fs/mount.h | 1 + fs/namespace.c | 82 +++++++++++++++++++++++++++++++++++++++------- fs/nullfs.c | 70 +++++++++++++++++++++++++++++++++++++++ include/uapi/linux/magic.h | 1 + init/do_mounts.c | 14 ++++++++ init/do_mounts.h | 1 + 7 files changed, 159 insertions(+), 12 deletions(-) create mode 100644 fs/nullfs.c (limited to 'include/uapi/linux') diff --git a/fs/Makefile b/fs/Makefile index a04274a3c854..becf133e4791 100644 --- a/fs/Makefile +++ b/fs/Makefile @@ -16,7 +16,7 @@ obj-y := open.o read_write.o file_table.o super.o \ stack.o fs_struct.o statfs.o fs_pin.o nsfs.o \ fs_dirent.o fs_context.o fs_parser.o fsopen.o init.o \ kernel_read_file.o mnt_idmapping.o remap_range.o pidfs.o \ - file_attr.o + file_attr.o nullfs.o obj-$(CONFIG_BUFFER_HEAD) += buffer.o mpage.o obj-$(CONFIG_PROC_FS) += proc_namespace.o diff --git a/fs/mount.h b/fs/mount.h index 2d28ef2a3aed..e0816c11a198 100644 --- a/fs/mount.h +++ b/fs/mount.h @@ -5,6 +5,7 @@ #include #include +extern struct file_system_type nullfs_fs_type; extern struct list_head notify_list; struct mnt_namespace { diff --git a/fs/namespace.c b/fs/namespace.c index 9261f56ccc81..a44ebb2f1161 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -75,6 +75,17 @@ static int __init initramfs_options_setup(char *str) __setup("initramfs_options=", initramfs_options_setup); +bool nullfs_rootfs = false; + +static int __init nullfs_rootfs_setup(char *str) +{ + if (*str) + return 0; + nullfs_rootfs = true; + return 1; +} +__setup("nullfs_rootfs", nullfs_rootfs_setup); + static u64 event; static DEFINE_XARRAY_FLAGS(mnt_id_xa, XA_FLAGS_ALLOC); static DEFINE_IDA(mnt_group_ida); @@ -4582,8 +4593,9 @@ int path_pivot_root(struct path *new, struct path *old) * pointed to by put_old must yield the same directory as new_root. No other * file system may be mounted on put_old. After all, new_root is a mountpoint. * - * Also, the current root cannot be on the 'rootfs' (initial ramfs) filesystem. - * See Documentation/filesystems/ramfs-rootfs-initramfs.rst for alternatives + * Also, the current root cannot be on the 'rootfs' (initial ramfs) filesystem + * unless the kernel was booted with "nullfs_rootfs". See + * Documentation/filesystems/ramfs-rootfs-initramfs.rst for alternatives * in this situation. * * Notes: @@ -5976,24 +5988,72 @@ struct mnt_namespace init_mnt_ns = { static void __init init_mount_tree(void) { - struct vfsmount *mnt; - struct mount *m; + struct vfsmount *mnt, *nullfs_mnt; + struct mount *mnt_root; struct path root; + /* + * When nullfs is used, we create two mounts: + * + * (1) nullfs with mount id 1 + * (2) mutable rootfs with mount id 2 + * + * with (2) mounted on top of (1). + */ + if (nullfs_rootfs) { + nullfs_mnt = vfs_kern_mount(&nullfs_fs_type, 0, "nullfs", NULL); + if (IS_ERR(nullfs_mnt)) + panic("VFS: Failed to create nullfs"); + } + mnt = vfs_kern_mount(&rootfs_fs_type, 0, "rootfs", initramfs_options); if (IS_ERR(mnt)) panic("Can't create rootfs"); - m = real_mount(mnt); - init_mnt_ns.root = m; - init_mnt_ns.nr_mounts = 1; - mnt_add_to_ns(&init_mnt_ns, m); + if (nullfs_rootfs) { + VFS_WARN_ON_ONCE(real_mount(nullfs_mnt)->mnt_id != 1); + VFS_WARN_ON_ONCE(real_mount(mnt)->mnt_id != 2); + + /* The namespace root is the nullfs mnt. */ + mnt_root = real_mount(nullfs_mnt); + init_mnt_ns.root = mnt_root; + + /* Mount mutable rootfs on top of nullfs. */ + root.mnt = nullfs_mnt; + root.dentry = nullfs_mnt->mnt_root; + + LOCK_MOUNT_EXACT(mp, &root); + if (unlikely(IS_ERR(mp.parent))) + panic("VFS: Failed to mount rootfs on nullfs"); + scoped_guard(mount_writer) + attach_mnt(real_mount(mnt), mp.parent, mp.mp); + + pr_info("VFS: Finished mounting rootfs on nullfs\n"); + } else { + VFS_WARN_ON_ONCE(real_mount(mnt)->mnt_id != 1); + + /* The namespace root is the mutable rootfs. */ + mnt_root = real_mount(mnt); + init_mnt_ns.root = mnt_root; + } + + /* + * We've dropped all locks here but that's fine. Not just are we + * the only task that's running, there's no other mount + * namespace in existence and the initial mount namespace is + * completely empty until we add the mounts we just created. + */ + for (struct mount *p = mnt_root; p; p = next_mnt(p, mnt_root)) { + mnt_add_to_ns(&init_mnt_ns, p); + init_mnt_ns.nr_mounts++; + } + init_task.nsproxy->mnt_ns = &init_mnt_ns; get_mnt_ns(&init_mnt_ns); - root.mnt = mnt; - root.dentry = mnt->mnt_root; - + /* The root and pwd always point to the mutable rootfs. */ + root.mnt = mnt; + root.dentry = mnt->mnt_root; set_fs_pwd(current->fs, &root); set_fs_root(current->fs, &root); diff --git a/fs/nullfs.c b/fs/nullfs.c new file mode 100644 index 000000000000..fdbd3e5d3d71 --- /dev/null +++ b/fs/nullfs.c @@ -0,0 +1,70 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* Copyright (c) 2026 Christian Brauner */ +#include +#include +#include + +static const struct super_operations nullfs_super_operations = { + .statfs = simple_statfs, +}; + +static int nullfs_fs_fill_super(struct super_block *s, struct fs_context *fc) +{ + struct inode *inode; + + s->s_maxbytes = MAX_LFS_FILESIZE; + s->s_blocksize = PAGE_SIZE; + s->s_blocksize_bits = PAGE_SHIFT; + s->s_magic = NULL_FS_MAGIC; + s->s_op = &nullfs_super_operations; + s->s_export_op = NULL; + s->s_xattr = NULL; + s->s_time_gran = 1; + s->s_d_flags = 0; + + inode = new_inode(s); + if (!inode) + return -ENOMEM; + + /* nullfs is permanently empty... */ + make_empty_dir_inode(inode); + simple_inode_init_ts(inode); + inode->i_ino = 1; + /* ... and immutable. */ + inode->i_flags |= S_IMMUTABLE; + + s->s_root = d_make_root(inode); + if (!s->s_root) + return -ENOMEM; + + return 0; +} + +/* + * For now this is a single global instance. If needed we can make it + * mountable by userspace at which point we will need to make it + * multi-instance. + */ +static int nullfs_fs_get_tree(struct fs_context *fc) +{ + return get_tree_single(fc, nullfs_fs_fill_super); +} + +static const struct fs_context_operations nullfs_fs_context_ops = { + .get_tree = nullfs_fs_get_tree, +}; + +static int nullfs_init_fs_context(struct fs_context *fc) +{ + fc->ops = &nullfs_fs_context_ops; + fc->global = true; + fc->sb_flags = SB_NOUSER; + fc->s_iflags = SB_I_NOEXEC | SB_I_NODEV; + return 0; +} + +struct file_system_type nullfs_fs_type = { + .name = "nullfs", + .init_fs_context = nullfs_init_fs_context, + .kill_sb = kill_anon_super, +}; diff --git a/include/uapi/linux/magic.h b/include/uapi/linux/magic.h index 638ca21b7a90..4f2da935a76c 100644 --- a/include/uapi/linux/magic.h +++ b/include/uapi/linux/magic.h @@ -104,5 +104,6 @@ #define SECRETMEM_MAGIC 0x5345434d /* "SECM" */ #define PID_FS_MAGIC 0x50494446 /* "PIDF" */ #define GUEST_MEMFD_MAGIC 0x474d454d /* "GMEM" */ +#define NULL_FS_MAGIC 0x4E554C4C /* "NULL" */ #endif /* __LINUX_MAGIC_H__ */ diff --git a/init/do_mounts.c b/init/do_mounts.c index defbbf1d55f7..675397c8a7a4 100644 --- a/init/do_mounts.c +++ b/init/do_mounts.c @@ -492,6 +492,20 @@ void __init prepare_namespace(void) mount_root(saved_root_name); out: devtmpfs_mount(); + + if (nullfs_rootfs) { + if (init_pivot_root(".", ".")) { + pr_err("VFS: Failed to pivot into new rootfs\n"); + return; + } + if (init_umount(".", MNT_DETACH)) { + pr_err("VFS: Failed to unmount old rootfs\n"); + return; + } + pr_info("VFS: Pivoted into new rootfs\n"); + return; + } + init_mount(".", "/", NULL, MS_MOVE, NULL); init_chroot("."); } diff --git a/init/do_mounts.h b/init/do_mounts.h index 6069ea3eb80d..fbfee810aa89 100644 --- a/init/do_mounts.h +++ b/init/do_mounts.h @@ -15,6 +15,7 @@ void mount_root_generic(char *name, char *pretty_name, int flags); void mount_root(char *root_device_name); extern int root_mountflags; +extern bool nullfs_rootfs; static inline __init int create_dev(char *name, dev_t dev) { -- cgit v1.2.3 From 6abbb8703aeeb645a681ab6ad155e0b450413787 Mon Sep 17 00:00:00 2001 From: Günther Noack Date: Sun, 11 Jan 2026 18:52:04 +0100 Subject: landlock: Clarify documentation for the IOCTL access right MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Move the description of the LANDLOCK_ACCESS_FS_IOCTL_DEV access right together with the file access rights. This group of access rights applies to files (in this case device files), and they can be added to file or directory inodes using landlock_add_rule(2). The check for that works the same for all file access rights, including LANDLOCK_ACCESS_FS_IOCTL_DEV. Invoking ioctl(2) on directory FDs can not currently be restricted with Landlock. Having it grouped separately in the documentation is a remnant from earlier revisions of the LANDLOCK_ACCESS_FS_IOCTL_DEV patch set. Link: https://lore.kernel.org/all/20260108.Thaex5ruach2@digikod.net/ Signed-off-by: Günther Noack Link: https://lore.kernel.org/r/20260111175203.6545-2-gnoack3000@gmail.com Signed-off-by: Mickaël Salaün --- include/uapi/linux/landlock.h | 37 +++++++++++++++++-------------------- 1 file changed, 17 insertions(+), 20 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/landlock.h b/include/uapi/linux/landlock.h index f030adc462ee..75fd7f5e6cc3 100644 --- a/include/uapi/linux/landlock.h +++ b/include/uapi/linux/landlock.h @@ -216,6 +216,23 @@ struct landlock_net_port_attr { * :manpage:`ftruncate(2)`, :manpage:`creat(2)`, or :manpage:`open(2)` with * ``O_TRUNC``. This access right is available since the third version of the * Landlock ABI. + * - %LANDLOCK_ACCESS_FS_IOCTL_DEV: Invoke :manpage:`ioctl(2)` commands on an opened + * character or block device. + * + * This access right applies to all `ioctl(2)` commands implemented by device + * drivers. However, the following common IOCTL commands continue to be + * invokable independent of the %LANDLOCK_ACCESS_FS_IOCTL_DEV right: + * + * * IOCTL commands targeting file descriptors (``FIOCLEX``, ``FIONCLEX``), + * * IOCTL commands targeting file descriptions (``FIONBIO``, ``FIOASYNC``), + * * IOCTL commands targeting file systems (``FIFREEZE``, ``FITHAW``, + * ``FIGETBSZ``, ``FS_IOC_GETFSUUID``, ``FS_IOC_GETFSSYSFSPATH``) + * * Some IOCTL commands which do not make sense when used with devices, but + * whose implementations are safe and return the right error codes + * (``FS_IOC_FIEMAP``, ``FICLONE``, ``FICLONERANGE``, ``FIDEDUPERANGE``) + * + * This access right is available since the fifth version of the Landlock + * ABI. * * Whether an opened file can be truncated with :manpage:`ftruncate(2)` or used * with `ioctl(2)` is determined during :manpage:`open(2)`, in the same way as @@ -275,26 +292,6 @@ struct landlock_net_port_attr { * If multiple requirements are not met, the ``EACCES`` error code takes * precedence over ``EXDEV``. * - * The following access right applies both to files and directories: - * - * - %LANDLOCK_ACCESS_FS_IOCTL_DEV: Invoke :manpage:`ioctl(2)` commands on an opened - * character or block device. - * - * This access right applies to all `ioctl(2)` commands implemented by device - * drivers. However, the following common IOCTL commands continue to be - * invokable independent of the %LANDLOCK_ACCESS_FS_IOCTL_DEV right: - * - * * IOCTL commands targeting file descriptors (``FIOCLEX``, ``FIONCLEX``), - * * IOCTL commands targeting file descriptions (``FIONBIO``, ``FIOASYNC``), - * * IOCTL commands targeting file systems (``FIFREEZE``, ``FITHAW``, - * ``FIGETBSZ``, ``FS_IOC_GETFSUUID``, ``FS_IOC_GETFSSYSFSPATH``) - * * Some IOCTL commands which do not make sense when used with devices, but - * whose implementations are safe and return the right error codes - * (``FS_IOC_FIEMAP``, ``FICLONE``, ``FICLONERANGE``, ``FIDEDUPERANGE``) - * - * This access right is available since the fifth version of the Landlock - * ABI. - * * .. warning:: * * It is currently not possible to restrict some file-related actions -- cgit v1.2.3 From 98bf2256855eb682433a33e6a7c4bce35191ca99 Mon Sep 17 00:00:00 2001 From: Stanley Zhang Date: Thu, 8 Jan 2026 02:19:31 -0700 Subject: ublk: support UBLK_PARAM_TYPE_INTEGRITY in device creation Add a feature flag UBLK_F_INTEGRITY for a ublk server to request integrity/metadata support when creating a ublk device. The ublk server can also check for the feature flag on the created device or the result of UBLK_U_CMD_GET_FEATURES to tell if the ublk driver supports it. UBLK_F_INTEGRITY requires UBLK_F_USER_COPY, as user copy is the only data copy mode initially supported for integrity data. Add UBLK_PARAM_TYPE_INTEGRITY and struct ublk_param_integrity to struct ublk_params to specify the integrity params of a ublk device. UBLK_PARAM_TYPE_INTEGRITY requires UBLK_F_INTEGRITY and a nonzero metadata_size. The LBMD_PI_CAP_* and LBMD_PI_CSUM_* values from the linux/fs.h UAPI header are used for the flags and csum_type fields. If the UBLK_PARAM_TYPE_INTEGRITY flag is set, validate the integrity parameters and apply them to the blk_integrity limits. The struct ublk_param_integrity validations are based on the checks in blk_validate_integrity_limits(). Any invalid parameters should be rejected before being applied to struct blk_integrity. [csander: drop redundant pi_tuple_size field, use block metadata UAPI constants, add param validation] Signed-off-by: Stanley Zhang Signed-off-by: Caleb Sander Mateos Reviewed-by: Ming Lei Signed-off-by: Jens Axboe --- drivers/block/ublk_drv.c | 101 +++++++++++++++++++++++++++++++++++++++++- include/uapi/linux/ublk_cmd.h | 19 ++++++++ 2 files changed, 119 insertions(+), 1 deletion(-) (limited to 'include/uapi/linux') diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c index 53df4bfa2c92..a4d62e8e4f6b 100644 --- a/drivers/block/ublk_drv.c +++ b/drivers/block/ublk_drv.c @@ -44,6 +44,8 @@ #include #include #include +#include +#include #include #define UBLK_MINORS (1U << MINORBITS) @@ -83,7 +85,8 @@ #define UBLK_PARAM_TYPE_ALL \ (UBLK_PARAM_TYPE_BASIC | UBLK_PARAM_TYPE_DISCARD | \ UBLK_PARAM_TYPE_DEVT | UBLK_PARAM_TYPE_ZONED | \ - UBLK_PARAM_TYPE_DMA_ALIGN | UBLK_PARAM_TYPE_SEGMENT) + UBLK_PARAM_TYPE_DMA_ALIGN | UBLK_PARAM_TYPE_SEGMENT | \ + UBLK_PARAM_TYPE_INTEGRITY) struct ublk_uring_cmd_pdu { /* @@ -301,6 +304,11 @@ static inline bool ublk_queue_is_zoned(const struct ublk_queue *ubq) return ubq->flags & UBLK_F_ZONED; } +static inline bool ublk_dev_support_integrity(const struct ublk_device *ub) +{ + return ub->dev_info.flags & UBLK_F_INTEGRITY; +} + #ifdef CONFIG_BLK_DEV_ZONED struct ublk_zoned_report_desc { @@ -616,6 +624,53 @@ static void ublk_dev_param_basic_apply(struct ublk_device *ub) set_capacity(ub->ub_disk, p->dev_sectors); } +static int ublk_integrity_flags(u32 flags) +{ + int ret_flags = 0; + + if (flags & LBMD_PI_CAP_INTEGRITY) { + flags &= ~LBMD_PI_CAP_INTEGRITY; + ret_flags |= BLK_INTEGRITY_DEVICE_CAPABLE; + } + if (flags & LBMD_PI_CAP_REFTAG) { + flags &= ~LBMD_PI_CAP_REFTAG; + ret_flags |= BLK_INTEGRITY_REF_TAG; + } + return flags ? -EINVAL : ret_flags; +} + +static int ublk_integrity_pi_tuple_size(u8 csum_type) +{ + switch (csum_type) { + case LBMD_PI_CSUM_NONE: + return 0; + case LBMD_PI_CSUM_IP: + case LBMD_PI_CSUM_CRC16_T10DIF: + return 8; + case LBMD_PI_CSUM_CRC64_NVME: + return 16; + default: + return -EINVAL; + } +} + +static enum blk_integrity_checksum ublk_integrity_csum_type(u8 csum_type) +{ + switch (csum_type) { + case LBMD_PI_CSUM_NONE: + return BLK_INTEGRITY_CSUM_NONE; + case LBMD_PI_CSUM_IP: + return BLK_INTEGRITY_CSUM_IP; + case LBMD_PI_CSUM_CRC16_T10DIF: + return BLK_INTEGRITY_CSUM_CRC; + case LBMD_PI_CSUM_CRC64_NVME: + return BLK_INTEGRITY_CSUM_CRC64; + default: + WARN_ON_ONCE(1); + return BLK_INTEGRITY_CSUM_NONE; + } +} + static int ublk_validate_params(const struct ublk_device *ub) { /* basic param is the only one which must be set */ @@ -678,6 +733,29 @@ static int ublk_validate_params(const struct ublk_device *ub) return -EINVAL; } + if (ub->params.types & UBLK_PARAM_TYPE_INTEGRITY) { + const struct ublk_param_integrity *p = &ub->params.integrity; + int pi_tuple_size = ublk_integrity_pi_tuple_size(p->csum_type); + int flags = ublk_integrity_flags(p->flags); + + if (!ublk_dev_support_integrity(ub)) + return -EINVAL; + if (flags < 0) + return flags; + if (pi_tuple_size < 0) + return pi_tuple_size; + if (!p->metadata_size) + return -EINVAL; + if (p->csum_type == LBMD_PI_CSUM_NONE && + p->flags & LBMD_PI_CAP_REFTAG) + return -EINVAL; + if (p->pi_offset + pi_tuple_size > p->metadata_size) + return -EINVAL; + if (p->interval_exp < SECTOR_SHIFT || + p->interval_exp > ub->params.basic.logical_bs_shift) + return -EINVAL; + } + return 0; } @@ -2950,6 +3028,23 @@ static int ublk_ctrl_start_dev(struct ublk_device *ub, lim.max_segments = ub->params.seg.max_segments; } + if (ub->params.types & UBLK_PARAM_TYPE_INTEGRITY) { + const struct ublk_param_integrity *p = &ub->params.integrity; + int pi_tuple_size = ublk_integrity_pi_tuple_size(p->csum_type); + + lim.max_integrity_segments = + p->max_integrity_segments ?: USHRT_MAX; + lim.integrity = (struct blk_integrity) { + .flags = ublk_integrity_flags(p->flags), + .csum_type = ublk_integrity_csum_type(p->csum_type), + .metadata_size = p->metadata_size, + .pi_offset = p->pi_offset, + .interval_exp = p->interval_exp, + .tag_size = p->tag_size, + .pi_tuple_size = pi_tuple_size, + }; + } + if (wait_for_completion_interruptible(&ub->completion) != 0) return -EINTR; @@ -3140,6 +3235,10 @@ static int ublk_ctrl_add_dev(const struct ublksrv_ctrl_cmd *header) return -EINVAL; } + /* User copy is required to access integrity buffer */ + if (info.flags & UBLK_F_INTEGRITY && !(info.flags & UBLK_F_USER_COPY)) + return -EINVAL; + /* the created device is always owned by current user */ ublk_store_owner_uid_gid(&info.owner_uid, &info.owner_gid); diff --git a/include/uapi/linux/ublk_cmd.h b/include/uapi/linux/ublk_cmd.h index ec77dabba45b..4c141d7e4710 100644 --- a/include/uapi/linux/ublk_cmd.h +++ b/include/uapi/linux/ublk_cmd.h @@ -311,6 +311,12 @@ */ #define UBLK_F_BUF_REG_OFF_DAEMON (1ULL << 14) +/* + * ublk device supports requests with integrity/metadata buffer. + * Requires UBLK_F_USER_COPY. + */ +#define UBLK_F_INTEGRITY (1ULL << 16) + /* device state */ #define UBLK_S_DEV_DEAD 0 #define UBLK_S_DEV_LIVE 1 @@ -600,6 +606,17 @@ struct ublk_param_segment { __u8 pad[2]; }; +struct ublk_param_integrity { + __u32 flags; /* LBMD_PI_CAP_* from linux/fs.h */ + __u16 max_integrity_segments; /* 0 means no limit */ + __u8 interval_exp; + __u8 metadata_size; /* UBLK_PARAM_TYPE_INTEGRITY requires nonzero */ + __u8 pi_offset; + __u8 csum_type; /* LBMD_PI_CSUM_* from linux/fs.h */ + __u8 tag_size; + __u8 pad[5]; +}; + struct ublk_params { /* * Total length of parameters, userspace has to set 'len' for both @@ -614,6 +631,7 @@ struct ublk_params { #define UBLK_PARAM_TYPE_ZONED (1 << 3) #define UBLK_PARAM_TYPE_DMA_ALIGN (1 << 4) #define UBLK_PARAM_TYPE_SEGMENT (1 << 5) +#define UBLK_PARAM_TYPE_INTEGRITY (1 << 6) /* requires UBLK_F_INTEGRITY */ __u32 types; /* types of parameter included */ struct ublk_param_basic basic; @@ -622,6 +640,7 @@ struct ublk_params { struct ublk_param_zoned zoned; struct ublk_param_dma_align dma; struct ublk_param_segment seg; + struct ublk_param_integrity integrity; }; #endif -- cgit v1.2.3 From f82f0a16a8270b17211254beeb123d11a0f279cd Mon Sep 17 00:00:00 2001 From: Caleb Sander Mateos Date: Thu, 8 Jan 2026 02:19:32 -0700 Subject: ublk: set UBLK_IO_F_INTEGRITY in ublksrv_io_desc Indicate to the ublk server when an incoming request has integrity data by setting UBLK_IO_F_INTEGRITY in the ublksrv_io_desc's op_flags field. Signed-off-by: Caleb Sander Mateos Reviewed-by: Ming Lei Signed-off-by: Jens Axboe --- drivers/block/ublk_drv.c | 3 +++ include/uapi/linux/ublk_cmd.h | 2 ++ 2 files changed, 5 insertions(+) (limited to 'include/uapi/linux') diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c index a4d62e8e4f6b..fc7de2985a20 100644 --- a/drivers/block/ublk_drv.c +++ b/drivers/block/ublk_drv.c @@ -1113,6 +1113,9 @@ static inline unsigned int ublk_req_build_flags(struct request *req) if (req->cmd_flags & REQ_SWAP) flags |= UBLK_IO_F_SWAP; + if (blk_integrity_rq(req)) + flags |= UBLK_IO_F_INTEGRITY; + return flags; } diff --git a/include/uapi/linux/ublk_cmd.h b/include/uapi/linux/ublk_cmd.h index 4c141d7e4710..dfde4aee39eb 100644 --- a/include/uapi/linux/ublk_cmd.h +++ b/include/uapi/linux/ublk_cmd.h @@ -414,6 +414,8 @@ struct ublksrv_ctrl_dev_info { * passed in. */ #define UBLK_IO_F_NEED_REG_BUF (1U << 17) +/* Request has an integrity data buffer */ +#define UBLK_IO_F_INTEGRITY (1UL << 18) /* * io cmd is described by this structure, and stored in share memory, indexed -- cgit v1.2.3 From be82a89066d595da334f6e153ababcedc3f92ad6 Mon Sep 17 00:00:00 2001 From: Stanley Zhang Date: Thu, 8 Jan 2026 02:19:37 -0700 Subject: ublk: implement integrity user copy Add a function ublk_copy_user_integrity() to copy integrity information between a request and a user iov_iter. This mirrors the existing ublk_copy_user_pages() but operates on request integrity data instead of regular data. Check UBLKSRV_IO_INTEGRITY_FLAG in iocb->ki_pos in ublk_user_copy() to choose between copying data or integrity data. [csander: change offset units from data bytes to integrity data bytes, fix CONFIG_BLK_DEV_INTEGRITY=n build, rebase on user copy refactor] Signed-off-by: Stanley Zhang Signed-off-by: Caleb Sander Mateos Reviewed-by: Ming Lei Signed-off-by: Jens Axboe --- drivers/block/ublk_drv.c | 53 +++++++++++++++++++++++++++++++++++++++++-- include/uapi/linux/ublk_cmd.h | 4 ++++ 2 files changed, 55 insertions(+), 2 deletions(-) (limited to 'include/uapi/linux') diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c index d428a25121db..5c441f507c43 100644 --- a/drivers/block/ublk_drv.c +++ b/drivers/block/ublk_drv.c @@ -1040,6 +1040,33 @@ static size_t ublk_copy_user_pages(const struct request *req, return done; } +#ifdef CONFIG_BLK_DEV_INTEGRITY +static size_t ublk_copy_user_integrity(const struct request *req, + unsigned offset, struct iov_iter *uiter, int dir) +{ + size_t done = 0; + struct bio *bio = req->bio; + struct bvec_iter iter; + struct bio_vec iv; + + if (!blk_integrity_rq(req)) + return 0; + + bio_for_each_integrity_vec(iv, bio, iter) { + if (!ublk_copy_user_bvec(&iv, &offset, uiter, dir, &done)) + break; + } + + return done; +} +#else /* #ifdef CONFIG_BLK_DEV_INTEGRITY */ +static size_t ublk_copy_user_integrity(const struct request *req, + unsigned offset, struct iov_iter *uiter, int dir) +{ + return 0; +} +#endif /* #ifdef CONFIG_BLK_DEV_INTEGRITY */ + static inline bool ublk_need_map_req(const struct request *req) { return ublk_rq_has_data(req) && req_op(req) == REQ_OP_WRITE; @@ -2668,6 +2695,8 @@ ublk_user_copy(struct kiocb *iocb, struct iov_iter *iter, int dir) struct ublk_queue *ubq; struct request *req; struct ublk_io *io; + unsigned data_len; + bool is_integrity; size_t buf_off; u16 tag, q_id; ssize_t ret; @@ -2681,6 +2710,10 @@ ublk_user_copy(struct kiocb *iocb, struct iov_iter *iter, int dir) tag = ublk_pos_to_tag(iocb->ki_pos); q_id = ublk_pos_to_hwq(iocb->ki_pos); buf_off = ublk_pos_to_buf_off(iocb->ki_pos); + is_integrity = !!(iocb->ki_pos & UBLKSRV_IO_INTEGRITY_FLAG); + + if (unlikely(!ublk_dev_support_integrity(ub) && is_integrity)) + return -EINVAL; if (q_id >= ub->dev_info.nr_hw_queues) return -EINVAL; @@ -2697,7 +2730,14 @@ ublk_user_copy(struct kiocb *iocb, struct iov_iter *iter, int dir) if (!req) return -EINVAL; - if (buf_off > blk_rq_bytes(req)) { + if (is_integrity) { + struct blk_integrity *bi = &req->q->limits.integrity; + + data_len = bio_integrity_bytes(bi, blk_rq_sectors(req)); + } else { + data_len = blk_rq_bytes(req); + } + if (buf_off > data_len) { ret = -EINVAL; goto out; } @@ -2707,7 +2747,10 @@ ublk_user_copy(struct kiocb *iocb, struct iov_iter *iter, int dir) goto out; } - ret = ublk_copy_user_pages(req, buf_off, iter, dir); + if (is_integrity) + ret = ublk_copy_user_integrity(req, buf_off, iter, dir); + else + ret = ublk_copy_user_pages(req, buf_off, iter, dir); out: ublk_put_req_ref(io, req); @@ -3948,6 +3991,12 @@ static int __init ublk_init(void) BUILD_BUG_ON((u64)UBLKSRV_IO_BUF_OFFSET + UBLKSRV_IO_BUF_TOTAL_SIZE < UBLKSRV_IO_BUF_OFFSET); + /* + * Ensure UBLKSRV_IO_BUF_OFFSET + UBLKSRV_IO_BUF_TOTAL_SIZE + * doesn't overflow into UBLKSRV_IO_INTEGRITY_FLAG + */ + BUILD_BUG_ON(UBLKSRV_IO_BUF_OFFSET + UBLKSRV_IO_BUF_TOTAL_SIZE >= + UBLKSRV_IO_INTEGRITY_FLAG); BUILD_BUG_ON(sizeof(struct ublk_auto_buf_reg) != 8); init_waitqueue_head(&ublk_idr_wq); diff --git a/include/uapi/linux/ublk_cmd.h b/include/uapi/linux/ublk_cmd.h index dfde4aee39eb..61ac5d8e1078 100644 --- a/include/uapi/linux/ublk_cmd.h +++ b/include/uapi/linux/ublk_cmd.h @@ -134,6 +134,10 @@ #define UBLKSRV_IO_BUF_TOTAL_BITS (UBLK_QID_OFF + UBLK_QID_BITS) #define UBLKSRV_IO_BUF_TOTAL_SIZE (1ULL << UBLKSRV_IO_BUF_TOTAL_BITS) +/* Copy to/from request integrity buffer instead of data buffer */ +#define UBLK_INTEGRITY_FLAG_OFF 62 +#define UBLKSRV_IO_INTEGRITY_FLAG (1ULL << UBLK_INTEGRITY_FLAG_OFF) + /* * ublk server can register data buffers for incoming I/O requests with a sparse * io_uring buffer table. The request buffer can then be used as the data buffer -- cgit v1.2.3 From e6ce36ccc86f6d447808a6e620f56d440d74aa19 Mon Sep 17 00:00:00 2001 From: Askar Safin Date: Wed, 19 Nov 2025 22:24:07 +0000 Subject: init: remove /proc/sys/kernel/real-root-dev It is not used anymore. Signed-off-by: Askar Safin Link: https://patch.msgid.link/20251119222407.3333257-4-safinaskar@gmail.com Reviewed-by: Christoph Hellwig Signed-off-by: Christian Brauner --- Documentation/admin-guide/sysctl/kernel.rst | 6 ------ include/uapi/linux/sysctl.h | 1 - init/do_mounts_initrd.c | 20 -------------------- 3 files changed, 27 deletions(-) (limited to 'include/uapi/linux') diff --git a/Documentation/admin-guide/sysctl/kernel.rst b/Documentation/admin-guide/sysctl/kernel.rst index 239da22c4e28..bb577fac76a0 100644 --- a/Documentation/admin-guide/sysctl/kernel.rst +++ b/Documentation/admin-guide/sysctl/kernel.rst @@ -1235,12 +1235,6 @@ that support this feature. == =========================================================================== -real-root-dev -============= - -See Documentation/admin-guide/initrd.rst. - - reboot-cmd (SPARC only) ======================= diff --git a/include/uapi/linux/sysctl.h b/include/uapi/linux/sysctl.h index 63d1464cb71c..1c7fe0f4dca4 100644 --- a/include/uapi/linux/sysctl.h +++ b/include/uapi/linux/sysctl.h @@ -92,7 +92,6 @@ enum KERN_DOMAINNAME=8, /* string: domainname */ KERN_PANIC=15, /* int: panic timeout */ - KERN_REALROOTDEV=16, /* real root device to mount after initrd */ KERN_SPARC_REBOOT=21, /* reboot command on Sparc */ KERN_CTLALTDEL=22, /* int: allow ctl-alt-del to reboot */ diff --git a/init/do_mounts_initrd.c b/init/do_mounts_initrd.c index fe335dbc95e0..892e69ab41c4 100644 --- a/init/do_mounts_initrd.c +++ b/init/do_mounts_initrd.c @@ -8,31 +8,11 @@ unsigned long initrd_start, initrd_end; int initrd_below_start_ok; -static unsigned int real_root_dev; /* do_proc_dointvec cannot handle kdev_t */ static int __initdata mount_initrd = 1; phys_addr_t phys_initrd_start __initdata; unsigned long phys_initrd_size __initdata; -#ifdef CONFIG_SYSCTL -static const struct ctl_table kern_do_mounts_initrd_table[] = { - { - .procname = "real-root-dev", - .data = &real_root_dev, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec, - }, -}; - -static __init int kernel_do_mounts_initrd_sysctls_init(void) -{ - register_sysctl_init("kernel", kern_do_mounts_initrd_table); - return 0; -} -late_initcall(kernel_do_mounts_initrd_sysctls_init); -#endif /* CONFIG_SYSCTL */ - static int __init no_initrd(char *str) { pr_warn("noinitrd option is deprecated and will be removed soon\n"); -- cgit v1.2.3 From e1cbdf78f60c35a1a320ca401852fd6a73624a4a Mon Sep 17 00:00:00 2001 From: Lachlan Hodges Date: Fri, 9 Jan 2026 19:14:39 +1100 Subject: wifi: cfg80211: include S1G_NO_PRIMARY flag when sending channel When sending a channel ensure we include the IEEE80211_CHAN_S1G_NO_PRIMARY flag. Signed-off-by: Lachlan Hodges Link: https://patch.msgid.link/20260109081439.3168-1-lachlan.hodges@morsemicro.com Signed-off-by: Johannes Berg --- include/uapi/linux/nl80211.h | 4 ++++ net/wireless/nl80211.c | 3 +++ 2 files changed, 7 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h index 8134f10e4e6c..964e1c779cdd 100644 --- a/include/uapi/linux/nl80211.h +++ b/include/uapi/linux/nl80211.h @@ -4444,6 +4444,9 @@ enum nl80211_wmm_rule { * channel in current regulatory domain. * @NL80211_FREQUENCY_ATTR_NO_16MHZ: 16 MHz operation is not allowed on this * channel in current regulatory domain. + * @NL80211_FREQUENCY_ATTR_S1G_NO_PRIMARY: Channel is not permitted for use + * as a primary channel. Does not prevent the channel from existing + * as a non-primary subchannel. Only applicable to S1G channels. * @NL80211_FREQUENCY_ATTR_MAX: highest frequency attribute number * currently defined * @__NL80211_FREQUENCY_ATTR_AFTER_LAST: internal use @@ -4492,6 +4495,7 @@ enum nl80211_frequency_attr { NL80211_FREQUENCY_ATTR_NO_4MHZ, NL80211_FREQUENCY_ATTR_NO_8MHZ, NL80211_FREQUENCY_ATTR_NO_16MHZ, + NL80211_FREQUENCY_ATTR_S1G_NO_PRIMARY, /* keep last */ __NL80211_FREQUENCY_ATTR_AFTER_LAST, diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index c961cd42a832..225580507a4b 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -1314,6 +1314,9 @@ static int nl80211_msg_put_channel(struct sk_buff *msg, struct wiphy *wiphy, if ((chan->flags & IEEE80211_CHAN_NO_16MHZ) && nla_put_flag(msg, NL80211_FREQUENCY_ATTR_NO_16MHZ)) goto nla_put_failure; + if ((chan->flags & IEEE80211_CHAN_S1G_NO_PRIMARY) && + nla_put_flag(msg, NL80211_FREQUENCY_ATTR_S1G_NO_PRIMARY)) + goto nla_put_failure; } if (nla_put_u32(msg, NL80211_FREQUENCY_ATTR_MAX_TX_POWER, -- cgit v1.2.3 From 93ada1b3da398b492c45429cef1a1c9651d5c7ba Mon Sep 17 00:00:00 2001 From: Yoav Cohen Date: Tue, 13 Jan 2026 00:05:01 +0200 Subject: ublk: add UBLK_CMD_TRY_STOP_DEV command Add a best-effort stop command, UBLK_CMD_TRY_STOP_DEV, which only stops a ublk device when it has no active openers. Unlike UBLK_CMD_STOP_DEV, this command does not disrupt existing users. New opens are blocked only after disk_openers has reached zero; if the device is busy, the command returns -EBUSY and leaves it running. The ub->block_open flag is used only to close a race with an in-progress open and does not otherwise change open behavior. Advertise support via the UBLK_F_SAFE_STOP_DEV feature flag. Signed-off-by: Yoav Cohen Reviewed-by: Ming Lei Signed-off-by: Jens Axboe --- drivers/block/ublk_drv.c | 44 +++++++++++++++++++++++++++++++++++++++++-- include/uapi/linux/ublk_cmd.h | 9 ++++++++- 2 files changed, 50 insertions(+), 3 deletions(-) (limited to 'include/uapi/linux') diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c index 73490890242b..aaf94d2fb789 100644 --- a/drivers/block/ublk_drv.c +++ b/drivers/block/ublk_drv.c @@ -56,6 +56,7 @@ #define UBLK_CMD_DEL_DEV_ASYNC _IOC_NR(UBLK_U_CMD_DEL_DEV_ASYNC) #define UBLK_CMD_UPDATE_SIZE _IOC_NR(UBLK_U_CMD_UPDATE_SIZE) #define UBLK_CMD_QUIESCE_DEV _IOC_NR(UBLK_U_CMD_QUIESCE_DEV) +#define UBLK_CMD_TRY_STOP_DEV _IOC_NR(UBLK_U_CMD_TRY_STOP_DEV) #define UBLK_IO_REGISTER_IO_BUF _IOC_NR(UBLK_U_IO_REGISTER_IO_BUF) #define UBLK_IO_UNREGISTER_IO_BUF _IOC_NR(UBLK_U_IO_UNREGISTER_IO_BUF) @@ -76,7 +77,8 @@ | UBLK_F_QUIESCE \ | UBLK_F_PER_IO_DAEMON \ | UBLK_F_BUF_REG_OFF_DAEMON \ - | (IS_ENABLED(CONFIG_BLK_DEV_INTEGRITY) ? UBLK_F_INTEGRITY : 0)) + | (IS_ENABLED(CONFIG_BLK_DEV_INTEGRITY) ? UBLK_F_INTEGRITY : 0) \ + | UBLK_F_SAFE_STOP_DEV) #define UBLK_F_ALL_RECOVERY_FLAGS (UBLK_F_USER_RECOVERY \ | UBLK_F_USER_RECOVERY_REISSUE \ @@ -243,6 +245,8 @@ struct ublk_device { struct delayed_work exit_work; struct work_struct partition_scan_work; + bool block_open; /* protected by open_mutex */ + struct ublk_queue *queues[]; }; @@ -984,6 +988,9 @@ static int ublk_open(struct gendisk *disk, blk_mode_t mode) return -EPERM; } + if (ub->block_open) + return -ENXIO; + return 0; } @@ -3343,7 +3350,8 @@ static int ublk_ctrl_add_dev(const struct ublksrv_ctrl_cmd *header) ub->dev_info.flags |= UBLK_F_CMD_IOCTL_ENCODE | UBLK_F_URING_CMD_COMP_IN_TASK | UBLK_F_PER_IO_DAEMON | - UBLK_F_BUF_REG_OFF_DAEMON; + UBLK_F_BUF_REG_OFF_DAEMON | + UBLK_F_SAFE_STOP_DEV; /* GET_DATA isn't needed any more with USER_COPY or ZERO COPY */ if (ub->dev_info.flags & (UBLK_F_USER_COPY | UBLK_F_SUPPORT_ZERO_COPY | @@ -3464,6 +3472,34 @@ static void ublk_ctrl_stop_dev(struct ublk_device *ub) ublk_stop_dev(ub); } +static int ublk_ctrl_try_stop_dev(struct ublk_device *ub) +{ + struct gendisk *disk; + int ret = 0; + + disk = ublk_get_disk(ub); + if (!disk) + return -ENODEV; + + mutex_lock(&disk->open_mutex); + if (disk_openers(disk) > 0) { + ret = -EBUSY; + goto unlock; + } + ub->block_open = true; + /* release open_mutex as del_gendisk() will reacquire it */ + mutex_unlock(&disk->open_mutex); + + ublk_ctrl_stop_dev(ub); + goto out; + +unlock: + mutex_unlock(&disk->open_mutex); +out: + ublk_put_disk(disk); + return ret; +} + static int ublk_ctrl_get_dev_info(struct ublk_device *ub, const struct ublksrv_ctrl_cmd *header) { @@ -3859,6 +3895,7 @@ static int ublk_ctrl_uring_cmd_permission(struct ublk_device *ub, case UBLK_CMD_END_USER_RECOVERY: case UBLK_CMD_UPDATE_SIZE: case UBLK_CMD_QUIESCE_DEV: + case UBLK_CMD_TRY_STOP_DEV: mask = MAY_READ | MAY_WRITE; break; default: @@ -3972,6 +4009,9 @@ static int ublk_ctrl_uring_cmd(struct io_uring_cmd *cmd, case UBLK_CMD_QUIESCE_DEV: ret = ublk_ctrl_quiesce_dev(ub, header); break; + case UBLK_CMD_TRY_STOP_DEV: + ret = ublk_ctrl_try_stop_dev(ub); + break; default: ret = -EOPNOTSUPP; break; diff --git a/include/uapi/linux/ublk_cmd.h b/include/uapi/linux/ublk_cmd.h index 61ac5d8e1078..90f47da4f435 100644 --- a/include/uapi/linux/ublk_cmd.h +++ b/include/uapi/linux/ublk_cmd.h @@ -55,7 +55,8 @@ _IOWR('u', 0x15, struct ublksrv_ctrl_cmd) #define UBLK_U_CMD_QUIESCE_DEV \ _IOWR('u', 0x16, struct ublksrv_ctrl_cmd) - +#define UBLK_U_CMD_TRY_STOP_DEV \ + _IOWR('u', 0x17, struct ublksrv_ctrl_cmd) /* * 64bits are enough now, and it should be easy to extend in case of * running out of feature flags @@ -321,6 +322,12 @@ */ #define UBLK_F_INTEGRITY (1ULL << 16) +/* + * The device supports the UBLK_CMD_TRY_STOP_DEV command, which + * allows stopping the device only if there are no openers. + */ +#define UBLK_F_SAFE_STOP_DEV (1ULL << 17) + /* device state */ #define UBLK_S_DEV_DEAD 0 #define UBLK_S_DEV_LIVE 1 -- cgit v1.2.3 From 406fc2e9ca65e0df345ebf4ce95aa87cb6416f35 Mon Sep 17 00:00:00 2001 From: Deepa Guthyappa Madivalara Date: Wed, 10 Dec 2025 10:59:04 -0800 Subject: media: uapi: videodev2: Add support for AV1 stateful decoder Introduce a new pixel format, V4L2_PIX_FMT_AV1, to the Video4Linux2(V4L2) API. This format is intended for AV1 bitstreams in stateful decoding/encoding workflows. The fourcc code 'AV10' is used to distinguish this format from the existing V4L2_PIX_FMT_AV1_FRAME, which is used for stateless AV1 decoder implementation. Reviewed-by: Bryan O'Donoghue Reviewed-by: Nicolas Dufresne Reviewed-by: Hans Verkuil Signed-off-by: Deepa Guthyappa Madivalara Tested-by: Val Packett Signed-off-by: Bryan O'Donoghue Signed-off-by: Hans Verkuil --- Documentation/userspace-api/media/v4l/pixfmt-compressed.rst | 8 ++++++++ include/uapi/linux/videodev2.h | 1 + 2 files changed, 9 insertions(+) (limited to 'include/uapi/linux') diff --git a/Documentation/userspace-api/media/v4l/pixfmt-compressed.rst b/Documentation/userspace-api/media/v4l/pixfmt-compressed.rst index c7efb0465db6..235f955d3cd5 100644 --- a/Documentation/userspace-api/media/v4l/pixfmt-compressed.rst +++ b/Documentation/userspace-api/media/v4l/pixfmt-compressed.rst @@ -275,6 +275,14 @@ Compressed Formats of macroblocks to decode a full corresponding frame to the matching capture buffer. + * .. _V4L2-PIX-FMT-AV1: + + - ``V4L2_PIX_FMT_AV1`` + - 'AV01' + - AV1 compressed video frame. This format is adapted for implementing AV1 + pipeline. The decoder implements stateful video decoder and expects one + temporal unit per buffer in OBU stream format. + The encoder generates one Temporal Unit per buffer. .. raw:: latex \normalsize diff --git a/include/uapi/linux/videodev2.h b/include/uapi/linux/videodev2.h index add08188f068..848e86617d5c 100644 --- a/include/uapi/linux/videodev2.h +++ b/include/uapi/linux/videodev2.h @@ -775,6 +775,7 @@ struct v4l2_pix_format { #define V4L2_PIX_FMT_H264_SLICE v4l2_fourcc('S', '2', '6', '4') /* H264 parsed slices */ #define V4L2_PIX_FMT_HEVC_SLICE v4l2_fourcc('S', '2', '6', '5') /* HEVC parsed slices */ #define V4L2_PIX_FMT_AV1_FRAME v4l2_fourcc('A', 'V', '1', 'F') /* AV1 parsed frame */ +#define V4L2_PIX_FMT_AV1 v4l2_fourcc('A', 'V', '0', '1') /* AV1 */ #define V4L2_PIX_FMT_SPK v4l2_fourcc('S', 'P', 'K', '0') /* Sorenson Spark */ #define V4L2_PIX_FMT_RV30 v4l2_fourcc('R', 'V', '3', '0') /* RealVideo 8 */ #define V4L2_PIX_FMT_RV40 v4l2_fourcc('R', 'V', '4', '0') /* RealVideo 9 & 10 */ -- cgit v1.2.3 From 1bddd758bac21fbbd8a06af746ec7b6d878a9d2c Mon Sep 17 00:00:00 2001 From: Jonas Köppeler Date: Fri, 9 Jan 2026 14:15:34 +0100 Subject: net/sched: sch_cake: share shaper state across sub-instances of cake_mq MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This commit adds shared shaper state across the cake instances beneath a cake_mq qdisc. It works by periodically tracking the number of active instances, and scaling the configured rate by the number of active queues. The scan is lockless and simply reads the qlen and the last_active state variable of each of the instances configured beneath the parent cake_mq instance. Locking is not required since the values are only updated by the owning instance, and eventual consistency is sufficient for the purpose of estimating the number of active queues. The interval for scanning the number of active queues is set to 200 us. We found this to be a good tradeoff between overhead and response time. For a detailed analysis of this aspect see the Netdevconf talk: https://netdevconf.info/0x19/docs/netdev-0x19-paper16-talk-paper.pdf Reviewed-by: Jamal Hadi Salim Signed-off-by: Jonas Köppeler Signed-off-by: Toke Høiland-Jørgensen Link: https://patch.msgid.link/20260109-mq-cake-sub-qdisc-v8-5-8d613fece5d8@redhat.com Signed-off-by: Paolo Abeni --- Documentation/netlink/specs/tc.yaml | 3 +++ include/uapi/linux/pkt_sched.h | 1 + net/sched/sch_cake.c | 51 +++++++++++++++++++++++++++++++++++++ 3 files changed, 55 insertions(+) (limited to 'include/uapi/linux') diff --git a/Documentation/netlink/specs/tc.yaml b/Documentation/netlink/specs/tc.yaml index b398f7a46dae..2e663333a279 100644 --- a/Documentation/netlink/specs/tc.yaml +++ b/Documentation/netlink/specs/tc.yaml @@ -2207,6 +2207,9 @@ attribute-sets: - name: blue-timer-us type: s32 + - + name: active-queues + type: u32 - name: cake-tin-stats-attrs name-prefix: tca-cake-tin-stats- diff --git a/include/uapi/linux/pkt_sched.h b/include/uapi/linux/pkt_sched.h index c2da76e78bad..66e8072f44df 100644 --- a/include/uapi/linux/pkt_sched.h +++ b/include/uapi/linux/pkt_sched.h @@ -1036,6 +1036,7 @@ enum { TCA_CAKE_STATS_DROP_NEXT_US, TCA_CAKE_STATS_P_DROP, TCA_CAKE_STATS_BLUE_TIMER_US, + TCA_CAKE_STATS_ACTIVE_QUEUES, __TCA_CAKE_STATS_MAX }; #define TCA_CAKE_STATS_MAX (__TCA_CAKE_STATS_MAX - 1) diff --git a/net/sched/sch_cake.c b/net/sched/sch_cake.c index 2e60e7980558..e30ef7f8ee68 100644 --- a/net/sched/sch_cake.c +++ b/net/sched/sch_cake.c @@ -202,6 +202,7 @@ struct cake_sched_config { u64 rate_bps; u64 interval; u64 target; + u64 sync_time; u32 buffer_config_limit; u32 fwmark_mask; u16 fwmark_shft; @@ -258,6 +259,11 @@ struct cake_sched_data { u16 max_adjlen; u16 min_netlen; u16 min_adjlen; + + /* mq sync state */ + u64 last_checked_active; + u64 last_active; + u32 active_queues; }; enum { @@ -384,6 +390,8 @@ static const u32 inv_sqrt_cache[REC_INV_SQRT_CACHE] = { 1239850263, 1191209601, 1147878294, 1108955788 }; +static void cake_set_rate(struct cake_tin_data *b, u64 rate, u32 mtu, + u64 target_ns, u64 rtt_est_ns); /* http://en.wikipedia.org/wiki/Methods_of_computing_square_roots * new_invsqrt = (invsqrt / 2) * (3 - count * invsqrt^2) * @@ -2004,6 +2012,40 @@ static struct sk_buff *cake_dequeue(struct Qdisc *sch) u64 delay; u32 len; + if (q->config->is_shared && now - q->last_checked_active >= q->config->sync_time) { + struct net_device *dev = qdisc_dev(sch); + struct cake_sched_data *other_priv; + u64 new_rate = q->config->rate_bps; + u64 other_qlen, other_last_active; + struct Qdisc *other_sch; + u32 num_active_qs = 1; + unsigned int ntx; + + for (ntx = 0; ntx < dev->num_tx_queues; ntx++) { + other_sch = rcu_dereference(netdev_get_tx_queue(dev, ntx)->qdisc_sleeping); + other_priv = qdisc_priv(other_sch); + + if (other_priv == q) + continue; + + other_qlen = READ_ONCE(other_sch->q.qlen); + other_last_active = READ_ONCE(other_priv->last_active); + + if (other_qlen || other_last_active > q->last_checked_active) + num_active_qs++; + } + + if (num_active_qs > 1) + new_rate = div64_u64(q->config->rate_bps, num_active_qs); + + /* mtu = 0 is used to only update the rate and not mess with cobalt params */ + cake_set_rate(b, new_rate, 0, 0, 0); + q->last_checked_active = now; + q->active_queues = num_active_qs; + q->rate_ns = b->tin_rate_ns; + q->rate_shft = b->tin_rate_shft; + } + begin: if (!sch->q.qlen) return NULL; @@ -2203,6 +2245,7 @@ retry: b->tin_ecn_mark += !!flow->cvars.ecn_marked; qdisc_bstats_update(sch, skb); + WRITE_ONCE(q->last_active, now); /* collect delay stats */ delay = ktime_to_ns(ktime_sub(now, cobalt_get_enqueue_time(skb))); @@ -2303,6 +2346,9 @@ static void cake_set_rate(struct cake_tin_data *b, u64 rate, u32 mtu, b->tin_rate_ns = rate_ns; b->tin_rate_shft = rate_shft; + if (mtu == 0) + return; + byte_target_ns = (byte_target * rate_ns) >> rate_shft; b->cparams.target = max((byte_target_ns * 3) / 2, target_ns); @@ -2769,6 +2815,7 @@ static void cake_config_init(struct cake_sched_config *q, bool is_shared) */ q->rate_flags |= CAKE_FLAG_SPLIT_GSO; q->is_shared = is_shared; + q->sync_time = 200 * NSEC_PER_USEC; } static int cake_init(struct Qdisc *sch, struct nlattr *opt, @@ -2842,6 +2889,9 @@ static int cake_init(struct Qdisc *sch, struct nlattr *opt, qd->avg_peak_bandwidth = q->rate_bps; qd->min_netlen = ~0; qd->min_adjlen = ~0; + qd->active_queues = 0; + qd->last_checked_active = 0; + return 0; err: kvfree(qd->config); @@ -2974,6 +3024,7 @@ static int cake_dump_stats(struct Qdisc *sch, struct gnet_dump *d) PUT_STAT_U32(MAX_ADJLEN, q->max_adjlen); PUT_STAT_U32(MIN_NETLEN, q->min_netlen); PUT_STAT_U32(MIN_ADJLEN, q->min_adjlen); + PUT_STAT_U32(ACTIVE_QUEUES, q->active_queues); #undef PUT_STAT_U32 #undef PUT_STAT_U64 -- cgit v1.2.3 From f29c852149f94dc1975c64fa919b3dd62db04d23 Mon Sep 17 00:00:00 2001 From: Ainy Kumari Date: Wed, 14 Jan 2026 16:48:52 +0530 Subject: wifi: cfg80211: add support for EPPKE Authentication Protocol Add an extended feature flag NL80211_EXT_FEATURE_EPPKE to allow a driver to indicate support for the Enhanced Privacy Protection Key Exchange (EPPKE) authentication protocol in non-AP STA mode, as defined in "IEEE P802.11bi/D3.0, 12.16.9". In case of SME in userspace, the Authentication frame body is prepared in userspace while the driver finalizes the Authentication frame once it receives the required fields and elements. The driver indicates support for EPPKE using the extended feature flag so that userspace can initiate EPPKE authentication. When the feature flag is set, process EPPKE Authentication frames from userspace in non-AP STA mode. If the flag is not set, reject EPPKE Authentication frames. Define a new authentication type NL80211_AUTHTYPE_EPPKE for EPPKE. Signed-off-by: Ainy Kumari Co-developed-by: Kavita Kavita Signed-off-by: Kavita Kavita Link: https://patch.msgid.link/20260114111900.2196941-2-kavita.kavita@oss.qualcomm.com Signed-off-by: Johannes Berg --- include/linux/ieee80211.h | 1 + include/uapi/linux/nl80211.h | 7 +++++++ net/wireless/nl80211.c | 14 ++++++++++++-- 3 files changed, 20 insertions(+), 2 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/linux/ieee80211.h b/include/linux/ieee80211.h index 96439de55f07..fbde215c25aa 100644 --- a/include/linux/ieee80211.h +++ b/include/linux/ieee80211.h @@ -1351,6 +1351,7 @@ struct ieee80211_tdls_data { #define WLAN_AUTH_FILS_SK 4 #define WLAN_AUTH_FILS_SK_PFS 5 #define WLAN_AUTH_FILS_PK 6 +#define WLAN_AUTH_EPPKE 9 #define WLAN_AUTH_LEAP 128 #define WLAN_AUTH_CHALLENGE_LEN 128 diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h index 964e1c779cdd..351d4d176f87 100644 --- a/include/uapi/linux/nl80211.h +++ b/include/uapi/linux/nl80211.h @@ -5429,6 +5429,7 @@ enum nl80211_bss_status { * @NL80211_AUTHTYPE_FILS_SK: Fast Initial Link Setup shared key * @NL80211_AUTHTYPE_FILS_SK_PFS: Fast Initial Link Setup shared key with PFS * @NL80211_AUTHTYPE_FILS_PK: Fast Initial Link Setup public key + * @NL80211_AUTHTYPE_EPPKE: Enhanced Privacy Protection Key Exchange * @__NL80211_AUTHTYPE_NUM: internal * @NL80211_AUTHTYPE_MAX: maximum valid auth algorithm * @NL80211_AUTHTYPE_AUTOMATIC: determine automatically (if necessary by @@ -5444,6 +5445,7 @@ enum nl80211_auth_type { NL80211_AUTHTYPE_FILS_SK, NL80211_AUTHTYPE_FILS_SK_PFS, NL80211_AUTHTYPE_FILS_PK, + NL80211_AUTHTYPE_EPPKE, /* keep last */ __NL80211_AUTHTYPE_NUM, @@ -6748,6 +6750,10 @@ enum nl80211_feature_flags { * @NL80211_EXT_FEATURE_BEACON_RATE_EHT: Driver supports beacon rate * configuration (AP/mesh) with EHT rates. * + * @NL80211_EXT_FEATURE_EPPKE: Driver supports Enhanced Privacy Protection + * Key Exchange (EPPKE) with user space SME (NL80211_CMD_AUTHENTICATE) + * in non-AP STA mode. + * * @NUM_NL80211_EXT_FEATURES: number of extended features. * @MAX_NL80211_EXT_FEATURES: highest extended feature index. */ @@ -6824,6 +6830,7 @@ enum nl80211_ext_feature_index { NL80211_EXT_FEATURE_DFS_CONCURRENT, NL80211_EXT_FEATURE_SPP_AMSDU_SUPPORT, NL80211_EXT_FEATURE_BEACON_RATE_EHT, + NL80211_EXT_FEATURE_EPPKE, /* add new features before the definition below */ NUM_NL80211_EXT_FEATURES, diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index 225580507a4b..8f3a27b7d4fd 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -6473,6 +6473,10 @@ static bool nl80211_valid_auth_type(struct cfg80211_registered_device *rdev, auth_type == NL80211_AUTHTYPE_FILS_SK_PFS || auth_type == NL80211_AUTHTYPE_FILS_PK)) return false; + if (!wiphy_ext_feature_isset(&rdev->wiphy, + NL80211_EXT_FEATURE_EPPKE) && + auth_type == NL80211_AUTHTYPE_EPPKE) + return false; return true; case NL80211_CMD_CONNECT: if (!(rdev->wiphy.features & NL80211_FEATURE_SAE) && @@ -6490,6 +6494,10 @@ static bool nl80211_valid_auth_type(struct cfg80211_registered_device *rdev, NL80211_EXT_FEATURE_FILS_SK_OFFLOAD) && auth_type == NL80211_AUTHTYPE_FILS_SK) return false; + if (!wiphy_ext_feature_isset(&rdev->wiphy, + NL80211_EXT_FEATURE_EPPKE) && + auth_type == NL80211_AUTHTYPE_EPPKE) + return false; return true; case NL80211_CMD_START_AP: if (!wiphy_ext_feature_isset(&rdev->wiphy, @@ -11956,7 +11964,8 @@ static int nl80211_authenticate(struct sk_buff *skb, struct genl_info *info) if ((auth_type == NL80211_AUTHTYPE_SAE || auth_type == NL80211_AUTHTYPE_FILS_SK || auth_type == NL80211_AUTHTYPE_FILS_SK_PFS || - auth_type == NL80211_AUTHTYPE_FILS_PK) && + auth_type == NL80211_AUTHTYPE_FILS_PK || + auth_type == NL80211_AUTHTYPE_EPPKE) && !info->attrs[NL80211_ATTR_AUTH_DATA]) return -EINVAL; @@ -11964,7 +11973,8 @@ static int nl80211_authenticate(struct sk_buff *skb, struct genl_info *info) if (auth_type != NL80211_AUTHTYPE_SAE && auth_type != NL80211_AUTHTYPE_FILS_SK && auth_type != NL80211_AUTHTYPE_FILS_SK_PFS && - auth_type != NL80211_AUTHTYPE_FILS_PK) + auth_type != NL80211_AUTHTYPE_FILS_PK && + auth_type != NL80211_AUTHTYPE_EPPKE) return -EINVAL; req.auth_data = nla_data(info->attrs[NL80211_ATTR_AUTH_DATA]); req.auth_data_len = nla_len(info->attrs[NL80211_ATTR_AUTH_DATA]); -- cgit v1.2.3 From 9d17a040c15d4b99484f13cf08dd45a9e308beeb Mon Sep 17 00:00:00 2001 From: Ainy Kumari Date: Wed, 14 Jan 2026 16:48:53 +0530 Subject: wifi: cfg80211: add feature flag for (re)association frame encryption Introduce an extended feature flag that allows drivers to signal support for encryption of (Re)Association Request and Response frames in both non-AP STA and AP mode, as specified in specification "IEEE P802.11bi/D3.0, 12.16.6". Signed-off-by: Ainy Kumari Signed-off-by: Kavita Kavita Link: https://patch.msgid.link/20260114111900.2196941-3-kavita.kavita@oss.qualcomm.com Signed-off-by: Johannes Berg --- include/uapi/linux/nl80211.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h index 351d4d176f87..60573334e086 100644 --- a/include/uapi/linux/nl80211.h +++ b/include/uapi/linux/nl80211.h @@ -6754,6 +6754,11 @@ enum nl80211_feature_flags { * Key Exchange (EPPKE) with user space SME (NL80211_CMD_AUTHENTICATE) * in non-AP STA mode. * + * @NL80211_EXT_FEATURE_ASSOC_FRAME_ENCRYPTION: This specifies that the + * driver supports encryption of (Re)Association Request and Response + * frames in both non‑AP STA and AP mode as specified in + * "IEEE P802.11bi/D3.0, 12.16.6". + * * @NUM_NL80211_EXT_FEATURES: number of extended features. * @MAX_NL80211_EXT_FEATURES: highest extended feature index. */ @@ -6831,6 +6836,7 @@ enum nl80211_ext_feature_index { NL80211_EXT_FEATURE_SPP_AMSDU_SUPPORT, NL80211_EXT_FEATURE_BEACON_RATE_EHT, NL80211_EXT_FEATURE_EPPKE, + NL80211_EXT_FEATURE_ASSOC_FRAME_ENCRYPTION, /* add new features before the definition below */ NUM_NL80211_EXT_FEATURES, -- cgit v1.2.3 From 6ee3a22c61cdf57d71592ec9f3b9439cd5d0c75f Mon Sep 17 00:00:00 2001 From: Sai Pratyusha Magam Date: Wed, 14 Jan 2026 16:48:55 +0530 Subject: wifi: nl80211: Add support for EPP peer indication Introduce a new netlink attribute NL80211_ATTR_EPP_PEER to be used with NL80211_CMD_NEW_STA and NL80211_CMD_ADD_LINK_STA for the userspace to indicate that a non-AP STA is an Enhanced Privacy Protection (EPP) peer. Co-developed-by: Rohan Dutta Signed-off-by: Rohan Dutta Signed-off-by: Sai Pratyusha Magam Signed-off-by: Kavita Kavita Link: https://patch.msgid.link/20260114111900.2196941-5-kavita.kavita@oss.qualcomm.com Signed-off-by: Johannes Berg --- include/net/cfg80211.h | 2 ++ include/uapi/linux/nl80211.h | 5 +++++ net/wireless/nl80211.c | 5 +++++ 3 files changed, 12 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index cbccedf32228..6d8e35a0dde4 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -1785,6 +1785,7 @@ struct cfg80211_ttlm_params { * present/updated * @eml_cap: EML capabilities of this station * @link_sta_params: link related params. + * @epp_peer: EPP peer indication */ struct station_parameters { struct net_device *vlan; @@ -1811,6 +1812,7 @@ struct station_parameters { bool eml_cap_present; u16 eml_cap; struct link_station_parameters link_sta_params; + bool epp_peer; }; /** diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h index 60573334e086..eb92296457c9 100644 --- a/include/uapi/linux/nl80211.h +++ b/include/uapi/linux/nl80211.h @@ -2973,6 +2973,9 @@ enum nl80211_commands { * primary channel is 2 MHz wide, and the control channel designates * the 1 MHz primary subchannel within that 2 MHz primary. * + * @NL80211_ATTR_EPP_PEER: A flag attribute to indicate if the peer is an EPP + * STA. Used with %NL80211_CMD_NEW_STA and %NL80211_CMD_ADD_LINK_STA + * * @NUM_NL80211_ATTR: total number of nl80211_attrs available * @NL80211_ATTR_MAX: highest attribute number currently defined * @__NL80211_ATTR_AFTER_LAST: internal use @@ -3541,6 +3544,8 @@ enum nl80211_attrs { NL80211_ATTR_S1G_PRIMARY_2MHZ, + NL80211_ATTR_EPP_PEER, + /* add attributes here, update the policy in nl80211.c */ __NL80211_ATTR_AFTER_LAST, diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index df159a5f1816..3d74bea09ba3 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -932,6 +932,7 @@ static const struct nla_policy nl80211_policy[NUM_NL80211_ATTR] = { NLA_POLICY_NESTED(nl80211_s1g_short_beacon), [NL80211_ATTR_BSS_PARAM] = { .type = NLA_FLAG }, [NL80211_ATTR_S1G_PRIMARY_2MHZ] = { .type = NLA_FLAG }, + [NL80211_ATTR_EPP_PEER] = { .type = NLA_FLAG }, }; /* policy for the key attributes */ @@ -8792,6 +8793,10 @@ static int nl80211_new_station(struct sk_buff *skb, struct genl_info *info) goto out; } } + + params.epp_peer = + nla_get_flag(info->attrs[NL80211_ATTR_EPP_PEER]); + err = rdev_add_station(rdev, dev, mac_addr, ¶ms); out: dev_put(params.vlan); -- cgit v1.2.3 From dacbfc16780837aa3e00c684d89492d211fd809f Mon Sep 17 00:00:00 2001 From: Thorsten Blum Date: Mon, 5 Jan 2026 13:24:03 +0100 Subject: crypto: af_alg - Annotate struct af_alg_iv with __counted_by Add the __counted_by() compiler attribute to the flexible array member 'iv' to improve access bounds-checking via CONFIG_UBSAN_BOUNDS and CONFIG_FORTIFY_SOURCE. Reviewed-by: Simon Horman Signed-off-by: Thorsten Blum Link: https://patch.msgid.link/20260105122402.2685-2-thorsten.blum@linux.dev Signed-off-by: Kees Cook --- include/uapi/linux/if_alg.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/if_alg.h b/include/uapi/linux/if_alg.h index b35871cbeed7..4f51e198ac2e 100644 --- a/include/uapi/linux/if_alg.h +++ b/include/uapi/linux/if_alg.h @@ -42,7 +42,7 @@ struct sockaddr_alg_new { struct af_alg_iv { __u32 ivlen; - __u8 iv[]; + __u8 iv[] __counted_by(ivlen); }; /* Socket options */ -- cgit v1.2.3 From d2bdcde9626cbea0c44a6aaa33b440c8adf81e09 Mon Sep 17 00:00:00 2001 From: Dapeng Mi Date: Wed, 14 Jan 2026 09:17:45 +0800 Subject: perf/x86/intel: Add support for PEBS memory auxiliary info field in DMR With the introduction of the OMR feature, the PEBS memory auxiliary info field for load and store latency events has been restructured for DMR. The memory auxiliary info field's bit[8] indicates whether a L2 cache miss occurred for a memory load or store instruction. If bit[8] is 0, it signifies no L2 cache miss, and bits[7:0] specify the exact cache data source (up to the L2 cache level). If bit[8] is 1, bits[7:0] represent the OMR encoding, indicating the specific L3 cache or memory region involved in the memory access. A significant enhancement is OMR encoding provides up to 8 fine-grained memory regions besides the cache region. A significant enhancement for OMR encoding is the ability to provide up to 8 fine-grained memory regions in addition to the cache region, offering more detailed insights into memory access regions. For detailed information on the memory auxiliary info encoding, please refer to section 16.2 "PEBS LOAD LATENCY AND STORE LATENCY FACILITY" in the ISE documentation. This patch ensures that the PEBS memory auxiliary info field is correctly interpreted and utilized in DMR. Signed-off-by: Dapeng Mi Signed-off-by: Peter Zijlstra (Intel) Link: https://patch.msgid.link/20260114011750.350569-3-dapeng1.mi@linux.intel.com --- arch/x86/events/intel/ds.c | 140 ++++++++++++++++++++++++++++++++++ arch/x86/events/perf_event.h | 2 + include/uapi/linux/perf_event.h | 27 ++++++- tools/include/uapi/linux/perf_event.h | 27 ++++++- 4 files changed, 190 insertions(+), 6 deletions(-) (limited to 'include/uapi/linux') diff --git a/arch/x86/events/intel/ds.c b/arch/x86/events/intel/ds.c index feb1c3cf63e4..272e652f25fc 100644 --- a/arch/x86/events/intel/ds.c +++ b/arch/x86/events/intel/ds.c @@ -34,6 +34,17 @@ struct pebs_record_32 { */ +union omr_encoding { + struct { + u8 omr_source : 4; + u8 omr_remote : 1; + u8 omr_hitm : 1; + u8 omr_snoop : 1; + u8 omr_promoted : 1; + }; + u8 omr_full; +}; + union intel_x86_pebs_dse { u64 val; struct { @@ -73,6 +84,18 @@ union intel_x86_pebs_dse { unsigned int lnc_addr_blk:1; unsigned int ld_reserved6:18; }; + struct { + unsigned int pnc_dse: 8; + unsigned int pnc_l2_miss:1; + unsigned int pnc_stlb_clean_hit:1; + unsigned int pnc_stlb_any_hit:1; + unsigned int pnc_stlb_miss:1; + unsigned int pnc_locked:1; + unsigned int pnc_data_blk:1; + unsigned int pnc_addr_blk:1; + unsigned int pnc_fb_full:1; + unsigned int ld_reserved8:16; + }; }; @@ -228,6 +251,85 @@ void __init intel_pmu_pebs_data_source_lnl(void) __intel_pmu_pebs_data_source_cmt(data_source); } +/* Version for Panthercove and later */ + +/* L2 hit */ +#define PNC_PEBS_DATA_SOURCE_MAX 16 +static u64 pnc_pebs_l2_hit_data_source[PNC_PEBS_DATA_SOURCE_MAX] = { + P(OP, LOAD) | P(LVL, NA) | LEVEL(NA) | P(SNOOP, NA), /* 0x00: non-cache access */ + OP_LH | LEVEL(L0) | P(SNOOP, NONE), /* 0x01: L0 hit */ + OP_LH | P(LVL, L1) | LEVEL(L1) | P(SNOOP, NONE), /* 0x02: L1 hit */ + OP_LH | P(LVL, LFB) | LEVEL(LFB) | P(SNOOP, NONE), /* 0x03: L1 Miss Handling Buffer hit */ + OP_LH | P(LVL, L2) | LEVEL(L2) | P(SNOOP, NONE), /* 0x04: L2 Hit Clean */ + 0, /* 0x05: Reserved */ + 0, /* 0x06: Reserved */ + OP_LH | P(LVL, L2) | LEVEL(L2) | P(SNOOP, HIT), /* 0x07: L2 Hit Snoop HIT */ + OP_LH | P(LVL, L2) | LEVEL(L2) | P(SNOOP, HITM), /* 0x08: L2 Hit Snoop Hit Modified */ + OP_LH | P(LVL, L2) | LEVEL(L2) | P(SNOOP, MISS), /* 0x09: Prefetch Promotion */ + OP_LH | P(LVL, L2) | LEVEL(L2) | P(SNOOP, MISS), /* 0x0a: Cross Core Prefetch Promotion */ + 0, /* 0x0b: Reserved */ + 0, /* 0x0c: Reserved */ + 0, /* 0x0d: Reserved */ + 0, /* 0x0e: Reserved */ + OP_LH | P(LVL, UNC) | LEVEL(NA) | P(SNOOP, NONE), /* 0x0f: uncached */ +}; + +/* L2 miss */ +#define OMR_DATA_SOURCE_MAX 16 +static u64 omr_data_source[OMR_DATA_SOURCE_MAX] = { + P(OP, LOAD) | P(LVL, NA) | LEVEL(NA) | P(SNOOP, NA), /* 0x00: invalid */ + 0, /* 0x01: Reserved */ + OP_LH | P(LVL, L3) | LEVEL(L3) | P(REGION, L_SHARE), /* 0x02: local CA shared cache */ + OP_LH | P(LVL, L3) | LEVEL(L3) | P(REGION, L_NON_SHARE),/* 0x03: local CA non-shared cache */ + OP_LH | P(LVL, L3) | LEVEL(L3) | P(REGION, O_IO), /* 0x04: other CA IO agent */ + OP_LH | P(LVL, L3) | LEVEL(L3) | P(REGION, O_SHARE), /* 0x05: other CA shared cache */ + OP_LH | P(LVL, L3) | LEVEL(L3) | P(REGION, O_NON_SHARE),/* 0x06: other CA non-shared cache */ + OP_LH | LEVEL(RAM) | P(REGION, MMIO), /* 0x07: MMIO */ + OP_LH | LEVEL(RAM) | P(REGION, MEM0), /* 0x08: Memory region 0 */ + OP_LH | LEVEL(RAM) | P(REGION, MEM1), /* 0x09: Memory region 1 */ + OP_LH | LEVEL(RAM) | P(REGION, MEM2), /* 0x0a: Memory region 2 */ + OP_LH | LEVEL(RAM) | P(REGION, MEM3), /* 0x0b: Memory region 3 */ + OP_LH | LEVEL(RAM) | P(REGION, MEM4), /* 0x0c: Memory region 4 */ + OP_LH | LEVEL(RAM) | P(REGION, MEM5), /* 0x0d: Memory region 5 */ + OP_LH | LEVEL(RAM) | P(REGION, MEM6), /* 0x0e: Memory region 6 */ + OP_LH | LEVEL(RAM) | P(REGION, MEM7), /* 0x0f: Memory region 7 */ +}; + +static u64 parse_omr_data_source(u8 dse) +{ + union omr_encoding omr; + u64 val = 0; + + omr.omr_full = dse; + val = omr_data_source[omr.omr_source]; + if (omr.omr_source > 0x1 && omr.omr_source < 0x7) + val |= omr.omr_remote ? P(LVL, REM_CCE1) : 0; + else if (omr.omr_source > 0x7) + val |= omr.omr_remote ? P(LVL, REM_RAM1) : P(LVL, LOC_RAM); + + if (omr.omr_remote) + val |= REM; + + val |= omr.omr_hitm ? P(SNOOP, HITM) : P(SNOOP, HIT); + + if (omr.omr_source == 0x2) { + u8 snoop = omr.omr_snoop | omr.omr_promoted; + + if (snoop == 0x0) + val |= P(SNOOP, NA); + else if (snoop == 0x1) + val |= P(SNOOP, MISS); + else if (snoop == 0x2) + val |= P(SNOOP, HIT); + else if (snoop == 0x3) + val |= P(SNOOP, NONE); + } else if (omr.omr_source > 0x2 && omr.omr_source < 0x7) { + val |= omr.omr_snoop ? P(SNOOPX, FWD) : 0; + } + + return val; +} + static u64 precise_store_data(u64 status) { union intel_x86_pebs_dse dse; @@ -411,6 +513,44 @@ u64 arl_h_latency_data(struct perf_event *event, u64 status) return lnl_latency_data(event, status); } +u64 pnc_latency_data(struct perf_event *event, u64 status) +{ + union intel_x86_pebs_dse dse; + union perf_mem_data_src src; + u64 val; + + dse.val = status; + + if (!dse.pnc_l2_miss) + val = pnc_pebs_l2_hit_data_source[dse.pnc_dse & 0xf]; + else + val = parse_omr_data_source(dse.pnc_dse); + + if (!val) + val = P(OP, LOAD) | LEVEL(NA) | P(SNOOP, NA); + + if (dse.pnc_stlb_miss) + val |= P(TLB, MISS) | P(TLB, L2); + else + val |= P(TLB, HIT) | P(TLB, L1) | P(TLB, L2); + + if (dse.pnc_locked) + val |= P(LOCK, LOCKED); + + if (dse.pnc_data_blk) + val |= P(BLK, DATA); + if (dse.pnc_addr_blk) + val |= P(BLK, ADDR); + if (!dse.pnc_data_blk && !dse.pnc_addr_blk) + val |= P(BLK, NA); + + src.val = val; + if (event->hw.flags & PERF_X86_EVENT_PEBS_ST_HSW) + src.mem_op = P(OP, STORE); + + return src.val; +} + static u64 load_latency_data(struct perf_event *event, u64 status) { union intel_x86_pebs_dse dse; diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h index 586e3fdfe6d8..bd501c2a0f73 100644 --- a/arch/x86/events/perf_event.h +++ b/arch/x86/events/perf_event.h @@ -1664,6 +1664,8 @@ u64 lnl_latency_data(struct perf_event *event, u64 status); u64 arl_h_latency_data(struct perf_event *event, u64 status); +u64 pnc_latency_data(struct perf_event *event, u64 status); + extern struct event_constraint intel_core2_pebs_event_constraints[]; extern struct event_constraint intel_atom_pebs_event_constraints[]; diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h index c44a8fb3e418..533393ec94d0 100644 --- a/include/uapi/linux/perf_event.h +++ b/include/uapi/linux/perf_event.h @@ -1330,14 +1330,16 @@ union perf_mem_data_src { mem_snoopx : 2, /* Snoop mode, ext */ mem_blk : 3, /* Access blocked */ mem_hops : 3, /* Hop level */ - mem_rsvd : 18; + mem_region : 5, /* cache/memory regions */ + mem_rsvd : 13; }; }; #elif defined(__BIG_ENDIAN_BITFIELD) union perf_mem_data_src { __u64 val; struct { - __u64 mem_rsvd : 18, + __u64 mem_rsvd : 13, + mem_region : 5, /* cache/memory regions */ mem_hops : 3, /* Hop level */ mem_blk : 3, /* Access blocked */ mem_snoopx : 2, /* Snoop mode, ext */ @@ -1394,7 +1396,7 @@ union perf_mem_data_src { #define PERF_MEM_LVLNUM_L4 0x0004 /* L4 */ #define PERF_MEM_LVLNUM_L2_MHB 0x0005 /* L2 Miss Handling Buffer */ #define PERF_MEM_LVLNUM_MSC 0x0006 /* Memory-side Cache */ -/* 0x007 available */ +#define PERF_MEM_LVLNUM_L0 0x0007 /* L0 */ #define PERF_MEM_LVLNUM_UNC 0x0008 /* Uncached */ #define PERF_MEM_LVLNUM_CXL 0x0009 /* CXL */ #define PERF_MEM_LVLNUM_IO 0x000a /* I/O */ @@ -1447,6 +1449,25 @@ union perf_mem_data_src { /* 5-7 available */ #define PERF_MEM_HOPS_SHIFT 43 +/* Cache/Memory region */ +#define PERF_MEM_REGION_NA 0x0 /* Invalid */ +#define PERF_MEM_REGION_RSVD 0x01 /* Reserved */ +#define PERF_MEM_REGION_L_SHARE 0x02 /* Local CA shared cache */ +#define PERF_MEM_REGION_L_NON_SHARE 0x03 /* Local CA non-shared cache */ +#define PERF_MEM_REGION_O_IO 0x04 /* Other CA IO agent */ +#define PERF_MEM_REGION_O_SHARE 0x05 /* Other CA shared cache */ +#define PERF_MEM_REGION_O_NON_SHARE 0x06 /* Other CA non-shared cache */ +#define PERF_MEM_REGION_MMIO 0x07 /* MMIO */ +#define PERF_MEM_REGION_MEM0 0x08 /* Memory region 0 */ +#define PERF_MEM_REGION_MEM1 0x09 /* Memory region 1 */ +#define PERF_MEM_REGION_MEM2 0x0a /* Memory region 2 */ +#define PERF_MEM_REGION_MEM3 0x0b /* Memory region 3 */ +#define PERF_MEM_REGION_MEM4 0x0c /* Memory region 4 */ +#define PERF_MEM_REGION_MEM5 0x0d /* Memory region 5 */ +#define PERF_MEM_REGION_MEM6 0x0e /* Memory region 6 */ +#define PERF_MEM_REGION_MEM7 0x0f /* Memory region 7 */ +#define PERF_MEM_REGION_SHIFT 46 + #define PERF_MEM_S(a, s) \ (((__u64)PERF_MEM_##a##_##s) << PERF_MEM_##a##_SHIFT) diff --git a/tools/include/uapi/linux/perf_event.h b/tools/include/uapi/linux/perf_event.h index c44a8fb3e418..d4b99610a3b0 100644 --- a/tools/include/uapi/linux/perf_event.h +++ b/tools/include/uapi/linux/perf_event.h @@ -1330,14 +1330,16 @@ union perf_mem_data_src { mem_snoopx : 2, /* Snoop mode, ext */ mem_blk : 3, /* Access blocked */ mem_hops : 3, /* Hop level */ - mem_rsvd : 18; + mem_region : 5, /* cache/memory regions */ + mem_rsvd : 13; }; }; #elif defined(__BIG_ENDIAN_BITFIELD) union perf_mem_data_src { __u64 val; struct { - __u64 mem_rsvd : 18, + __u64 mem_rsvd : 13, + mem_region : 5, /* cache/memory regions */ mem_hops : 3, /* Hop level */ mem_blk : 3, /* Access blocked */ mem_snoopx : 2, /* Snoop mode, ext */ @@ -1394,7 +1396,7 @@ union perf_mem_data_src { #define PERF_MEM_LVLNUM_L4 0x0004 /* L4 */ #define PERF_MEM_LVLNUM_L2_MHB 0x0005 /* L2 Miss Handling Buffer */ #define PERF_MEM_LVLNUM_MSC 0x0006 /* Memory-side Cache */ -/* 0x007 available */ +#define PERF_MEM_LVLNUM_L0 0x0007 /* L0 */ #define PERF_MEM_LVLNUM_UNC 0x0008 /* Uncached */ #define PERF_MEM_LVLNUM_CXL 0x0009 /* CXL */ #define PERF_MEM_LVLNUM_IO 0x000a /* I/O */ @@ -1447,6 +1449,25 @@ union perf_mem_data_src { /* 5-7 available */ #define PERF_MEM_HOPS_SHIFT 43 +/* Cache/Memory region */ +#define PERF_MEM_REGION_NA 0x0 /* Invalid */ +#define PERF_MEM_REGION_RSVD 0x01 /* Reserved */ +#define PERF_MEM_REGION_L_SHARE 0x02 /* Local CA shared cache */ +#define PERF_MEM_REGION_L_NON_SHARE 0x03 /* Local CA non-shared cache */ +#define PERF_MEM_REGION_O_IO 0x04 /* Other CA IO agent */ +#define PERF_MEM_REGION_O_SHARE 0x05 /* Other CA shared cache */ +#define PERF_MEM_REGION_O_NON_SHARE 0x06 /* Other CA non-shared cache */ +#define PERF_MEM_REGION_MMIO 0x07 /* MMIO */ +#define PERF_MEM_REGION_MEM0 0x08 /* Memory region 0 */ +#define PERF_MEM_REGION_MEM1 0x09 /* Memory region 1 */ +#define PERF_MEM_REGION_MEM2 0x0a /* Memory region 2 */ +#define PERF_MEM_REGION_MEM3 0x0b /* Memory region 3 */ +#define PERF_MEM_REGION_MEM4 0x0c /* Memory region 4 */ +#define PERF_MEM_REGION_MEM5 0x0d /* Memory region 5 */ +#define PERF_MEM_REGION_MEM6 0x0e /* Memory region 6 */ +#define PERF_MEM_REGION_MEM7 0x0f /* Memory region 7 */ +#define PERF_MEM_REGION_SHIFT 46 + #define PERF_MEM_S(a, s) \ (((__u64)PERF_MEM_##a##_##s) << PERF_MEM_##a##_SHIFT) -- cgit v1.2.3 From 567873005dca1be0a3b3e2e309a8f0de14d2b827 Mon Sep 17 00:00:00 2001 From: Gal Pressman Date: Thu, 15 Jan 2026 08:05:44 +0200 Subject: ethtool: Clarify len/n_stats fields in/out semantics Document that the 'len' field in ethtool_gstrings and 'n_stats' field in ethtool_stats optionally serve dual purposes: on entry they specify the number of items requested, and on return they indicate the number actually returned (which is not necessarily the same). Signed-off-by: Gal Pressman Reviewed-by: Dragos Tatulea Link: https://patch.msgid.link/20260115060544.481550-1-gal@nvidia.com Signed-off-by: Jakub Kicinski --- include/uapi/linux/ethtool.h | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/ethtool.h b/include/uapi/linux/ethtool.h index 5daa8f225b67..bbfe6e1cf01b 100644 --- a/include/uapi/linux/ethtool.h +++ b/include/uapi/linux/ethtool.h @@ -1096,13 +1096,20 @@ enum ethtool_module_fw_flash_status { * struct ethtool_gstrings - string set for data tagging * @cmd: Command number = %ETHTOOL_GSTRINGS * @string_set: String set ID; one of &enum ethtool_stringset - * @len: On return, the number of strings in the string set + * @len: Number of strings in the string set * @data: Buffer for strings. Each string is null-padded to a size of * %ETH_GSTRING_LEN. * * Users must use %ETHTOOL_GSSET_INFO to find the number of strings in * the string set. They must allocate a buffer of the appropriate * size immediately following this structure. + * + * Setting @len on input is optional (though preferred), but must be zeroed + * otherwise. + * When set, @len will return the requested count if it matches the actual + * count; otherwise, it will be zero. + * This prevents issues when the number of strings is different than the + * userspace allocation. */ struct ethtool_gstrings { __u32 cmd; @@ -1179,13 +1186,20 @@ struct ethtool_test { /** * struct ethtool_stats - device-specific statistics * @cmd: Command number = %ETHTOOL_GSTATS - * @n_stats: On return, the number of statistics + * @n_stats: Number of statistics * @data: Array of statistics * * Users must use %ETHTOOL_GSSET_INFO or %ETHTOOL_GDRVINFO to find the * number of statistics that will be returned. They must allocate a * buffer of the appropriate size (8 * number of statistics) * immediately following this structure. + * + * Setting @n_stats on input is optional (though preferred), but must be zeroed + * otherwise. + * When set, @n_stats will return the requested count if it matches the actual + * count; otherwise, it will be zero. + * This prevents issues when the number of stats is different than the + * userspace allocation. */ struct ethtool_stats { __u32 cmd; -- cgit v1.2.3 From d89ccbf3dde727d91a242a5a3f3b70a90579b057 Mon Sep 17 00:00:00 2001 From: Richard Leitner Date: Tue, 9 Dec 2025 23:44:36 +0100 Subject: media: v4l: ctrls: add a control for flash/strobe duration Add a V4L2_CID_FLASH_DURATION control to set the duration of a flash/strobe pulse. This controls the length of the flash/strobe pulse output by device (typically a camera sensor) and connected to the flash controller. This is different to the V4L2_CID_FLASH_TIMEOUT control, which is implemented by the flash controller and defines a limit after which the flash is "forcefully" turned off again. Reviewed-by: Laurent Pinchart Signed-off-by: Richard Leitner Signed-off-by: Sakari Ailus Signed-off-by: Hans Verkuil --- drivers/media/v4l2-core/v4l2-ctrls-defs.c | 1 + include/uapi/linux/v4l2-controls.h | 1 + 2 files changed, 2 insertions(+) (limited to 'include/uapi/linux') diff --git a/drivers/media/v4l2-core/v4l2-ctrls-defs.c b/drivers/media/v4l2-core/v4l2-ctrls-defs.c index ad41f65374e2..4848423205ff 100644 --- a/drivers/media/v4l2-core/v4l2-ctrls-defs.c +++ b/drivers/media/v4l2-core/v4l2-ctrls-defs.c @@ -1135,6 +1135,7 @@ const char *v4l2_ctrl_get_name(u32 id) case V4L2_CID_FLASH_FAULT: return "Faults"; case V4L2_CID_FLASH_CHARGE: return "Charge"; case V4L2_CID_FLASH_READY: return "Ready to Strobe"; + case V4L2_CID_FLASH_DURATION: return "Strobe Duration"; /* JPEG encoder controls */ /* Keep the order of the 'case's the same as in v4l2-controls.h! */ diff --git a/include/uapi/linux/v4l2-controls.h b/include/uapi/linux/v4l2-controls.h index f84ed133a6c9..357845830fe9 100644 --- a/include/uapi/linux/v4l2-controls.h +++ b/include/uapi/linux/v4l2-controls.h @@ -1192,6 +1192,7 @@ enum v4l2_flash_strobe_source { #define V4L2_CID_FLASH_CHARGE (V4L2_CID_FLASH_CLASS_BASE + 11) #define V4L2_CID_FLASH_READY (V4L2_CID_FLASH_CLASS_BASE + 12) +#define V4L2_CID_FLASH_DURATION (V4L2_CID_FLASH_CLASS_BASE + 13) /* JPEG-class control IDs */ -- cgit v1.2.3 From 5be4154f6255d92d9d2ad5da658d7d33a655386f Mon Sep 17 00:00:00 2001 From: Richard Leitner Date: Tue, 9 Dec 2025 23:44:37 +0100 Subject: media: v4l: ctrls: add a control for enabling strobe output Add a control V4L2_CID_FLASH_STROBE_OE to en- or disable the strobe output of v4l2 devices (most likely sensors). Signed-off-by: Richard Leitner Signed-off-by: Sakari Ailus Signed-off-by: Hans Verkuil --- drivers/media/v4l2-core/v4l2-ctrls-defs.c | 2 ++ include/uapi/linux/v4l2-controls.h | 1 + 2 files changed, 3 insertions(+) (limited to 'include/uapi/linux') diff --git a/drivers/media/v4l2-core/v4l2-ctrls-defs.c b/drivers/media/v4l2-core/v4l2-ctrls-defs.c index 4848423205ff..765aeeec84fe 100644 --- a/drivers/media/v4l2-core/v4l2-ctrls-defs.c +++ b/drivers/media/v4l2-core/v4l2-ctrls-defs.c @@ -1136,6 +1136,7 @@ const char *v4l2_ctrl_get_name(u32 id) case V4L2_CID_FLASH_CHARGE: return "Charge"; case V4L2_CID_FLASH_READY: return "Ready to Strobe"; case V4L2_CID_FLASH_DURATION: return "Strobe Duration"; + case V4L2_CID_FLASH_STROBE_OE: return "Strobe Output Enable"; /* JPEG encoder controls */ /* Keep the order of the 'case's the same as in v4l2-controls.h! */ @@ -1282,6 +1283,7 @@ void v4l2_ctrl_fill(u32 id, const char **name, enum v4l2_ctrl_type *type, case V4L2_CID_FLASH_STROBE_STATUS: case V4L2_CID_FLASH_CHARGE: case V4L2_CID_FLASH_READY: + case V4L2_CID_FLASH_STROBE_OE: case V4L2_CID_MPEG_VIDEO_DECODER_MPEG4_DEBLOCK_FILTER: case V4L2_CID_MPEG_VIDEO_DECODER_SLICE_INTERFACE: case V4L2_CID_MPEG_VIDEO_DEC_DISPLAY_DELAY_ENABLE: diff --git a/include/uapi/linux/v4l2-controls.h b/include/uapi/linux/v4l2-controls.h index 357845830fe9..572622e4535e 100644 --- a/include/uapi/linux/v4l2-controls.h +++ b/include/uapi/linux/v4l2-controls.h @@ -1193,6 +1193,7 @@ enum v4l2_flash_strobe_source { #define V4L2_CID_FLASH_CHARGE (V4L2_CID_FLASH_CLASS_BASE + 11) #define V4L2_CID_FLASH_READY (V4L2_CID_FLASH_CLASS_BASE + 12) #define V4L2_CID_FLASH_DURATION (V4L2_CID_FLASH_CLASS_BASE + 13) +#define V4L2_CID_FLASH_STROBE_OE (V4L2_CID_FLASH_CLASS_BASE + 14) /* JPEG-class control IDs */ -- cgit v1.2.3 From 10d28cffb3f6ec7ad67f0a4cd32c2afa92909452 Mon Sep 17 00:00:00 2001 From: Ian Abbott Date: Wed, 3 Dec 2025 16:24:38 +0000 Subject: comedi: Fix getting range information for subdevices 16 to 255 The `COMEDI_RANGEINFO` ioctl does not work properly for subdevice indices above 15. Currently, the only in-tree COMEDI drivers that support more than 16 subdevices are the "8255" driver and the "comedi_bond" driver. Making the ioctl work for subdevice indices up to 255 is achievable. It needs minor changes to the handling of the `COMEDI_RANGEINFO` and `COMEDI_CHANINFO` ioctls that should be mostly harmless to user-space, apart from making them less broken. Details follow... The `COMEDI_RANGEINFO` ioctl command gets the list of supported ranges (usually with units of volts or milliamps) for a COMEDI subdevice or channel. (Only some subdevices have per-channel range tables, indicated by the `SDF_RANGETYPE` flag in the subdevice information.) It uses a `range_type` value and a user-space pointer, both supplied by user-space, but the `range_type` value should match what was obtained using the `COMEDI_CHANINFO` ioctl (if the subdevice has per-channel range tables) or `COMEDI_SUBDINFO` ioctl (if the subdevice uses a single range table for all channels). Bits 15 to 0 of the `range_type` value contain the length of the range table, which is the only part that user-space should care about (so it can use a suitably sized buffer to fetch the range table). Bits 23 to 16 store the channel index, which is assumed to be no more than 255 if the subdevice has per-channel range tables, and is set to 0 if the subdevice has a single range table. For `range_type` values produced by the `COMEDI_SUBDINFO` ioctl, bits 31 to 24 contain the subdevice index, which is assumed to be no more than 255. But for `range_type` values produced by the `COMEDI_CHANINFO` ioctl, bits 27 to 24 contain the subdevice index, which is assumed to be no more than 15, and bits 31 to 28 contain the COMEDI device's minor device number for some unknown reason lost in the mists of time. The `COMEDI_RANGEINFO` ioctl extract the length from bits 15 to 0 of the user-supplied `range_type` value, extracts the channel index from bits 23 to 16 (only used if the subdevice has per-channel range tables), extracts the subdevice index from bits 27 to 24, and ignores bits 31 to 28. So for subdevice indices 16 to 255, the `COMEDI_SUBDINFO` or `COMEDI_CHANINFO` ioctl will report a `range_type` value that doesn't work with the `COMEDI_RANGEINFO` ioctl. It will either get the range table for the subdevice index modulo 16, or will fail with `-EINVAL`. To fix this, always use bits 31 to 24 of the `range_type` value to hold the subdevice index (assumed to be no more than 255). This affects the `COMEDI_CHANINFO` and `COMEDI_RANGEINFO` ioctls. There should not be anything in user-space that depends on the old, broken usage, although it may now see different values in bits 31 to 28 of the `range_type` values reported by the `COMEDI_CHANINFO` ioctl for subdevices that have per-channel subdevices. User-space should not be trying to decode bits 31 to 16 of the `range_type` values anyway. Fixes: ed9eccbe8970 ("Staging: add comedi core") Cc: stable@vger.kernel.org #5.17+ Signed-off-by: Ian Abbott Link: https://patch.msgid.link/20251203162438.176841-1-abbotti@mev.co.uk Signed-off-by: Greg Kroah-Hartman --- drivers/comedi/comedi_fops.c | 2 +- drivers/comedi/range.c | 2 +- include/uapi/linux/comedi.h | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) (limited to 'include/uapi/linux') diff --git a/drivers/comedi/comedi_fops.c b/drivers/comedi/comedi_fops.c index 657c98cd723e..2c3eb9e89571 100644 --- a/drivers/comedi/comedi_fops.c +++ b/drivers/comedi/comedi_fops.c @@ -1155,7 +1155,7 @@ static int do_chaninfo_ioctl(struct comedi_device *dev, for (i = 0; i < s->n_chan; i++) { int x; - x = (dev->minor << 28) | (it->subdev << 24) | (i << 16) | + x = (it->subdev << 24) | (i << 16) | (s->range_table_list[i]->length); if (put_user(x, it->rangelist + i)) return -EFAULT; diff --git a/drivers/comedi/range.c b/drivers/comedi/range.c index 8f43cf88d784..5b8f662365e3 100644 --- a/drivers/comedi/range.c +++ b/drivers/comedi/range.c @@ -52,7 +52,7 @@ int do_rangeinfo_ioctl(struct comedi_device *dev, const struct comedi_lrange *lr; struct comedi_subdevice *s; - subd = (it->range_type >> 24) & 0xf; + subd = (it->range_type >> 24) & 0xff; chan = (it->range_type >> 16) & 0xff; if (!dev->attached) diff --git a/include/uapi/linux/comedi.h b/include/uapi/linux/comedi.h index 7314e5ee0a1e..798ec9a39e12 100644 --- a/include/uapi/linux/comedi.h +++ b/include/uapi/linux/comedi.h @@ -640,7 +640,7 @@ struct comedi_chaninfo { /** * struct comedi_rangeinfo - used to retrieve the range table for a channel - * @range_type: Encodes subdevice index (bits 27:24), channel index + * @range_type: Encodes subdevice index (bits 31:24), channel index * (bits 23:16) and range table length (bits 15:0). * @range_ptr: Pointer to array of @struct comedi_krange to be filled * in with the range table for the channel or subdevice. -- cgit v1.2.3 From 9b8a0ba68246a61d903ce62c35c303b1501df28b Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Mon, 29 Dec 2025 14:03:24 +0100 Subject: mount: add OPEN_TREE_NAMESPACE When creating containers the setup usually involves using CLONE_NEWNS via clone3() or unshare(). This copies the caller's complete mount namespace. The runtime will also assemble a new rootfs and then use pivot_root() to switch the old mount tree with the new rootfs. Afterward it will recursively umount the old mount tree thereby getting rid of all mounts. On a basic system here where the mount table isn't particularly large this still copies about 30 mounts. Copying all of these mounts only to get rid of them later is pretty wasteful. This is exacerbated if intermediary mount namespaces are used that only exist for a very short amount of time and are immediately destroyed again causing a ton of mounts to be copied and destroyed needlessly. With a large mount table and a system where thousands or ten-thousands of containers are spawned in parallel this quickly becomes a bottleneck increasing contention on the semaphore. Extend open_tree() with a new OPEN_TREE_NAMESPACE flag. Similar to OPEN_TREE_CLONE only the indicated mount tree is copied. Instead of returning a file descriptor referring to that mount tree OPEN_TREE_NAMESPACE will cause open_tree() to return a file descriptor to a new mount namespace. In that new mount namespace the copied mount tree has been mounted on top of a copy of the real rootfs. The caller can setns() into that mount namespace and perform any additionally required setup such as move_mount() detached mounts in there. This allows OPEN_TREE_NAMESPACE to function as a combined unshare(CLONE_NEWNS) and pivot_root(). A caller may for example choose to create an extremely minimal rootfs: fd_mntns = open_tree(-EBADF, "/var/lib/containers/wootwoot", OPEN_TREE_NAMESPACE); This will create a mount namespace where "wootwoot" has become the rootfs mounted on top of the real rootfs. The caller can now setns() into this new mount namespace and assemble additional mounts. This also works with user namespaces: unshare(CLONE_NEWUSER); fd_mntns = open_tree(-EBADF, "/var/lib/containers/wootwoot", OPEN_TREE_NAMESPACE); which creates a new mount namespace owned by the earlier created user namespace with "wootwoot" as the rootfs mounted on top of the real rootfs. Link: https://patch.msgid.link/20251229-work-empty-namespace-v1-1-bfb24c7b061f@kernel.org Tested-by: Jeff Layton Reviewed-by: Aleksa Sarai Reviewed-by: Jeff Layton Suggested-by: Christian Brauner Suggested-by: Aleksa Sarai Signed-off-by: Christian Brauner --- fs/internal.h | 1 + fs/namespace.c | 163 ++++++++++++++++++++++++++++++++++++++++----- fs/nsfs.c | 13 ++++ include/uapi/linux/mount.h | 3 +- 4 files changed, 163 insertions(+), 17 deletions(-) (limited to 'include/uapi/linux') diff --git a/fs/internal.h b/fs/internal.h index e333b105337a..f6932e639f36 100644 --- a/fs/internal.h +++ b/fs/internal.h @@ -246,6 +246,7 @@ extern void mnt_pin_kill(struct mount *m); */ extern const struct dentry_operations ns_dentry_operations; int open_namespace(struct ns_common *ns); +struct file *open_namespace_file(struct ns_common *ns); /* * fs/stat.c: diff --git a/fs/namespace.c b/fs/namespace.c index ec3b16fedd9f..59557019e422 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -2796,6 +2796,9 @@ static inline void unlock_mount(struct pinned_mountpoint *m) __unlock_mount(m); } +static void lock_mount_exact(const struct path *path, + struct pinned_mountpoint *mp); + #define LOCK_MOUNT_MAYBE_BENEATH(mp, path, beneath) \ struct pinned_mountpoint mp __cleanup(unlock_mount) = {}; \ do_lock_mount((path), &mp, (beneath)) @@ -2946,10 +2949,11 @@ static inline bool may_copy_tree(const struct path *path) return check_anonymous_mnt(mnt); } - -static struct mount *__do_loopback(const struct path *old_path, int recurse) +static struct mount *__do_loopback(const struct path *old_path, + unsigned int flags, unsigned int copy_flags) { struct mount *old = real_mount(old_path->mnt); + bool recurse = flags & AT_RECURSIVE; if (IS_MNT_UNBINDABLE(old)) return ERR_PTR(-EINVAL); @@ -2960,10 +2964,22 @@ static struct mount *__do_loopback(const struct path *old_path, int recurse) if (!recurse && __has_locked_children(old, old_path->dentry)) return ERR_PTR(-EINVAL); + /* + * When creating a new mount namespace we don't want to copy over + * mounts of mount namespaces to avoid the risk of cycles and also to + * minimize the default complex interdependencies between mount + * namespaces. + * + * We could ofc just check whether all mount namespace files aren't + * creating cycles but really let's keep this simple. + */ + if (!(flags & OPEN_TREE_NAMESPACE)) + copy_flags |= CL_COPY_MNT_NS_FILE; + if (recurse) - return copy_tree(old, old_path->dentry, CL_COPY_MNT_NS_FILE); - else - return clone_mnt(old, old_path->dentry, 0); + return copy_tree(old, old_path->dentry, copy_flags); + + return clone_mnt(old, old_path->dentry, copy_flags); } /* @@ -2974,7 +2990,9 @@ static int do_loopback(const struct path *path, const char *old_name, { struct path old_path __free(path_put) = {}; struct mount *mnt = NULL; + unsigned int flags = recurse ? AT_RECURSIVE : 0; int err; + if (!old_name || !*old_name) return -EINVAL; err = kern_path(old_name, LOOKUP_FOLLOW|LOOKUP_AUTOMOUNT, &old_path); @@ -2991,7 +3009,7 @@ static int do_loopback(const struct path *path, const char *old_name, if (!check_mnt(mp.parent)) return -EINVAL; - mnt = __do_loopback(&old_path, recurse); + mnt = __do_loopback(&old_path, flags, 0); if (IS_ERR(mnt)) return PTR_ERR(mnt); @@ -3004,7 +3022,7 @@ static int do_loopback(const struct path *path, const char *old_name, return err; } -static struct mnt_namespace *get_detached_copy(const struct path *path, bool recursive) +static struct mnt_namespace *get_detached_copy(const struct path *path, unsigned int flags) { struct mnt_namespace *ns, *mnt_ns = current->nsproxy->mnt_ns, *src_mnt_ns; struct user_namespace *user_ns = mnt_ns->user_ns; @@ -3029,7 +3047,7 @@ static struct mnt_namespace *get_detached_copy(const struct path *path, bool rec ns->seq_origin = src_mnt_ns->ns.ns_id; } - mnt = __do_loopback(path, recursive); + mnt = __do_loopback(path, flags, 0); if (IS_ERR(mnt)) { emptied_ns = ns; return ERR_CAST(mnt); @@ -3043,9 +3061,9 @@ static struct mnt_namespace *get_detached_copy(const struct path *path, bool rec return ns; } -static struct file *open_detached_copy(struct path *path, bool recursive) +static struct file *open_detached_copy(struct path *path, unsigned int flags) { - struct mnt_namespace *ns = get_detached_copy(path, recursive); + struct mnt_namespace *ns = get_detached_copy(path, flags); struct file *file; if (IS_ERR(ns)) @@ -3061,21 +3079,122 @@ static struct file *open_detached_copy(struct path *path, bool recursive) return file; } +DEFINE_FREE(put_empty_mnt_ns, struct mnt_namespace *, + if (!IS_ERR_OR_NULL(_T)) free_mnt_ns(_T)) + +static struct mnt_namespace *create_new_namespace(struct path *path, unsigned int flags) +{ + struct mnt_namespace *new_ns __free(put_empty_mnt_ns) = NULL; + struct path to_path __free(path_put) = {}; + struct mnt_namespace *ns = current->nsproxy->mnt_ns; + struct user_namespace *user_ns = current_user_ns(); + struct mount *new_ns_root; + struct mount *mnt; + unsigned int copy_flags = 0; + bool locked = false; + + if (user_ns != ns->user_ns) + copy_flags |= CL_SLAVE; + + new_ns = alloc_mnt_ns(user_ns, false); + if (IS_ERR(new_ns)) + return ERR_CAST(new_ns); + + scoped_guard(namespace_excl) { + new_ns_root = clone_mnt(ns->root, ns->root->mnt.mnt_root, copy_flags); + if (IS_ERR(new_ns_root)) + return ERR_CAST(new_ns_root); + + /* + * If the real rootfs had a locked mount on top of it somewhere + * in the stack, lock the new mount tree as well so it can't be + * exposed. + */ + mnt = ns->root; + while (mnt->overmount) { + mnt = mnt->overmount; + if (mnt->mnt.mnt_flags & MNT_LOCKED) + locked = true; + } + } + + /* + * We dropped the namespace semaphore so we can actually lock + * the copy for mounting. The copied mount isn't attached to any + * mount namespace and it is thus excluded from any propagation. + * So realistically we're isolated and the mount can't be + * overmounted. + */ + + /* Borrow the reference from clone_mnt(). */ + to_path.mnt = &new_ns_root->mnt; + to_path.dentry = dget(new_ns_root->mnt.mnt_root); + + /* Now lock for actual mounting. */ + LOCK_MOUNT_EXACT(mp, &to_path); + if (unlikely(IS_ERR(mp.parent))) + return ERR_CAST(mp.parent); + + /* + * We don't emulate unshare()ing a mount namespace. We stick to the + * restrictions of creating detached bind-mounts. It has a lot + * saner and simpler semantics. + */ + mnt = __do_loopback(path, flags, copy_flags); + if (IS_ERR(mnt)) + return ERR_CAST(mnt); + + scoped_guard(mount_writer) { + if (locked) + mnt->mnt.mnt_flags |= MNT_LOCKED; + /* + * Now mount the detached tree on top of the copy of the + * real rootfs we created. + */ + attach_mnt(mnt, new_ns_root, mp.mp); + if (user_ns != ns->user_ns) + lock_mnt_tree(new_ns_root); + } + + /* Add all mounts to the new namespace. */ + for (struct mount *p = new_ns_root; p; p = next_mnt(p, new_ns_root)) { + mnt_add_to_ns(new_ns, p); + new_ns->nr_mounts++; + } + + new_ns->root = real_mount(no_free_ptr(to_path.mnt)); + ns_tree_add_raw(new_ns); + return no_free_ptr(new_ns); +} + +static struct file *open_new_namespace(struct path *path, unsigned int flags) +{ + struct mnt_namespace *new_ns; + + new_ns = create_new_namespace(path, flags); + if (IS_ERR(new_ns)) + return ERR_CAST(new_ns); + return open_namespace_file(to_ns_common(new_ns)); +} + static struct file *vfs_open_tree(int dfd, const char __user *filename, unsigned int flags) { int ret; struct path path __free(path_put) = {}; int lookup_flags = LOOKUP_AUTOMOUNT | LOOKUP_FOLLOW; - bool detached = flags & OPEN_TREE_CLONE; BUILD_BUG_ON(OPEN_TREE_CLOEXEC != O_CLOEXEC); if (flags & ~(AT_EMPTY_PATH | AT_NO_AUTOMOUNT | AT_RECURSIVE | AT_SYMLINK_NOFOLLOW | OPEN_TREE_CLONE | - OPEN_TREE_CLOEXEC)) + OPEN_TREE_CLOEXEC | OPEN_TREE_NAMESPACE)) return ERR_PTR(-EINVAL); - if ((flags & (AT_RECURSIVE | OPEN_TREE_CLONE)) == AT_RECURSIVE) + if ((flags & (AT_RECURSIVE | OPEN_TREE_CLONE | OPEN_TREE_NAMESPACE)) == + AT_RECURSIVE) + return ERR_PTR(-EINVAL); + + if (hweight32(flags & (OPEN_TREE_CLONE | OPEN_TREE_NAMESPACE)) > 1) return ERR_PTR(-EINVAL); if (flags & AT_NO_AUTOMOUNT) @@ -3085,15 +3204,27 @@ static struct file *vfs_open_tree(int dfd, const char __user *filename, unsigned if (flags & AT_EMPTY_PATH) lookup_flags |= LOOKUP_EMPTY; - if (detached && !may_mount()) + /* + * If we create a new mount namespace with the cloned mount tree we + * just care about being privileged over our current user namespace. + * The new mount namespace will be owned by it. + */ + if ((flags & OPEN_TREE_NAMESPACE) && + !ns_capable(current_user_ns(), CAP_SYS_ADMIN)) + return ERR_PTR(-EPERM); + + if ((flags & OPEN_TREE_CLONE) && !may_mount()) return ERR_PTR(-EPERM); ret = user_path_at(dfd, filename, lookup_flags, &path); if (unlikely(ret)) return ERR_PTR(ret); - if (detached) - return open_detached_copy(&path, flags & AT_RECURSIVE); + if (flags & OPEN_TREE_NAMESPACE) + return open_new_namespace(&path, flags); + + if (flags & OPEN_TREE_CLONE) + return open_detached_copy(&path, flags); return dentry_open(&path, O_PATH, current_cred()); } diff --git a/fs/nsfs.c b/fs/nsfs.c index bf27d5da91f1..db91de208645 100644 --- a/fs/nsfs.c +++ b/fs/nsfs.c @@ -99,6 +99,19 @@ int ns_get_path(struct path *path, struct task_struct *task, return ns_get_path_cb(path, ns_get_path_task, &args); } +struct file *open_namespace_file(struct ns_common *ns) +{ + struct path path __free(path_put) = {}; + int err; + + /* call first to consume reference */ + err = path_from_stashed(&ns->stashed, nsfs_mnt, ns, &path); + if (err < 0) + return ERR_PTR(err); + + return dentry_open(&path, O_RDONLY, current_cred()); +} + /** * open_namespace - open a namespace * @ns: the namespace to open diff --git a/include/uapi/linux/mount.h b/include/uapi/linux/mount.h index 18c624405268..d9d86598d100 100644 --- a/include/uapi/linux/mount.h +++ b/include/uapi/linux/mount.h @@ -61,7 +61,8 @@ /* * open_tree() flags. */ -#define OPEN_TREE_CLONE 1 /* Clone the target tree and attach the clone */ +#define OPEN_TREE_CLONE (1 << 0) /* Clone the target tree and attach the clone */ +#define OPEN_TREE_NAMESPACE (1 << 1) /* Clone the target tree into a new mount namespace */ #define OPEN_TREE_CLOEXEC O_CLOEXEC /* Close the file on execve() */ /* -- cgit v1.2.3 From 1e5271393d777f6159d896943b4c44c4f3ecff52 Mon Sep 17 00:00:00 2001 From: Thomas Weißschuh Date: Thu, 15 Jan 2026 08:35:44 +0100 Subject: hyper-v: Mark inner union in hv_kvp_exchg_msg_value as packed MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The unpacked union within a packed struct generates alignment warnings on clang for 32-bit ARM: ./usr/include/linux/hyperv.h:361:2: error: field within 'struct hv_kvp_exchg_msg_value' is less aligned than 'union hv_kvp_exchg_msg_value::(anonymous at ./usr/include/linux/hyperv.h:361:2)' and is usually due to 'struct hv_kvp_exchg_msg_value' being packed, which can lead to unaligned accesses [-Werror,-Wunaligned-access] 361 | union { | ^ With the recent changes to compile-test the UAPI headers in more cases, this warning in combination with CONFIG_WERROR breaks the build. Fix the warning. Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202512140314.DzDxpIVn-lkp@intel.com/ Reported-by: Nathan Chancellor Closes: https://lore.kernel.org/linux-kbuild/20260110-uapi-test-disable-headers-arm-clang-unaligned-access-v1-1-b7b0fa541daa@kernel.org/ Suggested-by: Arnd Bergmann Link: https://lore.kernel.org/linux-kbuild/29b2e736-d462-45b7-a0a9-85f8d8a3de56@app.fastmail.com/ Signed-off-by: Thomas Weißschuh Acked-by: Wei Liu (Microsoft) Tested-by: Nicolas Schier Reviewed-by: Nicolas Schier Acked-by: Greg Kroah-Hartman Link: https://patch.msgid.link/20260115-kbuild-alignment-vbox-v1-1-076aed1623ff@linutronix.de Signed-off-by: Nathan Chancellor --- include/uapi/linux/hyperv.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/hyperv.h b/include/uapi/linux/hyperv.h index aaa502a7bff4..1749b35ab2c2 100644 --- a/include/uapi/linux/hyperv.h +++ b/include/uapi/linux/hyperv.h @@ -362,7 +362,7 @@ struct hv_kvp_exchg_msg_value { __u8 value[HV_KVP_EXCHANGE_MAX_VALUE_SIZE]; __u32 value_u32; __u64 value_u64; - }; + } __attribute__((packed)); } __attribute__((packed)); struct hv_kvp_msg_enumerate { -- cgit v1.2.3 From c25d01e1c4f2d43f47af87c00e223f5ca7c71792 Mon Sep 17 00:00:00 2001 From: Thomas Weißschuh Date: Thu, 15 Jan 2026 08:35:45 +0100 Subject: virt: vbox: uapi: Mark inner unions in packed structs as packed MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The unpacked unions within a packed struct generates alignment warnings on clang for 32-bit ARM: ./usr/include/linux/vbox_vmmdev_types.h:239:4: error: field u within 'struct vmmdev_hgcm_function_parameter32' is less aligned than 'union (unnamed union at ./usr/include/linux/vbox_vmmdev_types.h:223:2)' and is usually due to 'struct vmmdev_hgcm_function_parameter32' being packed, which can lead to unaligned accesses [-Werror,-Wunaligned-access] 239 | } u; | ^ ./usr/include/linux/vbox_vmmdev_types.h:254:6: error: field u within 'struct vmmdev_hgcm_function_parameter64::(anonymous union)::(unnamed at ./usr/include/linux/vbox_vmmdev_types.h:249:3)' is less aligned than 'union (unnamed union at ./usr/include/linux/vbox_vmmdev_types.h:251:4)' and is usually due to 'struct vmmdev_hgcm_function_parameter64::(anonymous union)::(unnamed at ./usr/include/linux/vbox_vmmdev_types.h:249:3)' being packed, which can lead to unaligned accesses [-Werror,-Wunaligned-access] With the recent changes to compile-test the UAPI headers in more cases, these warning in combination with CONFIG_WERROR breaks the build. Fix the warnings. Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202512140314.DzDxpIVn-lkp@intel.com/ Reported-by: Nathan Chancellor Closes: https://lore.kernel.org/linux-kbuild/20260110-uapi-test-disable-headers-arm-clang-unaligned-access-v1-1-b7b0fa541daa@kernel.org/ Suggested-by: Arnd Bergmann Link: https://lore.kernel.org/linux-kbuild/29b2e736-d462-45b7-a0a9-85f8d8a3de56@app.fastmail.com/ Signed-off-by: Thomas Weißschuh Tested-by: Nicolas Schier Reviewed-by: Nicolas Schier Acked-by: Greg Kroah-Hartman Link: https://patch.msgid.link/20260115-kbuild-alignment-vbox-v1-2-076aed1623ff@linutronix.de Signed-off-by: Nathan Chancellor --- include/uapi/linux/vbox_vmmdev_types.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/vbox_vmmdev_types.h b/include/uapi/linux/vbox_vmmdev_types.h index 6073858d52a2..11f3627c3729 100644 --- a/include/uapi/linux/vbox_vmmdev_types.h +++ b/include/uapi/linux/vbox_vmmdev_types.h @@ -236,7 +236,7 @@ struct vmmdev_hgcm_function_parameter32 { /** Relative to the request header. */ __u32 offset; } page_list; - } u; + } __packed u; } __packed; VMMDEV_ASSERT_SIZE(vmmdev_hgcm_function_parameter32, 4 + 8); @@ -251,7 +251,7 @@ struct vmmdev_hgcm_function_parameter64 { union { __u64 phys_addr; __u64 linear_addr; - } u; + } __packed u; } __packed pointer; struct { /** Size of the buffer described by the page list. */ -- cgit v1.2.3 From 150a04d817d8f5be5a4f92799827cdc8d7e45989 Mon Sep 17 00:00:00 2001 From: Bill Wendling Date: Fri, 16 Jan 2026 00:57:57 +0000 Subject: compiler_types.h: Attributes: Add __counted_by_ptr macro Introduce __counted_by_ptr(), which works like __counted_by(), but for pointer struct members. struct foo { int a, b, c; char *buffer __counted_by_ptr(bytes); short nr_bars; struct bar *bars __counted_by_ptr(nr_bars); size_t bytes; }; Because "counted_by" can only be applied to pointer members in very recent compiler versions, its application ends up needing to be distinct from flexibe array "counted_by" annotations, hence a separate macro. This is a reworking of Kees' previous patch [1]. Link: https://lore.kernel.org/all/20251020220118.1226740-1-kees@kernel.org/ [1] Co-developed-by: Kees Cook Signed-off-by: Bill Wendling Link: https://patch.msgid.link/20260116005838.2419118-1-morbo@google.com Signed-off-by: Kees Cook --- Makefile | 6 ++++++ include/linux/compiler_types.h | 18 +++++++++++++++++- include/uapi/linux/stddef.h | 4 ++++ init/Kconfig | 7 +++++++ 4 files changed, 34 insertions(+), 1 deletion(-) (limited to 'include/uapi/linux') diff --git a/Makefile b/Makefile index 3cd00b62cde9..c0751976cdee 100644 --- a/Makefile +++ b/Makefile @@ -952,6 +952,12 @@ KBUILD_CFLAGS += $(CC_AUTO_VAR_INIT_ZERO_ENABLER) endif endif +ifdef CONFIG_CC_IS_CLANG +ifdef CONFIG_CC_HAS_COUNTED_BY_PTR +KBUILD_CFLAGS += -fexperimental-late-parse-attributes +endif +endif + # Explicitly clear padding bits during variable initialization KBUILD_CFLAGS += $(call cc-option,-fzero-init-padding-bits=all) diff --git a/include/linux/compiler_types.h b/include/linux/compiler_types.h index d3318a3c2577..d095beb904ea 100644 --- a/include/linux/compiler_types.h +++ b/include/linux/compiler_types.h @@ -369,7 +369,7 @@ struct ftrace_likely_data { * Optional: only supported since clang >= 18 * * gcc: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=108896 - * clang: https://github.com/llvm/llvm-project/pull/76348 + * clang: https://clang.llvm.org/docs/AttributeReference.html#counted-by-counted-by-or-null-sized-by-sized-by-or-null * * __bdos on clang < 19.1.2 can erroneously return 0: * https://github.com/llvm/llvm-project/pull/110497 @@ -383,6 +383,22 @@ struct ftrace_likely_data { # define __counted_by(member) #endif +/* + * Runtime track number of objects pointed to by a pointer member for use by + * CONFIG_FORTIFY_SOURCE and CONFIG_UBSAN_BOUNDS. + * + * Optional: only supported since gcc >= 16 + * Optional: only supported since clang >= 22 + * + * gcc: https://gcc.gnu.org/pipermail/gcc-patches/2025-April/681727.html + * clang: https://clang.llvm.org/docs/AttributeReference.html#counted-by-counted-by-or-null-sized-by-sized-by-or-null + */ +#ifdef CONFIG_CC_HAS_COUNTED_BY_PTR +#define __counted_by_ptr(member) __attribute__((__counted_by__(member))) +#else +#define __counted_by_ptr(member) +#endif + /* * Optional: only supported since gcc >= 15 * Optional: not supported by Clang diff --git a/include/uapi/linux/stddef.h b/include/uapi/linux/stddef.h index 9a28f7d9a334..111b097ec00b 100644 --- a/include/uapi/linux/stddef.h +++ b/include/uapi/linux/stddef.h @@ -72,6 +72,10 @@ #define __counted_by_be(m) #endif +#ifndef __counted_by_ptr +#define __counted_by_ptr(m) +#endif + #ifdef __KERNEL__ #define __kernel_nonstring __nonstring #else diff --git a/init/Kconfig b/init/Kconfig index fa79feb8fe57..96b7cd481eaa 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -143,6 +143,13 @@ config CC_HAS_COUNTED_BY # https://gcc.gnu.org/bugzilla/show_bug.cgi?id=108896 default y if CC_IS_GCC && GCC_VERSION >= 150100 +config CC_HAS_COUNTED_BY_PTR + bool + # supported since clang 22 + default y if CC_IS_CLANG && CLANG_VERSION >= 220000 + # supported since gcc 16.0.0 + default y if CC_IS_GCC && GCC_VERSION >= 160000 + config CC_HAS_MULTIDIMENSIONAL_NONSTRING def_bool $(success,echo 'char tag[][4] __attribute__((__nonstring__)) = { };' | $(CC) $(CLANG_FLAGS) -x c - -c -o /dev/null -Werror) -- cgit v1.2.3 From ca9d74eb5f6aea6eee746aae648382332dbcf24e Mon Sep 17 00:00:00 2001 From: Thomas Weißschuh Date: Tue, 13 Jan 2026 08:44:17 +0100 Subject: uapi: add INT_MAX and INT_MIN constants MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Some UAPI headers use INT_MAX and INT_MIN. Currently they include for their definitions, which introduces a problematic dependency on libc. Add custom, namespaced definitions of INT_MAX and INT_MIN using the same values as the regular kernel code. These definitions are not added to uapi/linux/limits.h, as that header will conflict with libc definitions on some platforms. Signed-off-by: Thomas Weißschuh Link: https://patch.msgid.link/20260113-uapi-limits-v2-1-93c20f4b2c1a@linutronix.de Signed-off-by: Jakub Kicinski --- include/uapi/linux/typelimits.h | 8 ++++++++ 1 file changed, 8 insertions(+) create mode 100644 include/uapi/linux/typelimits.h (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/typelimits.h b/include/uapi/linux/typelimits.h new file mode 100644 index 000000000000..8166c639b518 --- /dev/null +++ b/include/uapi/linux/typelimits.h @@ -0,0 +1,8 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +#ifndef _UAPI_LINUX_TYPELIMITS_H +#define _UAPI_LINUX_TYPELIMITS_H + +#define __KERNEL_INT_MAX ((int)(~0U >> 1)) +#define __KERNEL_INT_MIN (-__KERNEL_INT_MAX - 1) + +#endif /* _UAPI_LINUX_TYPELIMITS_H */ -- cgit v1.2.3 From a8a11e5237aed71b7f5f9d33c554ef06fe974311 Mon Sep 17 00:00:00 2001 From: Thomas Weißschuh Date: Tue, 13 Jan 2026 08:44:18 +0100 Subject: ethtool: uapi: Use UAPI definition of INT_MAX MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Using to gain access to INT_MAX introduces a dependency on a libc, which UAPI headers should not do. Use the equivalent UAPI constant. Signed-off-by: Thomas Weißschuh Link: https://patch.msgid.link/20260113-uapi-limits-v2-2-93c20f4b2c1a@linutronix.de Signed-off-by: Jakub Kicinski --- include/uapi/linux/ethtool.h | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/ethtool.h b/include/uapi/linux/ethtool.h index bbfe6e1cf01b..ce9aeb65a8e1 100644 --- a/include/uapi/linux/ethtool.h +++ b/include/uapi/linux/ethtool.h @@ -15,13 +15,10 @@ #define _UAPI_LINUX_ETHTOOL_H #include +#include #include #include -#ifndef __KERNEL__ -#include /* for INT_MAX */ -#endif - /* All structures exposed to userland should be defined such that they * have the same layout for 32-bit and 64-bit userland. */ @@ -2216,7 +2213,7 @@ enum ethtool_link_mode_bit_indices { static inline int ethtool_validate_speed(__u32 speed) { - return speed <= INT_MAX || speed == (__u32)SPEED_UNKNOWN; + return speed <= __KERNEL_INT_MAX || speed == (__u32)SPEED_UNKNOWN; } /* Duplex, half or full. */ -- cgit v1.2.3 From 0b3877bec78b0f26a280078a15f8992426de1db7 Mon Sep 17 00:00:00 2001 From: Thomas Weißschuh Date: Tue, 13 Jan 2026 08:44:19 +0100 Subject: netfilter: uapi: Use UAPI definition of INT_MAX and INT_MIN MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Using to gain access to INT_MAX and INT_MIN introduces a dependency on a libc, which UAPI headers should not do. Use the equivalent UAPI constants. Signed-off-by: Thomas Weißschuh Link: https://patch.msgid.link/20260113-uapi-limits-v2-3-93c20f4b2c1a@linutronix.de Signed-off-by: Jakub Kicinski --- include/uapi/linux/netfilter_bridge.h | 9 +++------ include/uapi/linux/netfilter_ipv4.h | 9 ++++----- include/uapi/linux/netfilter_ipv6.h | 7 +++---- 3 files changed, 10 insertions(+), 15 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/netfilter_bridge.h b/include/uapi/linux/netfilter_bridge.h index 1610fdbab98d..f6e8d1e05c97 100644 --- a/include/uapi/linux/netfilter_bridge.h +++ b/include/uapi/linux/netfilter_bridge.h @@ -10,10 +10,7 @@ #include #include #include - -#ifndef __KERNEL__ -#include /* for INT_MIN, INT_MAX */ -#endif +#include /* Bridge Hooks */ /* After promisc drops, checksum checks. */ @@ -31,14 +28,14 @@ #define NF_BR_NUMHOOKS 6 enum nf_br_hook_priorities { - NF_BR_PRI_FIRST = INT_MIN, + NF_BR_PRI_FIRST = __KERNEL_INT_MIN, NF_BR_PRI_NAT_DST_BRIDGED = -300, NF_BR_PRI_FILTER_BRIDGED = -200, NF_BR_PRI_BRNF = 0, NF_BR_PRI_NAT_DST_OTHER = 100, NF_BR_PRI_FILTER_OTHER = 200, NF_BR_PRI_NAT_SRC = 300, - NF_BR_PRI_LAST = INT_MAX, + NF_BR_PRI_LAST = __KERNEL_INT_MAX, }; #endif /* _UAPI__LINUX_BRIDGE_NETFILTER_H */ diff --git a/include/uapi/linux/netfilter_ipv4.h b/include/uapi/linux/netfilter_ipv4.h index 155e77d6a42d..439d3c59862b 100644 --- a/include/uapi/linux/netfilter_ipv4.h +++ b/include/uapi/linux/netfilter_ipv4.h @@ -7,12 +7,11 @@ #include +#include /* only for userspace compatibility */ #ifndef __KERNEL__ -#include /* for INT_MIN, INT_MAX */ - /* IP Hooks */ /* After promisc drops, checksum checks. */ #define NF_IP_PRE_ROUTING 0 @@ -28,7 +27,7 @@ #endif /* ! __KERNEL__ */ enum nf_ip_hook_priorities { - NF_IP_PRI_FIRST = INT_MIN, + NF_IP_PRI_FIRST = __KERNEL_INT_MIN, NF_IP_PRI_RAW_BEFORE_DEFRAG = -450, NF_IP_PRI_CONNTRACK_DEFRAG = -400, NF_IP_PRI_RAW = -300, @@ -41,8 +40,8 @@ enum nf_ip_hook_priorities { NF_IP_PRI_NAT_SRC = 100, NF_IP_PRI_SELINUX_LAST = 225, NF_IP_PRI_CONNTRACK_HELPER = 300, - NF_IP_PRI_CONNTRACK_CONFIRM = INT_MAX, - NF_IP_PRI_LAST = INT_MAX, + NF_IP_PRI_CONNTRACK_CONFIRM = __KERNEL_INT_MAX, + NF_IP_PRI_LAST = __KERNEL_INT_MAX, }; /* Arguments for setsockopt SOL_IP: */ diff --git a/include/uapi/linux/netfilter_ipv6.h b/include/uapi/linux/netfilter_ipv6.h index 80aa9b0799af..0e40d00b37fa 100644 --- a/include/uapi/linux/netfilter_ipv6.h +++ b/include/uapi/linux/netfilter_ipv6.h @@ -10,12 +10,11 @@ #include +#include /* only for userspace compatibility */ #ifndef __KERNEL__ -#include /* for INT_MIN, INT_MAX */ - /* IP6 Hooks */ /* After promisc drops, checksum checks. */ #define NF_IP6_PRE_ROUTING 0 @@ -32,7 +31,7 @@ enum nf_ip6_hook_priorities { - NF_IP6_PRI_FIRST = INT_MIN, + NF_IP6_PRI_FIRST = __KERNEL_INT_MIN, NF_IP6_PRI_RAW_BEFORE_DEFRAG = -450, NF_IP6_PRI_CONNTRACK_DEFRAG = -400, NF_IP6_PRI_RAW = -300, @@ -45,7 +44,7 @@ enum nf_ip6_hook_priorities { NF_IP6_PRI_NAT_SRC = 100, NF_IP6_PRI_SELINUX_LAST = 225, NF_IP6_PRI_CONNTRACK_HELPER = 300, - NF_IP6_PRI_LAST = INT_MAX, + NF_IP6_PRI_LAST = __KERNEL_INT_MAX, }; -- cgit v1.2.3 From 7d8b06ecc45bd679dec58d2cc2bd86223d4e076d Mon Sep 17 00:00:00 2001 From: Suravee Suthikulpanit Date: Thu, 15 Jan 2026 06:08:02 +0000 Subject: iommu/amd: Add support for hw_info for iommu capability query AMD IOMMU Extended Feature (EFR) and Extended Feature 2 (EFR2) registers specify features supported by each IOMMU hardware instance. The IOMMU driver checks each feature-specific bits before enabling each feature at run time. For IOMMUFD, the hypervisor passes the raw value of amd_iommu_efr and amd_iommu_efr2 to VMM via iommufd IOMMU_DEVICE_GET_HW_INFO ioctl. Reviewed-by: Nicolin Chen Reviewed-by: Vasant Hegde Reviewed-by: Jason Gunthorpe Signed-off-by: Suravee Suthikulpanit Signed-off-by: Joerg Roedel --- drivers/iommu/amd/Kconfig | 10 ++++++++++ drivers/iommu/amd/Makefile | 1 + drivers/iommu/amd/iommu.c | 2 ++ drivers/iommu/amd/iommufd.c | 31 +++++++++++++++++++++++++++++++ drivers/iommu/amd/iommufd.h | 15 +++++++++++++++ include/uapi/linux/iommufd.h | 28 ++++++++++++++++++++++++++++ 6 files changed, 87 insertions(+) create mode 100644 drivers/iommu/amd/iommufd.c create mode 100644 drivers/iommu/amd/iommufd.h (limited to 'include/uapi/linux') diff --git a/drivers/iommu/amd/Kconfig b/drivers/iommu/amd/Kconfig index f2acf471cb5d..588355ff7eb7 100644 --- a/drivers/iommu/amd/Kconfig +++ b/drivers/iommu/amd/Kconfig @@ -30,6 +30,16 @@ config AMD_IOMMU your BIOS for an option to enable it or if you have an IVRS ACPI table. +config AMD_IOMMU_IOMMUFD + bool "Enable IOMMUFD features for AMD IOMMU (EXPERIMENTAL)" + depends on IOMMUFD + depends on AMD_IOMMU + help + Support for IOMMUFD features intended to support virtual machines + with accelerated virtual IOMMUs. + + Say Y here if you are doing development and testing on this feature. + config AMD_IOMMU_DEBUGFS bool "Enable AMD IOMMU internals in DebugFS" depends on AMD_IOMMU && IOMMU_DEBUGFS diff --git a/drivers/iommu/amd/Makefile b/drivers/iommu/amd/Makefile index 5412a563c697..41f053b49dce 100644 --- a/drivers/iommu/amd/Makefile +++ b/drivers/iommu/amd/Makefile @@ -1,3 +1,4 @@ # SPDX-License-Identifier: GPL-2.0-only obj-y += iommu.o init.o quirks.o ppr.o pasid.o +obj-$(CONFIG_AMD_IOMMU_IOMMUFD) += iommufd.o obj-$(CONFIG_AMD_IOMMU_DEBUGFS) += debugfs.o diff --git a/drivers/iommu/amd/iommu.c b/drivers/iommu/amd/iommu.c index d7f457338de7..d550a7e431ac 100644 --- a/drivers/iommu/amd/iommu.c +++ b/drivers/iommu/amd/iommu.c @@ -43,6 +43,7 @@ #include #include "amd_iommu.h" +#include "iommufd.h" #include "../irq_remapping.h" #include "../iommu-pages.h" @@ -3083,6 +3084,7 @@ static bool amd_iommu_enforce_cache_coherency(struct iommu_domain *domain) const struct iommu_ops amd_iommu_ops = { .capable = amd_iommu_capable, + .hw_info = amd_iommufd_hw_info, .blocked_domain = &blocked_domain, .release_domain = &blocked_domain, .identity_domain = &identity_domain.domain, diff --git a/drivers/iommu/amd/iommufd.c b/drivers/iommu/amd/iommufd.c new file mode 100644 index 000000000000..72eaaa923d04 --- /dev/null +++ b/drivers/iommu/amd/iommufd.c @@ -0,0 +1,31 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (C) 2025 Advanced Micro Devices, Inc. + */ + +#include + +#include "iommufd.h" +#include "amd_iommu.h" +#include "amd_iommu_types.h" + +void *amd_iommufd_hw_info(struct device *dev, u32 *length, u32 *type) +{ + struct iommu_hw_info_amd *hwinfo; + + if (*type != IOMMU_HW_INFO_TYPE_DEFAULT && + *type != IOMMU_HW_INFO_TYPE_AMD) + return ERR_PTR(-EOPNOTSUPP); + + hwinfo = kzalloc(sizeof(*hwinfo), GFP_KERNEL); + if (!hwinfo) + return ERR_PTR(-ENOMEM); + + *length = sizeof(*hwinfo); + *type = IOMMU_HW_INFO_TYPE_AMD; + + hwinfo->efr = amd_iommu_efr; + hwinfo->efr2 = amd_iommu_efr2; + + return hwinfo; +} diff --git a/drivers/iommu/amd/iommufd.h b/drivers/iommu/amd/iommufd.h new file mode 100644 index 000000000000..f880be80a30d --- /dev/null +++ b/drivers/iommu/amd/iommufd.h @@ -0,0 +1,15 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright (C) 2025 Advanced Micro Devices, Inc. + */ + +#ifndef AMD_IOMMUFD_H +#define AMD_IOMMUFD_H + +#if IS_ENABLED(CONFIG_AMD_IOMMU_IOMMUFD) +void *amd_iommufd_hw_info(struct device *dev, u32 *length, u32 *type); +#else +#define amd_iommufd_hw_info NULL +#endif /* CONFIG_AMD_IOMMU_IOMMUFD */ + +#endif /* AMD_IOMMUFD_H */ diff --git a/include/uapi/linux/iommufd.h b/include/uapi/linux/iommufd.h index 2c41920b641d..3db37f6042a0 100644 --- a/include/uapi/linux/iommufd.h +++ b/include/uapi/linux/iommufd.h @@ -623,6 +623,32 @@ struct iommu_hw_info_tegra241_cmdqv { __u8 __reserved; }; +/** + * struct iommu_hw_info_amd - AMD IOMMU device info + * + * @efr : Value of AMD IOMMU Extended Feature Register (EFR) + * @efr2: Value of AMD IOMMU Extended Feature 2 Register (EFR2) + * + * Please See description of these registers in the following sections of + * the AMD I/O Virtualization Technology (IOMMU) Specification. + * (https://docs.amd.com/v/u/en-US/48882_3.10_PUB) + * + * - MMIO Offset 0030h IOMMU Extended Feature Register + * - MMIO Offset 01A0h IOMMU Extended Feature 2 Register + * + * Note: The EFR and EFR2 are raw values reported by hardware. + * VMM is responsible to determine the appropriate flags to be exposed to + * the VM since cetertain features are not currently supported by the kernel + * for HW-vIOMMU. + * + * Current VMM-allowed list of feature flags are: + * - EFR[GTSup, GASup, GioSup, PPRSup, EPHSup, GATS, GLX, PASmax] + */ +struct iommu_hw_info_amd { + __aligned_u64 efr; + __aligned_u64 efr2; +}; + /** * enum iommu_hw_info_type - IOMMU Hardware Info Types * @IOMMU_HW_INFO_TYPE_NONE: Output by the drivers that do not report hardware @@ -632,6 +658,7 @@ struct iommu_hw_info_tegra241_cmdqv { * @IOMMU_HW_INFO_TYPE_ARM_SMMUV3: ARM SMMUv3 iommu info type * @IOMMU_HW_INFO_TYPE_TEGRA241_CMDQV: NVIDIA Tegra241 CMDQV (extension for ARM * SMMUv3) info type + * @IOMMU_HW_INFO_TYPE_AMD: AMD IOMMU info type */ enum iommu_hw_info_type { IOMMU_HW_INFO_TYPE_NONE = 0, @@ -639,6 +666,7 @@ enum iommu_hw_info_type { IOMMU_HW_INFO_TYPE_INTEL_VTD = 1, IOMMU_HW_INFO_TYPE_ARM_SMMUV3 = 2, IOMMU_HW_INFO_TYPE_TEGRA241_CMDQV = 3, + IOMMU_HW_INFO_TYPE_AMD = 4, }; /** -- cgit v1.2.3 From e05698c10d980ac0a0b57ed81ec9353b9e9533c6 Mon Sep 17 00:00:00 2001 From: Suravee Suthikulpanit Date: Thu, 15 Jan 2026 06:08:06 +0000 Subject: iommufd: Introduce data struct for AMD nested domain allocation Introduce IOMMU_HWPT_DATA_AMD_GUEST data type for IOMMU guest page table, which is used for stage-1 in nested translation. The data structure contains information necessary for setting up the AMD HW-vIOMMU support. Reviewed-by: Jason Gunthorpe Reviewed-by: Nicolin Chen Reviewed-by: Vasant Hegde Signed-off-by: Suravee Suthikulpanit Signed-off-by: Joerg Roedel --- include/uapi/linux/iommufd.h | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/iommufd.h b/include/uapi/linux/iommufd.h index 3db37f6042a0..1dafbc552d37 100644 --- a/include/uapi/linux/iommufd.h +++ b/include/uapi/linux/iommufd.h @@ -465,16 +465,27 @@ struct iommu_hwpt_arm_smmuv3 { __aligned_le64 ste[2]; }; +/** + * struct iommu_hwpt_amd_guest - AMD IOMMU guest I/O page table data + * (IOMMU_HWPT_DATA_AMD_GUEST) + * @dte: Guest Device Table Entry (DTE) + */ +struct iommu_hwpt_amd_guest { + __aligned_u64 dte[4]; +}; + /** * enum iommu_hwpt_data_type - IOMMU HWPT Data Type * @IOMMU_HWPT_DATA_NONE: no data * @IOMMU_HWPT_DATA_VTD_S1: Intel VT-d stage-1 page table * @IOMMU_HWPT_DATA_ARM_SMMUV3: ARM SMMUv3 Context Descriptor Table + * @IOMMU_HWPT_DATA_AMD_GUEST: AMD IOMMU guest page table */ enum iommu_hwpt_data_type { IOMMU_HWPT_DATA_NONE = 0, IOMMU_HWPT_DATA_VTD_S1 = 1, IOMMU_HWPT_DATA_ARM_SMMUV3 = 2, + IOMMU_HWPT_DATA_AMD_GUEST = 3, }; /** -- cgit v1.2.3 From cd16edba1c6a24af138e1a5ded2711231fffa99f Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Thu, 4 Dec 2025 11:19:10 +0100 Subject: ext4: fix ext4_tune_sb_params padding The padding at the end of struct ext4_tune_sb_params is architecture specific and in particular is different between x86-32 and x86-64, since the __u64 member only enforces struct alignment on the latter. This shows up as a new warning when test-building the headers with -Wpadded: include/linux/ext4.h:144:1: error: padding struct size to alignment boundary with 4 bytes [-Werror=padded] All members inside the structure are naturally aligned, so the only difference here is the amount of padding at the end. Make the padding explicit, to have a consistent sizeof(struct ext4_tune_sb_params) of 232 on all architectures and avoid adding compat ioctl handling for EXT4_IOC_GET_TUNE_SB_PARAM/EXT4_IOC_SET_TUNE_SB_PARAM. This is an ABI break on x86-32 but hopefully this can go into 6.18.y early enough as a fixup so no actual users will be affected. Alternatively, the kernel could handle the ioctl commands for both sizes (232 and 228 bytes) on all architectures. Fixes: 04a91570ac67 ("ext4: implemet new ioctls to set and get superblock parameters") Signed-off-by: Arnd Bergmann Reviewed-by: Jan Kara Link: https://patch.msgid.link/20251204101914.1037148-1-arnd@kernel.org Signed-off-by: Theodore Ts'o Cc: stable@kernel.org --- include/uapi/linux/ext4.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/ext4.h b/include/uapi/linux/ext4.h index 411dcc1e4a35..9c683991c32f 100644 --- a/include/uapi/linux/ext4.h +++ b/include/uapi/linux/ext4.h @@ -139,7 +139,7 @@ struct ext4_tune_sb_params { __u32 clear_feature_incompat_mask; __u32 clear_feature_ro_compat_mask; __u8 mount_opts[64]; - __u8 pad[64]; + __u8 pad[68]; }; #define EXT4_TUNE_FL_ERRORS_BEHAVIOR 0x00000001 -- cgit v1.2.3 From 8a42938a28941da29bf3e4cd2af877b0d5d929e1 Mon Sep 17 00:00:00 2001 From: Miri Korenblit Date: Wed, 7 Jan 2026 14:22:54 +0200 Subject: wifi: nl80211: ignore cluster id after NAN started After NAN was started, cluster id updates from the user space should not happen, since the device already started a cluster with the previousely provided id. Since NL80211_CMD_CHANGE_NAN_CONFIG requires to set the full NAN configuration, we can't require that NL80211_NAN_CONF_CLUSTER_ID won't be included in this command, and keeping the last confgiured value just to be able to compare it against the new one seems a bit overkill. Therefore, just ignore cluster id in this command and clarify the documentation. Signed-off-by: Miri Korenblit Link: https://patch.msgid.link/20260107142229.fb55e5853269.I10d18c8f69d98b28916596d6da4207c15ea4abb5@changeid Signed-off-by: Johannes Berg --- include/uapi/linux/nl80211.h | 4 +++- net/wireless/nl80211.c | 11 ++++++----- 2 files changed, 9 insertions(+), 6 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h index eb92296457c9..b0f050e36fa4 100644 --- a/include/uapi/linux/nl80211.h +++ b/include/uapi/linux/nl80211.h @@ -11,7 +11,7 @@ * Copyright 2008 Jouni Malinen * Copyright 2008 Colin McCabe * Copyright 2015-2017 Intel Deutschland GmbH - * Copyright (C) 2018-2025 Intel Corporation + * Copyright (C) 2018-2026 Intel Corporation * * Permission to use, copy, modify, and/or distribute this software for any * purpose with or without fee is hereby granted, provided that the above @@ -7454,6 +7454,8 @@ enum nl80211_nan_band_conf_attributes { * address that can take values from 50-6F-9A-01-00-00 to * 50-6F-9A-01-FF-FF. This attribute is optional. If not present, * a random Cluster ID will be chosen. + * This attribute will be ignored in NL80211_CMD_CHANGE_NAN_CONFIG + * since after NAN was started, the cluster ID can no longer change. * @NL80211_NAN_CONF_EXTRA_ATTRS: Additional NAN attributes to be * published in the beacons. This is an optional byte array. * @NL80211_NAN_CONF_VENDOR_ELEMS: Vendor-specific elements that will diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index bcf30c5f5042..56cc5ed33ea3 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -5,7 +5,7 @@ * Copyright 2006-2010 Johannes Berg * Copyright 2013-2014 Intel Mobile Communications GmbH * Copyright 2015-2017 Intel Deutschland GmbH - * Copyright (C) 2018-2025 Intel Corporation + * Copyright (C) 2018-2026 Intel Corporation */ #include @@ -15583,7 +15583,8 @@ static int nl80211_parse_nan_band_config(struct wiphy *wiphy, static int nl80211_parse_nan_conf(struct wiphy *wiphy, struct genl_info *info, struct cfg80211_nan_conf *conf, - u32 *changed_flags) + u32 *changed_flags, + bool start) { struct nlattr *attrs[NL80211_NAN_CONF_ATTR_MAX + 1]; int err, rem; @@ -15630,7 +15631,7 @@ static int nl80211_parse_nan_conf(struct wiphy *wiphy, return err; changed |= CFG80211_NAN_CONF_CHANGED_CONFIG; - if (attrs[NL80211_NAN_CONF_CLUSTER_ID]) + if (attrs[NL80211_NAN_CONF_CLUSTER_ID] && start) conf->cluster_id = nla_data(attrs[NL80211_NAN_CONF_CLUSTER_ID]); @@ -15741,7 +15742,7 @@ static int nl80211_start_nan(struct sk_buff *skb, struct genl_info *info) if (!info->attrs[NL80211_ATTR_NAN_MASTER_PREF]) return -EINVAL; - err = nl80211_parse_nan_conf(&rdev->wiphy, info, &conf, NULL); + err = nl80211_parse_nan_conf(&rdev->wiphy, info, &conf, NULL, true); if (err) return err; @@ -16107,7 +16108,7 @@ static int nl80211_nan_change_config(struct sk_buff *skb, if (!wdev_running(wdev)) return -ENOTCONN; - err = nl80211_parse_nan_conf(&rdev->wiphy, info, &conf, &changed); + err = nl80211_parse_nan_conf(&rdev->wiphy, info, &conf, &changed, false); if (err) return err; -- cgit v1.2.3 From a9927022c4491ba44249af079e8799ce56f8053c Mon Sep 17 00:00:00 2001 From: Mika Westerberg Date: Thu, 15 Jan 2026 12:56:44 +0100 Subject: net: ethtool: Add support for 80Gbps speed USB4 v2 link used in peer-to-peer networking is symmetric 80Gbps so in order to support reading this link speed, add support for it to ethtool. Signed-off-by: Mika Westerberg Reviewed-by: Andrew Lunn Link: https://patch.msgid.link/20260115115646.328898-3-mika.westerberg@linux.intel.com Signed-off-by: Jakub Kicinski --- drivers/net/phy/phy-caps.h | 1 + drivers/net/phy/phy-core.c | 2 ++ drivers/net/phy/phy_caps.c | 2 ++ drivers/net/phy/phylink.c | 1 + include/linux/phylink.h | 7 ++++--- include/uapi/linux/ethtool.h | 1 + 6 files changed, 11 insertions(+), 3 deletions(-) (limited to 'include/uapi/linux') diff --git a/drivers/net/phy/phy-caps.h b/drivers/net/phy/phy-caps.h index 5f3f757e0b2f..421088e6f6e8 100644 --- a/drivers/net/phy/phy-caps.h +++ b/drivers/net/phy/phy-caps.h @@ -25,6 +25,7 @@ enum { LINK_CAPA_40000FD, LINK_CAPA_50000FD, LINK_CAPA_56000FD, + LINK_CAPA_80000FD, LINK_CAPA_100000FD, LINK_CAPA_200000FD, LINK_CAPA_400000FD, diff --git a/drivers/net/phy/phy-core.c b/drivers/net/phy/phy-core.c index 3badf6e84554..d7a4a977fc8a 100644 --- a/drivers/net/phy/phy-core.c +++ b/drivers/net/phy/phy-core.c @@ -48,6 +48,8 @@ const char *phy_speed_to_str(int speed) return "50Gbps"; case SPEED_56000: return "56Gbps"; + case SPEED_80000: + return "80Gbps"; case SPEED_100000: return "100Gbps"; case SPEED_200000: diff --git a/drivers/net/phy/phy_caps.c b/drivers/net/phy/phy_caps.c index 17a63c931335..942d43191561 100644 --- a/drivers/net/phy/phy_caps.c +++ b/drivers/net/phy/phy_caps.c @@ -21,6 +21,7 @@ static struct link_capabilities link_caps[__LINK_CAPA_MAX] __ro_after_init = { { SPEED_40000, DUPLEX_FULL, {0} }, /* LINK_CAPA_40000FD */ { SPEED_50000, DUPLEX_FULL, {0} }, /* LINK_CAPA_50000FD */ { SPEED_56000, DUPLEX_FULL, {0} }, /* LINK_CAPA_56000FD */ + { SPEED_80000, DUPLEX_FULL, {0} }, /* LINK_CAPA_80000FD */ { SPEED_100000, DUPLEX_FULL, {0} }, /* LINK_CAPA_100000FD */ { SPEED_200000, DUPLEX_FULL, {0} }, /* LINK_CAPA_200000FD */ { SPEED_400000, DUPLEX_FULL, {0} }, /* LINK_CAPA_400000FD */ @@ -49,6 +50,7 @@ static int speed_duplex_to_capa(int speed, unsigned int duplex) case SPEED_40000: return LINK_CAPA_40000FD; case SPEED_50000: return LINK_CAPA_50000FD; case SPEED_56000: return LINK_CAPA_56000FD; + case SPEED_80000: return LINK_CAPA_80000FD; case SPEED_100000: return LINK_CAPA_100000FD; case SPEED_200000: return LINK_CAPA_200000FD; case SPEED_400000: return LINK_CAPA_400000FD; diff --git a/drivers/net/phy/phylink.c b/drivers/net/phy/phylink.c index 43d8380aaefb..c8fd6b91cdd4 100644 --- a/drivers/net/phy/phylink.c +++ b/drivers/net/phy/phylink.c @@ -311,6 +311,7 @@ static struct { { MAC_400000FD, SPEED_400000, DUPLEX_FULL, BIT(LINK_CAPA_400000FD) }, { MAC_200000FD, SPEED_200000, DUPLEX_FULL, BIT(LINK_CAPA_200000FD) }, { MAC_100000FD, SPEED_100000, DUPLEX_FULL, BIT(LINK_CAPA_100000FD) }, + { MAC_80000FD, SPEED_80000, DUPLEX_FULL, BIT(LINK_CAPA_80000FD) }, { MAC_56000FD, SPEED_56000, DUPLEX_FULL, BIT(LINK_CAPA_56000FD) }, { MAC_50000FD, SPEED_50000, DUPLEX_FULL, BIT(LINK_CAPA_50000FD) }, { MAC_40000FD, SPEED_40000, DUPLEX_FULL, BIT(LINK_CAPA_40000FD) }, diff --git a/include/linux/phylink.h b/include/linux/phylink.h index 38363e566ac3..20996f5778d1 100644 --- a/include/linux/phylink.h +++ b/include/linux/phylink.h @@ -90,9 +90,10 @@ enum { MAC_40000FD = BIT(13), MAC_50000FD = BIT(14), MAC_56000FD = BIT(15), - MAC_100000FD = BIT(16), - MAC_200000FD = BIT(17), - MAC_400000FD = BIT(18), + MAC_80000FD = BIT(16), + MAC_100000FD = BIT(17), + MAC_200000FD = BIT(18), + MAC_400000FD = BIT(19), }; static inline bool phylink_autoneg_inband(unsigned int mode) diff --git a/include/uapi/linux/ethtool.h b/include/uapi/linux/ethtool.h index ce9aeb65a8e1..b74b80508553 100644 --- a/include/uapi/linux/ethtool.h +++ b/include/uapi/linux/ethtool.h @@ -2203,6 +2203,7 @@ enum ethtool_link_mode_bit_indices { #define SPEED_40000 40000 #define SPEED_50000 50000 #define SPEED_56000 56000 +#define SPEED_80000 80000 #define SPEED_100000 100000 #define SPEED_200000 200000 #define SPEED_400000 400000 -- cgit v1.2.3 From 50b359896fe55d0443ed550e1fabba71d242031a Mon Sep 17 00:00:00 2001 From: Benjamin Berg Date: Sun, 18 Jan 2026 09:51:15 +0200 Subject: wifi: cfg80211: ignore link disabled flag from userspace When the AP has an advertised TID to Link Mapping (TTLM) it shall include the element in the association response. As such, when this element is present it needs to be used for the currently dormant links. See Draft P802.11REVmf_D1.0 section 35.3.7.2.3 ("Negotiation of TTLM") for the details. The flag is also not usable in case userspace wants to specify a negotiated TTLM during association. Note that for the link reconfiguration case, mac80211 did not use the information. Draft P802.11REVmf_D1.0 states in section 35.3.6.4 ("Link reconfiguration to the setup links) that we "shall operate with all the TIDs mapped to the newly added links ..." All this means that the flag is not needed. The implementation should parse the information from the association response. Signed-off-by: Benjamin Berg Reviewed-by: Johannes Berg Reviewed-by: Ilan Peer Signed-off-by: Miri Korenblit Link: https://patch.msgid.link/20260118093904.754e057896a5.Ifd06f5ef839a93bfd54d0593dc932870f95f3242@changeid Signed-off-by: Johannes Berg --- include/net/cfg80211.h | 3 --- include/uapi/linux/nl80211.h | 5 +++-- net/wireless/nl80211.c | 10 ---------- 3 files changed, 3 insertions(+), 15 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index 899f267b7cf9..2900202588a5 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -3221,8 +3221,6 @@ struct cfg80211_auth_request { * if this is %NULL for a link, that link is not requested * @elems: extra elements for the per-STA profile for this link * @elems_len: length of the elements - * @disabled: If set this link should be included during association etc. but it - * should not be used until enabled by the AP MLD. * @error: per-link error code, must be <= 0. If there is an error, then the * operation as a whole must fail. */ @@ -3230,7 +3228,6 @@ struct cfg80211_assoc_link { struct cfg80211_bss *bss; const u8 *elems; size_t elems_len; - bool disabled; int error; }; diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h index 8134f10e4e6c..8433bac48112 100644 --- a/include/uapi/linux/nl80211.h +++ b/include/uapi/linux/nl80211.h @@ -2880,8 +2880,9 @@ enum nl80211_commands { * index. If the userspace includes more RNR elements than number of * MBSSID elements then these will be added in every EMA beacon. * - * @NL80211_ATTR_MLO_LINK_DISABLED: Flag attribute indicating that the link is - * disabled. + * @NL80211_ATTR_MLO_LINK_DISABLED: Unused. It was used to indicate that a link + * is disabled during association. However, the AP will send the + * information by including a TTLM in the association response. * * @NL80211_ATTR_BSS_DUMP_INCLUDE_USE_DATA: Include BSS usage data, i.e. * include BSSes that can only be used in restricted scenarios and/or diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index c961cd42a832..03efd45c007f 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -12241,9 +12241,6 @@ static int nl80211_process_links(struct cfg80211_registered_device *rdev, return -EINVAL; } } - - links[link_id].disabled = - nla_get_flag(attrs[NL80211_ATTR_MLO_LINK_DISABLED]); } return 0; @@ -12423,13 +12420,6 @@ static int nl80211_associate(struct sk_buff *skb, struct genl_info *info) goto free; } - if (req.links[req.link_id].disabled) { - GENL_SET_ERR_MSG(info, - "cannot have assoc link disabled"); - err = -EINVAL; - goto free; - } - if (info->attrs[NL80211_ATTR_ASSOC_MLD_EXT_CAPA_OPS]) req.ext_mld_capa_ops = nla_get_u16(info->attrs[NL80211_ATTR_ASSOC_MLD_EXT_CAPA_OPS]); -- cgit v1.2.3 From a5546e18f77c0cb15d434bf5b92647687fe483e3 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Thu, 15 Jan 2026 09:25:48 +0100 Subject: net: Add queue-create operation Add a ynl netdev family operation called queue-create that creates a new queue on a netdevice: name: queue-create attribute-set: queue flags: [admin-perm] do: request: attributes: - ifindex - type - lease reply: &queue-create-op attributes: - id This is a generic operation such that it can be extended for various use cases in future. Right now it is mandatory to specify ifindex, the queue type which is enforced to rx and a lease. The newly created queue id is returned to the caller. A queue from a virtual device can have a lease which refers to another queue from a physical device. This is useful for memory providers and AF_XDP operations which take an ifindex and queue id to allow applications to bind against virtual devices in containers. The lease couples both queues together and allows to proxy the operations from a virtual device in a container to the physical device. In future, the nested lease attribute can be lifted and made optional for other use-cases such as dynamic queue creation for physical netdevs. The lack of lease and the specification of the physical device as an ifindex will imply that we need a real queue to be allocated. Similarly, the queue type enforcement to rx can then be lifted as well to support tx. An early implementation had only driver-specific integration [0], but in order for other virtual devices to reuse, it makes sense to have this as a generic API in core net. For leasing queues, the virtual netdev must have real_num_rx_queue less than num_rx_queues at the time of calling queue-create. The queue-type must be rx as only rx queues are supported for leasing for now. We also enforce that the queue-create ifindex must point to a virtual device, and that the nested lease attribute's ifindex must point to a physical device. The nested lease attribute set contains a netns-id attribute which is currently only intended for dumping as part of the queue-get operation. Also, it is modeled as an s32 type similarly as done elsewhere in the stack. Signed-off-by: Daniel Borkmann Co-developed-by: David Wei Signed-off-by: David Wei Link: https://bpfconf.ebpf.io/bpfconf2025/bpfconf2025_material/lsfmmbpf_2025_netkit_borkmann.pdf [0] Acked-by: Stanislav Fomichev Reviewed-by: Nikolay Aleksandrov Link: https://patch.msgid.link/20260115082603.219152-2-daniel@iogearbox.net Signed-off-by: Paolo Abeni --- Documentation/netlink/specs/netdev.yaml | 44 +++++++++++++++++++++++++++++++++ include/uapi/linux/netdev.h | 11 +++++++++ net/core/netdev-genl-gen.c | 20 +++++++++++++++ net/core/netdev-genl-gen.h | 2 ++ net/core/netdev-genl.c | 5 ++++ tools/include/uapi/linux/netdev.h | 11 +++++++++ 6 files changed, 93 insertions(+) (limited to 'include/uapi/linux') diff --git a/Documentation/netlink/specs/netdev.yaml b/Documentation/netlink/specs/netdev.yaml index 596c306ce52b..b86db8656eac 100644 --- a/Documentation/netlink/specs/netdev.yaml +++ b/Documentation/netlink/specs/netdev.yaml @@ -339,6 +339,15 @@ attribute-sets: doc: XSK information for this queue, if any. type: nest nested-attributes: xsk-info + - + name: lease + doc: | + A queue from a virtual device can have a lease which refers to + another queue from a physical device. This is useful for memory + providers and AF_XDP operations which take an ifindex and queue id + to allow applications to bind against virtual devices in containers. + type: nest + nested-attributes: lease - name: qstats doc: | @@ -537,6 +546,24 @@ attribute-sets: name: id - name: type + - + name: lease + attributes: + - + name: ifindex + doc: The netdev ifindex to lease the queue from. + type: u32 + checks: + min: 1 + - + name: queue + doc: The netdev queue to lease from. + type: nest + nested-attributes: queue-id + - + name: netns-id + doc: The network namespace id of the netdev. + type: s32 - name: dmabuf attributes: @@ -686,6 +713,7 @@ operations: - dmabuf - io-uring - xsk + - lease dump: request: attributes: @@ -797,6 +825,22 @@ operations: reply: attributes: - id + - + name: queue-create + doc: | + Create a new queue for the given netdevice. Whether this operation + is supported depends on the device and the driver. + attribute-set: queue + flags: [admin-perm] + do: + request: + attributes: + - ifindex + - type + - lease + reply: &queue-create-op + attributes: + - id kernel-family: headers: ["net/netdev_netlink.h"] diff --git a/include/uapi/linux/netdev.h b/include/uapi/linux/netdev.h index e0b579a1df4f..7df1056a35fd 100644 --- a/include/uapi/linux/netdev.h +++ b/include/uapi/linux/netdev.h @@ -160,6 +160,7 @@ enum { NETDEV_A_QUEUE_DMABUF, NETDEV_A_QUEUE_IO_URING, NETDEV_A_QUEUE_XSK, + NETDEV_A_QUEUE_LEASE, __NETDEV_A_QUEUE_MAX, NETDEV_A_QUEUE_MAX = (__NETDEV_A_QUEUE_MAX - 1) @@ -202,6 +203,15 @@ enum { NETDEV_A_QSTATS_MAX = (__NETDEV_A_QSTATS_MAX - 1) }; +enum { + NETDEV_A_LEASE_IFINDEX = 1, + NETDEV_A_LEASE_QUEUE, + NETDEV_A_LEASE_NETNS_ID, + + __NETDEV_A_LEASE_MAX, + NETDEV_A_LEASE_MAX = (__NETDEV_A_LEASE_MAX - 1) +}; + enum { NETDEV_A_DMABUF_IFINDEX = 1, NETDEV_A_DMABUF_QUEUES, @@ -228,6 +238,7 @@ enum { NETDEV_CMD_BIND_RX, NETDEV_CMD_NAPI_SET, NETDEV_CMD_BIND_TX, + NETDEV_CMD_QUEUE_CREATE, __NETDEV_CMD_MAX, NETDEV_CMD_MAX = (__NETDEV_CMD_MAX - 1) diff --git a/net/core/netdev-genl-gen.c b/net/core/netdev-genl-gen.c index ba673e81716f..52ba99c019e7 100644 --- a/net/core/netdev-genl-gen.c +++ b/net/core/netdev-genl-gen.c @@ -28,6 +28,12 @@ static const struct netlink_range_validation netdev_a_napi_defer_hard_irqs_range }; /* Common nested types */ +const struct nla_policy netdev_lease_nl_policy[NETDEV_A_LEASE_NETNS_ID + 1] = { + [NETDEV_A_LEASE_IFINDEX] = NLA_POLICY_MIN(NLA_U32, 1), + [NETDEV_A_LEASE_QUEUE] = NLA_POLICY_NESTED(netdev_queue_id_nl_policy), + [NETDEV_A_LEASE_NETNS_ID] = { .type = NLA_S32, }, +}; + const struct nla_policy netdev_page_pool_info_nl_policy[NETDEV_A_PAGE_POOL_IFINDEX + 1] = { [NETDEV_A_PAGE_POOL_ID] = NLA_POLICY_FULL_RANGE(NLA_UINT, &netdev_a_page_pool_id_range), [NETDEV_A_PAGE_POOL_IFINDEX] = NLA_POLICY_FULL_RANGE(NLA_U32, &netdev_a_page_pool_ifindex_range), @@ -107,6 +113,13 @@ static const struct nla_policy netdev_bind_tx_nl_policy[NETDEV_A_DMABUF_FD + 1] [NETDEV_A_DMABUF_FD] = { .type = NLA_U32, }, }; +/* NETDEV_CMD_QUEUE_CREATE - do */ +static const struct nla_policy netdev_queue_create_nl_policy[NETDEV_A_QUEUE_LEASE + 1] = { + [NETDEV_A_QUEUE_IFINDEX] = NLA_POLICY_MIN(NLA_U32, 1), + [NETDEV_A_QUEUE_TYPE] = NLA_POLICY_MAX(NLA_U32, 1), + [NETDEV_A_QUEUE_LEASE] = NLA_POLICY_NESTED(netdev_lease_nl_policy), +}; + /* Ops table for netdev */ static const struct genl_split_ops netdev_nl_ops[] = { { @@ -205,6 +218,13 @@ static const struct genl_split_ops netdev_nl_ops[] = { .maxattr = NETDEV_A_DMABUF_FD, .flags = GENL_CMD_CAP_DO, }, + { + .cmd = NETDEV_CMD_QUEUE_CREATE, + .doit = netdev_nl_queue_create_doit, + .policy = netdev_queue_create_nl_policy, + .maxattr = NETDEV_A_QUEUE_LEASE, + .flags = GENL_ADMIN_PERM | GENL_CMD_CAP_DO, + }, }; static const struct genl_multicast_group netdev_nl_mcgrps[] = { diff --git a/net/core/netdev-genl-gen.h b/net/core/netdev-genl-gen.h index cffc08517a41..d71b435d72c1 100644 --- a/net/core/netdev-genl-gen.h +++ b/net/core/netdev-genl-gen.h @@ -14,6 +14,7 @@ #include /* Common nested types */ +extern const struct nla_policy netdev_lease_nl_policy[NETDEV_A_LEASE_NETNS_ID + 1]; extern const struct nla_policy netdev_page_pool_info_nl_policy[NETDEV_A_PAGE_POOL_IFINDEX + 1]; extern const struct nla_policy netdev_queue_id_nl_policy[NETDEV_A_QUEUE_TYPE + 1]; @@ -36,6 +37,7 @@ int netdev_nl_qstats_get_dumpit(struct sk_buff *skb, int netdev_nl_bind_rx_doit(struct sk_buff *skb, struct genl_info *info); int netdev_nl_napi_set_doit(struct sk_buff *skb, struct genl_info *info); int netdev_nl_bind_tx_doit(struct sk_buff *skb, struct genl_info *info); +int netdev_nl_queue_create_doit(struct sk_buff *skb, struct genl_info *info); enum { NETDEV_NLGRP_MGMT, diff --git a/net/core/netdev-genl.c b/net/core/netdev-genl.c index 470fabbeacd9..aae75431858d 100644 --- a/net/core/netdev-genl.c +++ b/net/core/netdev-genl.c @@ -1120,6 +1120,11 @@ err_genlmsg_free: return err; } +int netdev_nl_queue_create_doit(struct sk_buff *skb, struct genl_info *info) +{ + return -EOPNOTSUPP; +} + void netdev_nl_sock_priv_init(struct netdev_nl_sock *priv) { INIT_LIST_HEAD(&priv->bindings); diff --git a/tools/include/uapi/linux/netdev.h b/tools/include/uapi/linux/netdev.h index e0b579a1df4f..7df1056a35fd 100644 --- a/tools/include/uapi/linux/netdev.h +++ b/tools/include/uapi/linux/netdev.h @@ -160,6 +160,7 @@ enum { NETDEV_A_QUEUE_DMABUF, NETDEV_A_QUEUE_IO_URING, NETDEV_A_QUEUE_XSK, + NETDEV_A_QUEUE_LEASE, __NETDEV_A_QUEUE_MAX, NETDEV_A_QUEUE_MAX = (__NETDEV_A_QUEUE_MAX - 1) @@ -202,6 +203,15 @@ enum { NETDEV_A_QSTATS_MAX = (__NETDEV_A_QSTATS_MAX - 1) }; +enum { + NETDEV_A_LEASE_IFINDEX = 1, + NETDEV_A_LEASE_QUEUE, + NETDEV_A_LEASE_NETNS_ID, + + __NETDEV_A_LEASE_MAX, + NETDEV_A_LEASE_MAX = (__NETDEV_A_LEASE_MAX - 1) +}; + enum { NETDEV_A_DMABUF_IFINDEX = 1, NETDEV_A_DMABUF_QUEUES, @@ -228,6 +238,7 @@ enum { NETDEV_CMD_BIND_RX, NETDEV_CMD_NAPI_SET, NETDEV_CMD_BIND_TX, + NETDEV_CMD_QUEUE_CREATE, __NETDEV_CMD_MAX, NETDEV_CMD_MAX = (__NETDEV_CMD_MAX - 1) -- cgit v1.2.3 From b5c3fa4a0b16d4a7d0bd0e5626a13fec0024030a Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Thu, 15 Jan 2026 09:25:56 +0100 Subject: netkit: Add single device mode for netkit Add a single device mode for netkit instead of netkit pairs. The primary target for the paired devices is to connect network namespaces, of course, and support has been implemented in projects like Cilium [0]. For the rxq leasing the plan is to support two main scenarios related to single device mode: * For the use-case of io_uring zero-copy, the control plane can either set up a netkit pair where the peer device can perform rxq leasing which is then tied to the lifetime of the peer device, or the control plane can use a regular netkit pair to connect the hostns to a Pod/container and dynamically add/remove rxq leasing through a single device without having to interrupt the device pair. In the case of io_uring, the memory pool is used as skb non-linear pages, and thus the skb will go its way through the regular stack into netkit. Things like the netkit policy when no BPF is attached or skb scrubbing etc apply as-is in case the paired devices are used, or if the backend memory is tied to the single device and traffic goes through a paired device. * For the use-case of AF_XDP, the control plane needs to use netkit in the single device mode. The single device mode currently enforces only a pass policy when no BPF is attached, and does not yet support BPF link attachments for AF_XDP. skbs sent to that device get dropped at the moment. Given AF_XDP operates at a lower layer of the stack tying this to the netkit pair did not make sense. In future, the plan is to allow BPF at the XDP layer which can: i) process traffic coming from the AF_XDP application (e.g. QEMU with AF_XDP backend) to filter egress traffic or to push selected egress traffic up to the single netkit device to the local stack (e.g. DHCP requests), and ii) vice-versa skbs sent to the single netkit into the AF_XDP application (e.g. DHCP replies). Also, the control-plane can dynamically manage rxq leasing for the single netkit device without having to interrupt (e.g. down/up cycle) the main netkit pair for the Pod which has traffic going in and out. Signed-off-by: Daniel Borkmann Co-developed-by: David Wei Signed-off-by: David Wei Reviewed-by: Jordan Rife Reviewed-by: Nikolay Aleksandrov Link: https://docs.cilium.io/en/stable/operations/performance/tuning/#netkit-device-mode [0] Link: https://patch.msgid.link/20260115082603.219152-10-daniel@iogearbox.net Signed-off-by: Paolo Abeni --- drivers/net/netkit.c | 110 +++++++++++++++++++++++++++---------------- include/uapi/linux/if_link.h | 6 +++ 2 files changed, 76 insertions(+), 40 deletions(-) (limited to 'include/uapi/linux') diff --git a/drivers/net/netkit.c b/drivers/net/netkit.c index 0a2fef7caccb..76332a98af92 100644 --- a/drivers/net/netkit.c +++ b/drivers/net/netkit.c @@ -26,6 +26,7 @@ struct netkit { __cacheline_group_begin(netkit_slowpath); enum netkit_mode mode; + enum netkit_pairing pair; bool primary; u32 headroom; __cacheline_group_end(netkit_slowpath); @@ -135,6 +136,10 @@ static int netkit_open(struct net_device *dev) struct netkit *nk = netkit_priv(dev); struct net_device *peer = rtnl_dereference(nk->peer); + if (nk->pair == NETKIT_DEVICE_SINGLE) { + netif_carrier_on(dev); + return 0; + } if (!peer) return -ENOTCONN; if (peer->flags & IFF_UP) { @@ -335,6 +340,7 @@ static int netkit_new_link(struct net_device *dev, enum netkit_scrub scrub_prim = NETKIT_SCRUB_DEFAULT; enum netkit_scrub scrub_peer = NETKIT_SCRUB_DEFAULT; struct nlattr *peer_tb[IFLA_MAX + 1], **tbp, *attr; + enum netkit_pairing pair = NETKIT_DEVICE_PAIR; enum netkit_action policy_prim = NETKIT_PASS; enum netkit_action policy_peer = NETKIT_PASS; struct nlattr **data = params->data; @@ -343,7 +349,8 @@ static int netkit_new_link(struct net_device *dev, struct nlattr **tb = params->tb; u16 headroom = 0, tailroom = 0; struct ifinfomsg *ifmp = NULL; - struct net_device *peer; + struct net_device *peer = NULL; + bool seen_peer = false; char ifname[IFNAMSIZ]; struct netkit *nk; int err; @@ -380,6 +387,12 @@ static int netkit_new_link(struct net_device *dev, headroom = nla_get_u16(data[IFLA_NETKIT_HEADROOM]); if (data[IFLA_NETKIT_TAILROOM]) tailroom = nla_get_u16(data[IFLA_NETKIT_TAILROOM]); + if (data[IFLA_NETKIT_PAIRING]) + pair = nla_get_u32(data[IFLA_NETKIT_PAIRING]); + + seen_peer = data[IFLA_NETKIT_PEER_INFO] || + data[IFLA_NETKIT_PEER_SCRUB] || + data[IFLA_NETKIT_PEER_POLICY]; } if (ifmp && tbp[IFLA_IFNAME]) { @@ -392,45 +405,46 @@ static int netkit_new_link(struct net_device *dev, if (mode != NETKIT_L2 && (tb[IFLA_ADDRESS] || tbp[IFLA_ADDRESS])) return -EOPNOTSUPP; + if (pair == NETKIT_DEVICE_SINGLE && + (tb != tbp || seen_peer || policy_prim != NETKIT_PASS)) + return -EOPNOTSUPP; - peer = rtnl_create_link(peer_net, ifname, ifname_assign_type, - &netkit_link_ops, tbp, extack); - if (IS_ERR(peer)) - return PTR_ERR(peer); - - netif_inherit_tso_max(peer, dev); - if (headroom) { - peer->needed_headroom = headroom; - dev->needed_headroom = headroom; - } - if (tailroom) { - peer->needed_tailroom = tailroom; - dev->needed_tailroom = tailroom; - } - - if (mode == NETKIT_L2 && !(ifmp && tbp[IFLA_ADDRESS])) - eth_hw_addr_random(peer); - if (ifmp && dev->ifindex) - peer->ifindex = ifmp->ifi_index; - - nk = netkit_priv(peer); - nk->primary = false; - nk->policy = policy_peer; - nk->scrub = scrub_peer; - nk->mode = mode; - nk->headroom = headroom; - bpf_mprog_bundle_init(&nk->bundle); + if (pair == NETKIT_DEVICE_PAIR) { + peer = rtnl_create_link(peer_net, ifname, ifname_assign_type, + &netkit_link_ops, tbp, extack); + if (IS_ERR(peer)) + return PTR_ERR(peer); + + netif_inherit_tso_max(peer, dev); + if (headroom) + peer->needed_headroom = headroom; + if (tailroom) + peer->needed_tailroom = tailroom; + if (mode == NETKIT_L2 && !(ifmp && tbp[IFLA_ADDRESS])) + eth_hw_addr_random(peer); + if (ifmp && dev->ifindex) + peer->ifindex = ifmp->ifi_index; - err = register_netdevice(peer); - if (err < 0) - goto err_register_peer; - netif_carrier_off(peer); - if (mode == NETKIT_L2) - dev_change_flags(peer, peer->flags & ~IFF_NOARP, NULL); + nk = netkit_priv(peer); + nk->primary = false; + nk->policy = policy_peer; + nk->scrub = scrub_peer; + nk->mode = mode; + nk->pair = pair; + nk->headroom = headroom; + bpf_mprog_bundle_init(&nk->bundle); + + err = register_netdevice(peer); + if (err < 0) + goto err_register_peer; + netif_carrier_off(peer); + if (mode == NETKIT_L2) + dev_change_flags(peer, peer->flags & ~IFF_NOARP, NULL); - err = rtnl_configure_link(peer, NULL, 0, NULL); - if (err < 0) - goto err_configure_peer; + err = rtnl_configure_link(peer, NULL, 0, NULL); + if (err < 0) + goto err_configure_peer; + } if (mode == NETKIT_L2 && !tb[IFLA_ADDRESS]) eth_hw_addr_random(dev); @@ -438,12 +452,17 @@ static int netkit_new_link(struct net_device *dev, nla_strscpy(dev->name, tb[IFLA_IFNAME], IFNAMSIZ); else strscpy(dev->name, "nk%d", IFNAMSIZ); + if (headroom) + dev->needed_headroom = headroom; + if (tailroom) + dev->needed_tailroom = tailroom; nk = netkit_priv(dev); nk->primary = true; nk->policy = policy_prim; nk->scrub = scrub_prim; nk->mode = mode; + nk->pair = pair; nk->headroom = headroom; bpf_mprog_bundle_init(&nk->bundle); @@ -455,10 +474,12 @@ static int netkit_new_link(struct net_device *dev, dev_change_flags(dev, dev->flags & ~IFF_NOARP, NULL); rcu_assign_pointer(netkit_priv(dev)->peer, peer); - rcu_assign_pointer(netkit_priv(peer)->peer, dev); + if (peer) + rcu_assign_pointer(netkit_priv(peer)->peer, dev); return 0; err_configure_peer: - unregister_netdevice(peer); + if (peer) + unregister_netdevice(peer); return err; err_register_peer: free_netdev(peer); @@ -518,6 +539,8 @@ static struct net_device *netkit_dev_fetch(struct net *net, u32 ifindex, u32 whi nk = netkit_priv(dev); if (!nk->primary) return ERR_PTR(-EACCES); + if (nk->pair == NETKIT_DEVICE_SINGLE) + return ERR_PTR(-EOPNOTSUPP); if (which == BPF_NETKIT_PEER) { dev = rcu_dereference_rtnl(nk->peer); if (!dev) @@ -879,6 +902,7 @@ static int netkit_change_link(struct net_device *dev, struct nlattr *tb[], { IFLA_NETKIT_PEER_INFO, "peer info" }, { IFLA_NETKIT_HEADROOM, "headroom" }, { IFLA_NETKIT_TAILROOM, "tailroom" }, + { IFLA_NETKIT_PAIRING, "pairing" }, }; if (!nk->primary) { @@ -898,9 +922,11 @@ static int netkit_change_link(struct net_device *dev, struct nlattr *tb[], } if (data[IFLA_NETKIT_POLICY]) { + err = -EOPNOTSUPP; attr = data[IFLA_NETKIT_POLICY]; policy = nla_get_u32(attr); - err = netkit_check_policy(policy, attr, extack); + if (nk->pair == NETKIT_DEVICE_PAIR) + err = netkit_check_policy(policy, attr, extack); if (err) return err; WRITE_ONCE(nk->policy, policy); @@ -931,6 +957,7 @@ static size_t netkit_get_size(const struct net_device *dev) nla_total_size(sizeof(u8)) + /* IFLA_NETKIT_PRIMARY */ nla_total_size(sizeof(u16)) + /* IFLA_NETKIT_HEADROOM */ nla_total_size(sizeof(u16)) + /* IFLA_NETKIT_TAILROOM */ + nla_total_size(sizeof(u32)) + /* IFLA_NETKIT_PAIRING */ 0; } @@ -951,6 +978,8 @@ static int netkit_fill_info(struct sk_buff *skb, const struct net_device *dev) return -EMSGSIZE; if (nla_put_u16(skb, IFLA_NETKIT_TAILROOM, dev->needed_tailroom)) return -EMSGSIZE; + if (nla_put_u32(skb, IFLA_NETKIT_PAIRING, nk->pair)) + return -EMSGSIZE; if (peer) { nk = netkit_priv(peer); @@ -972,6 +1001,7 @@ static const struct nla_policy netkit_policy[IFLA_NETKIT_MAX + 1] = { [IFLA_NETKIT_TAILROOM] = { .type = NLA_U16 }, [IFLA_NETKIT_SCRUB] = NLA_POLICY_MAX(NLA_U32, NETKIT_SCRUB_DEFAULT), [IFLA_NETKIT_PEER_SCRUB] = NLA_POLICY_MAX(NLA_U32, NETKIT_SCRUB_DEFAULT), + [IFLA_NETKIT_PAIRING] = NLA_POLICY_MAX(NLA_U32, NETKIT_DEVICE_SINGLE), [IFLA_NETKIT_PRIMARY] = { .type = NLA_REJECT, .reject_message = "Primary attribute is read-only" }, }; diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h index 3b491d96e52e..bbd565757298 100644 --- a/include/uapi/linux/if_link.h +++ b/include/uapi/linux/if_link.h @@ -1296,6 +1296,11 @@ enum netkit_mode { NETKIT_L3, }; +enum netkit_pairing { + NETKIT_DEVICE_PAIR, + NETKIT_DEVICE_SINGLE, +}; + /* NETKIT_SCRUB_NONE leaves clearing skb->{mark,priority} up to * the BPF program if attached. This also means the latter can * consume the two fields if they were populated earlier. @@ -1320,6 +1325,7 @@ enum { IFLA_NETKIT_PEER_SCRUB, IFLA_NETKIT_HEADROOM, IFLA_NETKIT_TAILROOM, + IFLA_NETKIT_PAIRING, __IFLA_NETKIT_MAX, }; #define IFLA_NETKIT_MAX (__IFLA_NETKIT_MAX - 1) -- cgit v1.2.3 From 8766d61a1d33cb5f15bfdd6ce9832bbe1fc649c2 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Tue, 20 Jan 2026 18:04:55 -0800 Subject: Revert "Merge branch 'netkit-support-for-io_uring-zero-copy-and-af_xdp'" This reverts commit 77b9c4a438fc66e2ab004c411056b3fb71a54f2c, reversing changes made to 4515ec4ad58a37e70a9e1256c0b993958c9b7497: 931420a2fc36 ("selftests/net: Add netkit container tests") ab771c938d9a ("selftests/net: Make NetDrvContEnv support queue leasing") 6be87fbb2776 ("selftests/net: Add env for container based tests") 61d99ce3dfc2 ("selftests/net: Add bpf skb forwarding program") 920da3634194 ("netkit: Add xsk support for af_xdp applications") eef51113f8af ("netkit: Add netkit notifier to check for unregistering devices") b5ef109d22d4 ("netkit: Implement rtnl_link_ops->alloc and ndo_queue_create") b5c3fa4a0b16 ("netkit: Add single device mode for netkit") 0073d2fd679d ("xsk: Proxy pool management for leased queues") 1ecea95dd3b5 ("xsk: Extend xsk_rcv_check validation") 804bf334d08a ("net: Proxy netdev_queue_get_dma_dev for leased queues") 0caa9a8ddec3 ("net: Proxy net_mp_{open,close}_rxq for leased queues") ff8889ff9107 ("net, ethtool: Disallow leased real rxqs to be resized") 9e2103f36110 ("net: Add lease info to queue-get response") 31127deddef4 ("net: Implement netdev_nl_queue_create_doit") a5546e18f77c ("net: Add queue-create operation") The series will conflict with io_uring work, and the code needs more polish. Signed-off-by: Jakub Kicinski --- Documentation/netlink/specs/netdev.yaml | 44 --- drivers/net/netkit.c | 360 ++++----------------- include/linux/netdevice.h | 6 - include/net/netdev_queues.h | 19 +- include/net/netdev_rx_queue.h | 21 +- include/net/page_pool/memory_provider.h | 4 +- include/net/xdp_sock_drv.h | 2 +- include/uapi/linux/if_link.h | 6 - include/uapi/linux/netdev.h | 11 - net/core/dev.c | 7 - net/core/dev.h | 2 - net/core/netdev-genl-gen.c | 20 -- net/core/netdev-genl-gen.h | 2 - net/core/netdev-genl.c | 185 ----------- net/core/netdev_queues.c | 74 +---- net/core/netdev_rx_queue.c | 169 ++-------- net/ethtool/channels.c | 12 +- net/ethtool/ioctl.c | 9 +- net/xdp/xsk.c | 79 +---- tools/include/uapi/linux/netdev.h | 11 - tools/testing/selftests/drivers/net/README.rst | 7 - tools/testing/selftests/drivers/net/hw/Makefile | 2 - .../selftests/drivers/net/hw/lib/py/__init__.py | 7 +- .../selftests/drivers/net/hw/nk_forward.bpf.c | 49 --- tools/testing/selftests/drivers/net/hw/nk_netns.py | 23 -- .../testing/selftests/drivers/net/hw/nk_qlease.py | 55 ---- .../selftests/drivers/net/lib/py/__init__.py | 7 +- tools/testing/selftests/drivers/net/lib/py/env.py | 157 --------- 28 files changed, 117 insertions(+), 1233 deletions(-) delete mode 100644 tools/testing/selftests/drivers/net/hw/nk_forward.bpf.c delete mode 100755 tools/testing/selftests/drivers/net/hw/nk_netns.py delete mode 100755 tools/testing/selftests/drivers/net/hw/nk_qlease.py (limited to 'include/uapi/linux') diff --git a/Documentation/netlink/specs/netdev.yaml b/Documentation/netlink/specs/netdev.yaml index b86db8656eac..596c306ce52b 100644 --- a/Documentation/netlink/specs/netdev.yaml +++ b/Documentation/netlink/specs/netdev.yaml @@ -339,15 +339,6 @@ attribute-sets: doc: XSK information for this queue, if any. type: nest nested-attributes: xsk-info - - - name: lease - doc: | - A queue from a virtual device can have a lease which refers to - another queue from a physical device. This is useful for memory - providers and AF_XDP operations which take an ifindex and queue id - to allow applications to bind against virtual devices in containers. - type: nest - nested-attributes: lease - name: qstats doc: | @@ -546,24 +537,6 @@ attribute-sets: name: id - name: type - - - name: lease - attributes: - - - name: ifindex - doc: The netdev ifindex to lease the queue from. - type: u32 - checks: - min: 1 - - - name: queue - doc: The netdev queue to lease from. - type: nest - nested-attributes: queue-id - - - name: netns-id - doc: The network namespace id of the netdev. - type: s32 - name: dmabuf attributes: @@ -713,7 +686,6 @@ operations: - dmabuf - io-uring - xsk - - lease dump: request: attributes: @@ -825,22 +797,6 @@ operations: reply: attributes: - id - - - name: queue-create - doc: | - Create a new queue for the given netdevice. Whether this operation - is supported depends on the device and the driver. - attribute-set: queue - flags: [admin-perm] - do: - request: - attributes: - - ifindex - - type - - lease - reply: &queue-create-op - attributes: - - id kernel-family: headers: ["net/netdev_netlink.h"] diff --git a/drivers/net/netkit.c b/drivers/net/netkit.c index 0519f855d062..0a2fef7caccb 100644 --- a/drivers/net/netkit.c +++ b/drivers/net/netkit.c @@ -9,21 +9,11 @@ #include #include -#include -#include -#include -#include #include #include #include -#define NETKIT_DRV_NAME "netkit" - -#define NETKIT_NUM_RX_QUEUES_MAX 1024 -#define NETKIT_NUM_TX_QUEUES_MAX 1 - -#define NETKIT_NUM_RX_QUEUES_REAL 1 -#define NETKIT_NUM_TX_QUEUES_REAL 1 +#define DRV_NAME "netkit" struct netkit { __cacheline_group_begin(netkit_fastpath); @@ -36,7 +26,6 @@ struct netkit { __cacheline_group_begin(netkit_slowpath); enum netkit_mode mode; - enum netkit_pairing pair; bool primary; u32 headroom; __cacheline_group_end(netkit_slowpath); @@ -47,8 +36,6 @@ struct netkit_link { struct net_device *dev; }; -static struct rtnl_link_ops netkit_link_ops; - static __always_inline int netkit_run(const struct bpf_mprog_entry *entry, struct sk_buff *skb, enum netkit_action ret) @@ -148,10 +135,6 @@ static int netkit_open(struct net_device *dev) struct netkit *nk = netkit_priv(dev); struct net_device *peer = rtnl_dereference(nk->peer); - if (nk->pair == NETKIT_DEVICE_SINGLE) { - netif_carrier_on(dev); - return 0; - } if (!peer) return -ENOTCONN; if (peer->flags & IFF_UP) { @@ -236,86 +219,9 @@ static void netkit_get_stats(struct net_device *dev, stats->tx_dropped = DEV_STATS_READ(dev, tx_dropped); } -static bool netkit_xsk_supported_at_phys(const struct net_device *dev) -{ - if (!dev->netdev_ops->ndo_bpf || - !dev->netdev_ops->ndo_xdp_xmit || - !dev->netdev_ops->ndo_xsk_wakeup) - return false; - if ((dev->xdp_features & NETDEV_XDP_ACT_XSK) != NETDEV_XDP_ACT_XSK) - return false; - return true; -} - -static int netkit_xsk(struct net_device *dev, struct netdev_bpf *xdp) -{ - struct netkit *nk = netkit_priv(dev); - struct netdev_bpf xdp_lower; - struct netdev_rx_queue *rxq; - struct net_device *phys; - int ret = -EBUSY; - - switch (xdp->command) { - case XDP_SETUP_XSK_POOL: - if (nk->pair == NETKIT_DEVICE_PAIR) - return -EOPNOTSUPP; - if (xdp->xsk.queue_id >= dev->real_num_rx_queues) - return -EINVAL; - - rxq = __netif_get_rx_queue(dev, xdp->xsk.queue_id); - if (!rxq->lease) - return -EOPNOTSUPP; - - phys = rxq->lease->dev; - if (!netkit_xsk_supported_at_phys(phys)) - return -EOPNOTSUPP; - - memcpy(&xdp_lower, xdp, sizeof(xdp_lower)); - xdp_lower.xsk.queue_id = get_netdev_rx_queue_index(rxq->lease); - break; - case XDP_SETUP_PROG: - return -EPERM; - default: - return -EINVAL; - } - - netdev_lock(phys); - if (!dev_get_min_mp_channel_count(phys)) - ret = phys->netdev_ops->ndo_bpf(phys, &xdp_lower); - netdev_unlock(phys); - return ret; -} - -static int netkit_xsk_wakeup(struct net_device *dev, u32 queue_id, u32 flags) -{ - struct netdev_rx_queue *rxq; - struct net_device *phys; - - if (queue_id >= dev->real_num_rx_queues) - return -EINVAL; - - rxq = __netif_get_rx_queue(dev, queue_id); - if (!rxq->lease) - return -EOPNOTSUPP; - - phys = rxq->lease->dev; - if (!netkit_xsk_supported_at_phys(phys)) - return -EOPNOTSUPP; - - return phys->netdev_ops->ndo_xsk_wakeup(phys, - get_netdev_rx_queue_index(rxq->lease), flags); -} - -static int netkit_init(struct net_device *dev) -{ - netdev_lockdep_set_classes(dev); - return 0; -} - static void netkit_uninit(struct net_device *dev); static const struct net_device_ops netkit_netdev_ops = { - .ndo_init = netkit_init, .ndo_open = netkit_open, .ndo_stop = netkit_close, .ndo_start_xmit = netkit_xmit, @@ -326,95 +232,19 @@ static const struct net_device_ops netkit_netdev_ops = { .ndo_get_peer_dev = netkit_peer_dev, .ndo_get_stats64 = netkit_get_stats, .ndo_uninit = netkit_uninit, - .ndo_bpf = netkit_xsk, - .ndo_xsk_wakeup = netkit_xsk_wakeup, .ndo_features_check = passthru_features_check, }; static void netkit_get_drvinfo(struct net_device *dev, struct ethtool_drvinfo *info) { - strscpy(info->driver, NETKIT_DRV_NAME, sizeof(info->driver)); + strscpy(info->driver, DRV_NAME, sizeof(info->driver)); } static const struct ethtool_ops netkit_ethtool_ops = { .get_drvinfo = netkit_get_drvinfo, }; -static int netkit_queue_create(struct net_device *dev) -{ - struct netkit *nk = netkit_priv(dev); - u32 rxq_count_old, rxq_count_new; - int err; - - rxq_count_old = dev->real_num_rx_queues; - rxq_count_new = rxq_count_old + 1; - - /* Only allow to lease a queue in single device mode or to - * lease against the peer device which then ends up in the - * target netns. - */ - if (nk->pair == NETKIT_DEVICE_PAIR && nk->primary) - return -EOPNOTSUPP; - - if (netif_running(dev)) - netif_carrier_off(dev); - err = netif_set_real_num_rx_queues(dev, rxq_count_new); - if (netif_running(dev)) - netif_carrier_on(dev); - - return err ? : rxq_count_old; -} - -static const struct netdev_queue_mgmt_ops netkit_queue_mgmt_ops = { - .ndo_queue_create = netkit_queue_create, -}; - -static struct net_device *netkit_alloc(struct nlattr *tb[], - const char *ifname, - unsigned char name_assign_type, - unsigned int num_tx_queues, - unsigned int num_rx_queues) -{ - const struct rtnl_link_ops *ops = &netkit_link_ops; - struct net_device *dev; - - if (num_tx_queues > NETKIT_NUM_TX_QUEUES_MAX || - num_rx_queues > NETKIT_NUM_RX_QUEUES_MAX) - return ERR_PTR(-EOPNOTSUPP); - - dev = alloc_netdev_mqs(ops->priv_size, ifname, - name_assign_type, ops->setup, - num_tx_queues, num_rx_queues); - if (dev) { - dev->real_num_tx_queues = NETKIT_NUM_TX_QUEUES_REAL; - dev->real_num_rx_queues = NETKIT_NUM_RX_QUEUES_REAL; - } - return dev; -} - -static void netkit_queue_unlease(struct net_device *dev) -{ - struct netdev_rx_queue *rxq, *rxq_lease; - struct net_device *dev_lease; - int i; - - if (dev->real_num_rx_queues == 1) - return; - - netdev_lock(dev); - for (i = 1; i < dev->real_num_rx_queues; i++) { - rxq = __netif_get_rx_queue(dev, i); - rxq_lease = rxq->lease; - dev_lease = rxq_lease->dev; - - netdev_lock(dev_lease); - netdev_rx_queue_unlease(rxq, rxq_lease); - netdev_unlock(dev_lease); - } - netdev_unlock(dev); -} - static void netkit_setup(struct net_device *dev) { static const netdev_features_t netkit_features_hw_vlan = @@ -445,20 +275,18 @@ static void netkit_setup(struct net_device *dev) dev->priv_flags |= IFF_DISABLE_NETPOLL; dev->lltx = true; - dev->netdev_ops = &netkit_netdev_ops; - dev->ethtool_ops = &netkit_ethtool_ops; - dev->queue_mgmt_ops = &netkit_queue_mgmt_ops; + dev->ethtool_ops = &netkit_ethtool_ops; + dev->netdev_ops = &netkit_netdev_ops; dev->features |= netkit_features; dev->hw_features = netkit_features; dev->hw_enc_features = netkit_features; dev->mpls_features = NETIF_F_HW_CSUM | NETIF_F_GSO_SOFTWARE; dev->vlan_features = dev->features & ~netkit_features_hw_vlan; + dev->needs_free_netdev = true; netif_set_tso_max_size(dev, GSO_MAX_SIZE); - - xdp_set_features_flag(dev, NETDEV_XDP_ACT_XSK); } static struct net *netkit_get_link_net(const struct net_device *dev) @@ -497,6 +325,8 @@ static int netkit_validate(struct nlattr *tb[], struct nlattr *data[], return 0; } +static struct rtnl_link_ops netkit_link_ops; + static int netkit_new_link(struct net_device *dev, struct rtnl_newlink_params *params, struct netlink_ext_ack *extack) @@ -505,7 +335,6 @@ static int netkit_new_link(struct net_device *dev, enum netkit_scrub scrub_prim = NETKIT_SCRUB_DEFAULT; enum netkit_scrub scrub_peer = NETKIT_SCRUB_DEFAULT; struct nlattr *peer_tb[IFLA_MAX + 1], **tbp, *attr; - enum netkit_pairing pair = NETKIT_DEVICE_PAIR; enum netkit_action policy_prim = NETKIT_PASS; enum netkit_action policy_peer = NETKIT_PASS; struct nlattr **data = params->data; @@ -514,8 +343,7 @@ static int netkit_new_link(struct net_device *dev, struct nlattr **tb = params->tb; u16 headroom = 0, tailroom = 0; struct ifinfomsg *ifmp = NULL; - struct net_device *peer = NULL; - bool seen_peer = false; + struct net_device *peer; char ifname[IFNAMSIZ]; struct netkit *nk; int err; @@ -552,12 +380,6 @@ static int netkit_new_link(struct net_device *dev, headroom = nla_get_u16(data[IFLA_NETKIT_HEADROOM]); if (data[IFLA_NETKIT_TAILROOM]) tailroom = nla_get_u16(data[IFLA_NETKIT_TAILROOM]); - if (data[IFLA_NETKIT_PAIRING]) - pair = nla_get_u32(data[IFLA_NETKIT_PAIRING]); - - seen_peer = data[IFLA_NETKIT_PEER_INFO] || - data[IFLA_NETKIT_PEER_SCRUB] || - data[IFLA_NETKIT_PEER_POLICY]; } if (ifmp && tbp[IFLA_IFNAME]) { @@ -570,46 +392,45 @@ static int netkit_new_link(struct net_device *dev, if (mode != NETKIT_L2 && (tb[IFLA_ADDRESS] || tbp[IFLA_ADDRESS])) return -EOPNOTSUPP; - if (pair == NETKIT_DEVICE_SINGLE && - (tb != tbp || seen_peer || policy_prim != NETKIT_PASS)) - return -EOPNOTSUPP; - if (pair == NETKIT_DEVICE_PAIR) { - peer = rtnl_create_link(peer_net, ifname, ifname_assign_type, - &netkit_link_ops, tbp, extack); - if (IS_ERR(peer)) - return PTR_ERR(peer); - - netif_inherit_tso_max(peer, dev); - if (headroom) - peer->needed_headroom = headroom; - if (tailroom) - peer->needed_tailroom = tailroom; - if (mode == NETKIT_L2 && !(ifmp && tbp[IFLA_ADDRESS])) - eth_hw_addr_random(peer); - if (ifmp && dev->ifindex) - peer->ifindex = ifmp->ifi_index; + peer = rtnl_create_link(peer_net, ifname, ifname_assign_type, + &netkit_link_ops, tbp, extack); + if (IS_ERR(peer)) + return PTR_ERR(peer); - nk = netkit_priv(peer); - nk->primary = false; - nk->policy = policy_peer; - nk->scrub = scrub_peer; - nk->mode = mode; - nk->pair = pair; - nk->headroom = headroom; - bpf_mprog_bundle_init(&nk->bundle); - - err = register_netdevice(peer); - if (err < 0) - goto err_register_peer; - netif_carrier_off(peer); - if (mode == NETKIT_L2) - dev_change_flags(peer, peer->flags & ~IFF_NOARP, NULL); - - err = rtnl_configure_link(peer, NULL, 0, NULL); - if (err < 0) - goto err_configure_peer; + netif_inherit_tso_max(peer, dev); + if (headroom) { + peer->needed_headroom = headroom; + dev->needed_headroom = headroom; } + if (tailroom) { + peer->needed_tailroom = tailroom; + dev->needed_tailroom = tailroom; + } + + if (mode == NETKIT_L2 && !(ifmp && tbp[IFLA_ADDRESS])) + eth_hw_addr_random(peer); + if (ifmp && dev->ifindex) + peer->ifindex = ifmp->ifi_index; + + nk = netkit_priv(peer); + nk->primary = false; + nk->policy = policy_peer; + nk->scrub = scrub_peer; + nk->mode = mode; + nk->headroom = headroom; + bpf_mprog_bundle_init(&nk->bundle); + + err = register_netdevice(peer); + if (err < 0) + goto err_register_peer; + netif_carrier_off(peer); + if (mode == NETKIT_L2) + dev_change_flags(peer, peer->flags & ~IFF_NOARP, NULL); + + err = rtnl_configure_link(peer, NULL, 0, NULL); + if (err < 0) + goto err_configure_peer; if (mode == NETKIT_L2 && !tb[IFLA_ADDRESS]) eth_hw_addr_random(dev); @@ -617,17 +438,12 @@ static int netkit_new_link(struct net_device *dev, nla_strscpy(dev->name, tb[IFLA_IFNAME], IFNAMSIZ); else strscpy(dev->name, "nk%d", IFNAMSIZ); - if (headroom) - dev->needed_headroom = headroom; - if (tailroom) - dev->needed_tailroom = tailroom; nk = netkit_priv(dev); nk->primary = true; nk->policy = policy_prim; nk->scrub = scrub_prim; nk->mode = mode; - nk->pair = pair; nk->headroom = headroom; bpf_mprog_bundle_init(&nk->bundle); @@ -639,12 +455,10 @@ static int netkit_new_link(struct net_device *dev, dev_change_flags(dev, dev->flags & ~IFF_NOARP, NULL); rcu_assign_pointer(netkit_priv(dev)->peer, peer); - if (peer) - rcu_assign_pointer(netkit_priv(peer)->peer, dev); + rcu_assign_pointer(netkit_priv(peer)->peer, dev); return 0; err_configure_peer: - if (peer) - unregister_netdevice(peer); + unregister_netdevice(peer); return err; err_register_peer: free_netdev(peer); @@ -704,8 +518,6 @@ static struct net_device *netkit_dev_fetch(struct net *net, u32 ifindex, u32 whi nk = netkit_priv(dev); if (!nk->primary) return ERR_PTR(-EACCES); - if (nk->pair == NETKIT_DEVICE_SINGLE) - return ERR_PTR(-EOPNOTSUPP); if (which == BPF_NETKIT_PEER) { dev = rcu_dereference_rtnl(nk->peer); if (!dev) @@ -1032,7 +844,6 @@ static void netkit_release_all(struct net_device *dev) static void netkit_uninit(struct net_device *dev) { netkit_release_all(dev); - netkit_queue_unlease(dev); } static void netkit_del_link(struct net_device *dev, struct list_head *head) @@ -1068,7 +879,6 @@ static int netkit_change_link(struct net_device *dev, struct nlattr *tb[], { IFLA_NETKIT_PEER_INFO, "peer info" }, { IFLA_NETKIT_HEADROOM, "headroom" }, { IFLA_NETKIT_TAILROOM, "tailroom" }, - { IFLA_NETKIT_PAIRING, "pairing" }, }; if (!nk->primary) { @@ -1088,11 +898,9 @@ static int netkit_change_link(struct net_device *dev, struct nlattr *tb[], } if (data[IFLA_NETKIT_POLICY]) { - err = -EOPNOTSUPP; attr = data[IFLA_NETKIT_POLICY]; policy = nla_get_u32(attr); - if (nk->pair == NETKIT_DEVICE_PAIR) - err = netkit_check_policy(policy, attr, extack); + err = netkit_check_policy(policy, attr, extack); if (err) return err; WRITE_ONCE(nk->policy, policy); @@ -1113,48 +921,6 @@ static int netkit_change_link(struct net_device *dev, struct nlattr *tb[], return 0; } -static void netkit_check_lease_unregister(struct net_device *dev) -{ - LIST_HEAD(list_kill); - u32 q_idx; - - if (READ_ONCE(dev->reg_state) != NETREG_UNREGISTERING || - !dev->dev.parent) - return; - - netdev_lock_ops(dev); - for (q_idx = 0; q_idx < dev->real_num_rx_queues; q_idx++) { - struct net_device *tmp = dev; - u32 tmp_q_idx = q_idx; - - if (netif_rx_queue_lease_get_owner(&tmp, &tmp_q_idx)) { - if (tmp->netdev_ops != &netkit_netdev_ops) - continue; - /* A single phys device can have multiple queues leased - * to one netkit device. We can only queue that netkit - * device once to the list_kill. Queues of that phys - * device can be leased with different individual netkit - * devices, hence we batch via list_kill. - */ - if (unregister_netdevice_queued(tmp)) - continue; - netkit_del_link(tmp, &list_kill); - } - } - netdev_unlock_ops(dev); - unregister_netdevice_many(&list_kill); -} - -static int netkit_notifier(struct notifier_block *this, - unsigned long event, void *ptr) -{ - struct net_device *dev = netdev_notifier_info_to_dev(ptr); - - if (event == NETDEV_UNREGISTER) - netkit_check_lease_unregister(dev); - return NOTIFY_DONE; -} - static size_t netkit_get_size(const struct net_device *dev) { return nla_total_size(sizeof(u32)) + /* IFLA_NETKIT_POLICY */ @@ -1165,7 +931,6 @@ static size_t netkit_get_size(const struct net_device *dev) nla_total_size(sizeof(u8)) + /* IFLA_NETKIT_PRIMARY */ nla_total_size(sizeof(u16)) + /* IFLA_NETKIT_HEADROOM */ nla_total_size(sizeof(u16)) + /* IFLA_NETKIT_TAILROOM */ - nla_total_size(sizeof(u32)) + /* IFLA_NETKIT_PAIRING */ 0; } @@ -1186,8 +951,6 @@ static int netkit_fill_info(struct sk_buff *skb, const struct net_device *dev) return -EMSGSIZE; if (nla_put_u16(skb, IFLA_NETKIT_TAILROOM, dev->needed_tailroom)) return -EMSGSIZE; - if (nla_put_u32(skb, IFLA_NETKIT_PAIRING, nk->pair)) - return -EMSGSIZE; if (peer) { nk = netkit_priv(peer); @@ -1209,15 +972,13 @@ static const struct nla_policy netkit_policy[IFLA_NETKIT_MAX + 1] = { [IFLA_NETKIT_TAILROOM] = { .type = NLA_U16 }, [IFLA_NETKIT_SCRUB] = NLA_POLICY_MAX(NLA_U32, NETKIT_SCRUB_DEFAULT), [IFLA_NETKIT_PEER_SCRUB] = NLA_POLICY_MAX(NLA_U32, NETKIT_SCRUB_DEFAULT), - [IFLA_NETKIT_PAIRING] = NLA_POLICY_MAX(NLA_U32, NETKIT_DEVICE_SINGLE), [IFLA_NETKIT_PRIMARY] = { .type = NLA_REJECT, .reject_message = "Primary attribute is read-only" }, }; static struct rtnl_link_ops netkit_link_ops = { - .kind = NETKIT_DRV_NAME, + .kind = DRV_NAME, .priv_size = sizeof(struct netkit), - .alloc = netkit_alloc, .setup = netkit_setup, .newlink = netkit_new_link, .dellink = netkit_del_link, @@ -1231,39 +992,26 @@ static struct rtnl_link_ops netkit_link_ops = { .maxtype = IFLA_NETKIT_MAX, }; -static struct notifier_block netkit_netdev_notifier = { - .notifier_call = netkit_notifier, -}; - -static __init int netkit_mod_init(void) +static __init int netkit_init(void) { - int ret; - BUILD_BUG_ON((int)NETKIT_NEXT != (int)TCX_NEXT || (int)NETKIT_PASS != (int)TCX_PASS || (int)NETKIT_DROP != (int)TCX_DROP || (int)NETKIT_REDIRECT != (int)TCX_REDIRECT); - ret = rtnl_link_register(&netkit_link_ops); - if (ret) - return ret; - ret = register_netdevice_notifier(&netkit_netdev_notifier); - if (ret) - rtnl_link_unregister(&netkit_link_ops); - return ret; + return rtnl_link_register(&netkit_link_ops); } -static __exit void netkit_mod_exit(void) +static __exit void netkit_exit(void) { - unregister_netdevice_notifier(&netkit_netdev_notifier); rtnl_link_unregister(&netkit_link_ops); } -module_init(netkit_mod_init); -module_exit(netkit_mod_exit); +module_init(netkit_init); +module_exit(netkit_exit); MODULE_DESCRIPTION("BPF-programmable network device"); MODULE_AUTHOR("Daniel Borkmann "); MODULE_AUTHOR("Nikolay Aleksandrov "); MODULE_LICENSE("GPL"); -MODULE_ALIAS_RTNL_LINK(NETKIT_DRV_NAME); +MODULE_ALIAS_RTNL_LINK(DRV_NAME); diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 4d146c000e21..d99b0fbc1942 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -3400,17 +3400,11 @@ static inline int dev_direct_xmit(struct sk_buff *skb, u16 queue_id) int register_netdevice(struct net_device *dev); void unregister_netdevice_queue(struct net_device *dev, struct list_head *head); void unregister_netdevice_many(struct list_head *head); - static inline void unregister_netdevice(struct net_device *dev) { unregister_netdevice_queue(dev, NULL); } -static inline bool unregister_netdevice_queued(const struct net_device *dev) -{ - return !list_empty(&dev->unreg_list); -} - int netdev_refcnt_read(const struct net_device *dev); void free_netdev(struct net_device *dev); diff --git a/include/net/netdev_queues.h b/include/net/netdev_queues.h index 81dc7cb2360c..b55d3b9cb9c2 100644 --- a/include/net/netdev_queues.h +++ b/include/net/netdev_queues.h @@ -130,11 +130,6 @@ void netdev_stat_queue_sum(struct net_device *netdev, * @ndo_queue_get_dma_dev: Get dma device for zero-copy operations to be used * for this queue. Return NULL on error. * - * @ndo_queue_create: Create a new RX queue which can be leased to another queue. - * Ops on this queue are redirected to the leased queue e.g. - * when opening a memory provider. Return the new queue id on - * success. Return negative error code on failure. - * * Note that @ndo_queue_mem_alloc and @ndo_queue_mem_free may be called while * the interface is closed. @ndo_queue_start and @ndo_queue_stop will only * be called for an interface which is open. @@ -154,12 +149,9 @@ struct netdev_queue_mgmt_ops { int idx); struct device * (*ndo_queue_get_dma_dev)(struct net_device *dev, int idx); - int (*ndo_queue_create)(struct net_device *dev); }; -bool netif_rxq_has_unreadable_mp(struct net_device *dev, unsigned int rxq_idx); -bool netif_rxq_has_mp(struct net_device *dev, unsigned int rxq_idx); -bool netif_rxq_is_leased(struct net_device *dev, unsigned int rxq_idx); +bool netif_rxq_has_unreadable_mp(struct net_device *dev, int idx); /** * DOC: Lockless queue stopping / waking helpers. @@ -348,10 +340,5 @@ static inline unsigned int netif_xmit_timeout_ms(struct netdev_queue *txq) }) struct device *netdev_queue_get_dma_dev(struct net_device *dev, int idx); -bool netdev_can_create_queue(const struct net_device *dev, - struct netlink_ext_ack *extack); -bool netdev_can_lease_queue(const struct net_device *dev, - struct netlink_ext_ack *extack); -bool netdev_queue_busy(struct net_device *dev, int idx, - struct netlink_ext_ack *extack); -#endif /* _LINUX_NET_QUEUES_H */ + +#endif diff --git a/include/net/netdev_rx_queue.h b/include/net/netdev_rx_queue.h index 508d11afaecb..8cdcd138b33f 100644 --- a/include/net/netdev_rx_queue.h +++ b/include/net/netdev_rx_queue.h @@ -28,8 +28,6 @@ struct netdev_rx_queue { #endif struct napi_struct *napi; struct pp_memory_provider_params mp_params; - struct netdev_rx_queue *lease; - netdevice_tracker lease_tracker; } ____cacheline_aligned_in_smp; /* @@ -59,22 +57,5 @@ get_netdev_rx_queue_index(struct netdev_rx_queue *queue) } int netdev_rx_queue_restart(struct net_device *dev, unsigned int rxq); -void netdev_rx_queue_lease(struct netdev_rx_queue *rxq_dst, - struct netdev_rx_queue *rxq_src); -void netdev_rx_queue_unlease(struct netdev_rx_queue *rxq_dst, - struct netdev_rx_queue *rxq_src); -bool netif_rx_queue_lease_get_owner(struct net_device **dev, unsigned int *rxq); -enum netif_lease_dir { - NETIF_VIRT_TO_PHYS, - NETIF_PHYS_TO_VIRT, -}; - -struct netdev_rx_queue * -__netif_get_rx_queue_lease(struct net_device **dev, unsigned int *rxq, - enum netif_lease_dir dir); -struct netdev_rx_queue * -netif_get_rx_queue_lease_locked(struct net_device **dev, unsigned int *rxq); -void netif_put_rx_queue_lease_locked(struct net_device *orig_dev, - struct net_device *dev); -#endif /* _LINUX_NETDEV_RX_QUEUE_H */ +#endif diff --git a/include/net/page_pool/memory_provider.h b/include/net/page_pool/memory_provider.h index b6f811c3416b..ada4f968960a 100644 --- a/include/net/page_pool/memory_provider.h +++ b/include/net/page_pool/memory_provider.h @@ -23,12 +23,12 @@ bool net_mp_niov_set_dma_addr(struct net_iov *niov, dma_addr_t addr); void net_mp_niov_set_page_pool(struct page_pool *pool, struct net_iov *niov); void net_mp_niov_clear_page_pool(struct net_iov *niov); -int net_mp_open_rxq(struct net_device *dev, unsigned int rxq_idx, +int net_mp_open_rxq(struct net_device *dev, unsigned ifq_idx, struct pp_memory_provider_params *p); int __net_mp_open_rxq(struct net_device *dev, unsigned int rxq_idx, const struct pp_memory_provider_params *p, struct netlink_ext_ack *extack); -void net_mp_close_rxq(struct net_device *dev, unsigned int rxq_idx, +void net_mp_close_rxq(struct net_device *dev, unsigned ifq_idx, struct pp_memory_provider_params *old_p); void __net_mp_close_rxq(struct net_device *dev, unsigned int rxq_idx, const struct pp_memory_provider_params *old_p); diff --git a/include/net/xdp_sock_drv.h b/include/net/xdp_sock_drv.h index c07cfb431eac..242e34f771cc 100644 --- a/include/net/xdp_sock_drv.h +++ b/include/net/xdp_sock_drv.h @@ -28,7 +28,7 @@ void xsk_tx_completed(struct xsk_buff_pool *pool, u32 nb_entries); bool xsk_tx_peek_desc(struct xsk_buff_pool *pool, struct xdp_desc *desc); u32 xsk_tx_peek_release_desc_batch(struct xsk_buff_pool *pool, u32 max); void xsk_tx_release(struct xsk_buff_pool *pool); -struct xsk_buff_pool *xsk_get_pool_from_qid(const struct net_device *dev, +struct xsk_buff_pool *xsk_get_pool_from_qid(struct net_device *dev, u16 queue_id); void xsk_set_rx_need_wakeup(struct xsk_buff_pool *pool); void xsk_set_tx_need_wakeup(struct xsk_buff_pool *pool); diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h index bbd565757298..3b491d96e52e 100644 --- a/include/uapi/linux/if_link.h +++ b/include/uapi/linux/if_link.h @@ -1296,11 +1296,6 @@ enum netkit_mode { NETKIT_L3, }; -enum netkit_pairing { - NETKIT_DEVICE_PAIR, - NETKIT_DEVICE_SINGLE, -}; - /* NETKIT_SCRUB_NONE leaves clearing skb->{mark,priority} up to * the BPF program if attached. This also means the latter can * consume the two fields if they were populated earlier. @@ -1325,7 +1320,6 @@ enum { IFLA_NETKIT_PEER_SCRUB, IFLA_NETKIT_HEADROOM, IFLA_NETKIT_TAILROOM, - IFLA_NETKIT_PAIRING, __IFLA_NETKIT_MAX, }; #define IFLA_NETKIT_MAX (__IFLA_NETKIT_MAX - 1) diff --git a/include/uapi/linux/netdev.h b/include/uapi/linux/netdev.h index 7df1056a35fd..e0b579a1df4f 100644 --- a/include/uapi/linux/netdev.h +++ b/include/uapi/linux/netdev.h @@ -160,7 +160,6 @@ enum { NETDEV_A_QUEUE_DMABUF, NETDEV_A_QUEUE_IO_URING, NETDEV_A_QUEUE_XSK, - NETDEV_A_QUEUE_LEASE, __NETDEV_A_QUEUE_MAX, NETDEV_A_QUEUE_MAX = (__NETDEV_A_QUEUE_MAX - 1) @@ -203,15 +202,6 @@ enum { NETDEV_A_QSTATS_MAX = (__NETDEV_A_QSTATS_MAX - 1) }; -enum { - NETDEV_A_LEASE_IFINDEX = 1, - NETDEV_A_LEASE_QUEUE, - NETDEV_A_LEASE_NETNS_ID, - - __NETDEV_A_LEASE_MAX, - NETDEV_A_LEASE_MAX = (__NETDEV_A_LEASE_MAX - 1) -}; - enum { NETDEV_A_DMABUF_IFINDEX = 1, NETDEV_A_DMABUF_QUEUES, @@ -238,7 +228,6 @@ enum { NETDEV_CMD_BIND_RX, NETDEV_CMD_NAPI_SET, NETDEV_CMD_BIND_TX, - NETDEV_CMD_QUEUE_CREATE, __NETDEV_CMD_MAX, NETDEV_CMD_MAX = (__NETDEV_CMD_MAX - 1) diff --git a/net/core/dev.c b/net/core/dev.c index 13a3de63a825..2661b68f5be3 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -1114,13 +1114,6 @@ netdev_get_by_index_lock_ops_compat(struct net *net, int ifindex) return __netdev_put_lock_ops_compat(dev, net); } -struct net_device * -netdev_put_lock(struct net_device *dev, netdevice_tracker *tracker) -{ - netdev_tracker_free(dev, tracker); - return __netdev_put_lock(dev, dev_net(dev)); -} - struct net_device * netdev_xa_find_lock(struct net *net, struct net_device *dev, unsigned long *index) diff --git a/net/core/dev.h b/net/core/dev.h index 9bcb76b325d0..da18536cbd35 100644 --- a/net/core/dev.h +++ b/net/core/dev.h @@ -30,8 +30,6 @@ netdev_napi_by_id_lock(struct net *net, unsigned int napi_id); struct net_device *dev_get_by_napi_id(unsigned int napi_id); struct net_device *__netdev_put_lock(struct net_device *dev, struct net *net); -struct net_device *netdev_put_lock(struct net_device *dev, - netdevice_tracker *tracker); struct net_device * netdev_xa_find_lock(struct net *net, struct net_device *dev, unsigned long *index); diff --git a/net/core/netdev-genl-gen.c b/net/core/netdev-genl-gen.c index 52ba99c019e7..ba673e81716f 100644 --- a/net/core/netdev-genl-gen.c +++ b/net/core/netdev-genl-gen.c @@ -28,12 +28,6 @@ static const struct netlink_range_validation netdev_a_napi_defer_hard_irqs_range }; /* Common nested types */ -const struct nla_policy netdev_lease_nl_policy[NETDEV_A_LEASE_NETNS_ID + 1] = { - [NETDEV_A_LEASE_IFINDEX] = NLA_POLICY_MIN(NLA_U32, 1), - [NETDEV_A_LEASE_QUEUE] = NLA_POLICY_NESTED(netdev_queue_id_nl_policy), - [NETDEV_A_LEASE_NETNS_ID] = { .type = NLA_S32, }, -}; - const struct nla_policy netdev_page_pool_info_nl_policy[NETDEV_A_PAGE_POOL_IFINDEX + 1] = { [NETDEV_A_PAGE_POOL_ID] = NLA_POLICY_FULL_RANGE(NLA_UINT, &netdev_a_page_pool_id_range), [NETDEV_A_PAGE_POOL_IFINDEX] = NLA_POLICY_FULL_RANGE(NLA_U32, &netdev_a_page_pool_ifindex_range), @@ -113,13 +107,6 @@ static const struct nla_policy netdev_bind_tx_nl_policy[NETDEV_A_DMABUF_FD + 1] [NETDEV_A_DMABUF_FD] = { .type = NLA_U32, }, }; -/* NETDEV_CMD_QUEUE_CREATE - do */ -static const struct nla_policy netdev_queue_create_nl_policy[NETDEV_A_QUEUE_LEASE + 1] = { - [NETDEV_A_QUEUE_IFINDEX] = NLA_POLICY_MIN(NLA_U32, 1), - [NETDEV_A_QUEUE_TYPE] = NLA_POLICY_MAX(NLA_U32, 1), - [NETDEV_A_QUEUE_LEASE] = NLA_POLICY_NESTED(netdev_lease_nl_policy), -}; - /* Ops table for netdev */ static const struct genl_split_ops netdev_nl_ops[] = { { @@ -218,13 +205,6 @@ static const struct genl_split_ops netdev_nl_ops[] = { .maxattr = NETDEV_A_DMABUF_FD, .flags = GENL_CMD_CAP_DO, }, - { - .cmd = NETDEV_CMD_QUEUE_CREATE, - .doit = netdev_nl_queue_create_doit, - .policy = netdev_queue_create_nl_policy, - .maxattr = NETDEV_A_QUEUE_LEASE, - .flags = GENL_ADMIN_PERM | GENL_CMD_CAP_DO, - }, }; static const struct genl_multicast_group netdev_nl_mcgrps[] = { diff --git a/net/core/netdev-genl-gen.h b/net/core/netdev-genl-gen.h index d71b435d72c1..cffc08517a41 100644 --- a/net/core/netdev-genl-gen.h +++ b/net/core/netdev-genl-gen.h @@ -14,7 +14,6 @@ #include /* Common nested types */ -extern const struct nla_policy netdev_lease_nl_policy[NETDEV_A_LEASE_NETNS_ID + 1]; extern const struct nla_policy netdev_page_pool_info_nl_policy[NETDEV_A_PAGE_POOL_IFINDEX + 1]; extern const struct nla_policy netdev_queue_id_nl_policy[NETDEV_A_QUEUE_TYPE + 1]; @@ -37,7 +36,6 @@ int netdev_nl_qstats_get_dumpit(struct sk_buff *skb, int netdev_nl_bind_rx_doit(struct sk_buff *skb, struct genl_info *info); int netdev_nl_napi_set_doit(struct sk_buff *skb, struct genl_info *info); int netdev_nl_bind_tx_doit(struct sk_buff *skb, struct genl_info *info); -int netdev_nl_queue_create_doit(struct sk_buff *skb, struct genl_info *info); enum { NETDEV_NLGRP_MGMT, diff --git a/net/core/netdev-genl.c b/net/core/netdev-genl.c index 51c830f88f10..470fabbeacd9 100644 --- a/net/core/netdev-genl.c +++ b/net/core/netdev-genl.c @@ -391,11 +391,8 @@ netdev_nl_queue_fill_one(struct sk_buff *rsp, struct net_device *netdev, u32 q_idx, u32 q_type, const struct genl_info *info) { struct pp_memory_provider_params *params; - struct net_device *orig_netdev = netdev; - struct nlattr *nest_lease, *nest_queue; struct netdev_rx_queue *rxq; struct netdev_queue *txq; - u32 lease_q_idx = q_idx; void *hdr; hdr = genlmsg_iput(rsp, info); @@ -413,37 +410,6 @@ netdev_nl_queue_fill_one(struct sk_buff *rsp, struct net_device *netdev, if (nla_put_napi_id(rsp, rxq->napi)) goto nla_put_failure; - if (netif_rx_queue_lease_get_owner(&netdev, &lease_q_idx)) { - struct net *net, *peer_net; - - nest_lease = nla_nest_start(rsp, NETDEV_A_QUEUE_LEASE); - if (!nest_lease) - goto nla_put_failure; - nest_queue = nla_nest_start(rsp, NETDEV_A_LEASE_QUEUE); - if (!nest_queue) - goto nla_put_failure; - if (nla_put_u32(rsp, NETDEV_A_QUEUE_ID, lease_q_idx)) - goto nla_put_failure; - if (nla_put_u32(rsp, NETDEV_A_QUEUE_TYPE, q_type)) - goto nla_put_failure; - nla_nest_end(rsp, nest_queue); - if (nla_put_u32(rsp, NETDEV_A_LEASE_IFINDEX, - READ_ONCE(netdev->ifindex))) - goto nla_put_failure; - rcu_read_lock(); - peer_net = dev_net_rcu(netdev); - net = dev_net_rcu(orig_netdev); - if (!net_eq(net, peer_net)) { - s32 id = peernet2id_alloc(net, peer_net, GFP_ATOMIC); - - if (nla_put_s32(rsp, NETDEV_A_LEASE_NETNS_ID, id)) - goto nla_put_failure_unlock; - } - rcu_read_unlock(); - nla_nest_end(rsp, nest_lease); - netdev = orig_netdev; - } - params = &rxq->mp_params; if (params->mp_ops && params->mp_ops->nl_fill(params->mp_priv, rsp, rxq)) @@ -471,8 +437,6 @@ netdev_nl_queue_fill_one(struct sk_buff *rsp, struct net_device *netdev, return 0; -nla_put_failure_unlock: - rcu_read_unlock(); nla_put_failure: genlmsg_cancel(rsp, hdr); return -EMSGSIZE; @@ -1156,155 +1120,6 @@ err_genlmsg_free: return err; } -int netdev_nl_queue_create_doit(struct sk_buff *skb, struct genl_info *info) -{ - const int qmaxtype = ARRAY_SIZE(netdev_queue_id_nl_policy) - 1; - const int lmaxtype = ARRAY_SIZE(netdev_lease_nl_policy) - 1; - int err, ifindex, ifindex_lease, queue_id, queue_id_lease; - struct nlattr *qtb[ARRAY_SIZE(netdev_queue_id_nl_policy)]; - struct nlattr *ltb[ARRAY_SIZE(netdev_lease_nl_policy)]; - struct netdev_rx_queue *rxq, *rxq_lease; - struct net_device *dev, *dev_lease; - netdevice_tracker dev_tracker; - struct nlattr *nest; - struct sk_buff *rsp; - void *hdr; - - if (GENL_REQ_ATTR_CHECK(info, NETDEV_A_QUEUE_IFINDEX) || - GENL_REQ_ATTR_CHECK(info, NETDEV_A_QUEUE_TYPE) || - GENL_REQ_ATTR_CHECK(info, NETDEV_A_QUEUE_LEASE)) - return -EINVAL; - if (nla_get_u32(info->attrs[NETDEV_A_QUEUE_TYPE]) != - NETDEV_QUEUE_TYPE_RX) { - NL_SET_BAD_ATTR(info->extack, info->attrs[NETDEV_A_QUEUE_TYPE]); - return -EINVAL; - } - - ifindex = nla_get_u32(info->attrs[NETDEV_A_QUEUE_IFINDEX]); - - nest = info->attrs[NETDEV_A_QUEUE_LEASE]; - err = nla_parse_nested(ltb, lmaxtype, nest, - netdev_lease_nl_policy, info->extack); - if (err < 0) - return err; - if (NL_REQ_ATTR_CHECK(info->extack, nest, ltb, NETDEV_A_LEASE_IFINDEX) || - NL_REQ_ATTR_CHECK(info->extack, nest, ltb, NETDEV_A_LEASE_QUEUE)) - return -EINVAL; - if (ltb[NETDEV_A_LEASE_NETNS_ID]) { - NL_SET_BAD_ATTR(info->extack, ltb[NETDEV_A_LEASE_NETNS_ID]); - return -EINVAL; - } - - ifindex_lease = nla_get_u32(ltb[NETDEV_A_LEASE_IFINDEX]); - - nest = ltb[NETDEV_A_LEASE_QUEUE]; - err = nla_parse_nested(qtb, qmaxtype, nest, - netdev_queue_id_nl_policy, info->extack); - if (err < 0) - return err; - if (NL_REQ_ATTR_CHECK(info->extack, nest, qtb, NETDEV_A_QUEUE_ID) || - NL_REQ_ATTR_CHECK(info->extack, nest, qtb, NETDEV_A_QUEUE_TYPE)) - return -EINVAL; - if (nla_get_u32(qtb[NETDEV_A_QUEUE_TYPE]) != NETDEV_QUEUE_TYPE_RX) { - NL_SET_BAD_ATTR(info->extack, qtb[NETDEV_A_QUEUE_TYPE]); - return -EINVAL; - } - if (ifindex == ifindex_lease) { - NL_SET_ERR_MSG(info->extack, - "Lease ifindex cannot be the same as queue creation ifindex"); - return -EINVAL; - } - - queue_id_lease = nla_get_u32(qtb[NETDEV_A_QUEUE_ID]); - - rsp = genlmsg_new(GENLMSG_DEFAULT_SIZE, GFP_KERNEL); - if (!rsp) - return -ENOMEM; - - hdr = genlmsg_iput(rsp, info); - if (!hdr) { - err = -EMSGSIZE; - goto err_genlmsg_free; - } - - /* Locking order is always from the virtual to the physical device - * since this is also the same order when applications open the - * memory provider later on. - */ - dev = netdev_get_by_index_lock(genl_info_net(info), ifindex); - if (!dev) { - err = -ENODEV; - goto err_genlmsg_free; - } - if (!netdev_can_create_queue(dev, info->extack)) { - err = -EINVAL; - goto err_unlock_dev; - } - - dev_lease = netdev_get_by_index(genl_info_net(info), ifindex_lease, - &dev_tracker, GFP_KERNEL); - if (!dev_lease) { - err = -ENODEV; - goto err_unlock_dev; - } - if (!netdev_can_lease_queue(dev_lease, info->extack)) { - netdev_put(dev_lease, &dev_tracker); - err = -EINVAL; - goto err_unlock_dev; - } - - dev_lease = netdev_put_lock(dev_lease, &dev_tracker); - if (!dev_lease) { - err = -ENODEV; - goto err_unlock_dev; - } - if (queue_id_lease >= dev_lease->real_num_rx_queues) { - err = -ERANGE; - NL_SET_BAD_ATTR(info->extack, qtb[NETDEV_A_QUEUE_ID]); - goto err_unlock_dev_lease; - } - if (netdev_queue_busy(dev_lease, queue_id_lease, info->extack)) { - err = -EBUSY; - goto err_unlock_dev_lease; - } - - rxq_lease = __netif_get_rx_queue(dev_lease, queue_id_lease); - rxq = __netif_get_rx_queue(dev, dev->real_num_rx_queues - 1); - - if (rxq->lease && rxq->lease->dev != dev_lease) { - err = -EOPNOTSUPP; - NL_SET_ERR_MSG(info->extack, - "Leasing multiple queues from different devices not supported"); - goto err_unlock_dev_lease; - } - - err = queue_id = dev->queue_mgmt_ops->ndo_queue_create(dev); - if (err < 0) { - NL_SET_ERR_MSG(info->extack, - "Device is unable to create a new queue"); - goto err_unlock_dev_lease; - } - - rxq = __netif_get_rx_queue(dev, queue_id); - netdev_rx_queue_lease(rxq, rxq_lease); - - nla_put_u32(rsp, NETDEV_A_QUEUE_ID, queue_id); - genlmsg_end(rsp, hdr); - - netdev_unlock(dev_lease); - netdev_unlock(dev); - - return genlmsg_reply(rsp, info); - -err_unlock_dev_lease: - netdev_unlock(dev_lease); -err_unlock_dev: - netdev_unlock(dev); -err_genlmsg_free: - nlmsg_free(rsp); - return err; -} - void netdev_nl_sock_priv_init(struct netdev_nl_sock *priv) { INIT_LIST_HEAD(&priv->bindings); diff --git a/net/core/netdev_queues.c b/net/core/netdev_queues.c index 97acf6440829..251f27a8307f 100644 --- a/net/core/netdev_queues.c +++ b/net/core/netdev_queues.c @@ -1,37 +1,22 @@ // SPDX-License-Identifier: GPL-2.0-or-later #include -#include -#include /** * netdev_queue_get_dma_dev() - get dma device for zero-copy operations * @dev: net_device * @idx: queue index * - * Get dma device for zero-copy operations to be used for this queue. If the - * queue is leased to a physical queue, we retrieve the latter's dma device. + * Get dma device for zero-copy operations to be used for this queue. * When such device is not available or valid, the function will return NULL. * * Return: Device or NULL on error */ struct device *netdev_queue_get_dma_dev(struct net_device *dev, int idx) { - const struct netdev_queue_mgmt_ops *queue_ops; + const struct netdev_queue_mgmt_ops *queue_ops = dev->queue_mgmt_ops; struct device *dma_dev; - if (idx < dev->real_num_rx_queues) { - struct netdev_rx_queue *rxq = __netif_get_rx_queue(dev, idx); - - if (rxq->lease) { - rxq = rxq->lease; - dev = rxq->dev; - idx = get_netdev_rx_queue_index(rxq); - } - } - - queue_ops = dev->queue_mgmt_ops; - if (queue_ops && queue_ops->ndo_queue_get_dma_dev) dma_dev = queue_ops->ndo_queue_get_dma_dev(dev, idx); else @@ -40,58 +25,3 @@ struct device *netdev_queue_get_dma_dev(struct net_device *dev, int idx) return dma_dev && dma_dev->dma_mask ? dma_dev : NULL; } -bool netdev_can_create_queue(const struct net_device *dev, - struct netlink_ext_ack *extack) -{ - if (dev->dev.parent) { - NL_SET_ERR_MSG(extack, "Device is not a virtual device"); - return false; - } - if (!dev->queue_mgmt_ops || - !dev->queue_mgmt_ops->ndo_queue_create) { - NL_SET_ERR_MSG(extack, "Device does not support queue creation"); - return false; - } - if (dev->real_num_rx_queues < 1 || - dev->real_num_tx_queues < 1) { - NL_SET_ERR_MSG(extack, "Device must have at least one real queue"); - return false; - } - return true; -} - -bool netdev_can_lease_queue(const struct net_device *dev, - struct netlink_ext_ack *extack) -{ - if (!dev->dev.parent) { - NL_SET_ERR_MSG(extack, "Lease device is a virtual device"); - return false; - } - if (!netif_device_present(dev)) { - NL_SET_ERR_MSG(extack, "Lease device has been removed from the system"); - return false; - } - if (!dev->queue_mgmt_ops) { - NL_SET_ERR_MSG(extack, "Lease device does not support queue management operations"); - return false; - } - return true; -} - -bool netdev_queue_busy(struct net_device *dev, int idx, - struct netlink_ext_ack *extack) -{ - if (netif_rxq_is_leased(dev, idx)) { - NL_SET_ERR_MSG(extack, "Lease device queue is already leased"); - return true; - } - if (xsk_get_pool_from_qid(dev, idx)) { - NL_SET_ERR_MSG(extack, "Lease device queue in use by AF_XDP"); - return true; - } - if (netif_rxq_has_mp(dev, idx)) { - NL_SET_ERR_MSG(extack, "Lease device queue in use by memory provider"); - return true; - } - return false; -} diff --git a/net/core/netdev_rx_queue.c b/net/core/netdev_rx_queue.c index 75c7a68cb90d..c7d9341b7630 100644 --- a/net/core/netdev_rx_queue.c +++ b/net/core/netdev_rx_queue.c @@ -9,120 +9,14 @@ #include "page_pool_priv.h" -void netdev_rx_queue_lease(struct netdev_rx_queue *rxq_dst, - struct netdev_rx_queue *rxq_src) -{ - netdev_assert_locked(rxq_src->dev); - netdev_assert_locked(rxq_dst->dev); - - netdev_hold(rxq_src->dev, &rxq_src->lease_tracker, GFP_KERNEL); - - WRITE_ONCE(rxq_src->lease, rxq_dst); - WRITE_ONCE(rxq_dst->lease, rxq_src); -} - -void netdev_rx_queue_unlease(struct netdev_rx_queue *rxq_dst, - struct netdev_rx_queue *rxq_src) -{ - netdev_assert_locked(rxq_dst->dev); - netdev_assert_locked(rxq_src->dev); - - WRITE_ONCE(rxq_src->lease, NULL); - WRITE_ONCE(rxq_dst->lease, NULL); - - netdev_put(rxq_src->dev, &rxq_src->lease_tracker); -} - -bool netif_rxq_is_leased(struct net_device *dev, unsigned int rxq_idx) -{ - if (rxq_idx < dev->real_num_rx_queues) - return READ_ONCE(__netif_get_rx_queue(dev, rxq_idx)->lease); - return false; -} - -static bool netif_lease_dir_ok(const struct net_device *dev, - enum netif_lease_dir dir) -{ - if (dir == NETIF_VIRT_TO_PHYS && !dev->dev.parent) - return true; - if (dir == NETIF_PHYS_TO_VIRT && dev->dev.parent) - return true; - return false; -} - -struct netdev_rx_queue * -__netif_get_rx_queue_lease(struct net_device **dev, unsigned int *rxq_idx, - enum netif_lease_dir dir) -{ - struct net_device *orig_dev = *dev; - struct netdev_rx_queue *rxq = __netif_get_rx_queue(orig_dev, *rxq_idx); - - if (rxq->lease) { - if (!netif_lease_dir_ok(orig_dev, dir)) - return NULL; - rxq = rxq->lease; - *rxq_idx = get_netdev_rx_queue_index(rxq); - *dev = rxq->dev; - } - return rxq; -} - -struct netdev_rx_queue * -netif_get_rx_queue_lease_locked(struct net_device **dev, unsigned int *rxq_idx) -{ - struct net_device *orig_dev = *dev; - struct netdev_rx_queue *rxq; - - /* Locking order is always from the virtual to the physical device - * see netdev_nl_queue_create_doit(). - */ - netdev_ops_assert_locked(orig_dev); - rxq = __netif_get_rx_queue_lease(dev, rxq_idx, NETIF_VIRT_TO_PHYS); - if (rxq && orig_dev != *dev) - netdev_lock(*dev); - return rxq; -} - -void netif_put_rx_queue_lease_locked(struct net_device *orig_dev, - struct net_device *dev) -{ - if (orig_dev != dev) - netdev_unlock(dev); -} - -bool netif_rx_queue_lease_get_owner(struct net_device **dev, - unsigned int *rxq_idx) -{ - struct net_device *orig_dev = *dev; - struct netdev_rx_queue *rxq; - - /* The physical device needs to be locked. If there is indeed a lease, - * then the virtual device holds a reference on the physical device - * and the lease stays active until the virtual device is torn down. - * When queues get {un,}leased both devices are always locked. - */ - netdev_ops_assert_locked(orig_dev); - rxq = __netif_get_rx_queue_lease(dev, rxq_idx, NETIF_PHYS_TO_VIRT); - if (rxq && orig_dev != *dev) - return true; - return false; -} - /* See also page_pool_is_unreadable() */ -bool netif_rxq_has_unreadable_mp(struct net_device *dev, unsigned int rxq_idx) +bool netif_rxq_has_unreadable_mp(struct net_device *dev, int idx) { - if (rxq_idx < dev->real_num_rx_queues) - return __netif_get_rx_queue(dev, rxq_idx)->mp_params.mp_ops; - return false; -} -EXPORT_SYMBOL(netif_rxq_has_unreadable_mp); + struct netdev_rx_queue *rxq = __netif_get_rx_queue(dev, idx); -bool netif_rxq_has_mp(struct net_device *dev, unsigned int rxq_idx) -{ - if (rxq_idx < dev->real_num_rx_queues) - return __netif_get_rx_queue(dev, rxq_idx)->mp_params.mp_priv; - return false; + return !!rxq->mp_params.mp_ops; } +EXPORT_SYMBOL(netif_rxq_has_unreadable_mp); int netdev_rx_queue_restart(struct net_device *dev, unsigned int rxq_idx) { @@ -206,63 +100,49 @@ int __net_mp_open_rxq(struct net_device *dev, unsigned int rxq_idx, const struct pp_memory_provider_params *p, struct netlink_ext_ack *extack) { - struct net_device *orig_dev = dev; struct netdev_rx_queue *rxq; int ret; if (!netdev_need_ops_lock(dev)) return -EOPNOTSUPP; + if (rxq_idx >= dev->real_num_rx_queues) { NL_SET_ERR_MSG(extack, "rx queue index out of range"); return -ERANGE; } - rxq_idx = array_index_nospec(rxq_idx, dev->real_num_rx_queues); - rxq = netif_get_rx_queue_lease_locked(&dev, &rxq_idx); - if (!rxq) { - NL_SET_ERR_MSG(extack, "rx queue peered to a virtual netdev"); - return -EBUSY; - } - if (!dev->dev.parent) { - NL_SET_ERR_MSG(extack, "rx queue is mapped to a virtual netdev"); - ret = -EBUSY; - goto out; - } + if (dev->cfg->hds_config != ETHTOOL_TCP_DATA_SPLIT_ENABLED) { NL_SET_ERR_MSG(extack, "tcp-data-split is disabled"); - ret = -EINVAL; - goto out; + return -EINVAL; } if (dev->cfg->hds_thresh) { NL_SET_ERR_MSG(extack, "hds-thresh is not zero"); - ret = -EINVAL; - goto out; + return -EINVAL; } if (dev_xdp_prog_count(dev)) { NL_SET_ERR_MSG(extack, "unable to custom memory provider to device with XDP program attached"); - ret = -EEXIST; - goto out; + return -EEXIST; } + + rxq = __netif_get_rx_queue(dev, rxq_idx); if (rxq->mp_params.mp_ops) { NL_SET_ERR_MSG(extack, "designated queue already memory provider bound"); - ret = -EEXIST; - goto out; + return -EEXIST; } #ifdef CONFIG_XDP_SOCKETS if (rxq->pool) { NL_SET_ERR_MSG(extack, "designated queue already in use by AF_XDP"); - ret = -EBUSY; - goto out; + return -EBUSY; } #endif + rxq->mp_params = *p; ret = netdev_rx_queue_restart(dev, rxq_idx); if (ret) { rxq->mp_params.mp_ops = NULL; rxq->mp_params.mp_priv = NULL; } -out: - netif_put_rx_queue_lease_locked(orig_dev, dev); return ret; } @@ -277,43 +157,38 @@ int net_mp_open_rxq(struct net_device *dev, unsigned int rxq_idx, return ret; } -void __net_mp_close_rxq(struct net_device *dev, unsigned int rxq_idx, +void __net_mp_close_rxq(struct net_device *dev, unsigned int ifq_idx, const struct pp_memory_provider_params *old_p) { - struct net_device *orig_dev = dev; struct netdev_rx_queue *rxq; int err; - if (WARN_ON_ONCE(rxq_idx >= dev->real_num_rx_queues)) + if (WARN_ON_ONCE(ifq_idx >= dev->real_num_rx_queues)) return; - rxq = netif_get_rx_queue_lease_locked(&dev, &rxq_idx); - if (WARN_ON_ONCE(!rxq)) - return; + rxq = __netif_get_rx_queue(dev, ifq_idx); /* Callers holding a netdev ref may get here after we already * went thru shutdown via dev_memory_provider_uninstall(). */ if (dev->reg_state > NETREG_REGISTERED && !rxq->mp_params.mp_ops) - goto out; + return; if (WARN_ON_ONCE(rxq->mp_params.mp_ops != old_p->mp_ops || rxq->mp_params.mp_priv != old_p->mp_priv)) - goto out; + return; rxq->mp_params.mp_ops = NULL; rxq->mp_params.mp_priv = NULL; - err = netdev_rx_queue_restart(dev, rxq_idx); + err = netdev_rx_queue_restart(dev, ifq_idx); WARN_ON(err && err != -ENETDOWN); -out: - netif_put_rx_queue_lease_locked(orig_dev, dev); } -void net_mp_close_rxq(struct net_device *dev, unsigned int rxq_idx, +void net_mp_close_rxq(struct net_device *dev, unsigned ifq_idx, struct pp_memory_provider_params *old_p) { netdev_lock(dev); - __net_mp_close_rxq(dev, rxq_idx, old_p); + __net_mp_close_rxq(dev, ifq_idx, old_p); netdev_unlock(dev); } diff --git a/net/ethtool/channels.c b/net/ethtool/channels.c index 797d2a08c515..ca4f80282448 100644 --- a/net/ethtool/channels.c +++ b/net/ethtool/channels.c @@ -1,6 +1,6 @@ // SPDX-License-Identifier: GPL-2.0-only -#include +#include #include "netlink.h" #include "common.h" @@ -169,16 +169,14 @@ ethnl_set_channels(struct ethnl_req_info *req_info, struct genl_info *info) if (ret) return ret; - /* ensure channels are not busy at the moment */ + /* Disabling channels, query zero-copy AF_XDP sockets */ from_channel = channels.combined_count + min(channels.rx_count, channels.tx_count); - for (i = from_channel; i < old_total; i++) { - if (netdev_queue_busy(dev, i, NULL)) { - GENL_SET_ERR_MSG(info, - "requested channel counts are too low due to busy queues (AF_XDP or queue leasing)"); + for (i = from_channel; i < old_total; i++) + if (xsk_get_pool_from_qid(dev, i)) { + GENL_SET_ERR_MSG(info, "requested channel counts are too low for existing zerocopy AF_XDP sockets"); return -EINVAL; } - } ret = dev->ethtool_ops->set_channels(dev, &channels); return ret < 0 ? ret : 1; diff --git a/net/ethtool/ioctl.c b/net/ethtool/ioctl.c index 02a3454234d6..9431e305b233 100644 --- a/net/ethtool/ioctl.c +++ b/net/ethtool/ioctl.c @@ -27,13 +27,12 @@ #include #include #include -#include #include #include +#include #include #include -#include - +#include #include "common.h" /* State held across locks and calls for commands which have devlink fallback */ @@ -2283,12 +2282,12 @@ static noinline_for_stack int ethtool_set_channels(struct net_device *dev, if (ret) return ret; - /* Disabling channels, query busy queues (AF_XDP, queue leasing) */ + /* Disabling channels, query zero-copy AF_XDP sockets */ from_channel = channels.combined_count + min(channels.rx_count, channels.tx_count); to_channel = curr.combined_count + max(curr.rx_count, curr.tx_count); for (i = from_channel; i < to_channel; i++) - if (netdev_queue_busy(dev, i, NULL)) + if (xsk_get_pool_from_qid(dev, i)) return -EINVAL; ret = dev->ethtool_ops->set_channels(dev, &channels); diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c index 92f791433725..3b46bc635c43 100644 --- a/net/xdp/xsk.c +++ b/net/xdp/xsk.c @@ -23,8 +23,6 @@ #include #include #include - -#include #include #include #include @@ -105,7 +103,7 @@ bool xsk_uses_need_wakeup(struct xsk_buff_pool *pool) } EXPORT_SYMBOL(xsk_uses_need_wakeup); -struct xsk_buff_pool *xsk_get_pool_from_qid(const struct net_device *dev, +struct xsk_buff_pool *xsk_get_pool_from_qid(struct net_device *dev, u16 queue_id) { if (queue_id < dev->real_num_rx_queues) @@ -119,18 +117,10 @@ EXPORT_SYMBOL(xsk_get_pool_from_qid); void xsk_clear_pool_at_qid(struct net_device *dev, u16 queue_id) { - struct net_device *orig_dev = dev; - unsigned int id = queue_id; - - if (id < dev->real_num_rx_queues) - WARN_ON_ONCE(!netif_get_rx_queue_lease_locked(&dev, &id)); - - if (id < dev->real_num_rx_queues) - dev->_rx[id].pool = NULL; - if (id < dev->real_num_tx_queues) - dev->_tx[id].pool = NULL; - - netif_put_rx_queue_lease_locked(orig_dev, dev); + if (queue_id < dev->num_rx_queues) + dev->_rx[queue_id].pool = NULL; + if (queue_id < dev->num_tx_queues) + dev->_tx[queue_id].pool = NULL; } /* The buffer pool is stored both in the _rx struct and the _tx struct as we do @@ -140,29 +130,17 @@ void xsk_clear_pool_at_qid(struct net_device *dev, u16 queue_id) int xsk_reg_pool_at_qid(struct net_device *dev, struct xsk_buff_pool *pool, u16 queue_id) { - struct net_device *orig_dev = dev; - unsigned int id = queue_id; - int ret = 0; - - if (id >= max(dev->real_num_rx_queues, - dev->real_num_tx_queues)) + if (queue_id >= max_t(unsigned int, + dev->real_num_rx_queues, + dev->real_num_tx_queues)) return -EINVAL; - if (id < dev->real_num_rx_queues) { - if (!netif_get_rx_queue_lease_locked(&dev, &id)) - return -EBUSY; - if (xsk_get_pool_from_qid(dev, id)) { - ret = -EBUSY; - goto out; - } - } - if (id < dev->real_num_rx_queues) - dev->_rx[id].pool = pool; - if (id < dev->real_num_tx_queues) - dev->_tx[id].pool = pool; -out: - netif_put_rx_queue_lease_locked(orig_dev, dev); - return ret; + if (queue_id < dev->real_num_rx_queues) + dev->_rx[queue_id].pool = pool; + if (queue_id < dev->real_num_tx_queues) + dev->_tx[queue_id].pool = pool; + + return 0; } static int __xsk_rcv_zc(struct xdp_sock *xs, struct xdp_buff_xsk *xskb, u32 len, @@ -346,37 +324,14 @@ static bool xsk_is_bound(struct xdp_sock *xs) return false; } -static bool xsk_dev_queue_valid(const struct xdp_sock *xs, - const struct xdp_rxq_info *info) -{ - struct net_device *dev = xs->dev; - u32 queue_index = xs->queue_id; - struct netdev_rx_queue *rxq; - - if (info->dev == dev && - info->queue_index == queue_index) - return true; - - if (queue_index < dev->real_num_rx_queues) { - rxq = READ_ONCE(__netif_get_rx_queue(dev, queue_index)->lease); - if (!rxq) - return false; - - dev = rxq->dev; - queue_index = get_netdev_rx_queue_index(rxq); - - return info->dev == dev && - info->queue_index == queue_index; - } - return false; -} - static int xsk_rcv_check(struct xdp_sock *xs, struct xdp_buff *xdp, u32 len) { if (!xsk_is_bound(xs)) return -ENXIO; - if (!xsk_dev_queue_valid(xs, xdp->rxq)) + + if (xs->dev != xdp->rxq->dev || xs->queue_id != xdp->rxq->queue_index) return -EINVAL; + if (len > xsk_pool_get_rx_frame_size(xs->pool) && !xs->sg) { xs->rx_dropped++; return -ENOSPC; diff --git a/tools/include/uapi/linux/netdev.h b/tools/include/uapi/linux/netdev.h index 7df1056a35fd..e0b579a1df4f 100644 --- a/tools/include/uapi/linux/netdev.h +++ b/tools/include/uapi/linux/netdev.h @@ -160,7 +160,6 @@ enum { NETDEV_A_QUEUE_DMABUF, NETDEV_A_QUEUE_IO_URING, NETDEV_A_QUEUE_XSK, - NETDEV_A_QUEUE_LEASE, __NETDEV_A_QUEUE_MAX, NETDEV_A_QUEUE_MAX = (__NETDEV_A_QUEUE_MAX - 1) @@ -203,15 +202,6 @@ enum { NETDEV_A_QSTATS_MAX = (__NETDEV_A_QSTATS_MAX - 1) }; -enum { - NETDEV_A_LEASE_IFINDEX = 1, - NETDEV_A_LEASE_QUEUE, - NETDEV_A_LEASE_NETNS_ID, - - __NETDEV_A_LEASE_MAX, - NETDEV_A_LEASE_MAX = (__NETDEV_A_LEASE_MAX - 1) -}; - enum { NETDEV_A_DMABUF_IFINDEX = 1, NETDEV_A_DMABUF_QUEUES, @@ -238,7 +228,6 @@ enum { NETDEV_CMD_BIND_RX, NETDEV_CMD_NAPI_SET, NETDEV_CMD_BIND_TX, - NETDEV_CMD_QUEUE_CREATE, __NETDEV_CMD_MAX, NETDEV_CMD_MAX = (__NETDEV_CMD_MAX - 1) diff --git a/tools/testing/selftests/drivers/net/README.rst b/tools/testing/selftests/drivers/net/README.rst index b94e81c2e030..eb838ae94844 100644 --- a/tools/testing/selftests/drivers/net/README.rst +++ b/tools/testing/selftests/drivers/net/README.rst @@ -62,13 +62,6 @@ LOCAL_V4, LOCAL_V6, REMOTE_V4, REMOTE_V6 Local and remote endpoint IP addresses. -LOCAL_PREFIX_V4, LOCAL_PREFIX_V6 -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -Local IP prefix/subnet which can be used to allocate extra IP addresses (for -network name spaces behind macvlan, veth, netkit devices). DUT must be -reachable using these addresses from the endpoint. - REMOTE_TYPE ~~~~~~~~~~~ diff --git a/tools/testing/selftests/drivers/net/hw/Makefile b/tools/testing/selftests/drivers/net/hw/Makefile index 39ad86d693b3..9c163ba6feee 100644 --- a/tools/testing/selftests/drivers/net/hw/Makefile +++ b/tools/testing/selftests/drivers/net/hw/Makefile @@ -32,8 +32,6 @@ TEST_PROGS = \ irq.py \ loopback.sh \ nic_timestamp.py \ - nk_netns.py \ - nk_qlease.py \ pp_alloc_fail.py \ rss_api.py \ rss_ctx.py \ diff --git a/tools/testing/selftests/drivers/net/hw/lib/py/__init__.py b/tools/testing/selftests/drivers/net/hw/lib/py/__init__.py index 022008249313..d5d247eca6b7 100644 --- a/tools/testing/selftests/drivers/net/hw/lib/py/__init__.py +++ b/tools/testing/selftests/drivers/net/hw/lib/py/__init__.py @@ -3,7 +3,6 @@ """ Driver test environment (hardware-only tests). NetDrvEnv and NetDrvEpEnv are the main environment classes. -NetDrvContEnv extends NetDrvEpEnv with netkit container support. Former is for local host only tests, latter creates / connects to a remote endpoint. See NIPA wiki for more information about running and writing driver tests. @@ -30,7 +29,7 @@ try: from net.lib.py import ksft_eq, ksft_ge, ksft_in, ksft_is, ksft_lt, \ ksft_ne, ksft_not_in, ksft_raises, ksft_true, ksft_gt, ksft_not_none from drivers.net.lib.py import GenerateTraffic, Remote, Iperf3Runner - from drivers.net.lib.py import NetDrvEnv, NetDrvEpEnv, NetDrvContEnv + from drivers.net.lib.py import NetDrvEnv, NetDrvEpEnv __all__ = ["NetNS", "NetNSEnter", "NetdevSimDev", "EthtoolFamily", "NetdevFamily", "NetshaperFamily", @@ -45,8 +44,8 @@ try: "ksft_eq", "ksft_ge", "ksft_in", "ksft_is", "ksft_lt", "ksft_ne", "ksft_not_in", "ksft_raises", "ksft_true", "ksft_gt", "ksft_not_none", "ksft_not_none", - "NetDrvEnv", "NetDrvEpEnv", "NetDrvContEnv", "GenerateTraffic", - "Remote", "Iperf3Runner"] + "NetDrvEnv", "NetDrvEpEnv", "GenerateTraffic", "Remote", + "Iperf3Runner"] except ModuleNotFoundError as e: print("Failed importing `net` library from kernel sources") print(str(e)) diff --git a/tools/testing/selftests/drivers/net/hw/nk_forward.bpf.c b/tools/testing/selftests/drivers/net/hw/nk_forward.bpf.c deleted file mode 100644 index 86ebfc1445b6..000000000000 --- a/tools/testing/selftests/drivers/net/hw/nk_forward.bpf.c +++ /dev/null @@ -1,49 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -#include -#include -#include -#include -#include -#include -#include - -#define TC_ACT_OK 0 -#define ETH_P_IPV6 0x86DD - -#define ctx_ptr(field) ((void *)(long)(field)) - -#define v6_p64_equal(a, b) (a.s6_addr32[0] == b.s6_addr32[0] && \ - a.s6_addr32[1] == b.s6_addr32[1]) - -volatile __u32 netkit_ifindex; -volatile __u8 ipv6_prefix[16]; - -SEC("tc/ingress") -int tc_redirect_peer(struct __sk_buff *skb) -{ - void *data_end = ctx_ptr(skb->data_end); - void *data = ctx_ptr(skb->data); - struct in6_addr *peer_addr; - struct ipv6hdr *ip6h; - struct ethhdr *eth; - - peer_addr = (struct in6_addr *)ipv6_prefix; - - if (skb->protocol != bpf_htons(ETH_P_IPV6)) - return TC_ACT_OK; - - eth = data; - if ((void *)(eth + 1) > data_end) - return TC_ACT_OK; - - ip6h = data + sizeof(struct ethhdr); - if ((void *)(ip6h + 1) > data_end) - return TC_ACT_OK; - - if (!v6_p64_equal(ip6h->daddr, (*peer_addr))) - return TC_ACT_OK; - - return bpf_redirect_peer(netkit_ifindex, 0); -} - -char __license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/drivers/net/hw/nk_netns.py b/tools/testing/selftests/drivers/net/hw/nk_netns.py deleted file mode 100755 index afa8638195d8..000000000000 --- a/tools/testing/selftests/drivers/net/hw/nk_netns.py +++ /dev/null @@ -1,23 +0,0 @@ -#!/usr/bin/env python3 -# SPDX-License-Identifier: GPL-2.0 - -from lib.py import ksft_run, ksft_exit -from lib.py import NetDrvContEnv -from lib.py import cmd - - -def test_ping(cfg) -> None: - cfg.require_ipver("6") - - cmd(f"ping -c 1 -W5 {cfg.nk_guest_ipv6}", host=cfg.remote) - cmd(f"ping -c 1 -W5 {cfg.remote_addr_v['6']}", ns=cfg.netns) - - -def main() -> None: - with NetDrvContEnv(__file__) as cfg: - ksft_run([test_ping], args=(cfg,)) - ksft_exit() - - -if __name__ == "__main__": - main() diff --git a/tools/testing/selftests/drivers/net/hw/nk_qlease.py b/tools/testing/selftests/drivers/net/hw/nk_qlease.py deleted file mode 100755 index 738a46d2d20c..000000000000 --- a/tools/testing/selftests/drivers/net/hw/nk_qlease.py +++ /dev/null @@ -1,55 +0,0 @@ -#!/usr/bin/env python3 -# SPDX-License-Identifier: GPL-2.0 - -import re -from os import path -from lib.py import ksft_run, ksft_exit -from lib.py import NetDrvContEnv -from lib.py import bkg, cmd, defer, ethtool, rand_port, wait_port_listen - - -def create_rss_ctx(cfg): - output = ethtool(f"-X {cfg.ifname} context new start {cfg.src_queue} equal 1").stdout - values = re.search(r'New RSS context is (\d+)', output).group(1) - return int(values) - - -def set_flow_rule(cfg): - output = ethtool(f"-N {cfg.ifname} flow-type tcp6 dst-port {cfg.port} action {cfg.src_queue}").stdout - values = re.search(r'ID (\d+)', output).group(1) - return int(values) - - -def set_flow_rule_rss(cfg, rss_ctx_id): - output = ethtool(f"-N {cfg.ifname} flow-type tcp6 dst-port {cfg.port} context {rss_ctx_id}").stdout - values = re.search(r'ID (\d+)', output).group(1) - return int(values) - - -def test_iou_zcrx(cfg) -> None: - cfg.require_ipver('6') - - ethtool(f"-X {cfg.ifname} equal {cfg.src_queue}") - defer(ethtool, f"-X {cfg.ifname} default") - - flow_rule_id = set_flow_rule(cfg) - defer(ethtool, f"-N {cfg.ifname} delete {flow_rule_id}") - - rx_cmd = f"ip netns exec {cfg.netns.name} {cfg.bin_local} -s -p {cfg.port} -i {cfg._nk_guest_ifname} -q {cfg.nk_queue}" - tx_cmd = f"{cfg.bin_remote} -c -h {cfg.nk_guest_ipv6} -p {cfg.port} -l 12840" - with bkg(rx_cmd, exit_wait=True): - wait_port_listen(cfg.port, proto="tcp", ns=cfg.netns) - cmd(tx_cmd, host=cfg.remote) - - -def main() -> None: - with NetDrvContEnv(__file__, lease=True) as cfg: - cfg.bin_local = path.abspath(path.dirname(__file__) + "/../../../drivers/net/hw/iou-zcrx") - cfg.bin_remote = cfg.remote.deploy(cfg.bin_local) - cfg.port = rand_port() - ksft_run([test_iou_zcrx], args=(cfg,)) - ksft_exit() - - -if __name__ == "__main__": - main() diff --git a/tools/testing/selftests/drivers/net/lib/py/__init__.py b/tools/testing/selftests/drivers/net/lib/py/__init__.py index be3a8a936882..8b75faa9af6d 100644 --- a/tools/testing/selftests/drivers/net/lib/py/__init__.py +++ b/tools/testing/selftests/drivers/net/lib/py/__init__.py @@ -3,7 +3,6 @@ """ Driver test environment. NetDrvEnv and NetDrvEpEnv are the main environment classes. -NetDrvContEnv extends NetDrvEpEnv with netkit container support. Former is for local host only tests, latter creates / connects to a remote endpoint. See NIPA wiki for more information about running and writing driver tests. @@ -44,12 +43,12 @@ try: "ksft_ne", "ksft_not_in", "ksft_raises", "ksft_true", "ksft_gt", "ksft_not_none", "ksft_not_none"] - from .env import NetDrvEnv, NetDrvEpEnv, NetDrvContEnv + from .env import NetDrvEnv, NetDrvEpEnv from .load import GenerateTraffic, Iperf3Runner from .remote import Remote - __all__ += ["NetDrvEnv", "NetDrvEpEnv", "NetDrvContEnv", "GenerateTraffic", - "Remote", "Iperf3Runner"] + __all__ += ["NetDrvEnv", "NetDrvEpEnv", "GenerateTraffic", "Remote", + "Iperf3Runner"] except ModuleNotFoundError as e: print("Failed importing `net` library from kernel sources") print(str(e)) diff --git a/tools/testing/selftests/drivers/net/lib/py/env.py b/tools/testing/selftests/drivers/net/lib/py/env.py index 7066d78395c6..41cc248ac848 100644 --- a/tools/testing/selftests/drivers/net/lib/py/env.py +++ b/tools/testing/selftests/drivers/net/lib/py/env.py @@ -1,17 +1,13 @@ # SPDX-License-Identifier: GPL-2.0 -import ipaddress import os -import re import time from pathlib import Path from lib.py import KsftSkipEx, KsftXfailEx from lib.py import ksft_setup, wait_file from lib.py import cmd, ethtool, ip, CmdExitFailure from lib.py import NetNS, NetdevSimDev -from lib.py import NetdevFamily, EthtoolFamily from .remote import Remote -from . import bpftool class NetDrvEnvBase: @@ -293,156 +289,3 @@ class NetDrvEpEnv(NetDrvEnvBase): data.get('stats-block-usecs', 0) / 1000 / 1000 time.sleep(self._stats_settle_time) - - -class NetDrvContEnv(NetDrvEpEnv): - """ - Class for an environment with a netkit pair setup for forwarding traffic - between the physical interface and a network namespace. - """ - - def __init__(self, src_path, lease=False, **kwargs): - super().__init__(src_path, **kwargs) - - self.require_ipver("6") - local_prefix = self.env.get("LOCAL_PREFIX_V6") - if not local_prefix: - raise KsftSkipEx("LOCAL_PREFIX_V6 required") - - self.netdevnl = NetdevFamily() - self.ethnl = EthtoolFamily() - - local_prefix = local_prefix.rstrip("/64").rstrip("::").rstrip(":") - self.ipv6_prefix = f"{local_prefix}::" - self.nk_host_ipv6 = f"{local_prefix}::2:1" - self.nk_guest_ipv6 = f"{local_prefix}::2:2" - - self.netns = None - self._nk_host_ifname = None - self._nk_guest_ifname = None - self._tc_attached = False - self._bpf_prog_pref = None - self._bpf_prog_id = None - self._leased = False - - nk_rxqueues = 1 - if lease: - nk_rxqueues = 2 - ip(f"link add type netkit mode l2 forward peer forward numrxqueues {nk_rxqueues}") - - all_links = ip("-d link show", json=True) - netkit_links = [link for link in all_links - if link.get('linkinfo', {}).get('info_kind') == 'netkit' - and 'UP' not in link.get('flags', [])] - - if len(netkit_links) != 2: - raise KsftSkipEx("Failed to create netkit pair") - - netkit_links.sort(key=lambda x: x['ifindex']) - self._nk_host_ifname = netkit_links[1]['ifname'] - self._nk_guest_ifname = netkit_links[0]['ifname'] - self.nk_host_ifindex = netkit_links[1]['ifindex'] - self.nk_guest_ifindex = netkit_links[0]['ifindex'] - - if lease: - self._lease_queues() - - self._setup_ns() - self._attach_bpf() - - def __del__(self): - if self._tc_attached: - cmd(f"tc filter del dev {self.ifname} ingress pref {self._bpf_prog_pref}") - self._tc_attached = False - - if self._nk_host_ifname: - cmd(f"ip link del dev {self._nk_host_ifname}") - self._nk_host_ifname = None - self._nk_guest_ifname = None - - if self.netns: - del self.netns - self.netns = None - - if self._leased: - self.ethnl.rings_set({'header': {'dev-index': self.ifindex}, - 'tcp-data-split': 'unknown', - 'hds-thresh': self._hds_thresh, - 'rx': self._rx_rings}) - self._leased = False - - super().__del__() - - def _lease_queues(self): - channels = self.ethnl.channels_get({'header': {'dev-index': self.ifindex}}) - channels = channels['combined-count'] - if channels < 2: - raise KsftSkipEx('Test requires NETIF with at least 2 combined channels') - - rings = self.ethnl.rings_get({'header': {'dev-index': self.ifindex}}) - self._rx_rings = rings['rx'] - self._hds_thresh = rings.get('hds-thresh', 0) - self.ethnl.rings_set({'header': {'dev-index': self.ifindex}, - 'tcp-data-split': 'enabled', - 'hds-thresh': 0, - 'rx': 64}) - self.src_queue = channels - 1 - bind_result = self.netdevnl.queue_create( - { - "ifindex": self.nk_guest_ifindex, - "type": "rx", - "lease": { - "ifindex": self.ifindex, - "queue": {"id": self.src_queue, "type": "rx"}, - }, - } - ) - self.nk_queue = bind_result['id'] - self._leased = True - - def _setup_ns(self): - self.netns = NetNS() - ip(f"link set dev {self._nk_guest_ifname} netns {self.netns.name}") - ip(f"link set dev {self._nk_host_ifname} up") - ip(f"-6 addr add fe80::1/64 dev {self._nk_host_ifname} nodad") - ip(f"-6 route add {self.nk_guest_ipv6}/128 via fe80::2 dev {self._nk_host_ifname}") - - ip("link set lo up", ns=self.netns) - ip(f"link set dev {self._nk_guest_ifname} up", ns=self.netns) - ip(f"-6 addr add fe80::2/64 dev {self._nk_guest_ifname}", ns=self.netns) - ip(f"-6 addr add {self.nk_guest_ipv6}/64 dev {self._nk_guest_ifname} nodad", ns=self.netns) - ip(f"-6 route add default via fe80::1 dev {self._nk_guest_ifname}", ns=self.netns) - - def _attach_bpf(self): - bpf_obj = self.test_dir / "nk_forward.bpf.o" - if not bpf_obj.exists(): - raise KsftSkipEx("BPF prog not found") - - cmd(f"tc filter add dev {self.ifname} ingress bpf obj {bpf_obj} sec tc/ingress direct-action") - self._tc_attached = True - - tc_info = cmd(f"tc filter show dev {self.ifname} ingress").stdout - match = re.search(r'pref (\d+).*nk_forward\.bpf.*id (\d+)', tc_info) - if not match: - raise Exception("Failed to get BPF prog ID") - self._bpf_prog_pref = int(match.group(1)) - self._bpf_prog_id = int(match.group(2)) - - prog_info = bpftool(f"prog show id {self._bpf_prog_id}", json=True) - map_ids = prog_info.get("map_ids", []) - - bss_map_id = None - for map_id in map_ids: - map_info = bpftool(f"map show id {map_id}", json=True) - if map_info.get("name").endswith("bss"): - bss_map_id = map_id - - if bss_map_id is None: - raise Exception("Failed to find .bss map") - - ipv6_addr = ipaddress.IPv6Address(self.ipv6_prefix) - ipv6_bytes = ipv6_addr.packed - ifindex_bytes = self.nk_host_ifindex.to_bytes(4, byteorder='little') - value = ipv6_bytes + ifindex_bytes - value_hex = ' '.join(f'{b:02x}' for b in value) - bpftool(f"map update id {bss_map_id} key hex 00 00 00 00 value hex {value_hex}") -- cgit v1.2.3 From 64dd89ae01f2708a508e028c28b7906e4702a9a7 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Mon, 15 Dec 2025 12:57:53 -0500 Subject: mm/block/fs: remove laptop_mode Laptop mode was introduced to save battery, by delaying and consolidating writes and thereby maximize the time rotating hard drives wouldn't have to spin. Luckily, rotating hard drives, with their high spin-up times and power draw, are a thing of the past for battery-powered devices. Reclaim has also since changed to not write single filesystem pages anymore, and regular filesystem writeback is lumpy by design. The juice doesn't appear worth the squeeze anymore. The footprint of the feature is small, but nevertheless it's a complicating factor in mm, block, filesystems. Developers don't think about it, and it likely hasn't been tested with new reclaim and writeback changes in years. Let's sunset it. Keep the sysctl with a deprecation warning around for a few more cycles, but remove all functionality behind it. [akpm@linux-foundation.org: fix Documentation/admin-guide/laptops/index.rst] Link: https://lkml.kernel.org/r/20251216185201.GH905277@cmpxchg.org Signed-off-by: Johannes Weiner Suggested-by: Christoph Hellwig Reviewed-by: Christoph Hellwig Acked-by: Jens Axboe Reviewed-by: Shakeel Butt Acked-by: Michal Hocko Cc: Deepanshu Kartikey Signed-off-by: Andrew Morton --- Documentation/admin-guide/laptops/index.rst | 1 - Documentation/admin-guide/laptops/laptop-mode.rst | 770 ---------------------- Documentation/admin-guide/sysctl/vm.rst | 8 - block/blk-mq.c | 3 - fs/ext4/inode.c | 3 +- fs/sync.c | 2 - fs/xfs/xfs_super.c | 9 - include/linux/backing-dev-defs.h | 3 - include/linux/writeback.h | 4 - include/trace/events/writeback.h | 1 - include/uapi/linux/sysctl.h | 2 +- mm/backing-dev.c | 3 - mm/page-writeback.c | 74 +-- mm/vmscan.c | 30 +- 14 files changed, 25 insertions(+), 888 deletions(-) delete mode 100644 Documentation/admin-guide/laptops/laptop-mode.rst (limited to 'include/uapi/linux') diff --git a/Documentation/admin-guide/laptops/index.rst b/Documentation/admin-guide/laptops/index.rst index 6432c251dc95..c0b911d05c59 100644 --- a/Documentation/admin-guide/laptops/index.rst +++ b/Documentation/admin-guide/laptops/index.rst @@ -10,7 +10,6 @@ Laptop Drivers alienware-wmi asus-laptop disk-shock-protection - laptop-mode lg-laptop samsung-galaxybook sony-laptop diff --git a/Documentation/admin-guide/laptops/laptop-mode.rst b/Documentation/admin-guide/laptops/laptop-mode.rst deleted file mode 100644 index 66eb9cd918b5..000000000000 --- a/Documentation/admin-guide/laptops/laptop-mode.rst +++ /dev/null @@ -1,770 +0,0 @@ -=============================================== -How to conserve battery power using laptop-mode -=============================================== - -Document Author: Bart Samwel (bart@samwel.tk) - -Date created: January 2, 2004 - -Last modified: December 06, 2004 - -Introduction ------------- - -Laptop mode is used to minimize the time that the hard disk needs to be spun up, -to conserve battery power on laptops. It has been reported to cause significant -power savings. - -.. Contents - - * Introduction - * Installation - * Caveats - * The Details - * Tips & Tricks - * Control script - * ACPI integration - * Monitoring tool - - -Installation ------------- - -To use laptop mode, you don't need to set any kernel configuration options -or anything. Simply install all the files included in this document, and -laptop mode will automatically be started when you're on battery. For -your convenience, a tarball containing an installer can be downloaded at: - - http://www.samwel.tk/laptop_mode/laptop_mode/ - -To configure laptop mode, you need to edit the configuration file, which is -located in /etc/default/laptop-mode on Debian-based systems, or in -/etc/sysconfig/laptop-mode on other systems. - -Unfortunately, automatic enabling of laptop mode does not work for -laptops that don't have ACPI. On those laptops, you need to start laptop -mode manually. To start laptop mode, run "laptop_mode start", and to -stop it, run "laptop_mode stop". (Note: The laptop mode tools package now -has experimental support for APM, you might want to try that first.) - - -Caveats -------- - -* The downside of laptop mode is that you have a chance of losing up to 10 - minutes of work. If you cannot afford this, don't use it! The supplied ACPI - scripts automatically turn off laptop mode when the battery almost runs out, - so that you won't lose any data at the end of your battery life. - -* Most desktop hard drives have a very limited lifetime measured in spindown - cycles, typically about 50.000 times (it's usually listed on the spec sheet). - Check your drive's rating, and don't wear down your drive's lifetime if you - don't need to. - -* If you mount some of your ext3 filesystems with the -n option, then - the control script will not be able to remount them correctly. You must set - DO_REMOUNTS=0 in the control script, otherwise it will remount them with the - wrong options -- or it will fail because it cannot write to /etc/mtab. - -* If you have your filesystems listed as type "auto" in fstab, like I did, then - the control script will not recognize them as filesystems that need remounting. - You must list the filesystems with their true type instead. - -* It has been reported that some versions of the mutt mail client use file access - times to determine whether a folder contains new mail. If you use mutt and - experience this, you must disable the noatime remounting by setting the option - DO_REMOUNT_NOATIME to 0 in the configuration file. - - -The Details ------------ - -Laptop mode is controlled by the knob /proc/sys/vm/laptop_mode. This knob is -present for all kernels that have the laptop mode patch, regardless of any -configuration options. When the knob is set, any physical disk I/O (that might -have caused the hard disk to spin up) causes Linux to flush all dirty blocks. The -result of this is that after a disk has spun down, it will not be spun up -anymore to write dirty blocks, because those blocks had already been written -immediately after the most recent read operation. The value of the laptop_mode -knob determines the time between the occurrence of disk I/O and when the flush -is triggered. A sensible value for the knob is 5 seconds. Setting the knob to -0 disables laptop mode. - -To increase the effectiveness of the laptop_mode strategy, the laptop_mode -control script increases dirty_expire_centisecs and dirty_writeback_centisecs in -/proc/sys/vm to about 10 minutes (by default), which means that pages that are -dirtied are not forced to be written to disk as often. The control script also -changes the dirty background ratio, so that background writeback of dirty pages -is not done anymore. Combined with a higher commit value (also 10 minutes) for -ext3 filesystem (also done automatically by the control script), -this results in concentration of disk activity in a small time interval which -occurs only once every 10 minutes, or whenever the disk is forced to spin up by -a cache miss. The disk can then be spun down in the periods of inactivity. - - -Configuration -------------- - -The laptop mode configuration file is located in /etc/default/laptop-mode on -Debian-based systems, or in /etc/sysconfig/laptop-mode on other systems. It -contains the following options: - -MAX_AGE: - -Maximum time, in seconds, of hard drive spindown time that you are -comfortable with. Worst case, it's possible that you could lose this -amount of work if your battery fails while you're in laptop mode. - -MINIMUM_BATTERY_MINUTES: - -Automatically disable laptop mode if the remaining number of minutes of -battery power is less than this value. Default is 10 minutes. - -AC_HD/BATT_HD: - -The idle timeout that should be set on your hard drive when laptop mode -is active (BATT_HD) and when it is not active (AC_HD). The defaults are -20 seconds (value 4) for BATT_HD and 2 hours (value 244) for AC_HD. The -possible values are those listed in the manual page for "hdparm" for the -"-S" option. - -HD: - -The devices for which the spindown timeout should be adjusted by laptop mode. -Default is /dev/hda. If you specify multiple devices, separate them by a space. - -READAHEAD: - -Disk readahead, in 512-byte sectors, while laptop mode is active. A large -readahead can prevent disk accesses for things like executable pages (which are -loaded on demand while the application executes) and sequentially accessed data -(MP3s). - -DO_REMOUNTS: - -The control script automatically remounts any mounted journaled filesystems -with appropriate commit interval options. When this option is set to 0, this -feature is disabled. - -DO_REMOUNT_NOATIME: - -When remounting, should the filesystems be remounted with the noatime option? -Normally, this is set to "1" (enabled), but there may be programs that require -access time recording. - -DIRTY_RATIO: - -The percentage of memory that is allowed to contain "dirty" or unsaved data -before a writeback is forced, while laptop mode is active. Corresponds to -the /proc/sys/vm/dirty_ratio sysctl. - -DIRTY_BACKGROUND_RATIO: - -The percentage of memory that is allowed to contain "dirty" or unsaved data -after a forced writeback is done due to an exceeding of DIRTY_RATIO. Set -this nice and low. This corresponds to the /proc/sys/vm/dirty_background_ratio -sysctl. - -Note that the behaviour of dirty_background_ratio is quite different -when laptop mode is active and when it isn't. When laptop mode is inactive, -dirty_background_ratio is the threshold percentage at which background writeouts -start taking place. When laptop mode is active, however, background writeouts -are disabled, and the dirty_background_ratio only determines how much writeback -is done when dirty_ratio is reached. - -DO_CPU: - -Enable CPU frequency scaling when in laptop mode. (Requires CPUFreq to be setup. -See Documentation/admin-guide/pm/cpufreq.rst for more info. Disabled by default.) - -CPU_MAXFREQ: - -When on battery, what is the maximum CPU speed that the system should use? Legal -values are "slowest" for the slowest speed that your CPU is able to operate at, -or a value listed in /sys/devices/system/cpu/cpu0/cpufreq/scaling_available_frequencies. - - -Tips & Tricks -------------- - -* Bartek Kania reports getting up to 50 minutes of extra battery life (on top - of his regular 3 to 3.5 hours) using a spindown time of 5 seconds (BATT_HD=1). - -* You can spin down the disk while playing MP3, by setting disk readahead - to 8MB (READAHEAD=16384). Effectively, the disk will read a complete MP3 at - once, and will then spin down while the MP3 is playing. (Thanks to Bartek - Kania.) - -* Drew Scott Daniels observed: "I don't know why, but when I decrease the number - of colours that my display uses it consumes less battery power. I've seen - this on powerbooks too. I hope that this is a piece of information that - might be useful to the Laptop Mode patch or its users." - -* In syslog.conf, you can prefix entries with a dash `-` to omit syncing the - file after every logging. When you're using laptop-mode and your disk doesn't - spin down, this is a likely culprit. - -* Richard Atterer observed that laptop mode does not work well with noflushd - (http://noflushd.sourceforge.net/), it seems that noflushd prevents laptop-mode - from doing its thing. - -* If you're worried about your data, you might want to consider using a USB - memory stick or something like that as a "working area". (Be aware though - that flash memory can only handle a limited number of writes, and overuse - may wear out your memory stick pretty quickly. Do _not_ use journalling - filesystems on flash memory sticks.) - - -Configuration file for control and ACPI battery scripts -------------------------------------------------------- - -This allows the tunables to be changed for the scripts via an external -configuration file - -It should be installed as /etc/default/laptop-mode on Debian, and as -/etc/sysconfig/laptop-mode on Red Hat, SUSE, Mandrake, and other work-alikes. - -Config file:: - - # Maximum time, in seconds, of hard drive spindown time that you are - # comfortable with. Worst case, it's possible that you could lose this - # amount of work if your battery fails you while in laptop mode. - #MAX_AGE=600 - - # Automatically disable laptop mode when the number of minutes of battery - # that you have left goes below this threshold. - MINIMUM_BATTERY_MINUTES=10 - - # Read-ahead, in 512-byte sectors. You can spin down the disk while playing MP3/OGG - # by setting the disk readahead to 8MB (READAHEAD=16384). Effectively, the disk - # will read a complete MP3 at once, and will then spin down while the MP3/OGG is - # playing. - #READAHEAD=4096 - - # Shall we remount journaled fs. with appropriate commit interval? (1=yes) - #DO_REMOUNTS=1 - - # And shall we add the "noatime" option to that as well? (1=yes) - #DO_REMOUNT_NOATIME=1 - - # Dirty synchronous ratio. At this percentage of dirty pages the process - # which - # calls write() does its own writeback - #DIRTY_RATIO=40 - - # - # Allowed dirty background ratio, in percent. Once DIRTY_RATIO has been - # exceeded, the kernel will wake flusher threads which will then reduce the - # amount of dirty memory to dirty_background_ratio. Set this nice and low, - # so once some writeout has commenced, we do a lot of it. - # - #DIRTY_BACKGROUND_RATIO=5 - - # kernel default dirty buffer age - #DEF_AGE=30 - #DEF_UPDATE=5 - #DEF_DIRTY_BACKGROUND_RATIO=10 - #DEF_DIRTY_RATIO=40 - #DEF_XFS_AGE_BUFFER=15 - #DEF_XFS_SYNC_INTERVAL=30 - #DEF_XFS_BUFD_INTERVAL=1 - - # This must be adjusted manually to the value of HZ in the running kernel - # on 2.4, until the XFS people change their 2.4 external interfaces to work in - # centisecs. This can be automated, but it's a work in progress that still - # needs# some fixes. On 2.6 kernels, XFS uses USER_HZ instead of HZ for - # external interfaces, and that is currently always set to 100. So you don't - # need to change this on 2.6. - #XFS_HZ=100 - - # Should the maximum CPU frequency be adjusted down while on battery? - # Requires CPUFreq to be setup. - # See Documentation/admin-guide/pm/cpufreq.rst for more info - #DO_CPU=0 - - # When on battery what is the maximum CPU speed that the system should - # use? Legal values are "slowest" for the slowest speed that your - # CPU is able to operate at, or a value listed in: - # /sys/devices/system/cpu/cpu0/cpufreq/scaling_available_frequencies - # Only applicable if DO_CPU=1. - #CPU_MAXFREQ=slowest - - # Idle timeout for your hard drive (man hdparm for valid values, -S option) - # Default is 2 hours on AC (AC_HD=244) and 20 seconds for battery (BATT_HD=4). - #AC_HD=244 - #BATT_HD=4 - - # The drives for which to adjust the idle timeout. Separate them by a space, - # e.g. HD="/dev/hda /dev/hdb". - #HD="/dev/hda" - - # Set the spindown timeout on a hard drive? - #DO_HD=1 - - -Control script --------------- - -Please note that this control script works for the Linux 2.4 and 2.6 series (thanks -to Kiko Piris). - -Control script:: - - #!/bin/bash - - # start or stop laptop_mode, best run by a power management daemon when - # ac gets connected/disconnected from a laptop - # - # install as /sbin/laptop_mode - # - # Contributors to this script: Kiko Piris - # Bart Samwel - # Micha Feigin - # Andrew Morton - # Herve Eychenne - # Dax Kelson - # - # Original Linux 2.4 version by: Jens Axboe - - ############################################################################# - - # Source config - if [ -f /etc/default/laptop-mode ] ; then - # Debian - . /etc/default/laptop-mode - elif [ -f /etc/sysconfig/laptop-mode ] ; then - # Others - . /etc/sysconfig/laptop-mode - fi - - # Don't raise an error if the config file is incomplete - # set defaults instead: - - # Maximum time, in seconds, of hard drive spindown time that you are - # comfortable with. Worst case, it's possible that you could lose this - # amount of work if your battery fails you while in laptop mode. - MAX_AGE=${MAX_AGE:-'600'} - - # Read-ahead, in kilobytes - READAHEAD=${READAHEAD:-'4096'} - - # Shall we remount journaled fs. with appropriate commit interval? (1=yes) - DO_REMOUNTS=${DO_REMOUNTS:-'1'} - - # And shall we add the "noatime" option to that as well? (1=yes) - DO_REMOUNT_NOATIME=${DO_REMOUNT_NOATIME:-'1'} - - # Shall we adjust the idle timeout on a hard drive? - DO_HD=${DO_HD:-'1'} - - # Adjust idle timeout on which hard drive? - HD="${HD:-'/dev/hda'}" - - # spindown time for HD (hdparm -S values) - AC_HD=${AC_HD:-'244'} - BATT_HD=${BATT_HD:-'4'} - - # Dirty synchronous ratio. At this percentage of dirty pages the process which - # calls write() does its own writeback - DIRTY_RATIO=${DIRTY_RATIO:-'40'} - - # cpu frequency scaling - # See Documentation/admin-guide/pm/cpufreq.rst for more info - DO_CPU=${CPU_MANAGE:-'0'} - CPU_MAXFREQ=${CPU_MAXFREQ:-'slowest'} - - # - # Allowed dirty background ratio, in percent. Once DIRTY_RATIO has been - # exceeded, the kernel will wake flusher threads which will then reduce the - # amount of dirty memory to dirty_background_ratio. Set this nice and low, - # so once some writeout has commenced, we do a lot of it. - # - DIRTY_BACKGROUND_RATIO=${DIRTY_BACKGROUND_RATIO:-'5'} - - # kernel default dirty buffer age - DEF_AGE=${DEF_AGE:-'30'} - DEF_UPDATE=${DEF_UPDATE:-'5'} - DEF_DIRTY_BACKGROUND_RATIO=${DEF_DIRTY_BACKGROUND_RATIO:-'10'} - DEF_DIRTY_RATIO=${DEF_DIRTY_RATIO:-'40'} - DEF_XFS_AGE_BUFFER=${DEF_XFS_AGE_BUFFER:-'15'} - DEF_XFS_SYNC_INTERVAL=${DEF_XFS_SYNC_INTERVAL:-'30'} - DEF_XFS_BUFD_INTERVAL=${DEF_XFS_BUFD_INTERVAL:-'1'} - - # This must be adjusted manually to the value of HZ in the running kernel - # on 2.4, until the XFS people change their 2.4 external interfaces to work in - # centisecs. This can be automated, but it's a work in progress that still needs - # some fixes. On 2.6 kernels, XFS uses USER_HZ instead of HZ for external - # interfaces, and that is currently always set to 100. So you don't need to - # change this on 2.6. - XFS_HZ=${XFS_HZ:-'100'} - - ############################################################################# - - KLEVEL="$(uname -r | - { - IFS='.' read a b c - echo $a.$b - } - )" - case "$KLEVEL" in - "2.4"|"2.6") - ;; - *) - echo "Unhandled kernel version: $KLEVEL ('uname -r' = '$(uname -r)')" >&2 - exit 1 - ;; - esac - - if [ ! -e /proc/sys/vm/laptop_mode ] ; then - echo "Kernel is not patched with laptop_mode patch." >&2 - exit 1 - fi - - if [ ! -w /proc/sys/vm/laptop_mode ] ; then - echo "You do not have enough privileges to enable laptop_mode." >&2 - exit 1 - fi - - # Remove an option (the first parameter) of the form option= from - # a mount options string (the rest of the parameters). - parse_mount_opts () { - OPT="$1" - shift - echo ",$*," | sed \ - -e 's/,'"$OPT"'=[0-9]*,/,/g' \ - -e 's/,,*/,/g' \ - -e 's/^,//' \ - -e 's/,$//' - } - - # Remove an option (the first parameter) without any arguments from - # a mount option string (the rest of the parameters). - parse_nonumber_mount_opts () { - OPT="$1" - shift - echo ",$*," | sed \ - -e 's/,'"$OPT"',/,/g' \ - -e 's/,,*/,/g' \ - -e 's/^,//' \ - -e 's/,$//' - } - - # Find out the state of a yes/no option (e.g. "atime"/"noatime") in - # fstab for a given filesystem, and use this state to replace the - # value of the option in another mount options string. The device - # is the first argument, the option name the second, and the default - # value the third. The remainder is the mount options string. - # - # Example: - # parse_yesno_opts_wfstab /dev/hda1 atime atime defaults,noatime - # - # If fstab contains, say, "rw" for this filesystem, then the result - # will be "defaults,atime". - parse_yesno_opts_wfstab () { - L_DEV="$1" - OPT="$2" - DEF_OPT="$3" - shift 3 - L_OPTS="$*" - PARSEDOPTS1="$(parse_nonumber_mount_opts $OPT $L_OPTS)" - PARSEDOPTS1="$(parse_nonumber_mount_opts no$OPT $PARSEDOPTS1)" - # Watch for a default atime in fstab - FSTAB_OPTS="$(awk '$1 == "'$L_DEV'" { print $4 }' /etc/fstab)" - if echo "$FSTAB_OPTS" | grep "$OPT" > /dev/null ; then - # option specified in fstab: extract the value and use it - if echo "$FSTAB_OPTS" | grep "no$OPT" > /dev/null ; then - echo "$PARSEDOPTS1,no$OPT" - else - # no$OPT not found -- so we must have $OPT. - echo "$PARSEDOPTS1,$OPT" - fi - else - # option not specified in fstab -- choose the default. - echo "$PARSEDOPTS1,$DEF_OPT" - fi - } - - # Find out the state of a numbered option (e.g. "commit=NNN") in - # fstab for a given filesystem, and use this state to replace the - # value of the option in another mount options string. The device - # is the first argument, and the option name the second. The - # remainder is the mount options string in which the replacement - # must be done. - # - # Example: - # parse_mount_opts_wfstab /dev/hda1 commit defaults,commit=7 - # - # If fstab contains, say, "commit=3,rw" for this filesystem, then the - # result will be "rw,commit=3". - parse_mount_opts_wfstab () { - L_DEV="$1" - OPT="$2" - shift 2 - L_OPTS="$*" - PARSEDOPTS1="$(parse_mount_opts $OPT $L_OPTS)" - # Watch for a default commit in fstab - FSTAB_OPTS="$(awk '$1 == "'$L_DEV'" { print $4 }' /etc/fstab)" - if echo "$FSTAB_OPTS" | grep "$OPT=" > /dev/null ; then - # option specified in fstab: extract the value, and use it - echo -n "$PARSEDOPTS1,$OPT=" - echo ",$FSTAB_OPTS," | sed \ - -e 's/.*,'"$OPT"'=//' \ - -e 's/,.*//' - else - # option not specified in fstab: set it to 0 - echo "$PARSEDOPTS1,$OPT=0" - fi - } - - deduce_fstype () { - MP="$1" - # My root filesystem unfortunately has - # type "unknown" in /etc/mtab. If we encounter - # "unknown", we try to get the type from fstab. - cat /etc/fstab | - grep -v '^#' | - while read FSTAB_DEV FSTAB_MP FSTAB_FST FSTAB_OPTS FSTAB_DUMP FSTAB_DUMP ; do - if [ "$FSTAB_MP" = "$MP" ]; then - echo $FSTAB_FST - exit 0 - fi - done - } - - if [ $DO_REMOUNT_NOATIME -eq 1 ] ; then - NOATIME_OPT=",noatime" - fi - - case "$1" in - start) - AGE=$((100*$MAX_AGE)) - XFS_AGE=$(($XFS_HZ*$MAX_AGE)) - echo -n "Starting laptop_mode" - - if [ -d /proc/sys/vm/pagebuf ] ; then - # (For 2.4 and early 2.6.) - # This only needs to be set, not reset -- it is only used when - # laptop mode is enabled. - echo $XFS_AGE > /proc/sys/vm/pagebuf/lm_flush_age - echo $XFS_AGE > /proc/sys/fs/xfs/lm_sync_interval - elif [ -f /proc/sys/fs/xfs/lm_age_buffer ] ; then - # (A couple of early 2.6 laptop mode patches had these.) - # The same goes for these. - echo $XFS_AGE > /proc/sys/fs/xfs/lm_age_buffer - echo $XFS_AGE > /proc/sys/fs/xfs/lm_sync_interval - elif [ -f /proc/sys/fs/xfs/age_buffer ] ; then - # (2.6.6) - # But not for these -- they are also used in normal - # operation. - echo $XFS_AGE > /proc/sys/fs/xfs/age_buffer - echo $XFS_AGE > /proc/sys/fs/xfs/sync_interval - elif [ -f /proc/sys/fs/xfs/age_buffer_centisecs ] ; then - # (2.6.7 upwards) - # And not for these either. These are in centisecs, - # not USER_HZ, so we have to use $AGE, not $XFS_AGE. - echo $AGE > /proc/sys/fs/xfs/age_buffer_centisecs - echo $AGE > /proc/sys/fs/xfs/xfssyncd_centisecs - echo 3000 > /proc/sys/fs/xfs/xfsbufd_centisecs - fi - - case "$KLEVEL" in - "2.4") - echo 1 > /proc/sys/vm/laptop_mode - echo "30 500 0 0 $AGE $AGE 60 20 0" > /proc/sys/vm/bdflush - ;; - "2.6") - echo 5 > /proc/sys/vm/laptop_mode - echo "$AGE" > /proc/sys/vm/dirty_writeback_centisecs - echo "$AGE" > /proc/sys/vm/dirty_expire_centisecs - echo "$DIRTY_RATIO" > /proc/sys/vm/dirty_ratio - echo "$DIRTY_BACKGROUND_RATIO" > /proc/sys/vm/dirty_background_ratio - ;; - esac - if [ $DO_REMOUNTS -eq 1 ]; then - cat /etc/mtab | while read DEV MP FST OPTS DUMP PASS ; do - PARSEDOPTS="$(parse_mount_opts "$OPTS")" - if [ "$FST" = 'unknown' ]; then - FST=$(deduce_fstype $MP) - fi - case "$FST" in - "ext3") - PARSEDOPTS="$(parse_mount_opts commit "$OPTS")" - mount $DEV -t $FST $MP -o remount,$PARSEDOPTS,commit=$MAX_AGE$NOATIME_OPT - ;; - "xfs") - mount $DEV -t $FST $MP -o remount,$OPTS$NOATIME_OPT - ;; - esac - if [ -b $DEV ] ; then - blockdev --setra $(($READAHEAD * 2)) $DEV - fi - done - fi - if [ $DO_HD -eq 1 ] ; then - for THISHD in $HD ; do - /sbin/hdparm -S $BATT_HD $THISHD > /dev/null 2>&1 - /sbin/hdparm -B 1 $THISHD > /dev/null 2>&1 - done - fi - if [ $DO_CPU -eq 1 -a -e /sys/devices/system/cpu/cpu0/cpufreq/cpuinfo_min_freq ]; then - if [ $CPU_MAXFREQ = 'slowest' ]; then - CPU_MAXFREQ=`cat /sys/devices/system/cpu/cpu0/cpufreq/cpuinfo_min_freq` - fi - echo $CPU_MAXFREQ > /sys/devices/system/cpu/cpu0/cpufreq/scaling_max_freq - fi - echo "." - ;; - stop) - U_AGE=$((100*$DEF_UPDATE)) - B_AGE=$((100*$DEF_AGE)) - echo -n "Stopping laptop_mode" - echo 0 > /proc/sys/vm/laptop_mode - if [ -f /proc/sys/fs/xfs/age_buffer -a ! -f /proc/sys/fs/xfs/lm_age_buffer ] ; then - # These need to be restored, if there are no lm_*. - echo $(($XFS_HZ*$DEF_XFS_AGE_BUFFER)) > /proc/sys/fs/xfs/age_buffer - echo $(($XFS_HZ*$DEF_XFS_SYNC_INTERVAL)) > /proc/sys/fs/xfs/sync_interval - elif [ -f /proc/sys/fs/xfs/age_buffer_centisecs ] ; then - # These need to be restored as well. - echo $((100*$DEF_XFS_AGE_BUFFER)) > /proc/sys/fs/xfs/age_buffer_centisecs - echo $((100*$DEF_XFS_SYNC_INTERVAL)) > /proc/sys/fs/xfs/xfssyncd_centisecs - echo $((100*$DEF_XFS_BUFD_INTERVAL)) > /proc/sys/fs/xfs/xfsbufd_centisecs - fi - case "$KLEVEL" in - "2.4") - echo "30 500 0 0 $U_AGE $B_AGE 60 20 0" > /proc/sys/vm/bdflush - ;; - "2.6") - echo "$U_AGE" > /proc/sys/vm/dirty_writeback_centisecs - echo "$B_AGE" > /proc/sys/vm/dirty_expire_centisecs - echo "$DEF_DIRTY_RATIO" > /proc/sys/vm/dirty_ratio - echo "$DEF_DIRTY_BACKGROUND_RATIO" > /proc/sys/vm/dirty_background_ratio - ;; - esac - if [ $DO_REMOUNTS -eq 1 ] ; then - cat /etc/mtab | while read DEV MP FST OPTS DUMP PASS ; do - # Reset commit and atime options to defaults. - if [ "$FST" = 'unknown' ]; then - FST=$(deduce_fstype $MP) - fi - case "$FST" in - "ext3") - PARSEDOPTS="$(parse_mount_opts_wfstab $DEV commit $OPTS)" - PARSEDOPTS="$(parse_yesno_opts_wfstab $DEV atime atime $PARSEDOPTS)" - mount $DEV -t $FST $MP -o remount,$PARSEDOPTS - ;; - "xfs") - PARSEDOPTS="$(parse_yesno_opts_wfstab $DEV atime atime $OPTS)" - mount $DEV -t $FST $MP -o remount,$PARSEDOPTS - ;; - esac - if [ -b $DEV ] ; then - blockdev --setra 256 $DEV - fi - done - fi - if [ $DO_HD -eq 1 ] ; then - for THISHD in $HD ; do - /sbin/hdparm -S $AC_HD $THISHD > /dev/null 2>&1 - /sbin/hdparm -B 255 $THISHD > /dev/null 2>&1 - done - fi - if [ $DO_CPU -eq 1 -a -e /sys/devices/system/cpu/cpu0/cpufreq/cpuinfo_min_freq ]; then - echo `cat /sys/devices/system/cpu/cpu0/cpufreq/cpuinfo_max_freq` > /sys/devices/system/cpu/cpu0/cpufreq/scaling_max_freq - fi - echo "." - ;; - *) - echo "Usage: $0 {start|stop}" 2>&1 - exit 1 - ;; - - esac - - exit 0 - - -ACPI integration ----------------- - -Dax Kelson submitted this so that the ACPI acpid daemon will -kick off the laptop_mode script and run hdparm. The part that -automatically disables laptop mode when the battery is low was -written by Jan Topinski. - -/etc/acpi/events/ac_adapter:: - - event=ac_adapter - action=/etc/acpi/actions/ac.sh %e - -/etc/acpi/events/battery:: - - event=battery.* - action=/etc/acpi/actions/battery.sh %e - -/etc/acpi/actions/ac.sh:: - - #!/bin/bash - - # ac on/offline event handler - - status=`awk '/^state: / { print $2 }' /proc/acpi/ac_adapter/$2/state` - - case $status in - "on-line") - /sbin/laptop_mode stop - exit 0 - ;; - "off-line") - /sbin/laptop_mode start - exit 0 - ;; - esac - - -/etc/acpi/actions/battery.sh:: - - #! /bin/bash - - # Automatically disable laptop mode when the battery almost runs out. - - BATT_INFO=/proc/acpi/battery/$2/state - - if [[ -f /proc/sys/vm/laptop_mode ]] - then - LM=`cat /proc/sys/vm/laptop_mode` - if [[ $LM -gt 0 ]] - then - if [[ -f $BATT_INFO ]] - then - # Source the config file only now that we know we need - if [ -f /etc/default/laptop-mode ] ; then - # Debian - . /etc/default/laptop-mode - elif [ -f /etc/sysconfig/laptop-mode ] ; then - # Others - . /etc/sysconfig/laptop-mode - fi - MINIMUM_BATTERY_MINUTES=${MINIMUM_BATTERY_MINUTES:-'10'} - - ACTION="`cat $BATT_INFO | grep charging | cut -c 26-`" - if [[ ACTION -eq "discharging" ]] - then - PRESENT_RATE=`cat $BATT_INFO | grep "present rate:" | sed "s/.* \([0-9][0-9]* \).*/\1/" ` - REMAINING=`cat $BATT_INFO | grep "remaining capacity:" | sed "s/.* \([0-9][0-9]* \).*/\1/" ` - fi - if (($REMAINING * 60 / $PRESENT_RATE < $MINIMUM_BATTERY_MINUTES)) - then - /sbin/laptop_mode stop - fi - else - logger -p daemon.warning "You are using laptop mode and your battery interface $BATT_INFO is missing. This may lead to loss of data when the battery runs out. Check kernel ACPI support and /proc/acpi/battery folder, and edit /etc/acpi/battery.sh to set BATT_INFO to the correct path." - fi - fi - fi - - -Monitoring tool ---------------- - -Bartek Kania submitted this, it can be used to measure how much time your disk -spends spun up/down. See tools/laptop/dslm/dslm.c diff --git a/Documentation/admin-guide/sysctl/vm.rst b/Documentation/admin-guide/sysctl/vm.rst index 245bf6394935..ca6ebeb5171c 100644 --- a/Documentation/admin-guide/sysctl/vm.rst +++ b/Documentation/admin-guide/sysctl/vm.rst @@ -41,7 +41,6 @@ Currently, these files are in /proc/sys/vm: - extfrag_threshold - highmem_is_dirtyable - hugetlb_shm_group -- laptop_mode - legacy_va_layout - lowmem_reserve_ratio - max_map_count @@ -363,13 +362,6 @@ hugetlb_shm_group contains group id that is allowed to create SysV shared memory segment using hugetlb page. -laptop_mode -=========== - -laptop_mode is a knob that controls "laptop mode". All the things that are -controlled by this knob are discussed in Documentation/admin-guide/laptops/laptop-mode.rst. - - legacy_va_layout ================ diff --git a/block/blk-mq.c b/block/blk-mq.c index a29d8ac9d3e3..4bae7c4c664e 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -811,9 +811,6 @@ void blk_mq_free_request(struct request *rq) blk_mq_finish_request(rq); - if (unlikely(laptop_mode && !blk_rq_is_passthrough(rq))) - laptop_io_completion(q->disk->bdi); - rq_qos_done(q, rq); WRITE_ONCE(rq->state, MQ_RQ_IDLE); diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 0c466ccbed69..15eb463d5a9b 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -3305,8 +3305,7 @@ int ext4_alloc_da_blocks(struct inode *inode) /* * We do something simple for now. The filemap_flush() will * also start triggering a write of the data blocks, which is - * not strictly speaking necessary (and for users of - * laptop_mode, not even desirable). However, to do otherwise + * not strictly speaking necessary. However, to do otherwise * would require replicating code paths in: * * ext4_writepages() -> diff --git a/fs/sync.c b/fs/sync.c index 431fc5f5be06..6330150792f6 100644 --- a/fs/sync.c +++ b/fs/sync.c @@ -104,8 +104,6 @@ void ksys_sync(void) iterate_supers(sync_fs_one_sb, &wait); sync_bdevs(false); sync_bdevs(true); - if (unlikely(laptop_mode)) - laptop_sync_completion(); } SYSCALL_DEFINE0(sync) diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index bc71aa9dcee8..a2014fb1bc66 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -845,15 +845,6 @@ xfs_fs_sync_fs( if (error) return error; - if (laptop_mode) { - /* - * The disk must be active because we're syncing. - * We schedule log work now (now that the disk is - * active) instead of later (when it might not be). - */ - flush_delayed_work(&mp->m_log->l_work); - } - /* * If we are called with page faults frozen out, it means we are about * to freeze the transaction subsystem. Take the opportunity to shut diff --git a/include/linux/backing-dev-defs.h b/include/linux/backing-dev-defs.h index 0217c1073735..c88fd4d37d1f 100644 --- a/include/linux/backing-dev-defs.h +++ b/include/linux/backing-dev-defs.h @@ -46,7 +46,6 @@ enum wb_reason { WB_REASON_VMSCAN, WB_REASON_SYNC, WB_REASON_PERIODIC, - WB_REASON_LAPTOP_TIMER, WB_REASON_FS_FREE_SPACE, /* * There is no bdi forker thread any more and works are done @@ -204,8 +203,6 @@ struct backing_dev_info { char dev_name[64]; struct device *owner; - struct timer_list laptop_mode_wb_timer; - #ifdef CONFIG_DEBUG_FS struct dentry *debug_dir; #endif diff --git a/include/linux/writeback.h b/include/linux/writeback.h index f48e8ccffe81..e530112c4b3a 100644 --- a/include/linux/writeback.h +++ b/include/linux/writeback.h @@ -328,9 +328,6 @@ struct dirty_throttle_control { bool dirty_exceeded; }; -void laptop_io_completion(struct backing_dev_info *info); -void laptop_sync_completion(void); -void laptop_mode_timer_fn(struct timer_list *t); bool node_dirty_ok(struct pglist_data *pgdat); int wb_domain_init(struct wb_domain *dom, gfp_t gfp); #ifdef CONFIG_CGROUP_WRITEBACK @@ -342,7 +339,6 @@ extern struct wb_domain global_wb_domain; /* These are exported to sysctl. */ extern unsigned int dirty_writeback_interval; extern unsigned int dirty_expire_interval; -extern int laptop_mode; void global_dirty_limits(unsigned long *pbackground, unsigned long *pdirty); unsigned long wb_calc_thresh(struct bdi_writeback *wb, unsigned long thresh); diff --git a/include/trace/events/writeback.h b/include/trace/events/writeback.h index 311a341e6fe4..b6f94e97788a 100644 --- a/include/trace/events/writeback.h +++ b/include/trace/events/writeback.h @@ -42,7 +42,6 @@ EM( WB_REASON_VMSCAN, "vmscan") \ EM( WB_REASON_SYNC, "sync") \ EM( WB_REASON_PERIODIC, "periodic") \ - EM( WB_REASON_LAPTOP_TIMER, "laptop_timer") \ EM( WB_REASON_FS_FREE_SPACE, "fs_free_space") \ EM( WB_REASON_FORKER_THREAD, "forker_thread") \ EMe(WB_REASON_FOREIGN_FLUSH, "foreign_flush") diff --git a/include/uapi/linux/sysctl.h b/include/uapi/linux/sysctl.h index 63d1464cb71c..6ea9ea8413fa 100644 --- a/include/uapi/linux/sysctl.h +++ b/include/uapi/linux/sysctl.h @@ -183,7 +183,7 @@ enum VM_LOWMEM_RESERVE_RATIO=20,/* reservation ratio for lower memory zones */ VM_MIN_FREE_KBYTES=21, /* Minimum free kilobytes to maintain */ VM_MAX_MAP_COUNT=22, /* int: Maximum number of mmaps/address-space */ - VM_LAPTOP_MODE=23, /* vm laptop mode */ + VM_BLOCK_DUMP=24, /* block dump mode */ VM_HUGETLB_GROUP=25, /* permitted hugetlb group */ VM_VFS_CACHE_PRESSURE=26, /* dcache/icache reclaim pressure */ diff --git a/mm/backing-dev.c b/mm/backing-dev.c index c5740c6d37a2..a0e26d1b717f 100644 --- a/mm/backing-dev.c +++ b/mm/backing-dev.c @@ -1034,7 +1034,6 @@ struct backing_dev_info *bdi_alloc(int node_id) bdi->capabilities = BDI_CAP_WRITEBACK; bdi->ra_pages = VM_READAHEAD_PAGES; bdi->io_pages = VM_READAHEAD_PAGES; - timer_setup(&bdi->laptop_mode_wb_timer, laptop_mode_timer_fn, 0); return bdi; } EXPORT_SYMBOL(bdi_alloc); @@ -1156,8 +1155,6 @@ static void bdi_remove_from_list(struct backing_dev_info *bdi) void bdi_unregister(struct backing_dev_info *bdi) { - timer_delete_sync(&bdi->laptop_mode_wb_timer); - /* make sure nobody finds us on the bdi_list anymore */ bdi_remove_from_list(bdi); wb_shutdown(&bdi->wb); diff --git a/mm/page-writeback.c b/mm/page-writeback.c index ccdeb0e84d39..601a5e048d12 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -109,14 +109,6 @@ EXPORT_SYMBOL_GPL(dirty_writeback_interval); */ unsigned int dirty_expire_interval = 30 * 100; /* centiseconds */ -/* - * Flag that puts the machine in "laptop mode". Doubles as a timeout in jiffies: - * a full sync is triggered after this time elapses without any disk activity. - */ -int laptop_mode; - -EXPORT_SYMBOL(laptop_mode); - /* End of sysctl-exported parameters */ struct wb_domain global_wb_domain; @@ -1843,17 +1835,7 @@ static int balance_dirty_pages(struct bdi_writeback *wb, balance_domain_limits(mdtc, strictlimit); } - /* - * In laptop mode, we wait until hitting the higher threshold - * before starting background writeout, and then write out all - * the way down to the lower threshold. So slow writers cause - * minimal disk activity. - * - * In normal mode, we start background writeout at the lower - * background_thresh, to keep the amount of dirty memory low. - */ - if (!laptop_mode && nr_dirty > gdtc->bg_thresh && - !writeback_in_progress(wb)) + if (nr_dirty > gdtc->bg_thresh && !writeback_in_progress(wb)) wb_start_background_writeback(wb); /* @@ -1876,10 +1858,6 @@ free_running: break; } - /* Start writeback even when in laptop mode */ - if (unlikely(!writeback_in_progress(wb))) - wb_start_background_writeback(wb); - mem_cgroup_flush_foreign(wb); /* @@ -2198,41 +2176,6 @@ static int dirty_writeback_centisecs_handler(const struct ctl_table *table, int } #endif -void laptop_mode_timer_fn(struct timer_list *t) -{ - struct backing_dev_info *backing_dev_info = - timer_container_of(backing_dev_info, t, laptop_mode_wb_timer); - - wakeup_flusher_threads_bdi(backing_dev_info, WB_REASON_LAPTOP_TIMER); -} - -/* - * We've spun up the disk and we're in laptop mode: schedule writeback - * of all dirty data a few seconds from now. If the flush is already scheduled - * then push it back - the user is still using the disk. - */ -void laptop_io_completion(struct backing_dev_info *info) -{ - mod_timer(&info->laptop_mode_wb_timer, jiffies + laptop_mode); -} - -/* - * We're in laptop mode and we've just synced. The sync's writes will have - * caused another writeback to be scheduled by laptop_io_completion. - * Nothing needs to be written back anymore, so we unschedule the writeback. - */ -void laptop_sync_completion(void) -{ - struct backing_dev_info *bdi; - - rcu_read_lock(); - - list_for_each_entry_rcu(bdi, &bdi_list, bdi_list) - timer_delete(&bdi->laptop_mode_wb_timer); - - rcu_read_unlock(); -} - /* * If ratelimit_pages is too high then we can get into dirty-data overload * if a large number of processes all perform writes at the same time. @@ -2263,6 +2206,19 @@ static int page_writeback_cpu_online(unsigned int cpu) #ifdef CONFIG_SYSCTL +static int laptop_mode; +static int laptop_mode_handler(const struct ctl_table *table, int write, + void *buffer, size_t *lenp, loff_t *ppos) +{ + int ret = proc_dointvec_jiffies(table, write, buffer, lenp, ppos); + + if (!ret && write) + pr_warn("%s: vm.laptop_mode is deprecated. Ignoring setting.\n", + current->comm); + + return ret; +} + /* this is needed for the proc_doulongvec_minmax of vm_dirty_bytes */ static const unsigned long dirty_bytes_min = 2 * PAGE_SIZE; @@ -2332,7 +2288,7 @@ static const struct ctl_table vm_page_writeback_sysctls[] = { .data = &laptop_mode, .maxlen = sizeof(laptop_mode), .mode = 0644, - .proc_handler = proc_dointvec_jiffies, + .proc_handler = laptop_mode_handler, }, }; #endif diff --git a/mm/vmscan.c b/mm/vmscan.c index 1c87945fa761..fc5691afb998 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -104,13 +104,13 @@ struct scan_control { unsigned int force_deactivate:1; unsigned int skipped_deactivate:1; - /* Writepage batching in laptop mode; RECLAIM_WRITE */ + /* zone_reclaim_mode, boost reclaim */ unsigned int may_writepage:1; - /* Can mapped folios be reclaimed? */ + /* zone_reclaim_mode */ unsigned int may_unmap:1; - /* Can folios be swapped as part of reclaim? */ + /* zome_reclaim_mode, boost reclaim, cgroup restrictions */ unsigned int may_swap:1; /* Not allow cache_trim_mode to be turned on as part of reclaim? */ @@ -6365,13 +6365,6 @@ retry: if (sc->compaction_ready) break; - - /* - * If we're getting trouble reclaiming, start doing - * writepage even in laptop mode. - */ - if (sc->priority < DEF_PRIORITY - 2) - sc->may_writepage = 1; } while (--sc->priority >= 0); last_pgdat = NULL; @@ -6580,7 +6573,7 @@ unsigned long try_to_free_pages(struct zonelist *zonelist, int order, .order = order, .nodemask = nodemask, .priority = DEF_PRIORITY, - .may_writepage = !laptop_mode, + .may_writepage = 1, .may_unmap = 1, .may_swap = 1, }; @@ -6624,7 +6617,7 @@ unsigned long mem_cgroup_shrink_node(struct mem_cgroup *memcg, struct scan_control sc = { .nr_to_reclaim = SWAP_CLUSTER_MAX, .target_mem_cgroup = memcg, - .may_writepage = !laptop_mode, + .may_writepage = 1, .may_unmap = 1, .reclaim_idx = MAX_NR_ZONES - 1, .may_swap = !noswap, @@ -6670,7 +6663,7 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg, .reclaim_idx = MAX_NR_ZONES - 1, .target_mem_cgroup = memcg, .priority = DEF_PRIORITY, - .may_writepage = !laptop_mode, + .may_writepage = 1, .may_unmap = 1, .may_swap = !!(reclaim_options & MEMCG_RECLAIM_MAY_SWAP), .proactive = !!(reclaim_options & MEMCG_RECLAIM_PROACTIVE), @@ -7051,7 +7044,7 @@ restart: * from reclaim context. If no pages are reclaimed, the * reclaim will be aborted. */ - sc.may_writepage = !laptop_mode && !nr_boost_reclaim; + sc.may_writepage = !nr_boost_reclaim; sc.may_swap = !nr_boost_reclaim; /* @@ -7061,13 +7054,6 @@ restart: */ kswapd_age_node(pgdat, &sc); - /* - * If we're getting trouble reclaiming, start doing writepage - * even in laptop mode. - */ - if (sc.priority < DEF_PRIORITY - 2) - sc.may_writepage = 1; - /* Call soft limit reclaim before calling shrink_node. */ sc.nr_scanned = 0; nr_soft_scanned = 0; @@ -7799,7 +7785,7 @@ int user_proactive_reclaim(char *buf, .reclaim_idx = gfp_zone(gfp_mask), .proactive_swappiness = swappiness == -1 ? NULL : &swappiness, .priority = DEF_PRIORITY, - .may_writepage = !laptop_mode, + .may_writepage = 1, .nr_to_reclaim = max(batch_size, SWAP_CLUSTER_MAX), .may_unmap = 1, .may_swap = 1, -- cgit v1.2.3 From fa05705107a40131a8335ad37817153709261738 Mon Sep 17 00:00:00 2001 From: Detlev Casanova Date: Fri, 9 Jan 2026 11:15:18 -0500 Subject: media: v4l2-ctrls: Add hevc_ext_sps_[ls]t_rps controls The vdpu381 decoder found on newer Rockchip SoC need the information from the long term and short term ref pic sets from the SPS. So far, it wasn't included in the v4l2 API, so add it with new dynamic sized controls. Each element of the hevc_ext_sps_lt_rps array contains the long term ref pic set at that index. Each element of the hevc_ext_sps_st_rps contains the short term ref pic set at that index, as the raw data. It is the role of the drivers to calculate the reference sets values. Reviewed-by: Nicolas Dufresne Signed-off-by: Detlev Casanova Signed-off-by: Nicolas Dufresne Signed-off-by: Hans Verkuil --- drivers/media/v4l2-core/v4l2-ctrls-core.c | 28 ++++++++++++++ drivers/media/v4l2-core/v4l2-ctrls-defs.c | 10 +++++ include/uapi/linux/v4l2-controls.h | 61 +++++++++++++++++++++++++++++++ include/uapi/linux/videodev2.h | 2 + 4 files changed, 101 insertions(+) (limited to 'include/uapi/linux') diff --git a/drivers/media/v4l2-core/v4l2-ctrls-core.c b/drivers/media/v4l2-core/v4l2-ctrls-core.c index 1d4b5859f0e2..79a157975f70 100644 --- a/drivers/media/v4l2-core/v4l2-ctrls-core.c +++ b/drivers/media/v4l2-core/v4l2-ctrls-core.c @@ -424,6 +424,12 @@ void v4l2_ctrl_type_op_log(const struct v4l2_ctrl *ctrl) case V4L2_CTRL_TYPE_HEVC_SLICE_PARAMS: pr_cont("HEVC_SLICE_PARAMS"); break; + case V4L2_CTRL_TYPE_HEVC_EXT_SPS_ST_RPS: + pr_cont("HEVC_EXT_SPS_ST_RPS"); + break; + case V4L2_CTRL_TYPE_HEVC_EXT_SPS_LT_RPS: + pr_cont("HEVC_EXT_SPS_LT_RPS"); + break; case V4L2_CTRL_TYPE_HEVC_SCALING_MATRIX: pr_cont("HEVC_SCALING_MATRIX"); break; @@ -961,6 +967,8 @@ static int std_validate_compound(const struct v4l2_ctrl *ctrl, u32 idx, struct v4l2_ctrl_h264_pred_weights *p_h264_pred_weights; struct v4l2_ctrl_h264_slice_params *p_h264_slice_params; struct v4l2_ctrl_h264_decode_params *p_h264_dec_params; + struct v4l2_ctrl_hevc_ext_sps_lt_rps *p_hevc_lt_rps; + struct v4l2_ctrl_hevc_ext_sps_st_rps *p_hevc_st_rps; struct v4l2_ctrl_hevc_sps *p_hevc_sps; struct v4l2_ctrl_hevc_pps *p_hevc_pps; struct v4l2_ctrl_hdr10_mastering_display *p_hdr10_mastering; @@ -1254,6 +1262,20 @@ static int std_validate_compound(const struct v4l2_ctrl *ctrl, u32 idx, case V4L2_CTRL_TYPE_HEVC_SLICE_PARAMS: break; + case V4L2_CTRL_TYPE_HEVC_EXT_SPS_ST_RPS: + p_hevc_st_rps = p; + + if (p_hevc_st_rps->flags & ~V4L2_HEVC_EXT_SPS_ST_RPS_FLAG_INTER_REF_PIC_SET_PRED) + return -EINVAL; + break; + + case V4L2_CTRL_TYPE_HEVC_EXT_SPS_LT_RPS: + p_hevc_lt_rps = p; + + if (p_hevc_lt_rps->flags & ~V4L2_HEVC_EXT_SPS_LT_RPS_FLAG_USED_LT) + return -EINVAL; + break; + case V4L2_CTRL_TYPE_HDR10_CLL_INFO: break; @@ -2006,6 +2028,12 @@ static struct v4l2_ctrl *v4l2_ctrl_new(struct v4l2_ctrl_handler *hdl, case V4L2_CTRL_TYPE_HEVC_SLICE_PARAMS: elem_size = sizeof(struct v4l2_ctrl_hevc_slice_params); break; + case V4L2_CTRL_TYPE_HEVC_EXT_SPS_ST_RPS: + elem_size = sizeof(struct v4l2_ctrl_hevc_ext_sps_st_rps); + break; + case V4L2_CTRL_TYPE_HEVC_EXT_SPS_LT_RPS: + elem_size = sizeof(struct v4l2_ctrl_hevc_ext_sps_lt_rps); + break; case V4L2_CTRL_TYPE_HEVC_SCALING_MATRIX: elem_size = sizeof(struct v4l2_ctrl_hevc_scaling_matrix); break; diff --git a/drivers/media/v4l2-core/v4l2-ctrls-defs.c b/drivers/media/v4l2-core/v4l2-ctrls-defs.c index 765aeeec84fe..551426c4cd01 100644 --- a/drivers/media/v4l2-core/v4l2-ctrls-defs.c +++ b/drivers/media/v4l2-core/v4l2-ctrls-defs.c @@ -1235,6 +1235,8 @@ const char *v4l2_ctrl_get_name(u32 id) case V4L2_CID_STATELESS_HEVC_DECODE_MODE: return "HEVC Decode Mode"; case V4L2_CID_STATELESS_HEVC_START_CODE: return "HEVC Start Code"; case V4L2_CID_STATELESS_HEVC_ENTRY_POINT_OFFSETS: return "HEVC Entry Point Offsets"; + case V4L2_CID_STATELESS_HEVC_EXT_SPS_ST_RPS: return "HEVC Short Term Ref Sets"; + case V4L2_CID_STATELESS_HEVC_EXT_SPS_LT_RPS: return "HEVC Long Term Ref Sets"; case V4L2_CID_STATELESS_AV1_SEQUENCE: return "AV1 Sequence Parameters"; case V4L2_CID_STATELESS_AV1_TILE_GROUP_ENTRY: return "AV1 Tile Group Entry"; case V4L2_CID_STATELESS_AV1_FRAME: return "AV1 Frame Parameters"; @@ -1581,6 +1583,14 @@ void v4l2_ctrl_fill(u32 id, const char **name, enum v4l2_ctrl_type *type, *type = V4L2_CTRL_TYPE_U32; *flags |= V4L2_CTRL_FLAG_DYNAMIC_ARRAY; break; + case V4L2_CID_STATELESS_HEVC_EXT_SPS_ST_RPS: + *type = V4L2_CTRL_TYPE_HEVC_EXT_SPS_ST_RPS; + *flags |= V4L2_CTRL_FLAG_DYNAMIC_ARRAY; + break; + case V4L2_CID_STATELESS_HEVC_EXT_SPS_LT_RPS: + *type = V4L2_CTRL_TYPE_HEVC_EXT_SPS_LT_RPS; + *flags |= V4L2_CTRL_FLAG_DYNAMIC_ARRAY; + break; case V4L2_CID_STATELESS_VP9_COMPRESSED_HDR: *type = V4L2_CTRL_TYPE_VP9_COMPRESSED_HDR; break; diff --git a/include/uapi/linux/v4l2-controls.h b/include/uapi/linux/v4l2-controls.h index 572622e4535e..68dd0c4e47b2 100644 --- a/include/uapi/linux/v4l2-controls.h +++ b/include/uapi/linux/v4l2-controls.h @@ -2101,6 +2101,8 @@ struct v4l2_ctrl_mpeg2_quantisation { #define V4L2_CID_STATELESS_HEVC_DECODE_MODE (V4L2_CID_CODEC_STATELESS_BASE + 405) #define V4L2_CID_STATELESS_HEVC_START_CODE (V4L2_CID_CODEC_STATELESS_BASE + 406) #define V4L2_CID_STATELESS_HEVC_ENTRY_POINT_OFFSETS (V4L2_CID_CODEC_STATELESS_BASE + 407) +#define V4L2_CID_STATELESS_HEVC_EXT_SPS_ST_RPS (V4L2_CID_CODEC_STATELESS_BASE + 408) +#define V4L2_CID_STATELESS_HEVC_EXT_SPS_LT_RPS (V4L2_CID_CODEC_STATELESS_BASE + 409) enum v4l2_stateless_hevc_decode_mode { V4L2_STATELESS_HEVC_DECODE_MODE_SLICE_BASED, @@ -2556,6 +2558,65 @@ struct v4l2_ctrl_hevc_scaling_matrix { __u8 scaling_list_dc_coef_32x32[2]; }; +#define V4L2_HEVC_EXT_SPS_ST_RPS_FLAG_INTER_REF_PIC_SET_PRED 0x1 + +/* + * struct v4l2_ctrl_hevc_ext_sps_st_rps - HEVC short term RPS parameters + * + * Dynamic size 1-dimension array for short term RPS. The number of elements + * is v4l2_ctrl_hevc_sps::num_short_term_ref_pic_sets. It can contain up to 65 elements. + * + * @delta_idx_minus1: Specifies the delta compare to the index. See details in section 7.4.8 + * "Short-term reference picture set semantics" of the specification. + * @delta_rps_sign: Sign of the delta as specified in section 7.4.8 "Short-term reference picture + * set semantics" of the specification. + * @abs_delta_rps_minus1: Absolute delta RPS as specified in section 7.4.8 "Short-term reference + * picture set semantics" of the specification. + * @num_negative_pics: Number of short-term RPS entries that have picture order count values less + * than the picture order count value of the current picture. + * @num_positive_pics: Number of short-term RPS entries that have picture order count values + * greater than the picture order count value of the current picture. + * @used_by_curr_pic: Bit j specifies if short-term RPS j is used by the current picture. + * @use_delta_flag: Bit j equals to 1 specifies that the j-th entry in the source candidate + * short-term RPS is included in this candidate short-term RPS. + * @delta_poc_s0_minus1: Specifies the negative picture order count delta for the i-th entry in + * the short-term RPS. See details in section 7.4.8 "Short-term reference + * picture set semantics" of the specification. + * @delta_poc_s1_minus1: Specifies the positive picture order count delta for the i-th entry in + * the short-term RPS. See details in section 7.4.8 "Short-term reference + * picture set semantics" of the specification. + * @flags: See V4L2_HEVC_EXT_SPS_ST_RPS_FLAG_{} + */ +struct v4l2_ctrl_hevc_ext_sps_st_rps { + __u8 delta_idx_minus1; + __u8 delta_rps_sign; + __u8 num_negative_pics; + __u8 num_positive_pics; + __u32 used_by_curr_pic; + __u32 use_delta_flag; + __u16 abs_delta_rps_minus1; + __u16 delta_poc_s0_minus1[16]; + __u16 delta_poc_s1_minus1[16]; + __u16 flags; +}; + +#define V4L2_HEVC_EXT_SPS_LT_RPS_FLAG_USED_LT 0x1 + +/* + * struct v4l2_ctrl_hevc_ext_sps_lt_rps - HEVC long term RPS parameters + * + * Dynamic size 1-dimension array for long term RPS. The number of elements + * is v4l2_ctrl_hevc_sps::num_long_term_ref_pics_sps. It can contain up to 65 elements. + * + * @lt_ref_pic_poc_lsb_sps: picture order count modulo MaxPicOrderCntLsb of the i-th candidate + * long-term reference picture. + * @flags: See V4L2_HEVC_EXT_SPS_LT_RPS_FLAG_{} + */ +struct v4l2_ctrl_hevc_ext_sps_lt_rps { + __u16 lt_ref_pic_poc_lsb_sps; + __u16 flags; +}; + /* Stateless VP9 controls */ #define V4L2_VP9_LOOP_FILTER_FLAG_DELTA_ENABLED 0x1 diff --git a/include/uapi/linux/videodev2.h b/include/uapi/linux/videodev2.h index 848e86617d5c..eda4492e40dc 100644 --- a/include/uapi/linux/videodev2.h +++ b/include/uapi/linux/videodev2.h @@ -1986,6 +1986,8 @@ enum v4l2_ctrl_type { V4L2_CTRL_TYPE_HEVC_SLICE_PARAMS = 0x0272, V4L2_CTRL_TYPE_HEVC_SCALING_MATRIX = 0x0273, V4L2_CTRL_TYPE_HEVC_DECODE_PARAMS = 0x0274, + V4L2_CTRL_TYPE_HEVC_EXT_SPS_ST_RPS = 0x0275, + V4L2_CTRL_TYPE_HEVC_EXT_SPS_LT_RPS = 0x0276, V4L2_CTRL_TYPE_AV1_SEQUENCE = 0x280, V4L2_CTRL_TYPE_AV1_TILE_GROUP_ENTRY = 0x281, -- cgit v1.2.3 From f5f2bad67a45cd1ef6f5b727da104694a81b3666 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 21 Jan 2026 08:31:49 +0100 Subject: block: make the new blkzoned UAPI constants discoverable The Linux 6.19 merge window added the new BLKREPORTZONESV2 ioctl, and with it the new BLK_ZONE_REP_CACHED and BLK_ZONE_COND_ACTIVE constants. The two constants are defined as part of enums, which makes it very painful for userspace to discover if they are present in the installed system headers. Use the #define to the same name trick to make them trivially discoverable using CPP directives. Fixes: 0bf0e2e46668 ("block: track zone conditions") Fixes: b30ffcdc0c15 ("block: introduce BLKREPORTZONESV2 ioctl") Reported-by: Andrey Albershteyn Signed-off-by: Christoph Hellwig Reviewed-by: Johannes Thumshirn Signed-off-by: Jens Axboe --- include/uapi/linux/blkzoned.h | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/blkzoned.h b/include/uapi/linux/blkzoned.h index e33f02703350..663836120966 100644 --- a/include/uapi/linux/blkzoned.h +++ b/include/uapi/linux/blkzoned.h @@ -81,7 +81,8 @@ enum blk_zone_cond { BLK_ZONE_COND_FULL = 0xE, BLK_ZONE_COND_OFFLINE = 0xF, - BLK_ZONE_COND_ACTIVE = 0xFF, + BLK_ZONE_COND_ACTIVE = 0xFF, /* added in Linux 6.19 */ +#define BLK_ZONE_COND_ACTIVE BLK_ZONE_COND_ACTIVE }; /** @@ -100,7 +101,8 @@ enum blk_zone_report_flags { BLK_ZONE_REP_CAPACITY = (1U << 0), /* Input flags */ - BLK_ZONE_REP_CACHED = (1U << 31), + BLK_ZONE_REP_CACHED = (1U << 31), /* added in Linux 6.19 */ +#define BLK_ZONE_REP_CACHED BLK_ZONE_REP_CACHED }; /** -- cgit v1.2.3 From d7a5da7a0f7fa7ff081140c4f6f971db98882703 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 15 Dec 2025 17:52:04 +0100 Subject: rseq: Add fields and constants for time slice extension Aside of a Kconfig knob add the following items: - Two flag bits for the rseq user space ABI, which allow user space to query the availability and enablement without a syscall. - A new member to the user space ABI struct rseq, which is going to be used to communicate request and grant between kernel and user space. - A rseq state struct to hold the kernel state of this - Documentation of the new mechanism Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Link: https://patch.msgid.link/20251215155708.669472597@linutronix.de --- Documentation/userspace-api/index.rst | 1 + Documentation/userspace-api/rseq.rst | 135 ++++++++++++++++++++++++++++++++++ include/linux/rseq_types.h | 28 ++++++- include/uapi/linux/rseq.h | 38 ++++++++++ init/Kconfig | 12 +++ kernel/rseq.c | 7 ++ 6 files changed, 220 insertions(+), 1 deletion(-) create mode 100644 Documentation/userspace-api/rseq.rst (limited to 'include/uapi/linux') diff --git a/Documentation/userspace-api/index.rst b/Documentation/userspace-api/index.rst index 8a61ac4c1bf1..fa0fe8ada68e 100644 --- a/Documentation/userspace-api/index.rst +++ b/Documentation/userspace-api/index.rst @@ -21,6 +21,7 @@ System calls ebpf/index ioctl/index mseal + rseq Security-related interfaces =========================== diff --git a/Documentation/userspace-api/rseq.rst b/Documentation/userspace-api/rseq.rst new file mode 100644 index 000000000000..e1fdb0d5ce69 --- /dev/null +++ b/Documentation/userspace-api/rseq.rst @@ -0,0 +1,135 @@ +===================== +Restartable Sequences +===================== + +Restartable Sequences allow to register a per thread userspace memory area +to be used as an ABI between kernel and userspace for three purposes: + + * userspace restartable sequences + + * quick access to read the current CPU number, node ID from userspace + + * scheduler time slice extensions + +Restartable sequences (per-cpu atomics) +--------------------------------------- + +Restartable sequences allow userspace to perform update operations on +per-cpu data without requiring heavyweight atomic operations. The actual +ABI is unfortunately only available in the code and selftests. + +Quick access to CPU number, node ID +----------------------------------- + +Allows to implement per CPU data efficiently. Documentation is in code and +selftests. :( + +Scheduler time slice extensions +------------------------------- + +This allows a thread to request a time slice extension when it enters a +critical section to avoid contention on a resource when the thread is +scheduled out inside of the critical section. + +The prerequisites for this functionality are: + + * Enabled in Kconfig + + * Enabled at boot time (default is enabled) + + * A rseq userspace pointer has been registered for the thread + +The thread has to enable the functionality via prctl(2):: + + prctl(PR_RSEQ_SLICE_EXTENSION, PR_RSEQ_SLICE_EXTENSION_SET, + PR_RSEQ_SLICE_EXT_ENABLE, 0, 0); + +prctl() returns 0 on success or otherwise with the following error codes: + +========= ============================================================== +Errorcode Meaning +========= ============================================================== +EINVAL Functionality not available or invalid function arguments. + Note: arg4 and arg5 must be zero +ENOTSUPP Functionality was disabled on the kernel command line +ENXIO Available, but no rseq user struct registered +========= ============================================================== + +The state can be also queried via prctl(2):: + + prctl(PR_RSEQ_SLICE_EXTENSION, PR_RSEQ_SLICE_EXTENSION_GET, 0, 0, 0); + +prctl() returns ``PR_RSEQ_SLICE_EXT_ENABLE`` when it is enabled or 0 if +disabled. Otherwise it returns with the following error codes: + +========= ============================================================== +Errorcode Meaning +========= ============================================================== +EINVAL Functionality not available or invalid function arguments. + Note: arg3 and arg4 and arg5 must be zero +========= ============================================================== + +The availability and status is also exposed via the rseq ABI struct flags +field via the ``RSEQ_CS_FLAG_SLICE_EXT_AVAILABLE_BIT`` and the +``RSEQ_CS_FLAG_SLICE_EXT_ENABLED_BIT``. These bits are read-only for user +space and only for informational purposes. + +If the mechanism was enabled via prctl(), the thread can request a time +slice extension by setting rseq::slice_ctrl::request to 1. If the thread is +interrupted and the interrupt results in a reschedule request in the +kernel, then the kernel can grant a time slice extension and return to +userspace instead of scheduling out. The length of the extension is +determined by the ``rseq_slice_extension_nsec`` sysctl. + +The kernel indicates the grant by clearing rseq::slice_ctrl::request and +setting rseq::slice_ctrl::granted to 1. If there is a reschedule of the +thread after granting the extension, the kernel clears the granted bit to +indicate that to userspace. + +If the request bit is still set when the leaving the critical section, +userspace can clear it and continue. + +If the granted bit is set, then userspace invokes rseq_slice_yield(2) when +leaving the critical section to relinquish the CPU. The kernel enforces +this by arming a timer to prevent misbehaving userspace from abusing this +mechanism. + +If both the request bit and the granted bit are false when leaving the +critical section, then this indicates that a grant was revoked and no +further action is required by userspace. + +The required code flow is as follows:: + + rseq->slice_ctrl.request = 1; + barrier(); // Prevent compiler reordering + critical_section(); + barrier(); // Prevent compiler reordering + rseq->slice_ctrl.request = 0; + if (rseq->slice_ctrl.granted) + rseq_slice_yield(); + +As all of this is strictly CPU local, there are no atomicity requirements. +Checking the granted state is racy, but that cannot be avoided at all:: + + if (rseq->slice_ctrl.granted) + -> Interrupt results in schedule and grant revocation + rseq_slice_yield(); + +So there is no point in pretending that this might be solved by an atomic +operation. + +If the thread issues a syscall other than rseq_slice_yield(2) within the +granted timeslice extension, the grant is also revoked and the CPU is +relinquished immediately when entering the kernel. This is required as +syscalls might consume arbitrary CPU time until they reach a scheduling +point when the preemption model is either NONE or VOLUNTARY and therefore +might exceed the grant by far. + +The preferred solution for user space is to use rseq_slice_yield(2) which +is side effect free. The support for arbitrary syscalls is required to +support onion layer architectured applications, where the code handling the +critical section and requesting the time slice extension has no control +over the code within the critical section. + +The kernel enforces flag consistency and terminates the thread with SIGSEGV +if it detects a violation. diff --git a/include/linux/rseq_types.h b/include/linux/rseq_types.h index 332dc14b81c9..67e40c059b1b 100644 --- a/include/linux/rseq_types.h +++ b/include/linux/rseq_types.h @@ -72,13 +72,36 @@ struct rseq_ids { }; }; +/** + * union rseq_slice_state - Status information for rseq time slice extension + * @state: Compound to access the overall state + * @enabled: Time slice extension is enabled for the task + * @granted: Time slice extension was granted to the task + */ +union rseq_slice_state { + u16 state; + struct { + u8 enabled; + u8 granted; + }; +}; + +/** + * struct rseq_slice - Status information for rseq time slice extension + * @state: Time slice extension state + */ +struct rseq_slice { + union rseq_slice_state state; +}; + /** * struct rseq_data - Storage for all rseq related data * @usrptr: Pointer to the registered user space RSEQ memory * @len: Length of the RSEQ region - * @sig: Signature of critial section abort IPs + * @sig: Signature of critical section abort IPs * @event: Storage for event management * @ids: Storage for cached CPU ID and MM CID + * @slice: Storage for time slice extension data */ struct rseq_data { struct rseq __user *usrptr; @@ -86,6 +109,9 @@ struct rseq_data { u32 sig; struct rseq_event event; struct rseq_ids ids; +#ifdef CONFIG_RSEQ_SLICE_EXTENSION + struct rseq_slice slice; +#endif }; #else /* CONFIG_RSEQ */ diff --git a/include/uapi/linux/rseq.h b/include/uapi/linux/rseq.h index 1b76d508400c..6afc219d1545 100644 --- a/include/uapi/linux/rseq.h +++ b/include/uapi/linux/rseq.h @@ -23,9 +23,15 @@ enum rseq_flags { }; enum rseq_cs_flags_bit { + /* Historical and unsupported bits */ RSEQ_CS_FLAG_NO_RESTART_ON_PREEMPT_BIT = 0, RSEQ_CS_FLAG_NO_RESTART_ON_SIGNAL_BIT = 1, RSEQ_CS_FLAG_NO_RESTART_ON_MIGRATE_BIT = 2, + /* (3) Intentional gap to put new bits into a separate byte */ + + /* User read only feature flags */ + RSEQ_CS_FLAG_SLICE_EXT_AVAILABLE_BIT = 4, + RSEQ_CS_FLAG_SLICE_EXT_ENABLED_BIT = 5, }; enum rseq_cs_flags { @@ -35,6 +41,11 @@ enum rseq_cs_flags { (1U << RSEQ_CS_FLAG_NO_RESTART_ON_SIGNAL_BIT), RSEQ_CS_FLAG_NO_RESTART_ON_MIGRATE = (1U << RSEQ_CS_FLAG_NO_RESTART_ON_MIGRATE_BIT), + + RSEQ_CS_FLAG_SLICE_EXT_AVAILABLE = + (1U << RSEQ_CS_FLAG_SLICE_EXT_AVAILABLE_BIT), + RSEQ_CS_FLAG_SLICE_EXT_ENABLED = + (1U << RSEQ_CS_FLAG_SLICE_EXT_ENABLED_BIT), }; /* @@ -53,6 +64,27 @@ struct rseq_cs { __u64 abort_ip; } __attribute__((aligned(4 * sizeof(__u64)))); +/** + * rseq_slice_ctrl - Time slice extension control structure + * @all: Compound value + * @request: Request for a time slice extension + * @granted: Granted time slice extension + * + * @request is set by user space and can be cleared by user space or kernel + * space. @granted is set and cleared by the kernel and must only be read + * by user space. + */ +struct rseq_slice_ctrl { + union { + __u32 all; + struct { + __u8 request; + __u8 granted; + __u16 __reserved; + }; + }; +}; + /* * struct rseq is aligned on 4 * 8 bytes to ensure it is always * contained within a single cache-line. @@ -141,6 +173,12 @@ struct rseq { */ __u32 mm_cid; + /* + * Time slice extension control structure. CPU local updates from + * kernel and user space. + */ + struct rseq_slice_ctrl slice_ctrl; + /* * Flexible array member at end of structure, after last feature field. */ diff --git a/init/Kconfig b/init/Kconfig index fa79feb8fe57..00c6fbb66a5a 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -1938,6 +1938,18 @@ config RSEQ If unsure, say Y. +config RSEQ_SLICE_EXTENSION + bool "Enable rseq-based time slice extension mechanism" + depends on RSEQ && HIGH_RES_TIMERS && GENERIC_ENTRY && HAVE_GENERIC_TIF_BITS + help + Allows userspace to request a limited time slice extension when + returning from an interrupt to user space via the RSEQ shared + data ABI. If granted, that allows to complete a critical section, + so that other threads are not stuck on a conflicted resource, + while the task is scheduled out. + + If unsure, say N. + config RSEQ_STATS default n bool "Enable lightweight statistics of restartable sequences" if EXPERT diff --git a/kernel/rseq.c b/kernel/rseq.c index 395d8b002350..07c324d5a201 100644 --- a/kernel/rseq.c +++ b/kernel/rseq.c @@ -389,6 +389,8 @@ static bool rseq_reset_ids(void) */ SYSCALL_DEFINE4(rseq, struct rseq __user *, rseq, u32, rseq_len, int, flags, u32, sig) { + u32 rseqfl = 0; + if (flags & RSEQ_FLAG_UNREGISTER) { if (flags & ~RSEQ_FLAG_UNREGISTER) return -EINVAL; @@ -440,6 +442,9 @@ SYSCALL_DEFINE4(rseq, struct rseq __user *, rseq, u32, rseq_len, int, flags, u32 if (!access_ok(rseq, rseq_len)) return -EFAULT; + if (IS_ENABLED(CONFIG_RSEQ_SLICE_EXTENSION)) + rseqfl |= RSEQ_CS_FLAG_SLICE_EXT_AVAILABLE; + scoped_user_write_access(rseq, efault) { /* * If the rseq_cs pointer is non-NULL on registration, clear it to @@ -449,11 +454,13 @@ SYSCALL_DEFINE4(rseq, struct rseq __user *, rseq, u32, rseq_len, int, flags, u32 * clearing the fields. Don't bother reading it, just reset it. */ unsafe_put_user(0UL, &rseq->rseq_cs, efault); + unsafe_put_user(rseqfl, &rseq->flags, efault); /* Initialize IDs in user space */ unsafe_put_user(RSEQ_CPU_ID_UNINITIALIZED, &rseq->cpu_id_start, efault); unsafe_put_user(RSEQ_CPU_ID_UNINITIALIZED, &rseq->cpu_id, efault); unsafe_put_user(0U, &rseq->node_id, efault); unsafe_put_user(0U, &rseq->mm_cid, efault); + unsafe_put_user(0U, &rseq->slice_ctrl.all, efault); } /* -- cgit v1.2.3 From 28621ec2d46c6adf7d33a6facbd83e2fa566bd34 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 15 Dec 2025 17:52:12 +0100 Subject: rseq: Add prctl() to enable time slice extensions Implement a prctl() so that tasks can enable the time slice extension mechanism. This fails, when time slice extensions are disabled at compile time or on the kernel command line and when no rseq pointer is registered in the kernel. That allows to implement a single trivial check in the exit to user mode hotpath, to decide whether the whole mechanism needs to be invoked. Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Link: https://patch.msgid.link/20251215155708.858717691@linutronix.de --- include/linux/rseq.h | 9 ++++++++ include/uapi/linux/prctl.h | 10 +++++++++ kernel/rseq.c | 52 ++++++++++++++++++++++++++++++++++++++++++++++ kernel/sys.c | 6 ++++++ 4 files changed, 77 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/linux/rseq.h b/include/linux/rseq.h index 2266f4dc77b6..3c194a02ad0a 100644 --- a/include/linux/rseq.h +++ b/include/linux/rseq.h @@ -163,4 +163,13 @@ void rseq_syscall(struct pt_regs *regs); static inline void rseq_syscall(struct pt_regs *regs) { } #endif /* !CONFIG_DEBUG_RSEQ */ +#ifdef CONFIG_RSEQ_SLICE_EXTENSION +int rseq_slice_extension_prctl(unsigned long arg2, unsigned long arg3); +#else /* CONFIG_RSEQ_SLICE_EXTENSION */ +static inline int rseq_slice_extension_prctl(unsigned long arg2, unsigned long arg3) +{ + return -ENOTSUPP; +} +#endif /* !CONFIG_RSEQ_SLICE_EXTENSION */ + #endif /* _LINUX_RSEQ_H */ diff --git a/include/uapi/linux/prctl.h b/include/uapi/linux/prctl.h index 51c4e8c82b1e..79944b7ae50a 100644 --- a/include/uapi/linux/prctl.h +++ b/include/uapi/linux/prctl.h @@ -386,4 +386,14 @@ struct prctl_mm_map { # define PR_FUTEX_HASH_SET_SLOTS 1 # define PR_FUTEX_HASH_GET_SLOTS 2 +/* RSEQ time slice extensions */ +#define PR_RSEQ_SLICE_EXTENSION 79 +# define PR_RSEQ_SLICE_EXTENSION_GET 1 +# define PR_RSEQ_SLICE_EXTENSION_SET 2 +/* + * Bits for RSEQ_SLICE_EXTENSION_GET/SET + * PR_RSEQ_SLICE_EXT_ENABLE: Enable + */ +# define PR_RSEQ_SLICE_EXT_ENABLE 0x01 + #endif /* _LINUX_PRCTL_H */ diff --git a/kernel/rseq.c b/kernel/rseq.c index 415d75b6df2c..09848bb14ec2 100644 --- a/kernel/rseq.c +++ b/kernel/rseq.c @@ -71,6 +71,7 @@ #define RSEQ_BUILD_SLOW_PATH #include +#include #include #include #include @@ -501,6 +502,57 @@ efault: #ifdef CONFIG_RSEQ_SLICE_EXTENSION DEFINE_STATIC_KEY_TRUE(rseq_slice_extension_key); +int rseq_slice_extension_prctl(unsigned long arg2, unsigned long arg3) +{ + switch (arg2) { + case PR_RSEQ_SLICE_EXTENSION_GET: + if (arg3) + return -EINVAL; + return current->rseq.slice.state.enabled ? PR_RSEQ_SLICE_EXT_ENABLE : 0; + + case PR_RSEQ_SLICE_EXTENSION_SET: { + u32 rflags, valid = RSEQ_CS_FLAG_SLICE_EXT_AVAILABLE; + bool enable = !!(arg3 & PR_RSEQ_SLICE_EXT_ENABLE); + + if (arg3 & ~PR_RSEQ_SLICE_EXT_ENABLE) + return -EINVAL; + if (!rseq_slice_extension_enabled()) + return -ENOTSUPP; + if (!current->rseq.usrptr) + return -ENXIO; + + /* No change? */ + if (enable == !!current->rseq.slice.state.enabled) + return 0; + + if (get_user(rflags, ¤t->rseq.usrptr->flags)) + goto die; + + if (current->rseq.slice.state.enabled) + valid |= RSEQ_CS_FLAG_SLICE_EXT_ENABLED; + + if ((rflags & valid) != valid) + goto die; + + rflags &= ~RSEQ_CS_FLAG_SLICE_EXT_ENABLED; + rflags |= RSEQ_CS_FLAG_SLICE_EXT_AVAILABLE; + if (enable) + rflags |= RSEQ_CS_FLAG_SLICE_EXT_ENABLED; + + if (put_user(rflags, ¤t->rseq.usrptr->flags)) + goto die; + + current->rseq.slice.state.enabled = enable; + return 0; + } + default: + return -EINVAL; + } +die: + force_sig(SIGSEGV); + return -EFAULT; +} + static int __init rseq_slice_cmdline(char *str) { bool on; diff --git a/kernel/sys.c b/kernel/sys.c index 8b58eece4e58..af71987df81c 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -53,6 +53,7 @@ #include #include #include +#include #include #include @@ -2868,6 +2869,11 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3, case PR_FUTEX_HASH: error = futex_hash_prctl(arg2, arg3, arg4); break; + case PR_RSEQ_SLICE_EXTENSION: + if (arg4 || arg5) + return -EINVAL; + error = rseq_slice_extension_prctl(arg2, arg3); + break; default: trace_task_prctl_unknown(option, arg2, arg3, arg4, arg5); error = -EINVAL; -- cgit v1.2.3 From d6200245c75e832af2087bc60ba2e6641a90eee9 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 19 Jan 2026 11:23:57 +0100 Subject: rseq: Allow registering RSEQ with slice extension Since glibc cares about the number of syscalls required to initialize a new thread, allow initializing rseq with slice extension on. This avoids having to do another prctl(). Requested-by: Mathieu Desnoyers Signed-off-by: Peter Zijlstra (Intel) Link: https://patch.msgid.link/20260121143207.814193010@infradead.org --- include/uapi/linux/rseq.h | 3 ++- kernel/rseq.c | 12 ++++++++++-- 2 files changed, 12 insertions(+), 3 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/rseq.h b/include/uapi/linux/rseq.h index 6afc219d1545..863c4a00a66b 100644 --- a/include/uapi/linux/rseq.h +++ b/include/uapi/linux/rseq.h @@ -19,7 +19,8 @@ enum rseq_cpu_id_state { }; enum rseq_flags { - RSEQ_FLAG_UNREGISTER = (1 << 0), + RSEQ_FLAG_UNREGISTER = (1 << 0), + RSEQ_FLAG_SLICE_EXT_DEFAULT_ON = (1 << 1), }; enum rseq_cs_flags_bit { diff --git a/kernel/rseq.c b/kernel/rseq.c index 275d70114107..1c5490a172a8 100644 --- a/kernel/rseq.c +++ b/kernel/rseq.c @@ -424,7 +424,7 @@ SYSCALL_DEFINE4(rseq, struct rseq __user *, rseq, u32, rseq_len, int, flags, u32 return 0; } - if (unlikely(flags)) + if (unlikely(flags & ~(RSEQ_FLAG_SLICE_EXT_DEFAULT_ON))) return -EINVAL; if (current->rseq.usrptr) { @@ -459,8 +459,12 @@ SYSCALL_DEFINE4(rseq, struct rseq __user *, rseq, u32, rseq_len, int, flags, u32 if (!access_ok(rseq, rseq_len)) return -EFAULT; - if (IS_ENABLED(CONFIG_RSEQ_SLICE_EXTENSION)) + if (IS_ENABLED(CONFIG_RSEQ_SLICE_EXTENSION)) { rseqfl |= RSEQ_CS_FLAG_SLICE_EXT_AVAILABLE; + if (rseq_slice_extension_enabled() && + (flags & RSEQ_FLAG_SLICE_EXT_DEFAULT_ON)) + rseqfl |= RSEQ_CS_FLAG_SLICE_EXT_ENABLED; + } scoped_user_write_access(rseq, efault) { /* @@ -488,6 +492,10 @@ SYSCALL_DEFINE4(rseq, struct rseq __user *, rseq, u32, rseq_len, int, flags, u32 current->rseq.len = rseq_len; current->rseq.sig = sig; +#ifdef CONFIG_RSEQ_SLICE_EXTENSION + current->rseq.slice.state.enabled = !!(rseqfl & RSEQ_CS_FLAG_SLICE_EXT_ENABLED); +#endif + /* * If rseq was previously inactive, and has just been * registered, ensure the cpu_id_start and cpu_id fields -- cgit v1.2.3 From f174a9ffcd48d78a45d560c02ce4071ded036b53 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Mon, 19 Jan 2026 10:29:22 +0800 Subject: KVM: arm64: Add exit to userspace on {LD,ST}64B* outside of memslots The main use of {LD,ST}64B* is to talk to a device, which is hopefully directly assigned to the guest and requires no additional handling. However, this does not preclude a VMM from exposing a virtual device to the guest, and to allow 64 byte accesses as part of the programming interface. A direct consequence of this is that we need to be able to forward such access to userspace. Given that such a contraption is very unlikely to ever exist, we choose to offer a limited service: userspace gets (as part of a new exit reason) the ESR, the IPA, and that's it. It is fully expected to handle the full semantics of the instructions, deal with ACCDATA, the return values and increment PC. Much fun. A canonical implementation can also simply inject an abort and be done with it. Frankly, don't try to do anything else unless you have time to waste. Acked-by: Arnd Bergmann Acked-by: Oliver Upton Signed-off-by: Marc Zyngier Signed-off-by: Yicong Yang Signed-off-by: Zhou Wang Signed-off-by: Will Deacon --- arch/arm64/kvm/mmio.c | 27 ++++++++++++++++++++++++++- include/uapi/linux/kvm.h | 3 ++- 2 files changed, 28 insertions(+), 2 deletions(-) (limited to 'include/uapi/linux') diff --git a/arch/arm64/kvm/mmio.c b/arch/arm64/kvm/mmio.c index 54f9358c9e0e..e2285ed8c91d 100644 --- a/arch/arm64/kvm/mmio.c +++ b/arch/arm64/kvm/mmio.c @@ -159,6 +159,9 @@ int io_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa) bool is_write; int len; u8 data_buf[8]; + u64 esr; + + esr = kvm_vcpu_get_esr(vcpu); /* * No valid syndrome? Ask userspace for help if it has @@ -168,7 +171,7 @@ int io_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa) * though, so directly deliver an exception to the guest. */ if (!kvm_vcpu_dabt_isvalid(vcpu)) { - trace_kvm_mmio_nisv(*vcpu_pc(vcpu), kvm_vcpu_get_esr(vcpu), + trace_kvm_mmio_nisv(*vcpu_pc(vcpu), esr, kvm_vcpu_get_hfar(vcpu), fault_ipa); if (vcpu_is_protected(vcpu)) @@ -185,6 +188,28 @@ int io_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa) return -ENOSYS; } + /* + * When (DFSC == 0b00xxxx || DFSC == 0b10101x) && DFSC != 0b0000xx + * ESR_EL2[12:11] describe the Load/Store Type. This allows us to + * punt the LD64B/ST64B/ST64BV/ST64BV0 instructions to userspace, + * which will have to provide a full emulation of these 4 + * instructions. No, we don't expect this do be fast. + * + * We rely on traps being set if the corresponding features are not + * enabled, so if we get here, userspace has promised us to handle + * it already. + */ + switch (kvm_vcpu_trap_get_fault(vcpu)) { + case 0b000100 ... 0b001111: + case 0b101010 ... 0b101011: + if (FIELD_GET(GENMASK(12, 11), esr)) { + run->exit_reason = KVM_EXIT_ARM_LDST64B; + run->arm_nisv.esr_iss = esr & ~(u64)ESR_ELx_FSC; + run->arm_nisv.fault_ipa = fault_ipa; + return 0; + } + } + /* * Prepare MMIO operation. First decode the syndrome data we get * from the CPU. Then try if some in-kernel emulation feels diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h index dddb781b0507..88cca0e22ece 100644 --- a/include/uapi/linux/kvm.h +++ b/include/uapi/linux/kvm.h @@ -180,6 +180,7 @@ struct kvm_xen_exit { #define KVM_EXIT_MEMORY_FAULT 39 #define KVM_EXIT_TDX 40 #define KVM_EXIT_ARM_SEA 41 +#define KVM_EXIT_ARM_LDST64B 42 /* For KVM_EXIT_INTERNAL_ERROR */ /* Emulate instruction failed. */ @@ -402,7 +403,7 @@ struct kvm_run { } eoi; /* KVM_EXIT_HYPERV */ struct kvm_hyperv_exit hyperv; - /* KVM_EXIT_ARM_NISV */ + /* KVM_EXIT_ARM_NISV / KVM_EXIT_ARM_LDST64B */ struct { __u64 esr_iss; __u64 fault_ipa; -- cgit v1.2.3 From 0f7afd80d81b739c4a9a6e4e24109ba1030c9c56 Mon Sep 17 00:00:00 2001 From: Terry Bowman Date: Wed, 14 Jan 2026 12:20:22 -0600 Subject: PCI: Move CXL DVSEC definitions into uapi/linux/pci_regs.h The CXL DVSECs are currently defined in cxl/core/cxlpci.h. These are not accessible to other subsystems. Move these to uapi/linux/pci_regs.h. The CXL DVSEC definitions will be renamed and reformatted to fit better with existing defines. Signed-off-by: Terry Bowman Reviewed-by: Dave Jiang Reviewed-by: Jonathan Cameron Reviewed-by: Dan Williams Signed-off-by: Dan Williams Acked-by: Bjorn Helgaas Link: https://patch.msgid.link/20260114182055.46029-2-terry.bowman@amd.com Signed-off-by: Dave Jiang --- drivers/cxl/cxlpci.h | 53 ----------------------------------- include/uapi/linux/pci_regs.h | 64 +++++++++++++++++++++++++++++++++++++++---- 2 files changed, 59 insertions(+), 58 deletions(-) (limited to 'include/uapi/linux') diff --git a/drivers/cxl/cxlpci.h b/drivers/cxl/cxlpci.h index 1d526bea8431..cdb7cf3dbcb4 100644 --- a/drivers/cxl/cxlpci.h +++ b/drivers/cxl/cxlpci.h @@ -7,59 +7,6 @@ #define CXL_MEMORY_PROGIF 0x10 -/* - * See section 8.1 Configuration Space Registers in the CXL 2.0 - * Specification. Names are taken straight from the specification with "CXL" and - * "DVSEC" redundancies removed. When obvious, abbreviations may be used. - */ -#define PCI_DVSEC_HEADER1_LENGTH_MASK GENMASK(31, 20) - -/* CXL 2.0 8.1.3: PCIe DVSEC for CXL Device */ -#define CXL_DVSEC_PCIE_DEVICE 0 -#define CXL_DVSEC_CAP_OFFSET 0xA -#define CXL_DVSEC_MEM_CAPABLE BIT(2) -#define CXL_DVSEC_HDM_COUNT_MASK GENMASK(5, 4) -#define CXL_DVSEC_CTRL_OFFSET 0xC -#define CXL_DVSEC_MEM_ENABLE BIT(2) -#define CXL_DVSEC_RANGE_SIZE_HIGH(i) (0x18 + (i * 0x10)) -#define CXL_DVSEC_RANGE_SIZE_LOW(i) (0x1C + (i * 0x10)) -#define CXL_DVSEC_MEM_INFO_VALID BIT(0) -#define CXL_DVSEC_MEM_ACTIVE BIT(1) -#define CXL_DVSEC_MEM_SIZE_LOW_MASK GENMASK(31, 28) -#define CXL_DVSEC_RANGE_BASE_HIGH(i) (0x20 + (i * 0x10)) -#define CXL_DVSEC_RANGE_BASE_LOW(i) (0x24 + (i * 0x10)) -#define CXL_DVSEC_MEM_BASE_LOW_MASK GENMASK(31, 28) - -#define CXL_DVSEC_RANGE_MAX 2 - -/* CXL 2.0 8.1.4: Non-CXL Function Map DVSEC */ -#define CXL_DVSEC_FUNCTION_MAP 2 - -/* CXL 2.0 8.1.5: CXL 2.0 Extensions DVSEC for Ports */ -#define CXL_DVSEC_PORT_EXTENSIONS 3 - -/* CXL 2.0 8.1.6: GPF DVSEC for CXL Port */ -#define CXL_DVSEC_PORT_GPF 4 -#define CXL_DVSEC_PORT_GPF_PHASE_1_CONTROL_OFFSET 0x0C -#define CXL_DVSEC_PORT_GPF_PHASE_1_TMO_BASE_MASK GENMASK(3, 0) -#define CXL_DVSEC_PORT_GPF_PHASE_1_TMO_SCALE_MASK GENMASK(11, 8) -#define CXL_DVSEC_PORT_GPF_PHASE_2_CONTROL_OFFSET 0xE -#define CXL_DVSEC_PORT_GPF_PHASE_2_TMO_BASE_MASK GENMASK(3, 0) -#define CXL_DVSEC_PORT_GPF_PHASE_2_TMO_SCALE_MASK GENMASK(11, 8) - -/* CXL 2.0 8.1.7: GPF DVSEC for CXL Device */ -#define CXL_DVSEC_DEVICE_GPF 5 - -/* CXL 2.0 8.1.8: PCIe DVSEC for Flex Bus Port */ -#define CXL_DVSEC_PCIE_FLEXBUS_PORT 7 - -/* CXL 2.0 8.1.9: Register Locator DVSEC */ -#define CXL_DVSEC_REG_LOCATOR 8 -#define CXL_DVSEC_REG_LOCATOR_BLOCK1_OFFSET 0xC -#define CXL_DVSEC_REG_LOCATOR_BIR_MASK GENMASK(2, 0) -#define CXL_DVSEC_REG_LOCATOR_BLOCK_ID_MASK GENMASK(15, 8) -#define CXL_DVSEC_REG_LOCATOR_BLOCK_OFF_LOW_MASK GENMASK(31, 16) - /* * NOTE: Currently all the functions which are enabled for CXL require their * vectors to be in the first 16. Use this as the default max. diff --git a/include/uapi/linux/pci_regs.h b/include/uapi/linux/pci_regs.h index 3add74ae2594..6c4b6f19b18e 100644 --- a/include/uapi/linux/pci_regs.h +++ b/include/uapi/linux/pci_regs.h @@ -1253,11 +1253,6 @@ #define PCI_DEV3_STA 0x0c /* Device 3 Status Register */ #define PCI_DEV3_STA_SEGMENT 0x8 /* Segment Captured (end-to-end flit-mode detected) */ -/* Compute Express Link (CXL r3.1, sec 8.1.5) */ -#define PCI_DVSEC_CXL_PORT 3 -#define PCI_DVSEC_CXL_PORT_CTL 0x0c -#define PCI_DVSEC_CXL_PORT_CTL_UNMASK_SBR 0x00000001 - /* Integrity and Data Encryption Extended Capability */ #define PCI_IDE_CAP 0x04 #define PCI_IDE_CAP_LINK 0x1 /* Link IDE Stream Supported */ @@ -1338,4 +1333,63 @@ #define PCI_IDE_SEL_ADDR_3(x) (28 + (x) * PCI_IDE_SEL_ADDR_BLOCK_SIZE) #define PCI_IDE_SEL_BLOCK_SIZE(nr_assoc) (20 + PCI_IDE_SEL_ADDR_BLOCK_SIZE * (nr_assoc)) +/* Compute Express Link (CXL r3.1, sec 8.1.5) */ +#define PCI_DVSEC_CXL_PORT 3 +#define PCI_DVSEC_CXL_PORT_CTL 0x0c +#define PCI_DVSEC_CXL_PORT_CTL_UNMASK_SBR 0x00000001 + +/* + * Compute Express Link (CXL r3.2, sec 8.1) + * + * Note that CXL DVSEC id 3 and 7 to be ignored when the CXL link state + * is "disconnected" (CXL r3.2, sec 9.12.3). Re-enumerate these + * registers on downstream link-up events. + */ +#define PCI_DVSEC_HEADER1_LENGTH_MASK __GENMASK(31, 20) + +/* CXL 3.2 8.1.3: PCIe DVSEC for CXL Device */ +#define CXL_DVSEC_PCIE_DEVICE 0 +#define CXL_DVSEC_CAP_OFFSET 0xA +#define CXL_DVSEC_MEM_CAPABLE _BITUL(2) +#define CXL_DVSEC_HDM_COUNT_MASK __GENMASK(5, 4) +#define CXL_DVSEC_CTRL_OFFSET 0xC +#define CXL_DVSEC_MEM_ENABLE _BITUL(2) +#define CXL_DVSEC_RANGE_SIZE_HIGH(i) (0x18 + (i * 0x10)) +#define CXL_DVSEC_RANGE_SIZE_LOW(i) (0x1C + (i * 0x10)) +#define CXL_DVSEC_MEM_INFO_VALID _BITUL(0) +#define CXL_DVSEC_MEM_ACTIVE _BITUL(1) +#define CXL_DVSEC_MEM_SIZE_LOW_MASK __GENMASK(31, 28) +#define CXL_DVSEC_RANGE_BASE_HIGH(i) (0x20 + (i * 0x10)) +#define CXL_DVSEC_RANGE_BASE_LOW(i) (0x24 + (i * 0x10)) +#define CXL_DVSEC_MEM_BASE_LOW_MASK __GENMASK(31, 28) + +#define CXL_DVSEC_RANGE_MAX 2 + +/* CXL 3.2 8.1.4: Non-CXL Function Map DVSEC */ +#define CXL_DVSEC_FUNCTION_MAP 2 + +/* CXL 3.2 8.1.5: Extensions DVSEC for Ports */ +#define CXL_DVSEC_PORT 3 +#define CXL_DVSEC_PORT_CTL 0x0c +#define CXL_DVSEC_PORT_CTL_UNMASK_SBR 0x00000001 + +/* CXL 3.2 8.1.6: GPF DVSEC for CXL Port */ +#define CXL_DVSEC_PORT_GPF 4 +#define CXL_DVSEC_PORT_GPF_PHASE_1_CONTROL_OFFSET 0x0C +#define CXL_DVSEC_PORT_GPF_PHASE_1_TMO_BASE_MASK __GENMASK(3, 0) +#define CXL_DVSEC_PORT_GPF_PHASE_1_TMO_SCALE_MASK __GENMASK(11, 8) +#define CXL_DVSEC_PORT_GPF_PHASE_2_CONTROL_OFFSET 0xE +#define CXL_DVSEC_PORT_GPF_PHASE_2_TMO_BASE_MASK __GENMASK(3, 0) +#define CXL_DVSEC_PORT_GPF_PHASE_2_TMO_SCALE_MASK __GENMASK(11, 8) + +/* CXL 3.2 8.1.7: GPF DVSEC for CXL Device */ +#define CXL_DVSEC_DEVICE_GPF 5 + +/* CXL 3.2 8.1.9: Register Locator DVSEC */ +#define CXL_DVSEC_REG_LOCATOR 8 +#define CXL_DVSEC_REG_LOCATOR_BLOCK1_OFFSET 0xC +#define CXL_DVSEC_REG_LOCATOR_BIR_MASK __GENMASK(2, 0) +#define CXL_DVSEC_REG_LOCATOR_BLOCK_ID_MASK __GENMASK(15, 8) +#define CXL_DVSEC_REG_LOCATOR_BLOCK_OFF_LOW_MASK __GENMASK(31, 16) + #endif /* LINUX_PCI_REGS_H */ -- cgit v1.2.3 From 6612bd9ff0b1001cff5f5d79db6ce44427d2e99c Mon Sep 17 00:00:00 2001 From: Terry Bowman Date: Wed, 14 Jan 2026 12:20:23 -0600 Subject: PCI: Update CXL DVSEC definitions CXL DVSEC definitions were recently moved into uapi/pci_regs.h, but the newly added macros do not follow the file's existing naming conventions. The current format uses CXL_DVSEC_XYZ, while the new CXL entries must instead use the PCI_DVSEC_CXL_XYZ prefix to match the conventions already established in pci_regs.h. The new CXL DVSEC macros also introduce _MASK and _OFFSET suffixes, which are not used anywhere else in the file. These suffixes lengthen the identifiers and reduce readability. Remove _MASK and _OFFSET from the recently added definitions. Additionally, remove PCI_DVSEC_HEADER1_LENGTH, as it duplicates the existing PCI_DVSEC_HEADER1_LEN() macro. Update all existing references to use the new macro names. Finally, update the inline documentation to reference the latest revision of the CXL specification. Signed-off-by: Terry Bowman Reviewed-by: Dan Williams Acked-by: Bjorn Helgaas Link: https://patch.msgid.link/20260114182055.46029-3-terry.bowman@amd.com Signed-off-by: Dan Williams Signed-off-by: Dave Jiang --- drivers/cxl/core/pci.c | 58 +++++++++++++------------- drivers/cxl/core/regs.c | 14 +++---- drivers/cxl/pci.c | 2 +- include/uapi/linux/pci_regs.h | 94 ++++++++++++++++++++----------------------- 4 files changed, 81 insertions(+), 87 deletions(-) (limited to 'include/uapi/linux') diff --git a/drivers/cxl/core/pci.c b/drivers/cxl/core/pci.c index 5b023a0178a4..077b386e0c8d 100644 --- a/drivers/cxl/core/pci.c +++ b/drivers/cxl/core/pci.c @@ -86,12 +86,12 @@ static int cxl_dvsec_mem_range_valid(struct cxl_dev_state *cxlds, int id) i = 1; do { rc = pci_read_config_dword(pdev, - d + CXL_DVSEC_RANGE_SIZE_LOW(id), + d + PCI_DVSEC_CXL_RANGE_SIZE_LOW(id), &temp); if (rc) return rc; - valid = FIELD_GET(CXL_DVSEC_MEM_INFO_VALID, temp); + valid = FIELD_GET(PCI_DVSEC_CXL_MEM_INFO_VALID, temp); if (valid) break; msleep(1000); @@ -121,11 +121,11 @@ static int cxl_dvsec_mem_range_active(struct cxl_dev_state *cxlds, int id) /* Check MEM ACTIVE bit, up to 60s timeout by default */ for (i = media_ready_timeout; i; i--) { rc = pci_read_config_dword( - pdev, d + CXL_DVSEC_RANGE_SIZE_LOW(id), &temp); + pdev, d + PCI_DVSEC_CXL_RANGE_SIZE_LOW(id), &temp); if (rc) return rc; - active = FIELD_GET(CXL_DVSEC_MEM_ACTIVE, temp); + active = FIELD_GET(PCI_DVSEC_CXL_MEM_ACTIVE, temp); if (active) break; msleep(1000); @@ -154,11 +154,11 @@ int cxl_await_media_ready(struct cxl_dev_state *cxlds) u16 cap; rc = pci_read_config_word(pdev, - d + CXL_DVSEC_CAP_OFFSET, &cap); + d + PCI_DVSEC_CXL_CAP, &cap); if (rc) return rc; - hdm_count = FIELD_GET(CXL_DVSEC_HDM_COUNT_MASK, cap); + hdm_count = FIELD_GET(PCI_DVSEC_CXL_HDM_COUNT, cap); for (i = 0; i < hdm_count; i++) { rc = cxl_dvsec_mem_range_valid(cxlds, i); if (rc) @@ -186,16 +186,16 @@ static int cxl_set_mem_enable(struct cxl_dev_state *cxlds, u16 val) u16 ctrl; int rc; - rc = pci_read_config_word(pdev, d + CXL_DVSEC_CTRL_OFFSET, &ctrl); + rc = pci_read_config_word(pdev, d + PCI_DVSEC_CXL_CTRL, &ctrl); if (rc < 0) return rc; - if ((ctrl & CXL_DVSEC_MEM_ENABLE) == val) + if ((ctrl & PCI_DVSEC_CXL_MEM_ENABLE) == val) return 1; - ctrl &= ~CXL_DVSEC_MEM_ENABLE; + ctrl &= ~PCI_DVSEC_CXL_MEM_ENABLE; ctrl |= val; - rc = pci_write_config_word(pdev, d + CXL_DVSEC_CTRL_OFFSET, ctrl); + rc = pci_write_config_word(pdev, d + PCI_DVSEC_CXL_CTRL, ctrl); if (rc < 0) return rc; @@ -211,7 +211,7 @@ static int devm_cxl_enable_mem(struct device *host, struct cxl_dev_state *cxlds) { int rc; - rc = cxl_set_mem_enable(cxlds, CXL_DVSEC_MEM_ENABLE); + rc = cxl_set_mem_enable(cxlds, PCI_DVSEC_CXL_MEM_ENABLE); if (rc < 0) return rc; if (rc > 0) @@ -273,11 +273,11 @@ int cxl_dvsec_rr_decode(struct cxl_dev_state *cxlds, return -ENXIO; } - rc = pci_read_config_word(pdev, d + CXL_DVSEC_CAP_OFFSET, &cap); + rc = pci_read_config_word(pdev, d + PCI_DVSEC_CXL_CAP, &cap); if (rc) return rc; - if (!(cap & CXL_DVSEC_MEM_CAPABLE)) { + if (!(cap & PCI_DVSEC_CXL_MEM_CAPABLE)) { dev_dbg(dev, "Not MEM Capable\n"); return -ENXIO; } @@ -288,7 +288,7 @@ int cxl_dvsec_rr_decode(struct cxl_dev_state *cxlds, * driver is for a spec defined class code which must be CXL.mem * capable, there is no point in continuing to enable CXL.mem. */ - hdm_count = FIELD_GET(CXL_DVSEC_HDM_COUNT_MASK, cap); + hdm_count = FIELD_GET(PCI_DVSEC_CXL_HDM_COUNT, cap); if (!hdm_count || hdm_count > 2) return -EINVAL; @@ -297,11 +297,11 @@ int cxl_dvsec_rr_decode(struct cxl_dev_state *cxlds, * disabled, and they will remain moot after the HDM Decoder * capability is enabled. */ - rc = pci_read_config_word(pdev, d + CXL_DVSEC_CTRL_OFFSET, &ctrl); + rc = pci_read_config_word(pdev, d + PCI_DVSEC_CXL_CTRL, &ctrl); if (rc) return rc; - info->mem_enabled = FIELD_GET(CXL_DVSEC_MEM_ENABLE, ctrl); + info->mem_enabled = FIELD_GET(PCI_DVSEC_CXL_MEM_ENABLE, ctrl); if (!info->mem_enabled) return 0; @@ -314,35 +314,35 @@ int cxl_dvsec_rr_decode(struct cxl_dev_state *cxlds, return rc; rc = pci_read_config_dword( - pdev, d + CXL_DVSEC_RANGE_SIZE_HIGH(i), &temp); + pdev, d + PCI_DVSEC_CXL_RANGE_SIZE_HIGH(i), &temp); if (rc) return rc; size = (u64)temp << 32; rc = pci_read_config_dword( - pdev, d + CXL_DVSEC_RANGE_SIZE_LOW(i), &temp); + pdev, d + PCI_DVSEC_CXL_RANGE_SIZE_LOW(i), &temp); if (rc) return rc; - size |= temp & CXL_DVSEC_MEM_SIZE_LOW_MASK; + size |= temp & PCI_DVSEC_CXL_MEM_SIZE_LOW; if (!size) { continue; } rc = pci_read_config_dword( - pdev, d + CXL_DVSEC_RANGE_BASE_HIGH(i), &temp); + pdev, d + PCI_DVSEC_CXL_RANGE_BASE_HIGH(i), &temp); if (rc) return rc; base = (u64)temp << 32; rc = pci_read_config_dword( - pdev, d + CXL_DVSEC_RANGE_BASE_LOW(i), &temp); + pdev, d + PCI_DVSEC_CXL_RANGE_BASE_LOW(i), &temp); if (rc) return rc; - base |= temp & CXL_DVSEC_MEM_BASE_LOW_MASK; + base |= temp & PCI_DVSEC_CXL_MEM_BASE_LOW; info->dvsec_range[ranges++] = (struct range) { .start = base, @@ -1068,7 +1068,7 @@ u16 cxl_gpf_get_dvsec(struct device *dev) is_port = false; dvsec = pci_find_dvsec_capability(pdev, PCI_VENDOR_ID_CXL, - is_port ? CXL_DVSEC_PORT_GPF : CXL_DVSEC_DEVICE_GPF); + is_port ? PCI_DVSEC_CXL_PORT_GPF : PCI_DVSEC_CXL_DEVICE_GPF); if (!dvsec) dev_warn(dev, "%s GPF DVSEC not present\n", is_port ? "Port" : "Device"); @@ -1084,14 +1084,14 @@ static int update_gpf_port_dvsec(struct pci_dev *pdev, int dvsec, int phase) switch (phase) { case 1: - offset = CXL_DVSEC_PORT_GPF_PHASE_1_CONTROL_OFFSET; - base = CXL_DVSEC_PORT_GPF_PHASE_1_TMO_BASE_MASK; - scale = CXL_DVSEC_PORT_GPF_PHASE_1_TMO_SCALE_MASK; + offset = PCI_DVSEC_CXL_PORT_GPF_PHASE_1_CONTROL; + base = PCI_DVSEC_CXL_PORT_GPF_PHASE_1_TMO_BASE; + scale = PCI_DVSEC_CXL_PORT_GPF_PHASE_1_TMO_SCALE; break; case 2: - offset = CXL_DVSEC_PORT_GPF_PHASE_2_CONTROL_OFFSET; - base = CXL_DVSEC_PORT_GPF_PHASE_2_TMO_BASE_MASK; - scale = CXL_DVSEC_PORT_GPF_PHASE_2_TMO_SCALE_MASK; + offset = PCI_DVSEC_CXL_PORT_GPF_PHASE_2_CONTROL; + base = PCI_DVSEC_CXL_PORT_GPF_PHASE_2_TMO_BASE; + scale = PCI_DVSEC_CXL_PORT_GPF_PHASE_2_TMO_SCALE; break; default: return -EINVAL; diff --git a/drivers/cxl/core/regs.c b/drivers/cxl/core/regs.c index 5ca7b0eed568..a010b3214342 100644 --- a/drivers/cxl/core/regs.c +++ b/drivers/cxl/core/regs.c @@ -271,10 +271,10 @@ EXPORT_SYMBOL_NS_GPL(cxl_map_device_regs, "CXL"); static bool cxl_decode_regblock(struct pci_dev *pdev, u32 reg_lo, u32 reg_hi, struct cxl_register_map *map) { - u8 reg_type = FIELD_GET(CXL_DVSEC_REG_LOCATOR_BLOCK_ID_MASK, reg_lo); - int bar = FIELD_GET(CXL_DVSEC_REG_LOCATOR_BIR_MASK, reg_lo); + u8 reg_type = FIELD_GET(PCI_DVSEC_CXL_REG_LOCATOR_BLOCK_ID, reg_lo); + int bar = FIELD_GET(PCI_DVSEC_CXL_REG_LOCATOR_BIR, reg_lo); u64 offset = ((u64)reg_hi << 32) | - (reg_lo & CXL_DVSEC_REG_LOCATOR_BLOCK_OFF_LOW_MASK); + (reg_lo & PCI_DVSEC_CXL_REG_LOCATOR_BLOCK_OFF_LOW); if (offset > pci_resource_len(pdev, bar)) { dev_warn(&pdev->dev, @@ -311,15 +311,15 @@ static int __cxl_find_regblock_instance(struct pci_dev *pdev, enum cxl_regloc_ty }; regloc = pci_find_dvsec_capability(pdev, PCI_VENDOR_ID_CXL, - CXL_DVSEC_REG_LOCATOR); + PCI_DVSEC_CXL_REG_LOCATOR); if (!regloc) return -ENXIO; pci_read_config_dword(pdev, regloc + PCI_DVSEC_HEADER1, ®loc_size); - regloc_size = FIELD_GET(PCI_DVSEC_HEADER1_LENGTH_MASK, regloc_size); + regloc_size = PCI_DVSEC_HEADER1_LEN(regloc_size); - regloc += CXL_DVSEC_REG_LOCATOR_BLOCK1_OFFSET; - regblocks = (regloc_size - CXL_DVSEC_REG_LOCATOR_BLOCK1_OFFSET) / 8; + regloc += PCI_DVSEC_CXL_REG_LOCATOR_BLOCK1; + regblocks = (regloc_size - PCI_DVSEC_CXL_REG_LOCATOR_BLOCK1) / 8; for (i = 0; i < regblocks; i++, regloc += 8) { u32 reg_lo, reg_hi; diff --git a/drivers/cxl/pci.c b/drivers/cxl/pci.c index 0be4e508affe..b7f694bda913 100644 --- a/drivers/cxl/pci.c +++ b/drivers/cxl/pci.c @@ -933,7 +933,7 @@ static int cxl_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id) cxlds->rcd = is_cxl_restricted(pdev); cxlds->serial = pci_get_dsn(pdev); cxlds->cxl_dvsec = pci_find_dvsec_capability( - pdev, PCI_VENDOR_ID_CXL, CXL_DVSEC_PCIE_DEVICE); + pdev, PCI_VENDOR_ID_CXL, PCI_DVSEC_CXL_DEVICE); if (!cxlds->cxl_dvsec) dev_warn(&pdev->dev, "Device DVSEC not present, skip CXL.mem init\n"); diff --git a/include/uapi/linux/pci_regs.h b/include/uapi/linux/pci_regs.h index 6c4b6f19b18e..662582bdccf0 100644 --- a/include/uapi/linux/pci_regs.h +++ b/include/uapi/linux/pci_regs.h @@ -1333,63 +1333,57 @@ #define PCI_IDE_SEL_ADDR_3(x) (28 + (x) * PCI_IDE_SEL_ADDR_BLOCK_SIZE) #define PCI_IDE_SEL_BLOCK_SIZE(nr_assoc) (20 + PCI_IDE_SEL_ADDR_BLOCK_SIZE * (nr_assoc)) -/* Compute Express Link (CXL r3.1, sec 8.1.5) */ -#define PCI_DVSEC_CXL_PORT 3 -#define PCI_DVSEC_CXL_PORT_CTL 0x0c -#define PCI_DVSEC_CXL_PORT_CTL_UNMASK_SBR 0x00000001 - /* - * Compute Express Link (CXL r3.2, sec 8.1) + * Compute Express Link (CXL r4.0, sec 8.1) * * Note that CXL DVSEC id 3 and 7 to be ignored when the CXL link state - * is "disconnected" (CXL r3.2, sec 9.12.3). Re-enumerate these + * is "disconnected" (CXL r4.0, sec 9.12.3). Re-enumerate these * registers on downstream link-up events. */ -#define PCI_DVSEC_HEADER1_LENGTH_MASK __GENMASK(31, 20) - -/* CXL 3.2 8.1.3: PCIe DVSEC for CXL Device */ -#define CXL_DVSEC_PCIE_DEVICE 0 -#define CXL_DVSEC_CAP_OFFSET 0xA -#define CXL_DVSEC_MEM_CAPABLE _BITUL(2) -#define CXL_DVSEC_HDM_COUNT_MASK __GENMASK(5, 4) -#define CXL_DVSEC_CTRL_OFFSET 0xC -#define CXL_DVSEC_MEM_ENABLE _BITUL(2) -#define CXL_DVSEC_RANGE_SIZE_HIGH(i) (0x18 + (i * 0x10)) -#define CXL_DVSEC_RANGE_SIZE_LOW(i) (0x1C + (i * 0x10)) -#define CXL_DVSEC_MEM_INFO_VALID _BITUL(0) -#define CXL_DVSEC_MEM_ACTIVE _BITUL(1) -#define CXL_DVSEC_MEM_SIZE_LOW_MASK __GENMASK(31, 28) -#define CXL_DVSEC_RANGE_BASE_HIGH(i) (0x20 + (i * 0x10)) -#define CXL_DVSEC_RANGE_BASE_LOW(i) (0x24 + (i * 0x10)) -#define CXL_DVSEC_MEM_BASE_LOW_MASK __GENMASK(31, 28) + +/* CXL r4.0, 8.1.3: PCIe DVSEC for CXL Device */ +#define PCI_DVSEC_CXL_DEVICE 0 +#define PCI_DVSEC_CXL_CAP 0xA +#define PCI_DVSEC_CXL_MEM_CAPABLE _BITUL(2) +#define PCI_DVSEC_CXL_HDM_COUNT __GENMASK(5, 4) +#define PCI_DVSEC_CXL_CTRL 0xC +#define PCI_DVSEC_CXL_MEM_ENABLE _BITUL(2) +#define PCI_DVSEC_CXL_RANGE_SIZE_HIGH(i) (0x18 + (i * 0x10)) +#define PCI_DVSEC_CXL_RANGE_SIZE_LOW(i) (0x1C + (i * 0x10)) +#define PCI_DVSEC_CXL_MEM_INFO_VALID _BITUL(0) +#define PCI_DVSEC_CXL_MEM_ACTIVE _BITUL(1) +#define PCI_DVSEC_CXL_MEM_SIZE_LOW __GENMASK(31, 28) +#define PCI_DVSEC_CXL_RANGE_BASE_HIGH(i) (0x20 + (i * 0x10)) +#define PCI_DVSEC_CXL_RANGE_BASE_LOW(i) (0x24 + (i * 0x10)) +#define PCI_DVSEC_CXL_MEM_BASE_LOW __GENMASK(31, 28) #define CXL_DVSEC_RANGE_MAX 2 -/* CXL 3.2 8.1.4: Non-CXL Function Map DVSEC */ -#define CXL_DVSEC_FUNCTION_MAP 2 - -/* CXL 3.2 8.1.5: Extensions DVSEC for Ports */ -#define CXL_DVSEC_PORT 3 -#define CXL_DVSEC_PORT_CTL 0x0c -#define CXL_DVSEC_PORT_CTL_UNMASK_SBR 0x00000001 - -/* CXL 3.2 8.1.6: GPF DVSEC for CXL Port */ -#define CXL_DVSEC_PORT_GPF 4 -#define CXL_DVSEC_PORT_GPF_PHASE_1_CONTROL_OFFSET 0x0C -#define CXL_DVSEC_PORT_GPF_PHASE_1_TMO_BASE_MASK __GENMASK(3, 0) -#define CXL_DVSEC_PORT_GPF_PHASE_1_TMO_SCALE_MASK __GENMASK(11, 8) -#define CXL_DVSEC_PORT_GPF_PHASE_2_CONTROL_OFFSET 0xE -#define CXL_DVSEC_PORT_GPF_PHASE_2_TMO_BASE_MASK __GENMASK(3, 0) -#define CXL_DVSEC_PORT_GPF_PHASE_2_TMO_SCALE_MASK __GENMASK(11, 8) - -/* CXL 3.2 8.1.7: GPF DVSEC for CXL Device */ -#define CXL_DVSEC_DEVICE_GPF 5 - -/* CXL 3.2 8.1.9: Register Locator DVSEC */ -#define CXL_DVSEC_REG_LOCATOR 8 -#define CXL_DVSEC_REG_LOCATOR_BLOCK1_OFFSET 0xC -#define CXL_DVSEC_REG_LOCATOR_BIR_MASK __GENMASK(2, 0) -#define CXL_DVSEC_REG_LOCATOR_BLOCK_ID_MASK __GENMASK(15, 8) -#define CXL_DVSEC_REG_LOCATOR_BLOCK_OFF_LOW_MASK __GENMASK(31, 16) +/* CXL r4.0, 8.1.4: Non-CXL Function Map DVSEC */ +#define PCI_DVSEC_CXL_FUNCTION_MAP 2 + +/* CXL r4.0, 8.1.5: Extensions DVSEC for Ports */ +#define PCI_DVSEC_CXL_PORT 3 +#define PCI_DVSEC_CXL_PORT_CTL 0x0c +#define PCI_DVSEC_CXL_PORT_CTL_UNMASK_SBR 0x00000001 + +/* CXL r4.0, 8.1.6: GPF DVSEC for CXL Port */ +#define PCI_DVSEC_CXL_PORT_GPF 4 +#define PCI_DVSEC_CXL_PORT_GPF_PHASE_1_CONTROL 0x0C +#define PCI_DVSEC_CXL_PORT_GPF_PHASE_1_TMO_BASE __GENMASK(3, 0) +#define PCI_DVSEC_CXL_PORT_GPF_PHASE_1_TMO_SCALE __GENMASK(11, 8) +#define PCI_DVSEC_CXL_PORT_GPF_PHASE_2_CONTROL 0xE +#define PCI_DVSEC_CXL_PORT_GPF_PHASE_2_TMO_BASE __GENMASK(3, 0) +#define PCI_DVSEC_CXL_PORT_GPF_PHASE_2_TMO_SCALE __GENMASK(11, 8) + +/* CXL r4.0, 8.1.7: GPF DVSEC for CXL Device */ +#define PCI_DVSEC_CXL_DEVICE_GPF 5 + +/* CXL r4.0, 8.1.9: Register Locator DVSEC */ +#define PCI_DVSEC_CXL_REG_LOCATOR 8 +#define PCI_DVSEC_CXL_REG_LOCATOR_BLOCK1 0xC +#define PCI_DVSEC_CXL_REG_LOCATOR_BIR __GENMASK(2, 0) +#define PCI_DVSEC_CXL_REG_LOCATOR_BLOCK_ID __GENMASK(15, 8) +#define PCI_DVSEC_CXL_REG_LOCATOR_BLOCK_OFF_LOW __GENMASK(31, 16) #endif /* LINUX_PCI_REGS_H */ -- cgit v1.2.3 From 7c29ba02210c6e4570cdce53813a1ae68fb6d049 Mon Sep 17 00:00:00 2001 From: Terry Bowman Date: Wed, 14 Jan 2026 12:20:24 -0600 Subject: PCI: Introduce pcie_is_cxl() CXL is a protocol that runs on top of PCIe electricals. Its error model also runs on top of the PCIe AER error model by standardizing "internal" errors as "CXL" errors. Linux has historically ignored internal errors. CXL protocol error handling is then a task of enhancing the PCIe AER core to understand that PCIe ports (upstream and downstream) and endpoints may throw internal errors that represent standard CXL protocol errors. The proposed method to make that determination is to teach 'struct pci_dev' to cache when its link has trained the CXL.mem and/or CXL.cache protocols and then treat all internal errors as CXL errors. A design goal is to not burden the PCIe AER core with CXL knowledge beyond just enough to forward error notifications to the CXL RAS core. The forwarded notification looks up a 'struct cxl_port' or 'struct cxl_dport' companion device to the PCI device. Introduce set_pcie_cxl() with logic checking for CXL.mem or CXL.cache status in the CXL Flex Bus DVSEC status register. The CXL Flex Bus DVSEC presence is used because it is required for all the CXL PCIe devices.[1] [1] CXL 3.1 Spec, 8.1.1 PCIe Designated Vendor-Specific Extended Capability (DVSEC) ID Assignment, Table 8-2 Signed-off-by: Terry Bowman Reviewed-by: Ira Weiny Reviewed-by: Kuppuswamy Sathyanarayanan Reviewed-by: Dave Jiang Reviewed-by: Jonathan Cameron Reviewed-by: Alejandro Lucero Reviewed-by: Ben Cheatham Reviewed-by: Dan Williams Acked-by: Bjorn Helgaas Link: https://patch.msgid.link/20260114182055.46029-4-terry.bowman@amd.com Signed-off-by: Dan Williams Signed-off-by: Dave Jiang --- drivers/pci/probe.c | 31 +++++++++++++++++++++++++++++++ include/linux/pci.h | 6 ++++++ include/uapi/linux/pci_regs.h | 6 ++++++ 3 files changed, 43 insertions(+) (limited to 'include/uapi/linux') diff --git a/drivers/pci/probe.c b/drivers/pci/probe.c index 41183aed8f5d..bd7ce41d0c7a 100644 --- a/drivers/pci/probe.c +++ b/drivers/pci/probe.c @@ -1735,6 +1735,35 @@ static void set_pcie_thunderbolt(struct pci_dev *dev) dev->is_thunderbolt = 1; } +static void set_pcie_cxl(struct pci_dev *dev) +{ + struct pci_dev *bridge; + u16 dvsec, cap; + + if (!pci_is_pcie(dev)) + return; + + /* + * Update parent's CXL state because alternate protocol training + * may have changed + */ + bridge = pci_upstream_bridge(dev); + if (bridge) + set_pcie_cxl(bridge); + + dvsec = pci_find_dvsec_capability(dev, PCI_VENDOR_ID_CXL, + PCI_DVSEC_CXL_FLEXBUS_PORT); + if (!dvsec) + return; + + pci_read_config_word(dev, dvsec + PCI_DVSEC_CXL_FLEXBUS_PORT_STATUS, + &cap); + + dev->is_cxl = FIELD_GET(PCI_DVSEC_CXL_FLEXBUS_PORT_STATUS_CACHE, cap) || + FIELD_GET(PCI_DVSEC_CXL_FLEXBUS_PORT_STATUS_MEM, cap); + +} + static void set_pcie_untrusted(struct pci_dev *dev) { struct pci_dev *parent = pci_upstream_bridge(dev); @@ -2065,6 +2094,8 @@ int pci_setup_device(struct pci_dev *dev) /* Need to have dev->cfg_size ready */ set_pcie_thunderbolt(dev); + set_pcie_cxl(dev); + set_pcie_untrusted(dev); if (pci_is_pcie(dev)) diff --git a/include/linux/pci.h b/include/linux/pci.h index 864775651c6f..f8e8b3df794d 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -463,6 +463,7 @@ struct pci_dev { unsigned int is_pciehp:1; unsigned int shpc_managed:1; /* SHPC owned by shpchp */ unsigned int is_thunderbolt:1; /* Thunderbolt controller */ + unsigned int is_cxl:1; /* Compute Express Link (CXL) */ /* * Devices marked being untrusted are the ones that can potentially * execute DMA attacks and similar. They are typically connected @@ -791,6 +792,11 @@ static inline bool pci_is_display(struct pci_dev *pdev) return (pdev->class >> 16) == PCI_BASE_CLASS_DISPLAY; } +static inline bool pcie_is_cxl(struct pci_dev *pci_dev) +{ + return pci_dev->is_cxl; +} + #define for_each_pci_bridge(dev, bus) \ list_for_each_entry(dev, &bus->devices, bus_list) \ if (!pci_is_bridge(dev)) {} else diff --git a/include/uapi/linux/pci_regs.h b/include/uapi/linux/pci_regs.h index 662582bdccf0..b6622fd60fd9 100644 --- a/include/uapi/linux/pci_regs.h +++ b/include/uapi/linux/pci_regs.h @@ -1379,6 +1379,12 @@ /* CXL r4.0, 8.1.7: GPF DVSEC for CXL Device */ #define PCI_DVSEC_CXL_DEVICE_GPF 5 +/* CXL r4.0, 8.1.8: Flex Bus DVSEC */ +#define PCI_DVSEC_CXL_FLEXBUS_PORT 7 +#define PCI_DVSEC_CXL_FLEXBUS_PORT_STATUS 0xE +#define PCI_DVSEC_CXL_FLEXBUS_PORT_STATUS_CACHE _BITUL(0) +#define PCI_DVSEC_CXL_FLEXBUS_PORT_STATUS_MEM _BITUL(2) + /* CXL r4.0, 8.1.9: Register Locator DVSEC */ #define PCI_DVSEC_CXL_REG_LOCATOR 8 #define PCI_DVSEC_CXL_REG_LOCATOR_BLOCK1 0xC -- cgit v1.2.3 From 5247c034a67f5a93cc1faa15e9867eec5b22f38a Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Tue, 20 Jan 2026 20:47:40 +0000 Subject: io_uring: introduce non-circular SQ Outside of SQPOLL, normally SQ entries are consumed by the time the submission syscall returns. For those cases we don't need a circular buffer and the head/tail tracking, instead the kernel can assume that entries always start from the beginning of the SQ at index 0. This patch introduces a setup flag doing exactly that. It's a simpler and helps to keeps SQEs hot in cache. The feature is optional and enabled by setting IORING_SETUP_SQ_REWIND. The flag is rejected if passed together with SQPOLL as it'd require waiting for SQ before each submission. It also requires IORING_SETUP_NO_SQARRAY, which can be supported but it's unlikely there will be users, so leave more space for future optimisations. Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- include/uapi/linux/io_uring.h | 12 ++++++++++++ io_uring/io_uring.c | 29 ++++++++++++++++++++++------- io_uring/io_uring.h | 3 ++- 3 files changed, 36 insertions(+), 8 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h index b5b23c0d5283..475094c7a668 100644 --- a/include/uapi/linux/io_uring.h +++ b/include/uapi/linux/io_uring.h @@ -237,6 +237,18 @@ enum io_uring_sqe_flags_bit { */ #define IORING_SETUP_SQE_MIXED (1U << 19) +/* + * When set, io_uring ignores SQ head and tail and fetches SQEs to submit + * starting from index 0 instead from the index stored in the head pointer. + * IOW, the user should place all SQE at the beginning of the SQ memory + * before issuing a submission syscall. + * + * It requires IORING_SETUP_NO_SQARRAY and is incompatible with + * IORING_SETUP_SQPOLL. The user must also never change the SQ head and tail + * values and keep it set to 0. Any other value is undefined behaviour. + */ +#define IORING_SETUP_SQ_REWIND (1U << 20) + enum io_uring_op { IORING_OP_NOP, IORING_OP_READV, diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index a50459238bee..0f88ec74e55d 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -1945,12 +1945,16 @@ static void io_commit_sqring(struct io_ring_ctx *ctx) { struct io_rings *rings = ctx->rings; - /* - * Ensure any loads from the SQEs are done at this point, - * since once we write the new head, the application could - * write new data to them. - */ - smp_store_release(&rings->sq.head, ctx->cached_sq_head); + if (ctx->flags & IORING_SETUP_SQ_REWIND) { + ctx->cached_sq_head = 0; + } else { + /* + * Ensure any loads from the SQEs are done at this point, + * since once we write the new head, the application could + * write new data to them. + */ + smp_store_release(&rings->sq.head, ctx->cached_sq_head); + } } /* @@ -1996,10 +2000,15 @@ static bool io_get_sqe(struct io_ring_ctx *ctx, const struct io_uring_sqe **sqe) int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr) __must_hold(&ctx->uring_lock) { - unsigned int entries = io_sqring_entries(ctx); + unsigned int entries; unsigned int left; int ret; + if (ctx->flags & IORING_SETUP_SQ_REWIND) + entries = ctx->sq_entries; + else + entries = io_sqring_entries(ctx); + entries = min(nr, entries); if (unlikely(!entries)) return 0; @@ -2728,6 +2737,12 @@ static int io_uring_sanitise_params(struct io_uring_params *p) if (flags & ~IORING_SETUP_FLAGS) return -EINVAL; + if (flags & IORING_SETUP_SQ_REWIND) { + if ((flags & IORING_SETUP_SQPOLL) || + !(flags & IORING_SETUP_NO_SQARRAY)) + return -EINVAL; + } + /* There is no way to mmap rings without a real fd */ if ((flags & IORING_SETUP_REGISTERED_FD_ONLY) && !(flags & IORING_SETUP_NO_MMAP)) diff --git a/io_uring/io_uring.h b/io_uring/io_uring.h index 29b8f90fdabf..acdc39b9f8d6 100644 --- a/io_uring/io_uring.h +++ b/io_uring/io_uring.h @@ -69,7 +69,8 @@ struct io_ctx_config { IORING_SETUP_NO_SQARRAY |\ IORING_SETUP_HYBRID_IOPOLL |\ IORING_SETUP_CQE_MIXED |\ - IORING_SETUP_SQE_MIXED) + IORING_SETUP_SQE_MIXED |\ + IORING_SETUP_SQ_REWIND) #define IORING_ENTER_FLAGS (IORING_ENTER_GETEVENTS |\ IORING_ENTER_SQ_WAKEUP |\ -- cgit v1.2.3 From e86f89ab24f5ec595879a01eebb5df84f5ed6d2b Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Fri, 16 Jan 2026 22:18:36 +0800 Subject: ublk: add new batch command UBLK_U_IO_PREP_IO_CMDS & UBLK_U_IO_COMMIT_IO_CMDS Add new command UBLK_U_IO_PREP_IO_CMDS, which is the batch version of UBLK_IO_FETCH_REQ. Add new command UBLK_U_IO_COMMIT_IO_CMDS, which is for committing io command result only, still the batch version. The new command header type is `struct ublk_batch_io`. This patch doesn't actually implement these commands yet, just validates the SQE fields. Reviewed-by: Caleb Sander Mateos Signed-off-by: Ming Lei Signed-off-by: Jens Axboe --- drivers/block/ublk_drv.c | 87 ++++++++++++++++++++++++++++++++++++++++++- include/uapi/linux/ublk_cmd.h | 49 ++++++++++++++++++++++++ 2 files changed, 135 insertions(+), 1 deletion(-) (limited to 'include/uapi/linux') diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c index 0f9fcd16258b..22c7296d90f3 100644 --- a/drivers/block/ublk_drv.c +++ b/drivers/block/ublk_drv.c @@ -91,6 +91,11 @@ UBLK_PARAM_TYPE_DMA_ALIGN | UBLK_PARAM_TYPE_SEGMENT | \ UBLK_PARAM_TYPE_INTEGRITY) +#define UBLK_BATCH_F_ALL \ + (UBLK_BATCH_F_HAS_ZONE_LBA | \ + UBLK_BATCH_F_HAS_BUF_ADDR | \ + UBLK_BATCH_F_AUTO_BUF_REG_FALLBACK) + struct ublk_uring_cmd_pdu { /* * Store requests in same batch temporarily for queuing them to @@ -114,6 +119,13 @@ struct ublk_uring_cmd_pdu { u16 tag; }; +struct ublk_batch_io_data { + struct ublk_device *ub; + struct io_uring_cmd *cmd; + struct ublk_batch_io header; + unsigned int issue_flags; +}; + /* * io command is active: sqe cmd is received, and its cqe isn't done * @@ -2687,10 +2699,83 @@ static int ublk_ch_uring_cmd(struct io_uring_cmd *cmd, unsigned int issue_flags) return ublk_ch_uring_cmd_local(cmd, issue_flags); } +static int ublk_check_batch_cmd_flags(const struct ublk_batch_io *uc) +{ + unsigned elem_bytes = sizeof(struct ublk_elem_header); + + if (uc->flags & ~UBLK_BATCH_F_ALL) + return -EINVAL; + + /* UBLK_BATCH_F_AUTO_BUF_REG_FALLBACK requires buffer index */ + if ((uc->flags & UBLK_BATCH_F_AUTO_BUF_REG_FALLBACK) && + (uc->flags & UBLK_BATCH_F_HAS_BUF_ADDR)) + return -EINVAL; + + elem_bytes += (uc->flags & UBLK_BATCH_F_HAS_ZONE_LBA ? sizeof(u64) : 0) + + (uc->flags & UBLK_BATCH_F_HAS_BUF_ADDR ? sizeof(u64) : 0); + if (uc->elem_bytes != elem_bytes) + return -EINVAL; + return 0; +} + +static int ublk_check_batch_cmd(const struct ublk_batch_io_data *data) +{ + + const struct ublk_batch_io *uc = &data->header; + + if (uc->nr_elem > data->ub->dev_info.queue_depth) + return -E2BIG; + + if ((uc->flags & UBLK_BATCH_F_HAS_ZONE_LBA) && + !ublk_dev_is_zoned(data->ub)) + return -EINVAL; + + if ((uc->flags & UBLK_BATCH_F_HAS_BUF_ADDR) && + !ublk_dev_need_map_io(data->ub)) + return -EINVAL; + + if ((uc->flags & UBLK_BATCH_F_AUTO_BUF_REG_FALLBACK) && + !ublk_dev_support_auto_buf_reg(data->ub)) + return -EINVAL; + + return ublk_check_batch_cmd_flags(uc); +} + static int ublk_ch_batch_io_uring_cmd(struct io_uring_cmd *cmd, unsigned int issue_flags) { - return -EOPNOTSUPP; + const struct ublk_batch_io *uc = io_uring_sqe_cmd(cmd->sqe); + struct ublk_device *ub = cmd->file->private_data; + struct ublk_batch_io_data data = { + .ub = ub, + .cmd = cmd, + .header = (struct ublk_batch_io) { + .q_id = READ_ONCE(uc->q_id), + .flags = READ_ONCE(uc->flags), + .nr_elem = READ_ONCE(uc->nr_elem), + .elem_bytes = READ_ONCE(uc->elem_bytes), + }, + .issue_flags = issue_flags, + }; + u32 cmd_op = cmd->cmd_op; + int ret = -EINVAL; + + if (data.header.q_id >= ub->dev_info.nr_hw_queues) + goto out; + + switch (cmd_op) { + case UBLK_U_IO_PREP_IO_CMDS: + case UBLK_U_IO_COMMIT_IO_CMDS: + ret = ublk_check_batch_cmd(&data); + if (ret) + goto out; + ret = -EOPNOTSUPP; + break; + default: + ret = -EOPNOTSUPP; + } +out: + return ret; } static inline bool ublk_check_ubuf_dir(const struct request *req, diff --git a/include/uapi/linux/ublk_cmd.h b/include/uapi/linux/ublk_cmd.h index 90f47da4f435..0cc58e19d401 100644 --- a/include/uapi/linux/ublk_cmd.h +++ b/include/uapi/linux/ublk_cmd.h @@ -103,6 +103,10 @@ _IOWR('u', 0x23, struct ublksrv_io_cmd) #define UBLK_U_IO_UNREGISTER_IO_BUF \ _IOWR('u', 0x24, struct ublksrv_io_cmd) +#define UBLK_U_IO_PREP_IO_CMDS \ + _IOWR('u', 0x25, struct ublk_batch_io) +#define UBLK_U_IO_COMMIT_IO_CMDS \ + _IOWR('u', 0x26, struct ublk_batch_io) /* only ABORT means that no re-fetch */ #define UBLK_IO_RES_OK 0 @@ -544,6 +548,51 @@ struct ublksrv_io_cmd { }; }; +struct ublk_elem_header { + __u16 tag; /* IO tag */ + + /* + * Buffer index for incoming io command, only valid iff + * UBLK_F_AUTO_BUF_REG is set + */ + __u16 buf_index; + __s32 result; /* I/O completion result (commit only) */ +}; + +/* + * uring_cmd buffer structure for batch commands + * + * buffer includes multiple elements, which number is specified by + * `nr_elem`. Each element buffer is organized in the following order: + * + * struct ublk_elem_buffer { + * // Mandatory fields (8 bytes) + * struct ublk_elem_header header; + * + * // Optional fields (8 bytes each, included based on flags) + * + * // Buffer address (if UBLK_BATCH_F_HAS_BUF_ADDR) for copying data + * // between ublk request and ublk server buffer + * __u64 buf_addr; + * + * // returned Zone append LBA (if UBLK_BATCH_F_HAS_ZONE_LBA) + * __u64 zone_lba; + * } + * + * Used for `UBLK_U_IO_PREP_IO_CMDS` and `UBLK_U_IO_COMMIT_IO_CMDS` + */ +struct ublk_batch_io { + __u16 q_id; +#define UBLK_BATCH_F_HAS_ZONE_LBA (1 << 0) +#define UBLK_BATCH_F_HAS_BUF_ADDR (1 << 1) +#define UBLK_BATCH_F_AUTO_BUF_REG_FALLBACK (1 << 2) + __u16 flags; + __u16 nr_elem; + __u8 elem_bytes; + __u8 reserved; + __u64 reserved2; +}; + struct ublk_param_basic { #define UBLK_ATTR_READ_ONLY (1 << 0) #define UBLK_ATTR_ROTATIONAL (1 << 1) -- cgit v1.2.3 From b256795b3606e9a67c725dde8eaae91dd9d21de4 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Fri, 16 Jan 2026 22:18:37 +0800 Subject: ublk: handle UBLK_U_IO_PREP_IO_CMDS This commit implements the handling of the UBLK_U_IO_PREP_IO_CMDS command, which allows userspace to prepare a batch of I/O requests. The core of this change is the `ublk_walk_cmd_buf` function, which iterates over the elements in the uring_cmd fixed buffer. For each element, it parses the I/O details, finds the corresponding `ublk_io` structure, and prepares it for future dispatch. Add per-io lock for protecting concurrent delivery and committing. Reviewed-by: Caleb Sander Mateos Signed-off-by: Ming Lei Signed-off-by: Jens Axboe --- drivers/block/ublk_drv.c | 191 +++++++++++++++++++++++++++++++++++++++++- include/uapi/linux/ublk_cmd.h | 5 ++ 2 files changed, 195 insertions(+), 1 deletion(-) (limited to 'include/uapi/linux') diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c index 22c7296d90f3..a3840b3f1081 100644 --- a/drivers/block/ublk_drv.c +++ b/drivers/block/ublk_drv.c @@ -208,6 +208,7 @@ struct ublk_io { unsigned task_registered_buffers; void *buf_ctx_handle; + spinlock_t lock; } ____cacheline_aligned_in_smp; struct ublk_queue { @@ -280,6 +281,16 @@ static inline bool ublk_dev_support_batch_io(const struct ublk_device *ub) return false; } +static inline void ublk_io_lock(struct ublk_io *io) +{ + spin_lock(&io->lock); +} + +static inline void ublk_io_unlock(struct ublk_io *io) +{ + spin_unlock(&io->lock); +} + static inline struct ublksrv_io_desc * ublk_get_iod(const struct ublk_queue *ubq, unsigned tag) { @@ -2699,6 +2710,171 @@ static int ublk_ch_uring_cmd(struct io_uring_cmd *cmd, unsigned int issue_flags) return ublk_ch_uring_cmd_local(cmd, issue_flags); } +static inline __u64 ublk_batch_buf_addr(const struct ublk_batch_io *uc, + const struct ublk_elem_header *elem) +{ + const void *buf = elem; + + if (uc->flags & UBLK_BATCH_F_HAS_BUF_ADDR) + return *(const __u64 *)(buf + sizeof(*elem)); + return 0; +} + +static struct ublk_auto_buf_reg +ublk_batch_auto_buf_reg(const struct ublk_batch_io *uc, + const struct ublk_elem_header *elem) +{ + struct ublk_auto_buf_reg reg = { + .index = elem->buf_index, + .flags = (uc->flags & UBLK_BATCH_F_AUTO_BUF_REG_FALLBACK) ? + UBLK_AUTO_BUF_REG_FALLBACK : 0, + }; + + return reg; +} + +/* + * 48 can hold any type of buffer element(8, 16 and 24 bytes) because + * it is the least common multiple(LCM) of 8, 16 and 24 + */ +#define UBLK_CMD_BATCH_TMP_BUF_SZ (48 * 10) +struct ublk_batch_io_iter { + void __user *uaddr; + unsigned done, total; + unsigned char elem_bytes; + /* copy to this buffer from user space */ + unsigned char buf[UBLK_CMD_BATCH_TMP_BUF_SZ]; +}; + +static inline int +__ublk_walk_cmd_buf(struct ublk_queue *ubq, + struct ublk_batch_io_iter *iter, + const struct ublk_batch_io_data *data, + unsigned bytes, + int (*cb)(struct ublk_queue *q, + const struct ublk_batch_io_data *data, + const struct ublk_elem_header *elem)) +{ + unsigned int i; + int ret = 0; + + for (i = 0; i < bytes; i += iter->elem_bytes) { + const struct ublk_elem_header *elem = + (const struct ublk_elem_header *)&iter->buf[i]; + + if (unlikely(elem->tag >= data->ub->dev_info.queue_depth)) { + ret = -EINVAL; + break; + } + + ret = cb(ubq, data, elem); + if (unlikely(ret)) + break; + } + + iter->done += i; + return ret; +} + +static int ublk_walk_cmd_buf(struct ublk_batch_io_iter *iter, + const struct ublk_batch_io_data *data, + int (*cb)(struct ublk_queue *q, + const struct ublk_batch_io_data *data, + const struct ublk_elem_header *elem)) +{ + struct ublk_queue *ubq = ublk_get_queue(data->ub, data->header.q_id); + int ret = 0; + + while (iter->done < iter->total) { + unsigned int len = min(sizeof(iter->buf), iter->total - iter->done); + + if (copy_from_user(iter->buf, iter->uaddr + iter->done, len)) { + pr_warn("ublk%d: read batch cmd buffer failed\n", + data->ub->dev_info.dev_id); + return -EFAULT; + } + + ret = __ublk_walk_cmd_buf(ubq, iter, data, len, cb); + if (ret) + return ret; + } + return 0; +} + +static int ublk_batch_unprep_io(struct ublk_queue *ubq, + const struct ublk_batch_io_data *data, + const struct ublk_elem_header *elem) +{ + struct ublk_io *io = &ubq->ios[elem->tag]; + + data->ub->nr_io_ready--; + ublk_io_lock(io); + io->flags = 0; + ublk_io_unlock(io); + return 0; +} + +static void ublk_batch_revert_prep_cmd(struct ublk_batch_io_iter *iter, + const struct ublk_batch_io_data *data) +{ + int ret; + + /* Re-process only what we've already processed, starting from beginning */ + iter->total = iter->done; + iter->done = 0; + + ret = ublk_walk_cmd_buf(iter, data, ublk_batch_unprep_io); + WARN_ON_ONCE(ret); +} + +static int ublk_batch_prep_io(struct ublk_queue *ubq, + const struct ublk_batch_io_data *data, + const struct ublk_elem_header *elem) +{ + struct ublk_io *io = &ubq->ios[elem->tag]; + const struct ublk_batch_io *uc = &data->header; + union ublk_io_buf buf = { 0 }; + int ret; + + if (ublk_dev_support_auto_buf_reg(data->ub)) + buf.auto_reg = ublk_batch_auto_buf_reg(uc, elem); + else if (ublk_dev_need_map_io(data->ub)) { + buf.addr = ublk_batch_buf_addr(uc, elem); + + ret = ublk_check_fetch_buf(data->ub, buf.addr); + if (ret) + return ret; + } + + ublk_io_lock(io); + ret = __ublk_fetch(data->cmd, data->ub, io); + if (!ret) + io->buf = buf; + ublk_io_unlock(io); + + return ret; +} + +static int ublk_handle_batch_prep_cmd(const struct ublk_batch_io_data *data) +{ + const struct ublk_batch_io *uc = &data->header; + struct io_uring_cmd *cmd = data->cmd; + struct ublk_batch_io_iter iter = { + .uaddr = u64_to_user_ptr(READ_ONCE(cmd->sqe->addr)), + .total = uc->nr_elem * uc->elem_bytes, + .elem_bytes = uc->elem_bytes, + }; + int ret; + + mutex_lock(&data->ub->mutex); + ret = ublk_walk_cmd_buf(&iter, data, ublk_batch_prep_io); + + if (ret && iter.done) + ublk_batch_revert_prep_cmd(&iter, data); + mutex_unlock(&data->ub->mutex); + return ret; +} + static int ublk_check_batch_cmd_flags(const struct ublk_batch_io *uc) { unsigned elem_bytes = sizeof(struct ublk_elem_header); @@ -2765,6 +2941,11 @@ static int ublk_ch_batch_io_uring_cmd(struct io_uring_cmd *cmd, switch (cmd_op) { case UBLK_U_IO_PREP_IO_CMDS: + ret = ublk_check_batch_cmd(&data); + if (ret) + goto out; + ret = ublk_handle_batch_prep_cmd(&data); + break; case UBLK_U_IO_COMMIT_IO_CMDS: ret = ublk_check_batch_cmd(&data); if (ret) @@ -2952,7 +3133,7 @@ static int ublk_init_queue(struct ublk_device *ub, int q_id) struct ublk_queue *ubq; struct page *page; int numa_node; - int size; + int size, i; /* Determine NUMA node based on queue's CPU affinity */ numa_node = ublk_get_queue_numa_node(ub, q_id); @@ -2977,6 +3158,9 @@ static int ublk_init_queue(struct ublk_device *ub, int q_id) } ubq->io_cmd_buf = page_address(page); + for (i = 0; i < ubq->q_depth; i++) + spin_lock_init(&ubq->ios[i].lock); + ub->queues[q_id] = ubq; ubq->dev = ub; return 0; @@ -3220,6 +3404,11 @@ static int ublk_ctrl_start_dev(struct ublk_device *ub, return -EINVAL; mutex_lock(&ub->mutex); + /* device may become not ready in case of F_BATCH */ + if (!ublk_dev_ready(ub)) { + ret = -EINVAL; + goto out_unlock; + } if (ub->dev_info.state == UBLK_S_DEV_LIVE || test_bit(UB_STATE_USED, &ub->state)) { ret = -EEXIST; diff --git a/include/uapi/linux/ublk_cmd.h b/include/uapi/linux/ublk_cmd.h index 0cc58e19d401..1a3d4d33c1d1 100644 --- a/include/uapi/linux/ublk_cmd.h +++ b/include/uapi/linux/ublk_cmd.h @@ -103,6 +103,11 @@ _IOWR('u', 0x23, struct ublksrv_io_cmd) #define UBLK_U_IO_UNREGISTER_IO_BUF \ _IOWR('u', 0x24, struct ublksrv_io_cmd) + +/* + * return 0 if the command is run successfully, otherwise failure code + * is returned + */ #define UBLK_U_IO_PREP_IO_CMDS \ _IOWR('u', 0x25, struct ublk_batch_io) #define UBLK_U_IO_COMMIT_IO_CMDS \ -- cgit v1.2.3 From 1e500e106d5a82280db59dba06f0108085beba65 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Fri, 16 Jan 2026 22:18:38 +0800 Subject: ublk: handle UBLK_U_IO_COMMIT_IO_CMDS Handle UBLK_U_IO_COMMIT_IO_CMDS by walking the uring_cmd fixed buffer: - read each element into one temp buffer in batch style - parse and apply each element for committing io result Reviewed-by: Caleb Sander Mateos Signed-off-by: Ming Lei Signed-off-by: Jens Axboe --- drivers/block/ublk_drv.c | 103 +++++++++++++++++++++++++++++++++++++++++- include/uapi/linux/ublk_cmd.h | 8 ++++ 2 files changed, 109 insertions(+), 2 deletions(-) (limited to 'include/uapi/linux') diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c index a3840b3f1081..162b46c74f16 100644 --- a/drivers/block/ublk_drv.c +++ b/drivers/block/ublk_drv.c @@ -2267,7 +2267,7 @@ static inline int ublk_set_auto_buf_reg(struct ublk_io *io, struct io_uring_cmd return 0; } -static int ublk_handle_auto_buf_reg(struct ublk_io *io, +static void ublk_clear_auto_buf_reg(struct ublk_io *io, struct io_uring_cmd *cmd, u16 *buf_idx) { @@ -2287,7 +2287,13 @@ static int ublk_handle_auto_buf_reg(struct ublk_io *io, if (io->buf_ctx_handle == io_uring_cmd_ctx_handle(cmd)) *buf_idx = io->buf.auto_reg.index; } +} +static int ublk_handle_auto_buf_reg(struct ublk_io *io, + struct io_uring_cmd *cmd, + u16 *buf_idx) +{ + ublk_clear_auto_buf_reg(io, cmd, buf_idx); return ublk_set_auto_buf_reg(io, cmd); } @@ -2720,6 +2726,17 @@ static inline __u64 ublk_batch_buf_addr(const struct ublk_batch_io *uc, return 0; } +static inline __u64 ublk_batch_zone_lba(const struct ublk_batch_io *uc, + const struct ublk_elem_header *elem) +{ + const void *buf = elem; + + if (uc->flags & UBLK_BATCH_F_HAS_ZONE_LBA) + return *(const __u64 *)(buf + sizeof(*elem) + + 8 * !!(uc->flags & UBLK_BATCH_F_HAS_BUF_ADDR)); + return -1; +} + static struct ublk_auto_buf_reg ublk_batch_auto_buf_reg(const struct ublk_batch_io *uc, const struct ublk_elem_header *elem) @@ -2875,6 +2892,84 @@ static int ublk_handle_batch_prep_cmd(const struct ublk_batch_io_data *data) return ret; } +static int ublk_batch_commit_io_check(const struct ublk_queue *ubq, + struct ublk_io *io, + union ublk_io_buf *buf) +{ + if (!(io->flags & UBLK_IO_FLAG_OWNED_BY_SRV)) + return -EBUSY; + + /* BATCH_IO doesn't support UBLK_F_NEED_GET_DATA */ + if (ublk_need_map_io(ubq) && !buf->addr) + return -EINVAL; + return 0; +} + +static int ublk_batch_commit_io(struct ublk_queue *ubq, + const struct ublk_batch_io_data *data, + const struct ublk_elem_header *elem) +{ + struct ublk_io *io = &ubq->ios[elem->tag]; + const struct ublk_batch_io *uc = &data->header; + u16 buf_idx = UBLK_INVALID_BUF_IDX; + union ublk_io_buf buf = { 0 }; + struct request *req = NULL; + bool auto_reg = false; + bool compl = false; + int ret; + + if (ublk_dev_support_auto_buf_reg(data->ub)) { + buf.auto_reg = ublk_batch_auto_buf_reg(uc, elem); + auto_reg = true; + } else if (ublk_dev_need_map_io(data->ub)) + buf.addr = ublk_batch_buf_addr(uc, elem); + + ublk_io_lock(io); + ret = ublk_batch_commit_io_check(ubq, io, &buf); + if (!ret) { + io->res = elem->result; + io->buf = buf; + req = ublk_fill_io_cmd(io, data->cmd); + + if (auto_reg) + ublk_clear_auto_buf_reg(io, data->cmd, &buf_idx); + compl = ublk_need_complete_req(data->ub, io); + } + ublk_io_unlock(io); + + if (unlikely(ret)) { + pr_warn_ratelimited("%s: dev %u queue %u io %u: commit failure %d\n", + __func__, data->ub->dev_info.dev_id, ubq->q_id, + elem->tag, ret); + return ret; + } + + /* can't touch 'ublk_io' any more */ + if (buf_idx != UBLK_INVALID_BUF_IDX) + io_buffer_unregister_bvec(data->cmd, buf_idx, data->issue_flags); + if (req_op(req) == REQ_OP_ZONE_APPEND) + req->__sector = ublk_batch_zone_lba(uc, elem); + if (compl) + __ublk_complete_rq(req, io, ublk_dev_need_map_io(data->ub)); + return 0; +} + +static int ublk_handle_batch_commit_cmd(const struct ublk_batch_io_data *data) +{ + const struct ublk_batch_io *uc = &data->header; + struct io_uring_cmd *cmd = data->cmd; + struct ublk_batch_io_iter iter = { + .uaddr = u64_to_user_ptr(READ_ONCE(cmd->sqe->addr)), + .total = uc->nr_elem * uc->elem_bytes, + .elem_bytes = uc->elem_bytes, + }; + int ret; + + ret = ublk_walk_cmd_buf(&iter, data, ublk_batch_commit_io); + + return iter.done == 0 ? ret : iter.done; +} + static int ublk_check_batch_cmd_flags(const struct ublk_batch_io *uc) { unsigned elem_bytes = sizeof(struct ublk_elem_header); @@ -2950,7 +3045,7 @@ static int ublk_ch_batch_io_uring_cmd(struct io_uring_cmd *cmd, ret = ublk_check_batch_cmd(&data); if (ret) goto out; - ret = -EOPNOTSUPP; + ret = ublk_handle_batch_commit_cmd(&data); break; default: ret = -EOPNOTSUPP; @@ -3659,6 +3754,10 @@ static int ublk_ctrl_add_dev(const struct ublksrv_ctrl_cmd *header) UBLK_F_AUTO_BUF_REG)) ub->dev_info.flags &= ~UBLK_F_NEED_GET_DATA; + /* UBLK_F_BATCH_IO doesn't support GET_DATA */ + if (ublk_dev_support_batch_io(ub)) + ub->dev_info.flags &= ~UBLK_F_NEED_GET_DATA; + /* * Zoned storage support requires reuse `ublksrv_io_cmd->addr` for * returning write_append_lba, which is only allowed in case of diff --git a/include/uapi/linux/ublk_cmd.h b/include/uapi/linux/ublk_cmd.h index 1a3d4d33c1d1..3894d676dd02 100644 --- a/include/uapi/linux/ublk_cmd.h +++ b/include/uapi/linux/ublk_cmd.h @@ -110,6 +110,14 @@ */ #define UBLK_U_IO_PREP_IO_CMDS \ _IOWR('u', 0x25, struct ublk_batch_io) +/* + * If failure code is returned, nothing in the command buffer is handled. + * Otherwise, the returned value means how many bytes in command buffer + * are handled actually, then number of handled IOs can be calculated with + * `elem_bytes` for each IO. IOs in the remained bytes are not committed, + * userspace has to check return value for dealing with partial committing + * correctly. + */ #define UBLK_U_IO_COMMIT_IO_CMDS \ _IOWR('u', 0x26, struct ublk_batch_io) -- cgit v1.2.3 From a4d88375539920b7401ead59d2f944ac23c668ea Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Fri, 16 Jan 2026 22:18:41 +0800 Subject: ublk: add UBLK_U_IO_FETCH_IO_CMDS for batch I/O processing Add UBLK_U_IO_FETCH_IO_CMDS command to enable efficient batch processing of I/O requests. This multishot uring_cmd allows the ublk server to fetch multiple I/O commands in a single operation, significantly reducing submission overhead compared to individual FETCH_REQ* commands. Key Design Features: 1. Multishot Operation: One UBLK_U_IO_FETCH_IO_CMDS can fetch many I/O commands, with the batch size limited by the provided buffer length. 2. Dynamic Load Balancing: Multiple fetch commands can be submitted simultaneously, but only one is active at any time. This enables efficient load distribution across multiple server task contexts. 3. Implicit State Management: The implementation uses three key variables to track state: - evts_fifo: Queue of request tags awaiting processing - fcmd_head: List of available fetch commands - active_fcmd: Currently active fetch command (NULL = none active) States are derived implicitly: - IDLE: No fetch commands available - READY: Fetch commands available, none active - ACTIVE: One fetch command processing events 4. Lockless Reader Optimization: The active fetch command can read from evts_fifo without locking (single reader guarantee), while writers (ublk_queue_rq/ublk_queue_rqs) use evts_lock protection. The memory barrier pairing plays key role for the single lockless reader optimization. Implementation Details: - ublk_queue_rq() and ublk_queue_rqs() save request tags to evts_fifo - __ublk_acquire_fcmd() selects an available fetch command when events arrive and no command is currently active - ublk_batch_dispatch() moves tags from evts_fifo to the fetch command's buffer and posts completion via io_uring_mshot_cmd_post_cqe() - State transitions are coordinated via evts_lock to maintain consistency Reviewed-by: Caleb Sander Mateos Signed-off-by: Ming Lei Signed-off-by: Jens Axboe --- drivers/block/ublk_drv.c | 394 +++++++++++++++++++++++++++++++++++++++++- include/uapi/linux/ublk_cmd.h | 7 + 2 files changed, 393 insertions(+), 8 deletions(-) (limited to 'include/uapi/linux') diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c index 1b5721c7a536..0a0210f9d417 100644 --- a/drivers/block/ublk_drv.c +++ b/drivers/block/ublk_drv.c @@ -99,6 +99,7 @@ /* ublk batch fetch uring_cmd */ struct ublk_batch_fetch_cmd { + struct list_head node; struct io_uring_cmd *cmd; unsigned short buf_group; }; @@ -123,7 +124,10 @@ struct ublk_uring_cmd_pdu { */ struct ublk_queue *ubq; - u16 tag; + union { + u16 tag; + struct ublk_batch_fetch_cmd *fcmd; /* batch io only */ + }; }; struct ublk_batch_io_data { @@ -245,10 +249,37 @@ struct ublk_queue { * Make sure just one reader for fetching request from task work * function to ublk server, so no need to grab the lock in reader * side. + * + * Batch I/O State Management: + * + * The batch I/O system uses implicit state management based on the + * combination of three key variables below. + * + * - IDLE: list_empty(&fcmd_head) && !active_fcmd + * No fetch commands available, events queue in evts_fifo + * + * - READY: !list_empty(&fcmd_head) && !active_fcmd + * Fetch commands available but none processing events + * + * - ACTIVE: active_fcmd + * One fetch command actively processing events from evts_fifo + * + * Key Invariants: + * - At most one active_fcmd at any time (single reader) + * - active_fcmd is always from fcmd_head list when non-NULL + * - evts_fifo can be read locklessly by the single active reader + * - All state transitions require evts_lock protection + * - Multiple writers to evts_fifo require lock protection */ struct { DECLARE_KFIFO_PTR(evts_fifo, unsigned short); spinlock_t evts_lock; + + /* List of fetch commands available to process events */ + struct list_head fcmd_head; + + /* Currently active fetch command (NULL = none active) */ + struct ublk_batch_fetch_cmd *active_fcmd; }____cacheline_aligned_in_smp; struct ublk_io ios[] __counted_by(q_depth); @@ -303,12 +334,20 @@ static void ublk_abort_queue(struct ublk_device *ub, struct ublk_queue *ubq); static inline struct request *__ublk_check_and_get_req(struct ublk_device *ub, u16 q_id, u16 tag, struct ublk_io *io); static inline unsigned int ublk_req_build_flags(struct request *req); +static void ublk_batch_dispatch(struct ublk_queue *ubq, + const struct ublk_batch_io_data *data, + struct ublk_batch_fetch_cmd *fcmd); static inline bool ublk_dev_support_batch_io(const struct ublk_device *ub) { return false; } +static inline bool ublk_support_batch_io(const struct ublk_queue *ubq) +{ + return false; +} + static inline void ublk_io_lock(struct ublk_io *io) { spin_lock(&io->lock); @@ -664,13 +703,45 @@ static wait_queue_head_t ublk_idr_wq; /* wait until one idr is freed */ static DEFINE_MUTEX(ublk_ctl_mutex); +static struct ublk_batch_fetch_cmd * +ublk_batch_alloc_fcmd(struct io_uring_cmd *cmd) +{ + struct ublk_batch_fetch_cmd *fcmd = kzalloc(sizeof(*fcmd), GFP_NOIO); + + if (fcmd) { + fcmd->cmd = cmd; + fcmd->buf_group = READ_ONCE(cmd->sqe->buf_index); + } + return fcmd; +} + +static void ublk_batch_free_fcmd(struct ublk_batch_fetch_cmd *fcmd) +{ + kfree(fcmd); +} + +static void __ublk_release_fcmd(struct ublk_queue *ubq) +{ + WRITE_ONCE(ubq->active_fcmd, NULL); +} -static void ublk_batch_deinit_fetch_buf(const struct ublk_batch_io_data *data, +/* + * Nothing can move on, so clear ->active_fcmd, and the caller should stop + * dispatching + */ +static void ublk_batch_deinit_fetch_buf(struct ublk_queue *ubq, + const struct ublk_batch_io_data *data, struct ublk_batch_fetch_cmd *fcmd, int res) { + spin_lock(&ubq->evts_lock); + list_del(&fcmd->node); + WARN_ON_ONCE(fcmd != ubq->active_fcmd); + __ublk_release_fcmd(ubq); + spin_unlock(&ubq->evts_lock); + io_uring_cmd_done(fcmd->cmd, res, data->issue_flags); - fcmd->cmd = NULL; + ublk_batch_free_fcmd(fcmd); } static int ublk_batch_fetch_post_cqe(struct ublk_batch_fetch_cmd *fcmd, @@ -1637,6 +1708,8 @@ static int __ublk_batch_dispatch(struct ublk_queue *ubq, bool needs_filter; int ret; + WARN_ON_ONCE(data->cmd != fcmd->cmd); + sel = io_uring_cmd_buffer_select(fcmd->cmd, fcmd->buf_group, &len, data->issue_flags); if (sel.val < 0) @@ -1700,21 +1773,93 @@ static int __ublk_batch_dispatch(struct ublk_queue *ubq, return ret; } -static __maybe_unused void +static struct ublk_batch_fetch_cmd *__ublk_acquire_fcmd( + struct ublk_queue *ubq) +{ + struct ublk_batch_fetch_cmd *fcmd; + + lockdep_assert_held(&ubq->evts_lock); + + /* + * Ordering updating ubq->evts_fifo and checking ubq->active_fcmd. + * + * The pair is the smp_mb() in ublk_batch_dispatch(). + * + * If ubq->active_fcmd is observed as non-NULL, the new added tags + * can be visisible in ublk_batch_dispatch() with the barrier pairing. + */ + smp_mb(); + if (READ_ONCE(ubq->active_fcmd)) { + fcmd = NULL; + } else { + fcmd = list_first_entry_or_null(&ubq->fcmd_head, + struct ublk_batch_fetch_cmd, node); + WRITE_ONCE(ubq->active_fcmd, fcmd); + } + return fcmd; +} + +static void ublk_batch_tw_cb(struct io_tw_req tw_req, io_tw_token_t tw) +{ + unsigned int issue_flags = IO_URING_CMD_TASK_WORK_ISSUE_FLAGS; + struct io_uring_cmd *cmd = io_uring_cmd_from_tw(tw_req); + struct ublk_uring_cmd_pdu *pdu = ublk_get_uring_cmd_pdu(cmd); + struct ublk_batch_fetch_cmd *fcmd = pdu->fcmd; + struct ublk_batch_io_data data = { + .ub = pdu->ubq->dev, + .cmd = fcmd->cmd, + .issue_flags = issue_flags, + }; + + WARN_ON_ONCE(pdu->ubq->active_fcmd != fcmd); + + ublk_batch_dispatch(pdu->ubq, &data, fcmd); +} + +static void ublk_batch_dispatch(struct ublk_queue *ubq, const struct ublk_batch_io_data *data, struct ublk_batch_fetch_cmd *fcmd) { + struct ublk_batch_fetch_cmd *new_fcmd; + unsigned tried = 0; int ret = 0; +again: while (!ublk_io_evts_empty(ubq)) { ret = __ublk_batch_dispatch(ubq, data, fcmd); if (ret <= 0) break; } - if (ret < 0) - ublk_batch_deinit_fetch_buf(data, fcmd, ret); + if (ret < 0) { + ublk_batch_deinit_fetch_buf(ubq, data, fcmd, ret); + return; + } + + __ublk_release_fcmd(ubq); + /* + * Order clearing ubq->active_fcmd from __ublk_release_fcmd() and + * checking ubq->evts_fifo. + * + * The pair is the smp_mb() in __ublk_acquire_fcmd(). + */ + smp_mb(); + if (likely(ublk_io_evts_empty(ubq))) + return; + + spin_lock(&ubq->evts_lock); + new_fcmd = __ublk_acquire_fcmd(ubq); + spin_unlock(&ubq->evts_lock); + + if (!new_fcmd) + return; + + /* Avoid lockup by allowing to handle at most 32 batches */ + if (new_fcmd == fcmd && tried++ < 32) + goto again; + + io_uring_cmd_complete_in_task(new_fcmd->cmd, ublk_batch_tw_cb); } static void ublk_cmd_tw_cb(struct io_tw_req tw_req, io_tw_token_t tw) @@ -1726,6 +1871,21 @@ static void ublk_cmd_tw_cb(struct io_tw_req tw_req, io_tw_token_t tw) ublk_dispatch_req(ubq, pdu->req); } +static void ublk_batch_queue_cmd(struct ublk_queue *ubq, struct request *rq, bool last) +{ + unsigned short tag = rq->tag; + struct ublk_batch_fetch_cmd *fcmd = NULL; + + spin_lock(&ubq->evts_lock); + kfifo_put(&ubq->evts_fifo, tag); + if (last) + fcmd = __ublk_acquire_fcmd(ubq); + spin_unlock(&ubq->evts_lock); + + if (fcmd) + io_uring_cmd_complete_in_task(fcmd->cmd, ublk_batch_tw_cb); +} + static void ublk_queue_cmd(struct ublk_queue *ubq, struct request *rq) { struct io_uring_cmd *cmd = ubq->ios[rq->tag].cmd; @@ -1836,7 +1996,10 @@ static blk_status_t ublk_queue_rq(struct blk_mq_hw_ctx *hctx, return BLK_STS_OK; } - ublk_queue_cmd(ubq, rq); + if (ublk_support_batch_io(ubq)) + ublk_batch_queue_cmd(ubq, rq, bd->last); + else + ublk_queue_cmd(ubq, rq); return BLK_STS_OK; } @@ -1848,6 +2011,19 @@ static inline bool ublk_belong_to_same_batch(const struct ublk_io *io, (io->task == io2->task); } +static void ublk_commit_rqs(struct blk_mq_hw_ctx *hctx) +{ + struct ublk_queue *ubq = hctx->driver_data; + struct ublk_batch_fetch_cmd *fcmd; + + spin_lock(&ubq->evts_lock); + fcmd = __ublk_acquire_fcmd(ubq); + spin_unlock(&ubq->evts_lock); + + if (fcmd) + io_uring_cmd_complete_in_task(fcmd->cmd, ublk_batch_tw_cb); +} + static void ublk_queue_rqs(struct rq_list *rqlist) { struct rq_list requeue_list = { }; @@ -1876,6 +2052,57 @@ static void ublk_queue_rqs(struct rq_list *rqlist) *rqlist = requeue_list; } +static void ublk_batch_queue_cmd_list(struct ublk_queue *ubq, struct rq_list *l) +{ + unsigned short tags[MAX_NR_TAG]; + struct ublk_batch_fetch_cmd *fcmd; + struct request *rq; + unsigned cnt = 0; + + spin_lock(&ubq->evts_lock); + rq_list_for_each(l, rq) { + tags[cnt++] = (unsigned short)rq->tag; + if (cnt >= MAX_NR_TAG) { + kfifo_in(&ubq->evts_fifo, tags, cnt); + cnt = 0; + } + } + if (cnt) + kfifo_in(&ubq->evts_fifo, tags, cnt); + fcmd = __ublk_acquire_fcmd(ubq); + spin_unlock(&ubq->evts_lock); + + rq_list_init(l); + if (fcmd) + io_uring_cmd_complete_in_task(fcmd->cmd, ublk_batch_tw_cb); +} + +static void ublk_batch_queue_rqs(struct rq_list *rqlist) +{ + struct rq_list requeue_list = { }; + struct rq_list submit_list = { }; + struct ublk_queue *ubq = NULL; + struct request *req; + + while ((req = rq_list_pop(rqlist))) { + struct ublk_queue *this_q = req->mq_hctx->driver_data; + + if (ublk_prep_req(this_q, req, true) != BLK_STS_OK) { + rq_list_add_tail(&requeue_list, req); + continue; + } + + if (ubq && this_q != ubq && !rq_list_empty(&submit_list)) + ublk_batch_queue_cmd_list(ubq, &submit_list); + ubq = this_q; + rq_list_add_tail(&submit_list, req); + } + + if (!rq_list_empty(&submit_list)) + ublk_batch_queue_cmd_list(ubq, &submit_list); + *rqlist = requeue_list; +} + static int ublk_init_hctx(struct blk_mq_hw_ctx *hctx, void *driver_data, unsigned int hctx_idx) { @@ -1893,6 +2120,14 @@ static const struct blk_mq_ops ublk_mq_ops = { .timeout = ublk_timeout, }; +static const struct blk_mq_ops ublk_batch_mq_ops = { + .commit_rqs = ublk_commit_rqs, + .queue_rq = ublk_queue_rq, + .queue_rqs = ublk_batch_queue_rqs, + .init_hctx = ublk_init_hctx, + .timeout = ublk_timeout, +}; + static void ublk_queue_reinit(struct ublk_device *ub, struct ublk_queue *ubq) { int i; @@ -2290,6 +2525,56 @@ static void ublk_cancel_cmd(struct ublk_queue *ubq, unsigned tag, io_uring_cmd_done(io->cmd, UBLK_IO_RES_ABORT, issue_flags); } +static void ublk_batch_cancel_cmd(struct ublk_queue *ubq, + struct ublk_batch_fetch_cmd *fcmd, + unsigned int issue_flags) +{ + bool done; + + spin_lock(&ubq->evts_lock); + done = (READ_ONCE(ubq->active_fcmd) != fcmd); + if (done) + list_del(&fcmd->node); + spin_unlock(&ubq->evts_lock); + + if (done) { + io_uring_cmd_done(fcmd->cmd, UBLK_IO_RES_ABORT, issue_flags); + ublk_batch_free_fcmd(fcmd); + } +} + +static void ublk_batch_cancel_queue(struct ublk_queue *ubq) +{ + struct ublk_batch_fetch_cmd *fcmd; + LIST_HEAD(fcmd_list); + + spin_lock(&ubq->evts_lock); + ubq->force_abort = true; + list_splice_init(&ubq->fcmd_head, &fcmd_list); + fcmd = READ_ONCE(ubq->active_fcmd); + if (fcmd) + list_move(&fcmd->node, &ubq->fcmd_head); + spin_unlock(&ubq->evts_lock); + + while (!list_empty(&fcmd_list)) { + fcmd = list_first_entry(&fcmd_list, + struct ublk_batch_fetch_cmd, node); + ublk_batch_cancel_cmd(ubq, fcmd, IO_URING_F_UNLOCKED); + } +} + +static void ublk_batch_cancel_fn(struct io_uring_cmd *cmd, + unsigned int issue_flags) +{ + struct ublk_uring_cmd_pdu *pdu = ublk_get_uring_cmd_pdu(cmd); + struct ublk_batch_fetch_cmd *fcmd = pdu->fcmd; + struct ublk_queue *ubq = pdu->ubq; + + ublk_start_cancel(ubq->dev); + + ublk_batch_cancel_cmd(ubq, fcmd, issue_flags); +} + /* * The ublk char device won't be closed when calling cancel fn, so both * ublk device and queue are guaranteed to be live @@ -2341,6 +2626,11 @@ static void ublk_cancel_queue(struct ublk_queue *ubq) { int i; + if (ublk_support_batch_io(ubq)) { + ublk_batch_cancel_queue(ubq); + return; + } + for (i = 0; i < ubq->q_depth; i++) ublk_cancel_cmd(ubq, i, IO_URING_F_UNLOCKED); } @@ -3246,6 +3536,79 @@ static int ublk_check_batch_cmd(const struct ublk_batch_io_data *data) return ublk_check_batch_cmd_flags(uc); } +static int ublk_batch_attach(struct ublk_queue *ubq, + struct ublk_batch_io_data *data, + struct ublk_batch_fetch_cmd *fcmd) +{ + struct ublk_batch_fetch_cmd *new_fcmd = NULL; + bool free = false; + struct ublk_uring_cmd_pdu *pdu = ublk_get_uring_cmd_pdu(data->cmd); + + spin_lock(&ubq->evts_lock); + if (unlikely(ubq->force_abort || ubq->canceling)) { + free = true; + } else { + list_add_tail(&fcmd->node, &ubq->fcmd_head); + new_fcmd = __ublk_acquire_fcmd(ubq); + } + spin_unlock(&ubq->evts_lock); + + if (unlikely(free)) { + ublk_batch_free_fcmd(fcmd); + return -ENODEV; + } + + pdu->ubq = ubq; + pdu->fcmd = fcmd; + io_uring_cmd_mark_cancelable(fcmd->cmd, data->issue_flags); + + if (!new_fcmd) + goto out; + + /* + * If the two fetch commands are originated from same io_ring_ctx, + * run batch dispatch directly. Otherwise, schedule task work for + * doing it. + */ + if (io_uring_cmd_ctx_handle(new_fcmd->cmd) == + io_uring_cmd_ctx_handle(fcmd->cmd)) { + data->cmd = new_fcmd->cmd; + ublk_batch_dispatch(ubq, data, new_fcmd); + } else { + io_uring_cmd_complete_in_task(new_fcmd->cmd, + ublk_batch_tw_cb); + } +out: + return -EIOCBQUEUED; +} + +static int ublk_handle_batch_fetch_cmd(struct ublk_batch_io_data *data) +{ + struct ublk_queue *ubq = ublk_get_queue(data->ub, data->header.q_id); + struct ublk_batch_fetch_cmd *fcmd = ublk_batch_alloc_fcmd(data->cmd); + + if (!fcmd) + return -ENOMEM; + + return ublk_batch_attach(ubq, data, fcmd); +} + +static int ublk_validate_batch_fetch_cmd(struct ublk_batch_io_data *data) +{ + const struct ublk_batch_io *uc = &data->header; + + if (!(data->cmd->flags & IORING_URING_CMD_MULTISHOT)) + return -EINVAL; + + if (uc->elem_bytes != sizeof(__u16)) + return -EINVAL; + + if (uc->flags != 0) + return -EINVAL; + + return 0; +} + static int ublk_ch_batch_io_uring_cmd(struct io_uring_cmd *cmd, unsigned int issue_flags) { @@ -3265,6 +3628,11 @@ static int ublk_ch_batch_io_uring_cmd(struct io_uring_cmd *cmd, u32 cmd_op = cmd->cmd_op; int ret = -EINVAL; + if (unlikely(issue_flags & IO_URING_F_CANCEL)) { + ublk_batch_cancel_fn(cmd, issue_flags); + return 0; + } + if (data.header.q_id >= ub->dev_info.nr_hw_queues) goto out; @@ -3281,6 +3649,12 @@ static int ublk_ch_batch_io_uring_cmd(struct io_uring_cmd *cmd, goto out; ret = ublk_handle_batch_commit_cmd(&data); break; + case UBLK_U_IO_FETCH_IO_CMDS: + ret = ublk_validate_batch_fetch_cmd(&data); + if (ret) + goto out; + ret = ublk_handle_batch_fetch_cmd(&data); + break; default: ret = -EOPNOTSUPP; } @@ -3503,6 +3877,7 @@ static int ublk_init_queue(struct ublk_device *ub, int q_id) ret = ublk_io_evts_init(ubq, ubq->q_depth, numa_node); if (ret) goto fail; + INIT_LIST_HEAD(&ubq->fcmd_head); } ub->queues[q_id] = ubq; ubq->dev = ub; @@ -3625,7 +4000,10 @@ static void ublk_align_max_io_size(struct ublk_device *ub) static int ublk_add_tag_set(struct ublk_device *ub) { - ub->tag_set.ops = &ublk_mq_ops; + if (ublk_dev_support_batch_io(ub)) + ub->tag_set.ops = &ublk_batch_mq_ops; + else + ub->tag_set.ops = &ublk_mq_ops; ub->tag_set.nr_hw_queues = ub->dev_info.nr_hw_queues; ub->tag_set.queue_depth = ub->dev_info.queue_depth; ub->tag_set.numa_node = NUMA_NO_NODE; diff --git a/include/uapi/linux/ublk_cmd.h b/include/uapi/linux/ublk_cmd.h index 3894d676dd02..70d8ebbf4326 100644 --- a/include/uapi/linux/ublk_cmd.h +++ b/include/uapi/linux/ublk_cmd.h @@ -121,6 +121,13 @@ #define UBLK_U_IO_COMMIT_IO_CMDS \ _IOWR('u', 0x26, struct ublk_batch_io) +/* + * Fetch io commands to provided buffer in multishot style, + * `IORING_URING_CMD_MULTISHOT` is required for this command. + */ +#define UBLK_U_IO_FETCH_IO_CMDS \ + _IOWR('u', 0x27, struct ublk_batch_io) + /* only ABORT means that no re-fetch */ #define UBLK_IO_RES_OK 0 #define UBLK_IO_RES_NEED_GET_DATA 1 -- cgit v1.2.3 From e2723e6ce6025026b6d79d9a00048386a69e00c3 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Fri, 16 Jan 2026 22:18:44 +0800 Subject: ublk: add new feature UBLK_F_BATCH_IO Add new feature UBLK_F_BATCH_IO which replaces the following two per-io commands: - UBLK_U_IO_FETCH_REQ - UBLK_U_IO_COMMIT_AND_FETCH_REQ with three per-queue batch io uring_cmd: - UBLK_U_IO_PREP_IO_CMDS - UBLK_U_IO_COMMIT_IO_CMDS - UBLK_U_IO_FETCH_IO_CMDS Then ublk can deliver batch io commands to ublk server in single multishort uring_cmd, also allows to prepare & commit multiple commands in batch style via single uring_cmd, communication cost is reduced a lot. This feature also doesn't limit task context any more for all supported commands, so any allowed uring_cmd can be issued in any task context. ublk server implementation becomes much easier. Meantime load balance becomes much easier to support with this feature. The command `UBLK_U_IO_FETCH_IO_CMDS` can be issued from multiple task contexts, so each task can adjust this command's buffer length or number of inflight commands for controlling how much load is handled by current task. Later, priority parameter will be added to command `UBLK_U_IO_FETCH_IO_CMDS` for improving load balance support. UBLK_U_IO_NEED_GET_DATA isn't supported in batch io yet, but it may be enabled in future via its batch pair. Reviewed-by: Caleb Sander Mateos Signed-off-by: Ming Lei Signed-off-by: Jens Axboe --- drivers/block/ublk_drv.c | 60 +++++++++++++++++++++++++++++++++++++------ include/uapi/linux/ublk_cmd.h | 15 +++++++++++ 2 files changed, 67 insertions(+), 8 deletions(-) (limited to 'include/uapi/linux') diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c index 564cf44c238f..bec34b5ab5ab 100644 --- a/drivers/block/ublk_drv.c +++ b/drivers/block/ublk_drv.c @@ -79,7 +79,8 @@ | UBLK_F_PER_IO_DAEMON \ | UBLK_F_BUF_REG_OFF_DAEMON \ | (IS_ENABLED(CONFIG_BLK_DEV_INTEGRITY) ? UBLK_F_INTEGRITY : 0) \ - | UBLK_F_SAFE_STOP_DEV) + | UBLK_F_SAFE_STOP_DEV \ + | UBLK_F_BATCH_IO) #define UBLK_F_ALL_RECOVERY_FLAGS (UBLK_F_USER_RECOVERY \ | UBLK_F_USER_RECOVERY_REISSUE \ @@ -340,12 +341,12 @@ static void ublk_batch_dispatch(struct ublk_queue *ubq, static inline bool ublk_dev_support_batch_io(const struct ublk_device *ub) { - return false; + return ub->dev_info.flags & UBLK_F_BATCH_IO; } static inline bool ublk_support_batch_io(const struct ublk_queue *ubq) { - return false; + return ubq->flags & UBLK_F_BATCH_IO; } static inline void ublk_io_lock(struct ublk_io *io) @@ -3573,9 +3574,11 @@ static int ublk_check_batch_cmd_flags(const struct ublk_batch_io *uc) static int ublk_check_batch_cmd(const struct ublk_batch_io_data *data) { - const struct ublk_batch_io *uc = &data->header; + if (uc->q_id >= data->ub->dev_info.nr_hw_queues) + return -EINVAL; + if (uc->nr_elem > data->ub->dev_info.queue_depth) return -E2BIG; @@ -3655,6 +3658,9 @@ static int ublk_validate_batch_fetch_cmd(struct ublk_batch_io_data *data) { const struct ublk_batch_io *uc = &data->header; + if (uc->q_id >= data->ub->dev_info.nr_hw_queues) + return -EINVAL; + if (!(data->cmd->flags & IORING_URING_CMD_MULTISHOT)) return -EINVAL; @@ -3667,6 +3673,35 @@ static int ublk_validate_batch_fetch_cmd(struct ublk_batch_io_data *data) return 0; } +static int ublk_handle_non_batch_cmd(struct io_uring_cmd *cmd, + unsigned int issue_flags) +{ + const struct ublksrv_io_cmd *ub_cmd = io_uring_sqe_cmd(cmd->sqe); + struct ublk_device *ub = cmd->file->private_data; + unsigned tag = READ_ONCE(ub_cmd->tag); + unsigned q_id = READ_ONCE(ub_cmd->q_id); + unsigned index = READ_ONCE(ub_cmd->addr); + struct ublk_queue *ubq; + struct ublk_io *io; + + if (cmd->cmd_op == UBLK_U_IO_UNREGISTER_IO_BUF) + return ublk_unregister_io_buf(cmd, ub, index, issue_flags); + + if (q_id >= ub->dev_info.nr_hw_queues) + return -EINVAL; + + if (tag >= ub->dev_info.queue_depth) + return -EINVAL; + + if (cmd->cmd_op != UBLK_U_IO_REGISTER_IO_BUF) + return -EOPNOTSUPP; + + ubq = ublk_get_queue(ub, q_id); + io = &ubq->ios[tag]; + return ublk_register_io_buf(cmd, ub, q_id, tag, io, index, + issue_flags); +} + static int ublk_ch_batch_io_uring_cmd(struct io_uring_cmd *cmd, unsigned int issue_flags) { @@ -3691,9 +3726,6 @@ static int ublk_ch_batch_io_uring_cmd(struct io_uring_cmd *cmd, return 0; } - if (data.header.q_id >= ub->dev_info.nr_hw_queues) - goto out; - switch (cmd_op) { case UBLK_U_IO_PREP_IO_CMDS: ret = ublk_check_batch_cmd(&data); @@ -3714,7 +3746,8 @@ static int ublk_ch_batch_io_uring_cmd(struct io_uring_cmd *cmd, ret = ublk_handle_batch_fetch_cmd(&data); break; default: - ret = -EOPNOTSUPP; + ret = ublk_handle_non_batch_cmd(cmd, issue_flags); + break; } out: return ret; @@ -4437,6 +4470,10 @@ static int ublk_ctrl_add_dev(const struct ublksrv_ctrl_cmd *header) UBLK_F_BUF_REG_OFF_DAEMON | UBLK_F_SAFE_STOP_DEV; + /* So far, UBLK_F_PER_IO_DAEMON won't be exposed for BATCH_IO */ + if (ublk_dev_support_batch_io(ub)) + ub->dev_info.flags &= ~UBLK_F_PER_IO_DAEMON; + /* GET_DATA isn't needed any more with USER_COPY or ZERO COPY */ if (ub->dev_info.flags & (UBLK_F_USER_COPY | UBLK_F_SUPPORT_ZERO_COPY | UBLK_F_AUTO_BUF_REG)) @@ -4820,6 +4857,13 @@ static int ublk_wait_for_idle_io(struct ublk_device *ub, unsigned int elapsed = 0; int ret; + /* + * For UBLK_F_BATCH_IO ublk server can get notified with existing + * or new fetch command, so needn't wait any more + */ + if (ublk_dev_support_batch_io(ub)) + return 0; + while (elapsed < timeout_ms && !signal_pending(current)) { unsigned int queues_cancelable = 0; int i; diff --git a/include/uapi/linux/ublk_cmd.h b/include/uapi/linux/ublk_cmd.h index 70d8ebbf4326..743d31491387 100644 --- a/include/uapi/linux/ublk_cmd.h +++ b/include/uapi/linux/ublk_cmd.h @@ -340,6 +340,21 @@ */ #define UBLK_F_BUF_REG_OFF_DAEMON (1ULL << 14) +/* + * Support the following commands for delivering & committing io command + * in batch. + * + * - UBLK_U_IO_PREP_IO_CMDS + * - UBLK_U_IO_COMMIT_IO_CMDS + * - UBLK_U_IO_FETCH_IO_CMDS + * - UBLK_U_IO_REGISTER_IO_BUF + * - UBLK_U_IO_UNREGISTER_IO_BUF + * + * The existing UBLK_U_IO_FETCH_REQ, UBLK_U_IO_COMMIT_AND_FETCH_REQ and + * UBLK_U_IO_NEED_GET_DATA uring_cmd are not supported for this feature. + */ +#define UBLK_F_BATCH_IO (1ULL << 15) + /* * ublk device supports requests with integrity/metadata buffer. * Requires UBLK_F_USER_COPY. -- cgit v1.2.3 From fa9893fadbc245e179cb17f3c371c67471b5a8a8 Mon Sep 17 00:00:00 2001 From: Michael Roth Date: Fri, 9 Jan 2026 17:17:32 -0600 Subject: KVM: Introduce KVM_EXIT_SNP_REQ_CERTS for SNP certificate-fetching For SEV-SNP, the host can optionally provide a certificate table to the guest when it issues an attestation request to firmware (see GHCB 2.0 specification regarding "SNP Extended Guest Requests"). This certificate table can then be used to verify the endorsement key used by firmware to sign the attestation report. While it is possible for guests to obtain the certificates through other means, handling it via the host provides more flexibility in being able to keep the certificate data in sync with the endorsement key throughout host-side operations that might resulting in the endorsement key changing. In the case of KVM, userspace will be responsible for fetching the certificate table and keeping it in sync with any modifications to the endorsement key by other userspace management tools. Define a new KVM_EXIT_SNP_REQ_CERTS event where userspace is provided with the GPA of the buffer the guest has provided as part of the attestation request so that userspace can write the certificate data into it while relying on filesystem-based locking to keep the certificates up-to-date relative to the endorsement keys installed/utilized by firmware at the time the certificates are fetched. [Melody: Update the documentation scheme about how file locking is expected to happen.] Reviewed-by: Liam Merwick Tested-by: Liam Merwick Tested-by: Dionna Glaze Signed-off-by: Michael Roth Signed-off-by: Melody Wang Signed-off-by: Michael Roth Link: https://patch.msgid.link/20260109231732.1160759-2-michael.roth@amd.com Signed-off-by: Sean Christopherson --- Documentation/virt/kvm/api.rst | 44 ++++++++++++++++++++++++++++++ arch/x86/kvm/svm/sev.c | 62 ++++++++++++++++++++++++++++++++++++++---- arch/x86/kvm/svm/svm.h | 1 + include/uapi/linux/kvm.h | 9 ++++++ 4 files changed, 110 insertions(+), 6 deletions(-) (limited to 'include/uapi/linux') diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst index 01a3abef8abb..428d7d9cb4d6 100644 --- a/Documentation/virt/kvm/api.rst +++ b/Documentation/virt/kvm/api.rst @@ -7353,6 +7353,50 @@ Please note that the kernel is allowed to use the kvm_run structure as the primary storage for certain register types. Therefore, the kernel may use the values in kvm_run even if the corresponding bit in kvm_dirty_regs is not set. +:: + + /* KVM_EXIT_SNP_REQ_CERTS */ + struct kvm_exit_snp_req_certs { + __u64 gpa; + __u64 npages; + __u64 ret; + }; + +KVM_EXIT_SNP_REQ_CERTS indicates an SEV-SNP guest with certificate-fetching +enabled (see KVM_SEV_SNP_ENABLE_REQ_CERTS) has generated an Extended Guest +Request NAE #VMGEXIT (SNP_GUEST_REQUEST) with message type MSG_REPORT_REQ, +i.e. has requested an attestation report from firmware, and would like the +certificate data corresponding to the attestation report signature to be +provided by the hypervisor as part of the request. + +To allow for userspace to provide the certificate, the 'gpa' and 'npages' +are forwarded verbatim from the guest request (the RAX and RBX GHCB fields +respectively). 'ret' is not an "output" from KVM, and is always '0' on +exit. KVM verifies the 'gpa' is 4KiB aligned prior to exiting to userspace, +but otherwise the information from the guest isn't validated. + +Upon the next KVM_RUN, e.g. after userspace has serviced the request (or not), +KVM will complete the #VMGEXIT, using the 'ret' field to determine whether to +signal success or failure to the guest, and on failure, what reason code will +be communicated via SW_EXITINFO2. If 'ret' is set to an unsupported value (see +the table below), KVM_RUN will fail with -EINVAL. For a 'ret' of 'ENOSPC', KVM +also consumes the 'npages' field, i.e. userspace can use the field to inform +the guest of the number of pages needed to hold all the certificate data. + +The supported 'ret' values and their respective SW_EXITINFO2 encodings: + + ====== ============================================================= + 0 0x0, i.e. success. KVM will emit an SNP_GUEST_REQUEST command + to SNP firmware. + ENOSPC 0x0000000100000000, i.e. not enough guest pages to hold the + certificate table and certificate data. KVM will also set the + RBX field in the GHBC to 'npages'. + EAGAIN 0x0000000200000000, i.e. the host is busy and the guest should + retry the request. + EIO 0xffffffff00000000, for all other errors (this return code is + a KVM-defined hypervisor value, as allowed by the GHCB) + ====== ============================================================= + .. _cap_enable: diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c index f67525007089..9e6a78e448f2 100644 --- a/arch/x86/kvm/svm/sev.c +++ b/arch/x86/kvm/svm/sev.c @@ -41,6 +41,16 @@ #define GHCB_HV_FT_SUPPORTED (GHCB_HV_FT_SNP | GHCB_HV_FT_SNP_AP_CREATION) +/* + * The GHCB spec essentially states that all non-zero error codes other than + * those explicitly defined above should be treated as an error by the guest. + * Define a generic error to cover that case, and choose a value that is not + * likely to overlap with new explicit error codes should more be added to + * the GHCB spec later. KVM will use this to report generic errors when + * handling SNP guest requests. + */ +#define SNP_GUEST_VMM_ERR_GENERIC (~0U) + /* enable/disable SEV support */ static bool sev_enabled = true; module_param_named(sev, sev_enabled, bool, 0444); @@ -4139,6 +4149,36 @@ out_unlock: return ret; } +static int snp_req_certs_err(struct vcpu_svm *svm, u32 vmm_error) +{ + ghcb_set_sw_exit_info_2(svm->sev_es.ghcb, SNP_GUEST_ERR(vmm_error, 0)); + + return 1; /* resume guest */ +} + +static int snp_complete_req_certs(struct kvm_vcpu *vcpu) +{ + struct vcpu_svm *svm = to_svm(vcpu); + struct vmcb_control_area *control = &svm->vmcb->control; + + switch (READ_ONCE(vcpu->run->snp_req_certs.ret)) { + case 0: + return snp_handle_guest_req(svm, control->exit_info_1, + control->exit_info_2); + case ENOSPC: + vcpu->arch.regs[VCPU_REGS_RBX] = vcpu->run->snp_req_certs.npages; + return snp_req_certs_err(svm, SNP_GUEST_VMM_ERR_INVALID_LEN); + case EAGAIN: + return snp_req_certs_err(svm, SNP_GUEST_VMM_ERR_BUSY); + case EIO: + return snp_req_certs_err(svm, SNP_GUEST_VMM_ERR_GENERIC); + default: + break; + } + + return -EINVAL; +} + static int snp_handle_ext_guest_req(struct vcpu_svm *svm, gpa_t req_gpa, gpa_t resp_gpa) { struct kvm *kvm = svm->vcpu.kvm; @@ -4154,14 +4194,15 @@ static int snp_handle_ext_guest_req(struct vcpu_svm *svm, gpa_t req_gpa, gpa_t r /* * As per GHCB spec, requests of type MSG_REPORT_REQ also allow for * additional certificate data to be provided alongside the attestation - * report via the guest-provided data pages indicated by RAX/RBX. The - * certificate data is optional and requires additional KVM enablement - * to provide an interface for userspace to provide it, but KVM still - * needs to be able to handle extended guest requests either way. So - * provide a stub implementation that will always return an empty - * certificate table in the guest-provided data pages. + * report via the guest-provided data pages indicated by RAX/RBX. If + * userspace enables KVM_EXIT_SNP_REQ_CERTS, then exit to userspace + * to give userspace an opportunity to provide the certificate data + * before issuing/completing the attestation request. Otherwise, return + * an empty certificate table in the guest-provided data pages and + * handle the attestation request immediately. */ if (msg_type == SNP_MSG_REPORT_REQ) { + struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info; struct kvm_vcpu *vcpu = &svm->vcpu; u64 data_npages; gpa_t data_gpa; @@ -4175,6 +4216,15 @@ static int snp_handle_ext_guest_req(struct vcpu_svm *svm, gpa_t req_gpa, gpa_t r if (!PAGE_ALIGNED(data_gpa)) goto request_invalid; + if (sev->snp_certs_enabled) { + vcpu->run->exit_reason = KVM_EXIT_SNP_REQ_CERTS; + vcpu->run->snp_req_certs.gpa = data_gpa; + vcpu->run->snp_req_certs.npages = data_npages; + vcpu->run->snp_req_certs.ret = 0; + vcpu->arch.complete_userspace_io = snp_complete_req_certs; + return 0; + } + /* * As per GHCB spec (see "SNP Extended Guest Request"), the * certificate table is terminated by 24-bytes of zeroes. diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h index 338fc4f5cc4c..ebd7b36b1ceb 100644 --- a/arch/x86/kvm/svm/svm.h +++ b/arch/x86/kvm/svm/svm.h @@ -115,6 +115,7 @@ struct kvm_sev_info { void *guest_resp_buf; /* Bounce buffer for SNP Guest Request output */ struct mutex guest_req_mutex; /* Must acquire before using bounce buffers */ cpumask_var_t have_run_cpus; /* CPUs that have done VMRUN for this VM. */ + bool snp_certs_enabled; /* SNP certificate-fetching support. */ }; struct kvm_svm { diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h index dddb781b0507..8cd107cdcf0b 100644 --- a/include/uapi/linux/kvm.h +++ b/include/uapi/linux/kvm.h @@ -135,6 +135,12 @@ struct kvm_xen_exit { } u; }; +struct kvm_exit_snp_req_certs { + __u64 gpa; + __u64 npages; + __u64 ret; +}; + #define KVM_S390_GET_SKEYS_NONE 1 #define KVM_S390_SKEYS_MAX 1048576 @@ -180,6 +186,7 @@ struct kvm_xen_exit { #define KVM_EXIT_MEMORY_FAULT 39 #define KVM_EXIT_TDX 40 #define KVM_EXIT_ARM_SEA 41 +#define KVM_EXIT_SNP_REQ_CERTS 42 /* For KVM_EXIT_INTERNAL_ERROR */ /* Emulate instruction failed. */ @@ -482,6 +489,8 @@ struct kvm_run { __u64 gva; __u64 gpa; } arm_sea; + /* KVM_EXIT_SNP_REQ_CERTS */ + struct kvm_exit_snp_req_certs snp_req_certs; /* Fix the size of the union. */ char padding[256]; }; -- cgit v1.2.3 From ba1b8c97b9a0414432382a11f144a8597f6f597e Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Wed, 21 Jan 2026 17:11:30 +0100 Subject: geneve: add netlink support for GRO hint Allow configuring and dumping the new device option, and cache its value into the geneve socket itself. The new option is not tie to it any code yet. Signed-off-by: Paolo Abeni Link: https://patch.msgid.link/2295d4e4d1e919a3189425141bbc71c7850a2de0.1769011015.git.pabeni@redhat.com Signed-off-by: Jakub Kicinski --- Documentation/netlink/specs/rt-link.yaml | 3 +++ drivers/net/geneve.c | 29 +++++++++++++++++++++++++---- include/uapi/linux/if_link.h | 1 + 3 files changed, 29 insertions(+), 4 deletions(-) (limited to 'include/uapi/linux') diff --git a/Documentation/netlink/specs/rt-link.yaml b/Documentation/netlink/specs/rt-link.yaml index 6beeb6ee5adf..df4b56beb818 100644 --- a/Documentation/netlink/specs/rt-link.yaml +++ b/Documentation/netlink/specs/rt-link.yaml @@ -1914,6 +1914,9 @@ attribute-sets: name: port-range type: binary struct: ifla-geneve-port-range + - + name: gro-hint + type: flag - name: linkinfo-hsr-attrs name-prefix: ifla-hsr- diff --git a/drivers/net/geneve.c b/drivers/net/geneve.c index 64ea4b970376..8719ad66837e 100644 --- a/drivers/net/geneve.c +++ b/drivers/net/geneve.c @@ -56,6 +56,7 @@ struct geneve_config { bool collect_md; bool use_udp6_rx_checksums; bool ttl_inherit; + bool gro_hint; enum ifla_geneve_df df; bool inner_proto_inherit; u16 port_min; @@ -84,6 +85,7 @@ struct geneve_dev { struct geneve_sock { bool collect_md; + bool gro_hint; struct list_head list; struct socket *sock; struct rcu_head rcu; @@ -659,13 +661,15 @@ static void geneve_sock_release(struct geneve_dev *geneve) static struct geneve_sock *geneve_find_sock(struct geneve_net *gn, sa_family_t family, - __be16 dst_port) + __be16 dst_port, + bool gro_hint) { struct geneve_sock *gs; list_for_each_entry(gs, &gn->sock_list, list) { if (inet_sk(gs->sock->sk)->inet_sport == dst_port && - geneve_get_sk_family(gs) == family) { + geneve_get_sk_family(gs) == family && + gs->gro_hint == gro_hint) { return gs; } } @@ -676,12 +680,14 @@ static int geneve_sock_add(struct geneve_dev *geneve, bool ipv6) { struct net *net = geneve->net; struct geneve_net *gn = net_generic(net, geneve_net_id); + bool gro_hint = geneve->cfg.gro_hint; struct geneve_dev_node *node; struct geneve_sock *gs; __u8 vni[3]; __u32 hash; - gs = geneve_find_sock(gn, ipv6 ? AF_INET6 : AF_INET, geneve->cfg.info.key.tp_dst); + gs = geneve_find_sock(gn, ipv6 ? AF_INET6 : AF_INET, + geneve->cfg.info.key.tp_dst, gro_hint); if (gs) { gs->refcnt++; goto out; @@ -694,6 +700,7 @@ static int geneve_sock_add(struct geneve_dev *geneve, bool ipv6) out: gs->collect_md = geneve->cfg.collect_md; + gs->gro_hint = gro_hint; #if IS_ENABLED(CONFIG_IPV6) if (ipv6) { rcu_assign_pointer(geneve->sock6, gs); @@ -1257,6 +1264,7 @@ static const struct nla_policy geneve_policy[IFLA_GENEVE_MAX + 1] = { [IFLA_GENEVE_DF] = { .type = NLA_U8 }, [IFLA_GENEVE_INNER_PROTO_INHERIT] = { .type = NLA_FLAG }, [IFLA_GENEVE_PORT_RANGE] = NLA_POLICY_EXACT_LEN(sizeof(struct ifla_geneve_port_range)), + [IFLA_GENEVE_GRO_HINT] = { .type = NLA_FLAG }, }; static int geneve_validate(struct nlattr *tb[], struct nlattr *data[], @@ -1607,10 +1615,18 @@ static int geneve_nl2info(struct nlattr *tb[], struct nlattr *data[], cfg->inner_proto_inherit = true; } + if (data[IFLA_GENEVE_GRO_HINT]) { + if (changelink) { + attrtype = IFLA_GENEVE_GRO_HINT; + goto change_notsup; + } + cfg->gro_hint = true; + } + return 0; change_notsup: NL_SET_ERR_MSG_ATTR(extack, data[attrtype], - "Changing VNI, Port, endpoint IP address family, external, inner_proto_inherit, and UDP checksum attributes are not supported"); + "Changing VNI, Port, endpoint IP address family, external, inner_proto_inherit, gro_hint and UDP checksum attributes are not supported"); return -EOPNOTSUPP; } @@ -1793,6 +1809,7 @@ static size_t geneve_get_size(const struct net_device *dev) nla_total_size(sizeof(__u8)) + /* IFLA_GENEVE_TTL_INHERIT */ nla_total_size(0) + /* IFLA_GENEVE_INNER_PROTO_INHERIT */ nla_total_size(sizeof(struct ifla_geneve_port_range)) + /* IFLA_GENEVE_PORT_RANGE */ + nla_total_size(0) + /* IFLA_GENEVE_GRO_HINT */ 0; } @@ -1865,6 +1882,10 @@ static int geneve_fill_info(struct sk_buff *skb, const struct net_device *dev) if (nla_put(skb, IFLA_GENEVE_PORT_RANGE, sizeof(ports), &ports)) goto nla_put_failure; + if (geneve->cfg.gro_hint && + nla_put_flag(skb, IFLA_GENEVE_GRO_HINT)) + goto nla_put_failure; + return 0; nla_put_failure: diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h index 3b491d96e52e..e9b5f79e1ee1 100644 --- a/include/uapi/linux/if_link.h +++ b/include/uapi/linux/if_link.h @@ -1443,6 +1443,7 @@ enum { IFLA_GENEVE_DF, IFLA_GENEVE_INNER_PROTO_INHERIT, IFLA_GENEVE_PORT_RANGE, + IFLA_GENEVE_GRO_HINT, __IFLA_GENEVE_MAX }; #define IFLA_GENEVE_MAX (__IFLA_GENEVE_MAX - 1) -- cgit v1.2.3 From 795663b4d160ba652959f1a46381c5e8b1342a53 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Sat, 24 Jan 2026 10:36:17 +0000 Subject: io_uring/zcrx: implement large rx buffer support There are network cards that support receive buffers larger than 4K, and that can be vastly beneficial for performance, and benchmarks for this patch showed up to 30% CPU util improvement for 32K vs 4K buffers. Allows zcrx users to specify the size in struct io_uring_zcrx_ifq_reg::rx_buf_len. If set to zero, zcrx will use a default value. zcrx will check and fail if the memory backing the area can't be split into physically contiguous chunks of the required size. It's more restrictive as it only needs dma addresses to be contig, but that's beyond this series. Signed-off-by: Pavel Begunkov [axboe: kill duplicate netdev_queues.h include] Signed-off-by: Jens Axboe --- include/uapi/linux/io_uring.h | 2 +- io_uring/zcrx.c | 38 +++++++++++++++++++++++++++++++++----- 2 files changed, 34 insertions(+), 6 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h index b5b23c0d5283..3184f7e7f1f2 100644 --- a/include/uapi/linux/io_uring.h +++ b/include/uapi/linux/io_uring.h @@ -1082,7 +1082,7 @@ struct io_uring_zcrx_ifq_reg { struct io_uring_zcrx_offsets offsets; __u32 zcrx_id; - __u32 __resv2; + __u32 rx_buf_len; __u64 __resv[3]; }; diff --git a/io_uring/zcrx.c b/io_uring/zcrx.c index b99cf2c6670a..8a9df72bc094 100644 --- a/io_uring/zcrx.c +++ b/io_uring/zcrx.c @@ -55,6 +55,18 @@ static inline struct page *io_zcrx_iov_page(const struct net_iov *niov) return area->mem.pages[net_iov_idx(niov) << niov_pages_shift]; } +static int io_area_max_shift(struct io_zcrx_mem *mem) +{ + struct sg_table *sgt = mem->sgt; + struct scatterlist *sg; + unsigned shift = -1U; + unsigned i; + + for_each_sgtable_dma_sg(sgt, sg, i) + shift = min(shift, __ffs(sg->length)); + return shift; +} + static int io_populate_area_dma(struct io_zcrx_ifq *ifq, struct io_zcrx_area *area) { @@ -416,12 +428,21 @@ static int io_zcrx_append_area(struct io_zcrx_ifq *ifq, } static int io_zcrx_create_area(struct io_zcrx_ifq *ifq, - struct io_uring_zcrx_area_reg *area_reg) + struct io_uring_zcrx_area_reg *area_reg, + struct io_uring_zcrx_ifq_reg *reg) { + int buf_size_shift = PAGE_SHIFT; struct io_zcrx_area *area; unsigned nr_iovs; int i, ret; + if (reg->rx_buf_len) { + if (!is_power_of_2(reg->rx_buf_len) || + reg->rx_buf_len < PAGE_SIZE) + return -EINVAL; + buf_size_shift = ilog2(reg->rx_buf_len); + } + ret = -ENOMEM; area = kzalloc(sizeof(*area), GFP_KERNEL); if (!area) @@ -432,7 +453,12 @@ static int io_zcrx_create_area(struct io_zcrx_ifq *ifq, if (ret) goto err; - ifq->niov_shift = PAGE_SHIFT; + if (buf_size_shift > io_area_max_shift(&area->mem)) { + ret = -ERANGE; + goto err; + } + + ifq->niov_shift = buf_size_shift; nr_iovs = area->mem.size >> ifq->niov_shift; area->nia.num_niovs = nr_iovs; @@ -742,8 +768,7 @@ int io_register_zcrx_ifq(struct io_ring_ctx *ctx, return -EINVAL; if (copy_from_user(®, arg, sizeof(reg))) return -EFAULT; - if (!mem_is_zero(®.__resv, sizeof(reg.__resv)) || - reg.__resv2 || reg.zcrx_id) + if (!mem_is_zero(®.__resv, sizeof(reg.__resv)) || reg.zcrx_id) return -EINVAL; if (reg.flags & ZCRX_REG_IMPORT) return import_zcrx(ctx, arg, ®); @@ -800,10 +825,11 @@ int io_register_zcrx_ifq(struct io_ring_ctx *ctx, } get_device(ifq->dev); - ret = io_zcrx_create_area(ifq, &area); + ret = io_zcrx_create_area(ifq, &area, ®); if (ret) goto netdev_put_unlock; + mp_param.rx_page_size = 1U << ifq->niov_shift; mp_param.mp_ops = &io_uring_pp_zc_ops; mp_param.mp_priv = ifq; ret = __net_mp_open_rxq(ifq->netdev, reg.if_rxq, &mp_param, NULL); @@ -821,6 +847,8 @@ int io_register_zcrx_ifq(struct io_ring_ctx *ctx, goto err; } + reg.rx_buf_len = 1U << ifq->niov_shift; + if (copy_to_user(arg, ®, sizeof(reg)) || copy_to_user(u64_to_user_ptr(reg.region_ptr), &rd, sizeof(rd)) || copy_to_user(u64_to_user_ptr(reg.area_ptr), &area, sizeof(area))) { -- cgit v1.2.3 From 2d419c44658f75e7655794341a95c0687830f3df Mon Sep 17 00:00:00 2001 From: Menglong Dong Date: Sat, 24 Jan 2026 14:19:56 +0800 Subject: bpf: add fsession support The fsession is something that similar to kprobe session. It allow to attach a single BPF program to both the entry and the exit of the target functions. Introduce the struct bpf_fsession_link, which allows to add the link to both the fentry and fexit progs_hlist of the trampoline. Signed-off-by: Menglong Dong Co-developed-by: Leon Hwang Signed-off-by: Leon Hwang Link: https://lore.kernel.org/r/20260124062008.8657-2-dongml2@chinatelecom.cn Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 19 ++++++++ include/uapi/linux/bpf.h | 1 + kernel/bpf/btf.c | 2 + kernel/bpf/syscall.c | 18 +++++++- kernel/bpf/trampoline.c | 53 ++++++++++++++++++---- kernel/bpf/verifier.c | 12 +++-- net/bpf/test_run.c | 1 + net/core/bpf_sk_storage.c | 1 + tools/include/uapi/linux/bpf.h | 1 + .../selftests/bpf/prog_tests/tracing_failure.c | 2 +- 10 files changed, 97 insertions(+), 13 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 5936f8e2996f..41228b0add52 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1309,6 +1309,7 @@ enum bpf_tramp_prog_type { BPF_TRAMP_MODIFY_RETURN, BPF_TRAMP_MAX, BPF_TRAMP_REPLACE, /* more than MAX */ + BPF_TRAMP_FSESSION, }; struct bpf_tramp_image { @@ -1875,6 +1876,11 @@ struct bpf_tracing_link { struct bpf_prog *tgt_prog; }; +struct bpf_fsession_link { + struct bpf_tracing_link link; + struct bpf_tramp_link fexit; +}; + struct bpf_raw_tp_link { struct bpf_link link; struct bpf_raw_event_map *btp; @@ -2169,6 +2175,19 @@ static inline void bpf_struct_ops_desc_release(struct bpf_struct_ops_desc *st_op #endif +static inline int bpf_fsession_cnt(struct bpf_tramp_links *links) +{ + struct bpf_tramp_links fentries = links[BPF_TRAMP_FENTRY]; + int cnt = 0; + + for (int i = 0; i < links[BPF_TRAMP_FENTRY].nr_links; i++) { + if (fentries.links[i]->link.prog->expected_attach_type == BPF_TRACE_FSESSION) + cnt++; + } + + return cnt; +} + int bpf_prog_ctx_arg_info_init(struct bpf_prog *prog, const struct bpf_ctx_arg_aux *info, u32 cnt); diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 2a2ade4be60f..44e7dbc278e3 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -1145,6 +1145,7 @@ enum bpf_attach_type { BPF_NETKIT_PEER, BPF_TRACE_KPROBE_SESSION, BPF_TRACE_UPROBE_SESSION, + BPF_TRACE_FSESSION, __MAX_BPF_ATTACH_TYPE }; diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index d10b3404260f..8959f3bc1e92 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -6219,6 +6219,7 @@ static int btf_validate_prog_ctx_type(struct bpf_verifier_log *log, const struct case BPF_TRACE_FENTRY: case BPF_TRACE_FEXIT: case BPF_MODIFY_RETURN: + case BPF_TRACE_FSESSION: /* allow u64* as ctx */ if (btf_is_int(t) && t->size == 8) return 0; @@ -6820,6 +6821,7 @@ bool btf_ctx_access(int off, int size, enum bpf_access_type type, fallthrough; case BPF_LSM_CGROUP: case BPF_TRACE_FEXIT: + case BPF_TRACE_FSESSION: /* When LSM programs are attached to void LSM hooks * they use FEXIT trampolines and when attached to * int LSM hooks, they use MODIFY_RETURN trampolines. diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 3c5c03d43f5f..b9184545c3fd 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -3577,6 +3577,7 @@ static int bpf_tracing_prog_attach(struct bpf_prog *prog, case BPF_PROG_TYPE_TRACING: if (prog->expected_attach_type != BPF_TRACE_FENTRY && prog->expected_attach_type != BPF_TRACE_FEXIT && + prog->expected_attach_type != BPF_TRACE_FSESSION && prog->expected_attach_type != BPF_MODIFY_RETURN) { err = -EINVAL; goto out_put_prog; @@ -3626,7 +3627,21 @@ static int bpf_tracing_prog_attach(struct bpf_prog *prog, key = bpf_trampoline_compute_key(tgt_prog, NULL, btf_id); } - link = kzalloc(sizeof(*link), GFP_USER); + if (prog->expected_attach_type == BPF_TRACE_FSESSION) { + struct bpf_fsession_link *fslink; + + fslink = kzalloc(sizeof(*fslink), GFP_USER); + if (fslink) { + bpf_link_init(&fslink->fexit.link, BPF_LINK_TYPE_TRACING, + &bpf_tracing_link_lops, prog, attach_type); + fslink->fexit.cookie = bpf_cookie; + link = &fslink->link; + } else { + link = NULL; + } + } else { + link = kzalloc(sizeof(*link), GFP_USER); + } if (!link) { err = -ENOMEM; goto out_put_prog; @@ -4350,6 +4365,7 @@ attach_type_to_prog_type(enum bpf_attach_type attach_type) case BPF_TRACE_RAW_TP: case BPF_TRACE_FENTRY: case BPF_TRACE_FEXIT: + case BPF_TRACE_FSESSION: case BPF_MODIFY_RETURN: return BPF_PROG_TYPE_TRACING; case BPF_LSM_MAC: diff --git a/kernel/bpf/trampoline.c b/kernel/bpf/trampoline.c index 2a125d063e62..edf9da43762d 100644 --- a/kernel/bpf/trampoline.c +++ b/kernel/bpf/trampoline.c @@ -109,10 +109,17 @@ bool bpf_prog_has_trampoline(const struct bpf_prog *prog) enum bpf_attach_type eatype = prog->expected_attach_type; enum bpf_prog_type ptype = prog->type; - return (ptype == BPF_PROG_TYPE_TRACING && - (eatype == BPF_TRACE_FENTRY || eatype == BPF_TRACE_FEXIT || - eatype == BPF_MODIFY_RETURN)) || - (ptype == BPF_PROG_TYPE_LSM && eatype == BPF_LSM_MAC); + switch (ptype) { + case BPF_PROG_TYPE_TRACING: + if (eatype == BPF_TRACE_FENTRY || eatype == BPF_TRACE_FEXIT || + eatype == BPF_MODIFY_RETURN || eatype == BPF_TRACE_FSESSION) + return true; + return false; + case BPF_PROG_TYPE_LSM: + return eatype == BPF_LSM_MAC; + default: + return false; + } } void bpf_image_ksym_init(void *data, unsigned int size, struct bpf_ksym *ksym) @@ -559,6 +566,8 @@ static enum bpf_tramp_prog_type bpf_attach_type_to_tramp(struct bpf_prog *prog) return BPF_TRAMP_MODIFY_RETURN; case BPF_TRACE_FEXIT: return BPF_TRAMP_FEXIT; + case BPF_TRACE_FSESSION: + return BPF_TRAMP_FSESSION; case BPF_LSM_MAC: if (!prog->aux->attach_func_proto->type) /* The function returns void, we cannot modify its @@ -594,8 +603,10 @@ static int __bpf_trampoline_link_prog(struct bpf_tramp_link *link, struct bpf_trampoline *tr, struct bpf_prog *tgt_prog) { + struct bpf_fsession_link *fslink = NULL; enum bpf_tramp_prog_type kind; struct bpf_tramp_link *link_exiting; + struct hlist_head *prog_list; int err = 0; int cnt = 0, i; @@ -621,24 +632,43 @@ static int __bpf_trampoline_link_prog(struct bpf_tramp_link *link, BPF_MOD_JUMP, NULL, link->link.prog->bpf_func); } + if (kind == BPF_TRAMP_FSESSION) { + prog_list = &tr->progs_hlist[BPF_TRAMP_FENTRY]; + cnt++; + } else { + prog_list = &tr->progs_hlist[kind]; + } if (cnt >= BPF_MAX_TRAMP_LINKS) return -E2BIG; if (!hlist_unhashed(&link->tramp_hlist)) /* prog already linked */ return -EBUSY; - hlist_for_each_entry(link_exiting, &tr->progs_hlist[kind], tramp_hlist) { + hlist_for_each_entry(link_exiting, prog_list, tramp_hlist) { if (link_exiting->link.prog != link->link.prog) continue; /* prog already linked */ return -EBUSY; } - hlist_add_head(&link->tramp_hlist, &tr->progs_hlist[kind]); - tr->progs_cnt[kind]++; + hlist_add_head(&link->tramp_hlist, prog_list); + if (kind == BPF_TRAMP_FSESSION) { + tr->progs_cnt[BPF_TRAMP_FENTRY]++; + fslink = container_of(link, struct bpf_fsession_link, link.link); + hlist_add_head(&fslink->fexit.tramp_hlist, &tr->progs_hlist[BPF_TRAMP_FEXIT]); + tr->progs_cnt[BPF_TRAMP_FEXIT]++; + } else { + tr->progs_cnt[kind]++; + } err = bpf_trampoline_update(tr, true /* lock_direct_mutex */); if (err) { hlist_del_init(&link->tramp_hlist); - tr->progs_cnt[kind]--; + if (kind == BPF_TRAMP_FSESSION) { + tr->progs_cnt[BPF_TRAMP_FENTRY]--; + hlist_del_init(&fslink->fexit.tramp_hlist); + tr->progs_cnt[BPF_TRAMP_FEXIT]--; + } else { + tr->progs_cnt[kind]--; + } } return err; } @@ -672,6 +702,13 @@ static int __bpf_trampoline_unlink_prog(struct bpf_tramp_link *link, guard(mutex)(&tgt_prog->aux->ext_mutex); tgt_prog->aux->is_extended = false; return err; + } else if (kind == BPF_TRAMP_FSESSION) { + struct bpf_fsession_link *fslink = + container_of(link, struct bpf_fsession_link, link.link); + + hlist_del_init(&fslink->fexit.tramp_hlist); + tr->progs_cnt[BPF_TRAMP_FEXIT]--; + kind = BPF_TRAMP_FENTRY; } hlist_del_init(&link->tramp_hlist); tr->progs_cnt[kind]--; diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index c7f5234d5fd2..41bbed6418b5 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -17848,6 +17848,7 @@ static int check_return_code(struct bpf_verifier_env *env, int regno, const char switch (env->prog->expected_attach_type) { case BPF_TRACE_FENTRY: case BPF_TRACE_FEXIT: + case BPF_TRACE_FSESSION: range = retval_range(0, 0); break; case BPF_TRACE_RAW_TP: @@ -23774,6 +23775,7 @@ patch_map_ops_generic: if (prog_type == BPF_PROG_TYPE_TRACING && insn->imm == BPF_FUNC_get_func_ret) { if (eatype == BPF_TRACE_FEXIT || + eatype == BPF_TRACE_FSESSION || eatype == BPF_MODIFY_RETURN) { /* Load nr_args from ctx - 8 */ insn_buf[0] = BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_1, -8); @@ -24725,7 +24727,8 @@ int bpf_check_attach_target(struct bpf_verifier_log *log, if (tgt_prog->type == BPF_PROG_TYPE_TRACING && prog_extension && (tgt_prog->expected_attach_type == BPF_TRACE_FENTRY || - tgt_prog->expected_attach_type == BPF_TRACE_FEXIT)) { + tgt_prog->expected_attach_type == BPF_TRACE_FEXIT || + tgt_prog->expected_attach_type == BPF_TRACE_FSESSION)) { /* Program extensions can extend all program types * except fentry/fexit. The reason is the following. * The fentry/fexit programs are used for performance @@ -24740,7 +24743,7 @@ int bpf_check_attach_target(struct bpf_verifier_log *log, * beyond reasonable stack size. Hence extending fentry * is not allowed. */ - bpf_log(log, "Cannot extend fentry/fexit\n"); + bpf_log(log, "Cannot extend fentry/fexit/fsession\n"); return -EINVAL; } } else { @@ -24824,6 +24827,7 @@ int bpf_check_attach_target(struct bpf_verifier_log *log, case BPF_LSM_CGROUP: case BPF_TRACE_FENTRY: case BPF_TRACE_FEXIT: + case BPF_TRACE_FSESSION: if (!btf_type_is_func(t)) { bpf_log(log, "attach_btf_id %u is not a function\n", btf_id); @@ -24990,6 +24994,7 @@ static bool can_be_sleepable(struct bpf_prog *prog) case BPF_TRACE_FEXIT: case BPF_MODIFY_RETURN: case BPF_TRACE_ITER: + case BPF_TRACE_FSESSION: return true; default: return false; @@ -25071,9 +25076,10 @@ static int check_attach_btf_id(struct bpf_verifier_env *env) tgt_info.tgt_name); return -EINVAL; } else if ((prog->expected_attach_type == BPF_TRACE_FEXIT || + prog->expected_attach_type == BPF_TRACE_FSESSION || prog->expected_attach_type == BPF_MODIFY_RETURN) && btf_id_set_contains(&noreturn_deny, btf_id)) { - verbose(env, "Attaching fexit/fmod_ret to __noreturn function '%s' is rejected.\n", + verbose(env, "Attaching fexit/fsession/fmod_ret to __noreturn function '%s' is rejected.\n", tgt_info.tgt_name); return -EINVAL; } diff --git a/net/bpf/test_run.c b/net/bpf/test_run.c index 26cfcfdc45eb..178c4738e63b 100644 --- a/net/bpf/test_run.c +++ b/net/bpf/test_run.c @@ -685,6 +685,7 @@ int bpf_prog_test_run_tracing(struct bpf_prog *prog, switch (prog->expected_attach_type) { case BPF_TRACE_FENTRY: case BPF_TRACE_FEXIT: + case BPF_TRACE_FSESSION: if (bpf_fentry_test1(1) != 2 || bpf_fentry_test2(2, 3) != 5 || bpf_fentry_test3(4, 5, 6) != 15 || diff --git a/net/core/bpf_sk_storage.c b/net/core/bpf_sk_storage.c index 850dd736ccd1..de111818f3a0 100644 --- a/net/core/bpf_sk_storage.c +++ b/net/core/bpf_sk_storage.c @@ -365,6 +365,7 @@ static bool bpf_sk_storage_tracing_allowed(const struct bpf_prog *prog) return true; case BPF_TRACE_FENTRY: case BPF_TRACE_FEXIT: + case BPF_TRACE_FSESSION: return !!strncmp(prog->aux->attach_func_name, "bpf_sk_storage", strlen("bpf_sk_storage")); default: diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index b816bc53d2e1..3ca7d76e05f0 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -1145,6 +1145,7 @@ enum bpf_attach_type { BPF_NETKIT_PEER, BPF_TRACE_KPROBE_SESSION, BPF_TRACE_UPROBE_SESSION, + BPF_TRACE_FSESSION, __MAX_BPF_ATTACH_TYPE }; diff --git a/tools/testing/selftests/bpf/prog_tests/tracing_failure.c b/tools/testing/selftests/bpf/prog_tests/tracing_failure.c index 10e231965589..f9f9e1cb87bf 100644 --- a/tools/testing/selftests/bpf/prog_tests/tracing_failure.c +++ b/tools/testing/selftests/bpf/prog_tests/tracing_failure.c @@ -73,7 +73,7 @@ static void test_tracing_deny(void) static void test_fexit_noreturns(void) { test_tracing_fail_prog("fexit_noreturns", - "Attaching fexit/fmod_ret to __noreturn function 'do_exit' is rejected."); + "Attaching fexit/fsession/fmod_ret to __noreturn function 'do_exit' is rejected."); } void test_tracing_failure(void) -- cgit v1.2.3 From 0ac903d1bfdce8ff40657c2b7d996947b72b6645 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Tue, 9 Dec 2025 19:28:50 -0500 Subject: NFS: NFSERR_INVAL is not defined by NFSv2 A documenting comment in include/uapi/linux/nfs.h claims incorrectly that NFSv2 defines NFSERR_INVAL. There is no such definition in either RFC 1094 or https://pubs.opengroup.org/onlinepubs/9629799/chap7.htm NFS3ERR_INVAL is introduced in RFC 1813. NFSD returns NFSERR_INVAL for PROC_GETACL, which has no specification (yet). However, nfsd_map_status() maps nfserr_symlink and nfserr_wrong_type to nfserr_inval, which does not align with RFC 1094. This logic was introduced only recently by commit 438f81e0e92a ("nfsd: move error choice for incorrect object types to version-specific code."). Given that we have no INVAL or SERVERFAULT status in NFSv2, probably the only choice is NFSERR_IO. Fixes: 438f81e0e92a ("nfsd: move error choice for incorrect object types to version-specific code.") Reviewed-by: NeilBrown Reviewed-by: Jeff Layton Signed-off-by: Chuck Lever --- fs/nfsd/nfs2acl.c | 2 +- fs/nfsd/nfsproc.c | 2 +- include/uapi/linux/nfs.h | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) (limited to 'include/uapi/linux') diff --git a/fs/nfsd/nfs2acl.c b/fs/nfsd/nfs2acl.c index 5fb202acb0fd..0ac538c76180 100644 --- a/fs/nfsd/nfs2acl.c +++ b/fs/nfsd/nfs2acl.c @@ -45,7 +45,7 @@ static __be32 nfsacld_proc_getacl(struct svc_rqst *rqstp) inode = d_inode(fh->fh_dentry); if (argp->mask & ~NFS_ACL_MASK) { - resp->status = nfserr_inval; + resp->status = nfserr_io; goto out; } resp->mask = argp->mask; diff --git a/fs/nfsd/nfsproc.c b/fs/nfsd/nfsproc.c index 481e789a7697..8873033d1e82 100644 --- a/fs/nfsd/nfsproc.c +++ b/fs/nfsd/nfsproc.c @@ -33,7 +33,7 @@ static __be32 nfsd_map_status(__be32 status) break; case nfserr_symlink: case nfserr_wrong_type: - status = nfserr_inval; + status = nfserr_io; break; } return status; diff --git a/include/uapi/linux/nfs.h b/include/uapi/linux/nfs.h index 71c7196d3281..e629c4953534 100644 --- a/include/uapi/linux/nfs.h +++ b/include/uapi/linux/nfs.h @@ -55,7 +55,7 @@ NFSERR_NODEV = 19, /* v2 v3 v4 */ NFSERR_NOTDIR = 20, /* v2 v3 v4 */ NFSERR_ISDIR = 21, /* v2 v3 v4 */ - NFSERR_INVAL = 22, /* v2 v3 v4 */ + NFSERR_INVAL = 22, /* v3 v4 */ NFSERR_FBIG = 27, /* v2 v3 v4 */ NFSERR_NOSPC = 28, /* v2 v3 v4 */ NFSERR_ROFS = 30, /* v2 v3 v4 */ -- cgit v1.2.3 From 1965bbb8f3c72e5f1972b5eeb6f19a36664a676d Mon Sep 17 00:00:00 2001 From: Thomas Weißschuh Date: Mon, 22 Dec 2025 08:55:10 +0100 Subject: ipc/shm: uapi: remove dependency on libc MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Using libc types and headers from the UAPI headers is problematic as it introduces a dependency on a full C toolchain. shm.h does not even use any symbols from the libc header as the usage of getpagesize() was removed a decade ago in commit 060028bac94b ("ipc/shm.c: increase the defaults for SHMALL, SHMMAX") Drop the unnecessary inclusion. Link: https://lkml.kernel.org/r/20251222-uapi-shm-v1-1-270bb7f75d97@linutronix.de Signed-off-by: Thomas Weißschuh Cc: Arnd Bergmann Signed-off-by: Andrew Morton --- include/uapi/linux/shm.h | 3 --- 1 file changed, 3 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/shm.h b/include/uapi/linux/shm.h index 8d1f17a4e08e..7269f9f402e3 100644 --- a/include/uapi/linux/shm.h +++ b/include/uapi/linux/shm.h @@ -5,9 +5,6 @@ #include #include #include -#ifndef __KERNEL__ -#include -#endif /* * SHMMNI, SHMMAX and SHMALL are default upper limits which can be -- cgit v1.2.3 From 3d702678f57edc524f73a7865382ae304269f590 Mon Sep 17 00:00:00 2001 From: Jinjiang Tu Date: Tue, 23 Dec 2025 19:05:23 +0800 Subject: mm/mempolicy: fix mpol_rebind_nodemask() for MPOL_F_NUMA_BALANCING commit bda420b98505 ("numa balancing: migrate on fault among multiple bound nodes") adds new flag MPOL_F_NUMA_BALANCING to enable NUMA balancing for MPOL_BIND memory policy. When the cpuset of tasks changes, the mempolicy of the task is rebound by mpol_rebind_nodemask(). When MPOL_F_STATIC_NODES and MPOL_F_RELATIVE_NODES are both not set, the behaviour of rebinding should be same whenever MPOL_F_NUMA_BALANCING is set or not. So, when an application calls set_mempolicy() with MPOL_F_NUMA_BALANCING set but both MPOL_F_STATIC_NODES and MPOL_F_RELATIVE_NODES cleared, mempolicy.w.cpuset_mems_allowed should be set to cpuset_current_mems_allowed nodemask. However, in current implementation, mpol_store_user_nodemask() wrongly returns true, causing mempolicy->w.user_nodemask to be incorrectly set to the user-specified nodemask. Later, when the cpuset of the application changes, mpol_rebind_nodemask() ends up rebinding based on the user-specified nodemask rather than the cpuset_mems_allowed nodemask as intended. I can reproduce with the following steps in qemu with 4 NUMA nodes: 1. echo '+cpuset' > /sys/fs/cgroup/cgroup.subtree_control 2. mkdir /sys/fs/cgroup/test 3. ./reproducer & 4. cat /proc/$pid/numa_maps, the task is bound to NUMA 1 5. echo $pid > /sys/fs/cgroup/test/cgroup.procs 6. cat /proc/$pid/numa_maps, the task is bound to NUMA 0 now. The reproducer code: int main() { struct bitmask *bmp; int ret; bmp = numa_parse_nodestring("1"); ret = set_mempolicy(MPOL_BIND | MPOL_F_NUMA_BALANCING, bmp->maskp, bmp->size + 1); if (ret < 0) { perror("Failed to call set_mempolicy"); exit(-1); } while (1); return 0; } If I call set_mempolicy() without MPOL_F_NUMA_BALANCING in the reproducer code. After step 5, the task is still bound to NUMA 1. To fix this, only set mempolicy->w.user_nodemask to the user-specified nodemask if MPOL_F_STATIC_NODES or MPOL_F_RELATIVE_NODES is present. Link: https://lkml.kernel.org/r/20260120011018.1256654-1-tujinjiang@huawei.com Link: https://lkml.kernel.org/r/20251223110523.1161421-1-tujinjiang@huawei.com Fixes: bda420b98505 ("numa balancing: migrate on fault among multiple bound nodes") Signed-off-by: Jinjiang Tu Reviewed-by: Gregory Price Reviewed-by: Huang Ying Acked-by: David Hildenbrand (Red Hat) Cc: Alistair Popple Cc: Byungchul Park Cc: Joshua Hahn Cc: Kefeng Wang Cc: Mathew Brost Cc: Mel Gorman Cc: Rakie Kim Cc: Zi Yan Signed-off-by: Andrew Morton --- include/uapi/linux/mempolicy.h | 3 +++ mm/mempolicy.c | 2 +- 2 files changed, 4 insertions(+), 1 deletion(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/mempolicy.h b/include/uapi/linux/mempolicy.h index 8fbbe613611a..6c962d866e86 100644 --- a/include/uapi/linux/mempolicy.h +++ b/include/uapi/linux/mempolicy.h @@ -39,6 +39,9 @@ enum { #define MPOL_MODE_FLAGS \ (MPOL_F_STATIC_NODES | MPOL_F_RELATIVE_NODES | MPOL_F_NUMA_BALANCING) +/* Whether the nodemask is specified by users */ +#define MPOL_USER_NODEMASK_FLAGS (MPOL_F_STATIC_NODES | MPOL_F_RELATIVE_NODES) + /* Flags for get_mempolicy */ #define MPOL_F_NODE (1<<0) /* return next IL mode instead of node mask */ #define MPOL_F_ADDR (1<<1) /* look up vma using address */ diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 68a98ba57882..76da50425712 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -365,7 +365,7 @@ static const struct mempolicy_operations { static inline int mpol_store_user_nodemask(const struct mempolicy *pol) { - return pol->flags & MPOL_MODE_FLAGS; + return pol->flags & MPOL_USER_NODEMASK_FLAGS; } static void mpol_relative_nodemask(nodemask_t *ret, const nodemask_t *orig, -- cgit v1.2.3 From 86c6b6e4d187652d718915e15cf126f98e24e955 Mon Sep 17 00:00:00 2001 From: Avraham Stern Date: Sun, 11 Jan 2026 19:03:48 +0200 Subject: wifi: nl80211/cfg80211: add new FTM capabilities Add new capabilities to the PMSR FTM capabilities list. The new capabilities include 6 GHz support, supported number of spatial streams and supported number of LTF repetitions. Signed-off-by: Avraham Stern Tested-by: Miriam Rachel Korenblit Signed-off-by: Miri Korenblit Link: https://patch.msgid.link/20260111190221.bf43785c18f6.Ic98cf9790ddee84bf88e5720b93c46c23af3c96c@changeid Signed-off-by: Johannes Berg --- include/net/cfg80211.h | 20 +++++++++++++++++++- include/uapi/linux/nl80211.h | 29 +++++++++++++++++++++++++++++ net/wireless/nl80211.c | 23 +++++++++++++++++++++++ 3 files changed, 71 insertions(+), 1 deletion(-) (limited to 'include/uapi/linux') diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index 6d8e35a0dde4..8153b6aaa998 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -5643,6 +5643,17 @@ cfg80211_get_iftype_ext_capa(struct wiphy *wiphy, enum nl80211_iftype type); * not limited) * @ftm.trigger_based: trigger based ranging measurement is supported * @ftm.non_trigger_based: non trigger based ranging measurement is supported + * @ftm.support_6ghz: supports ranging in 6 GHz band + * @ftm.max_tx_ltf_rep: maximum number of TX LTF repetitions supported (0 means + * only one LTF, no repetitions) + * @ftm.max_rx_ltf_rep: maximum number of RX LTF repetitions supported (0 means + * only one LTF, no repetitions) + * @ftm.max_tx_sts: maximum number of TX STS supported (zero based) + * @ftm.max_rx_sts: maximum number of RX STS supported (zero based) + * @ftm.max_total_ltf_tx: maximum total number of LTFs that can be transmitted + * (0 means unknown) + * @ftm.max_total_ltf_rx: maximum total number of LTFs that can be received + * (0 means unknown) */ struct cfg80211_pmsr_capabilities { unsigned int max_peers; @@ -5660,7 +5671,14 @@ struct cfg80211_pmsr_capabilities { request_lci:1, request_civicloc:1, trigger_based:1, - non_trigger_based:1; + non_trigger_based:1, + support_6ghz:1; + u8 max_tx_ltf_rep; + u8 max_rx_ltf_rep; + u8 max_tx_sts; + u8 max_rx_sts; + u8 max_total_ltf_tx; + u8 max_total_ltf_rx; } ftm; }; diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h index b0f050e36fa4..200703c8b2c1 100644 --- a/include/uapi/linux/nl80211.h +++ b/include/uapi/linux/nl80211.h @@ -7790,6 +7790,28 @@ enum nl80211_peer_measurement_attrs { * trigger based ranging measurement is supported * @NL80211_PMSR_FTM_CAPA_ATTR_NON_TRIGGER_BASED: flag attribute indicating * if non-trigger-based ranging measurement is supported + * @NL80211_PMSR_FTM_CAPA_ATTR_6GHZ_SUPPORT: flag attribute indicating if + * ranging on the 6 GHz band is supported + * @NL80211_PMSR_FTM_CAPA_ATTR_MAX_TX_LTF_REP: u32 attribute indicating + * the maximum number of LTF repetitions the device can transmit in the + * preamble of the ranging NDP (zero means only one LTF, no repetitions) + * @NL80211_PMSR_FTM_CAPA_ATTR_MAX_RX_LTF_REP: u32 attribute indicating + * the maximum number of LTF repetitions the device can receive in the + * preamble of the ranging NDP (zero means only one LTF, no repetitions) + * @NL80211_PMSR_FTM_CAPA_ATTR_MAX_TX_STS: u32 attribute indicating + * the maximum number of space-time streams supported for ranging NDP TX + * (zero-based) + * @NL80211_PMSR_FTM_CAPA_ATTR_MAX_RX_STS: u32 attribute indicating + * the maximum number of space-time streams supported for ranging NDP RX + * (zero-based) + * @NL80211_PMSR_FTM_CAPA_ATTR_MAX_TOTAL_LTF_TX: u32 attribute indicating the + * maximum total number of LTFs the device can transmit. The total number + * of LTFs is (number of LTF repetitions) * (number of space-time streams). + * This limits the allowed combinations of LTF repetitions and STS. + * @NL80211_PMSR_FTM_CAPA_ATTR_MAX_TOTAL_LTF_RX: u32 attribute indicating the + * maximum total number of LTFs the device can receive. The total number + * of LTFs is (number of LTF repetitions) * (number of space-time streams). + * This limits the allowed combinations of LTF repetitions and STS. * * @NUM_NL80211_PMSR_FTM_CAPA_ATTR: internal * @NL80211_PMSR_FTM_CAPA_ATTR_MAX: highest attribute number @@ -7807,6 +7829,13 @@ enum nl80211_peer_measurement_ftm_capa { NL80211_PMSR_FTM_CAPA_ATTR_MAX_FTMS_PER_BURST, NL80211_PMSR_FTM_CAPA_ATTR_TRIGGER_BASED, NL80211_PMSR_FTM_CAPA_ATTR_NON_TRIGGER_BASED, + NL80211_PMSR_FTM_CAPA_ATTR_6GHZ_SUPPORT, + NL80211_PMSR_FTM_CAPA_ATTR_MAX_TX_LTF_REP, + NL80211_PMSR_FTM_CAPA_ATTR_MAX_RX_LTF_REP, + NL80211_PMSR_FTM_CAPA_ATTR_MAX_TX_STS, + NL80211_PMSR_FTM_CAPA_ATTR_MAX_RX_STS, + NL80211_PMSR_FTM_CAPA_ATTR_MAX_TOTAL_LTF_TX, + NL80211_PMSR_FTM_CAPA_ATTR_MAX_TOTAL_LTF_RX, /* keep last */ NUM_NL80211_PMSR_FTM_CAPA_ATTR, diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index 56cc5ed33ea3..74ea922a5e8a 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -2313,6 +2313,29 @@ nl80211_send_pmsr_ftm_capa(const struct cfg80211_pmsr_capabilities *cap, if (cap->ftm.non_trigger_based && nla_put_flag(msg, NL80211_PMSR_FTM_CAPA_ATTR_NON_TRIGGER_BASED)) return -ENOBUFS; + if (cap->ftm.support_6ghz && + nla_put_flag(msg, NL80211_PMSR_FTM_CAPA_ATTR_6GHZ_SUPPORT)) + return -ENOBUFS; + if (nla_put_u32(msg, NL80211_PMSR_FTM_CAPA_ATTR_MAX_TX_LTF_REP, + cap->ftm.max_tx_ltf_rep)) + return -ENOBUFS; + if (nla_put_u32(msg, NL80211_PMSR_FTM_CAPA_ATTR_MAX_RX_LTF_REP, + cap->ftm.max_rx_ltf_rep)) + return -ENOBUFS; + if (nla_put_u32(msg, NL80211_PMSR_FTM_CAPA_ATTR_MAX_TX_STS, + cap->ftm.max_tx_sts)) + return -ENOBUFS; + if (nla_put_u32(msg, NL80211_PMSR_FTM_CAPA_ATTR_MAX_RX_STS, + cap->ftm.max_rx_sts)) + return -ENOBUFS; + if (cap->ftm.max_total_ltf_tx > 0 && + nla_put_u32(msg, NL80211_PMSR_FTM_CAPA_ATTR_MAX_TOTAL_LTF_TX, + cap->ftm.max_total_ltf_tx)) + return -ENOBUFS; + if (cap->ftm.max_total_ltf_rx > 0 && + nla_put_u32(msg, NL80211_PMSR_FTM_CAPA_ATTR_MAX_TOTAL_LTF_RX, + cap->ftm.max_total_ltf_rx)) + return -ENOBUFS; nla_nest_end(msg, ftm); return 0; -- cgit v1.2.3 From 853ce6943c385be2f6cccf371080e592f2e08b0f Mon Sep 17 00:00:00 2001 From: Avraham Stern Date: Sun, 11 Jan 2026 19:03:49 +0200 Subject: wifi: nl80211/cfg80211: clarify periodic FTM parameters for non-EDCA based ranging Periodic FTM request attributes are defined based on the periodic parameters used in EDCA-based ranging negotiation. However, non-EDCA based ranging (trigger-based/non-trigger-based) does not include periodic parameters in the negotiation protocol, even though upper layers may still request periodic measurements. Clarify the semantics of periodic ranging attributes when used with non-EDCA based ranging. Signed-off-by: Avraham Stern Reviewed-by: Johannes Berg Signed-off-by: Miri Korenblit Link: https://patch.msgid.link/20260111190221.b89cb3f68e1a.I7a9d8c6d1c66c77f1b43120a841101c96c3f19ad@changeid Signed-off-by: Johannes Berg --- include/net/cfg80211.h | 4 +++- include/uapi/linux/nl80211.h | 7 +++++-- net/wireless/pmsr.c | 11 ++++++----- 3 files changed, 14 insertions(+), 8 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index 8153b6aaa998..8a81adbf3723 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -4295,7 +4295,9 @@ struct cfg80211_pmsr_result { * @burst_period: burst period to use * @asap: indicates to use ASAP mode * @num_bursts_exp: number of bursts exponent - * @burst_duration: burst duration + * @burst_duration: burst duration. If @trigger_based or @non_trigger_based is + * set, this is the burst duration in milliseconds, and zero means the + * device should pick an appropriate value based on @ftms_per_burst. * @ftms_per_burst: number of FTMs per burst * @ftmr_retries: number of retries for FTM request * @request_lci: request LCI information diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h index 200703c8b2c1..71219445f5c7 100644 --- a/include/uapi/linux/nl80211.h +++ b/include/uapi/linux/nl80211.h @@ -7851,12 +7851,15 @@ enum nl80211_peer_measurement_ftm_capa { * &enum nl80211_preamble), optional for DMG (u32) * @NL80211_PMSR_FTM_REQ_ATTR_NUM_BURSTS_EXP: number of bursts exponent as in * 802.11-2016 9.4.2.168 "Fine Timing Measurement Parameters element" - * (u8, 0-15, optional with default 15 i.e. "no preference") + * (u8, 0-15, optional with default 15 i.e. "no preference". No limit for + * non-EDCA ranging) * @NL80211_PMSR_FTM_REQ_ATTR_BURST_PERIOD: interval between bursts in units * of 100ms (u16, optional with default 0) * @NL80211_PMSR_FTM_REQ_ATTR_BURST_DURATION: burst duration, as in 802.11-2016 * Table 9-257 "Burst Duration field encoding" (u8, 0-15, optional with - * default 15 i.e. "no preference") + * default 15 i.e. "no preference"). For non-EDCA ranging, this is the + * burst duration in milliseconds (optional with default 0, i.e. let the + * device decide). * @NL80211_PMSR_FTM_REQ_ATTR_FTMS_PER_BURST: number of successful FTM frames * requested per burst * (u8, 0-31, optional with default 0 i.e. "no preference") diff --git a/net/wireless/pmsr.c b/net/wireless/pmsr.c index a117f5093ca2..795683a81303 100644 --- a/net/wireless/pmsr.c +++ b/net/wireless/pmsr.c @@ -85,11 +85,6 @@ static int pmsr_parse_ftm(struct cfg80211_registered_device *rdev, return -EINVAL; } - out->ftm.burst_duration = 15; - if (tb[NL80211_PMSR_FTM_REQ_ATTR_BURST_DURATION]) - out->ftm.burst_duration = - nla_get_u8(tb[NL80211_PMSR_FTM_REQ_ATTR_BURST_DURATION]); - out->ftm.ftms_per_burst = 0; if (tb[NL80211_PMSR_FTM_REQ_ATTR_FTMS_PER_BURST]) out->ftm.ftms_per_burst = @@ -164,6 +159,12 @@ static int pmsr_parse_ftm(struct cfg80211_registered_device *rdev, return -EINVAL; } + if (tb[NL80211_PMSR_FTM_REQ_ATTR_BURST_DURATION]) + out->ftm.burst_duration = + nla_get_u8(tb[NL80211_PMSR_FTM_REQ_ATTR_BURST_DURATION]); + else if (!out->ftm.non_trigger_based && !out->ftm.trigger_based) + out->ftm.burst_duration = 15; + out->ftm.lmr_feedback = !!tb[NL80211_PMSR_FTM_REQ_ATTR_LMR_FEEDBACK]; if (!out->ftm.trigger_based && !out->ftm.non_trigger_based && -- cgit v1.2.3 From cfd46d1c6f4bf232c5630b1cf5c8b317d38101c5 Mon Sep 17 00:00:00 2001 From: Avraham Stern Date: Sun, 11 Jan 2026 19:03:50 +0200 Subject: wifi: nl80211/cfg80211: add negotiated burst period to FTM result The FTM result includes some of the periodic measurement negotiated parameters (like the burst duration and number of bursts), but it doesn't include the burst period. Add it to the FTM result notification. Signed-off-by: Avraham Stern Reviewed-by: Johannes Berg Signed-off-by: Miri Korenblit Link: https://patch.msgid.link/20260111190221.e0778f86edef.I3c98c1933eb639963bc3ffdef81a8788b59f2188@changeid Signed-off-by: Johannes Berg --- include/net/cfg80211.h | 2 ++ include/uapi/linux/nl80211.h | 3 +++ net/wireless/pmsr.c | 1 + 3 files changed, 6 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index 8a81adbf3723..535fd95b0d83 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -4192,6 +4192,7 @@ struct cfg80211_ftm_responder_stats { * @num_bursts_exp: actual number of bursts exponent negotiated * @burst_duration: actual burst duration negotiated * @ftms_per_burst: actual FTMs per burst negotiated + * @burst_period: actual burst period negotiated in units of 100ms * @lci_len: length of LCI information (if present) * @civicloc_len: length of civic location information (if present) * @lci: LCI data (may be %NULL) @@ -4233,6 +4234,7 @@ struct cfg80211_pmsr_ftm_result { u8 num_bursts_exp; u8 burst_duration; u8 ftms_per_burst; + u16 burst_period; s32 rssi_avg; s32 rssi_spread; struct rate_info tx_rate, rx_rate; diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h index 71219445f5c7..8910b709bfb1 100644 --- a/include/uapi/linux/nl80211.h +++ b/include/uapi/linux/nl80211.h @@ -7992,6 +7992,8 @@ enum nl80211_peer_measurement_ftm_failure_reasons { * 9.4.2.22.1) starting with the Measurement Token, with Measurement * Type 11. * @NL80211_PMSR_FTM_RESP_ATTR_PAD: ignore, for u64/s64 padding only + * @NL80211_PMSR_FTM_RESP_ATTR_BURST_PERIOD: actual burst period used by + * the responder (similar to request, u16) * * @NUM_NL80211_PMSR_FTM_RESP_ATTR: internal * @NL80211_PMSR_FTM_RESP_ATTR_MAX: highest attribute number @@ -8020,6 +8022,7 @@ enum nl80211_peer_measurement_ftm_resp { NL80211_PMSR_FTM_RESP_ATTR_LCI, NL80211_PMSR_FTM_RESP_ATTR_CIVICLOC, NL80211_PMSR_FTM_RESP_ATTR_PAD, + NL80211_PMSR_FTM_RESP_ATTR_BURST_PERIOD, /* keep last */ NUM_NL80211_PMSR_FTM_RESP_ATTR, diff --git a/net/wireless/pmsr.c b/net/wireless/pmsr.c index 795683a81303..d5077d320098 100644 --- a/net/wireless/pmsr.c +++ b/net/wireless/pmsr.c @@ -454,6 +454,7 @@ static int nl80211_pmsr_send_ftm_res(struct sk_buff *msg, PUT(u8, NUM_BURSTS_EXP, num_bursts_exp); PUT(u8, BURST_DURATION, burst_duration); PUT(u8, FTMS_PER_BURST, ftms_per_burst); + PUT(u16, BURST_PERIOD, burst_period); PUTOPT(s32, RSSI_AVG, rssi_avg); PUTOPT(s32, RSSI_SPREAD, rssi_spread); if (res->ftm.tx_rate_valid && -- cgit v1.2.3 From 853800c746d38486673ef67f461b660a01d52716 Mon Sep 17 00:00:00 2001 From: Avraham Stern Date: Sun, 11 Jan 2026 19:03:51 +0200 Subject: wifi: nl80211/cfg80211: support operating as RSTA in PMSR FTM request Add an option to operate as the RSTA in an FTM measurement request. When requested, the device will dwell on the requested channel until the peer starts the FTM negotiation. This option is only valid for trigger-based/non trigger-based measurement with LMR feedback which will allow the RSTA to receive the results of the measurement. Signed-off-by: Avraham Stern Reviewed-by: Johannes Berg Signed-off-by: Miri Korenblit Link: https://patch.msgid.link/20260111190221.1f95fc0afab4.Iae2d32783b8e7c4a29089fec0f4c6bce94d303cc@changeid Signed-off-by: Johannes Berg --- include/net/cfg80211.h | 7 ++++++- include/uapi/linux/nl80211.h | 12 ++++++++++++ net/wireless/nl80211.c | 4 ++++ net/wireless/pmsr.c | 15 +++++++++++++++ 4 files changed, 37 insertions(+), 1 deletion(-) (limited to 'include/uapi/linux') diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index 535fd95b0d83..ac7df439bd24 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -4312,6 +4312,8 @@ struct cfg80211_pmsr_result { * EDCA based ranging will be used. * @lmr_feedback: negotiate for I2R LMR feedback. Only valid if either * @trigger_based or @non_trigger_based is set. + * @rsta: Operate as the RSTA in the measurement. Only valid if @lmr_feedback + * and either @trigger_based or @non_trigger_based is set. * @bss_color: the bss color of the responder. Optional. Set to zero to * indicate the driver should set the BSS color. Only valid if * @non_trigger_based or @trigger_based is set. @@ -4327,7 +4329,8 @@ struct cfg80211_pmsr_ftm_request_peer { request_civicloc:1, trigger_based:1, non_trigger_based:1, - lmr_feedback:1; + lmr_feedback:1, + rsta:1; u8 num_bursts_exp; u8 burst_duration; u8 ftms_per_burst; @@ -5658,6 +5661,7 @@ cfg80211_get_iftype_ext_capa(struct wiphy *wiphy, enum nl80211_iftype type); * (0 means unknown) * @ftm.max_total_ltf_rx: maximum total number of LTFs that can be received * (0 means unknown) + * @ftm.support_rsta: supports operating as RSTA in PMSR FTM request */ struct cfg80211_pmsr_capabilities { unsigned int max_peers; @@ -5683,6 +5687,7 @@ struct cfg80211_pmsr_capabilities { u8 max_rx_sts; u8 max_total_ltf_tx; u8 max_total_ltf_rx; + u8 support_rsta:1; } ftm; }; diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h index 8910b709bfb1..54ddbd9a5459 100644 --- a/include/uapi/linux/nl80211.h +++ b/include/uapi/linux/nl80211.h @@ -7812,6 +7812,8 @@ enum nl80211_peer_measurement_attrs { * maximum total number of LTFs the device can receive. The total number * of LTFs is (number of LTF repetitions) * (number of space-time streams). * This limits the allowed combinations of LTF repetitions and STS. + * @NL80211_PMSR_FTM_CAPA_ATTR_RSTA_SUPPORT: flag attribute indicating the + * device supports operating as the RSTA in PMSR FTM request * * @NUM_NL80211_PMSR_FTM_CAPA_ATTR: internal * @NL80211_PMSR_FTM_CAPA_ATTR_MAX: highest attribute number @@ -7836,6 +7838,7 @@ enum nl80211_peer_measurement_ftm_capa { NL80211_PMSR_FTM_CAPA_ATTR_MAX_RX_STS, NL80211_PMSR_FTM_CAPA_ATTR_MAX_TOTAL_LTF_TX, NL80211_PMSR_FTM_CAPA_ATTR_MAX_TOTAL_LTF_RX, + NL80211_PMSR_FTM_CAPA_ATTR_RSTA_SUPPORT, /* keep last */ NUM_NL80211_PMSR_FTM_CAPA_ATTR, @@ -7888,6 +7891,14 @@ enum nl80211_peer_measurement_ftm_capa { * @NL80211_PMSR_FTM_REQ_ATTR_BSS_COLOR: optional. The BSS color of the * responder. Only valid if %NL80211_PMSR_FTM_REQ_ATTR_NON_TRIGGER_BASED * or %NL80211_PMSR_FTM_REQ_ATTR_TRIGGER_BASED is set. + * @NL80211_PMSR_FTM_REQ_ATTR_RSTA: optional. Request to perform the measurement + * as the RSTA (flag). When set, the device is expected to dwell on the + * channel specified in %NL80211_PMSR_PEER_ATTR_CHAN until it receives the + * FTM request from the peer or the timeout specified by + * %NL80211_ATTR_TIMEOUT has expired. + * Only valid if %NL80211_PMSR_FTM_REQ_ATTR_LMR_FEEDBACK is set (so the + * RSTA will have the measurement results to report back in the FTM + * response). * * @NUM_NL80211_PMSR_FTM_REQ_ATTR: internal * @NL80211_PMSR_FTM_REQ_ATTR_MAX: highest attribute number @@ -7908,6 +7919,7 @@ enum nl80211_peer_measurement_ftm_req { NL80211_PMSR_FTM_REQ_ATTR_NON_TRIGGER_BASED, NL80211_PMSR_FTM_REQ_ATTR_LMR_FEEDBACK, NL80211_PMSR_FTM_REQ_ATTR_BSS_COLOR, + NL80211_PMSR_FTM_REQ_ATTR_RSTA, /* keep last */ NUM_NL80211_PMSR_FTM_REQ_ATTR, diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index 74ea922a5e8a..85e30fda4c46 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -361,6 +361,7 @@ nl80211_pmsr_ftm_req_attr_policy[NL80211_PMSR_FTM_REQ_ATTR_MAX + 1] = { [NL80211_PMSR_FTM_REQ_ATTR_NON_TRIGGER_BASED] = { .type = NLA_FLAG }, [NL80211_PMSR_FTM_REQ_ATTR_LMR_FEEDBACK] = { .type = NLA_FLAG }, [NL80211_PMSR_FTM_REQ_ATTR_BSS_COLOR] = { .type = NLA_U8 }, + [NL80211_PMSR_FTM_REQ_ATTR_RSTA] = { .type = NLA_FLAG }, }; static const struct nla_policy @@ -2336,6 +2337,9 @@ nl80211_send_pmsr_ftm_capa(const struct cfg80211_pmsr_capabilities *cap, nla_put_u32(msg, NL80211_PMSR_FTM_CAPA_ATTR_MAX_TOTAL_LTF_RX, cap->ftm.max_total_ltf_rx)) return -ENOBUFS; + if (cap->ftm.support_rsta && + nla_put_flag(msg, NL80211_PMSR_FTM_CAPA_ATTR_RSTA_SUPPORT)) + return -ENOBUFS; nla_nest_end(msg, ftm); return 0; diff --git a/net/wireless/pmsr.c b/net/wireless/pmsr.c index d5077d320098..60e1e31c2185 100644 --- a/net/wireless/pmsr.c +++ b/net/wireless/pmsr.c @@ -187,6 +187,21 @@ static int pmsr_parse_ftm(struct cfg80211_registered_device *rdev, nla_get_u8(tb[NL80211_PMSR_FTM_REQ_ATTR_BSS_COLOR]); } + out->ftm.rsta = !!tb[NL80211_PMSR_FTM_REQ_ATTR_RSTA]; + if (out->ftm.rsta && !capa->ftm.support_rsta) { + NL_SET_ERR_MSG_ATTR(info->extack, + tb[NL80211_PMSR_FTM_REQ_ATTR_RSTA], + "FTM: RSTA not supported by device"); + return -EOPNOTSUPP; + } + + if (out->ftm.rsta && !out->ftm.lmr_feedback) { + NL_SET_ERR_MSG_ATTR(info->extack, + tb[NL80211_PMSR_FTM_REQ_ATTR_RSTA], + "FTM: RSTA set without LMR feedback"); + return -EINVAL; + } + return 0; } -- cgit v1.2.3 From 752b807028e63f1473b84eb1350e131eca5e5249 Mon Sep 17 00:00:00 2001 From: Matt Bobrowski Date: Tue, 27 Jan 2026 08:51:10 +0000 Subject: bpf: add new BPF_CGROUP_ITER_CHILDREN control option Currently, the BPF cgroup iterator supports walking descendants in either pre-order (BPF_CGROUP_ITER_DESCENDANTS_PRE) or post-order (BPF_CGROUP_ITER_DESCENDANTS_POST). These modes perform an exhaustive depth-first search (DFS) of the hierarchy. In scenarios where a BPF program may need to inspect only the direct children of a given parent cgroup, a full DFS is unnecessarily expensive. This patch introduces a new BPF cgroup iterator control option, BPF_CGROUP_ITER_CHILDREN. This control option restricts the traversal to the immediate children of a specified parent cgroup, allowing for more targeted and efficient iteration, particularly when exhaustive depth-first search (DFS) traversal is not required. Signed-off-by: Matt Bobrowski Link: https://lore.kernel.org/r/20260127085112.3608687-1-mattbobrowski@google.com Signed-off-by: Alexei Starovoitov --- include/uapi/linux/bpf.h | 8 ++++++++ kernel/bpf/cgroup_iter.c | 26 +++++++++++++++++++++----- tools/include/uapi/linux/bpf.h | 8 ++++++++ 3 files changed, 37 insertions(+), 5 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 44e7dbc278e3..c8d400b7680a 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -119,6 +119,14 @@ enum bpf_cgroup_iter_order { BPF_CGROUP_ITER_DESCENDANTS_PRE, /* walk descendants in pre-order. */ BPF_CGROUP_ITER_DESCENDANTS_POST, /* walk descendants in post-order. */ BPF_CGROUP_ITER_ANCESTORS_UP, /* walk ancestors upward. */ + /* + * Walks the immediate children of the specified parent + * cgroup_subsys_state. Unlike BPF_CGROUP_ITER_DESCENDANTS_PRE, + * BPF_CGROUP_ITER_DESCENDANTS_POST, and BPF_CGROUP_ITER_ANCESTORS_UP + * the iterator does not include the specified parent as one of the + * returned iterator elements. + */ + BPF_CGROUP_ITER_CHILDREN, }; union bpf_iter_link_info { diff --git a/kernel/bpf/cgroup_iter.c b/kernel/bpf/cgroup_iter.c index f04a468cf6a7..fd51fe3d92cc 100644 --- a/kernel/bpf/cgroup_iter.c +++ b/kernel/bpf/cgroup_iter.c @@ -8,12 +8,13 @@ #include "../cgroup/cgroup-internal.h" /* cgroup_mutex and cgroup_is_dead */ -/* cgroup_iter provides four modes of traversal to the cgroup hierarchy. +/* cgroup_iter provides five modes of traversal to the cgroup hierarchy. * * 1. Walk the descendants of a cgroup in pre-order. * 2. Walk the descendants of a cgroup in post-order. * 3. Walk the ancestors of a cgroup. * 4. Show the given cgroup only. + * 5. Walk the children of a given parent cgroup. * * For walking descendants, cgroup_iter can walk in either pre-order or * post-order. For walking ancestors, the iter walks up from a cgroup to @@ -78,6 +79,8 @@ static void *cgroup_iter_seq_start(struct seq_file *seq, loff_t *pos) return css_next_descendant_pre(NULL, p->start_css); else if (p->order == BPF_CGROUP_ITER_DESCENDANTS_POST) return css_next_descendant_post(NULL, p->start_css); + else if (p->order == BPF_CGROUP_ITER_CHILDREN) + return css_next_child(NULL, p->start_css); else /* BPF_CGROUP_ITER_SELF_ONLY and BPF_CGROUP_ITER_ANCESTORS_UP */ return p->start_css; } @@ -113,6 +116,8 @@ static void *cgroup_iter_seq_next(struct seq_file *seq, void *v, loff_t *pos) return css_next_descendant_post(curr, p->start_css); else if (p->order == BPF_CGROUP_ITER_ANCESTORS_UP) return curr->parent; + else if (p->order == BPF_CGROUP_ITER_CHILDREN) + return css_next_child(curr, p->start_css); else /* BPF_CGROUP_ITER_SELF_ONLY */ return NULL; } @@ -200,11 +205,16 @@ static int bpf_iter_attach_cgroup(struct bpf_prog *prog, int order = linfo->cgroup.order; struct cgroup *cgrp; - if (order != BPF_CGROUP_ITER_DESCENDANTS_PRE && - order != BPF_CGROUP_ITER_DESCENDANTS_POST && - order != BPF_CGROUP_ITER_ANCESTORS_UP && - order != BPF_CGROUP_ITER_SELF_ONLY) + switch (order) { + case BPF_CGROUP_ITER_DESCENDANTS_PRE: + case BPF_CGROUP_ITER_DESCENDANTS_POST: + case BPF_CGROUP_ITER_ANCESTORS_UP: + case BPF_CGROUP_ITER_SELF_ONLY: + case BPF_CGROUP_ITER_CHILDREN: + break; + default: return -EINVAL; + } if (fd && id) return -EINVAL; @@ -257,6 +267,8 @@ show_order: seq_puts(seq, "order: descendants_post\n"); else if (aux->cgroup.order == BPF_CGROUP_ITER_ANCESTORS_UP) seq_puts(seq, "order: ancestors_up\n"); + else if (aux->cgroup.order == BPF_CGROUP_ITER_CHILDREN) + seq_puts(seq, "order: children\n"); else /* BPF_CGROUP_ITER_SELF_ONLY */ seq_puts(seq, "order: self_only\n"); } @@ -320,6 +332,7 @@ __bpf_kfunc int bpf_iter_css_new(struct bpf_iter_css *it, case BPF_CGROUP_ITER_DESCENDANTS_PRE: case BPF_CGROUP_ITER_DESCENDANTS_POST: case BPF_CGROUP_ITER_ANCESTORS_UP: + case BPF_CGROUP_ITER_CHILDREN: break; default: return -EINVAL; @@ -345,6 +358,9 @@ __bpf_kfunc struct cgroup_subsys_state *bpf_iter_css_next(struct bpf_iter_css *i case BPF_CGROUP_ITER_DESCENDANTS_POST: kit->pos = css_next_descendant_post(kit->pos, kit->start); break; + case BPF_CGROUP_ITER_CHILDREN: + kit->pos = css_next_child(kit->pos, kit->start); + break; case BPF_CGROUP_ITER_ANCESTORS_UP: kit->pos = kit->pos ? kit->pos->parent : kit->start; } diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 3ca7d76e05f0..5e38b4887de6 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -119,6 +119,14 @@ enum bpf_cgroup_iter_order { BPF_CGROUP_ITER_DESCENDANTS_PRE, /* walk descendants in pre-order. */ BPF_CGROUP_ITER_DESCENDANTS_POST, /* walk descendants in post-order. */ BPF_CGROUP_ITER_ANCESTORS_UP, /* walk ancestors upward. */ + /* + * Walks the immediate children of the specified parent + * cgroup_subsys_state. Unlike BPF_CGROUP_ITER_DESCENDANTS_PRE, + * BPF_CGROUP_ITER_DESCENDANTS_POST, and BPF_CGROUP_ITER_ANCESTORS_UP + * the iterator does not include the specified parent as one of the + * returned iterator elements. + */ + BPF_CGROUP_ITER_CHILDREN, }; union bpf_iter_link_info { -- cgit v1.2.3 From d42eb05e60fea31de49897d63a1d73f933303bd4 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 15 Jan 2026 08:24:02 -0700 Subject: io_uring: add support for BPF filtering for opcode restrictions Add support for loading classic BPF programs with io_uring to provide fine-grained filtering of SQE operations. Unlike IORING_REGISTER_RESTRICTIONS which only allows bitmap-based allow/deny of opcodes, BPF filters can inspect request attributes and make dynamic decisions. The filter is registered via IORING_REGISTER_BPF_FILTER with a struct io_uring_bpf: struct io_uring_bpf_filter { __u32 opcode; /* io_uring opcode to filter */ __u32 flags; __u32 filter_len; /* number of BPF instructions */ __u32 resv; __u64 filter_ptr; /* pointer to BPF filter */ __u64 resv2[5]; }; enum { IO_URING_BPF_CMD_FILTER = 1, }; struct io_uring_bpf { __u16 cmd_type; /* IO_URING_BPF_* values */ __u16 cmd_flags; /* none so far */ __u32 resv; union { struct io_uring_bpf_filter filter; }; }; and the filters get supplied a struct io_uring_bpf_ctx: struct io_uring_bpf_ctx { __u64 user_data; __u8 opcode; __u8 sqe_flags; __u8 pdu_size; __u8 pad[5]; }; where it's possible to filter on opcode and sqe_flags, with pdu_size indicating how much extra data is being passed in beyond the pad field. This will used for specific finer grained filtering inside an opcode. An example of that for sockets is in one of the following patches. Anything the opcode supports can end up in this struct, populated by the opcode itself, and hence can be filtered for. Filters have the following semantics: - Return 1 to allow the request - Return 0 to deny the request with -EACCES - Multiple filters can be stacked per opcode. All filters must return 1 for the opcode to be allowed. - Filters are evaluated in registration order (most recent first) The implementation uses classic BPF (cBPF) rather than eBPF for as that's required for containers, and since they can be used by any user in the system. Signed-off-by: Jens Axboe --- include/linux/io_uring_types.h | 9 + include/uapi/linux/io_uring.h | 3 + include/uapi/linux/io_uring/bpf_filter.h | 50 +++++ io_uring/Kconfig | 5 + io_uring/Makefile | 1 + io_uring/bpf_filter.c | 321 +++++++++++++++++++++++++++++++ io_uring/bpf_filter.h | 42 ++++ io_uring/io_uring.c | 8 + io_uring/register.c | 8 + 9 files changed, 447 insertions(+) create mode 100644 include/uapi/linux/io_uring/bpf_filter.h create mode 100644 io_uring/bpf_filter.c create mode 100644 io_uring/bpf_filter.h (limited to 'include/uapi/linux') diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h index dc6bd6940a0d..74bf98362876 100644 --- a/include/linux/io_uring_types.h +++ b/include/linux/io_uring_types.h @@ -219,9 +219,18 @@ struct io_rings { struct io_uring_cqe cqes[] ____cacheline_aligned_in_smp; }; +struct io_bpf_filter; +struct io_bpf_filters { + refcount_t refs; /* ref for ->bpf_filters */ + spinlock_t lock; /* protects ->bpf_filters modifications */ + struct io_bpf_filter __rcu **filters; + struct rcu_head rcu_head; +}; + struct io_restriction { DECLARE_BITMAP(register_op, IORING_REGISTER_LAST); DECLARE_BITMAP(sqe_op, IORING_OP_LAST); + struct io_bpf_filters *bpf_filters; u8 sqe_flags_allowed; u8 sqe_flags_required; /* IORING_OP_* restrictions exist */ diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h index b5b23c0d5283..94669b77fee8 100644 --- a/include/uapi/linux/io_uring.h +++ b/include/uapi/linux/io_uring.h @@ -700,6 +700,9 @@ enum io_uring_register_op { /* auxiliary zcrx configuration, see enum zcrx_ctrl_op */ IORING_REGISTER_ZCRX_CTRL = 36, + /* register bpf filtering programs */ + IORING_REGISTER_BPF_FILTER = 37, + /* this goes last */ IORING_REGISTER_LAST, diff --git a/include/uapi/linux/io_uring/bpf_filter.h b/include/uapi/linux/io_uring/bpf_filter.h new file mode 100644 index 000000000000..2d4d0e5743e4 --- /dev/null +++ b/include/uapi/linux/io_uring/bpf_filter.h @@ -0,0 +1,50 @@ +/* SPDX-License-Identifier: (GPL-2.0 WITH Linux-syscall-note) OR MIT */ +/* + * Header file for the io_uring BPF filters. + */ +#ifndef LINUX_IO_URING_BPF_FILTER_H +#define LINUX_IO_URING_BPF_FILTER_H + +#include + +/* + * Struct passed to filters. + */ +struct io_uring_bpf_ctx { + __u64 user_data; + __u8 opcode; + __u8 sqe_flags; + __u8 pdu_size; /* size of aux data for filter */ + __u8 pad[5]; +}; + +enum { + /* + * If set, any currently unset opcode will have a deny filter attached + */ + IO_URING_BPF_FILTER_DENY_REST = 1, +}; + +struct io_uring_bpf_filter { + __u32 opcode; /* io_uring opcode to filter */ + __u32 flags; + __u32 filter_len; /* number of BPF instructions */ + __u32 resv; + __u64 filter_ptr; /* pointer to BPF filter */ + __u64 resv2[5]; +}; + +enum { + IO_URING_BPF_CMD_FILTER = 1, +}; + +struct io_uring_bpf { + __u16 cmd_type; /* IO_URING_BPF_* values */ + __u16 cmd_flags; /* none so far */ + __u32 resv; + union { + struct io_uring_bpf_filter filter; + }; +}; + +#endif diff --git a/io_uring/Kconfig b/io_uring/Kconfig index 4b949c42c0bf..a7ae23cf1035 100644 --- a/io_uring/Kconfig +++ b/io_uring/Kconfig @@ -9,3 +9,8 @@ config IO_URING_ZCRX depends on PAGE_POOL depends on INET depends on NET_RX_BUSY_POLL + +config IO_URING_BPF + def_bool y + depends on BPF + depends on NET diff --git a/io_uring/Makefile b/io_uring/Makefile index bf9eff88427a..931f9156132a 100644 --- a/io_uring/Makefile +++ b/io_uring/Makefile @@ -24,3 +24,4 @@ obj-$(CONFIG_NET_RX_BUSY_POLL) += napi.o obj-$(CONFIG_NET) += net.o cmd_net.o obj-$(CONFIG_PROC_FS) += fdinfo.o obj-$(CONFIG_IO_URING_MOCK_FILE) += mock_file.o +obj-$(CONFIG_IO_URING_BPF) += bpf_filter.o diff --git a/io_uring/bpf_filter.c b/io_uring/bpf_filter.c new file mode 100644 index 000000000000..5207226d72ea --- /dev/null +++ b/io_uring/bpf_filter.c @@ -0,0 +1,321 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * BPF filter support for io_uring. Supports SQE opcodes for now. + */ +#include +#include +#include +#include +#include +#include + +#include "io_uring.h" +#include "bpf_filter.h" +#include "net.h" + +struct io_bpf_filter { + struct bpf_prog *prog; + struct io_bpf_filter *next; +}; + +/* Deny if this is set as the filter */ +static const struct io_bpf_filter dummy_filter; + +static void io_uring_populate_bpf_ctx(struct io_uring_bpf_ctx *bctx, + struct io_kiocb *req) +{ + bctx->opcode = req->opcode; + bctx->sqe_flags = (__force int) req->flags & SQE_VALID_FLAGS; + bctx->user_data = req->cqe.user_data; + /* clear residual, anything from pdu_size and below */ + memset((void *) bctx + offsetof(struct io_uring_bpf_ctx, pdu_size), 0, + sizeof(*bctx) - offsetof(struct io_uring_bpf_ctx, pdu_size)); +} + +/* + * Run registered filters for a given opcode. For filters, a return of 0 denies + * execution of the request, a return of 1 allows it. If any filter for an + * opcode returns 0, filter processing is stopped, and the request is denied. + * This also stops the processing of filters. + * + * __io_uring_run_bpf_filters() returns 0 on success, allow running the + * request, and -EACCES when a request is denied. + */ +int __io_uring_run_bpf_filters(struct io_restriction *res, struct io_kiocb *req) +{ + struct io_bpf_filter *filter; + struct io_uring_bpf_ctx bpf_ctx; + int ret; + + /* Fast check for existence of filters outside of RCU */ + if (!rcu_access_pointer(res->bpf_filters->filters[req->opcode])) + return 0; + + /* + * req->opcode has already been validated to be within the range + * of what we expect, io_init_req() does this. + */ + guard(rcu)(); + filter = rcu_dereference(res->bpf_filters->filters[req->opcode]); + if (!filter) + return 0; + else if (filter == &dummy_filter) + return -EACCES; + + io_uring_populate_bpf_ctx(&bpf_ctx, req); + + /* + * Iterate registered filters. The opcode is allowed IFF all filters + * return 1. If any filter returns denied, opcode will be denied. + */ + do { + if (filter == &dummy_filter) + return -EACCES; + ret = bpf_prog_run(filter->prog, &bpf_ctx); + if (!ret) + return -EACCES; + filter = filter->next; + } while (filter); + + return 0; +} + +static void io_free_bpf_filters(struct rcu_head *head) +{ + struct io_bpf_filter __rcu **filter; + struct io_bpf_filters *filters; + int i; + + filters = container_of(head, struct io_bpf_filters, rcu_head); + scoped_guard(spinlock, &filters->lock) { + filter = filters->filters; + if (!filter) + return; + } + + for (i = 0; i < IORING_OP_LAST; i++) { + struct io_bpf_filter *f; + + rcu_read_lock(); + f = rcu_dereference(filter[i]); + while (f) { + struct io_bpf_filter *next = f->next; + + /* + * Even if stacked, dummy filter will always be last + * as it can only get installed into an empty spot. + */ + if (f == &dummy_filter) + break; + bpf_prog_destroy(f->prog); + kfree(f); + f = next; + } + rcu_read_unlock(); + } + kfree(filters->filters); + kfree(filters); +} + +static void __io_put_bpf_filters(struct io_bpf_filters *filters) +{ + if (refcount_dec_and_test(&filters->refs)) + call_rcu(&filters->rcu_head, io_free_bpf_filters); +} + +void io_put_bpf_filters(struct io_restriction *res) +{ + if (res->bpf_filters) + __io_put_bpf_filters(res->bpf_filters); +} + +static struct io_bpf_filters *io_new_bpf_filters(void) +{ + struct io_bpf_filters *filters __free(kfree) = NULL; + + filters = kzalloc(sizeof(*filters), GFP_KERNEL_ACCOUNT); + if (!filters) + return ERR_PTR(-ENOMEM); + + filters->filters = kcalloc(IORING_OP_LAST, + sizeof(struct io_bpf_filter *), + GFP_KERNEL_ACCOUNT); + if (!filters->filters) + return ERR_PTR(-ENOMEM); + + refcount_set(&filters->refs, 1); + spin_lock_init(&filters->lock); + return no_free_ptr(filters); +} + +/* + * Validate classic BPF filter instructions. Only allow a safe subset of + * operations - no packet data access, just context field loads and basic + * ALU/jump operations. + */ +static int io_uring_check_cbpf_filter(struct sock_filter *filter, + unsigned int flen) +{ + int pc; + + for (pc = 0; pc < flen; pc++) { + struct sock_filter *ftest = &filter[pc]; + u16 code = ftest->code; + u32 k = ftest->k; + + switch (code) { + case BPF_LD | BPF_W | BPF_ABS: + ftest->code = BPF_LDX | BPF_W | BPF_ABS; + /* 32-bit aligned and not out of bounds. */ + if (k >= sizeof(struct io_uring_bpf_ctx) || k & 3) + return -EINVAL; + continue; + case BPF_LD | BPF_W | BPF_LEN: + ftest->code = BPF_LD | BPF_IMM; + ftest->k = sizeof(struct io_uring_bpf_ctx); + continue; + case BPF_LDX | BPF_W | BPF_LEN: + ftest->code = BPF_LDX | BPF_IMM; + ftest->k = sizeof(struct io_uring_bpf_ctx); + continue; + /* Explicitly include allowed calls. */ + case BPF_RET | BPF_K: + case BPF_RET | BPF_A: + case BPF_ALU | BPF_ADD | BPF_K: + case BPF_ALU | BPF_ADD | BPF_X: + case BPF_ALU | BPF_SUB | BPF_K: + case BPF_ALU | BPF_SUB | BPF_X: + case BPF_ALU | BPF_MUL | BPF_K: + case BPF_ALU | BPF_MUL | BPF_X: + case BPF_ALU | BPF_DIV | BPF_K: + case BPF_ALU | BPF_DIV | BPF_X: + case BPF_ALU | BPF_AND | BPF_K: + case BPF_ALU | BPF_AND | BPF_X: + case BPF_ALU | BPF_OR | BPF_K: + case BPF_ALU | BPF_OR | BPF_X: + case BPF_ALU | BPF_XOR | BPF_K: + case BPF_ALU | BPF_XOR | BPF_X: + case BPF_ALU | BPF_LSH | BPF_K: + case BPF_ALU | BPF_LSH | BPF_X: + case BPF_ALU | BPF_RSH | BPF_K: + case BPF_ALU | BPF_RSH | BPF_X: + case BPF_ALU | BPF_NEG: + case BPF_LD | BPF_IMM: + case BPF_LDX | BPF_IMM: + case BPF_MISC | BPF_TAX: + case BPF_MISC | BPF_TXA: + case BPF_LD | BPF_MEM: + case BPF_LDX | BPF_MEM: + case BPF_ST: + case BPF_STX: + case BPF_JMP | BPF_JA: + case BPF_JMP | BPF_JEQ | BPF_K: + case BPF_JMP | BPF_JEQ | BPF_X: + case BPF_JMP | BPF_JGE | BPF_K: + case BPF_JMP | BPF_JGE | BPF_X: + case BPF_JMP | BPF_JGT | BPF_K: + case BPF_JMP | BPF_JGT | BPF_X: + case BPF_JMP | BPF_JSET | BPF_K: + case BPF_JMP | BPF_JSET | BPF_X: + continue; + default: + return -EINVAL; + } + } + return 0; +} + +#define IO_URING_BPF_FILTER_FLAGS IO_URING_BPF_FILTER_DENY_REST + +int io_register_bpf_filter(struct io_restriction *res, + struct io_uring_bpf __user *arg) +{ + struct io_bpf_filter *filter, *old_filter; + struct io_bpf_filters *filters; + struct io_uring_bpf reg; + struct bpf_prog *prog; + struct sock_fprog fprog; + int ret; + + if (copy_from_user(®, arg, sizeof(reg))) + return -EFAULT; + if (reg.cmd_type != IO_URING_BPF_CMD_FILTER) + return -EINVAL; + if (reg.cmd_flags || reg.resv) + return -EINVAL; + + if (reg.filter.opcode >= IORING_OP_LAST) + return -EINVAL; + if (reg.filter.flags & ~IO_URING_BPF_FILTER_FLAGS) + return -EINVAL; + if (reg.filter.resv) + return -EINVAL; + if (!mem_is_zero(reg.filter.resv2, sizeof(reg.filter.resv2))) + return -EINVAL; + if (!reg.filter.filter_len || reg.filter.filter_len > BPF_MAXINSNS) + return -EINVAL; + + fprog.len = reg.filter.filter_len; + fprog.filter = u64_to_user_ptr(reg.filter.filter_ptr); + + ret = bpf_prog_create_from_user(&prog, &fprog, + io_uring_check_cbpf_filter, false); + if (ret) + return ret; + + /* + * No existing filters, allocate set. + */ + filters = res->bpf_filters; + if (!filters) { + filters = io_new_bpf_filters(); + if (IS_ERR(filters)) { + ret = PTR_ERR(filters); + goto err_prog; + } + } + + filter = kzalloc(sizeof(*filter), GFP_KERNEL_ACCOUNT); + if (!filter) { + ret = -ENOMEM; + goto err; + } + filter->prog = prog; + res->bpf_filters = filters; + + /* + * Insert filter - if the current opcode already has a filter + * attached, add to the set. + */ + rcu_read_lock(); + spin_lock_bh(&filters->lock); + old_filter = rcu_dereference(filters->filters[reg.filter.opcode]); + if (old_filter) + filter->next = old_filter; + rcu_assign_pointer(filters->filters[reg.filter.opcode], filter); + + /* + * If IO_URING_BPF_FILTER_DENY_REST is set, fill any unregistered + * opcode with the dummy filter. That will cause them to be denied. + */ + if (reg.filter.flags & IO_URING_BPF_FILTER_DENY_REST) { + for (int i = 0; i < IORING_OP_LAST; i++) { + if (i == reg.filter.opcode) + continue; + old_filter = rcu_dereference(filters->filters[i]); + if (old_filter) + continue; + rcu_assign_pointer(filters->filters[i], &dummy_filter); + } + } + + spin_unlock_bh(&filters->lock); + rcu_read_unlock(); + return 0; +err: + if (filters != res->bpf_filters) + __io_put_bpf_filters(filters); +err_prog: + bpf_prog_destroy(prog); + return ret; +} diff --git a/io_uring/bpf_filter.h b/io_uring/bpf_filter.h new file mode 100644 index 000000000000..27eae9705473 --- /dev/null +++ b/io_uring/bpf_filter.h @@ -0,0 +1,42 @@ +// SPDX-License-Identifier: GPL-2.0 +#ifndef IO_URING_BPF_FILTER_H +#define IO_URING_BPF_FILTER_H + +#include + +#ifdef CONFIG_IO_URING_BPF + +int __io_uring_run_bpf_filters(struct io_restriction *res, struct io_kiocb *req); + +int io_register_bpf_filter(struct io_restriction *res, + struct io_uring_bpf __user *arg); + +void io_put_bpf_filters(struct io_restriction *res); + +static inline int io_uring_run_bpf_filters(struct io_restriction *res, + struct io_kiocb *req) +{ + if (res->bpf_filters) + return __io_uring_run_bpf_filters(res, req); + + return 0; +} + +#else + +static inline int io_register_bpf_filter(struct io_restriction *res, + struct io_uring_bpf __user *arg) +{ + return -EINVAL; +} +static inline int io_uring_run_bpf_filters(struct io_restriction *res, + struct io_kiocb *req) +{ + return 0; +} +static inline void io_put_bpf_filters(struct io_restriction *res) +{ +} +#endif /* CONFIG_IO_URING_BPF */ + +#endif diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index a50459238bee..9b9794dfc27a 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -94,6 +94,7 @@ #include "alloc_cache.h" #include "eventfd.h" #include "wait.h" +#include "bpf_filter.h" #define SQE_COMMON_FLAGS (IOSQE_FIXED_FILE | IOSQE_IO_LINK | \ IOSQE_IO_HARDLINK | IOSQE_ASYNC) @@ -1874,6 +1875,12 @@ static inline int io_submit_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req, if (unlikely(ret)) return io_submit_fail_init(sqe, req, ret); + if (unlikely(ctx->restrictions.bpf_filters)) { + ret = io_uring_run_bpf_filters(&ctx->restrictions, req); + if (ret) + return io_submit_fail_init(sqe, req, ret); + } + trace_io_uring_submit_req(req); /* @@ -2161,6 +2168,7 @@ static __cold void io_ring_ctx_free(struct io_ring_ctx *ctx) percpu_ref_exit(&ctx->refs); free_uid(ctx->user); io_req_caches_free(ctx); + io_put_bpf_filters(&ctx->restrictions); WARN_ON_ONCE(ctx->nr_req_allocated); diff --git a/io_uring/register.c b/io_uring/register.c index 8551f13920dc..30957c2cb5eb 100644 --- a/io_uring/register.c +++ b/io_uring/register.c @@ -33,6 +33,7 @@ #include "memmap.h" #include "zcrx.h" #include "query.h" +#include "bpf_filter.h" #define IORING_MAX_RESTRICTIONS (IORING_RESTRICTION_LAST + \ IORING_REGISTER_LAST + IORING_OP_LAST) @@ -830,6 +831,13 @@ static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode, case IORING_REGISTER_ZCRX_CTRL: ret = io_zcrx_ctrl(ctx, arg, nr_args); break; + case IORING_REGISTER_BPF_FILTER: + ret = -EINVAL; + + if (nr_args != 1) + break; + ret = io_register_bpf_filter(&ctx->restrictions, arg); + break; default: ret = -EINVAL; break; -- cgit v1.2.3 From cff1c26b4223820431129696b45525e5928e6409 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 16 Jan 2026 14:50:05 -0700 Subject: io_uring/net: allow filtering on IORING_OP_SOCKET data Example population method for the BPF based opcode filtering. This exposes the socket family, type, and protocol to a registered BPF filter. This in turn enables the filter to make decisions based on what was passed in to the IORING_OP_SOCKET request type. Signed-off-by: Jens Axboe --- include/uapi/linux/io_uring/bpf_filter.h | 7 +++++++ io_uring/bpf_filter.c | 11 +++++++++++ io_uring/net.c | 9 +++++++++ io_uring/net.h | 6 ++++++ 4 files changed, 33 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/io_uring/bpf_filter.h b/include/uapi/linux/io_uring/bpf_filter.h index 2d4d0e5743e4..4dbc89bbbf10 100644 --- a/include/uapi/linux/io_uring/bpf_filter.h +++ b/include/uapi/linux/io_uring/bpf_filter.h @@ -16,6 +16,13 @@ struct io_uring_bpf_ctx { __u8 sqe_flags; __u8 pdu_size; /* size of aux data for filter */ __u8 pad[5]; + union { + struct { + __u32 family; + __u32 type; + __u32 protocol; + } socket; + }; }; enum { diff --git a/io_uring/bpf_filter.c b/io_uring/bpf_filter.c index 5207226d72ea..889fa915fa54 100644 --- a/io_uring/bpf_filter.c +++ b/io_uring/bpf_filter.c @@ -30,6 +30,17 @@ static void io_uring_populate_bpf_ctx(struct io_uring_bpf_ctx *bctx, /* clear residual, anything from pdu_size and below */ memset((void *) bctx + offsetof(struct io_uring_bpf_ctx, pdu_size), 0, sizeof(*bctx) - offsetof(struct io_uring_bpf_ctx, pdu_size)); + + /* + * Opcodes can provide a handler fo populating more data into bctx, + * for filters to use. + */ + switch (req->opcode) { + case IORING_OP_SOCKET: + bctx->pdu_size = sizeof(bctx->socket); + io_socket_bpf_populate(bctx, req); + break; + } } /* diff --git a/io_uring/net.c b/io_uring/net.c index 519ea055b761..4fcba36bd0bb 100644 --- a/io_uring/net.c +++ b/io_uring/net.c @@ -1699,6 +1699,15 @@ retry: return IOU_COMPLETE; } +void io_socket_bpf_populate(struct io_uring_bpf_ctx *bctx, struct io_kiocb *req) +{ + struct io_socket *sock = io_kiocb_to_cmd(req, struct io_socket); + + bctx->socket.family = sock->domain; + bctx->socket.type = sock->type; + bctx->socket.protocol = sock->protocol; +} + int io_socket_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { struct io_socket *sock = io_kiocb_to_cmd(req, struct io_socket); diff --git a/io_uring/net.h b/io_uring/net.h index 43e5ce5416b7..a862960a3bb9 100644 --- a/io_uring/net.h +++ b/io_uring/net.h @@ -3,6 +3,7 @@ #include #include #include +#include struct io_async_msghdr { #if defined(CONFIG_NET) @@ -44,6 +45,7 @@ int io_accept(struct io_kiocb *req, unsigned int issue_flags); int io_socket_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe); int io_socket(struct io_kiocb *req, unsigned int issue_flags); +void io_socket_bpf_populate(struct io_uring_bpf_ctx *bctx, struct io_kiocb *req); int io_connect_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe); int io_connect(struct io_kiocb *req, unsigned int issue_flags); @@ -64,4 +66,8 @@ void io_netmsg_cache_free(const void *entry); static inline void io_netmsg_cache_free(const void *entry) { } +static inline void io_socket_bpf_populate(struct io_uring_bpf_ctx *bctx, + struct io_kiocb *req) +{ +} #endif -- cgit v1.2.3 From 8768770cf5d76d177fa2200e6957a372e61e06b5 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 19 Jan 2026 15:59:54 -0700 Subject: io_uring/bpf_filter: allow filtering on contents of struct open_how This adds custom filtering for IORING_OP_OPENAT and IORING_OP_OPENAT2, where the open_how flags, mode, and resolve can be checked by filters. Signed-off-by: Jens Axboe --- include/uapi/linux/io_uring/bpf_filter.h | 5 +++++ io_uring/bpf_filter.c | 6 ++++++ io_uring/openclose.c | 9 +++++++++ io_uring/openclose.h | 3 +++ 4 files changed, 23 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/io_uring/bpf_filter.h b/include/uapi/linux/io_uring/bpf_filter.h index 4dbc89bbbf10..220351b81bc0 100644 --- a/include/uapi/linux/io_uring/bpf_filter.h +++ b/include/uapi/linux/io_uring/bpf_filter.h @@ -22,6 +22,11 @@ struct io_uring_bpf_ctx { __u32 type; __u32 protocol; } socket; + struct { + __u64 flags; + __u64 mode; + __u64 resolve; + } open; }; }; diff --git a/io_uring/bpf_filter.c b/io_uring/bpf_filter.c index 889fa915fa54..ff723ec44828 100644 --- a/io_uring/bpf_filter.c +++ b/io_uring/bpf_filter.c @@ -12,6 +12,7 @@ #include "io_uring.h" #include "bpf_filter.h" #include "net.h" +#include "openclose.h" struct io_bpf_filter { struct bpf_prog *prog; @@ -40,6 +41,11 @@ static void io_uring_populate_bpf_ctx(struct io_uring_bpf_ctx *bctx, bctx->pdu_size = sizeof(bctx->socket); io_socket_bpf_populate(bctx, req); break; + case IORING_OP_OPENAT: + case IORING_OP_OPENAT2: + bctx->pdu_size = sizeof(bctx->open); + io_openat_bpf_populate(bctx, req); + break; } } diff --git a/io_uring/openclose.c b/io_uring/openclose.c index 15dde9bd6ff6..31c687adf873 100644 --- a/io_uring/openclose.c +++ b/io_uring/openclose.c @@ -85,6 +85,15 @@ static int __io_openat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe return 0; } +void io_openat_bpf_populate(struct io_uring_bpf_ctx *bctx, struct io_kiocb *req) +{ + struct io_open *open = io_kiocb_to_cmd(req, struct io_open); + + bctx->open.flags = open->how.flags; + bctx->open.mode = open->how.mode; + bctx->open.resolve = open->how.resolve; +} + int io_openat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { struct io_open *open = io_kiocb_to_cmd(req, struct io_open); diff --git a/io_uring/openclose.h b/io_uring/openclose.h index 4ca2a9935abc..566739920658 100644 --- a/io_uring/openclose.h +++ b/io_uring/openclose.h @@ -1,11 +1,14 @@ // SPDX-License-Identifier: GPL-2.0 +#include "bpf_filter.h" + int __io_close_fixed(struct io_ring_ctx *ctx, unsigned int issue_flags, unsigned int offset); int io_openat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe); int io_openat(struct io_kiocb *req, unsigned int issue_flags); void io_open_cleanup(struct io_kiocb *req); +void io_openat_bpf_populate(struct io_uring_bpf_ctx *bctx, struct io_kiocb *req); int io_openat2_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe); int io_openat2(struct io_kiocb *req, unsigned int issue_flags); -- cgit v1.2.3 From cad3337bb6c3a2ba2307d6a9061e752e15681d2b Mon Sep 17 00:00:00 2001 From: Ilpo Järvinen Date: Fri, 19 Dec 2025 19:40:33 +0200 Subject: PCI: Add dword #defines for Bus Number + Secondary Latency Timer MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit uapi/linux/pci_regs.h defines Primary/Secondary/Subordinate Bus Numbers and Secondary Latency Timer (PCIe r7.0, sec. 7.5.1.3) as byte register offsets, but in practice the code may read/write the entire dword. In the lack of #defines to handle the dword fields, the code ends up using literals which are not as easy to read. Add dword field masks for the Bus Number and Secondary Latency Timer fields and use them in probe.c. Signed-off-by: Ilpo Järvinen [bhelgaas: squash new #defines and uses together] Signed-off-by: Bjorn Helgaas Link: https://patch.msgid.link/20251219174036.16738-21-ilpo.jarvinen@linux.intel.com Link: https://patch.msgid.link/20251219174036.16738-22-ilpo.jarvinen@linux.intel.com --- drivers/pci/probe.c | 25 +++++++++++++------------ include/uapi/linux/pci_regs.h | 5 +++++ 2 files changed, 18 insertions(+), 12 deletions(-) (limited to 'include/uapi/linux') diff --git a/drivers/pci/probe.c b/drivers/pci/probe.c index ed4d26833640..53ec1879fb99 100644 --- a/drivers/pci/probe.c +++ b/drivers/pci/probe.c @@ -524,8 +524,8 @@ static void pci_read_bridge_windows(struct pci_dev *bridge) pci_read_config_dword(bridge, PCI_PRIMARY_BUS, &buses); res.flags = IORESOURCE_BUS; - res.start = (buses >> 8) & 0xff; - res.end = (buses >> 16) & 0xff; + res.start = FIELD_GET(PCI_SECONDARY_BUS_MASK, buses); + res.end = FIELD_GET(PCI_SUBORDINATE_BUS_MASK, buses); pci_info(bridge, "PCI bridge to %pR%s\n", &res, bridge->transparent ? " (subtractive decode)" : ""); @@ -1393,9 +1393,9 @@ static int pci_scan_bridge_extend(struct pci_bus *bus, struct pci_dev *dev, pm_runtime_get_sync(&dev->dev); pci_read_config_dword(dev, PCI_PRIMARY_BUS, &buses); - primary = buses & 0xFF; - secondary = (buses >> 8) & 0xFF; - subordinate = (buses >> 16) & 0xFF; + primary = FIELD_GET(PCI_PRIMARY_BUS_MASK, buses); + secondary = FIELD_GET(PCI_SECONDARY_BUS_MASK, buses); + subordinate = FIELD_GET(PCI_SUBORDINATE_BUS_MASK, buses); pci_dbg(dev, "scanning [bus %02x-%02x] behind bridge, pass %d\n", secondary, subordinate, pass); @@ -1476,7 +1476,7 @@ static int pci_scan_bridge_extend(struct pci_bus *bus, struct pci_dev *dev, * ranges. */ pci_write_config_dword(dev, PCI_PRIMARY_BUS, - buses & ~0xffffff); + buses & PCI_SEC_LATENCY_TIMER_MASK); goto out; } @@ -1507,18 +1507,19 @@ static int pci_scan_bridge_extend(struct pci_bus *bus, struct pci_dev *dev, if (available_buses) available_buses--; - buses = (buses & 0xff000000) - | ((unsigned int)(child->primary) << 0) - | ((unsigned int)(child->busn_res.start) << 8) - | ((unsigned int)(child->busn_res.end) << 16); + buses = (buses & PCI_SEC_LATENCY_TIMER_MASK) | + FIELD_PREP(PCI_PRIMARY_BUS_MASK, child->primary) | + FIELD_PREP(PCI_SECONDARY_BUS_MASK, child->busn_res.start) | + FIELD_PREP(PCI_SUBORDINATE_BUS_MASK, child->busn_res.end); /* * yenta.c forces a secondary latency timer of 176. * Copy that behaviour here. */ if (is_cardbus) { - buses &= ~0xff000000; - buses |= CARDBUS_LATENCY_TIMER << 24; + buses &= ~PCI_SEC_LATENCY_TIMER_MASK; + buses |= FIELD_PREP(PCI_SEC_LATENCY_TIMER_MASK, + CARDBUS_LATENCY_TIMER); } /* We need to blast all three values with a single write */ diff --git a/include/uapi/linux/pci_regs.h b/include/uapi/linux/pci_regs.h index 3add74ae2594..8be55ece2a21 100644 --- a/include/uapi/linux/pci_regs.h +++ b/include/uapi/linux/pci_regs.h @@ -132,6 +132,11 @@ #define PCI_SECONDARY_BUS 0x19 /* Secondary bus number */ #define PCI_SUBORDINATE_BUS 0x1a /* Highest bus number behind the bridge */ #define PCI_SEC_LATENCY_TIMER 0x1b /* Latency timer for secondary interface */ +/* Masks for dword-sized processing of Bus Number and Sec Latency Timer fields */ +#define PCI_PRIMARY_BUS_MASK 0x000000ff +#define PCI_SECONDARY_BUS_MASK 0x0000ff00 +#define PCI_SUBORDINATE_BUS_MASK 0x00ff0000 +#define PCI_SEC_LATENCY_TIMER_MASK 0xff000000 #define PCI_IO_BASE 0x1c /* I/O range behind the bridge */ #define PCI_IO_LIMIT 0x1d #define PCI_IO_RANGE_TYPE_MASK 0x0fUL /* I/O bridging type */ -- cgit v1.2.3 From e698127eb7249d6c70fa8f2bdba469c0e54f2e2b Mon Sep 17 00:00:00 2001 From: Jonathan Kim Date: Wed, 23 Jul 2025 10:07:28 -0400 Subject: drm/amdkfd: add extended capabilities to device snapshot Add additional capabilities reporting. Signed-off-by: Jonathan Kim Reviewed-by: James Zhu Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdkfd/kfd_debug.c | 1 + include/uapi/linux/kfd_ioctl.h | 2 ++ 2 files changed, 3 insertions(+) (limited to 'include/uapi/linux') diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_debug.c b/drivers/gpu/drm/amd/amdkfd/kfd_debug.c index 27176b2dc714..8f8a0975f1a7 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_debug.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_debug.c @@ -1108,6 +1108,7 @@ int kfd_dbg_trap_device_snapshot(struct kfd_process *target, device_info.num_xcc = NUM_XCC(pdd->dev->xcc_mask); device_info.capability = topo_dev->node_props.capability; device_info.debug_prop = topo_dev->node_props.debug_prop; + device_info.capability2 = topo_dev->node_props.capability2; if (exception_clear_mask) pdd->exception_status &= ~exception_clear_mask; diff --git a/include/uapi/linux/kfd_ioctl.h b/include/uapi/linux/kfd_ioctl.h index 047bcb1cc078..e72359370857 100644 --- a/include/uapi/linux/kfd_ioctl.h +++ b/include/uapi/linux/kfd_ioctl.h @@ -149,6 +149,8 @@ struct kfd_dbg_device_info_entry { __u32 num_xcc; __u32 capability; __u32 debug_prop; + __u32 capability2; + __u32 pad; }; /* For kfd_ioctl_set_memory_policy_args.default_policy and alternate_policy */ -- cgit v1.2.3 From d8316b837c2ca5f92e781fa1575095c0132ae3c1 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Tue, 6 Jan 2026 13:59:50 -0500 Subject: nfsd: add controls to set the minimum number of threads per pool Add a new "min_threads" variable to the nfsd_net, along with the corresponding netlink interface, to set that value from userland. Pass that value to svc_set_pool_threads() and svc_set_num_threads(). Signed-off-by: Jeff Layton Signed-off-by: Chuck Lever --- Documentation/netlink/specs/nfsd.yaml | 5 +++++ fs/nfsd/netlink.c | 5 +++-- fs/nfsd/netns.h | 6 ++++++ fs/nfsd/nfsctl.c | 6 ++++++ fs/nfsd/nfssvc.c | 4 ++-- fs/nfsd/trace.h | 19 +++++++++++++++++++ include/uapi/linux/nfsd_netlink.h | 1 + 7 files changed, 42 insertions(+), 4 deletions(-) (limited to 'include/uapi/linux') diff --git a/Documentation/netlink/specs/nfsd.yaml b/Documentation/netlink/specs/nfsd.yaml index 100363029e82..badb2fe57c98 100644 --- a/Documentation/netlink/specs/nfsd.yaml +++ b/Documentation/netlink/specs/nfsd.yaml @@ -78,6 +78,9 @@ attribute-sets: - name: scope type: string + - + name: min-threads + type: u32 - name: version attributes: @@ -159,6 +162,7 @@ operations: - gracetime - leasetime - scope + - min-threads - name: threads-get doc: get the number of running threads @@ -170,6 +174,7 @@ operations: - gracetime - leasetime - scope + - min-threads - name: version-set doc: set nfs enabled versions diff --git a/fs/nfsd/netlink.c b/fs/nfsd/netlink.c index ac51a44e1065..887525964451 100644 --- a/fs/nfsd/netlink.c +++ b/fs/nfsd/netlink.c @@ -24,11 +24,12 @@ const struct nla_policy nfsd_version_nl_policy[NFSD_A_VERSION_ENABLED + 1] = { }; /* NFSD_CMD_THREADS_SET - do */ -static const struct nla_policy nfsd_threads_set_nl_policy[NFSD_A_SERVER_SCOPE + 1] = { +static const struct nla_policy nfsd_threads_set_nl_policy[NFSD_A_SERVER_MIN_THREADS + 1] = { [NFSD_A_SERVER_THREADS] = { .type = NLA_U32, }, [NFSD_A_SERVER_GRACETIME] = { .type = NLA_U32, }, [NFSD_A_SERVER_LEASETIME] = { .type = NLA_U32, }, [NFSD_A_SERVER_SCOPE] = { .type = NLA_NUL_STRING, }, + [NFSD_A_SERVER_MIN_THREADS] = { .type = NLA_U32, }, }; /* NFSD_CMD_VERSION_SET - do */ @@ -57,7 +58,7 @@ static const struct genl_split_ops nfsd_nl_ops[] = { .cmd = NFSD_CMD_THREADS_SET, .doit = nfsd_nl_threads_set_doit, .policy = nfsd_threads_set_nl_policy, - .maxattr = NFSD_A_SERVER_SCOPE, + .maxattr = NFSD_A_SERVER_MIN_THREADS, .flags = GENL_ADMIN_PERM | GENL_CMD_CAP_DO, }, { diff --git a/fs/nfsd/netns.h b/fs/nfsd/netns.h index d83c68872c4c..9fa600602658 100644 --- a/fs/nfsd/netns.h +++ b/fs/nfsd/netns.h @@ -129,6 +129,12 @@ struct nfsd_net { seqlock_t writeverf_lock; unsigned char writeverf[8]; + /* + * Minimum number of threads to run per pool. If 0 then the + * min == max requested number of threads. + */ + unsigned int min_threads; + u32 clientid_base; u32 clientid_counter; u32 clverifier_counter; diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c index 084fc517e9e1..7a58e54760be 100644 --- a/fs/nfsd/nfsctl.c +++ b/fs/nfsd/nfsctl.c @@ -1642,6 +1642,10 @@ int nfsd_nl_threads_set_doit(struct sk_buff *skb, struct genl_info *info) scope = nla_data(attr); } + attr = info->attrs[NFSD_A_SERVER_MIN_THREADS]; + if (attr) + nn->min_threads = nla_get_u32(attr); + ret = nfsd_svc(nrpools, nthreads, net, get_current_cred(), scope); if (ret > 0) ret = 0; @@ -1681,6 +1685,8 @@ int nfsd_nl_threads_get_doit(struct sk_buff *skb, struct genl_info *info) nn->nfsd4_grace) || nla_put_u32(skb, NFSD_A_SERVER_LEASETIME, nn->nfsd4_lease) || + nla_put_u32(skb, NFSD_A_SERVER_MIN_THREADS, + nn->min_threads) || nla_put_string(skb, NFSD_A_SERVER_SCOPE, nn->nfsd_name); if (err) diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c index 1e2570e3c754..0887ee601d3c 100644 --- a/fs/nfsd/nfssvc.c +++ b/fs/nfsd/nfssvc.c @@ -690,7 +690,7 @@ int nfsd_set_nrthreads(int n, int *nthreads, struct net *net) /* Special case: When n == 1, distribute threads equally among pools. */ if (n == 1) - return svc_set_num_threads(nn->nfsd_serv, 0, nthreads[0]); + return svc_set_num_threads(nn->nfsd_serv, nn->min_threads, nthreads[0]); if (n > nn->nfsd_serv->sv_nrpools) n = nn->nfsd_serv->sv_nrpools; @@ -718,7 +718,7 @@ int nfsd_set_nrthreads(int n, int *nthreads, struct net *net) for (i = 0; i < n; i++) { err = svc_set_pool_threads(nn->nfsd_serv, &nn->nfsd_serv->sv_pools[i], - 0, nthreads[i]); + nn->min_threads, nthreads[i]); if (err) goto out; } diff --git a/fs/nfsd/trace.h b/fs/nfsd/trace.h index 8885fd9bead9..d1d0b0dd0545 100644 --- a/fs/nfsd/trace.h +++ b/fs/nfsd/trace.h @@ -2164,6 +2164,25 @@ TRACE_EVENT(nfsd_ctl_maxblksize, ) ); +TRACE_EVENT(nfsd_ctl_minthreads, + TP_PROTO( + const struct net *net, + int minthreads + ), + TP_ARGS(net, minthreads), + TP_STRUCT__entry( + __field(unsigned int, netns_ino) + __field(int, minthreads) + ), + TP_fast_assign( + __entry->netns_ino = net->ns.inum; + __entry->minthreads = minthreads + ), + TP_printk("minthreads=%d", + __entry->minthreads + ) +); + TRACE_EVENT(nfsd_ctl_time, TP_PROTO( const struct net *net, diff --git a/include/uapi/linux/nfsd_netlink.h b/include/uapi/linux/nfsd_netlink.h index e157e2009ea8..e9efbc9e63d8 100644 --- a/include/uapi/linux/nfsd_netlink.h +++ b/include/uapi/linux/nfsd_netlink.h @@ -35,6 +35,7 @@ enum { NFSD_A_SERVER_GRACETIME, NFSD_A_SERVER_LEASETIME, NFSD_A_SERVER_SCOPE, + NFSD_A_SERVER_MIN_THREADS, __NFSD_A_SERVER_MAX, NFSD_A_SERVER_MAX = (__NFSD_A_SERVER_MAX - 1) -- cgit v1.2.3 From a006ed4ecd4905b69402980ad7d4e5f31bf44953 Mon Sep 17 00:00:00 2001 From: Eugenio Pérez Date: Mon, 19 Jan 2026 15:32:55 +0100 Subject: vduse: add v1 API definition MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This allows the kernel to detect whether the userspace VDUSE device supports the VQ group and ASID features. VDUSE devices that don't set the V1 API will not receive the new messages, and vdpa device will be created with only one vq group and asid. The next patches implement the new feature incrementally, only enabling the VDUSE device to set the V1 API version by the end of the series. Acked-by: Jason Wang Reviewed-by: Xie Yongji Signed-off-by: Eugenio Pérez Signed-off-by: Michael S. Tsirkin Message-Id: <20260119143306.1818855-3-eperezma@redhat.com> --- include/uapi/linux/vduse.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/vduse.h b/include/uapi/linux/vduse.h index 10ad71aa00d6..ccb92a1efce0 100644 --- a/include/uapi/linux/vduse.h +++ b/include/uapi/linux/vduse.h @@ -10,6 +10,10 @@ #define VDUSE_API_VERSION 0 +/* VQ groups and ASID support */ + +#define VDUSE_API_VERSION_1 1 + /* * Get the version of VDUSE API that kernel supported (VDUSE_API_VERSION). * This is used for future extension. -- cgit v1.2.3 From 9350a09afd086771b0612c7b7c9583e8a1568135 Mon Sep 17 00:00:00 2001 From: Eugenio Pérez Date: Mon, 19 Jan 2026 15:32:56 +0100 Subject: vduse: add vq group support MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This allows separate the different virtqueues in groups that shares the same address space. Asking the VDUSE device for the groups of the vq at the beginning as they're needed for the DMA API. Allocating 3 vq groups as net is the device that need the most groups: * Dataplane (guest passthrough) * CVQ * Shadowed vrings. Future versions of the series can include dynamic allocation of the groups array so VDUSE can declare more groups. Acked-by: Jason Wang Reviewed-by: Xie Yongji Signed-off-by: Eugenio Pérez Signed-off-by: Michael S. Tsirkin Message-Id: <20260119143306.1818855-4-eperezma@redhat.com> --- drivers/vdpa/vdpa_user/vduse_dev.c | 47 ++++++++++++++++++++++++++++++++++---- include/uapi/linux/vduse.h | 12 +++++++--- 2 files changed, 51 insertions(+), 8 deletions(-) (limited to 'include/uapi/linux') diff --git a/drivers/vdpa/vdpa_user/vduse_dev.c b/drivers/vdpa/vdpa_user/vduse_dev.c index ae357d014564..5bffc25a266e 100644 --- a/drivers/vdpa/vdpa_user/vduse_dev.c +++ b/drivers/vdpa/vdpa_user/vduse_dev.c @@ -39,6 +39,7 @@ #define DRV_LICENSE "GPL v2" #define VDUSE_DEV_MAX (1U << MINORBITS) +#define VDUSE_DEV_MAX_GROUPS 0xffff #define VDUSE_MAX_BOUNCE_SIZE (1024 * 1024 * 1024) #define VDUSE_MIN_BOUNCE_SIZE (1024 * 1024) #define VDUSE_BOUNCE_SIZE (64 * 1024 * 1024) @@ -58,6 +59,7 @@ struct vduse_virtqueue { struct vdpa_vq_state state; bool ready; bool kicked; + u32 group; spinlock_t kick_lock; spinlock_t irq_lock; struct eventfd_ctx *kickfd; @@ -114,6 +116,7 @@ struct vduse_dev { u8 status; u32 vq_num; u32 vq_align; + u32 ngroups; struct vduse_umem *umem; struct mutex mem_lock; unsigned int bounce_size; @@ -592,6 +595,16 @@ static int vduse_vdpa_set_vq_state(struct vdpa_device *vdpa, u16 idx, return 0; } +static u32 vduse_get_vq_group(struct vdpa_device *vdpa, u16 idx) +{ + struct vduse_dev *dev = vdpa_to_vduse(vdpa); + + if (dev->api_version < VDUSE_API_VERSION_1) + return 0; + + return dev->vqs[idx]->group; +} + static int vduse_vdpa_get_vq_state(struct vdpa_device *vdpa, u16 idx, struct vdpa_vq_state *state) { @@ -789,6 +802,7 @@ static const struct vdpa_config_ops vduse_vdpa_config_ops = { .set_vq_cb = vduse_vdpa_set_vq_cb, .set_vq_num = vduse_vdpa_set_vq_num, .get_vq_size = vduse_vdpa_get_vq_size, + .get_vq_group = vduse_get_vq_group, .set_vq_ready = vduse_vdpa_set_vq_ready, .get_vq_ready = vduse_vdpa_get_vq_ready, .set_vq_state = vduse_vdpa_set_vq_state, @@ -1252,12 +1266,24 @@ static long vduse_dev_ioctl(struct file *file, unsigned int cmd, if (config.index >= dev->vq_num) break; - if (!is_mem_zero((const char *)config.reserved, - sizeof(config.reserved))) + if (dev->api_version < VDUSE_API_VERSION_1) { + if (config.group) + break; + } else { + if (config.group >= dev->ngroups) + break; + if (dev->status & VIRTIO_CONFIG_S_DRIVER_OK) + break; + } + + if (config.reserved1 || + !is_mem_zero((const char *)config.reserved2, + sizeof(config.reserved2))) break; index = array_index_nospec(config.index, dev->vq_num); dev->vqs[index]->num_max = config.max_size; + dev->vqs[index]->group = config.group; ret = 0; break; } @@ -1737,12 +1763,20 @@ static bool features_is_valid(struct vduse_dev_config *config) return true; } -static bool vduse_validate_config(struct vduse_dev_config *config) +static bool vduse_validate_config(struct vduse_dev_config *config, + u64 api_version) { if (!is_mem_zero((const char *)config->reserved, sizeof(config->reserved))) return false; + if (api_version < VDUSE_API_VERSION_1 && config->ngroups) + return false; + + if (api_version >= VDUSE_API_VERSION_1 && + (!config->ngroups || config->ngroups > VDUSE_DEV_MAX_GROUPS)) + return false; + if (config->vq_align > PAGE_SIZE) return false; @@ -1858,6 +1892,9 @@ static int vduse_create_dev(struct vduse_dev_config *config, dev->device_features = config->features; dev->device_id = config->device_id; dev->vendor_id = config->vendor_id; + dev->ngroups = (dev->api_version < VDUSE_API_VERSION_1) + ? 1 + : config->ngroups; dev->name = kstrdup(config->name, GFP_KERNEL); if (!dev->name) goto err_str; @@ -1936,7 +1973,7 @@ static long vduse_ioctl(struct file *file, unsigned int cmd, break; ret = -EINVAL; - if (vduse_validate_config(&config) == false) + if (!vduse_validate_config(&config, control->api_version)) break; buf = vmemdup_user(argp + size, config.config_size); @@ -2017,7 +2054,7 @@ static int vduse_dev_init_vdpa(struct vduse_dev *dev, const char *name) vdev = vdpa_alloc_device(struct vduse_vdpa, vdpa, dev->dev, &vduse_vdpa_config_ops, &vduse_map_ops, - 1, 1, name, true); + dev->ngroups, 1, name, true); if (IS_ERR(vdev)) return PTR_ERR(vdev); diff --git a/include/uapi/linux/vduse.h b/include/uapi/linux/vduse.h index ccb92a1efce0..a3d51cf6df3a 100644 --- a/include/uapi/linux/vduse.h +++ b/include/uapi/linux/vduse.h @@ -31,6 +31,7 @@ * @features: virtio features * @vq_num: the number of virtqueues * @vq_align: the allocation alignment of virtqueue's metadata + * @ngroups: number of vq groups that VDUSE device declares * @reserved: for future use, needs to be initialized to zero * @config_size: the size of the configuration space * @config: the buffer of the configuration space @@ -45,7 +46,8 @@ struct vduse_dev_config { __u64 features; __u32 vq_num; __u32 vq_align; - __u32 reserved[13]; + __u32 ngroups; /* if VDUSE_API_VERSION >= 1 */ + __u32 reserved[12]; __u32 config_size; __u8 config[]; }; @@ -122,14 +124,18 @@ struct vduse_config_data { * struct vduse_vq_config - basic configuration of a virtqueue * @index: virtqueue index * @max_size: the max size of virtqueue - * @reserved: for future use, needs to be initialized to zero + * @reserved1: for future use, needs to be initialized to zero + * @group: virtqueue group + * @reserved2: for future use, needs to be initialized to zero * * Structure used by VDUSE_VQ_SETUP ioctl to setup a virtqueue. */ struct vduse_vq_config { __u32 index; __u16 max_size; - __u16 reserved[13]; + __u16 reserved1; + __u32 group; + __u16 reserved2[10]; }; /* -- cgit v1.2.3 From 079212f6877e5d07308c8998a8fbc7539ca3f8f3 Mon Sep 17 00:00:00 2001 From: Eugenio Pérez Date: Mon, 19 Jan 2026 15:33:04 +0100 Subject: vduse: add vq group asid support MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add support for assigning Address Space Identifiers (ASIDs) to each VQ group. This enables mapping each group into a distinct memory space. The vq group to ASID association is protected by a rwlock now. But the mutex domain_lock keeps protecting the domains of all ASIDs, as some operations like the one related with the bounce buffer size still requires to lock all the ASIDs. Acked-by: Jason Wang Signed-off-by: Eugenio Pérez Signed-off-by: Michael S. Tsirkin Message-Id: <20260119143306.1818855-12-eperezma@redhat.com> --- drivers/vdpa/vdpa_user/vduse_dev.c | 385 ++++++++++++++++++++++++------------- include/uapi/linux/vduse.h | 66 ++++++- 2 files changed, 315 insertions(+), 136 deletions(-) (limited to 'include/uapi/linux') diff --git a/drivers/vdpa/vdpa_user/vduse_dev.c b/drivers/vdpa/vdpa_user/vduse_dev.c index d658f3e1cebf..2727c0c26003 100644 --- a/drivers/vdpa/vdpa_user/vduse_dev.c +++ b/drivers/vdpa/vdpa_user/vduse_dev.c @@ -9,6 +9,7 @@ */ #include "linux/virtio_net.h" +#include #include #include #include @@ -41,6 +42,7 @@ #define VDUSE_DEV_MAX (1U << MINORBITS) #define VDUSE_DEV_MAX_GROUPS 0xffff +#define VDUSE_DEV_MAX_AS 0xffff #define VDUSE_MAX_BOUNCE_SIZE (1024 * 1024 * 1024) #define VDUSE_MIN_BOUNCE_SIZE (1024 * 1024) #define VDUSE_BOUNCE_SIZE (64 * 1024 * 1024) @@ -86,7 +88,15 @@ struct vduse_umem { struct mm_struct *mm; }; +struct vduse_as { + struct vduse_iova_domain *domain; + struct vduse_umem *umem; + struct mutex mem_lock; +}; + struct vduse_vq_group { + rwlock_t as_lock; + struct vduse_as *as; /* Protected by as_lock */ struct vduse_dev *dev; }; @@ -94,7 +104,7 @@ struct vduse_dev { struct vduse_vdpa *vdev; struct device *dev; struct vduse_virtqueue **vqs; - struct vduse_iova_domain *domain; + struct vduse_as *as; char *name; struct mutex lock; spinlock_t msg_lock; @@ -122,9 +132,8 @@ struct vduse_dev { u32 vq_num; u32 vq_align; u32 ngroups; - struct vduse_umem *umem; + u32 nas; struct vduse_vq_group *groups; - struct mutex mem_lock; unsigned int bounce_size; struct mutex domain_lock; }; @@ -314,7 +323,7 @@ static int vduse_dev_set_status(struct vduse_dev *dev, u8 status) return vduse_dev_msg_sync(dev, &msg); } -static int vduse_dev_update_iotlb(struct vduse_dev *dev, +static int vduse_dev_update_iotlb(struct vduse_dev *dev, u32 asid, u64 start, u64 last) { struct vduse_dev_msg msg = { 0 }; @@ -323,8 +332,14 @@ static int vduse_dev_update_iotlb(struct vduse_dev *dev, return -EINVAL; msg.req.type = VDUSE_UPDATE_IOTLB; - msg.req.iova.start = start; - msg.req.iova.last = last; + if (dev->api_version < VDUSE_API_VERSION_1) { + msg.req.iova.start = start; + msg.req.iova.last = last; + } else { + msg.req.iova_v2.start = start; + msg.req.iova_v2.last = last; + msg.req.iova_v2.asid = asid; + } return vduse_dev_msg_sync(dev, &msg); } @@ -439,11 +454,14 @@ static __poll_t vduse_dev_poll(struct file *file, poll_table *wait) static void vduse_dev_reset(struct vduse_dev *dev) { int i; - struct vduse_iova_domain *domain = dev->domain; /* The coherent mappings are handled in vduse_dev_free_coherent() */ - if (domain && domain->bounce_map) - vduse_domain_reset_bounce_map(domain); + for (i = 0; i < dev->nas; i++) { + struct vduse_iova_domain *domain = dev->as[i].domain; + + if (domain && domain->bounce_map) + vduse_domain_reset_bounce_map(domain); + } down_write(&dev->rwsem); @@ -622,6 +640,42 @@ static union virtio_map vduse_get_vq_map(struct vdpa_device *vdpa, u16 idx) return ret; } +DEFINE_GUARD(vq_group_as_read_lock, struct vduse_vq_group *, + if (_T->dev->nas > 1) + read_lock(&_T->as_lock), + if (_T->dev->nas > 1) + read_unlock(&_T->as_lock)) + +DEFINE_GUARD(vq_group_as_write_lock, struct vduse_vq_group *, + if (_T->dev->nas > 1) + write_lock(&_T->as_lock), + if (_T->dev->nas > 1) + write_unlock(&_T->as_lock)) + +static int vduse_set_group_asid(struct vdpa_device *vdpa, unsigned int group, + unsigned int asid) +{ + struct vduse_dev *dev = vdpa_to_vduse(vdpa); + struct vduse_dev_msg msg = { 0 }; + int r; + + if (dev->api_version < VDUSE_API_VERSION_1) + return -EINVAL; + + msg.req.type = VDUSE_SET_VQ_GROUP_ASID; + msg.req.vq_group_asid.group = group; + msg.req.vq_group_asid.asid = asid; + + r = vduse_dev_msg_sync(dev, &msg); + if (r < 0) + return r; + + guard(vq_group_as_write_lock)(&dev->groups[group]); + dev->groups[group].as = &dev->as[asid]; + + return 0; +} + static int vduse_vdpa_get_vq_state(struct vdpa_device *vdpa, u16 idx, struct vdpa_vq_state *state) { @@ -793,13 +847,13 @@ static int vduse_vdpa_set_map(struct vdpa_device *vdpa, struct vduse_dev *dev = vdpa_to_vduse(vdpa); int ret; - ret = vduse_domain_set_map(dev->domain, iotlb); + ret = vduse_domain_set_map(dev->as[asid].domain, iotlb); if (ret) return ret; - ret = vduse_dev_update_iotlb(dev, 0ULL, ULLONG_MAX); + ret = vduse_dev_update_iotlb(dev, asid, 0ULL, ULLONG_MAX); if (ret) { - vduse_domain_clear_map(dev->domain, iotlb); + vduse_domain_clear_map(dev->as[asid].domain, iotlb); return ret; } @@ -842,6 +896,7 @@ static const struct vdpa_config_ops vduse_vdpa_config_ops = { .get_vq_affinity = vduse_vdpa_get_vq_affinity, .reset = vduse_vdpa_reset, .set_map = vduse_vdpa_set_map, + .set_group_asid = vduse_set_group_asid, .get_vq_map = vduse_get_vq_map, .free = vduse_vdpa_free, }; @@ -850,15 +905,13 @@ static void vduse_dev_sync_single_for_device(union virtio_map token, dma_addr_t dma_addr, size_t size, enum dma_data_direction dir) { - struct vduse_dev *vdev; struct vduse_iova_domain *domain; if (!token.group) return; - vdev = token.group->dev; - domain = vdev->domain; - + guard(vq_group_as_read_lock)(token.group); + domain = token.group->as->domain; vduse_domain_sync_single_for_device(domain, dma_addr, size, dir); } @@ -866,15 +919,13 @@ static void vduse_dev_sync_single_for_cpu(union virtio_map token, dma_addr_t dma_addr, size_t size, enum dma_data_direction dir) { - struct vduse_dev *vdev; struct vduse_iova_domain *domain; if (!token.group) return; - vdev = token.group->dev; - domain = vdev->domain; - + guard(vq_group_as_read_lock)(token.group); + domain = token.group->as->domain; vduse_domain_sync_single_for_cpu(domain, dma_addr, size, dir); } @@ -883,15 +934,13 @@ static dma_addr_t vduse_dev_map_page(union virtio_map token, struct page *page, enum dma_data_direction dir, unsigned long attrs) { - struct vduse_dev *vdev; struct vduse_iova_domain *domain; if (!token.group) return DMA_MAPPING_ERROR; - vdev = token.group->dev; - domain = vdev->domain; - + guard(vq_group_as_read_lock)(token.group); + domain = token.group->as->domain; return vduse_domain_map_page(domain, page, offset, size, dir, attrs); } @@ -899,23 +948,19 @@ static void vduse_dev_unmap_page(union virtio_map token, dma_addr_t dma_addr, size_t size, enum dma_data_direction dir, unsigned long attrs) { - struct vduse_dev *vdev; struct vduse_iova_domain *domain; if (!token.group) return; - vdev = token.group->dev; - domain = vdev->domain; - - return vduse_domain_unmap_page(domain, dma_addr, size, dir, attrs); + guard(vq_group_as_read_lock)(token.group); + domain = token.group->as->domain; + vduse_domain_unmap_page(domain, dma_addr, size, dir, attrs); } static void *vduse_dev_alloc_coherent(union virtio_map token, size_t size, dma_addr_t *dma_addr, gfp_t flag) { - struct vduse_dev *vdev; - struct vduse_iova_domain *domain; void *addr; *dma_addr = DMA_MAPPING_ERROR; @@ -926,11 +971,15 @@ static void *vduse_dev_alloc_coherent(union virtio_map token, size_t size, if (!addr) return NULL; - vdev = token.group->dev; - domain = vdev->domain; - *dma_addr = vduse_domain_alloc_coherent(domain, size, addr); - if (*dma_addr == DMA_MAPPING_ERROR) - goto err; + { + struct vduse_iova_domain *domain; + + guard(vq_group_as_read_lock)(token.group); + domain = token.group->as->domain; + *dma_addr = vduse_domain_alloc_coherent(domain, size, addr); + if (*dma_addr == DMA_MAPPING_ERROR) + goto err; + } return addr; @@ -943,31 +992,27 @@ static void vduse_dev_free_coherent(union virtio_map token, size_t size, void *vaddr, dma_addr_t dma_addr, unsigned long attrs) { - struct vduse_dev *vdev; - struct vduse_iova_domain *domain; - if (!token.group) return; - vdev = token.group->dev; - domain = vdev->domain; + { + struct vduse_iova_domain *domain; + + guard(vq_group_as_read_lock)(token.group); + domain = token.group->as->domain; + vduse_domain_free_coherent(domain, size, dma_addr, attrs); + } - vduse_domain_free_coherent(domain, size, dma_addr, attrs); free_pages_exact(vaddr, size); } static bool vduse_dev_need_sync(union virtio_map token, dma_addr_t dma_addr) { - struct vduse_dev *vdev; - struct vduse_iova_domain *domain; - if (!token.group) return false; - vdev = token.group->dev; - domain = vdev->domain; - - return dma_addr < domain->bounce_size; + guard(vq_group_as_read_lock)(token.group); + return dma_addr < token.group->as->domain->bounce_size; } static int vduse_dev_mapping_error(union virtio_map token, dma_addr_t dma_addr) @@ -979,16 +1024,11 @@ static int vduse_dev_mapping_error(union virtio_map token, dma_addr_t dma_addr) static size_t vduse_dev_max_mapping_size(union virtio_map token) { - struct vduse_dev *vdev; - struct vduse_iova_domain *domain; - if (!token.group) return 0; - vdev = token.group->dev; - domain = vdev->domain; - - return domain->bounce_size; + guard(vq_group_as_read_lock)(token.group); + return token.group->as->domain->bounce_size; } static const struct virtio_map_ops vduse_map_ops = { @@ -1128,39 +1168,40 @@ unlock: return ret; } -static int vduse_dev_dereg_umem(struct vduse_dev *dev, +static int vduse_dev_dereg_umem(struct vduse_dev *dev, u32 asid, u64 iova, u64 size) { int ret; - mutex_lock(&dev->mem_lock); + mutex_lock(&dev->as[asid].mem_lock); ret = -ENOENT; - if (!dev->umem) + if (!dev->as[asid].umem) goto unlock; ret = -EINVAL; - if (!dev->domain) + if (!dev->as[asid].domain) goto unlock; - if (dev->umem->iova != iova || size != dev->domain->bounce_size) + if (dev->as[asid].umem->iova != iova || + size != dev->as[asid].domain->bounce_size) goto unlock; - vduse_domain_remove_user_bounce_pages(dev->domain); - unpin_user_pages_dirty_lock(dev->umem->pages, - dev->umem->npages, true); - atomic64_sub(dev->umem->npages, &dev->umem->mm->pinned_vm); - mmdrop(dev->umem->mm); - vfree(dev->umem->pages); - kfree(dev->umem); - dev->umem = NULL; + vduse_domain_remove_user_bounce_pages(dev->as[asid].domain); + unpin_user_pages_dirty_lock(dev->as[asid].umem->pages, + dev->as[asid].umem->npages, true); + atomic64_sub(dev->as[asid].umem->npages, &dev->as[asid].umem->mm->pinned_vm); + mmdrop(dev->as[asid].umem->mm); + vfree(dev->as[asid].umem->pages); + kfree(dev->as[asid].umem); + dev->as[asid].umem = NULL; ret = 0; unlock: - mutex_unlock(&dev->mem_lock); + mutex_unlock(&dev->as[asid].mem_lock); return ret; } static int vduse_dev_reg_umem(struct vduse_dev *dev, - u64 iova, u64 uaddr, u64 size) + u32 asid, u64 iova, u64 uaddr, u64 size) { struct page **page_list = NULL; struct vduse_umem *umem = NULL; @@ -1168,14 +1209,14 @@ static int vduse_dev_reg_umem(struct vduse_dev *dev, unsigned long npages, lock_limit; int ret; - if (!dev->domain || !dev->domain->bounce_map || - size != dev->domain->bounce_size || + if (!dev->as[asid].domain || !dev->as[asid].domain->bounce_map || + size != dev->as[asid].domain->bounce_size || iova != 0 || uaddr & ~PAGE_MASK) return -EINVAL; - mutex_lock(&dev->mem_lock); + mutex_lock(&dev->as[asid].mem_lock); ret = -EEXIST; - if (dev->umem) + if (dev->as[asid].umem) goto unlock; ret = -ENOMEM; @@ -1199,7 +1240,7 @@ static int vduse_dev_reg_umem(struct vduse_dev *dev, goto out; } - ret = vduse_domain_add_user_bounce_pages(dev->domain, + ret = vduse_domain_add_user_bounce_pages(dev->as[asid].domain, page_list, pinned); if (ret) goto out; @@ -1212,7 +1253,7 @@ static int vduse_dev_reg_umem(struct vduse_dev *dev, umem->mm = current->mm; mmgrab(current->mm); - dev->umem = umem; + dev->as[asid].umem = umem; out: if (ret && pinned > 0) unpin_user_pages(page_list, pinned); @@ -1223,7 +1264,7 @@ unlock: vfree(page_list); kfree(umem); } - mutex_unlock(&dev->mem_lock); + mutex_unlock(&dev->as[asid].mem_lock); return ret; } @@ -1244,44 +1285,47 @@ static void vduse_vq_update_effective_cpu(struct vduse_virtqueue *vq) } static int vduse_dev_iotlb_entry(struct vduse_dev *dev, - struct vduse_iotlb_entry *entry, + struct vduse_iotlb_entry_v2 *entry, struct file **f, uint64_t *capability) { + u32 asid; int r = -EINVAL; struct vhost_iotlb_map *map; - if (entry->start > entry->last) + if (entry->v1.start > entry->v1.last || entry->asid >= dev->nas) return -EINVAL; + asid = array_index_nospec(entry->asid, dev->nas); mutex_lock(&dev->domain_lock); - if (!dev->domain) + + if (!dev->as[asid].domain) goto out; - spin_lock(&dev->domain->iotlb_lock); - map = vhost_iotlb_itree_first(dev->domain->iotlb, entry->start, - entry->last); + spin_lock(&dev->as[asid].domain->iotlb_lock); + map = vhost_iotlb_itree_first(dev->as[asid].domain->iotlb, + entry->v1.start, entry->v1.last); if (map) { if (f) { const struct vdpa_map_file *map_file; map_file = (struct vdpa_map_file *)map->opaque; - entry->offset = map_file->offset; + entry->v1.offset = map_file->offset; *f = get_file(map_file->file); } - entry->start = map->start; - entry->last = map->last; - entry->perm = map->perm; + entry->v1.start = map->start; + entry->v1.last = map->last; + entry->v1.perm = map->perm; if (capability) { *capability = 0; - if (dev->domain->bounce_map && map->start == 0 && - map->last == dev->domain->bounce_size - 1) + if (dev->as[asid].domain->bounce_map && map->start == 0 && + map->last == dev->as[asid].domain->bounce_size - 1) *capability |= VDUSE_IOVA_CAP_UMEM; } r = 0; } - spin_unlock(&dev->domain->iotlb_lock); + spin_unlock(&dev->as[asid].domain->iotlb_lock); out: mutex_unlock(&dev->domain_lock); @@ -1299,12 +1343,29 @@ static long vduse_dev_ioctl(struct file *file, unsigned int cmd, return -EPERM; switch (cmd) { - case VDUSE_IOTLB_GET_FD: { - struct vduse_iotlb_entry entry; + case VDUSE_IOTLB_GET_FD: + case VDUSE_IOTLB_GET_FD2: { + struct vduse_iotlb_entry_v2 entry = {0}; struct file *f = NULL; + ret = -ENOIOCTLCMD; + if (dev->api_version < VDUSE_API_VERSION_1 && + cmd == VDUSE_IOTLB_GET_FD2) + break; + ret = -EFAULT; - if (copy_from_user(&entry, argp, sizeof(entry))) + if (cmd == VDUSE_IOTLB_GET_FD2) { + if (copy_from_user(&entry, argp, sizeof(entry))) + break; + } else { + if (copy_from_user(&entry.v1, argp, + sizeof(entry.v1))) + break; + } + + ret = -EINVAL; + if (!is_mem_zero((const char *)entry.reserved, + sizeof(entry.reserved))) break; ret = vduse_dev_iotlb_entry(dev, &entry, &f, NULL); @@ -1315,12 +1376,19 @@ static long vduse_dev_ioctl(struct file *file, unsigned int cmd, if (!f) break; - ret = -EFAULT; - if (copy_to_user(argp, &entry, sizeof(entry))) { + if (cmd == VDUSE_IOTLB_GET_FD2) + ret = copy_to_user(argp, &entry, + sizeof(entry)); + else + ret = copy_to_user(argp, &entry.v1, + sizeof(entry.v1)); + + if (ret) { + ret = -EFAULT; fput(f); break; } - ret = receive_fd(f, NULL, perm_to_file_flags(entry.perm)); + ret = receive_fd(f, NULL, perm_to_file_flags(entry.v1.perm)); fput(f); break; } @@ -1465,6 +1533,7 @@ static long vduse_dev_ioctl(struct file *file, unsigned int cmd, } case VDUSE_IOTLB_REG_UMEM: { struct vduse_iova_umem umem; + u32 asid; ret = -EFAULT; if (copy_from_user(&umem, argp, sizeof(umem))) @@ -1472,17 +1541,21 @@ static long vduse_dev_ioctl(struct file *file, unsigned int cmd, ret = -EINVAL; if (!is_mem_zero((const char *)umem.reserved, - sizeof(umem.reserved))) + sizeof(umem.reserved)) || + (dev->api_version < VDUSE_API_VERSION_1 && + umem.asid != 0) || umem.asid >= dev->nas) break; mutex_lock(&dev->domain_lock); - ret = vduse_dev_reg_umem(dev, umem.iova, + asid = array_index_nospec(umem.asid, dev->nas); + ret = vduse_dev_reg_umem(dev, asid, umem.iova, umem.uaddr, umem.size); mutex_unlock(&dev->domain_lock); break; } case VDUSE_IOTLB_DEREG_UMEM: { struct vduse_iova_umem umem; + u32 asid; ret = -EFAULT; if (copy_from_user(&umem, argp, sizeof(umem))) @@ -1490,17 +1563,22 @@ static long vduse_dev_ioctl(struct file *file, unsigned int cmd, ret = -EINVAL; if (!is_mem_zero((const char *)umem.reserved, - sizeof(umem.reserved))) + sizeof(umem.reserved)) || + (dev->api_version < VDUSE_API_VERSION_1 && + umem.asid != 0) || + umem.asid >= dev->nas) break; + mutex_lock(&dev->domain_lock); - ret = vduse_dev_dereg_umem(dev, umem.iova, + asid = array_index_nospec(umem.asid, dev->nas); + ret = vduse_dev_dereg_umem(dev, asid, umem.iova, umem.size); mutex_unlock(&dev->domain_lock); break; } case VDUSE_IOTLB_GET_INFO: { struct vduse_iova_info info; - struct vduse_iotlb_entry entry; + struct vduse_iotlb_entry_v2 entry; ret = -EFAULT; if (copy_from_user(&info, argp, sizeof(info))) @@ -1510,15 +1588,23 @@ static long vduse_dev_ioctl(struct file *file, unsigned int cmd, sizeof(info.reserved))) break; - entry.start = info.start; - entry.last = info.last; + if (dev->api_version < VDUSE_API_VERSION_1) { + if (info.asid) + break; + } else if (info.asid >= dev->nas) + break; + + entry.v1.start = info.start; + entry.v1.last = info.last; + entry.asid = info.asid; ret = vduse_dev_iotlb_entry(dev, &entry, NULL, &info.capability); if (ret < 0) break; - info.start = entry.start; - info.last = entry.last; + info.start = entry.v1.start; + info.last = entry.v1.last; + info.asid = entry.asid; ret = -EFAULT; if (copy_to_user(argp, &info, sizeof(info))) @@ -1540,8 +1626,10 @@ static int vduse_dev_release(struct inode *inode, struct file *file) struct vduse_dev *dev = file->private_data; mutex_lock(&dev->domain_lock); - if (dev->domain) - vduse_dev_dereg_umem(dev, 0, dev->domain->bounce_size); + for (int i = 0; i < dev->nas; i++) + if (dev->as[i].domain) + vduse_dev_dereg_umem(dev, i, 0, + dev->as[i].domain->bounce_size); mutex_unlock(&dev->domain_lock); spin_lock(&dev->msg_lock); /* Make sure the inflight messages can processed after reconncection */ @@ -1760,7 +1848,6 @@ static struct vduse_dev *vduse_dev_create(void) return NULL; mutex_init(&dev->lock); - mutex_init(&dev->mem_lock); mutex_init(&dev->domain_lock); spin_lock_init(&dev->msg_lock); INIT_LIST_HEAD(&dev->send_list); @@ -1811,8 +1898,11 @@ static int vduse_destroy_dev(char *name) idr_remove(&vduse_idr, dev->minor); kvfree(dev->config); vduse_dev_deinit_vqs(dev); - if (dev->domain) - vduse_domain_destroy(dev->domain); + for (int i = 0; i < dev->nas; i++) { + if (dev->as[i].domain) + vduse_domain_destroy(dev->as[i].domain); + } + kfree(dev->as); kfree(dev->name); kfree(dev->groups); vduse_dev_destroy(dev); @@ -1859,12 +1949,17 @@ static bool vduse_validate_config(struct vduse_dev_config *config, sizeof(config->reserved))) return false; - if (api_version < VDUSE_API_VERSION_1 && config->ngroups) + if (api_version < VDUSE_API_VERSION_1 && + (config->ngroups || config->nas)) return false; - if (api_version >= VDUSE_API_VERSION_1 && - (!config->ngroups || config->ngroups > VDUSE_DEV_MAX_GROUPS)) - return false; + if (api_version >= VDUSE_API_VERSION_1) { + if (!config->ngroups || config->ngroups > VDUSE_DEV_MAX_GROUPS) + return false; + + if (!config->nas || config->nas > VDUSE_DEV_MAX_AS) + return false; + } if (config->vq_align > PAGE_SIZE) return false; @@ -1929,7 +2024,8 @@ static ssize_t bounce_size_store(struct device *device, ret = -EPERM; mutex_lock(&dev->domain_lock); - if (dev->domain) + /* Assuming that if the first domain is allocated, all are allocated */ + if (dev->as[0].domain) goto unlock; ret = kstrtouint(buf, 10, &bounce_size); @@ -1981,6 +2077,14 @@ static int vduse_create_dev(struct vduse_dev_config *config, dev->device_features = config->features; dev->device_id = config->device_id; dev->vendor_id = config->vendor_id; + + dev->nas = (dev->api_version < VDUSE_API_VERSION_1) ? 1 : config->nas; + dev->as = kcalloc(dev->nas, sizeof(dev->as[0]), GFP_KERNEL); + if (!dev->as) + goto err_as; + for (int i = 0; i < dev->nas; i++) + mutex_init(&dev->as[i].mem_lock); + dev->ngroups = (dev->api_version < VDUSE_API_VERSION_1) ? 1 : config->ngroups; @@ -1988,8 +2092,11 @@ static int vduse_create_dev(struct vduse_dev_config *config, GFP_KERNEL); if (!dev->groups) goto err_vq_groups; - for (u32 i = 0; i < dev->ngroups; ++i) + for (u32 i = 0; i < dev->ngroups; ++i) { dev->groups[i].dev = dev; + rwlock_init(&dev->groups[i].as_lock); + dev->groups[i].as = &dev->as[0]; + } dev->name = kstrdup(config->name, GFP_KERNEL); if (!dev->name) @@ -2029,6 +2136,8 @@ err_idr: err_str: kfree(dev->groups); err_vq_groups: + kfree(dev->as); +err_as: vduse_dev_destroy(dev); err: return ret; @@ -2152,7 +2261,7 @@ static int vduse_dev_init_vdpa(struct vduse_dev *dev, const char *name) vdev = vdpa_alloc_device(struct vduse_vdpa, vdpa, dev->dev, &vduse_vdpa_config_ops, &vduse_map_ops, - dev->ngroups, 1, name, true); + dev->ngroups, dev->nas, name, true); if (IS_ERR(vdev)) return PTR_ERR(vdev); @@ -2167,7 +2276,8 @@ static int vdpa_dev_add(struct vdpa_mgmt_dev *mdev, const char *name, const struct vdpa_dev_set_config *config) { struct vduse_dev *dev; - int ret; + size_t domain_bounce_size; + int ret, i; mutex_lock(&vduse_lock); dev = vduse_find_dev(name); @@ -2181,29 +2291,38 @@ static int vdpa_dev_add(struct vdpa_mgmt_dev *mdev, const char *name, return ret; mutex_lock(&dev->domain_lock); - if (!dev->domain) - dev->domain = vduse_domain_create(VDUSE_IOVA_SIZE - 1, - dev->bounce_size); - mutex_unlock(&dev->domain_lock); - if (!dev->domain) { - ret = -ENOMEM; - goto domain_err; + ret = 0; + + domain_bounce_size = dev->bounce_size / dev->nas; + for (i = 0; i < dev->nas; ++i) { + dev->as[i].domain = vduse_domain_create(VDUSE_IOVA_SIZE - 1, + domain_bounce_size); + if (!dev->as[i].domain) { + ret = -ENOMEM; + goto err; + } } + mutex_unlock(&dev->domain_lock); + ret = _vdpa_register_device(&dev->vdev->vdpa, dev->vq_num); - if (ret) { - goto register_err; - } + if (ret) + goto err_register; return 0; -register_err: +err_register: mutex_lock(&dev->domain_lock); - vduse_domain_destroy(dev->domain); - dev->domain = NULL; + +err: + for (int j = 0; j < i; j++) { + if (dev->as[j].domain) { + vduse_domain_destroy(dev->as[j].domain); + dev->as[j].domain = NULL; + } + } mutex_unlock(&dev->domain_lock); -domain_err: put_device(&dev->vdev->vdpa.dev); return ret; diff --git a/include/uapi/linux/vduse.h b/include/uapi/linux/vduse.h index a3d51cf6df3a..68b4287f9fac 100644 --- a/include/uapi/linux/vduse.h +++ b/include/uapi/linux/vduse.h @@ -32,6 +32,7 @@ * @vq_num: the number of virtqueues * @vq_align: the allocation alignment of virtqueue's metadata * @ngroups: number of vq groups that VDUSE device declares + * @nas: number of address spaces that VDUSE device declares * @reserved: for future use, needs to be initialized to zero * @config_size: the size of the configuration space * @config: the buffer of the configuration space @@ -47,7 +48,8 @@ struct vduse_dev_config { __u32 vq_num; __u32 vq_align; __u32 ngroups; /* if VDUSE_API_VERSION >= 1 */ - __u32 reserved[12]; + __u32 nas; /* if VDUSE_API_VERSION >= 1 */ + __u32 reserved[11]; __u32 config_size; __u8 config[]; }; @@ -166,6 +168,16 @@ struct vduse_vq_state_packed { __u16 last_used_idx; }; +/** + * struct vduse_vq_group_asid - virtqueue group ASID + * @group: Index of the virtqueue group + * @asid: Address space ID of the group + */ +struct vduse_vq_group_asid { + __u32 group; + __u32 asid; +}; + /** * struct vduse_vq_info - information of a virtqueue * @index: virtqueue index @@ -225,6 +237,7 @@ struct vduse_vq_eventfd { * @uaddr: start address of userspace memory, it must be aligned to page size * @iova: start of the IOVA region * @size: size of the IOVA region + * @asid: Address space ID of the IOVA region * @reserved: for future use, needs to be initialized to zero * * Structure used by VDUSE_IOTLB_REG_UMEM and VDUSE_IOTLB_DEREG_UMEM @@ -234,7 +247,8 @@ struct vduse_iova_umem { __u64 uaddr; __u64 iova; __u64 size; - __u64 reserved[3]; + __u32 asid; + __u32 reserved[5]; }; /* Register userspace memory for IOVA regions */ @@ -248,6 +262,7 @@ struct vduse_iova_umem { * @start: start of the IOVA region * @last: last of the IOVA region * @capability: capability of the IOVA region + * @asid: Address space ID of the IOVA region, only if device API version >= 1 * @reserved: for future use, needs to be initialized to zero * * Structure used by VDUSE_IOTLB_GET_INFO ioctl to get information of @@ -258,7 +273,8 @@ struct vduse_iova_info { __u64 last; #define VDUSE_IOVA_CAP_UMEM (1 << 0) __u64 capability; - __u64 reserved[3]; + __u32 asid; /* Only if device API version >= 1 */ + __u32 reserved[5]; }; /* @@ -267,6 +283,28 @@ struct vduse_iova_info { */ #define VDUSE_IOTLB_GET_INFO _IOWR(VDUSE_BASE, 0x1a, struct vduse_iova_info) +/** + * struct vduse_iotlb_entry_v2 - entry of IOTLB to describe one IOVA region + * + * @v1: the original vduse_iotlb_entry + * @asid: address space ID of the IOVA region + * @reserved: for future use, needs to be initialized to zero + * + * Structure used by VDUSE_IOTLB_GET_FD2 ioctl to find an overlapped IOVA region. + */ +struct vduse_iotlb_entry_v2 { + struct vduse_iotlb_entry v1; + __u32 asid; + __u32 reserved[12]; +}; + +/* + * Same as VDUSE_IOTLB_GET_FD but with vduse_iotlb_entry_v2 argument that + * support extra fields. + */ +#define VDUSE_IOTLB_GET_FD2 _IOWR(VDUSE_BASE, 0x1b, struct vduse_iotlb_entry_v2) + + /* The control messages definition for read(2)/write(2) on /dev/vduse/$NAME */ /** @@ -275,11 +313,14 @@ struct vduse_iova_info { * @VDUSE_SET_STATUS: set the device status * @VDUSE_UPDATE_IOTLB: Notify userspace to update the memory mapping for * specified IOVA range via VDUSE_IOTLB_GET_FD ioctl + * @VDUSE_SET_VQ_GROUP_ASID: Notify userspace to update the address space of a + * virtqueue group. */ enum vduse_req_type { VDUSE_GET_VQ_STATE, VDUSE_SET_STATUS, VDUSE_UPDATE_IOTLB, + VDUSE_SET_VQ_GROUP_ASID, }; /** @@ -314,6 +355,18 @@ struct vduse_iova_range { __u64 last; }; +/** + * struct vduse_iova_range_v2 - IOVA range [start, last] if API_VERSION >= 1 + * @start: start of the IOVA range + * @last: last of the IOVA range + * @asid: address space ID of the IOVA range + */ +struct vduse_iova_range_v2 { + __u64 start; + __u64 last; + __u32 asid; +}; + /** * struct vduse_dev_request - control request * @type: request type @@ -322,6 +375,8 @@ struct vduse_iova_range { * @vq_state: virtqueue state, only index field is available * @s: device status * @iova: IOVA range for updating + * @iova_v2: IOVA range for updating if API_VERSION >= 1 + * @vq_group_asid: ASID of a virtqueue group * @padding: padding * * Structure used by read(2) on /dev/vduse/$NAME. @@ -334,6 +389,11 @@ struct vduse_dev_request { struct vduse_vq_state vq_state; struct vduse_dev_status s; struct vduse_iova_range iova; + /* Following members but padding exist only if vduse api + * version >= 1 + */ + struct vduse_iova_range_v2 iova_v2; + struct vduse_vq_group_asid vq_group_asid; __u32 padding[32]; }; }; -- cgit v1.2.3 From 5ca243f6e3c30b979a54a96b96df355dda2b4d0f Mon Sep 17 00:00:00 2001 From: Deepak Gupta Date: Sun, 25 Jan 2026 21:09:54 -0700 Subject: prctl: add arch-agnostic prctl()s for indirect branch tracking Three architectures (x86, aarch64, riscv) have support for indirect branch tracking feature in a very similar fashion. On a very high level, indirect branch tracking is a CPU feature where CPU tracks branches which use a memory operand to transfer control. As part of this tracking, during an indirect branch, the CPU expects a landing pad instruction on the target PC, and if not found, the CPU raises some fault (architecture-dependent). x86 landing pad instr - 'ENDBRANCH' arch64 landing pad instr - 'BTI' riscv landing instr - 'lpad' Given that three major architectures have support for indirect branch tracking, this patch creates architecture-agnostic 'prctls' to allow userspace to control this feature. They are: - PR_GET_INDIR_BR_LP_STATUS: Get the current configured status for indirect branch tracking. - PR_SET_INDIR_BR_LP_STATUS: Set the configuration for indirect branch tracking. The following status options are allowed: - PR_INDIR_BR_LP_ENABLE: Enables indirect branch tracking on user thread. - PR_INDIR_BR_LP_DISABLE: Disables indirect branch tracking on user thread. - PR_LOCK_INDIR_BR_LP_STATUS: Locks configured status for indirect branch tracking for user thread. Reviewed-by: Mark Brown Reviewed-by: Zong Li Signed-off-by: Deepak Gupta Tested-by: Andreas Korb # QEMU, custom CVA6 Tested-by: Valentin Haudiquet Link: https://patch.msgid.link/20251112-v5_user_cfi_series-v23-13-b55691eacf4f@rivosinc.com [pjw@kernel.org: cleaned up patch description, code comments] Signed-off-by: Paul Walmsley --- include/linux/cpu.h | 4 ++++ include/uapi/linux/prctl.h | 27 +++++++++++++++++++++++++++ kernel/sys.c | 30 ++++++++++++++++++++++++++++++ 3 files changed, 61 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/linux/cpu.h b/include/linux/cpu.h index 487b3bf2e1ea..8239cd95a005 100644 --- a/include/linux/cpu.h +++ b/include/linux/cpu.h @@ -229,4 +229,8 @@ static inline bool cpu_attack_vector_mitigated(enum cpu_attack_vectors v) #define smt_mitigations SMT_MITIGATIONS_OFF #endif +int arch_get_indir_br_lp_status(struct task_struct *t, unsigned long __user *status); +int arch_set_indir_br_lp_status(struct task_struct *t, unsigned long status); +int arch_lock_indir_br_lp_status(struct task_struct *t, unsigned long status); + #endif /* _LINUX_CPU_H_ */ diff --git a/include/uapi/linux/prctl.h b/include/uapi/linux/prctl.h index 51c4e8c82b1e..f57098fb0ba8 100644 --- a/include/uapi/linux/prctl.h +++ b/include/uapi/linux/prctl.h @@ -386,4 +386,31 @@ struct prctl_mm_map { # define PR_FUTEX_HASH_SET_SLOTS 1 # define PR_FUTEX_HASH_GET_SLOTS 2 +/* + * Get the current indirect branch tracking configuration for the current + * thread, this will be the value configured via PR_SET_INDIR_BR_LP_STATUS. + */ +#define PR_GET_INDIR_BR_LP_STATUS 79 + +/* + * Set the indirect branch tracking configuration. PR_INDIR_BR_LP_ENABLE will + * enable cpu feature for user thread, to track all indirect branches and ensure + * they land on arch defined landing pad instruction. + * x86 - If enabled, an indirect branch must land on an ENDBRANCH instruction. + * arch64 - If enabled, an indirect branch must land on a BTI instruction. + * riscv - If enabled, an indirect branch must land on an lpad instruction. + * PR_INDIR_BR_LP_DISABLE will disable feature for user thread and indirect + * branches will no more be tracked by cpu to land on arch defined landing pad + * instruction. + */ +#define PR_SET_INDIR_BR_LP_STATUS 80 +# define PR_INDIR_BR_LP_ENABLE (1UL << 0) + +/* + * Prevent further changes to the specified indirect branch tracking + * configuration. All bits may be locked via this call, including + * undefined bits. + */ +#define PR_LOCK_INDIR_BR_LP_STATUS 81 + #endif /* _LINUX_PRCTL_H */ diff --git a/kernel/sys.c b/kernel/sys.c index 8b58eece4e58..9071422c1609 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -2388,6 +2388,21 @@ int __weak arch_lock_shadow_stack_status(struct task_struct *t, unsigned long st return -EINVAL; } +int __weak arch_get_indir_br_lp_status(struct task_struct *t, unsigned long __user *status) +{ + return -EINVAL; +} + +int __weak arch_set_indir_br_lp_status(struct task_struct *t, unsigned long status) +{ + return -EINVAL; +} + +int __weak arch_lock_indir_br_lp_status(struct task_struct *t, unsigned long status) +{ + return -EINVAL; +} + #define PR_IO_FLUSHER (PF_MEMALLOC_NOIO | PF_LOCAL_THROTTLE) static int prctl_set_vma(unsigned long opt, unsigned long addr, @@ -2868,6 +2883,21 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3, case PR_FUTEX_HASH: error = futex_hash_prctl(arg2, arg3, arg4); break; + case PR_GET_INDIR_BR_LP_STATUS: + if (arg3 || arg4 || arg5) + return -EINVAL; + error = arch_get_indir_br_lp_status(me, (unsigned long __user *)arg2); + break; + case PR_SET_INDIR_BR_LP_STATUS: + if (arg3 || arg4 || arg5) + return -EINVAL; + error = arch_set_indir_br_lp_status(me, arg2); + break; + case PR_LOCK_INDIR_BR_LP_STATUS: + if (arg3 || arg4 || arg5) + return -EINVAL; + error = arch_lock_indir_br_lp_status(me, arg2); + break; default: trace_task_prctl_unknown(option, arg2, arg3, arg4, arg5); error = -EINVAL; -- cgit v1.2.3 From 2af7c9cf021c5dabe880b68e5cc22c618060d954 Mon Sep 17 00:00:00 2001 From: Deepak Gupta Date: Sun, 25 Jan 2026 21:09:55 -0700 Subject: riscv/ptrace: expose riscv CFI status and state via ptrace and in core files Expose a new register type NT_RISCV_USER_CFI for risc-v CFI status and state. Intentionally, both landing pad and shadow stack status and state are rolled into the CFI state. Creating two different NT_RISCV_USER_XXX would not be useful and would waste a note type. Enabling, disabling and locking the CFI feature is not allowed via ptrace set interface. However, setting 'elp' state or setting shadow stack pointer are allowed via the ptrace set interface. It is expected that 'gdb' might need to fixup 'elp' state or 'shadow stack' pointer. Signed-off-by: Deepak Gupta Tested-by: Andreas Korb # QEMU, custom CVA6 Tested-by: Valentin Haudiquet Link: https://patch.msgid.link/20251112-v5_user_cfi_series-v23-19-b55691eacf4f@rivosinc.com [pjw@kernel.org: updated to apply; cleaned patch description and comments; addressed checkpatch issues] Signed-off-by: Paul Walmsley --- arch/riscv/include/uapi/asm/ptrace.h | 30 ++++++++++++ arch/riscv/kernel/ptrace.c | 95 ++++++++++++++++++++++++++++++++++++ include/uapi/linux/elf.h | 2 + 3 files changed, 127 insertions(+) (limited to 'include/uapi/linux') diff --git a/arch/riscv/include/uapi/asm/ptrace.h b/arch/riscv/include/uapi/asm/ptrace.h index 261bfe70f60a..18988a5f1a63 100644 --- a/arch/riscv/include/uapi/asm/ptrace.h +++ b/arch/riscv/include/uapi/asm/ptrace.h @@ -131,6 +131,36 @@ struct __sc_riscv_cfi_state { unsigned long ss_ptr; /* shadow stack pointer */ }; +#define PTRACE_CFI_LP_EN_BIT 0 +#define PTRACE_CFI_LP_LOCK_BIT 1 +#define PTRACE_CFI_ELP_BIT 2 +#define PTRACE_CFI_SS_EN_BIT 3 +#define PTRACE_CFI_SS_LOCK_BIT 4 +#define PTRACE_CFI_SS_PTR_BIT 5 + +#define PTRACE_CFI_LP_EN_STATE BIT(PTRACE_CFI_LP_EN_BIT) +#define PTRACE_CFI_LP_LOCK_STATE BIT(PTRACE_CFI_LP_LOCK_BIT) +#define PTRACE_CFI_ELP_STATE BIT(PTRACE_CFI_ELP_BIT) +#define PTRACE_CFI_SS_EN_STATE BIT(PTRACE_CFI_SS_EN_BIT) +#define PTRACE_CFI_SS_LOCK_STATE BIT(PTRACE_CFI_SS_LOCK_BIT) +#define PTRACE_CFI_SS_PTR_STATE BIT(PTRACE_CFI_SS_PTR_BIT) + +#define PRACE_CFI_STATE_INVALID_MASK ~(PTRACE_CFI_LP_EN_STATE | \ + PTRACE_CFI_LP_LOCK_STATE | \ + PTRACE_CFI_ELP_STATE | \ + PTRACE_CFI_SS_EN_STATE | \ + PTRACE_CFI_SS_LOCK_STATE | \ + PTRACE_CFI_SS_PTR_STATE) + +struct __cfi_status { + __u64 cfi_state; +}; + +struct user_cfi_state { + struct __cfi_status cfi_status; + __u64 shstk_ptr; +}; + #endif /* __ASSEMBLER__ */ #endif /* _UAPI_ASM_RISCV_PTRACE_H */ diff --git a/arch/riscv/kernel/ptrace.c b/arch/riscv/kernel/ptrace.c index e6272d74572f..57e257d459e8 100644 --- a/arch/riscv/kernel/ptrace.c +++ b/arch/riscv/kernel/ptrace.c @@ -19,6 +19,7 @@ #include #include #include +#include enum riscv_regset { REGSET_X, @@ -31,6 +32,9 @@ enum riscv_regset { #ifdef CONFIG_RISCV_ISA_SUPM REGSET_TAGGED_ADDR_CTRL, #endif +#ifdef CONFIG_RISCV_USER_CFI + REGSET_CFI, +#endif }; static int riscv_gpr_get(struct task_struct *target, @@ -195,6 +199,87 @@ static int tagged_addr_ctrl_set(struct task_struct *target, } #endif +#ifdef CONFIG_RISCV_USER_CFI +static int riscv_cfi_get(struct task_struct *target, + const struct user_regset *regset, + struct membuf to) +{ + struct user_cfi_state user_cfi; + struct pt_regs *regs; + + memset(&user_cfi, 0, sizeof(user_cfi)); + regs = task_pt_regs(target); + + if (is_indir_lp_enabled(target)) { + user_cfi.cfi_status.cfi_state |= PTRACE_CFI_LP_EN_STATE; + user_cfi.cfi_status.cfi_state |= is_indir_lp_locked(target) ? + PTRACE_CFI_LP_LOCK_STATE : 0; + user_cfi.cfi_status.cfi_state |= (regs->status & SR_ELP) ? + PTRACE_CFI_ELP_STATE : 0; + } + + if (is_shstk_enabled(target)) { + user_cfi.cfi_status.cfi_state |= (PTRACE_CFI_SS_EN_STATE | + PTRACE_CFI_SS_PTR_STATE); + user_cfi.cfi_status.cfi_state |= is_shstk_locked(target) ? + PTRACE_CFI_SS_LOCK_STATE : 0; + user_cfi.shstk_ptr = get_active_shstk(target); + } + + return membuf_write(&to, &user_cfi, sizeof(user_cfi)); +} + +/* + * Does it make sense to allow enable / disable of cfi via ptrace? + * We don't allow enable / disable / locking control via ptrace for now. + * Setting the shadow stack pointer is allowed. GDB might use it to unwind or + * some other fixup. Similarly gdb might want to suppress elp and may want + * to reset elp state. + */ +static int riscv_cfi_set(struct task_struct *target, + const struct user_regset *regset, + unsigned int pos, unsigned int count, + const void *kbuf, const void __user *ubuf) +{ + int ret; + struct user_cfi_state user_cfi; + struct pt_regs *regs; + + regs = task_pt_regs(target); + + ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, &user_cfi, 0, -1); + if (ret) + return ret; + + /* + * Not allowing enabling or locking shadow stack or landing pad + * There is no disabling of shadow stack or landing pad via ptrace + * rsvd field should be set to zero so that if those fields are needed in future + */ + if ((user_cfi.cfi_status.cfi_state & + (PTRACE_CFI_LP_EN_STATE | PTRACE_CFI_LP_LOCK_STATE | + PTRACE_CFI_SS_EN_STATE | PTRACE_CFI_SS_LOCK_STATE)) || + (user_cfi.cfi_status.cfi_state & PRACE_CFI_STATE_INVALID_MASK)) + return -EINVAL; + + /* If lpad is enabled on target and ptrace requests to set / clear elp, do that */ + if (is_indir_lp_enabled(target)) { + if (user_cfi.cfi_status.cfi_state & + PTRACE_CFI_ELP_STATE) /* set elp state */ + regs->status |= SR_ELP; + else + regs->status &= ~SR_ELP; /* clear elp state */ + } + + /* If shadow stack enabled on target, set new shadow stack pointer */ + if (is_shstk_enabled(target) && + (user_cfi.cfi_status.cfi_state & PTRACE_CFI_SS_PTR_STATE)) + set_active_shstk(target, user_cfi.shstk_ptr); + + return 0; +} +#endif + static struct user_regset riscv_user_regset[] __ro_after_init = { [REGSET_X] = { USER_REGSET_NOTE_TYPE(PRSTATUS), @@ -234,6 +319,16 @@ static struct user_regset riscv_user_regset[] __ro_after_init = { .set = tagged_addr_ctrl_set, }, #endif +#ifdef CONFIG_RISCV_USER_CFI + [REGSET_CFI] = { + .core_note_type = NT_RISCV_USER_CFI, + .align = sizeof(__u64), + .n = sizeof(struct user_cfi_state) / sizeof(__u64), + .size = sizeof(__u64), + .regset_get = riscv_cfi_get, + .set = riscv_cfi_set, + }, +#endif }; static const struct user_regset_view riscv_user_native_view = { diff --git a/include/uapi/linux/elf.h b/include/uapi/linux/elf.h index 819ded2d39de..ee30dcd80901 100644 --- a/include/uapi/linux/elf.h +++ b/include/uapi/linux/elf.h @@ -545,6 +545,8 @@ typedef struct elf64_shdr { #define NT_RISCV_VECTOR 0x901 /* RISC-V vector registers */ #define NN_RISCV_TAGGED_ADDR_CTRL "LINUX" #define NT_RISCV_TAGGED_ADDR_CTRL 0x902 /* RISC-V tagged address control (prctl()) */ +#define NN_RISCV_USER_CFI "LINUX" +#define NT_RISCV_USER_CFI 0x903 /* RISC-V shadow stack state */ #define NN_LOONGARCH_CPUCFG "LINUX" #define NT_LOONGARCH_CPUCFG 0xa00 /* LoongArch CPU config registers */ #define NN_LOONGARCH_CSR "LINUX" -- cgit v1.2.3 From 0e6b7eae1fded85f94a357d6132f07d64c614cfa Mon Sep 17 00:00:00 2001 From: Andrey Albershteyn Date: Mon, 26 Jan 2026 12:56:57 +0100 Subject: fs: add FS_XFLAG_VERITY for fs-verity files fs-verity introduced inode flag for inodes with enabled fs-verity on them. This patch adds FS_XFLAG_VERITY file attribute which can be retrieved with FS_IOC_FSGETXATTR ioctl() and file_getattr() syscall. This flag is read-only and can not be set with corresponding set ioctl() and file_setattr(). The FS_IOC_SETFLAGS requires file to be opened for writing which is not allowed for verity files. The FS_IOC_FSSETXATTR and file_setattr() clears this flag from the user input. As this is now common flag for both flag interfaces (flags/xflags) add it to overlapping flags list to exclude it from overwrite. Signed-off-by: Andrey Albershteyn Link: https://patch.msgid.link/20260126115658.27656-2-aalbersh@kernel.org Reviewed-by: Darrick J. Wong Signed-off-by: Christian Brauner --- Documentation/filesystems/fsverity.rst | 16 ++++++++++++++++ fs/file_attr.c | 4 ++++ include/linux/fileattr.h | 6 +++--- include/uapi/linux/fs.h | 1 + 4 files changed, 24 insertions(+), 3 deletions(-) (limited to 'include/uapi/linux') diff --git a/Documentation/filesystems/fsverity.rst b/Documentation/filesystems/fsverity.rst index 412cf11e3298..22b49b295d1f 100644 --- a/Documentation/filesystems/fsverity.rst +++ b/Documentation/filesystems/fsverity.rst @@ -341,6 +341,22 @@ the file has fs-verity enabled. This can perform better than FS_IOC_GETFLAGS and FS_IOC_MEASURE_VERITY because it doesn't require opening the file, and opening verity files can be expensive. +FS_IOC_FSGETXATTR +----------------- + +Since Linux v7.0, the FS_IOC_FSGETXATTR ioctl sets FS_XFLAG_VERITY (0x00020000) +in the returned flags when the file has verity enabled. Note that this attribute +cannot be set with FS_IOC_FSSETXATTR as enabling verity requires input +parameters. See FS_IOC_ENABLE_VERITY. + +file_getattr +------------ + +Since Linux v7.0, the file_getattr() syscall sets FS_XFLAG_VERITY (0x00020000) +in the returned flags when the file has verity enabled. Note that this attribute +cannot be set with file_setattr() as enabling verity requires input parameters. +See FS_IOC_ENABLE_VERITY. + .. _accessing_verity_files: Accessing verity files diff --git a/fs/file_attr.c b/fs/file_attr.c index f3704881c126..dfde87401817 100644 --- a/fs/file_attr.c +++ b/fs/file_attr.c @@ -36,6 +36,8 @@ void fileattr_fill_xflags(struct file_kattr *fa, u32 xflags) fa->flags |= FS_DAX_FL; if (fa->fsx_xflags & FS_XFLAG_PROJINHERIT) fa->flags |= FS_PROJINHERIT_FL; + if (fa->fsx_xflags & FS_XFLAG_VERITY) + fa->flags |= FS_VERITY_FL; } EXPORT_SYMBOL(fileattr_fill_xflags); @@ -66,6 +68,8 @@ void fileattr_fill_flags(struct file_kattr *fa, u32 flags) fa->fsx_xflags |= FS_XFLAG_DAX; if (fa->flags & FS_PROJINHERIT_FL) fa->fsx_xflags |= FS_XFLAG_PROJINHERIT; + if (fa->flags & FS_VERITY_FL) + fa->fsx_xflags |= FS_XFLAG_VERITY; } EXPORT_SYMBOL(fileattr_fill_flags); diff --git a/include/linux/fileattr.h b/include/linux/fileattr.h index f89dcfad3f8f..3780904a63a6 100644 --- a/include/linux/fileattr.h +++ b/include/linux/fileattr.h @@ -7,16 +7,16 @@ #define FS_COMMON_FL \ (FS_SYNC_FL | FS_IMMUTABLE_FL | FS_APPEND_FL | \ FS_NODUMP_FL | FS_NOATIME_FL | FS_DAX_FL | \ - FS_PROJINHERIT_FL) + FS_PROJINHERIT_FL | FS_VERITY_FL) #define FS_XFLAG_COMMON \ (FS_XFLAG_SYNC | FS_XFLAG_IMMUTABLE | FS_XFLAG_APPEND | \ FS_XFLAG_NODUMP | FS_XFLAG_NOATIME | FS_XFLAG_DAX | \ - FS_XFLAG_PROJINHERIT) + FS_XFLAG_PROJINHERIT | FS_XFLAG_VERITY) /* Read-only inode flags */ #define FS_XFLAG_RDONLY_MASK \ - (FS_XFLAG_PREALLOC | FS_XFLAG_HASATTR) + (FS_XFLAG_PREALLOC | FS_XFLAG_HASATTR | FS_XFLAG_VERITY) /* Flags to indicate valid value of fsx_ fields */ #define FS_XFLAG_VALUES_MASK \ diff --git a/include/uapi/linux/fs.h b/include/uapi/linux/fs.h index 66ca526cf786..70b2b661f42c 100644 --- a/include/uapi/linux/fs.h +++ b/include/uapi/linux/fs.h @@ -253,6 +253,7 @@ struct file_attr { #define FS_XFLAG_FILESTREAM 0x00004000 /* use filestream allocator */ #define FS_XFLAG_DAX 0x00008000 /* use DAX for IO */ #define FS_XFLAG_COWEXTSIZE 0x00010000 /* CoW extent size allocator hint */ +#define FS_XFLAG_VERITY 0x00020000 /* fs-verity enabled */ #define FS_XFLAG_HASATTR 0x80000000 /* no DIFLAG for this */ /* the read-only stuff doesn't really belong here, but any other place is -- cgit v1.2.3 From 8cf82bb558517503a81f8e3c49914c0836360aa6 Mon Sep 17 00:00:00 2001 From: Koichiro Den Date: Sat, 24 Jan 2026 23:50:11 +0900 Subject: misc: pci_endpoint_test: Add BAR subrange mapping test case Add a new PCITEST_BAR_SUBRANGE ioctl to exercise EPC BAR subrange mapping end-to-end. The test programs a simple 2-subrange layout on the endpoint (via pci-epf-test) and verifies that: - the endpoint-provided per-subrange signature bytes are observed at the expected PCIe BAR offsets, and - writes to each subrange are routed to the correct backing region (i.e. the submap order is applied rather than accidentally working due to an identity mapping). Return -EOPNOTSUPP when the endpoint does not advertise subrange mapping, -ENODATA when the BAR is disabled, and -EBUSY when the BAR is reserved for the test register space. Signed-off-by: Koichiro Den Signed-off-by: Manivannan Sadhasivam Signed-off-by: Bjorn Helgaas Link: https://patch.msgid.link/20260124145012.2794108-8-den@valinux.co.jp --- drivers/misc/pci_endpoint_test.c | 203 ++++++++++++++++++++++++++++++++++++++- include/uapi/linux/pcitest.h | 1 + 2 files changed, 203 insertions(+), 1 deletion(-) (limited to 'include/uapi/linux') diff --git a/drivers/misc/pci_endpoint_test.c b/drivers/misc/pci_endpoint_test.c index 1c0fd185114f..74ab5b5b9011 100644 --- a/drivers/misc/pci_endpoint_test.c +++ b/drivers/misc/pci_endpoint_test.c @@ -39,6 +39,8 @@ #define COMMAND_COPY BIT(5) #define COMMAND_ENABLE_DOORBELL BIT(6) #define COMMAND_DISABLE_DOORBELL BIT(7) +#define COMMAND_BAR_SUBRANGE_SETUP BIT(8) +#define COMMAND_BAR_SUBRANGE_CLEAR BIT(9) #define PCI_ENDPOINT_TEST_STATUS 0x8 #define STATUS_READ_SUCCESS BIT(0) @@ -55,6 +57,10 @@ #define STATUS_DOORBELL_ENABLE_FAIL BIT(11) #define STATUS_DOORBELL_DISABLE_SUCCESS BIT(12) #define STATUS_DOORBELL_DISABLE_FAIL BIT(13) +#define STATUS_BAR_SUBRANGE_SETUP_SUCCESS BIT(14) +#define STATUS_BAR_SUBRANGE_SETUP_FAIL BIT(15) +#define STATUS_BAR_SUBRANGE_CLEAR_SUCCESS BIT(16) +#define STATUS_BAR_SUBRANGE_CLEAR_FAIL BIT(17) #define PCI_ENDPOINT_TEST_LOWER_SRC_ADDR 0x0c #define PCI_ENDPOINT_TEST_UPPER_SRC_ADDR 0x10 @@ -77,6 +83,7 @@ #define CAP_MSI BIT(1) #define CAP_MSIX BIT(2) #define CAP_INTX BIT(3) +#define CAP_SUBRANGE_MAPPING BIT(4) #define PCI_ENDPOINT_TEST_DB_BAR 0x34 #define PCI_ENDPOINT_TEST_DB_OFFSET 0x38 @@ -100,6 +107,8 @@ #define PCI_DEVICE_ID_ROCKCHIP_RK3588 0x3588 +#define PCI_ENDPOINT_TEST_BAR_SUBRANGE_NSUB 2 + static DEFINE_IDA(pci_endpoint_test_ida); #define to_endpoint_test(priv) container_of((priv), struct pci_endpoint_test, \ @@ -414,6 +423,193 @@ static int pci_endpoint_test_bars(struct pci_endpoint_test *test) return 0; } +static u8 pci_endpoint_test_subrange_sig_byte(enum pci_barno barno, + unsigned int subno) +{ + return 0x50 + (barno * 8) + subno; +} + +static u8 pci_endpoint_test_subrange_test_byte(enum pci_barno barno, + unsigned int subno) +{ + return 0xa0 + (barno * 8) + subno; +} + +static int pci_endpoint_test_bar_subrange_cmd(struct pci_endpoint_test *test, + enum pci_barno barno, u32 command, + u32 ok_bit, u32 fail_bit) +{ + struct pci_dev *pdev = test->pdev; + struct device *dev = &pdev->dev; + int irq_type = test->irq_type; + u32 status; + + if (irq_type < PCITEST_IRQ_TYPE_INTX || + irq_type > PCITEST_IRQ_TYPE_MSIX) { + dev_err(dev, "Invalid IRQ type\n"); + return -EINVAL; + } + + reinit_completion(&test->irq_raised); + + pci_endpoint_test_writel(test, PCI_ENDPOINT_TEST_STATUS, 0); + pci_endpoint_test_writel(test, PCI_ENDPOINT_TEST_IRQ_TYPE, irq_type); + pci_endpoint_test_writel(test, PCI_ENDPOINT_TEST_IRQ_NUMBER, 1); + /* Reuse SIZE as a command parameter: bar number. */ + pci_endpoint_test_writel(test, PCI_ENDPOINT_TEST_SIZE, barno); + pci_endpoint_test_writel(test, PCI_ENDPOINT_TEST_COMMAND, command); + + if (!wait_for_completion_timeout(&test->irq_raised, + msecs_to_jiffies(1000))) + return -ETIMEDOUT; + + status = pci_endpoint_test_readl(test, PCI_ENDPOINT_TEST_STATUS); + if (status & fail_bit) + return -EIO; + + if (!(status & ok_bit)) + return -EIO; + + return 0; +} + +static int pci_endpoint_test_bar_subrange_setup(struct pci_endpoint_test *test, + enum pci_barno barno) +{ + return pci_endpoint_test_bar_subrange_cmd(test, barno, + COMMAND_BAR_SUBRANGE_SETUP, + STATUS_BAR_SUBRANGE_SETUP_SUCCESS, + STATUS_BAR_SUBRANGE_SETUP_FAIL); +} + +static int pci_endpoint_test_bar_subrange_clear(struct pci_endpoint_test *test, + enum pci_barno barno) +{ + return pci_endpoint_test_bar_subrange_cmd(test, barno, + COMMAND_BAR_SUBRANGE_CLEAR, + STATUS_BAR_SUBRANGE_CLEAR_SUCCESS, + STATUS_BAR_SUBRANGE_CLEAR_FAIL); +} + +static int pci_endpoint_test_bar_subrange(struct pci_endpoint_test *test, + enum pci_barno barno) +{ + u32 nsub = PCI_ENDPOINT_TEST_BAR_SUBRANGE_NSUB; + struct device *dev = &test->pdev->dev; + size_t sub_size, buf_size; + resource_size_t bar_size; + void __iomem *bar_addr; + void *read_buf = NULL; + int ret, clear_ret; + size_t off, chunk; + u32 i, exp, val; + u8 pattern; + + if (!(test->ep_caps & CAP_SUBRANGE_MAPPING)) + return -EOPNOTSUPP; + + /* + * The test register BAR is not safe to reprogram and write/read + * over its full size. BAR_TEST already special-cases it to a tiny + * range. For subrange mapping tests, let's simply skip it. + */ + if (barno == test->test_reg_bar) + return -EBUSY; + + bar_size = pci_resource_len(test->pdev, barno); + if (!bar_size) + return -ENODATA; + + bar_addr = test->bar[barno]; + if (!bar_addr) + return -ENOMEM; + + ret = pci_endpoint_test_bar_subrange_setup(test, barno); + if (ret) + return ret; + + if (bar_size % nsub || bar_size / nsub > SIZE_MAX) { + ret = -EINVAL; + goto out_clear; + } + + sub_size = bar_size / nsub; + if (sub_size < sizeof(u32)) { + ret = -ENOSPC; + goto out_clear; + } + + /* Limit the temporary buffer size */ + buf_size = min_t(size_t, sub_size, SZ_1M); + + read_buf = kmalloc(buf_size, GFP_KERNEL); + if (!read_buf) { + ret = -ENOMEM; + goto out_clear; + } + + /* + * Step 1: verify EP-provided signature per subrange. This detects + * whether the EP actually applied the submap order. + */ + for (i = 0; i < nsub; i++) { + exp = (u32)pci_endpoint_test_subrange_sig_byte(barno, i) * + 0x01010101U; + val = ioread32(bar_addr + (i * sub_size)); + if (val != exp) { + dev_err(dev, + "BAR%d subrange%u signature mismatch @%#zx: exp %#08x got %#08x\n", + barno, i, (size_t)i * sub_size, exp, val); + ret = -EIO; + goto out_clear; + } + val = ioread32(bar_addr + (i * sub_size) + sub_size - sizeof(u32)); + if (val != exp) { + dev_err(dev, + "BAR%d subrange%u signature mismatch @%#zx: exp %#08x got %#08x\n", + barno, i, + ((size_t)i * sub_size) + sub_size - sizeof(u32), + exp, val); + ret = -EIO; + goto out_clear; + } + } + + /* Step 2: write unique pattern per subrange (write all first). */ + for (i = 0; i < nsub; i++) { + pattern = pci_endpoint_test_subrange_test_byte(barno, i); + memset_io(bar_addr + (i * sub_size), pattern, sub_size); + } + + /* Step 3: read back and verify (read all after all writes). */ + for (i = 0; i < nsub; i++) { + pattern = pci_endpoint_test_subrange_test_byte(barno, i); + for (off = 0; off < sub_size; off += chunk) { + void *bad; + + chunk = min_t(size_t, buf_size, sub_size - off); + memcpy_fromio(read_buf, bar_addr + (i * sub_size) + off, + chunk); + bad = memchr_inv(read_buf, pattern, chunk); + if (bad) { + size_t bad_off = (u8 *)bad - (u8 *)read_buf; + + dev_err(dev, + "BAR%d subrange%u data mismatch @%#zx (pattern %#02x)\n", + barno, i, (size_t)i * sub_size + off + bad_off, + pattern); + ret = -EIO; + goto out_clear; + } + } + } + +out_clear: + kfree(read_buf); + clear_ret = pci_endpoint_test_bar_subrange_clear(test, barno); + return ret ?: clear_ret; +} + static int pci_endpoint_test_intx_irq(struct pci_endpoint_test *test) { u32 val; @@ -936,12 +1132,17 @@ static long pci_endpoint_test_ioctl(struct file *file, unsigned int cmd, switch (cmd) { case PCITEST_BAR: + case PCITEST_BAR_SUBRANGE: bar = arg; if (bar <= NO_BAR || bar > BAR_5) goto ret; if (is_am654_pci_dev(pdev) && bar == BAR_0) goto ret; - ret = pci_endpoint_test_bar(test, bar); + + if (cmd == PCITEST_BAR) + ret = pci_endpoint_test_bar(test, bar); + else + ret = pci_endpoint_test_bar_subrange(test, bar); break; case PCITEST_BARS: ret = pci_endpoint_test_bars(test); diff --git a/include/uapi/linux/pcitest.h b/include/uapi/linux/pcitest.h index d6023a45a9d0..710f8842223f 100644 --- a/include/uapi/linux/pcitest.h +++ b/include/uapi/linux/pcitest.h @@ -22,6 +22,7 @@ #define PCITEST_GET_IRQTYPE _IO('P', 0x9) #define PCITEST_BARS _IO('P', 0xa) #define PCITEST_DOORBELL _IO('P', 0xb) +#define PCITEST_BAR_SUBRANGE _IO('P', 0xc) #define PCITEST_CLEAR_IRQ _IO('P', 0x10) #define PCITEST_IRQ_TYPE_UNDEFINED -1 -- cgit v1.2.3 From bc443c253fcdd2636e2a29fde3f749d39d479cf0 Mon Sep 17 00:00:00 2001 From: Ivan Vecera Date: Mon, 26 Jan 2026 17:22:51 +0100 Subject: dpll: expose fractional frequency offset in ppt Currently, the dpll subsystem exports the fractional frequency offset (FFO) in parts per million (ppm). This granularity is insufficient for high-precision synchronization scenarios which often require parts per trillion (ppt) resolution. Add a new netlink attribute DPLL_A_PIN_FRACTIONAL_FREQUENCY_OFFSET_PPT to expose the FFO in ppt. Update the dpll netlink core to expect the driver-provided FFO value to be in ppt. To maintain backward compatibility with existing userspace tools, populate the legacy DPLL_A_PIN_FRACTIONAL_FREQUENCY_OFFSET attribute by dividing the new ppt value by 1,000,000. Update the zl3073x and mlx5 drivers to provide the FFO value in ppt: - zl3073x: adjust the fixed-point calculation to produce ppt (10^12) instead of ppm (10^6). - mlx5: scale the existing ppm value by 1,000,000. Signed-off-by: Ivan Vecera Reviewed-by: Arkadiusz Kubalewski Reviewed-by: Jiri Pirko Link: https://patch.msgid.link/20260126162253.27890-1-ivecera@redhat.com Signed-off-by: Jakub Kicinski --- Documentation/netlink/specs/dpll.yaml | 11 +++++++++++ drivers/dpll/dpll_netlink.c | 10 +++++++++- drivers/dpll/zl3073x/core.c | 7 +++++-- drivers/net/ethernet/mellanox/mlx5/core/dpll.c | 2 +- include/uapi/linux/dpll.h | 1 + 5 files changed, 27 insertions(+), 4 deletions(-) (limited to 'include/uapi/linux') diff --git a/Documentation/netlink/specs/dpll.yaml b/Documentation/netlink/specs/dpll.yaml index b55afa77eac4..3dd48a32f783 100644 --- a/Documentation/netlink/specs/dpll.yaml +++ b/Documentation/netlink/specs/dpll.yaml @@ -446,6 +446,16 @@ attribute-sets: doc: | Granularity of phase adjustment, in picoseconds. The value of phase adjustment must be a multiple of this granularity. + - + name: fractional-frequency-offset-ppt + type: sint + doc: | + The FFO (Fractional Frequency Offset) of the pin with respect to + the nominal frequency. + Value = (frequency_measured - frequency_nominal) / frequency_nominal + Value is in PPT (parts per trillion, 10^-12). + Note: This attribute provides higher resolution than the standard + fractional-frequency-offset (which is in PPM). - name: pin-parent-device @@ -628,6 +638,7 @@ operations: - phase-adjust-max - phase-adjust - fractional-frequency-offset + - fractional-frequency-offset-ppt - esync-frequency - esync-frequency-supported - esync-pulse diff --git a/drivers/dpll/dpll_netlink.c b/drivers/dpll/dpll_netlink.c index 499bca460b1e..904199ddd178 100644 --- a/drivers/dpll/dpll_netlink.c +++ b/drivers/dpll/dpll_netlink.c @@ -389,7 +389,15 @@ static int dpll_msg_add_ffo(struct sk_buff *msg, struct dpll_pin *pin, return 0; return ret; } - return nla_put_sint(msg, DPLL_A_PIN_FRACTIONAL_FREQUENCY_OFFSET, ffo); + /* Put the FFO value in PPM to preserve compatibility with older + * programs. + */ + ret = nla_put_sint(msg, DPLL_A_PIN_FRACTIONAL_FREQUENCY_OFFSET, + div_s64(ffo, 1000000)); + if (ret) + return -EMSGSIZE; + return nla_put_sint(msg, DPLL_A_PIN_FRACTIONAL_FREQUENCY_OFFSET_PPT, + ffo); } static int diff --git a/drivers/dpll/zl3073x/core.c b/drivers/dpll/zl3073x/core.c index 383e2397dd03..63bd97181b9e 100644 --- a/drivers/dpll/zl3073x/core.c +++ b/drivers/dpll/zl3073x/core.c @@ -710,8 +710,11 @@ zl3073x_ref_ffo_update(struct zl3073x_dev *zldev) if (rc) return rc; - /* Convert to ppm -> ffo = (10^6 * value) / 2^32 */ - zldev->ref[i].ffo = mul_s64_u64_shr(value, 1000000, 32); + /* Convert to ppt + * ffo = (10^12 * value) / 2^32 + * ffo = ( 5^12 * value) / 2^20 + */ + zldev->ref[i].ffo = mul_s64_u64_shr(value, 244140625, 20); } return 0; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/dpll.c b/drivers/net/ethernet/mellanox/mlx5/core/dpll.c index 1e5522a19483..3ea8a1766ae2 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/dpll.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/dpll.c @@ -136,7 +136,7 @@ mlx5_dpll_pin_ffo_get(struct mlx5_dpll_synce_status *synce_status, { if (!synce_status->oper_freq_measure) return -ENODATA; - *ffo = synce_status->frequency_diff; + *ffo = 1000000LL * synce_status->frequency_diff; return 0; } diff --git a/include/uapi/linux/dpll.h b/include/uapi/linux/dpll.h index b7ff9c44f9aa..de0005f28e5c 100644 --- a/include/uapi/linux/dpll.h +++ b/include/uapi/linux/dpll.h @@ -253,6 +253,7 @@ enum dpll_a_pin { DPLL_A_PIN_ESYNC_PULSE, DPLL_A_PIN_REFERENCE_SYNC, DPLL_A_PIN_PHASE_ADJUST_GRAN, + DPLL_A_PIN_FRACTIONAL_FREQUENCY_OFFSET_PPT, __DPLL_A_PIN_MAX, DPLL_A_PIN_MAX = (__DPLL_A_PIN_MAX - 1) -- cgit v1.2.3 From 8443e2087e7002fa25984faad6bbf5f63b280645 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Fri, 30 Jan 2026 00:19:51 +0800 Subject: ublk: add UBLK_F_NO_AUTO_PART_SCAN feature flag Add a new feature flag UBLK_F_NO_AUTO_PART_SCAN to allow users to suppress automatic partition scanning when starting a ublk device. This is useful for some cases in which user don't want to scan partitions. Users still can manually trigger partition scanning later when appropriate using standard tools (e.g., partprobe, blockdev --rereadpt). Reported-by: Yoav Cohen Link: https://lore.kernel.org/linux-block/DM4PR12MB63280C5637917C071C2F0D65A9A8A@DM4PR12MB6328.namprd12.prod.outlook.com/ Signed-off-by: Ming Lei Signed-off-by: Jens Axboe --- drivers/block/ublk_drv.c | 14 ++++++++++---- include/uapi/linux/ublk_cmd.h | 3 +++ 2 files changed, 13 insertions(+), 4 deletions(-) (limited to 'include/uapi/linux') diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c index 92bd2351e3ad..4fe754e7d1e8 100644 --- a/drivers/block/ublk_drv.c +++ b/drivers/block/ublk_drv.c @@ -80,7 +80,8 @@ | UBLK_F_BUF_REG_OFF_DAEMON \ | (IS_ENABLED(CONFIG_BLK_DEV_INTEGRITY) ? UBLK_F_INTEGRITY : 0) \ | UBLK_F_SAFE_STOP_DEV \ - | UBLK_F_BATCH_IO) + | UBLK_F_BATCH_IO \ + | UBLK_F_NO_AUTO_PART_SCAN) #define UBLK_F_ALL_RECOVERY_FLAGS (UBLK_F_USER_RECOVERY \ | UBLK_F_USER_RECOVERY_REISSUE \ @@ -4430,9 +4431,14 @@ static int ublk_ctrl_start_dev(struct ublk_device *ub, set_bit(UB_STATE_USED, &ub->state); - /* Schedule async partition scan for trusted daemons */ - if (!ub->unprivileged_daemons) - schedule_work(&ub->partition_scan_work); + /* Skip partition scan if disabled by user */ + if (ub->dev_info.flags & UBLK_F_NO_AUTO_PART_SCAN) { + clear_bit(GD_SUPPRESS_PART_SCAN, &disk->state); + } else { + /* Schedule async partition scan for trusted daemons */ + if (!ub->unprivileged_daemons) + schedule_work(&ub->partition_scan_work); + } out_put_cdev: if (ret) { diff --git a/include/uapi/linux/ublk_cmd.h b/include/uapi/linux/ublk_cmd.h index 743d31491387..a88876756805 100644 --- a/include/uapi/linux/ublk_cmd.h +++ b/include/uapi/linux/ublk_cmd.h @@ -367,6 +367,9 @@ */ #define UBLK_F_SAFE_STOP_DEV (1ULL << 17) +/* Disable automatic partition scanning when device is started */ +#define UBLK_F_NO_AUTO_PART_SCAN (1ULL << 18) + /* device state */ #define UBLK_S_DEV_DEAD 0 #define UBLK_S_DEV_LIVE 1 -- cgit v1.2.3 From 503efe850c7463a1e59df133b84461ef53c0361f Mon Sep 17 00:00:00 2001 From: Wang Yaxin Date: Mon, 19 Jan 2026 10:02:41 +0800 Subject: delayacct: add timestamp of delay max Problem ======= Commit 658eb5ab916d ("delayacct: add delay max to record delay peak") introduced the delay max for getdelays, which records abnormal latency peaks and helps us understand the magnitude of such delays. However, the peak latency value alone is insufficient for effective root cause analysis. Without the precise timestamp of when the peak occurred, we still lack the critical context needed to correlate it with other system events. Solution ======== To address this, we need to additionally record a precise timestamp when the maximum latency occurs. By correlating this timestamp with system logs and monitoring metrics, we can identify processes with abnormal resource usage at the same moment, which can help us to pinpoint root causes. Use Case ======== bash-4.4# ./getdelays -d -t 227 print delayacct stats ON TGID 227 CPU count real total virtual total delay total delay average delay max delay min delay max timestamp 46 188000000 192348334 4098012 0.089ms 0.429260ms 0.051205ms 2026-01-15T15:06:58 IO count delay total delay average delay max delay min delay max timestamp 0 0 0.000ms 0.000000ms 0.000000ms N/A SWAP count delay total delay average delay max delay min delay max timestamp 0 0 0.000ms 0.000000ms 0.000000ms N/A RECLAIM count delay total delay average delay max delay min delay max timestamp 0 0 0.000ms 0.000000ms 0.000000ms N/A THRAS HING count delay total delay average delay max delay min delay max timestamp 0 0 0.000ms 0.000000ms 0.000000ms N/A COMPACT count delay total delay average delay max delay min delay max timestamp 0 0 0.000ms 0.000000ms 0.000000ms N/A WPCOPY count delay total delay average delay max delay min delay max timestamp 182 19413338 0.107ms 0.547353ms 0.022462ms 2026-01-15T15:05:24 IRQ count delay total delay average delay max delay min delay max timestamp 0 0 0.000ms 0.000000ms 0.000000ms N/A Link: https://lkml.kernel.org/r/20260119100241520gWubW8-5QfhSf9gjqcc_E@zte.com.cn Signed-off-by: Wang Yaxin Cc: Fan Yu Cc: Jonathan Corbet Cc: xu xin Cc: Yang Yang Signed-off-by: Andrew Morton --- Documentation/accounting/delay-accounting.rst | 32 ++--- include/linux/delayacct.h | 8 ++ include/linux/sched.h | 5 + include/uapi/linux/taskstats.h | 22 +++- kernel/delayacct.c | 31 +++-- kernel/sched/stats.h | 8 +- tools/accounting/getdelays.c | 172 ++++++++++++++++++++++---- 7 files changed, 223 insertions(+), 55 deletions(-) (limited to 'include/uapi/linux') diff --git a/Documentation/accounting/delay-accounting.rst b/Documentation/accounting/delay-accounting.rst index 86d7902a657f..e209c46241b0 100644 --- a/Documentation/accounting/delay-accounting.rst +++ b/Documentation/accounting/delay-accounting.rst @@ -107,22 +107,22 @@ Get sum and peak of delays, since system boot, for all pids with tgid 242:: TGID 242 - CPU count real total virtual total delay total delay average delay max delay min - 39 156000000 156576579 2111069 0.054ms 0.212296ms 0.031307ms - IO count delay total delay average delay max delay min - 0 0 0.000ms 0.000000ms 0.000000ms - SWAP count delay total delay average delay max delay min - 0 0 0.000ms 0.000000ms 0.000000ms - RECLAIM count delay total delay average delay max delay min - 0 0 0.000ms 0.000000ms 0.000000ms - THRASHING count delay total delay average delay max delay min - 0 0 0.000ms 0.000000ms 0.000000ms - COMPACT count delay total delay average delay max delay min - 0 0 0.000ms 0.000000ms 0.000000ms - WPCOPY count delay total delay average delay max delay min - 156 11215873 0.072ms 0.207403ms 0.033913ms - IRQ count delay total delay average delay max delay min - 0 0 0.000ms 0.000000ms 0.000000ms + CPU count real total virtual total delay total delay average delay max delay min delay max timestamp + 46 188000000 192348334 4098012 0.089ms 0.429260ms 0.051205ms 2026-01-15T15:06:58 + IO count delay total delay average delay max delay min delay max timestamp + 0 0 0.000ms 0.000000ms 0.000000ms N/A + SWAP count delay total delay average delay max delay min delay max timestamp + 0 0 0.000ms 0.000000ms 0.000000ms N/A + RECLAIM count delay total delay average delay max delay min delay max timestamp + 0 0 0.000ms 0.000000ms 0.000000ms N/A + THRASHING count delay total delay average delay max delay min delay max timestamp + 0 0 0.000ms 0.000000ms 0.000000ms N/A + COMPACT count delay total delay average delay max delay min delay max timestamp + 0 0 0.000ms 0.000000ms 0.000000ms N/A + WPCOPY count delay total delay average delay max delay min delay max timestamp + 182 19413338 0.107ms 0.547353ms 0.022462ms 2026-01-15T15:05:24 + IRQ count delay total delay average delay max delay min delay max timestamp + 0 0 0.000ms 0.000000ms 0.000000ms N/A Get IO accounting for pid 1, it works only with -p:: diff --git a/include/linux/delayacct.h b/include/linux/delayacct.h index 800dcc360db2..ecb06f16d22c 100644 --- a/include/linux/delayacct.h +++ b/include/linux/delayacct.h @@ -69,6 +69,14 @@ struct task_delay_info { u32 compact_count; /* total count of memory compact */ u32 wpcopy_count; /* total count of write-protect copy */ u32 irq_count; /* total count of IRQ/SOFTIRQ */ + + struct timespec64 blkio_delay_max_ts; + struct timespec64 swapin_delay_max_ts; + struct timespec64 freepages_delay_max_ts; + struct timespec64 thrashing_delay_max_ts; + struct timespec64 compact_delay_max_ts; + struct timespec64 wpcopy_delay_max_ts; + struct timespec64 irq_delay_max_ts; }; #endif diff --git a/include/linux/sched.h b/include/linux/sched.h index da0133524d08..1d22b6229b95 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -49,6 +49,7 @@ #include #include #include +#include #ifndef COMPILE_OFFSETS #include #endif @@ -86,6 +87,7 @@ struct signal_struct; struct task_delay_info; struct task_group; struct task_struct; +struct timespec64; struct user_event_mm; #include @@ -435,6 +437,9 @@ struct sched_info { /* When were we last queued to run? */ unsigned long long last_queued; + /* Timestamp of max time spent waiting on a runqueue: */ + struct timespec64 max_run_delay_ts; + #endif /* CONFIG_SCHED_INFO */ }; diff --git a/include/uapi/linux/taskstats.h b/include/uapi/linux/taskstats.h index 5929030d4e8b..1b31e8e14d2f 100644 --- a/include/uapi/linux/taskstats.h +++ b/include/uapi/linux/taskstats.h @@ -18,6 +18,16 @@ #define _LINUX_TASKSTATS_H #include +#ifdef __KERNEL__ +#include +#else +#ifndef _LINUX_TIME64_H +struct timespec64 { + __s64 tv_sec; /* seconds */ + long tv_nsec; /* nanoseconds */ +}; +#endif +#endif /* Format for per-task data returned to userland when * - a task exits @@ -34,7 +44,7 @@ */ -#define TASKSTATS_VERSION 16 +#define TASKSTATS_VERSION 17 #define TS_COMM_LEN 32 /* should be >= TASK_COMM_LEN * in linux/sched.h */ @@ -230,6 +240,16 @@ struct taskstats { __u64 irq_delay_max; __u64 irq_delay_min; + + /*v17: delay max timestamp record*/ + struct timespec64 cpu_delay_max_ts; + struct timespec64 blkio_delay_max_ts; + struct timespec64 swapin_delay_max_ts; + struct timespec64 freepages_delay_max_ts; + struct timespec64 thrashing_delay_max_ts; + struct timespec64 compact_delay_max_ts; + struct timespec64 wpcopy_delay_max_ts; + struct timespec64 irq_delay_max_ts; }; diff --git a/kernel/delayacct.c b/kernel/delayacct.c index 30e7912ebb0d..d58ffc63bcba 100644 --- a/kernel/delayacct.c +++ b/kernel/delayacct.c @@ -18,6 +18,7 @@ do { \ d->type##_delay_max = tsk->delays->type##_delay_max; \ d->type##_delay_min = tsk->delays->type##_delay_min; \ + d->type##_delay_max_ts = tsk->delays->type##_delay_max_ts; \ tmp = d->type##_delay_total + tsk->delays->type##_delay; \ d->type##_delay_total = (tmp < d->type##_delay_total) ? 0 : tmp; \ d->type##_count += tsk->delays->type##_count; \ @@ -104,7 +105,8 @@ void __delayacct_tsk_init(struct task_struct *tsk) * Finish delay accounting for a statistic using its timestamps (@start), * accumulator (@total) and @count */ -static void delayacct_end(raw_spinlock_t *lock, u64 *start, u64 *total, u32 *count, u64 *max, u64 *min) +static void delayacct_end(raw_spinlock_t *lock, u64 *start, u64 *total, u32 *count, + u64 *max, u64 *min, struct timespec64 *ts) { s64 ns = local_clock() - *start; unsigned long flags; @@ -113,8 +115,10 @@ static void delayacct_end(raw_spinlock_t *lock, u64 *start, u64 *total, u32 *cou raw_spin_lock_irqsave(lock, flags); *total += ns; (*count)++; - if (ns > *max) + if (ns > *max) { *max = ns; + ktime_get_real_ts64(ts); + } if (*min == 0 || ns < *min) *min = ns; raw_spin_unlock_irqrestore(lock, flags); @@ -137,7 +141,8 @@ void __delayacct_blkio_end(struct task_struct *p) &p->delays->blkio_delay, &p->delays->blkio_count, &p->delays->blkio_delay_max, - &p->delays->blkio_delay_min); + &p->delays->blkio_delay_min, + &p->delays->blkio_delay_max_ts); } int delayacct_add_tsk(struct taskstats *d, struct task_struct *tsk) @@ -170,6 +175,7 @@ int delayacct_add_tsk(struct taskstats *d, struct task_struct *tsk) d->cpu_delay_max = tsk->sched_info.max_run_delay; d->cpu_delay_min = tsk->sched_info.min_run_delay; + d->cpu_delay_max_ts = tsk->sched_info.max_run_delay_ts; tmp = (s64)d->cpu_delay_total + t2; d->cpu_delay_total = (tmp < (s64)d->cpu_delay_total) ? 0 : tmp; tmp = (s64)d->cpu_run_virtual_total + t3; @@ -217,7 +223,8 @@ void __delayacct_freepages_end(void) ¤t->delays->freepages_delay, ¤t->delays->freepages_count, ¤t->delays->freepages_delay_max, - ¤t->delays->freepages_delay_min); + ¤t->delays->freepages_delay_min, + ¤t->delays->freepages_delay_max_ts); } void __delayacct_thrashing_start(bool *in_thrashing) @@ -241,7 +248,8 @@ void __delayacct_thrashing_end(bool *in_thrashing) ¤t->delays->thrashing_delay, ¤t->delays->thrashing_count, ¤t->delays->thrashing_delay_max, - ¤t->delays->thrashing_delay_min); + ¤t->delays->thrashing_delay_min, + ¤t->delays->thrashing_delay_max_ts); } void __delayacct_swapin_start(void) @@ -256,7 +264,8 @@ void __delayacct_swapin_end(void) ¤t->delays->swapin_delay, ¤t->delays->swapin_count, ¤t->delays->swapin_delay_max, - ¤t->delays->swapin_delay_min); + ¤t->delays->swapin_delay_min, + ¤t->delays->swapin_delay_max_ts); } void __delayacct_compact_start(void) @@ -271,7 +280,8 @@ void __delayacct_compact_end(void) ¤t->delays->compact_delay, ¤t->delays->compact_count, ¤t->delays->compact_delay_max, - ¤t->delays->compact_delay_min); + ¤t->delays->compact_delay_min, + ¤t->delays->compact_delay_max_ts); } void __delayacct_wpcopy_start(void) @@ -286,7 +296,8 @@ void __delayacct_wpcopy_end(void) ¤t->delays->wpcopy_delay, ¤t->delays->wpcopy_count, ¤t->delays->wpcopy_delay_max, - ¤t->delays->wpcopy_delay_min); + ¤t->delays->wpcopy_delay_min, + ¤t->delays->wpcopy_delay_max_ts); } void __delayacct_irq(struct task_struct *task, u32 delta) @@ -296,8 +307,10 @@ void __delayacct_irq(struct task_struct *task, u32 delta) raw_spin_lock_irqsave(&task->delays->lock, flags); task->delays->irq_delay += delta; task->delays->irq_count++; - if (delta > task->delays->irq_delay_max) + if (delta > task->delays->irq_delay_max) { task->delays->irq_delay_max = delta; + ktime_get_real_ts64(&task->delays->irq_delay_max_ts); + } if (delta && (!task->delays->irq_delay_min || delta < task->delays->irq_delay_min)) task->delays->irq_delay_min = delta; raw_spin_unlock_irqrestore(&task->delays->lock, flags); diff --git a/kernel/sched/stats.h b/kernel/sched/stats.h index c903f1a42891..a612cf253c87 100644 --- a/kernel/sched/stats.h +++ b/kernel/sched/stats.h @@ -253,8 +253,10 @@ static inline void sched_info_dequeue(struct rq *rq, struct task_struct *t) delta = rq_clock(rq) - t->sched_info.last_queued; t->sched_info.last_queued = 0; t->sched_info.run_delay += delta; - if (delta > t->sched_info.max_run_delay) + if (delta > t->sched_info.max_run_delay) { t->sched_info.max_run_delay = delta; + ktime_get_real_ts64(&t->sched_info.max_run_delay_ts); + } if (delta && (!t->sched_info.min_run_delay || delta < t->sched_info.min_run_delay)) t->sched_info.min_run_delay = delta; rq_sched_info_dequeue(rq, delta); @@ -278,8 +280,10 @@ static void sched_info_arrive(struct rq *rq, struct task_struct *t) t->sched_info.run_delay += delta; t->sched_info.last_arrival = now; t->sched_info.pcount++; - if (delta > t->sched_info.max_run_delay) + if (delta > t->sched_info.max_run_delay) { t->sched_info.max_run_delay = delta; + ktime_get_real_ts64(&t->sched_info.max_run_delay_ts); + } if (delta && (!t->sched_info.min_run_delay || delta < t->sched_info.min_run_delay)) t->sched_info.min_run_delay = delta; diff --git a/tools/accounting/getdelays.c b/tools/accounting/getdelays.c index 21cb3c3d1331..64796c0223be 100644 --- a/tools/accounting/getdelays.c +++ b/tools/accounting/getdelays.c @@ -24,6 +24,7 @@ #include #include #include +#include #include #include @@ -194,6 +195,37 @@ static int get_family_id(int sd) #define average_ms(t, c) (t / 1000000ULL / (c ? c : 1)) #define delay_ms(t) (t / 1000000ULL) +/* + * Format timespec64 to human readable string (YYYY-MM-DD HH:MM:SS) + * Returns formatted string or "N/A" if timestamp is zero + */ +static const char *format_timespec64(struct timespec64 *ts) +{ + static char buffer[32]; + struct tm tm_info; + time_t time_sec; + + /* Check if timestamp is zero (not set) */ + if (ts->tv_sec == 0 && ts->tv_nsec == 0) + return "N/A"; + + time_sec = (time_t)ts->tv_sec; + + /* Use thread-safe localtime_r */ + if (localtime_r(&time_sec, &tm_info) == NULL) + return "N/A"; + + snprintf(buffer, sizeof(buffer), "%04d-%02d-%02dT%02d:%02d:%02d", + tm_info.tm_year + 1900, + tm_info.tm_mon + 1, + tm_info.tm_mday, + tm_info.tm_hour, + tm_info.tm_min, + tm_info.tm_sec); + + return buffer; +} + /* * Version compatibility note: * Field availability depends on taskstats version (t->version), @@ -205,13 +237,28 @@ static int get_family_id(int sd) * version >= 13 - supports WPCOPY statistics * version >= 14 - supports IRQ statistics * version >= 16 - supports *_max and *_min delay statistics + * version >= 17 - supports delay max timestamp statistics * * Always verify version before accessing version-dependent fields * to maintain backward compatibility. */ #define PRINT_CPU_DELAY(version, t) \ do { \ - if (version >= 16) { \ + if (version >= 17) { \ + printf("%-10s%15s%15s%15s%15s%15s%15s%15s%25s\n", \ + "CPU", "count", "real total", "virtual total", \ + "delay total", "delay average", "delay max", \ + "delay min", "delay max timestamp"); \ + printf(" %15llu%15llu%15llu%15llu%15.3fms%13.6fms%13.6fms%23s\n", \ + (unsigned long long)(t)->cpu_count, \ + (unsigned long long)(t)->cpu_run_real_total, \ + (unsigned long long)(t)->cpu_run_virtual_total, \ + (unsigned long long)(t)->cpu_delay_total, \ + average_ms((double)(t)->cpu_delay_total, (t)->cpu_count), \ + delay_ms((double)(t)->cpu_delay_max), \ + delay_ms((double)(t)->cpu_delay_min), \ + format_timespec64(&(t)->cpu_delay_max_ts)); \ + } else if (version >= 16) { \ printf("%-10s%15s%15s%15s%15s%15s%15s%15s\n", \ "CPU", "count", "real total", "virtual total", \ "delay total", "delay average", "delay max", "delay min"); \ @@ -257,44 +304,115 @@ static int get_family_id(int sd) } \ } while (0) +#define PRINT_FILED_DELAY_WITH_TS(name, version, t, count, total, max, min, max_ts) \ + do { \ + if (version >= 17) { \ + printf("%-10s%15s%15s%15s%15s%15s%25s\n", \ + name, "count", "delay total", "delay average", \ + "delay max", "delay min", "delay max timestamp"); \ + printf(" %15llu%15llu%15.3fms%13.6fms%13.6fms%23s\n", \ + (unsigned long long)(t)->count, \ + (unsigned long long)(t)->total, \ + average_ms((double)(t)->total, (t)->count), \ + delay_ms((double)(t)->max), \ + delay_ms((double)(t)->min), \ + format_timespec64(&(t)->max_ts)); \ + } else if (version >= 16) { \ + printf("%-10s%15s%15s%15s%15s%15s\n", \ + name, "count", "delay total", "delay average", \ + "delay max", "delay min"); \ + printf(" %15llu%15llu%15.3fms%13.6fms%13.6fms\n", \ + (unsigned long long)(t)->count, \ + (unsigned long long)(t)->total, \ + average_ms((double)(t)->total, (t)->count), \ + delay_ms((double)(t)->max), \ + delay_ms((double)(t)->min)); \ + } else { \ + printf("%-10s%15s%15s%15s\n", \ + name, "count", "delay total", "delay average"); \ + printf(" %15llu%15llu%15.3fms\n", \ + (unsigned long long)(t)->count, \ + (unsigned long long)(t)->total, \ + average_ms((double)(t)->total, (t)->count)); \ + } \ + } while (0) + static void print_delayacct(struct taskstats *t) { printf("\n\n"); PRINT_CPU_DELAY(t->version, t); - PRINT_FILED_DELAY("IO", t->version, t, - blkio_count, blkio_delay_total, - blkio_delay_max, blkio_delay_min); + /* Use new macro with timestamp support for version >= 17 */ + if (t->version >= 17) { + PRINT_FILED_DELAY_WITH_TS("IO", t->version, t, + blkio_count, blkio_delay_total, + blkio_delay_max, blkio_delay_min, blkio_delay_max_ts); - PRINT_FILED_DELAY("SWAP", t->version, t, - swapin_count, swapin_delay_total, - swapin_delay_max, swapin_delay_min); + PRINT_FILED_DELAY_WITH_TS("SWAP", t->version, t, + swapin_count, swapin_delay_total, + swapin_delay_max, swapin_delay_min, swapin_delay_max_ts); - PRINT_FILED_DELAY("RECLAIM", t->version, t, - freepages_count, freepages_delay_total, - freepages_delay_max, freepages_delay_min); + PRINT_FILED_DELAY_WITH_TS("RECLAIM", t->version, t, + freepages_count, freepages_delay_total, + freepages_delay_max, freepages_delay_min, freepages_delay_max_ts); - PRINT_FILED_DELAY("THRASHING", t->version, t, - thrashing_count, thrashing_delay_total, - thrashing_delay_max, thrashing_delay_min); + PRINT_FILED_DELAY_WITH_TS("THRASHING", t->version, t, + thrashing_count, thrashing_delay_total, + thrashing_delay_max, thrashing_delay_min, thrashing_delay_max_ts); - if (t->version >= 11) { - PRINT_FILED_DELAY("COMPACT", t->version, t, - compact_count, compact_delay_total, - compact_delay_max, compact_delay_min); - } + if (t->version >= 11) { + PRINT_FILED_DELAY_WITH_TS("COMPACT", t->version, t, + compact_count, compact_delay_total, + compact_delay_max, compact_delay_min, compact_delay_max_ts); + } - if (t->version >= 13) { - PRINT_FILED_DELAY("WPCOPY", t->version, t, - wpcopy_count, wpcopy_delay_total, - wpcopy_delay_max, wpcopy_delay_min); - } + if (t->version >= 13) { + PRINT_FILED_DELAY_WITH_TS("WPCOPY", t->version, t, + wpcopy_count, wpcopy_delay_total, + wpcopy_delay_max, wpcopy_delay_min, wpcopy_delay_max_ts); + } - if (t->version >= 14) { - PRINT_FILED_DELAY("IRQ", t->version, t, - irq_count, irq_delay_total, - irq_delay_max, irq_delay_min); + if (t->version >= 14) { + PRINT_FILED_DELAY_WITH_TS("IRQ", t->version, t, + irq_count, irq_delay_total, + irq_delay_max, irq_delay_min, irq_delay_max_ts); + } + } else { + /* Use original macro for older versions */ + PRINT_FILED_DELAY("IO", t->version, t, + blkio_count, blkio_delay_total, + blkio_delay_max, blkio_delay_min); + + PRINT_FILED_DELAY("SWAP", t->version, t, + swapin_count, swapin_delay_total, + swapin_delay_max, swapin_delay_min); + + PRINT_FILED_DELAY("RECLAIM", t->version, t, + freepages_count, freepages_delay_total, + freepages_delay_max, freepages_delay_min); + + PRINT_FILED_DELAY("THRASHING", t->version, t, + thrashing_count, thrashing_delay_total, + thrashing_delay_max, thrashing_delay_min); + + if (t->version >= 11) { + PRINT_FILED_DELAY("COMPACT", t->version, t, + compact_count, compact_delay_total, + compact_delay_max, compact_delay_min); + } + + if (t->version >= 13) { + PRINT_FILED_DELAY("WPCOPY", t->version, t, + wpcopy_count, wpcopy_delay_total, + wpcopy_delay_max, wpcopy_delay_min); + } + + if (t->version >= 14) { + PRINT_FILED_DELAY("IRQ", t->version, t, + irq_count, irq_delay_total, + irq_delay_max, irq_delay_min); + } } } -- cgit v1.2.3 From 072e6f7f416f5d17be71000b31fb108651ad360d Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Fri, 30 Jan 2026 16:21:06 +0100 Subject: wifi: cfg80211: add initial UHR support Add initial support for making UHR connections (or suppressing that), adding UHR capable stations on the AP side, encoding and decoding UHR MCSes (except rate calculation for the new MCSes 17, 19, 20 and 23) as well as regulatory support. Link: https://patch.msgid.link/20260130164259.54cc12fbb307.I26126bebd83c7ab17e99827489f946ceabb3521f@changeid Signed-off-by: Johannes Berg --- include/net/cfg80211.h | 58 ++++++++++++++++++++++-- include/uapi/linux/nl80211.h | 30 +++++++++++++ net/wireless/nl80211.c | 102 +++++++++++++++++++++++++++++++++++++++++-- net/wireless/reg.c | 4 +- net/wireless/util.c | 101 ++++++++++++++++++++++++++++++++---------- 5 files changed, 265 insertions(+), 30 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index 7911ed58abbb..fc01de19c798 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -7,7 +7,7 @@ * Copyright 2006-2010 Johannes Berg * Copyright 2013-2014 Intel Mobile Communications GmbH * Copyright 2015-2017 Intel Deutschland GmbH - * Copyright (C) 2018-2025 Intel Corporation + * Copyright (C) 2018-2026 Intel Corporation */ #include @@ -126,6 +126,7 @@ struct wiphy; * @IEEE80211_CHAN_NO_4MHZ: 4 MHz bandwidth is not permitted on this channel. * @IEEE80211_CHAN_NO_8MHZ: 8 MHz bandwidth is not permitted on this channel. * @IEEE80211_CHAN_NO_16MHZ: 16 MHz bandwidth is not permitted on this channel. + * @IEEE80211_CHAN_NO_UHR: UHR operation is not permitted on this channel. */ enum ieee80211_channel_flags { IEEE80211_CHAN_DISABLED = BIT(0), @@ -143,6 +144,7 @@ enum ieee80211_channel_flags { IEEE80211_CHAN_NO_10MHZ = BIT(12), IEEE80211_CHAN_NO_HE = BIT(13), /* can use free bits here */ + IEEE80211_CHAN_NO_UHR = BIT(18), IEEE80211_CHAN_NO_320MHZ = BIT(19), IEEE80211_CHAN_NO_EHT = BIT(20), IEEE80211_CHAN_DFS_CONCURRENT = BIT(21), @@ -429,6 +431,18 @@ struct ieee80211_sta_eht_cap { u8 eht_ppe_thres[IEEE80211_EHT_PPE_THRES_MAX_LEN]; }; +/** + * struct ieee80211_sta_uhr_cap - STA's UHR capabilities + * @has_uhr: true iff UHR is supported and data is valid + * @mac: fixed MAC capabilities + * @phy: fixed PHY capabilities + */ +struct ieee80211_sta_uhr_cap { + bool has_uhr; + struct ieee80211_uhr_cap_mac mac; + struct ieee80211_uhr_cap_phy phy; +}; + /* sparse defines __CHECKER__; see Documentation/dev-tools/sparse.rst */ #ifdef __CHECKER__ /* @@ -454,6 +468,7 @@ struct ieee80211_sta_eht_cap { * @he_6ghz_capa: HE 6 GHz capabilities, must be filled in for a * 6 GHz band channel (and 0 may be valid value). * @eht_cap: STA's EHT capabilities + * @uhr_cap: STA's UHR capabilities * @vendor_elems: vendor element(s) to advertise * @vendor_elems.data: vendor element(s) data * @vendor_elems.len: vendor element(s) length @@ -463,6 +478,7 @@ struct ieee80211_sband_iftype_data { struct ieee80211_sta_he_cap he_cap; struct ieee80211_he_6ghz_capa he_6ghz_capa; struct ieee80211_sta_eht_cap eht_cap; + struct ieee80211_sta_uhr_cap uhr_cap; struct { const u8 *data; unsigned int len; @@ -704,6 +720,26 @@ ieee80211_get_eht_iftype_cap(const struct ieee80211_supported_band *sband, return NULL; } +/** + * ieee80211_get_uhr_iftype_cap - return UHR capabilities for an sband's iftype + * @sband: the sband to search for the iftype on + * @iftype: enum nl80211_iftype + * + * Return: pointer to the struct ieee80211_sta_uhr_cap, or NULL is none found + */ +static inline const struct ieee80211_sta_uhr_cap * +ieee80211_get_uhr_iftype_cap(const struct ieee80211_supported_band *sband, + enum nl80211_iftype iftype) +{ + const struct ieee80211_sband_iftype_data *data = + ieee80211_get_sband_iftype_data(sband, iftype); + + if (data && data->uhr_cap.has_uhr) + return &data->uhr_cap; + + return NULL; +} + /** * wiphy_read_of_freq_limits - read frequency limits from device tree * @@ -1486,6 +1522,7 @@ struct cfg80211_s1g_short_beacon { * @he_cap: HE capabilities (or %NULL if HE isn't enabled) * @eht_cap: EHT capabilities (or %NULL if EHT isn't enabled) * @eht_oper: EHT operation IE (or %NULL if EHT isn't enabled) + * @uhr_oper: UHR operation (or %NULL if UHR isn't enabled) * @ht_required: stations must support HT * @vht_required: stations must support VHT * @twt_responder: Enable Target Wait Time @@ -1525,6 +1562,7 @@ struct cfg80211_ap_settings { const struct ieee80211_he_operation *he_oper; const struct ieee80211_eht_cap_elem *eht_cap; const struct ieee80211_eht_operation *eht_oper; + const struct ieee80211_uhr_operation *uhr_oper; bool ht_required, vht_required, he_required, sae_h2e_required; bool twt_responder; u32 flags; @@ -1698,6 +1736,8 @@ struct sta_txpwr { * @eht_capa: EHT capabilities of station * @eht_capa_len: the length of the EHT capabilities * @s1g_capa: S1G capabilities of station + * @uhr_capa: UHR capabilities of the station + * @uhr_capa_len: the length of the UHR capabilities */ struct link_station_parameters { const u8 *mld_mac; @@ -1717,6 +1757,8 @@ struct link_station_parameters { const struct ieee80211_eht_cap_elem *eht_capa; u8 eht_capa_len; const struct ieee80211_s1g_cap *s1g_capa; + const struct ieee80211_uhr_cap *uhr_capa; + u8 uhr_capa_len; }; /** @@ -1898,6 +1940,11 @@ int cfg80211_check_station_change(struct wiphy *wiphy, * @RATE_INFO_FLAGS_EXTENDED_SC_DMG: 60GHz extended SC MCS * @RATE_INFO_FLAGS_EHT_MCS: EHT MCS information * @RATE_INFO_FLAGS_S1G_MCS: MCS field filled with S1G MCS + * @RATE_INFO_FLAGS_UHR_MCS: UHR MCS information + * @RATE_INFO_FLAGS_UHR_ELR_MCS: UHR ELR MCS was used + * (set together with @RATE_INFO_FLAGS_UHR_MCS) + * @RATE_INFO_FLAGS_UHR_IM: UHR Interference Mitigation + * was used */ enum rate_info_flags { RATE_INFO_FLAGS_MCS = BIT(0), @@ -1909,6 +1956,9 @@ enum rate_info_flags { RATE_INFO_FLAGS_EXTENDED_SC_DMG = BIT(6), RATE_INFO_FLAGS_EHT_MCS = BIT(7), RATE_INFO_FLAGS_S1G_MCS = BIT(8), + RATE_INFO_FLAGS_UHR_MCS = BIT(9), + RATE_INFO_FLAGS_UHR_ELR_MCS = BIT(10), + RATE_INFO_FLAGS_UHR_IM = BIT(11), }; /** @@ -1924,7 +1974,7 @@ enum rate_info_flags { * @RATE_INFO_BW_160: 160 MHz bandwidth * @RATE_INFO_BW_HE_RU: bandwidth determined by HE RU allocation * @RATE_INFO_BW_320: 320 MHz bandwidth - * @RATE_INFO_BW_EHT_RU: bandwidth determined by EHT RU allocation + * @RATE_INFO_BW_EHT_RU: bandwidth determined by EHT/UHR RU allocation * @RATE_INFO_BW_1: 1 MHz bandwidth * @RATE_INFO_BW_2: 2 MHz bandwidth * @RATE_INFO_BW_4: 4 MHz bandwidth @@ -1955,7 +2005,7 @@ enum rate_info_bw { * * @flags: bitflag of flags from &enum rate_info_flags * @legacy: bitrate in 100kbit/s for 802.11abg - * @mcs: mcs index if struct describes an HT/VHT/HE/EHT/S1G rate + * @mcs: mcs index if struct describes an HT/VHT/HE/EHT/S1G/UHR rate * @nss: number of streams (VHT & HE only) * @bw: bandwidth (from &enum rate_info_bw) * @he_gi: HE guard interval (from &enum nl80211_he_gi) @@ -3262,6 +3312,7 @@ struct cfg80211_ml_reconf_req { * Drivers shall disable MLO features for the current association if this * flag is not set. * @ASSOC_REQ_SPP_AMSDU: SPP A-MSDUs will be used on this connection (if any) + * @ASSOC_REQ_DISABLE_UHR: Disable UHR */ enum cfg80211_assoc_req_flags { ASSOC_REQ_DISABLE_HT = BIT(0), @@ -3272,6 +3323,7 @@ enum cfg80211_assoc_req_flags { ASSOC_REQ_DISABLE_EHT = BIT(5), CONNECT_REQ_MLO_SUPPORT = BIT(6), ASSOC_REQ_SPP_AMSDU = BIT(7), + ASSOC_REQ_DISABLE_UHR = BIT(8), }; /** diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h index 706a98686068..b63f71850906 100644 --- a/include/uapi/linux/nl80211.h +++ b/include/uapi/linux/nl80211.h @@ -2977,6 +2977,13 @@ enum nl80211_commands { * @NL80211_ATTR_EPP_PEER: A flag attribute to indicate if the peer is an EPP * STA. Used with %NL80211_CMD_NEW_STA and %NL80211_CMD_ADD_LINK_STA * + * @NL80211_ATTR_UHR_CAPABILITY: UHR Capability information element (from + * association request when used with NL80211_CMD_NEW_STATION). Can be set + * only if HE/EHT are also available. + * @NL80211_ATTR_DISABLE_UHR: Force UHR capable interfaces to disable + * this feature during association. This is a flag attribute. + * Currently only supported in mac80211 drivers. + * * @NUM_NL80211_ATTR: total number of nl80211_attrs available * @NL80211_ATTR_MAX: highest attribute number currently defined * @__NL80211_ATTR_AFTER_LAST: internal use @@ -3547,6 +3554,9 @@ enum nl80211_attrs { NL80211_ATTR_EPP_PEER, + NL80211_ATTR_UHR_CAPABILITY, + NL80211_ATTR_DISABLE_UHR, + /* add attributes here, update the policy in nl80211.c */ __NL80211_ATTR_AFTER_LAST, @@ -3899,6 +3909,12 @@ enum nl80211_eht_ru_alloc { * @NL80211_RATE_INFO_4_MHZ_WIDTH: 4 MHz S1G rate * @NL80211_RATE_INFO_8_MHZ_WIDTH: 8 MHz S1G rate * @NL80211_RATE_INFO_16_MHZ_WIDTH: 16 MHz S1G rate + * @NL80211_RATE_INFO_UHR_MCS: UHR MCS index (u8, 0-15, 17, 19, 20, 23) + * Note that the other EHT attributes (such as @NL80211_RATE_INFO_EHT_NSS) + * are used in conjunction with this where applicable + * @NL80211_RATE_INFO_UHR_ELR: UHR ELR flag, which restricts NSS to 1, + * MCS to 0 or 1, and GI to %NL80211_RATE_INFO_EHT_GI_1_6. + * @NL80211_RATE_INFO_UHR_IM: UHR Interference Mitigation flag * @__NL80211_RATE_INFO_AFTER_LAST: internal use */ enum nl80211_rate_info { @@ -3932,6 +3948,9 @@ enum nl80211_rate_info { NL80211_RATE_INFO_4_MHZ_WIDTH, NL80211_RATE_INFO_8_MHZ_WIDTH, NL80211_RATE_INFO_16_MHZ_WIDTH, + NL80211_RATE_INFO_UHR_MCS, + NL80211_RATE_INFO_UHR_ELR, + NL80211_RATE_INFO_UHR_IM, /* keep last */ __NL80211_RATE_INFO_AFTER_LAST, @@ -4254,6 +4273,10 @@ enum nl80211_mpath_info { * capabilities element * @NL80211_BAND_IFTYPE_ATTR_EHT_CAP_PPE: EHT PPE thresholds information as * defined in EHT capabilities element + * @NL80211_BAND_IFTYPE_ATTR_UHR_CAP_MAC: UHR MAC capabilities as in UHR + * capabilities element + * @NL80211_BAND_IFTYPE_ATTR_UHR_CAP_PHY: UHR PHY capabilities as in UHR + * capabilities element * @__NL80211_BAND_IFTYPE_ATTR_AFTER_LAST: internal use * @NL80211_BAND_IFTYPE_ATTR_MAX: highest band attribute currently defined */ @@ -4271,6 +4294,8 @@ enum nl80211_band_iftype_attr { NL80211_BAND_IFTYPE_ATTR_EHT_CAP_PHY, NL80211_BAND_IFTYPE_ATTR_EHT_CAP_MCS_SET, NL80211_BAND_IFTYPE_ATTR_EHT_CAP_PPE, + NL80211_BAND_IFTYPE_ATTR_UHR_CAP_MAC, + NL80211_BAND_IFTYPE_ATTR_UHR_CAP_PHY, /* keep last */ __NL80211_BAND_IFTYPE_ATTR_AFTER_LAST, @@ -4453,6 +4478,8 @@ enum nl80211_wmm_rule { * @NL80211_FREQUENCY_ATTR_S1G_NO_PRIMARY: Channel is not permitted for use * as a primary channel. Does not prevent the channel from existing * as a non-primary subchannel. Only applicable to S1G channels. + * @NL80211_FREQUENCY_ATTR_NO_UHR: UHR operation is not allowed on this channel + * in current regulatory domain. * @NL80211_FREQUENCY_ATTR_MAX: highest frequency attribute number * currently defined * @__NL80211_FREQUENCY_ATTR_AFTER_LAST: internal use @@ -4502,6 +4529,7 @@ enum nl80211_frequency_attr { NL80211_FREQUENCY_ATTR_NO_8MHZ, NL80211_FREQUENCY_ATTR_NO_16MHZ, NL80211_FREQUENCY_ATTR_S1G_NO_PRIMARY, + NL80211_FREQUENCY_ATTR_NO_UHR, /* keep last */ __NL80211_FREQUENCY_ATTR_AFTER_LAST, @@ -4715,6 +4743,7 @@ enum nl80211_sched_scan_match_attr { * despite NO_IR configuration. * @NL80211_RRF_ALLOW_20MHZ_ACTIVITY: Allow activity in 20 MHz bandwidth, * despite NO_IR configuration. + * @NL80211_RRF_NO_UHR: UHR operation not allowed */ enum nl80211_reg_rule_flags { NL80211_RRF_NO_OFDM = 1 << 0, @@ -4741,6 +4770,7 @@ enum nl80211_reg_rule_flags { NL80211_RRF_NO_6GHZ_AFC_CLIENT = 1 << 23, NL80211_RRF_ALLOW_6GHZ_VLP_AP = 1 << 24, NL80211_RRF_ALLOW_20MHZ_ACTIVITY = 1 << 25, + NL80211_RRF_NO_UHR = 1 << 26, }; #define NL80211_RRF_PASSIVE_SCAN NL80211_RRF_NO_IR diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index 9aa83a6943a2..6e58b238a1f8 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -332,6 +332,15 @@ static int validate_nan_cluster_id(const struct nlattr *attr, return 0; } +static int validate_uhr_capa(const struct nlattr *attr, + struct netlink_ext_ack *extack) +{ + const u8 *data = nla_data(attr); + unsigned int len = nla_len(attr); + + return ieee80211_uhr_capa_size_ok(data, len, false); +} + /* policy for the attributes */ static const struct nla_policy nl80211_policy[NUM_NL80211_ATTR]; @@ -934,6 +943,9 @@ static const struct nla_policy nl80211_policy[NUM_NL80211_ATTR] = { [NL80211_ATTR_BSS_PARAM] = { .type = NLA_FLAG }, [NL80211_ATTR_S1G_PRIMARY_2MHZ] = { .type = NLA_FLAG }, [NL80211_ATTR_EPP_PEER] = { .type = NLA_FLAG }, + [NL80211_ATTR_UHR_CAPABILITY] = + NLA_POLICY_VALIDATE_FN(NLA_BINARY, validate_uhr_capa, 255), + [NL80211_ATTR_DISABLE_UHR] = { .type = NLA_FLAG }, }; /* policy for the key attributes */ @@ -1319,6 +1331,9 @@ static int nl80211_msg_put_channel(struct sk_buff *msg, struct wiphy *wiphy, if ((chan->flags & IEEE80211_CHAN_S1G_NO_PRIMARY) && nla_put_flag(msg, NL80211_FREQUENCY_ATTR_S1G_NO_PRIMARY)) goto nla_put_failure; + if ((chan->flags & IEEE80211_CHAN_NO_UHR) && + nla_put_flag(msg, NL80211_FREQUENCY_ATTR_NO_UHR)) + goto nla_put_failure; } if (nla_put_u32(msg, NL80211_FREQUENCY_ATTR_MAX_TX_POWER, @@ -1954,6 +1969,7 @@ nl80211_send_iftype_data(struct sk_buff *msg, { const struct ieee80211_sta_he_cap *he_cap = &iftdata->he_cap; const struct ieee80211_sta_eht_cap *eht_cap = &iftdata->eht_cap; + const struct ieee80211_sta_uhr_cap *uhr_cap = &iftdata->uhr_cap; if (nl80211_put_iftypes(msg, NL80211_BAND_IFTYPE_ATTR_IFTYPES, iftdata->types_mask)) @@ -2005,6 +2021,14 @@ nl80211_send_iftype_data(struct sk_buff *msg, return -ENOBUFS; } + if (uhr_cap->has_uhr) { + if (nla_put(msg, NL80211_BAND_IFTYPE_ATTR_UHR_CAP_MAC, + sizeof(uhr_cap->mac), &uhr_cap->mac) || + nla_put(msg, NL80211_BAND_IFTYPE_ATTR_UHR_CAP_PHY, + sizeof(uhr_cap->phy), &uhr_cap->phy)) + return -ENOBUFS; + } + if (sband->band == NL80211_BAND_6GHZ && nla_put(msg, NL80211_BAND_IFTYPE_ATTR_HE_6GHZ_CAPA, sizeof(iftdata->he_6ghz_capa), @@ -6462,6 +6486,17 @@ static int nl80211_calculate_ap_params(struct cfg80211_ap_settings *params) cap->datalen - 1)) return -EINVAL; } + + cap = cfg80211_find_ext_elem(WLAN_EID_EXT_UHR_OPER, ies, ies_len); + if (cap) { + if (!cap->datalen) + return -EINVAL; + params->uhr_oper = (void *)(cap->data + 1); + if (!ieee80211_uhr_oper_size_ok((const u8 *)params->uhr_oper, + cap->datalen - 1, true)) + return -EINVAL; + } + return 0; } @@ -6593,6 +6628,9 @@ static int nl80211_validate_ap_phy_operation(struct cfg80211_ap_settings *params (channel->flags & IEEE80211_CHAN_NO_EHT)) return -EOPNOTSUPP; + if (params->uhr_oper && (channel->flags & IEEE80211_CHAN_NO_UHR)) + return -EOPNOTSUPP; + return 0; } @@ -7175,7 +7213,8 @@ bool nl80211_put_sta_rate(struct sk_buff *msg, struct rate_info *info, int attr) break; case RATE_INFO_BW_EHT_RU: rate_flg = 0; - WARN_ON(!(info->flags & RATE_INFO_FLAGS_EHT_MCS)); + WARN_ON(!(info->flags & RATE_INFO_FLAGS_EHT_MCS) && + !(info->flags & RATE_INFO_FLAGS_UHR_MCS)); break; } @@ -7228,6 +7267,23 @@ bool nl80211_put_sta_rate(struct sk_buff *msg, struct rate_info *info, int attr) nla_put_u8(msg, NL80211_RATE_INFO_EHT_RU_ALLOC, info->eht_ru_alloc)) return false; + } else if (info->flags & RATE_INFO_FLAGS_UHR_MCS) { + if (nla_put_u8(msg, NL80211_RATE_INFO_UHR_MCS, info->mcs)) + return false; + if (nla_put_u8(msg, NL80211_RATE_INFO_EHT_NSS, info->nss)) + return false; + if (nla_put_u8(msg, NL80211_RATE_INFO_EHT_GI, info->eht_gi)) + return false; + if (info->bw == RATE_INFO_BW_EHT_RU && + nla_put_u8(msg, NL80211_RATE_INFO_EHT_RU_ALLOC, + info->eht_ru_alloc)) + return false; + if (info->flags & RATE_INFO_FLAGS_UHR_ELR_MCS && + nla_put_flag(msg, NL80211_RATE_INFO_UHR_ELR)) + return false; + if (info->flags & RATE_INFO_FLAGS_UHR_IM && + nla_put_flag(msg, NL80211_RATE_INFO_UHR_IM)) + return false; } nla_nest_end(msg, rate); @@ -8101,7 +8157,8 @@ int cfg80211_check_station_change(struct wiphy *wiphy, if (params->ext_capab || params->link_sta_params.ht_capa || params->link_sta_params.vht_capa || params->link_sta_params.he_capa || - params->link_sta_params.eht_capa) + params->link_sta_params.eht_capa || + params->link_sta_params.uhr_capa) return -EINVAL; if (params->sta_flags_mask & BIT(NL80211_STA_FLAG_SPP_AMSDU)) return -EINVAL; @@ -8321,6 +8378,16 @@ static int nl80211_set_station_tdls(struct genl_info *info, } } + if (info->attrs[NL80211_ATTR_UHR_CAPABILITY]) { + if (!params->link_sta_params.eht_capa) + return -EINVAL; + + params->link_sta_params.uhr_capa = + nla_data(info->attrs[NL80211_ATTR_UHR_CAPABILITY]); + params->link_sta_params.uhr_capa_len = + nla_len(info->attrs[NL80211_ATTR_UHR_CAPABILITY]); + } + if (info->attrs[NL80211_ATTR_S1G_CAPABILITY]) params->link_sta_params.s1g_capa = nla_data(info->attrs[NL80211_ATTR_S1G_CAPABILITY]); @@ -8641,6 +8708,16 @@ static int nl80211_new_station(struct sk_buff *skb, struct genl_info *info) } } + if (info->attrs[NL80211_ATTR_UHR_CAPABILITY]) { + if (!params.link_sta_params.eht_capa) + return -EINVAL; + + params.link_sta_params.uhr_capa = + nla_data(info->attrs[NL80211_ATTR_UHR_CAPABILITY]); + params.link_sta_params.uhr_capa_len = + nla_len(info->attrs[NL80211_ATTR_UHR_CAPABILITY]); + } + if (info->attrs[NL80211_ATTR_EML_CAPABILITY]) { params.eml_cap_present = true; params.eml_cap = @@ -8700,10 +8777,11 @@ static int nl80211_new_station(struct sk_buff *skb, struct genl_info *info) params.link_sta_params.ht_capa = NULL; params.link_sta_params.vht_capa = NULL; - /* HE and EHT require WME */ + /* HE, EHT and UHR require WME */ if (params.link_sta_params.he_capa_len || params.link_sta_params.he_6ghz_capa || - params.link_sta_params.eht_capa_len) + params.link_sta_params.eht_capa_len || + params.link_sta_params.uhr_capa_len) return -EINVAL; } @@ -12376,6 +12454,9 @@ static int nl80211_associate(struct sk_buff *skb, struct genl_info *info) if (nla_get_flag(info->attrs[NL80211_ATTR_DISABLE_EHT])) req.flags |= ASSOC_REQ_DISABLE_EHT; + if (nla_get_flag(info->attrs[NL80211_ATTR_DISABLE_UHR])) + req.flags |= ASSOC_REQ_DISABLE_UHR; + if (info->attrs[NL80211_ATTR_VHT_CAPABILITY_MASK]) memcpy(&req.vht_capa_mask, nla_data(info->attrs[NL80211_ATTR_VHT_CAPABILITY_MASK]), @@ -13248,6 +13329,9 @@ static int nl80211_connect(struct sk_buff *skb, struct genl_info *info) if (nla_get_flag(info->attrs[NL80211_ATTR_DISABLE_EHT])) connect.flags |= ASSOC_REQ_DISABLE_EHT; + if (nla_get_flag(info->attrs[NL80211_ATTR_DISABLE_UHR])) + connect.flags |= ASSOC_REQ_DISABLE_UHR; + if (info->attrs[NL80211_ATTR_VHT_CAPABILITY_MASK]) memcpy(&connect.vht_capa_mask, nla_data(info->attrs[NL80211_ATTR_VHT_CAPABILITY_MASK]), @@ -17680,6 +17764,16 @@ nl80211_add_mod_link_station(struct sk_buff *skb, struct genl_info *info, } } + if (info->attrs[NL80211_ATTR_UHR_CAPABILITY]) { + if (!params.eht_capa) + return -EINVAL; + + params.uhr_capa = + nla_data(info->attrs[NL80211_ATTR_UHR_CAPABILITY]); + params.uhr_capa_len = + nla_len(info->attrs[NL80211_ATTR_UHR_CAPABILITY]); + } + if (info->attrs[NL80211_ATTR_HE_6GHZ_CAPABILITY]) params.he_6ghz_capa = nla_data(info->attrs[NL80211_ATTR_HE_6GHZ_CAPABILITY]); diff --git a/net/wireless/reg.c b/net/wireless/reg.c index 6cbfa3b78311..139cb27e5a81 100644 --- a/net/wireless/reg.c +++ b/net/wireless/reg.c @@ -5,7 +5,7 @@ * Copyright 2008-2011 Luis R. Rodriguez * Copyright 2013-2014 Intel Mobile Communications GmbH * Copyright 2017 Intel Deutschland GmbH - * Copyright (C) 2018 - 2025 Intel Corporation + * Copyright (C) 2018 - 2026 Intel Corporation * * Permission to use, copy, modify, and/or distribute this software for any * purpose with or without fee is hereby granted, provided that the above @@ -1605,6 +1605,8 @@ static u32 map_regdom_flags(u32 rd_flags) channel_flags |= IEEE80211_CHAN_ALLOW_6GHZ_VLP_AP; if (rd_flags & NL80211_RRF_ALLOW_20MHZ_ACTIVITY) channel_flags |= IEEE80211_CHAN_ALLOW_20MHZ_ACTIVITY; + if (rd_flags & NL80211_RRF_NO_UHR) + channel_flags |= IEEE80211_CHAN_NO_UHR; return channel_flags; } diff --git a/net/wireless/util.c b/net/wireless/util.c index 08c525835518..404fe604a8db 100644 --- a/net/wireless/util.c +++ b/net/wireless/util.c @@ -5,7 +5,7 @@ * Copyright 2007-2009 Johannes Berg * Copyright 2013-2014 Intel Mobile Communications GmbH * Copyright 2017 Intel Deutschland GmbH - * Copyright (C) 2018-2023, 2025 Intel Corporation + * Copyright (C) 2018-2023, 2025-2026 Intel Corporation */ #include #include @@ -1574,26 +1574,30 @@ static u32 cfg80211_calculate_bitrate_he(struct rate_info *rate) return result / 10000; } -static u32 cfg80211_calculate_bitrate_eht(struct rate_info *rate) +static u32 _cfg80211_calculate_bitrate_eht_uhr(struct rate_info *rate) { #define SCALE 6144 - static const u32 mcs_divisors[16] = { - 102399, /* 16.666666... */ - 51201, /* 8.333333... */ - 34134, /* 5.555555... */ - 25599, /* 4.166666... */ - 17067, /* 2.777777... */ - 12801, /* 2.083333... */ - 11377, /* 1.851725... */ - 10239, /* 1.666666... */ - 8532, /* 1.388888... */ - 7680, /* 1.250000... */ - 6828, /* 1.111111... */ - 6144, /* 1.000000... */ - 5690, /* 0.926106... */ - 5120, /* 0.833333... */ - 409600, /* 66.666666... */ - 204800, /* 33.333333... */ + static const u32 mcs_divisors[] = { + [ 0] = 102399, /* 16.666666... */ + [ 1] = 51201, /* 8.333333... */ + [ 2] = 34134, /* 5.555555... */ + [ 3] = 25599, /* 4.166666... */ + [ 4] = 17067, /* 2.777777... */ + [ 5] = 12801, /* 2.083333... */ + [ 6] = 11377, /* 1.851725... */ + [ 7] = 10239, /* 1.666666... */ + [ 8] = 8532, /* 1.388888... */ + [ 9] = 7680, /* 1.250000... */ + [10] = 6828, /* 1.111111... */ + [11] = 6144, /* 1.000000... */ + [12] = 5690, /* 0.926106... */ + [13] = 5120, /* 0.833333... */ + [14] = 409600, /* 66.666666... */ + [15] = 204800, /* 33.333333... */ + [17] = 38400, /* 6.250180... */ + [19] = 19200, /* 3.125090... */ + [20] = 15360, /* 2.500000... */ + [23] = 9600, /* 1.562545... */ }; static const u32 rates_996[3] = { 480388888, 453700000, 408333333 }; static const u32 rates_484[3] = { 229411111, 216666666, 195000000 }; @@ -1604,8 +1608,6 @@ static u32 cfg80211_calculate_bitrate_eht(struct rate_info *rate) u64 tmp; u32 result; - if (WARN_ON_ONCE(rate->mcs > 15)) - return 0; if (WARN_ON_ONCE(rate->eht_gi > NL80211_RATE_INFO_EHT_GI_3_2)) return 0; if (WARN_ON_ONCE(rate->eht_ru_alloc > @@ -1686,7 +1688,7 @@ static u32 cfg80211_calculate_bitrate_eht(struct rate_info *rate) rate->eht_ru_alloc == NL80211_RATE_INFO_EHT_RU_ALLOC_26) result = rates_26[rate->eht_gi]; else { - WARN(1, "invalid EHT MCS: bw:%d, ru:%d\n", + WARN(1, "invalid EHT or UHR MCS: bw:%d, ru:%d\n", rate->bw, rate->eht_ru_alloc); return 0; } @@ -1700,11 +1702,64 @@ static u32 cfg80211_calculate_bitrate_eht(struct rate_info *rate) tmp *= rate->nss; do_div(tmp, 8); + /* and handle interference mitigation - 0.9x */ + if (rate->flags & RATE_INFO_FLAGS_UHR_IM) { + if (WARN(rate->nss != 1 || rate->mcs == 15, + "invalid NSS or MCS for UHR IM\n")) + return 0; + tmp *= 9000; + do_div(tmp, 10000); + } + result = tmp; return result / 10000; } +static u32 cfg80211_calculate_bitrate_eht(struct rate_info *rate) +{ + if (WARN_ONCE(rate->mcs > 15, "bad EHT MCS %d\n", rate->mcs)) + return 0; + + if (WARN_ONCE(rate->flags & (RATE_INFO_FLAGS_UHR_ELR_MCS | + RATE_INFO_FLAGS_UHR_IM), + "bad EHT MCS flags 0x%x\n", rate->flags)) + return 0; + + return _cfg80211_calculate_bitrate_eht_uhr(rate); +} + +static u32 cfg80211_calculate_bitrate_uhr(struct rate_info *rate) +{ + if (rate->flags & RATE_INFO_FLAGS_UHR_ELR_MCS) { + WARN_ONCE(rate->eht_gi != NL80211_RATE_INFO_EHT_GI_1_6, + "bad UHR ELR guard interval %d\n", + rate->eht_gi); + WARN_ONCE(rate->mcs > 1, "bad UHR ELR MCS %d\n", rate->mcs); + WARN_ONCE(rate->nss != 1, "bad UHR ELR NSS %d\n", rate->nss); + WARN_ONCE(rate->bw != RATE_INFO_BW_20, + "bad UHR ELR bandwidth %d\n", + rate->bw); + WARN_ONCE(rate->flags & RATE_INFO_FLAGS_UHR_IM, + "bad UHR MCS flags 0x%x\n", rate->flags); + if (rate->mcs == 0) + return 17; + return 33; + } + + switch (rate->mcs) { + case 0 ... 15: + case 17: + case 19: + case 20: + case 23: + return _cfg80211_calculate_bitrate_eht_uhr(rate); + } + + WARN_ONCE(1, "bad UHR MCS %d\n", rate->mcs); + return 0; +} + static u32 cfg80211_calculate_bitrate_s1g(struct rate_info *rate) { /* For 1, 2, 4, 8 and 16 MHz channels */ @@ -1829,6 +1884,8 @@ u32 cfg80211_calculate_bitrate(struct rate_info *rate) return cfg80211_calculate_bitrate_he(rate); if (rate->flags & RATE_INFO_FLAGS_EHT_MCS) return cfg80211_calculate_bitrate_eht(rate); + if (rate->flags & RATE_INFO_FLAGS_UHR_MCS) + return cfg80211_calculate_bitrate_uhr(rate); if (rate->flags & RATE_INFO_FLAGS_S1G_MCS) return cfg80211_calculate_bitrate_s1g(rate); -- cgit v1.2.3 From 3495064b6d65a669b409cfe1241db4f3c540251a Mon Sep 17 00:00:00 2001 From: Babis Chalios Date: Fri, 30 Jan 2026 17:36:00 +0000 Subject: ptp: vmclock: add vm generation counter Similar to live migration, loading a VM from some saved state (aka snapshot) is also an event that calls for clock adjustments in the guest. However, guests might want to take more actions as a response to such events, e.g. as discarding UUIDs, resetting network connections, reseeding entropy pools, etc. These are actions that guests don't typically take during live migration, so add a new field in the vmclock_abi called vm_generation_counter which informs the guest about such events. Hypervisor advertises support for vm_generation_counter through the VMCLOCK_FLAG_VM_GEN_COUNTER_PRESENT flag. Users need to check the presence of this bit in vmclock_abi flags field before using this flag. Signed-off-by: Babis Chalios Signed-off-by: David Woodhouse Reviewed-by: David Woodhouse Tested-by: Takahiro Itazur Link: https://patch.msgid.link/20260130173704.12575-2-itazur@amazon.com Signed-off-by: Jakub Kicinski --- include/uapi/linux/vmclock-abi.h | 15 +++++++++++++++ 1 file changed, 15 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/vmclock-abi.h b/include/uapi/linux/vmclock-abi.h index 2d99b29ac44a..937fe00e4f33 100644 --- a/include/uapi/linux/vmclock-abi.h +++ b/include/uapi/linux/vmclock-abi.h @@ -115,6 +115,12 @@ struct vmclock_abi { * bit again after the update, using the about-to-be-valid fields. */ #define VMCLOCK_FLAG_TIME_MONOTONIC (1 << 7) + /* + * If the VM_GEN_COUNTER_PRESENT flag is set, the hypervisor will + * bump the vm_generation_counter field every time the guest is + * loaded from some save state (restored from a snapshot). + */ +#define VMCLOCK_FLAG_VM_GEN_COUNTER_PRESENT (1 << 8) __u8 pad[2]; __u8 clock_status; @@ -177,6 +183,15 @@ struct vmclock_abi { __le64 time_frac_sec; /* Units of 1/2^64 of a second */ __le64 time_esterror_nanosec; __le64 time_maxerror_nanosec; + + /* + * This field changes to another non-repeating value when the guest + * has been loaded from a snapshot. In addition to handling a + * disruption in time (which will also be signalled through the + * disruption_marker field), a guest may wish to discard UUIDs, + * reset network connections, reseed entropy, etc. + */ + __le64 vm_generation_counter; }; #endif /* __VMCLOCK_ABI_H__ */ -- cgit v1.2.3 From 3b1526ddb25452385b52f2588b655f524a57070b Mon Sep 17 00:00:00 2001 From: Babis Chalios Date: Fri, 30 Jan 2026 17:36:01 +0000 Subject: ptp: vmclock: support device notifications Add optional support for device notifications in VMClock. When supported, the hypervisor will send a device notification every time it updates the seq_count to a new even value. Moreover, add support for poll() in VMClock as a means to propagate this notification to user space. poll() will return a POLLIN event to listeners every time seq_count changes to a value different than the one last seen (since open() or last read()/pread()). This means that when poll() returns a POLLIN event, listeners need to use read() to observe what has changed and update the reader's view of seq_count. In other words, after a poll() returned, all subsequent calls to poll() will immediately return with a POLLIN event until the listener calls read(). The device advertises support for the notification mechanism by setting flag VMCLOCK_FLAG_NOTIFICATION_PRESENT in vmclock_abi flags field. If the flag is not present the driver won't setup the ACPI notification handler and poll() will always immediately return POLLHUP. Signed-off-by: Babis Chalios Signed-off-by: David Woodhouse Signed-off-by: Takahiro Itazuri Reviewed-by: David Woodhouse Tested-by: Takahiro Itazuri Link: https://patch.msgid.link/20260130173704.12575-3-itazur@amazon.com Signed-off-by: Jakub Kicinski --- drivers/ptp/ptp_vmclock.c | 162 ++++++++++++++++++++++++++++++++++----- include/uapi/linux/vmclock-abi.h | 5 ++ 2 files changed, 148 insertions(+), 19 deletions(-) (limited to 'include/uapi/linux') diff --git a/drivers/ptp/ptp_vmclock.c b/drivers/ptp/ptp_vmclock.c index b3a83b03d9c1..f8b24f9e85cd 100644 --- a/drivers/ptp/ptp_vmclock.c +++ b/drivers/ptp/ptp_vmclock.c @@ -5,6 +5,9 @@ * Copyright © 2024 Amazon.com, Inc. or its affiliates. */ +#include "linux/poll.h" +#include "linux/types.h" +#include "linux/wait.h" #include #include #include @@ -39,6 +42,7 @@ struct vmclock_state { struct resource res; struct vmclock_abi *clk; struct miscdevice miscdev; + wait_queue_head_t disrupt_wait; struct ptp_clock_info ptp_clock_info; struct ptp_clock *ptp_clock; enum clocksource_ids cs_id, sys_cs_id; @@ -357,10 +361,15 @@ static struct ptp_clock *vmclock_ptp_register(struct device *dev, return ptp_clock_register(&st->ptp_clock_info, dev); } +struct vmclock_file_state { + struct vmclock_state *st; + atomic_t seq; +}; + static int vmclock_miscdev_mmap(struct file *fp, struct vm_area_struct *vma) { - struct vmclock_state *st = container_of(fp->private_data, - struct vmclock_state, miscdev); + struct vmclock_file_state *fst = fp->private_data; + struct vmclock_state *st = fst->st; if ((vma->vm_flags & (VM_READ|VM_WRITE)) != VM_READ) return -EROFS; @@ -379,11 +388,11 @@ static int vmclock_miscdev_mmap(struct file *fp, struct vm_area_struct *vma) static ssize_t vmclock_miscdev_read(struct file *fp, char __user *buf, size_t count, loff_t *ppos) { - struct vmclock_state *st = container_of(fp->private_data, - struct vmclock_state, miscdev); ktime_t deadline = ktime_add(ktime_get(), VMCLOCK_MAX_WAIT); + struct vmclock_file_state *fst = fp->private_data; + struct vmclock_state *st = fst->st; + uint32_t seq, old_seq; size_t max_count; - uint32_t seq; if (*ppos >= PAGE_SIZE) return 0; @@ -392,6 +401,7 @@ static ssize_t vmclock_miscdev_read(struct file *fp, char __user *buf, if (count > max_count) count = max_count; + old_seq = atomic_read(&fst->seq); while (1) { seq = le32_to_cpu(st->clk->seq_count) & ~1U; /* Pairs with hypervisor wmb */ @@ -402,8 +412,16 @@ static ssize_t vmclock_miscdev_read(struct file *fp, char __user *buf, /* Pairs with hypervisor wmb */ virt_rmb(); - if (seq == le32_to_cpu(st->clk->seq_count)) - break; + if (seq == le32_to_cpu(st->clk->seq_count)) { + /* + * Either we updated fst->seq to seq (the latest version we observed) + * or someone else did (old_seq == seq), so we can break. + */ + if (atomic_try_cmpxchg(&fst->seq, &old_seq, seq) || + old_seq == seq) { + break; + } + } if (ktime_after(ktime_get(), deadline)) return -ETIMEDOUT; @@ -413,25 +431,62 @@ static ssize_t vmclock_miscdev_read(struct file *fp, char __user *buf, return count; } +static __poll_t vmclock_miscdev_poll(struct file *fp, poll_table *wait) +{ + struct vmclock_file_state *fst = fp->private_data; + struct vmclock_state *st = fst->st; + uint32_t seq; + + /* + * Hypervisor will not send us any notifications, so fail immediately + * to avoid having caller sleeping for ever. + */ + if (!(le64_to_cpu(st->clk->flags) & VMCLOCK_FLAG_NOTIFICATION_PRESENT)) + return POLLHUP; + + poll_wait(fp, &st->disrupt_wait, wait); + + seq = le32_to_cpu(st->clk->seq_count); + if (atomic_read(&fst->seq) != seq) + return POLLIN | POLLRDNORM; + + return 0; +} + +static int vmclock_miscdev_open(struct inode *inode, struct file *fp) +{ + struct vmclock_state *st = container_of(fp->private_data, + struct vmclock_state, miscdev); + struct vmclock_file_state *fst = kzalloc(sizeof(*fst), GFP_KERNEL); + + if (!fst) + return -ENOMEM; + + fst->st = st; + atomic_set(&fst->seq, 0); + + fp->private_data = fst; + + return 0; +} + +static int vmclock_miscdev_release(struct inode *inode, struct file *fp) +{ + kfree(fp->private_data); + return 0; +} + static const struct file_operations vmclock_miscdev_fops = { .owner = THIS_MODULE, + .open = vmclock_miscdev_open, + .release = vmclock_miscdev_release, .mmap = vmclock_miscdev_mmap, .read = vmclock_miscdev_read, + .poll = vmclock_miscdev_poll, }; /* module operations */ -static void vmclock_remove(void *data) -{ - struct vmclock_state *st = data; - - if (st->ptp_clock) - ptp_clock_unregister(st->ptp_clock); - - if (st->miscdev.minor != MISC_DYNAMIC_MINOR) - misc_deregister(&st->miscdev); -} - static acpi_status vmclock_acpi_resources(struct acpi_resource *ares, void *data) { struct vmclock_state *st = data; @@ -459,6 +514,44 @@ static acpi_status vmclock_acpi_resources(struct acpi_resource *ares, void *data return AE_ERROR; } +static void +vmclock_acpi_notification_handler(acpi_handle __always_unused handle, + u32 __always_unused event, void *dev) +{ + struct device *device = dev; + struct vmclock_state *st = device->driver_data; + + wake_up_interruptible(&st->disrupt_wait); +} + +static int vmclock_setup_notification(struct device *dev, struct vmclock_state *st) +{ + struct acpi_device *adev = ACPI_COMPANION(dev); + acpi_status status; + + /* + * This should never happen as this function is only called when + * has_acpi_companion(dev) is true, but the logic is sufficiently + * complex that Coverity can't see the tautology. + */ + if (!adev) + return -ENODEV; + + /* The device does not support notifications. Nothing else to do */ + if (!(le64_to_cpu(st->clk->flags) & VMCLOCK_FLAG_NOTIFICATION_PRESENT)) + return 0; + + status = acpi_install_notify_handler(adev->handle, ACPI_DEVICE_NOTIFY, + vmclock_acpi_notification_handler, + dev); + if (ACPI_FAILURE(status)) { + dev_err(dev, "failed to install notification handler"); + return -ENODEV; + } + + return 0; +} + static int vmclock_probe_acpi(struct device *dev, struct vmclock_state *st) { struct acpi_device *adev = ACPI_COMPANION(dev); @@ -482,6 +575,30 @@ static int vmclock_probe_acpi(struct device *dev, struct vmclock_state *st) return 0; } +static void vmclock_remove(void *data) +{ + struct device *dev = data; + struct vmclock_state *st = dev->driver_data; + + if (!st) { + dev_err(dev, "%s called with NULL driver_data", __func__); + return; + } + + if (has_acpi_companion(dev)) + acpi_remove_notify_handler(ACPI_COMPANION(dev)->handle, + ACPI_DEVICE_NOTIFY, + vmclock_acpi_notification_handler); + + if (st->ptp_clock) + ptp_clock_unregister(st->ptp_clock); + + if (st->miscdev.minor != MISC_DYNAMIC_MINOR) + misc_deregister(&st->miscdev); + + dev->driver_data = NULL; +} + static void vmclock_put_idx(void *data) { struct vmclock_state *st = data; @@ -545,7 +662,14 @@ static int vmclock_probe(struct platform_device *pdev) st->miscdev.minor = MISC_DYNAMIC_MINOR; - ret = devm_add_action_or_reset(&pdev->dev, vmclock_remove, st); + init_waitqueue_head(&st->disrupt_wait); + dev->driver_data = st; + + ret = devm_add_action_or_reset(&pdev->dev, vmclock_remove, dev); + if (ret) + return ret; + + ret = vmclock_setup_notification(dev, st); if (ret) return ret; diff --git a/include/uapi/linux/vmclock-abi.h b/include/uapi/linux/vmclock-abi.h index 937fe00e4f33..d320623b0118 100644 --- a/include/uapi/linux/vmclock-abi.h +++ b/include/uapi/linux/vmclock-abi.h @@ -121,6 +121,11 @@ struct vmclock_abi { * loaded from some save state (restored from a snapshot). */ #define VMCLOCK_FLAG_VM_GEN_COUNTER_PRESENT (1 << 8) + /* + * If the NOTIFICATION_PRESENT flag is set, the hypervisor will send + * a notification every time it updates seq_count to a new even number. + */ +#define VMCLOCK_FLAG_NOTIFICATION_PRESENT (1 << 9) __u8 pad[2]; __u8 clock_status; -- cgit v1.2.3 From ef6a31d035a1000071dc4846aebd02ad081db9e4 Mon Sep 17 00:00:00 2001 From: Mark Harmstone Date: Wed, 7 Jan 2026 14:09:01 +0000 Subject: btrfs: add definitions and constants for remap-tree Add an incompat flag for the new remap-tree feature, and the constants and definitions needed to support it. Reviewed-by: Boris Burkov Signed-off-by: Mark Harmstone Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/accessors.h | 4 ++++ fs/btrfs/locking.c | 1 + fs/btrfs/sysfs.c | 3 +++ fs/btrfs/tree-checker.c | 6 ++---- fs/btrfs/tree-checker.h | 5 +++++ fs/btrfs/volumes.c | 1 + include/uapi/linux/btrfs.h | 1 + include/uapi/linux/btrfs_tree.h | 17 +++++++++++++++++ 8 files changed, 34 insertions(+), 4 deletions(-) (limited to 'include/uapi/linux') diff --git a/fs/btrfs/accessors.h b/fs/btrfs/accessors.h index 78721412951c..09cdd6bfddf5 100644 --- a/fs/btrfs/accessors.h +++ b/fs/btrfs/accessors.h @@ -1010,6 +1010,10 @@ BTRFS_SETGET_STACK_FUNCS(stack_verity_descriptor_encryption, BTRFS_SETGET_STACK_FUNCS(stack_verity_descriptor_size, struct btrfs_verity_descriptor_item, size, 64); +BTRFS_SETGET_FUNCS(remap_address, struct btrfs_remap_item, address, 64); +BTRFS_SETGET_STACK_FUNCS(stack_remap_address, struct btrfs_remap_item, + address, 64); + /* Cast into the data area of the leaf. */ #define btrfs_item_ptr(leaf, slot, type) \ ((type *)(btrfs_item_nr_offset(leaf, 0) + btrfs_item_offset(leaf, slot))) diff --git a/fs/btrfs/locking.c b/fs/btrfs/locking.c index 0035851d72b0..e3df5ca0b552 100644 --- a/fs/btrfs/locking.c +++ b/fs/btrfs/locking.c @@ -73,6 +73,7 @@ static struct btrfs_lockdep_keyset { { .id = BTRFS_FREE_SPACE_TREE_OBJECTID, DEFINE_NAME("free-space") }, { .id = BTRFS_BLOCK_GROUP_TREE_OBJECTID, DEFINE_NAME("block-group") }, { .id = BTRFS_RAID_STRIPE_TREE_OBJECTID, DEFINE_NAME("raid-stripe") }, + { .id = BTRFS_REMAP_TREE_OBJECTID, DEFINE_NAME("remap") }, { .id = 0, DEFINE_NAME("tree") }, }; diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c index ebd6d1d6778b..8834a1dd499c 100644 --- a/fs/btrfs/sysfs.c +++ b/fs/btrfs/sysfs.c @@ -299,6 +299,8 @@ BTRFS_FEAT_ATTR_INCOMPAT(zoned, ZONED); BTRFS_FEAT_ATTR_INCOMPAT(extent_tree_v2, EXTENT_TREE_V2); /* Remove once support for raid stripe tree is feature complete. */ BTRFS_FEAT_ATTR_INCOMPAT(raid_stripe_tree, RAID_STRIPE_TREE); +/* Remove once support for remap tree is feature complete. */ +BTRFS_FEAT_ATTR_INCOMPAT(remap_tree, REMAP_TREE); #endif #ifdef CONFIG_FS_VERITY BTRFS_FEAT_ATTR_COMPAT_RO(verity, VERITY); @@ -331,6 +333,7 @@ static struct attribute *btrfs_supported_feature_attrs[] = { #ifdef CONFIG_BTRFS_EXPERIMENTAL BTRFS_FEAT_ATTR_PTR(extent_tree_v2), BTRFS_FEAT_ATTR_PTR(raid_stripe_tree), + BTRFS_FEAT_ATTR_PTR(remap_tree), #endif #ifdef CONFIG_FS_VERITY BTRFS_FEAT_ATTR_PTR(verity), diff --git a/fs/btrfs/tree-checker.c b/fs/btrfs/tree-checker.c index c21c21adf61e..aedc208a95b8 100644 --- a/fs/btrfs/tree-checker.c +++ b/fs/btrfs/tree-checker.c @@ -913,12 +913,10 @@ int btrfs_check_chunk_valid(const struct btrfs_fs_info *fs_info, length, btrfs_stripe_nr_to_offset(U32_MAX)); return -EUCLEAN; } - if (unlikely(type & ~(BTRFS_BLOCK_GROUP_TYPE_MASK | - BTRFS_BLOCK_GROUP_PROFILE_MASK))) { + if (unlikely(type & ~BTRFS_BLOCK_GROUP_VALID)) { chunk_err(fs_info, leaf, chunk, logical, "unrecognized chunk type: 0x%llx", - ~(BTRFS_BLOCK_GROUP_TYPE_MASK | - BTRFS_BLOCK_GROUP_PROFILE_MASK) & type); + type & ~BTRFS_BLOCK_GROUP_VALID); return -EUCLEAN; } diff --git a/fs/btrfs/tree-checker.h b/fs/btrfs/tree-checker.h index eb201f4ec3c7..833e2fd989eb 100644 --- a/fs/btrfs/tree-checker.h +++ b/fs/btrfs/tree-checker.h @@ -57,6 +57,11 @@ enum btrfs_tree_block_status { BTRFS_TREE_BLOCK_WRITTEN_NOT_SET, }; + +#define BTRFS_BLOCK_GROUP_VALID (BTRFS_BLOCK_GROUP_TYPE_MASK | \ + BTRFS_BLOCK_GROUP_PROFILE_MASK | \ + BTRFS_BLOCK_GROUP_REMAPPED) + /* * Exported simply for btrfs-progs which wants to have the * btrfs_tree_block_status return codes. diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index c4be17fcb87a..d2b7352eb7cb 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -231,6 +231,7 @@ void btrfs_describe_block_groups(u64 bg_flags, char *buf, u32 size_buf) DESCRIBE_FLAG(BTRFS_BLOCK_GROUP_DATA, "data"); DESCRIBE_FLAG(BTRFS_BLOCK_GROUP_SYSTEM, "system"); DESCRIBE_FLAG(BTRFS_BLOCK_GROUP_METADATA, "metadata"); + DESCRIBE_FLAG(BTRFS_BLOCK_GROUP_REMAPPED, "remapped"); DESCRIBE_FLAG(BTRFS_AVAIL_ALLOC_BIT_SINGLE, "single"); for (i = 0; i < BTRFS_NR_RAID_TYPES; i++) diff --git a/include/uapi/linux/btrfs.h b/include/uapi/linux/btrfs.h index e8fd92789423..9165154a274d 100644 --- a/include/uapi/linux/btrfs.h +++ b/include/uapi/linux/btrfs.h @@ -336,6 +336,7 @@ struct btrfs_ioctl_fs_info_args { #define BTRFS_FEATURE_INCOMPAT_EXTENT_TREE_V2 (1ULL << 13) #define BTRFS_FEATURE_INCOMPAT_RAID_STRIPE_TREE (1ULL << 14) #define BTRFS_FEATURE_INCOMPAT_SIMPLE_QUOTA (1ULL << 16) +#define BTRFS_FEATURE_INCOMPAT_REMAP_TREE (1ULL << 17) struct btrfs_ioctl_feature_flags { __u64 compat_flags; diff --git a/include/uapi/linux/btrfs_tree.h b/include/uapi/linux/btrfs_tree.h index fc29d273845d..f011d34cb699 100644 --- a/include/uapi/linux/btrfs_tree.h +++ b/include/uapi/linux/btrfs_tree.h @@ -76,6 +76,9 @@ /* Tracks RAID stripes in block groups. */ #define BTRFS_RAID_STRIPE_TREE_OBJECTID 12ULL +/* Holds details of remapped addresses after relocation. */ +#define BTRFS_REMAP_TREE_OBJECTID 13ULL + /* device stats in the device tree */ #define BTRFS_DEV_STATS_OBJECTID 0ULL @@ -282,6 +285,10 @@ #define BTRFS_RAID_STRIPE_KEY 230 +#define BTRFS_IDENTITY_REMAP_KEY 234 +#define BTRFS_REMAP_KEY 235 +#define BTRFS_REMAP_BACKREF_KEY 236 + /* * Records the overall state of the qgroups. * There's only one instance of this key present, @@ -1161,6 +1168,7 @@ struct btrfs_dev_replace_item { #define BTRFS_BLOCK_GROUP_RAID6 (1ULL << 8) #define BTRFS_BLOCK_GROUP_RAID1C3 (1ULL << 9) #define BTRFS_BLOCK_GROUP_RAID1C4 (1ULL << 10) +#define BTRFS_BLOCK_GROUP_REMAPPED (1ULL << 11) #define BTRFS_BLOCK_GROUP_RESERVED (BTRFS_AVAIL_ALLOC_BIT_SINGLE | \ BTRFS_SPACE_INFO_GLOBAL_RSV) @@ -1323,4 +1331,13 @@ struct btrfs_verity_descriptor_item { __u8 encryption; } __attribute__ ((__packed__)); +/* + * For a range identified by a BTRFS_REMAP_KEY item in the remap tree, gives + * the address that the start of the range will get remapped to. This + * structure is also shared by BTRFS_REMAP_BACKREF_KEY. + */ +struct btrfs_remap_item { + __le64 address; +} __attribute__ ((__packed__)); + #endif /* _BTRFS_CTREE_H_ */ -- cgit v1.2.3 From 0b4d29fa98ca1a49c4498353253f857573871ba0 Mon Sep 17 00:00:00 2001 From: Mark Harmstone Date: Wed, 7 Jan 2026 14:09:02 +0000 Subject: btrfs: add METADATA_REMAP chunk type Add a new METADATA_REMAP chunk type, which is a metadata chunk that holds the remap tree. This is needed for bootstrapping purposes: the remap tree can't itself be remapped, and must be relocated the existing way, by COWing every leaf. The remap tree can't go in the SYSTEM chunk as space there is limited, because a copy of the chunk item gets placed in the superblock. The changes in fs/btrfs/volumes.h are because we're adding a new block group type bit after the profile bits, and so can no longer rely on the const_ilog2 trick. The sizing to 32MB per chunk, matching the SYSTEM chunk, is an estimate here, we can adjust it later if it proves to be too big or too small. This works out to be ~500,000 remap items, which for a 4KB block size covers ~2GB of remapped data in the worst case and ~500TB in the best case. Reviewed-by: Boris Burkov Signed-off-by: Mark Harmstone Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/block-rsv.c | 8 ++++++++ fs/btrfs/block-rsv.h | 1 + fs/btrfs/disk-io.c | 1 + fs/btrfs/fs.h | 2 ++ fs/btrfs/space-info.c | 13 ++++++++++++- fs/btrfs/sysfs.c | 2 ++ fs/btrfs/tree-checker.c | 13 +++++++++++-- fs/btrfs/volumes.c | 3 +++ fs/btrfs/volumes.h | 10 +++++++++- include/uapi/linux/btrfs_tree.h | 4 +++- 10 files changed, 52 insertions(+), 5 deletions(-) (limited to 'include/uapi/linux') diff --git a/fs/btrfs/block-rsv.c b/fs/btrfs/block-rsv.c index 96cf7a162987..e823230c09b7 100644 --- a/fs/btrfs/block-rsv.c +++ b/fs/btrfs/block-rsv.c @@ -419,6 +419,9 @@ void btrfs_init_root_block_rsv(struct btrfs_root *root) case BTRFS_TREE_LOG_OBJECTID: root->block_rsv = &fs_info->treelog_rsv; break; + case BTRFS_REMAP_TREE_OBJECTID: + root->block_rsv = &fs_info->remap_block_rsv; + break; default: root->block_rsv = NULL; break; @@ -432,6 +435,9 @@ void btrfs_init_global_block_rsv(struct btrfs_fs_info *fs_info) space_info = btrfs_find_space_info(fs_info, BTRFS_BLOCK_GROUP_SYSTEM); fs_info->chunk_block_rsv.space_info = space_info; + space_info = btrfs_find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA_REMAP); + fs_info->remap_block_rsv.space_info = space_info; + space_info = btrfs_find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA); fs_info->global_block_rsv.space_info = space_info; fs_info->trans_block_rsv.space_info = space_info; @@ -458,6 +464,8 @@ void btrfs_release_global_block_rsv(struct btrfs_fs_info *fs_info) WARN_ON(fs_info->trans_block_rsv.reserved > 0); WARN_ON(fs_info->chunk_block_rsv.size > 0); WARN_ON(fs_info->chunk_block_rsv.reserved > 0); + WARN_ON(fs_info->remap_block_rsv.size > 0); + WARN_ON(fs_info->remap_block_rsv.reserved > 0); WARN_ON(fs_info->delayed_block_rsv.size > 0); WARN_ON(fs_info->delayed_block_rsv.reserved > 0); WARN_ON(fs_info->delayed_refs_rsv.reserved > 0); diff --git a/fs/btrfs/block-rsv.h b/fs/btrfs/block-rsv.h index 79ae9d05cd91..8359fb96bc3c 100644 --- a/fs/btrfs/block-rsv.h +++ b/fs/btrfs/block-rsv.h @@ -22,6 +22,7 @@ enum btrfs_rsv_type { BTRFS_BLOCK_RSV_DELALLOC, BTRFS_BLOCK_RSV_TRANS, BTRFS_BLOCK_RSV_CHUNK, + BTRFS_BLOCK_RSV_REMAP, BTRFS_BLOCK_RSV_DELOPS, BTRFS_BLOCK_RSV_DELREFS, BTRFS_BLOCK_RSV_TREELOG, diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index faa1c2c20ecd..922e69038d81 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -2751,6 +2751,7 @@ void btrfs_init_fs_info(struct btrfs_fs_info *fs_info) BTRFS_BLOCK_RSV_GLOBAL); btrfs_init_block_rsv(&fs_info->trans_block_rsv, BTRFS_BLOCK_RSV_TRANS); btrfs_init_block_rsv(&fs_info->chunk_block_rsv, BTRFS_BLOCK_RSV_CHUNK); + btrfs_init_block_rsv(&fs_info->remap_block_rsv, BTRFS_BLOCK_RSV_REMAP); btrfs_init_block_rsv(&fs_info->treelog_rsv, BTRFS_BLOCK_RSV_TREELOG); btrfs_init_block_rsv(&fs_info->empty_block_rsv, BTRFS_BLOCK_RSV_EMPTY); btrfs_init_block_rsv(&fs_info->delayed_block_rsv, diff --git a/fs/btrfs/fs.h b/fs/btrfs/fs.h index e3e5e52e97a2..195428ecfd75 100644 --- a/fs/btrfs/fs.h +++ b/fs/btrfs/fs.h @@ -509,6 +509,8 @@ struct btrfs_fs_info { struct btrfs_block_rsv trans_block_rsv; /* Block reservation for chunk tree */ struct btrfs_block_rsv chunk_block_rsv; + /* Block reservation for remap tree. */ + struct btrfs_block_rsv remap_block_rsv; /* Block reservation for delayed operations */ struct btrfs_block_rsv delayed_block_rsv; /* Block reservation for delayed refs */ diff --git a/fs/btrfs/space-info.c b/fs/btrfs/space-info.c index 1d76242f5e0d..2c9cf1ab232b 100644 --- a/fs/btrfs/space-info.c +++ b/fs/btrfs/space-info.c @@ -215,7 +215,7 @@ static u64 calc_chunk_size(const struct btrfs_fs_info *fs_info, u64 flags) if (flags & BTRFS_BLOCK_GROUP_DATA) return BTRFS_MAX_DATA_CHUNK_SIZE; - else if (flags & BTRFS_BLOCK_GROUP_SYSTEM) + else if (flags & (BTRFS_BLOCK_GROUP_SYSTEM | BTRFS_BLOCK_GROUP_METADATA_REMAP)) return SZ_32M; /* Handle BTRFS_BLOCK_GROUP_METADATA */ @@ -348,6 +348,8 @@ int btrfs_init_space_info(struct btrfs_fs_info *fs_info) if (mixed) { flags = BTRFS_BLOCK_GROUP_METADATA | BTRFS_BLOCK_GROUP_DATA; ret = create_space_info(fs_info, flags); + if (ret) + goto out; } else { flags = BTRFS_BLOCK_GROUP_METADATA; ret = create_space_info(fs_info, flags); @@ -356,7 +358,15 @@ int btrfs_init_space_info(struct btrfs_fs_info *fs_info) flags = BTRFS_BLOCK_GROUP_DATA; ret = create_space_info(fs_info, flags); + if (ret) + goto out; + } + + if (features & BTRFS_FEATURE_INCOMPAT_REMAP_TREE) { + flags = BTRFS_BLOCK_GROUP_METADATA_REMAP; + ret = create_space_info(fs_info, flags); } + out: return ret; } @@ -611,6 +621,7 @@ static void dump_global_block_rsv(struct btrfs_fs_info *fs_info) DUMP_BLOCK_RSV(fs_info, global_block_rsv); DUMP_BLOCK_RSV(fs_info, trans_block_rsv); DUMP_BLOCK_RSV(fs_info, chunk_block_rsv); + DUMP_BLOCK_RSV(fs_info, remap_block_rsv); DUMP_BLOCK_RSV(fs_info, delayed_block_rsv); DUMP_BLOCK_RSV(fs_info, delayed_refs_rsv); } diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c index 8834a1dd499c..27bfb7b55ec4 100644 --- a/fs/btrfs/sysfs.c +++ b/fs/btrfs/sysfs.c @@ -1929,6 +1929,8 @@ static const char *alloc_name(struct btrfs_space_info *space_info) case BTRFS_BLOCK_GROUP_SYSTEM: ASSERT(space_info->subgroup_id == BTRFS_SUB_GROUP_PRIMARY); return "system"; + case BTRFS_BLOCK_GROUP_METADATA_REMAP: + return "metadata-remap"; default: WARN_ON(1); return "invalid-combination"; diff --git a/fs/btrfs/tree-checker.c b/fs/btrfs/tree-checker.c index aedc208a95b8..a6c158cd8fcd 100644 --- a/fs/btrfs/tree-checker.c +++ b/fs/btrfs/tree-checker.c @@ -748,17 +748,26 @@ static int check_block_group_item(struct extent_buffer *leaf, return -EUCLEAN; } + if (unlikely(flags & BTRFS_BLOCK_GROUP_METADATA_REMAP && + !btrfs_fs_incompat(fs_info, REMAP_TREE))) { + block_group_err(leaf, slot, +"invalid flags, have 0x%llx (METADATA_REMAP flag set) but no remap-tree incompat flag", + flags); + return -EUCLEAN; + } + type = flags & BTRFS_BLOCK_GROUP_TYPE_MASK; if (unlikely(type != BTRFS_BLOCK_GROUP_DATA && type != BTRFS_BLOCK_GROUP_METADATA && type != BTRFS_BLOCK_GROUP_SYSTEM && + type != BTRFS_BLOCK_GROUP_METADATA_REMAP && type != (BTRFS_BLOCK_GROUP_METADATA | BTRFS_BLOCK_GROUP_DATA))) { block_group_err(leaf, slot, -"invalid type, have 0x%llx (%lu bits set) expect either 0x%llx, 0x%llx, 0x%llx or 0x%llx", +"invalid type, have 0x%llx (%lu bits set) expect either 0x%llx, 0x%llx, 0x%llx, 0x%llx or 0x%llx", type, hweight64(type), BTRFS_BLOCK_GROUP_DATA, BTRFS_BLOCK_GROUP_METADATA, - BTRFS_BLOCK_GROUP_SYSTEM, + BTRFS_BLOCK_GROUP_SYSTEM, BTRFS_BLOCK_GROUP_METADATA_REMAP, BTRFS_BLOCK_GROUP_METADATA | BTRFS_BLOCK_GROUP_DATA); return -EUCLEAN; } diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index d2b7352eb7cb..eda6505f3ee5 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -231,6 +231,9 @@ void btrfs_describe_block_groups(u64 bg_flags, char *buf, u32 size_buf) DESCRIBE_FLAG(BTRFS_BLOCK_GROUP_DATA, "data"); DESCRIBE_FLAG(BTRFS_BLOCK_GROUP_SYSTEM, "system"); DESCRIBE_FLAG(BTRFS_BLOCK_GROUP_METADATA, "metadata"); + /* Block groups containing the remap tree. */ + DESCRIBE_FLAG(BTRFS_BLOCK_GROUP_METADATA_REMAP, "metadata-remap"); + /* Block group that has been remapped. */ DESCRIBE_FLAG(BTRFS_BLOCK_GROUP_REMAPPED, "remapped"); DESCRIBE_FLAG(BTRFS_AVAIL_ALLOC_BIT_SINGLE, "single"); diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index 59347a4bb185..e4b3cb50f94a 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -58,7 +58,6 @@ static_assert(ilog2(BTRFS_STRIPE_LEN) == BTRFS_STRIPE_LEN_SHIFT); */ static_assert(const_ffs(BTRFS_BLOCK_GROUP_RAID0) < const_ffs(BTRFS_BLOCK_GROUP_PROFILE_MASK & ~BTRFS_BLOCK_GROUP_RAID0)); -static_assert(ilog2(BTRFS_BLOCK_GROUP_RAID0) > ilog2(BTRFS_BLOCK_GROUP_TYPE_MASK)); /* ilog2() can handle both constants and variables */ #define BTRFS_BG_FLAG_TO_INDEX(profile) \ @@ -80,6 +79,15 @@ enum btrfs_raid_types { BTRFS_NR_RAID_TYPES }; +static_assert(BTRFS_RAID_RAID0 == 1); +static_assert(BTRFS_RAID_RAID1 == 2); +static_assert(BTRFS_RAID_DUP == 3); +static_assert(BTRFS_RAID_RAID10 == 4); +static_assert(BTRFS_RAID_RAID5 == 5); +static_assert(BTRFS_RAID_RAID6 == 6); +static_assert(BTRFS_RAID_RAID1C3 == 7); +static_assert(BTRFS_RAID_RAID1C4 == 8); + /* * Use sequence counter to get consistent device stat data on * 32-bit processors. diff --git a/include/uapi/linux/btrfs_tree.h b/include/uapi/linux/btrfs_tree.h index f011d34cb699..76578426671c 100644 --- a/include/uapi/linux/btrfs_tree.h +++ b/include/uapi/linux/btrfs_tree.h @@ -1169,12 +1169,14 @@ struct btrfs_dev_replace_item { #define BTRFS_BLOCK_GROUP_RAID1C3 (1ULL << 9) #define BTRFS_BLOCK_GROUP_RAID1C4 (1ULL << 10) #define BTRFS_BLOCK_GROUP_REMAPPED (1ULL << 11) +#define BTRFS_BLOCK_GROUP_METADATA_REMAP (1ULL << 12) #define BTRFS_BLOCK_GROUP_RESERVED (BTRFS_AVAIL_ALLOC_BIT_SINGLE | \ BTRFS_SPACE_INFO_GLOBAL_RSV) #define BTRFS_BLOCK_GROUP_TYPE_MASK (BTRFS_BLOCK_GROUP_DATA | \ BTRFS_BLOCK_GROUP_SYSTEM | \ - BTRFS_BLOCK_GROUP_METADATA) + BTRFS_BLOCK_GROUP_METADATA | \ + BTRFS_BLOCK_GROUP_METADATA_REMAP) #define BTRFS_BLOCK_GROUP_PROFILE_MASK (BTRFS_BLOCK_GROUP_RAID0 | \ BTRFS_BLOCK_GROUP_RAID1 | \ -- cgit v1.2.3 From 7977011460cffc6f5a0cd830584c832c4aa07076 Mon Sep 17 00:00:00 2001 From: Mark Harmstone Date: Wed, 7 Jan 2026 14:09:07 +0000 Subject: btrfs: add extended version of struct block_group_item Add a struct btrfs_block_group_item_v2, which is used in the block group tree if the remap-tree incompat flag is set. This adds two new fields to the block group item: `remap_bytes` and `identity_remap_count`. `remap_bytes` records the amount of data that's physically within this block group, but nominally in another, remapped block group. This is necessary because this data will need to be moved first if this block group is itself relocated. If `remap_bytes` > 0, this is an indicator to the relocation thread that it will need to search the remap-tree for backrefs. A block group must also have `remap_bytes` == 0 before it can be dropped. `identity_remap_count` records how many identity remap items are located in the remap tree for this block group. When relocation is begun for this block group, this is set to the number of holes in the free-space tree for this range. As identity remaps are converted into actual remaps by the relocation process, this number is decreased. Once it reaches 0, either because of relocation or because extents have been deleted, the block group has been fully remapped and its chunk's device extents are removed. Reviewed-by: Boris Burkov Signed-off-by: Mark Harmstone Signed-off-by: David Sterba --- fs/btrfs/accessors.h | 20 +++++++++ fs/btrfs/block-group.c | 92 ++++++++++++++++++++++++++++++----------- fs/btrfs/block-group.h | 10 ++++- fs/btrfs/discard.c | 2 +- fs/btrfs/tree-checker.c | 10 ++++- include/uapi/linux/btrfs_tree.h | 8 ++++ 6 files changed, 114 insertions(+), 28 deletions(-) (limited to 'include/uapi/linux') diff --git a/fs/btrfs/accessors.h b/fs/btrfs/accessors.h index 09cdd6bfddf5..9797f9e8d4e5 100644 --- a/fs/btrfs/accessors.h +++ b/fs/btrfs/accessors.h @@ -240,6 +240,26 @@ BTRFS_SETGET_FUNCS(block_group_flags, struct btrfs_block_group_item, flags, 64); BTRFS_SETGET_STACK_FUNCS(stack_block_group_flags, struct btrfs_block_group_item, flags, 64); +/* struct btrfs_block_group_item_v2 */ +BTRFS_SETGET_STACK_FUNCS(stack_block_group_v2_used, struct btrfs_block_group_item_v2, + used, 64); +BTRFS_SETGET_FUNCS(block_group_v2_used, struct btrfs_block_group_item_v2, used, 64); +BTRFS_SETGET_STACK_FUNCS(stack_block_group_v2_chunk_objectid, + struct btrfs_block_group_item_v2, chunk_objectid, 64); +BTRFS_SETGET_FUNCS(block_group_v2_chunk_objectid, + struct btrfs_block_group_item_v2, chunk_objectid, 64); +BTRFS_SETGET_STACK_FUNCS(stack_block_group_v2_flags, + struct btrfs_block_group_item_v2, flags, 64); +BTRFS_SETGET_FUNCS(block_group_v2_flags, struct btrfs_block_group_item_v2, flags, 64); +BTRFS_SETGET_STACK_FUNCS(stack_block_group_v2_remap_bytes, + struct btrfs_block_group_item_v2, remap_bytes, 64); +BTRFS_SETGET_FUNCS(block_group_v2_remap_bytes, struct btrfs_block_group_item_v2, + remap_bytes, 64); +BTRFS_SETGET_STACK_FUNCS(stack_block_group_v2_identity_remap_count, + struct btrfs_block_group_item_v2, identity_remap_count, 32); +BTRFS_SETGET_FUNCS(block_group_v2_identity_remap_count, struct btrfs_block_group_item_v2, + identity_remap_count, 32); + /* struct btrfs_free_space_info */ BTRFS_SETGET_FUNCS(free_space_extent_count, struct btrfs_free_space_info, extent_count, 32); diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c index 5709acc84297..a1ab513fa8ea 100644 --- a/fs/btrfs/block-group.c +++ b/fs/btrfs/block-group.c @@ -2371,7 +2371,7 @@ static int check_chunk_block_group_mappings(struct btrfs_fs_info *fs_info) } static int read_one_block_group(struct btrfs_fs_info *info, - struct btrfs_block_group_item *bgi, + struct btrfs_block_group_item_v2 *bgi, const struct btrfs_key *key, int need_clear) { @@ -2386,11 +2386,15 @@ static int read_one_block_group(struct btrfs_fs_info *info, return -ENOMEM; cache->length = key->offset; - cache->used = btrfs_stack_block_group_used(bgi); + cache->used = btrfs_stack_block_group_v2_used(bgi); cache->last_used = cache->used; - cache->flags = btrfs_stack_block_group_flags(bgi); - cache->global_root_id = btrfs_stack_block_group_chunk_objectid(bgi); + cache->flags = btrfs_stack_block_group_v2_flags(bgi); + cache->global_root_id = btrfs_stack_block_group_v2_chunk_objectid(bgi); cache->space_info = btrfs_find_space_info(info, cache->flags); + cache->remap_bytes = btrfs_stack_block_group_v2_remap_bytes(bgi); + cache->last_remap_bytes = cache->remap_bytes; + cache->identity_remap_count = btrfs_stack_block_group_v2_identity_remap_count(bgi); + cache->last_identity_remap_count = cache->identity_remap_count; btrfs_set_free_space_tree_thresholds(cache); @@ -2455,7 +2459,7 @@ static int read_one_block_group(struct btrfs_fs_info *info, } else if (cache->length == cache->used) { cache->cached = BTRFS_CACHE_FINISHED; btrfs_free_excluded_extents(cache); - } else if (cache->used == 0) { + } else if (cache->used == 0 && cache->remap_bytes == 0) { cache->cached = BTRFS_CACHE_FINISHED; ret = btrfs_add_new_free_space(cache, cache->start, cache->start + cache->length, NULL); @@ -2475,7 +2479,7 @@ static int read_one_block_group(struct btrfs_fs_info *info, set_avail_alloc_bits(info, cache->flags); if (btrfs_chunk_writeable(info, cache->start)) { - if (cache->used == 0) { + if (cache->used == 0 && cache->remap_bytes == 0) { ASSERT(list_empty(&cache->bg_list)); if (btrfs_test_opt(info, DISCARD_ASYNC)) btrfs_discard_queue_work(&info->discard_ctl, cache); @@ -2579,9 +2583,10 @@ int btrfs_read_block_groups(struct btrfs_fs_info *info) need_clear = 1; while (1) { - struct btrfs_block_group_item bgi; + struct btrfs_block_group_item_v2 bgi; struct extent_buffer *leaf; int slot; + size_t size; ret = find_first_block_group(info, path, &key); if (ret > 0) @@ -2592,8 +2597,16 @@ int btrfs_read_block_groups(struct btrfs_fs_info *info) leaf = path->nodes[0]; slot = path->slots[0]; + if (btrfs_fs_incompat(info, REMAP_TREE)) { + size = sizeof(struct btrfs_block_group_item_v2); + } else { + size = sizeof(struct btrfs_block_group_item); + btrfs_set_stack_block_group_v2_remap_bytes(&bgi, 0); + btrfs_set_stack_block_group_v2_identity_remap_count(&bgi, 0); + } + read_extent_buffer(leaf, &bgi, btrfs_item_ptr_offset(leaf, slot), - sizeof(bgi)); + size); btrfs_item_key_to_cpu(leaf, &key, slot); btrfs_release_path(path); @@ -2663,25 +2676,34 @@ static int insert_block_group_item(struct btrfs_trans_handle *trans, struct btrfs_block_group *block_group) { struct btrfs_fs_info *fs_info = trans->fs_info; - struct btrfs_block_group_item bgi; + struct btrfs_block_group_item_v2 bgi; struct btrfs_root *root = btrfs_block_group_root(fs_info); struct btrfs_key key; u64 old_last_used; + size_t size; int ret; spin_lock(&block_group->lock); - btrfs_set_stack_block_group_used(&bgi, block_group->used); - btrfs_set_stack_block_group_chunk_objectid(&bgi, - block_group->global_root_id); - btrfs_set_stack_block_group_flags(&bgi, block_group->flags); + btrfs_set_stack_block_group_v2_used(&bgi, block_group->used); + btrfs_set_stack_block_group_v2_chunk_objectid(&bgi, block_group->global_root_id); + btrfs_set_stack_block_group_v2_flags(&bgi, block_group->flags); + btrfs_set_stack_block_group_v2_remap_bytes(&bgi, block_group->remap_bytes); + btrfs_set_stack_block_group_v2_identity_remap_count(&bgi, block_group->identity_remap_count); old_last_used = block_group->last_used; block_group->last_used = block_group->used; + block_group->last_remap_bytes = block_group->remap_bytes; + block_group->last_identity_remap_count = block_group->identity_remap_count; key.objectid = block_group->start; key.type = BTRFS_BLOCK_GROUP_ITEM_KEY; key.offset = block_group->length; spin_unlock(&block_group->lock); - ret = btrfs_insert_item(trans, root, &key, &bgi, sizeof(bgi)); + if (btrfs_fs_incompat(fs_info, REMAP_TREE)) + size = sizeof(struct btrfs_block_group_item_v2); + else + size = sizeof(struct btrfs_block_group_item); + + ret = btrfs_insert_item(trans, root, &key, &bgi, size); if (ret < 0) { spin_lock(&block_group->lock); block_group->last_used = old_last_used; @@ -3132,10 +3154,12 @@ static int update_block_group_item(struct btrfs_trans_handle *trans, struct btrfs_root *root = btrfs_block_group_root(fs_info); unsigned long bi; struct extent_buffer *leaf; - struct btrfs_block_group_item bgi; + struct btrfs_block_group_item_v2 bgi; struct btrfs_key key; - u64 old_last_used; - u64 used; + u64 old_last_used, old_last_remap_bytes; + u32 old_last_identity_remap_count; + u64 used, remap_bytes; + u32 identity_remap_count; /* * Block group items update can be triggered out of commit transaction @@ -3145,13 +3169,21 @@ static int update_block_group_item(struct btrfs_trans_handle *trans, */ spin_lock(&cache->lock); old_last_used = cache->last_used; + old_last_remap_bytes = cache->last_remap_bytes; + old_last_identity_remap_count = cache->last_identity_remap_count; used = cache->used; - /* No change in used bytes, can safely skip it. */ - if (cache->last_used == used) { + remap_bytes = cache->remap_bytes; + identity_remap_count = cache->identity_remap_count; + /* No change in values, can safely skip it. */ + if (cache->last_used == used && + cache->last_remap_bytes == remap_bytes && + cache->last_identity_remap_count == identity_remap_count) { spin_unlock(&cache->lock); return 0; } cache->last_used = used; + cache->last_remap_bytes = remap_bytes; + cache->last_identity_remap_count = identity_remap_count; spin_unlock(&cache->lock); key.objectid = cache->start; @@ -3167,11 +3199,21 @@ static int update_block_group_item(struct btrfs_trans_handle *trans, leaf = path->nodes[0]; bi = btrfs_item_ptr_offset(leaf, path->slots[0]); - btrfs_set_stack_block_group_used(&bgi, used); - btrfs_set_stack_block_group_chunk_objectid(&bgi, - cache->global_root_id); - btrfs_set_stack_block_group_flags(&bgi, cache->flags); - write_extent_buffer(leaf, &bgi, bi, sizeof(bgi)); + btrfs_set_stack_block_group_v2_used(&bgi, used); + btrfs_set_stack_block_group_v2_chunk_objectid(&bgi, cache->global_root_id); + btrfs_set_stack_block_group_v2_flags(&bgi, cache->flags); + + if (btrfs_fs_incompat(fs_info, REMAP_TREE)) { + btrfs_set_stack_block_group_v2_remap_bytes(&bgi, cache->remap_bytes); + btrfs_set_stack_block_group_v2_identity_remap_count(&bgi, + cache->identity_remap_count); + write_extent_buffer(leaf, &bgi, bi, + sizeof(struct btrfs_block_group_item_v2)); + } else { + write_extent_buffer(leaf, &bgi, bi, + sizeof(struct btrfs_block_group_item)); + } + fail: btrfs_release_path(path); /* @@ -3186,6 +3228,8 @@ fail: if (ret < 0 && ret != -ENOENT) { spin_lock(&cache->lock); cache->last_used = old_last_used; + cache->last_remap_bytes = old_last_remap_bytes; + cache->last_identity_remap_count = old_last_identity_remap_count; spin_unlock(&cache->lock); } return ret; diff --git a/fs/btrfs/block-group.h b/fs/btrfs/block-group.h index b0fb85a36d97..ecabb1a9fc0e 100644 --- a/fs/btrfs/block-group.h +++ b/fs/btrfs/block-group.h @@ -129,6 +129,8 @@ struct btrfs_block_group { u64 flags; u64 cache_generation; u64 global_root_id; + u64 remap_bytes; + u32 identity_remap_count; /* * The last committed used bytes of this block group, if the above @used @@ -136,6 +138,11 @@ struct btrfs_block_group { * group item of this block group. */ u64 last_used; + /* The last committed remap_bytes value of this block group. */ + u64 last_remap_bytes; + /* The last commited identity_remap_count value of this block group. */ + u32 last_identity_remap_count; + /* * If the free space extent count exceeds this number, convert the block * group to bitmaps. @@ -282,7 +289,8 @@ static inline bool btrfs_is_block_group_used(const struct btrfs_block_group *bg) { lockdep_assert_held(&bg->lock); - return (bg->used > 0 || bg->reserved > 0 || bg->pinned > 0); + return (bg->used > 0 || bg->reserved > 0 || bg->pinned > 0 || + bg->remap_bytes > 0); } static inline bool btrfs_is_block_group_data_only(const struct btrfs_block_group *block_group) diff --git a/fs/btrfs/discard.c b/fs/btrfs/discard.c index 89fe85778115..ee5f5b2788e1 100644 --- a/fs/btrfs/discard.c +++ b/fs/btrfs/discard.c @@ -373,7 +373,7 @@ void btrfs_discard_queue_work(struct btrfs_discard_ctl *discard_ctl, if (!block_group || !btrfs_test_opt(block_group->fs_info, DISCARD_ASYNC)) return; - if (block_group->used == 0) + if (block_group->used == 0 && block_group->remap_bytes == 0) add_to_discard_unused_list(discard_ctl, block_group); else add_to_discard_list(discard_ctl, block_group); diff --git a/fs/btrfs/tree-checker.c b/fs/btrfs/tree-checker.c index ead2e1e2a0bb..452394b34d01 100644 --- a/fs/btrfs/tree-checker.c +++ b/fs/btrfs/tree-checker.c @@ -688,6 +688,7 @@ static int check_block_group_item(struct extent_buffer *leaf, u64 chunk_objectid; u64 flags; u64 type; + size_t exp_size; /* * Here we don't really care about alignment since extent allocator can @@ -699,10 +700,15 @@ static int check_block_group_item(struct extent_buffer *leaf, return -EUCLEAN; } - if (unlikely(item_size != sizeof(bgi))) { + if (btrfs_fs_incompat(fs_info, REMAP_TREE)) + exp_size = sizeof(struct btrfs_block_group_item_v2); + else + exp_size = sizeof(struct btrfs_block_group_item); + + if (unlikely(item_size != exp_size)) { block_group_err(leaf, slot, "invalid item size, have %u expect %zu", - item_size, sizeof(bgi)); + item_size, exp_size); return -EUCLEAN; } diff --git a/include/uapi/linux/btrfs_tree.h b/include/uapi/linux/btrfs_tree.h index 76578426671c..86820a9644e8 100644 --- a/include/uapi/linux/btrfs_tree.h +++ b/include/uapi/linux/btrfs_tree.h @@ -1229,6 +1229,14 @@ struct btrfs_block_group_item { __le64 flags; } __attribute__ ((__packed__)); +struct btrfs_block_group_item_v2 { + __le64 used; + __le64 chunk_objectid; + __le64 flags; + __le64 remap_bytes; + __le32 identity_remap_count; +} __attribute__ ((__packed__)); + struct btrfs_free_space_info { __le32 extent_count; __le32 flags; -- cgit v1.2.3 From 8620da16fb6be1fd9906374fa1c763a10c6918df Mon Sep 17 00:00:00 2001 From: Mark Harmstone Date: Wed, 7 Jan 2026 14:09:08 +0000 Subject: btrfs: allow mounting filesystems with remap-tree incompat flag If we encounter a filesystem with the remap-tree incompat flag set, validate its compatibility with the other flags, and load the remap tree using the values that have been added to the superblock. The remap-tree feature depends on the free-space-tree, but no-holes and block-group-tree have been made dependencies to reduce the testing matrix. Similarly I'm not aware of any reason why mixed-bg and zoned would be incompatible with remap-tree, but this is blocked for the time being until it can be fully tested. Reviewed-by: Boris Burkov Signed-off-by: Mark Harmstone Signed-off-by: David Sterba --- fs/btrfs/Kconfig | 2 + fs/btrfs/accessors.h | 6 +++ fs/btrfs/disk-io.c | 105 +++++++++++++++++++++++++++++++++++----- fs/btrfs/extent-tree.c | 2 + fs/btrfs/fs.h | 4 +- fs/btrfs/transaction.c | 7 +++ include/uapi/linux/btrfs_tree.h | 5 +- 7 files changed, 116 insertions(+), 15 deletions(-) (limited to 'include/uapi/linux') diff --git a/fs/btrfs/Kconfig b/fs/btrfs/Kconfig index 423122786a93..ede184b6eda1 100644 --- a/fs/btrfs/Kconfig +++ b/fs/btrfs/Kconfig @@ -116,4 +116,6 @@ config BTRFS_EXPERIMENTAL - asynchronous checksum generation for data writes + - remap-tree - logical address remapping tree + If unsure, say N. diff --git a/fs/btrfs/accessors.h b/fs/btrfs/accessors.h index 9797f9e8d4e5..8938357fcb40 100644 --- a/fs/btrfs/accessors.h +++ b/fs/btrfs/accessors.h @@ -883,6 +883,12 @@ BTRFS_SETGET_STACK_FUNCS(super_uuid_tree_generation, struct btrfs_super_block, uuid_tree_generation, 64); BTRFS_SETGET_STACK_FUNCS(super_nr_global_roots, struct btrfs_super_block, nr_global_roots, 64); +BTRFS_SETGET_STACK_FUNCS(super_remap_root, struct btrfs_super_block, + remap_root, 64); +BTRFS_SETGET_STACK_FUNCS(super_remap_root_generation, struct btrfs_super_block, + remap_root_generation, 64); +BTRFS_SETGET_STACK_FUNCS(super_remap_root_level, struct btrfs_super_block, + remap_root_level, 8); /* struct btrfs_file_extent_item */ BTRFS_SETGET_STACK_FUNCS(stack_file_extent_type, struct btrfs_file_extent_item, diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index cd46b9d85880..c69734c74c26 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -1136,6 +1136,8 @@ static struct btrfs_root *btrfs_get_global_root(struct btrfs_fs_info *fs_info, return btrfs_grab_root(btrfs_global_root(fs_info, &key)); case BTRFS_RAID_STRIPE_TREE_OBJECTID: return btrfs_grab_root(fs_info->stripe_root); + case BTRFS_REMAP_TREE_OBJECTID: + return btrfs_grab_root(fs_info->remap_root); default: return NULL; } @@ -1226,6 +1228,7 @@ void btrfs_free_fs_info(struct btrfs_fs_info *fs_info) btrfs_put_root(fs_info->data_reloc_root); btrfs_put_root(fs_info->block_group_root); btrfs_put_root(fs_info->stripe_root); + btrfs_put_root(fs_info->remap_root); btrfs_check_leaked_roots(fs_info); btrfs_extent_buffer_leak_debug_check(fs_info); kfree(fs_info->super_copy); @@ -1778,6 +1781,7 @@ static void free_root_pointers(struct btrfs_fs_info *info, bool free_chunk_root) free_root_extent_buffers(info->data_reloc_root); free_root_extent_buffers(info->block_group_root); free_root_extent_buffers(info->stripe_root); + free_root_extent_buffers(info->remap_root); if (free_chunk_root) free_root_extent_buffers(info->chunk_root); } @@ -2191,21 +2195,44 @@ static int btrfs_read_roots(struct btrfs_fs_info *fs_info) if (ret) goto out; - /* - * This tree can share blocks with some other fs tree during relocation - * and we need a proper setup by btrfs_get_fs_root - */ - root = btrfs_get_fs_root(tree_root->fs_info, - BTRFS_DATA_RELOC_TREE_OBJECTID, true); - if (IS_ERR(root)) { - if (!btrfs_test_opt(fs_info, IGNOREBADROOTS)) { - location.objectid = BTRFS_DATA_RELOC_TREE_OBJECTID; - ret = PTR_ERR(root); - goto out; + if (btrfs_fs_incompat(fs_info, REMAP_TREE)) { + /* The remap_root has already been loaded in load_important_roots(). */ + root = fs_info->remap_root; + + set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state); + + root->root_key.objectid = BTRFS_REMAP_TREE_OBJECTID; + root->root_key.type = BTRFS_ROOT_ITEM_KEY; + root->root_key.offset = 0; + + /* Check that data reloc tree doesn't also exist. */ + location.objectid = BTRFS_DATA_RELOC_TREE_OBJECTID; + root = btrfs_read_tree_root(fs_info->tree_root, &location); + if (!IS_ERR(root)) { + btrfs_err(fs_info, "data reloc tree exists when remap-tree enabled"); + btrfs_put_root(root); + return -EIO; + } else if (PTR_ERR(root) != -ENOENT) { + btrfs_warn(fs_info, "error %ld when checking for data reloc tree", + PTR_ERR(root)); } } else { - set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state); - fs_info->data_reloc_root = root; + /* + * This tree can share blocks with some other fs tree during + * relocation and we need a proper setup by btrfs_get_fs_root(). + */ + root = btrfs_get_fs_root(tree_root->fs_info, + BTRFS_DATA_RELOC_TREE_OBJECTID, true); + if (IS_ERR(root)) { + if (!btrfs_test_opt(fs_info, IGNOREBADROOTS)) { + location.objectid = BTRFS_DATA_RELOC_TREE_OBJECTID; + ret = PTR_ERR(root); + goto out; + } + } else { + set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state); + fs_info->data_reloc_root = root; + } } location.objectid = BTRFS_QUOTA_TREE_OBJECTID; @@ -2445,6 +2472,35 @@ int btrfs_validate_super(const struct btrfs_fs_info *fs_info, ret = -EINVAL; } + if (btrfs_fs_incompat(fs_info, REMAP_TREE)) { + /* + * Reduce test matrix for remap tree by requiring block-group-tree + * and no-holes. Free-space-tree is a hard requirement. + */ + if (!btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE_VALID) || + !btrfs_fs_incompat(fs_info, NO_HOLES) || + !btrfs_fs_compat_ro(fs_info, BLOCK_GROUP_TREE)) { + btrfs_err(fs_info, +"remap-tree feature requires free-space-tree, no-holes, and block-group-tree"); + ret = -EINVAL; + } + + if (btrfs_fs_incompat(fs_info, MIXED_GROUPS)) { + btrfs_err(fs_info, "remap-tree not supported with mixed-bg"); + ret = -EINVAL; + } + + if (btrfs_fs_incompat(fs_info, ZONED)) { + btrfs_err(fs_info, "remap-tree not supported with zoned devices"); + ret = -EINVAL; + } + + if (sectorsize > PAGE_SIZE) { + btrfs_err(fs_info, "remap-tree not supported when block size > page size"); + ret = -EINVAL; + } + } + /* * Hint to catch really bogus numbers, bitflips or so, more exact checks are * done later @@ -2603,6 +2659,18 @@ static int load_important_roots(struct btrfs_fs_info *fs_info) btrfs_warn(fs_info, "couldn't read tree root"); return ret; } + + if (btrfs_fs_incompat(fs_info, REMAP_TREE)) { + bytenr = btrfs_super_remap_root(sb); + gen = btrfs_super_remap_root_generation(sb); + level = btrfs_super_remap_root_level(sb); + ret = load_super_root(fs_info->remap_root, bytenr, gen, level); + if (ret) { + btrfs_warn(fs_info, "couldn't read remap root"); + return ret; + } + } + return 0; } @@ -3231,6 +3299,7 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device struct btrfs_fs_info *fs_info = btrfs_sb(sb); struct btrfs_root *tree_root; struct btrfs_root *chunk_root; + struct btrfs_root *remap_root; int ret; int level; @@ -3365,6 +3434,16 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device if (ret < 0) goto fail_alloc; + if (btrfs_super_incompat_flags(disk_super) & BTRFS_FEATURE_INCOMPAT_REMAP_TREE) { + remap_root = btrfs_alloc_root(fs_info, BTRFS_REMAP_TREE_OBJECTID, + GFP_KERNEL); + fs_info->remap_root = remap_root; + if (!remap_root) { + ret = -ENOMEM; + goto fail_alloc; + } + } + /* * At this point our mount options are validated, if we set ->max_inline * to something non-standard make sure we truncate it to sectorsize. diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 48a453fa3063..ce4bda1f37ad 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -2593,6 +2593,8 @@ static u64 get_alloc_profile_by_root(struct btrfs_root *root, int data) flags = BTRFS_BLOCK_GROUP_DATA; else if (root == fs_info->chunk_root) flags = BTRFS_BLOCK_GROUP_SYSTEM; + else if (root == fs_info->remap_root) + flags = BTRFS_BLOCK_GROUP_METADATA_REMAP; else flags = BTRFS_BLOCK_GROUP_METADATA; diff --git a/fs/btrfs/fs.h b/fs/btrfs/fs.h index 195428ecfd75..13b0aa0b9da9 100644 --- a/fs/btrfs/fs.h +++ b/fs/btrfs/fs.h @@ -315,7 +315,8 @@ enum { #define BTRFS_FEATURE_INCOMPAT_SUPP \ (BTRFS_FEATURE_INCOMPAT_SUPP_STABLE | \ BTRFS_FEATURE_INCOMPAT_RAID_STRIPE_TREE | \ - BTRFS_FEATURE_INCOMPAT_EXTENT_TREE_V2) + BTRFS_FEATURE_INCOMPAT_EXTENT_TREE_V2 | \ + BTRFS_FEATURE_INCOMPAT_REMAP_TREE) #else @@ -475,6 +476,7 @@ struct btrfs_fs_info { struct btrfs_root *data_reloc_root; struct btrfs_root *block_group_root; struct btrfs_root *stripe_root; + struct btrfs_root *remap_root; /* The log root tree is a directory of all the other log roots */ struct btrfs_root *log_root_tree; diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index e2f993b1783f..f4cc9e1a1b93 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -1967,6 +1967,13 @@ static void update_super_roots(struct btrfs_fs_info *fs_info) super->cache_generation = 0; if (test_bit(BTRFS_FS_UPDATE_UUID_TREE_GEN, &fs_info->flags)) super->uuid_tree_generation = root_item->generation; + + if (btrfs_fs_incompat(fs_info, REMAP_TREE)) { + root_item = &fs_info->remap_root->root_item; + super->remap_root = root_item->bytenr; + super->remap_root_generation = root_item->generation; + super->remap_root_level = root_item->level; + } } int btrfs_transaction_blocked(struct btrfs_fs_info *info) diff --git a/include/uapi/linux/btrfs_tree.h b/include/uapi/linux/btrfs_tree.h index 86820a9644e8..f7843e6bb978 100644 --- a/include/uapi/linux/btrfs_tree.h +++ b/include/uapi/linux/btrfs_tree.h @@ -721,9 +721,12 @@ struct btrfs_super_block { __u8 metadata_uuid[BTRFS_FSID_SIZE]; __u64 nr_global_roots; + __le64 remap_root; + __le64 remap_root_generation; + __u8 remap_root_level; /* Future expansion */ - __le64 reserved[27]; + __u8 reserved[199]; __u8 sys_chunk_array[BTRFS_SYSTEM_CHUNK_ARRAY_SIZE]; struct btrfs_root_backup super_roots[BTRFS_NUM_BACKUP_ROOTS]; -- cgit v1.2.3 From 4fa4ac5e584841c0f9b01c2f7dd0c2e3caa8bca0 Mon Sep 17 00:00:00 2001 From: Chia-Yu Chang Date: Sat, 31 Jan 2026 23:25:13 +0100 Subject: tcp: accecn: add tcpi_ecn_mode and tcpi_option2 in tcp_info Add 2-bit tcpi_ecn_mode feild within tcp_info to indicate which ECN mode is negotiated: ECN_MODE_DISABLED, ECN_MODE_RFC3168, ECN_MODE_ACCECN, or ECN_MODE_PENDING. This is done by utilizing available bits from tcpi_accecn_opt_seen (reduced from 16 bits to 2 bits) and tcpi_accecn_fail_mode (reduced from 16 bits to 4 bits). Also, an extra 24-bit tcpi_options2 field is identified to represent newer options and connection features, as all 8 bits of tcpi_options field have been used. Signed-off-by: Chia-Yu Chang Co-developed-by: Neal Cardwell Signed-off-by: Neal Cardwell Reviewed-by: Eric Dumazet Link: https://patch.msgid.link/20260131222515.8485-14-chia-yu.chang@nokia-bell-labs.com Signed-off-by: Paolo Abeni --- include/net/tcp_ecn.h | 11 ----------- include/uapi/linux/tcp.h | 26 +++++++++++++++++++++++--- net/ipv4/tcp.c | 8 ++++++++ 3 files changed, 31 insertions(+), 14 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/net/tcp_ecn.h b/include/net/tcp_ecn.h index e01653bbf181..e9a933641636 100644 --- a/include/net/tcp_ecn.h +++ b/include/net/tcp_ecn.h @@ -67,12 +67,6 @@ static inline void tcp_ecn_withdraw_cwr(struct tcp_sock *tp) tp->ecn_flags &= ~TCP_ECN_QUEUE_CWR; } -/* tp->accecn_fail_mode */ -#define TCP_ACCECN_ACE_FAIL_SEND BIT(0) -#define TCP_ACCECN_ACE_FAIL_RECV BIT(1) -#define TCP_ACCECN_OPT_FAIL_SEND BIT(2) -#define TCP_ACCECN_OPT_FAIL_RECV BIT(3) - static inline bool tcp_accecn_ace_fail_send(const struct tcp_sock *tp) { return tp->accecn_fail_mode & TCP_ACCECN_ACE_FAIL_SEND; @@ -98,11 +92,6 @@ static inline void tcp_accecn_fail_mode_set(struct tcp_sock *tp, u8 mode) tp->accecn_fail_mode |= mode; } -#define TCP_ACCECN_OPT_NOT_SEEN 0x0 -#define TCP_ACCECN_OPT_EMPTY_SEEN 0x1 -#define TCP_ACCECN_OPT_COUNTER_SEEN 0x2 -#define TCP_ACCECN_OPT_FAIL_SEEN 0x3 - static inline u8 tcp_accecn_ace(const struct tcphdr *th) { return (th->ae << 2) | (th->cwr << 1) | th->ece; diff --git a/include/uapi/linux/tcp.h b/include/uapi/linux/tcp.h index dce3113787a7..03772dd4d399 100644 --- a/include/uapi/linux/tcp.h +++ b/include/uapi/linux/tcp.h @@ -226,6 +226,24 @@ enum tcp_ca_state { #define TCPF_CA_Loss (1<rto_stamp) info->tcpi_total_rto_time += tcp_clock_ms() - tp->rto_stamp; + if (tcp_ecn_disabled(tp)) + info->tcpi_ecn_mode = TCPI_ECN_MODE_DISABLED; + else if (tcp_ecn_mode_rfc3168(tp)) + info->tcpi_ecn_mode = TCPI_ECN_MODE_RFC3168; + else if (tcp_ecn_mode_accecn(tp)) + info->tcpi_ecn_mode = TCPI_ECN_MODE_ACCECN; + else if (tcp_ecn_mode_pending(tp)) + info->tcpi_ecn_mode = TCPI_ECN_MODE_PENDING; info->tcpi_accecn_fail_mode = tp->accecn_fail_mode; info->tcpi_accecn_opt_seen = tp->saw_accecn_opt; info->tcpi_received_ce = tp->received_ce; -- cgit v1.2.3 From 0ee4ddc1647b8b3b9e7a94d798a1774a764428c1 Mon Sep 17 00:00:00 2001 From: Claudio Imbrenda Date: Wed, 4 Feb 2026 16:02:57 +0100 Subject: KVM: s390: Storage key manipulation IOCTL Add a new IOCTL to allow userspace to manipulate storage keys directly. This will make it easier to write selftests related to storage keys. Acked-by: Heiko Carstens Signed-off-by: Claudio Imbrenda --- Documentation/virt/kvm/api.rst | 42 ++++++++++++++++++++++++++++++ arch/s390/kvm/kvm-s390.c | 58 ++++++++++++++++++++++++++++++++++++++++++ include/uapi/linux/kvm.h | 11 ++++++++ 3 files changed, 111 insertions(+) (limited to 'include/uapi/linux') diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst index 01a3abef8abb..72e04dedb068 100644 --- a/Documentation/virt/kvm/api.rst +++ b/Documentation/virt/kvm/api.rst @@ -6517,6 +6517,40 @@ the capability to be present. `flags` must currently be zero. +4.144 KVM_S390_KEYOP +-------------------- + +:Capability: KVM_CAP_S390_KEYOP +:Architectures: s390 +:Type: vm ioctl +:Parameters: struct kvm_s390_keyop (in/out) +:Returns: 0 in case of success, < 0 on error + +The specified key operation is performed on the given guest address. The +previous storage key (or the relevant part thereof) will be returned in +`key`. + +:: + + struct kvm_s390_keyop { + __u64 guest_addr; + __u8 key; + __u8 operation; + }; + +Currently supported values for ``operation``: + +KVM_S390_KEYOP_ISKE + Returns the storage key for the guest address ``guest_addr`` in ``key``. + +KVM_S390_KEYOP_RRBE + Resets the reference bit for the guest address ``guest_addr``, returning the + R and C bits of the old storage key in ``key``; the remaining fields of + the storage key will be set to 0. + +KVM_S390_KEYOP_SSKE + Sets the storage key for the guest address ``guest_addr`` to the key + specified in ``key``, returning the previous value in ``key``. .. _kvm_run: @@ -9287,6 +9321,14 @@ The presence of this capability indicates that KVM_RUN will update the KVM_RUN_X86_GUEST_MODE bit in kvm_run.flags to indicate whether the vCPU was executing nested guest code when it exited. +8.46 KVM_CAP_S390_KEYOP +----------------------- + +:Architectures: s390 + +The presence of this capability indicates that the KVM_S390_KEYOP ioctl is +available. + KVM exits with the register state of either the L1 or L2 guest depending on which executed at the time of an exit. Userspace must take care to differentiate between these cases. diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index ac7b5f56f0b5..9f24252775dd 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -554,6 +554,37 @@ static void __kvm_s390_exit(void) debug_unregister(kvm_s390_dbf_uv); } +static int kvm_s390_keyop(struct kvm_s390_mmu_cache *mc, struct kvm *kvm, int op, + unsigned long addr, union skey skey) +{ + union asce asce = kvm->arch.gmap->asce; + gfn_t gfn = gpa_to_gfn(addr); + int r; + + guard(read_lock)(&kvm->mmu_lock); + + switch (op) { + case KVM_S390_KEYOP_SSKE: + r = dat_cond_set_storage_key(mc, asce, gfn, skey, &skey, 0, 0, 0); + if (r >= 0) + return skey.skey; + break; + case KVM_S390_KEYOP_ISKE: + r = dat_get_storage_key(asce, gfn, &skey); + if (!r) + return skey.skey; + break; + case KVM_S390_KEYOP_RRBE: + r = dat_reset_reference_bit(asce, gfn); + if (r > 0) + return r << 1; + break; + default: + return -EINVAL; + } + return r; +} + /* Section: device related */ long kvm_arch_dev_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg) @@ -598,6 +629,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) case KVM_CAP_S390_DIAG318: case KVM_CAP_IRQFD_RESAMPLE: case KVM_CAP_S390_USER_OPEREXEC: + case KVM_CAP_S390_KEYOP: r = 1; break; case KVM_CAP_SET_GUEST_DEBUG2: @@ -2931,6 +2963,32 @@ int kvm_arch_vm_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg) r = -EFAULT; break; } + case KVM_S390_KEYOP: { + struct kvm_s390_mmu_cache *mc; + struct kvm_s390_keyop kop; + union skey skey; + + if (copy_from_user(&kop, argp, sizeof(kop))) { + r = -EFAULT; + break; + } + skey.skey = kop.key; + + mc = kvm_s390_new_mmu_cache(); + if (!mc) + return -ENOMEM; + + r = kvm_s390_keyop(mc, kvm, kop.operation, kop.guest_addr, skey); + kvm_s390_free_mmu_cache(mc); + if (r < 0) + break; + + kop.key = r; + r = 0; + if (copy_to_user(argp, &kop, sizeof(kop))) + r = -EFAULT; + break; + } case KVM_S390_ZPCI_OP: { struct kvm_s390_zpci_op args; diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h index dddb781b0507..ab3d3d96e75f 100644 --- a/include/uapi/linux/kvm.h +++ b/include/uapi/linux/kvm.h @@ -974,6 +974,7 @@ struct kvm_enable_cap { #define KVM_CAP_GUEST_MEMFD_FLAGS 244 #define KVM_CAP_ARM_SEA_TO_USER 245 #define KVM_CAP_S390_USER_OPEREXEC 246 +#define KVM_CAP_S390_KEYOP 247 struct kvm_irq_routing_irqchip { __u32 irqchip; @@ -1219,6 +1220,15 @@ struct kvm_vfio_spapr_tce { __s32 tablefd; }; +#define KVM_S390_KEYOP_ISKE 0x01 +#define KVM_S390_KEYOP_RRBE 0x02 +#define KVM_S390_KEYOP_SSKE 0x03 +struct kvm_s390_keyop { + __u64 guest_addr; + __u8 key; + __u8 operation; +}; + /* * KVM_CREATE_VCPU receives as a parameter the vcpu slot, and returns * a vcpu fd. @@ -1238,6 +1248,7 @@ struct kvm_vfio_spapr_tce { #define KVM_S390_UCAS_MAP _IOW(KVMIO, 0x50, struct kvm_s390_ucas_mapping) #define KVM_S390_UCAS_UNMAP _IOW(KVMIO, 0x51, struct kvm_s390_ucas_mapping) #define KVM_S390_VCPU_FAULT _IOW(KVMIO, 0x52, unsigned long) +#define KVM_S390_KEYOP _IOWR(KVMIO, 0x53, struct kvm_s390_keyop) /* Device model IOC */ #define KVM_CREATE_IRQCHIP _IO(KVMIO, 0x60) -- cgit v1.2.3 From f7ab71f178d56447e5efb55b65436feb68662f8f Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Fri, 6 Feb 2026 10:17:30 +0100 Subject: KVM: s390: Add explicit padding to struct kvm_s390_keyop The newly added structure causes a warning about implied padding: ./usr/include/linux/kvm.h:1247:1: error: padding struct size to alignment boundary with 6 bytes [-Werror=padded] The padding can lead to leaking kernel data and ABI incompatibilies when used on x86. Neither of these is a problem in this specific patch, but it's best to avoid it and use explicit padding fields in general. Fixes: 0ee4ddc1647b ("KVM: s390: Storage key manipulation IOCTL") Signed-off-by: Arnd Bergmann Reviewed-by: Claudio Imbrenda Signed-off-by: Claudio Imbrenda --- include/uapi/linux/kvm.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h index ab3d3d96e75f..d4250ab662fe 100644 --- a/include/uapi/linux/kvm.h +++ b/include/uapi/linux/kvm.h @@ -1227,6 +1227,7 @@ struct kvm_s390_keyop { __u64 guest_addr; __u8 key; __u8 operation; + __u8 pad[6]; }; /* -- cgit v1.2.3 From ed82f35b926b2e505c14b7006473614b8f58b4f4 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 8 Jan 2026 10:18:31 -0700 Subject: io_uring: allow registration of per-task restrictions Currently io_uring supports restricting operations on a per-ring basis. To use those, the ring must be setup in a disabled state by setting IORING_SETUP_R_DISABLED. Then restrictions can be set for the ring, and the ring can then be enabled. This commit adds support for IORING_REGISTER_RESTRICTIONS with ring_fd == -1, like the other "blind" register opcodes which work on the task rather than a specific ring. This allows registration of the same kind of restrictions as can been done on a specific ring, but with the task itself. Once done, any ring created will inherit these restrictions. If a restriction filter is registered with a task, then it's inherited on fork for its children. Children may only further restrict operations, not extend them. Inheriting restrictions include both the classic IORING_REGISTER_RESTRICTIONS based restrictions, as well as the BPF filters that have been registered with the task via IORING_REGISTER_BPF_FILTER. Signed-off-by: Jens Axboe --- include/linux/io_uring_types.h | 2 + include/uapi/linux/io_uring.h | 7 ++++ io_uring/bpf_filter.c | 86 +++++++++++++++++++++++++++++++++++++++++- io_uring/bpf_filter.h | 6 +++ io_uring/io_uring.c | 33 ++++++++++++++++ io_uring/io_uring.h | 1 + io_uring/register.c | 80 +++++++++++++++++++++++++++++++++++++++ io_uring/tctx.c | 17 +++++++++ 8 files changed, 231 insertions(+), 1 deletion(-) (limited to 'include/uapi/linux') diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h index 7617df247238..510d801b9a55 100644 --- a/include/linux/io_uring_types.h +++ b/include/linux/io_uring_types.h @@ -231,6 +231,8 @@ struct io_restriction { DECLARE_BITMAP(register_op, IORING_REGISTER_LAST); DECLARE_BITMAP(sqe_op, IORING_OP_LAST); struct io_bpf_filters *bpf_filters; + /* ->bpf_filters needs COW on modification */ + bool bpf_filters_cow; u8 sqe_flags_allowed; u8 sqe_flags_required; /* IORING_OP_* restrictions exist */ diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h index 94669b77fee8..aeeffcf27fee 100644 --- a/include/uapi/linux/io_uring.h +++ b/include/uapi/linux/io_uring.h @@ -808,6 +808,13 @@ struct io_uring_restriction { __u32 resv2[3]; }; +struct io_uring_task_restriction { + __u16 flags; + __u16 nr_res; + __u32 resv[3]; + __DECLARE_FLEX_ARRAY(struct io_uring_restriction, restrictions); +}; + struct io_uring_clock_register { __u32 clockid; __u32 __resv[3]; diff --git a/io_uring/bpf_filter.c b/io_uring/bpf_filter.c index b94944ab8442..3816883a45ed 100644 --- a/io_uring/bpf_filter.c +++ b/io_uring/bpf_filter.c @@ -249,13 +249,77 @@ static int io_uring_check_cbpf_filter(struct sock_filter *filter, return 0; } +void io_bpf_filter_clone(struct io_restriction *dst, struct io_restriction *src) +{ + if (!src->bpf_filters) + return; + + rcu_read_lock(); + /* + * If the src filter is going away, just ignore it. + */ + if (refcount_inc_not_zero(&src->bpf_filters->refs)) { + dst->bpf_filters = src->bpf_filters; + dst->bpf_filters_cow = true; + } + rcu_read_unlock(); +} + +/* + * Allocate a new struct io_bpf_filters. Used when a filter is cloned and + * modifications need to be made. + */ +static struct io_bpf_filters *io_bpf_filter_cow(struct io_restriction *src) +{ + struct io_bpf_filters *filters; + struct io_bpf_filter *srcf; + int i; + + filters = io_new_bpf_filters(); + if (IS_ERR(filters)) + return filters; + + /* + * Iterate filters from src and assign in destination. Grabbing + * a reference is enough, we don't need to duplicate the memory. + * This is safe because filters are only ever appended to the + * front of the list, hence the only memory ever touched inside + * a filter is the refcount. + */ + rcu_read_lock(); + for (i = 0; i < IORING_OP_LAST; i++) { + srcf = rcu_dereference(src->bpf_filters->filters[i]); + if (!srcf) { + continue; + } else if (srcf == &dummy_filter) { + rcu_assign_pointer(filters->filters[i], &dummy_filter); + continue; + } + + /* + * Getting a ref on the first node is enough, putting the + * filter and iterating nodes to free will stop on the first + * one that doesn't hit zero when dropping. + */ + if (!refcount_inc_not_zero(&srcf->refs)) + goto err; + rcu_assign_pointer(filters->filters[i], srcf); + } + rcu_read_unlock(); + return filters; +err: + rcu_read_unlock(); + __io_put_bpf_filters(filters); + return ERR_PTR(-EBUSY); +} + #define IO_URING_BPF_FILTER_FLAGS IO_URING_BPF_FILTER_DENY_REST int io_register_bpf_filter(struct io_restriction *res, struct io_uring_bpf __user *arg) { + struct io_bpf_filters *filters, *old_filters = NULL; struct io_bpf_filter *filter, *old_filter; - struct io_bpf_filters *filters; struct io_uring_bpf reg; struct bpf_prog *prog; struct sock_fprog fprog; @@ -297,6 +361,17 @@ int io_register_bpf_filter(struct io_restriction *res, ret = PTR_ERR(filters); goto err_prog; } + } else if (res->bpf_filters_cow) { + filters = io_bpf_filter_cow(res); + if (IS_ERR(filters)) { + ret = PTR_ERR(filters); + goto err_prog; + } + /* + * Stash old filters, we'll put them once we know we'll + * succeed. Until then, res->bpf_filters is left untouched. + */ + old_filters = res->bpf_filters; } filter = kzalloc(sizeof(*filter), GFP_KERNEL_ACCOUNT); @@ -306,6 +381,15 @@ int io_register_bpf_filter(struct io_restriction *res, } refcount_set(&filter->refs, 1); filter->prog = prog; + + /* + * Success - install the new filter set now. If we did COW, put + * the old filters as we're replacing them. + */ + if (old_filters) { + __io_put_bpf_filters(old_filters); + res->bpf_filters_cow = false; + } res->bpf_filters = filters; /* diff --git a/io_uring/bpf_filter.h b/io_uring/bpf_filter.h index 9f3cdb92eb16..66a776cf25b4 100644 --- a/io_uring/bpf_filter.h +++ b/io_uring/bpf_filter.h @@ -13,6 +13,8 @@ int io_register_bpf_filter(struct io_restriction *res, void io_put_bpf_filters(struct io_restriction *res); +void io_bpf_filter_clone(struct io_restriction *dst, struct io_restriction *src); + static inline int io_uring_run_bpf_filters(struct io_bpf_filter __rcu **filters, struct io_kiocb *req) { @@ -37,6 +39,10 @@ static inline int io_uring_run_bpf_filters(struct io_bpf_filter __rcu **filters, static inline void io_put_bpf_filters(struct io_restriction *res) { } +static inline void io_bpf_filter_clone(struct io_restriction *dst, + struct io_restriction *src) +{ +} #endif /* CONFIG_IO_URING_BPF */ #endif diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 049454278563..e43c5283b23a 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -2880,6 +2880,32 @@ int io_prepare_config(struct io_ctx_config *config) return 0; } +void io_restriction_clone(struct io_restriction *dst, struct io_restriction *src) +{ + memcpy(&dst->register_op, &src->register_op, sizeof(dst->register_op)); + memcpy(&dst->sqe_op, &src->sqe_op, sizeof(dst->sqe_op)); + dst->sqe_flags_allowed = src->sqe_flags_allowed; + dst->sqe_flags_required = src->sqe_flags_required; + dst->op_registered = src->op_registered; + dst->reg_registered = src->reg_registered; + + io_bpf_filter_clone(dst, src); +} + +static void io_ctx_restriction_clone(struct io_ring_ctx *ctx, + struct io_restriction *src) +{ + struct io_restriction *dst = &ctx->restrictions; + + io_restriction_clone(dst, src); + if (dst->bpf_filters) + WRITE_ONCE(ctx->bpf_filters, dst->bpf_filters->filters); + if (dst->op_registered) + ctx->op_restricted = 1; + if (dst->reg_registered) + ctx->reg_restricted = 1; +} + static __cold int io_uring_create(struct io_ctx_config *config) { struct io_uring_params *p = &config->p; @@ -2940,6 +2966,13 @@ static __cold int io_uring_create(struct io_ctx_config *config) else ctx->notify_method = TWA_SIGNAL; + /* + * If the current task has restrictions enabled, then copy them to + * our newly created ring and mark it as registered. + */ + if (current->io_uring_restrict) + io_ctx_restriction_clone(ctx, current->io_uring_restrict); + /* * This is just grabbed for accounting purposes. When a process exits, * the mm is exited and dropped before the files, hence we need to hang diff --git a/io_uring/io_uring.h b/io_uring/io_uring.h index 29b8f90fdabf..a08d78c716f8 100644 --- a/io_uring/io_uring.h +++ b/io_uring/io_uring.h @@ -197,6 +197,7 @@ void io_task_refs_refill(struct io_uring_task *tctx); bool __io_alloc_req_refill(struct io_ring_ctx *ctx); void io_activate_pollwq(struct io_ring_ctx *ctx); +void io_restriction_clone(struct io_restriction *dst, struct io_restriction *src); static inline void io_lockdep_assert_cq_locked(struct io_ring_ctx *ctx) { diff --git a/io_uring/register.c b/io_uring/register.c index 40de9b8924b9..af4815bc11d6 100644 --- a/io_uring/register.c +++ b/io_uring/register.c @@ -190,6 +190,82 @@ static __cold int io_register_restrictions(struct io_ring_ctx *ctx, return 0; } +static int io_register_restrictions_task(void __user *arg, unsigned int nr_args) +{ + struct io_uring_task_restriction __user *ures = arg; + struct io_uring_task_restriction tres; + struct io_restriction *res; + int ret; + + /* Disallow if task already has registered restrictions */ + if (current->io_uring_restrict) + return -EPERM; + /* + * Similar to seccomp, disallow setting a filter if task_no_new_privs + * is true and we're not CAP_SYS_ADMIN. + */ + if (!task_no_new_privs(current) && + !ns_capable_noaudit(current_user_ns(), CAP_SYS_ADMIN)) + return -EACCES; + if (nr_args != 1) + return -EINVAL; + + if (copy_from_user(&tres, arg, sizeof(tres))) + return -EFAULT; + + if (tres.flags) + return -EINVAL; + if (!mem_is_zero(tres.resv, sizeof(tres.resv))) + return -EINVAL; + + res = kzalloc(sizeof(*res), GFP_KERNEL_ACCOUNT); + if (!res) + return -ENOMEM; + + ret = io_parse_restrictions(ures->restrictions, tres.nr_res, res); + if (ret < 0) { + kfree(res); + return ret; + } + current->io_uring_restrict = res; + return 0; +} + +static int io_register_bpf_filter_task(void __user *arg, unsigned int nr_args) +{ + struct io_restriction *res; + int ret; + + /* + * Similar to seccomp, disallow setting a filter if task_no_new_privs + * is true and we're not CAP_SYS_ADMIN. + */ + if (!task_no_new_privs(current) && + !ns_capable_noaudit(current_user_ns(), CAP_SYS_ADMIN)) + return -EACCES; + + if (nr_args != 1) + return -EINVAL; + + /* If no task restrictions exist, setup a new set */ + res = current->io_uring_restrict; + if (!res) { + res = kzalloc(sizeof(*res), GFP_KERNEL_ACCOUNT); + if (!res) + return -ENOMEM; + } + + ret = io_register_bpf_filter(res, arg); + if (ret) { + if (res != current->io_uring_restrict) + kfree(res); + return ret; + } + if (!current->io_uring_restrict) + current->io_uring_restrict = res; + return 0; +} + static int io_register_enable_rings(struct io_ring_ctx *ctx) { if (!(ctx->flags & IORING_SETUP_R_DISABLED)) @@ -912,6 +988,10 @@ static int io_uring_register_blind(unsigned int opcode, void __user *arg, return io_uring_register_send_msg_ring(arg, nr_args); case IORING_REGISTER_QUERY: return io_query(arg, nr_args); + case IORING_REGISTER_RESTRICTIONS: + return io_register_restrictions_task(arg, nr_args); + case IORING_REGISTER_BPF_FILTER: + return io_register_bpf_filter_task(arg, nr_args); } return -EINVAL; } diff --git a/io_uring/tctx.c b/io_uring/tctx.c index d4f7698805e4..e3da31fdf16f 100644 --- a/io_uring/tctx.c +++ b/io_uring/tctx.c @@ -11,6 +11,7 @@ #include "io_uring.h" #include "tctx.h" +#include "bpf_filter.h" static struct io_wq *io_init_wq_offload(struct io_ring_ctx *ctx, struct task_struct *task) @@ -66,6 +67,11 @@ void __io_uring_free(struct task_struct *tsk) kfree(tctx); tsk->io_uring = NULL; } + if (tsk->io_uring_restrict) { + io_put_bpf_filters(tsk->io_uring_restrict); + kfree(tsk->io_uring_restrict); + tsk->io_uring_restrict = NULL; + } } __cold int io_uring_alloc_task_context(struct task_struct *task, @@ -356,5 +362,16 @@ int io_ringfd_unregister(struct io_ring_ctx *ctx, void __user *__arg, int __io_uring_fork(struct task_struct *tsk) { + struct io_restriction *res, *src = tsk->io_uring_restrict; + + /* Don't leave it dangling on error */ + tsk->io_uring_restrict = NULL; + + res = kzalloc(sizeof(*res), GFP_KERNEL_ACCOUNT); + if (!res) + return -ENOMEM; + + tsk->io_uring_restrict = res; + io_restriction_clone(res, src); return 0; } -- cgit v1.2.3 From 42fc7e6543f6d17d2cf9ed3e5021f103a3d11182 Mon Sep 17 00:00:00 2001 From: Günther Noack Date: Thu, 27 Nov 2025 12:51:34 +0100 Subject: landlock: Multithreading support for landlock_restrict_self() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Introduce the LANDLOCK_RESTRICT_SELF_TSYNC flag. With this flag, a given Landlock ruleset is applied to all threads of the calling process, instead of only the current one. Without this flag, multithreaded userspace programs currently resort to using the nptl(7)/libpsx hack for multithreaded policy enforcement, which is also used by libcap and for setuid(2). Using this userspace-based scheme, the threads of a process enforce the same Landlock policy, but the resulting Landlock domains are still separate. The domains being separate causes multiple problems: * When using Landlock's "scoped" access rights, the domain identity is used to determine whether an operation is permitted. As a result, when using LANLDOCK_SCOPE_SIGNAL, signaling between sibling threads stops working. This is a problem for programming languages and frameworks which are inherently multithreaded (e.g. Go). * In audit logging, the domains of separate threads in a process will get logged with different domain IDs, even when they are based on the same ruleset FD, which might confuse users. Cc: Andrew G. Morgan Cc: John Johansen Cc: Paul Moore Suggested-by: Jann Horn Signed-off-by: Günther Noack Link: https://lore.kernel.org/r/20251127115136.3064948-2-gnoack@google.com [mic: Fix restrict_self_flags test, clean up Makefile, allign comments, reduce local variable scope, add missing includes] Closes: https://github.com/landlock-lsm/linux/issues/2 Signed-off-by: Mickaël Salaün --- include/uapi/linux/landlock.h | 13 + security/landlock/Makefile | 11 +- security/landlock/cred.h | 12 + security/landlock/limits.h | 2 +- security/landlock/syscalls.c | 61 +-- security/landlock/tsync.c | 561 +++++++++++++++++++++++++++ security/landlock/tsync.h | 16 + tools/testing/selftests/landlock/base_test.c | 4 +- 8 files changed, 650 insertions(+), 30 deletions(-) create mode 100644 security/landlock/tsync.c create mode 100644 security/landlock/tsync.h (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/landlock.h b/include/uapi/linux/landlock.h index 75fd7f5e6cc3..d5081ab4e5ef 100644 --- a/include/uapi/linux/landlock.h +++ b/include/uapi/linux/landlock.h @@ -117,11 +117,24 @@ struct landlock_ruleset_attr { * future nested domains, not the one being created. It can also be used * with a @ruleset_fd value of -1 to mute subdomain logs without creating a * domain. + * + * The following flag supports policy enforcement in multithreaded processes: + * + * %LANDLOCK_RESTRICT_SELF_TSYNC + * Applies the new Landlock configuration atomically to all threads of the + * current process, including the Landlock domain and logging + * configuration. This overrides the Landlock configuration of sibling + * threads, irrespective of previously established Landlock domains and + * logging configurations on these threads. + * + * If the calling thread is running with no_new_privs, this operation + * enables no_new_privs on the sibling threads as well. */ /* clang-format off */ #define LANDLOCK_RESTRICT_SELF_LOG_SAME_EXEC_OFF (1U << 0) #define LANDLOCK_RESTRICT_SELF_LOG_NEW_EXEC_ON (1U << 1) #define LANDLOCK_RESTRICT_SELF_LOG_SUBDOMAINS_OFF (1U << 2) +#define LANDLOCK_RESTRICT_SELF_TSYNC (1U << 3) /* clang-format on */ /** diff --git a/security/landlock/Makefile b/security/landlock/Makefile index 3160c2bdac1d..ffa7646d99f3 100644 --- a/security/landlock/Makefile +++ b/security/landlock/Makefile @@ -1,7 +1,14 @@ obj-$(CONFIG_SECURITY_LANDLOCK) := landlock.o -landlock-y := setup.o syscalls.o object.o ruleset.o \ - cred.o task.o fs.o +landlock-y := \ + setup.o \ + syscalls.o \ + object.o \ + ruleset.o \ + cred.o \ + task.o \ + fs.o \ + tsync.o landlock-$(CONFIG_INET) += net.o diff --git a/security/landlock/cred.h b/security/landlock/cred.h index c82fe63ec598..c10a06727eb1 100644 --- a/security/landlock/cred.h +++ b/security/landlock/cred.h @@ -26,6 +26,8 @@ * This structure is packed to minimize the size of struct * landlock_file_security. However, it is always aligned in the LSM cred blob, * see lsm_set_blob_size(). + * + * When updating this, also update landlock_cred_copy() if needed. */ struct landlock_cred_security { /** @@ -65,6 +67,16 @@ landlock_cred(const struct cred *cred) return cred->security + landlock_blob_sizes.lbs_cred; } +static inline void landlock_cred_copy(struct landlock_cred_security *dst, + const struct landlock_cred_security *src) +{ + landlock_put_ruleset(dst->domain); + + *dst = *src; + + landlock_get_ruleset(src->domain); +} + static inline struct landlock_ruleset *landlock_get_current_domain(void) { return landlock_cred(current_cred())->domain; diff --git a/security/landlock/limits.h b/security/landlock/limits.h index 65b5ff051674..eb584f47288d 100644 --- a/security/landlock/limits.h +++ b/security/landlock/limits.h @@ -31,7 +31,7 @@ #define LANDLOCK_MASK_SCOPE ((LANDLOCK_LAST_SCOPE << 1) - 1) #define LANDLOCK_NUM_SCOPE __const_hweight64(LANDLOCK_MASK_SCOPE) -#define LANDLOCK_LAST_RESTRICT_SELF LANDLOCK_RESTRICT_SELF_LOG_SUBDOMAINS_OFF +#define LANDLOCK_LAST_RESTRICT_SELF LANDLOCK_RESTRICT_SELF_TSYNC #define LANDLOCK_MASK_RESTRICT_SELF ((LANDLOCK_LAST_RESTRICT_SELF << 1) - 1) /* clang-format on */ diff --git a/security/landlock/syscalls.c b/security/landlock/syscalls.c index 0116e9f93ffe..3e4e99deb7f9 100644 --- a/security/landlock/syscalls.c +++ b/security/landlock/syscalls.c @@ -36,6 +36,7 @@ #include "net.h" #include "ruleset.h" #include "setup.h" +#include "tsync.h" static bool is_initialized(void) { @@ -161,7 +162,7 @@ static const struct file_operations ruleset_fops = { * Documentation/userspace-api/landlock.rst should be updated to reflect the * UAPI change. */ -const int landlock_abi_version = 7; +const int landlock_abi_version = 8; /** * sys_landlock_create_ruleset - Create a new ruleset @@ -454,9 +455,10 @@ SYSCALL_DEFINE4(landlock_add_rule, const int, ruleset_fd, * - %LANDLOCK_RESTRICT_SELF_LOG_SAME_EXEC_OFF * - %LANDLOCK_RESTRICT_SELF_LOG_NEW_EXEC_ON * - %LANDLOCK_RESTRICT_SELF_LOG_SUBDOMAINS_OFF + * - %LANDLOCK_RESTRICT_SELF_TSYNC * - * This system call enables to enforce a Landlock ruleset on the current - * thread. Enforcing a ruleset requires that the task has %CAP_SYS_ADMIN in its + * This system call enforces a Landlock ruleset on the current thread. + * Enforcing a ruleset requires that the task has %CAP_SYS_ADMIN in its * namespace or is running with no_new_privs. This avoids scenarios where * unprivileged tasks can affect the behavior of privileged children. * @@ -478,8 +480,7 @@ SYSCALL_DEFINE4(landlock_add_rule, const int, ruleset_fd, SYSCALL_DEFINE2(landlock_restrict_self, const int, ruleset_fd, const __u32, flags) { - struct landlock_ruleset *new_dom, - *ruleset __free(landlock_put_ruleset) = NULL; + struct landlock_ruleset *ruleset __free(landlock_put_ruleset) = NULL; struct cred *new_cred; struct landlock_cred_security *new_llcred; bool __maybe_unused log_same_exec, log_new_exec, log_subdomains, @@ -538,33 +539,43 @@ SYSCALL_DEFINE2(landlock_restrict_self, const int, ruleset_fd, const __u32, * We could optimize this case by not calling commit_creds() if this flag * was already set, but it is not worth the complexity. */ - if (!ruleset) - return commit_creds(new_cred); - - /* - * There is no possible race condition while copying and manipulating - * the current credentials because they are dedicated per thread. - */ - new_dom = landlock_merge_ruleset(new_llcred->domain, ruleset); - if (IS_ERR(new_dom)) { - abort_creds(new_cred); - return PTR_ERR(new_dom); - } + if (ruleset) { + /* + * There is no possible race condition while copying and + * manipulating the current credentials because they are + * dedicated per thread. + */ + struct landlock_ruleset *const new_dom = + landlock_merge_ruleset(new_llcred->domain, ruleset); + if (IS_ERR(new_dom)) { + abort_creds(new_cred); + return PTR_ERR(new_dom); + } #ifdef CONFIG_AUDIT - new_dom->hierarchy->log_same_exec = log_same_exec; - new_dom->hierarchy->log_new_exec = log_new_exec; - if ((!log_same_exec && !log_new_exec) || !prev_log_subdomains) - new_dom->hierarchy->log_status = LANDLOCK_LOG_DISABLED; + new_dom->hierarchy->log_same_exec = log_same_exec; + new_dom->hierarchy->log_new_exec = log_new_exec; + if ((!log_same_exec && !log_new_exec) || !prev_log_subdomains) + new_dom->hierarchy->log_status = LANDLOCK_LOG_DISABLED; #endif /* CONFIG_AUDIT */ - /* Replaces the old (prepared) domain. */ - landlock_put_ruleset(new_llcred->domain); - new_llcred->domain = new_dom; + /* Replaces the old (prepared) domain. */ + landlock_put_ruleset(new_llcred->domain); + new_llcred->domain = new_dom; #ifdef CONFIG_AUDIT - new_llcred->domain_exec |= BIT(new_dom->num_layers - 1); + new_llcred->domain_exec |= BIT(new_dom->num_layers - 1); #endif /* CONFIG_AUDIT */ + } + + if (flags & LANDLOCK_RESTRICT_SELF_TSYNC) { + const int err = landlock_restrict_sibling_threads( + current_cred(), new_cred); + if (err) { + abort_creds(new_cred); + return err; + } + } return commit_creds(new_cred); } diff --git a/security/landlock/tsync.c b/security/landlock/tsync.c new file mode 100644 index 000000000000..0d2b9c646030 --- /dev/null +++ b/security/landlock/tsync.c @@ -0,0 +1,561 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Landlock - Cross-thread ruleset enforcement + * + * Copyright © 2025 Google LLC + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "cred.h" +#include "tsync.h" + +/* + * Shared state between multiple threads which are enforcing Landlock rulesets + * in lockstep with each other. + */ +struct tsync_shared_context { + /* The old and tentative new creds of the calling thread. */ + const struct cred *old_cred; + const struct cred *new_cred; + + /* True if sibling tasks need to set the no_new_privs flag. */ + bool set_no_new_privs; + + /* An error encountered in preparation step, or 0. */ + atomic_t preparation_error; + + /* + * Barrier after preparation step in restrict_one_thread. + * The calling thread waits for completion. + * + * Re-initialized on every round of looking for newly spawned threads. + */ + atomic_t num_preparing; + struct completion all_prepared; + + /* Sibling threads wait for completion. */ + struct completion ready_to_commit; + + /* + * Barrier after commit step (used by syscall impl to wait for + * completion). + */ + atomic_t num_unfinished; + struct completion all_finished; +}; + +struct tsync_work { + struct callback_head work; + struct task_struct *task; + struct tsync_shared_context *shared_ctx; +}; + +/* + * restrict_one_thread - update a thread's Landlock domain in lockstep with the + * other threads in the same process + * + * When this is run, the same function gets run in all other threads in the same + * process (except for the calling thread which called landlock_restrict_self). + * The concurrently running invocations of restrict_one_thread coordinate + * through the shared ctx object to do their work in lockstep to implement + * all-or-nothing semantics for enforcing the new Landlock domain. + * + * Afterwards, depending on the presence of an error, all threads either commit + * or abort the prepared credentials. The commit operation can not fail any + * more. + */ +static void restrict_one_thread(struct tsync_shared_context *ctx) +{ + int err; + struct cred *cred = NULL; + + if (current_cred() == ctx->old_cred) { + /* + * Switch out old_cred with new_cred, if possible. + * + * In the common case, where all threads initially point to the same + * struct cred, this optimization avoids creating separate redundant + * credentials objects for each, which would all have the same contents. + * + * Note: We are intentionally dropping the const qualifier here, because + * it is required by commit_creds() and abort_creds(). + */ + cred = (struct cred *)get_cred(ctx->new_cred); + } else { + /* Else, prepare new creds and populate them. */ + cred = prepare_creds(); + + if (!cred) { + atomic_set(&ctx->preparation_error, -ENOMEM); + + /* + * Even on error, we need to adhere to the protocol and coordinate + * with concurrently running invocations. + */ + if (atomic_dec_return(&ctx->num_preparing) == 0) + complete_all(&ctx->all_prepared); + + goto out; + } + + landlock_cred_copy(landlock_cred(cred), + landlock_cred(ctx->new_cred)); + } + + /* + * Barrier: Wait until all threads are done preparing. + * After this point, we can have no more failures. + */ + if (atomic_dec_return(&ctx->num_preparing) == 0) + complete_all(&ctx->all_prepared); + + /* + * Wait for signal from calling thread that it's safe to read the + * preparation error now and we are ready to commit (or abort). + */ + wait_for_completion(&ctx->ready_to_commit); + + /* Abort the commit if any of the other threads had an error. */ + err = atomic_read(&ctx->preparation_error); + if (err) { + abort_creds(cred); + goto out; + } + + /* + * Make sure that all sibling tasks fulfill the no_new_privs prerequisite. + * (This is in line with Seccomp's SECCOMP_FILTER_FLAG_TSYNC logic in + * kernel/seccomp.c) + */ + if (ctx->set_no_new_privs) + task_set_no_new_privs(current); + + commit_creds(cred); + +out: + /* Notify the calling thread once all threads are done */ + if (atomic_dec_return(&ctx->num_unfinished) == 0) + complete_all(&ctx->all_finished); +} + +/* + * restrict_one_thread_callback - task_work callback for restricting a thread + * + * Calls restrict_one_thread with the struct landlock_shared_tsync_context. + */ +static void restrict_one_thread_callback(struct callback_head *work) +{ + struct tsync_work *ctx = container_of(work, struct tsync_work, work); + + restrict_one_thread(ctx->shared_ctx); +} + +/* + * struct tsync_works - a growable array of per-task contexts + * + * The zero-initialized struct represents the empty array. + */ +struct tsync_works { + struct tsync_work **works; + size_t size; + size_t capacity; +}; + +/* + * tsync_works_provide - provides a preallocated tsync_work for the given task + * + * This also stores a task pointer in the context and increments the reference + * count of the task. + * + * This function may fail in the case where we did not preallocate sufficient + * capacity. This can legitimately happen if new threads get started after we + * grew the capacity. + * + * Returns: + * A pointer to the preallocated context struct, with task filled in. + * + * NULL, if we ran out of preallocated context structs. + */ +static struct tsync_work *tsync_works_provide(struct tsync_works *s, + struct task_struct *task) +{ + struct tsync_work *ctx; + + if (s->size >= s->capacity) + return NULL; + + ctx = s->works[s->size]; + s->size++; + + ctx->task = get_task_struct(task); + return ctx; +} + +/* + * tsync_works_grow_by - preallocates space for n more contexts in s + * + * On a successful return, the subsequent n calls to tsync_works_provide() are + * guaranteed to succeed. (size + n <= capacity) + * + * Returns: + * -ENOMEM if the (re)allocation fails + + * 0 if the allocation succeeds, partially succeeds, or no reallocation + * was needed + */ +static int tsync_works_grow_by(struct tsync_works *s, size_t n, gfp_t flags) +{ + size_t i; + size_t new_capacity; + struct tsync_work **works; + struct tsync_work *work; + + if (check_add_overflow(s->size, n, &new_capacity)) + return -EOVERFLOW; + + /* No need to reallocate if s already has sufficient capacity. */ + if (new_capacity <= s->capacity) + return 0; + + works = krealloc_array(s->works, new_capacity, sizeof(s->works[0]), + flags); + if (!works) + return -ENOMEM; + + s->works = works; + + for (i = s->capacity; i < new_capacity; i++) { + work = kzalloc(sizeof(*work), flags); + if (!work) { + /* + * Leave the object in a consistent state, + * but return an error. + */ + s->capacity = i; + return -ENOMEM; + } + s->works[i] = work; + } + s->capacity = new_capacity; + return 0; +} + +/* + * tsync_works_contains - checks for presence of task in s + */ +static bool tsync_works_contains_task(const struct tsync_works *s, + struct task_struct *task) +{ + size_t i; + + for (i = 0; i < s->size; i++) + if (s->works[i]->task == task) + return true; + return false; +} + +/* + * tsync_works_release - frees memory held by s and drops all task references + * + * This does not free s itself, only the data structures held by it. + */ +static void tsync_works_release(struct tsync_works *s) +{ + size_t i; + + for (i = 0; i < s->size; i++) { + if (!s->works[i]->task) + continue; + + put_task_struct(s->works[i]->task); + } + + for (i = 0; i < s->capacity; i++) + kfree(s->works[i]); + kfree(s->works); + s->works = NULL; + s->size = 0; + s->capacity = 0; +} + +/* + * count_additional_threads - counts the sibling threads that are not in works + */ +static size_t count_additional_threads(const struct tsync_works *works) +{ + struct task_struct *thread, *caller; + size_t n = 0; + + caller = current; + + guard(rcu)(); + + for_each_thread(caller, thread) { + /* Skip current, since it is initiating the sync. */ + if (thread == caller) + continue; + + /* Skip exited threads. */ + if (thread->flags & PF_EXITING) + continue; + + /* Skip threads that we have already seen. */ + if (tsync_works_contains_task(works, thread)) + continue; + + n++; + } + return n; +} + +/* + * schedule_task_work - adds task_work for all eligible sibling threads + * which have not been scheduled yet + * + * For each added task_work, atomically increments shared_ctx->num_preparing and + * shared_ctx->num_unfinished. + * + * Returns: + * true, if at least one eligible sibling thread was found + */ +static bool schedule_task_work(struct tsync_works *works, + struct tsync_shared_context *shared_ctx) +{ + int err; + struct task_struct *thread, *caller; + struct tsync_work *ctx; + bool found_more_threads = false; + + caller = current; + + guard(rcu)(); + + for_each_thread(caller, thread) { + /* Skip current, since it is initiating the sync. */ + if (thread == caller) + continue; + + /* Skip exited threads. */ + if (thread->flags & PF_EXITING) + continue; + + /* Skip threads that we already looked at. */ + if (tsync_works_contains_task(works, thread)) + continue; + + /* + * We found a sibling thread that is not doing its task_work yet, and + * which might spawn new threads before our task work runs, so we need + * at least one more round in the outer loop. + */ + found_more_threads = true; + + ctx = tsync_works_provide(works, thread); + if (!ctx) { + /* + * We ran out of preallocated contexts -- we need to try again with + * this thread at a later time! + * found_more_threads is already true at this point. + */ + break; + } + + ctx->shared_ctx = shared_ctx; + + atomic_inc(&shared_ctx->num_preparing); + atomic_inc(&shared_ctx->num_unfinished); + + init_task_work(&ctx->work, restrict_one_thread_callback); + err = task_work_add(thread, &ctx->work, TWA_SIGNAL); + if (err) { + /* + * task_work_add() only fails if the task is about to exit. We + * checked that earlier, but it can happen as a race. Resume + * without setting an error, as the task is probably gone in the + * next loop iteration. For consistency, remove the task from ctx + * so that it does not look like we handed it a task_work. + */ + put_task_struct(ctx->task); + ctx->task = NULL; + + atomic_dec(&shared_ctx->num_preparing); + atomic_dec(&shared_ctx->num_unfinished); + } + } + + return found_more_threads; +} + +/* + * cancel_tsync_works - cancel all task works where it is possible + * + * Task works can be canceled as long as they are still queued and have not + * started running. If they get canceled, we decrement + * shared_ctx->num_preparing and shared_ctx->num_unfished and mark the two + * completions if needed, as if the task was never scheduled. + */ +static void cancel_tsync_works(struct tsync_works *works, + struct tsync_shared_context *shared_ctx) +{ + int i; + + for (i = 0; i < works->size; i++) { + if (!task_work_cancel(works->works[i]->task, + &works->works[i]->work)) + continue; + + /* After dequeueing, act as if the task work had executed. */ + + if (atomic_dec_return(&shared_ctx->num_preparing) == 0) + complete_all(&shared_ctx->all_prepared); + + if (atomic_dec_return(&shared_ctx->num_unfinished) == 0) + complete_all(&shared_ctx->all_finished); + } +} + +/* + * restrict_sibling_threads - enables a Landlock policy for all sibling threads + */ +int landlock_restrict_sibling_threads(const struct cred *old_cred, + const struct cred *new_cred) +{ + int err; + struct tsync_shared_context shared_ctx; + struct tsync_works works = {}; + size_t newly_discovered_threads; + bool found_more_threads; + + atomic_set(&shared_ctx.preparation_error, 0); + init_completion(&shared_ctx.all_prepared); + init_completion(&shared_ctx.ready_to_commit); + atomic_set(&shared_ctx.num_unfinished, 1); + init_completion(&shared_ctx.all_finished); + shared_ctx.old_cred = old_cred; + shared_ctx.new_cred = new_cred; + shared_ctx.set_no_new_privs = task_no_new_privs(current); + + /* + * We schedule a pseudo-signal task_work for each of the calling task's + * sibling threads. In the task work, each thread: + * + * 1) runs prepare_creds() and writes back the error to + * shared_ctx.preparation_error, if needed. + * + * 2) signals that it's done with prepare_creds() to the calling task. + * (completion "all_prepared"). + * + * 3) waits for the completion "ready_to_commit". This is sent by the + * calling task after ensuring that all sibling threads have done + * with the "preparation" stage. + * + * After this barrier is reached, it's safe to read + * shared_ctx.preparation_error. + * + * 4) reads shared_ctx.preparation_error and then either does commit_creds() + * or abort_creds(). + * + * 5) signals that it's done altogether (barrier synchronization + * "all_finished") + * + * Unlike seccomp, which modifies sibling tasks directly, we do not need to + * acquire the cred_guard_mutex and sighand->siglock: + * + * - As in our case, all threads are themselves exchanging their own struct + * cred through the credentials API, no locks are needed for that. + * - Our for_each_thread() loops are protected by RCU. + * - We do not acquire a lock to keep the list of sibling threads stable + * between our for_each_thread loops. If the list of available sibling + * threads changes between these for_each_thread loops, we make up for + * that by continuing to look for threads until they are all discovered + * and have entered their task_work, where they are unable to spawn new + * threads. + */ + do { + /* In RCU read-lock, count the threads we need. */ + newly_discovered_threads = count_additional_threads(&works); + + if (newly_discovered_threads == 0) + break; /* done */ + + err = tsync_works_grow_by(&works, newly_discovered_threads, + GFP_KERNEL_ACCOUNT); + if (err) { + atomic_set(&shared_ctx.preparation_error, err); + break; + } + + /* + * The "all_prepared" barrier is used locally to the loop body, this use + * of for_each_thread(). We can reset it on each loop iteration because + * all previous loop iterations are done with it already. + * + * num_preparing is initialized to 1 so that the counter can not go to 0 + * and mark the completion as done before all task works are registered. + * We decrement it at the end of the loop body. + */ + atomic_set(&shared_ctx.num_preparing, 1); + reinit_completion(&shared_ctx.all_prepared); + + /* + * In RCU read-lock, schedule task work on newly discovered sibling + * tasks. + */ + found_more_threads = schedule_task_work(&works, &shared_ctx); + + /* + * Decrement num_preparing for current, to undo that we initialized it + * to 1 a few lines above. + */ + if (atomic_dec_return(&shared_ctx.num_preparing) > 0) { + if (wait_for_completion_interruptible( + &shared_ctx.all_prepared)) { + /* In case of interruption, we need to retry the system call. */ + atomic_set(&shared_ctx.preparation_error, + -ERESTARTNOINTR); + + /* + * Cancel task works for tasks that did not start running yet, + * and decrement all_prepared and num_unfinished accordingly. + */ + cancel_tsync_works(&works, &shared_ctx); + + /* + * The remaining task works have started running, so waiting for + * their completion will finish. + */ + wait_for_completion(&shared_ctx.all_prepared); + } + } + } while (found_more_threads && + !atomic_read(&shared_ctx.preparation_error)); + + /* + * We now have all sibling threads blocking and in "prepared" state in the + * task work. Ask all threads to commit. + */ + complete_all(&shared_ctx.ready_to_commit); + + /* + * Decrement num_unfinished for current, to undo that we initialized it to 1 + * at the beginning. + */ + if (atomic_dec_return(&shared_ctx.num_unfinished) > 0) + wait_for_completion(&shared_ctx.all_finished); + + tsync_works_release(&works); + + return atomic_read(&shared_ctx.preparation_error); +} diff --git a/security/landlock/tsync.h b/security/landlock/tsync.h new file mode 100644 index 000000000000..ef86bb61c2f6 --- /dev/null +++ b/security/landlock/tsync.h @@ -0,0 +1,16 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Landlock - Cross-thread ruleset enforcement + * + * Copyright © 2025 Google LLC + */ + +#ifndef _SECURITY_LANDLOCK_TSYNC_H +#define _SECURITY_LANDLOCK_TSYNC_H + +#include + +int landlock_restrict_sibling_threads(const struct cred *old_cred, + const struct cred *new_cred); + +#endif /* _SECURITY_LANDLOCK_TSYNC_H */ diff --git a/tools/testing/selftests/landlock/base_test.c b/tools/testing/selftests/landlock/base_test.c index 7b69002239d7..fdbb672009ac 100644 --- a/tools/testing/selftests/landlock/base_test.c +++ b/tools/testing/selftests/landlock/base_test.c @@ -76,7 +76,7 @@ TEST(abi_version) const struct landlock_ruleset_attr ruleset_attr = { .handled_access_fs = LANDLOCK_ACCESS_FS_READ_FILE, }; - ASSERT_EQ(7, landlock_create_ruleset(NULL, 0, + ASSERT_EQ(8, landlock_create_ruleset(NULL, 0, LANDLOCK_CREATE_RULESET_VERSION)); ASSERT_EQ(-1, landlock_create_ruleset(&ruleset_attr, 0, @@ -306,7 +306,7 @@ TEST(restrict_self_fd_flags) TEST(restrict_self_flags) { - const __u32 last_flag = LANDLOCK_RESTRICT_SELF_LOG_SUBDOMAINS_OFF; + const __u32 last_flag = LANDLOCK_RESTRICT_SELF_TSYNC; /* Tests invalid flag combinations. */ -- cgit v1.2.3 From bbb6f53e905ca119f99ccab8496f8921d9db9c50 Mon Sep 17 00:00:00 2001 From: Matthieu Buffet Date: Fri, 12 Dec 2025 17:36:57 +0100 Subject: landlock: Minor reword of docs for TCP access rights MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Move ABI requirement next to each access right to prepare adding more access rights; - Mention the possibility to remove the random component of a socket's ephemeral port choice within the netns-wide ephemeral port range, since it allows choosing the "random" ephemeral port. Signed-off-by: Matthieu Buffet Link: https://lore.kernel.org/r/20251212163704.142301-2-matthieu@buffet.re Signed-off-by: Mickaël Salaün --- include/uapi/linux/landlock.h | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/landlock.h b/include/uapi/linux/landlock.h index d5081ab4e5ef..f88fa1f68b77 100644 --- a/include/uapi/linux/landlock.h +++ b/include/uapi/linux/landlock.h @@ -195,11 +195,13 @@ struct landlock_net_port_attr { * It should be noted that port 0 passed to :manpage:`bind(2)` will bind * to an available port from the ephemeral port range. This can be * configured with the ``/proc/sys/net/ipv4/ip_local_port_range`` sysctl - * (also used for IPv6). + * (also used for IPv6), and within that range, on a per-socket basis + * with ``setsockopt(IP_LOCAL_PORT_RANGE)``. * - * A Landlock rule with port 0 and the ``LANDLOCK_ACCESS_NET_BIND_TCP`` + * A Landlock rule with port 0 and the %LANDLOCK_ACCESS_NET_BIND_TCP * right means that requesting to bind on port 0 is allowed and it will - * automatically translate to binding on the related port range. + * automatically translate to binding on a kernel-assigned ephemeral + * port. */ __u64 port; }; @@ -342,13 +344,12 @@ struct landlock_net_port_attr { * These flags enable to restrict a sandboxed process to a set of network * actions. * - * This is supported since Landlock ABI version 4. - * * The following access rights apply to TCP port numbers: * - * - %LANDLOCK_ACCESS_NET_BIND_TCP: Bind a TCP socket to a local port. - * - %LANDLOCK_ACCESS_NET_CONNECT_TCP: Connect an active TCP socket to - * a remote port. + * - %LANDLOCK_ACCESS_NET_BIND_TCP: Bind TCP sockets to the given local + * port. Support added in Landlock ABI version 4. + * - %LANDLOCK_ACCESS_NET_CONNECT_TCP: Connect TCP sockets to the given + * remote port. Support added in Landlock ABI version 4. */ /* clang-format off */ #define LANDLOCK_ACCESS_NET_BIND_TCP (1ULL << 0) -- cgit v1.2.3 From 136f1e168f4941021565f8c10ff4bb81b1f13f2c Mon Sep 17 00:00:00 2001 From: "Matthieu Baerts (NGI0)" Date: Thu, 5 Feb 2026 18:34:23 +0100 Subject: mptcp: fix kdoc warnings The following warnings were visible: $ ./scripts/kernel-doc -Wall -none \ net/mptcp/ include/net/mptcp.h include/uapi/linux/mptcp*.h \ include/trace/events/mptcp.h Warning: net/mptcp/token.c:108 No description found for return value of 'mptcp_token_new_request' Warning: net/mptcp/token.c:151 No description found for return value of 'mptcp_token_new_connect' Warning: net/mptcp/token.c:246 No description found for return value of 'mptcp_token_get_sock' Warning: net/mptcp/token.c:298 No description found for return value of 'mptcp_token_iter_next' Warning: net/mptcp/protocol.c:4431 No description found for return value of 'mptcp_splice_read' Warning: include/uapi/linux/mptcp_pm.h:13 missing initial short description on line: * enum mptcp_event_type Address all of them: either by using the 'Return:' keyword, or by adding a missing initial short description. The MPTCP CI will soon report issues with kdoc to avoid introducing new issues and being flagged by the Netdev CI. Reviewed-by: Geliang Tang Reviewed-by: Randy Dunlap Signed-off-by: Matthieu Baerts (NGI0) Link: https://patch.msgid.link/20260205-net-mptcp-misc-fixes-6-19-rc8-v2-3-c2720ce75c34@kernel.org Signed-off-by: Jakub Kicinski --- Documentation/netlink/specs/mptcp_pm.yaml | 1 + include/uapi/linux/mptcp_pm.h | 2 +- net/mptcp/token.c | 16 +++++++++------- 3 files changed, 11 insertions(+), 8 deletions(-) (limited to 'include/uapi/linux') diff --git a/Documentation/netlink/specs/mptcp_pm.yaml b/Documentation/netlink/specs/mptcp_pm.yaml index ba30a40b9dbf..39f3facc38e5 100644 --- a/Documentation/netlink/specs/mptcp_pm.yaml +++ b/Documentation/netlink/specs/mptcp_pm.yaml @@ -15,6 +15,7 @@ definitions: type: enum name: event-type enum-name: mptcp-event-type + doc: Netlink MPTCP event types name-prefix: mptcp-event- entries: - diff --git a/include/uapi/linux/mptcp_pm.h b/include/uapi/linux/mptcp_pm.h index c97d060ee90b..fe9863d75350 100644 --- a/include/uapi/linux/mptcp_pm.h +++ b/include/uapi/linux/mptcp_pm.h @@ -11,7 +11,7 @@ #define MPTCP_PM_VER 1 /** - * enum mptcp_event_type + * enum mptcp_event_type - Netlink MPTCP event types * @MPTCP_EVENT_UNSPEC: unused event * @MPTCP_EVENT_CREATED: A new MPTCP connection has been created. It is the * good time to allocate memory and send ADD_ADDR if needed. Depending on the diff --git a/net/mptcp/token.c b/net/mptcp/token.c index 5bb924534387..f1a50f367add 100644 --- a/net/mptcp/token.c +++ b/net/mptcp/token.c @@ -103,7 +103,7 @@ static void mptcp_crypto_key_gen_sha(u64 *key, u32 *token, u64 *idsn) * It creates a unique token to identify the new mptcp connection, * a secret local key and the initial data sequence number (idsn). * - * Returns 0 on success. + * Return: 0 on success. */ int mptcp_token_new_request(struct request_sock *req) { @@ -146,7 +146,7 @@ int mptcp_token_new_request(struct request_sock *req) * the computed token at a later time, this is needed to process * join requests. * - * returns 0 on success. + * Return: 0 on success. */ int mptcp_token_new_connect(struct sock *ssk) { @@ -241,7 +241,7 @@ found: * This function returns the mptcp connection structure with the given token. * A reference count on the mptcp socket returned is taken. * - * returns NULL if no connection with the given token value exists. + * Return: NULL if no connection with the given token value exists. */ struct mptcp_sock *mptcp_token_get_sock(struct net *net, u32 token) { @@ -288,11 +288,13 @@ EXPORT_SYMBOL_GPL(mptcp_token_get_sock); * @s_slot: start slot number * @s_num: start number inside the given lock * - * This function returns the first mptcp connection structure found inside the - * token container starting from the specified position, or NULL. + * Description: + * On successful iteration, the iterator is moved to the next position and a + * reference to the returned socket is acquired. * - * On successful iteration, the iterator is moved to the next position and - * a reference to the returned socket is acquired. + * Return: + * The first mptcp connection structure found inside the token container + * starting from the specified position, or NULL. */ struct mptcp_sock *mptcp_token_iter_next(const struct net *net, long *s_slot, long *s_num) -- cgit v1.2.3 From 90079798f1d748e97c74e23736491543577b8aee Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Mon, 2 Feb 2026 10:59:00 +0100 Subject: delayacct: fix uapi timespec64 definition The custom definition of 'struct timespec64' is incompatible with both the kernel's internal definition and the glibc type, at least on big-endian targets that have the tv_nsec field in a different place, and the definition clashes with any userspace that also defines a timespec64 structure. Running the header check with -Wpadding enabled produces this output that warns about the incorrect padding: usr/include/linux/taskstats.h:25:1: error: padding struct size to alignment boundary with 4 bytes [-Werror=padded] Remove the hack and instead use the regular __kernel_timespec type that is meant to be used in uapi definitions. Link: https://lkml.kernel.org/r/20260202095906.1344100-1-arnd@kernel.org Fixes: 29b63f6eff0e ("delayacct: add timestamp of delay max") Signed-off-by: Arnd Bergmann Cc: Fan Yu Cc: Jonathan Corbet Cc: xu xin Cc: Yang Yang Cc: Balbir Singh Cc: Jiang Kun Signed-off-by: Andrew Morton --- include/uapi/linux/taskstats.h | 27 +++++++++------------------ kernel/delayacct.c | 6 ++++-- 2 files changed, 13 insertions(+), 20 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/taskstats.h b/include/uapi/linux/taskstats.h index 1b31e8e14d2f..3ae25f3ce067 100644 --- a/include/uapi/linux/taskstats.h +++ b/include/uapi/linux/taskstats.h @@ -18,16 +18,7 @@ #define _LINUX_TASKSTATS_H #include -#ifdef __KERNEL__ -#include -#else -#ifndef _LINUX_TIME64_H -struct timespec64 { - __s64 tv_sec; /* seconds */ - long tv_nsec; /* nanoseconds */ -}; -#endif -#endif +#include /* Format for per-task data returned to userland when * - a task exits @@ -242,14 +233,14 @@ struct taskstats { __u64 irq_delay_min; /*v17: delay max timestamp record*/ - struct timespec64 cpu_delay_max_ts; - struct timespec64 blkio_delay_max_ts; - struct timespec64 swapin_delay_max_ts; - struct timespec64 freepages_delay_max_ts; - struct timespec64 thrashing_delay_max_ts; - struct timespec64 compact_delay_max_ts; - struct timespec64 wpcopy_delay_max_ts; - struct timespec64 irq_delay_max_ts; + struct __kernel_timespec cpu_delay_max_ts; + struct __kernel_timespec blkio_delay_max_ts; + struct __kernel_timespec swapin_delay_max_ts; + struct __kernel_timespec freepages_delay_max_ts; + struct __kernel_timespec thrashing_delay_max_ts; + struct __kernel_timespec compact_delay_max_ts; + struct __kernel_timespec wpcopy_delay_max_ts; + struct __kernel_timespec irq_delay_max_ts; }; diff --git a/kernel/delayacct.c b/kernel/delayacct.c index d58ffc63bcba..2e55c493c98b 100644 --- a/kernel/delayacct.c +++ b/kernel/delayacct.c @@ -18,7 +18,8 @@ do { \ d->type##_delay_max = tsk->delays->type##_delay_max; \ d->type##_delay_min = tsk->delays->type##_delay_min; \ - d->type##_delay_max_ts = tsk->delays->type##_delay_max_ts; \ + d->type##_delay_max_ts.tv_sec = tsk->delays->type##_delay_max_ts.tv_sec; \ + d->type##_delay_max_ts.tv_nsec = tsk->delays->type##_delay_max_ts.tv_nsec; \ tmp = d->type##_delay_total + tsk->delays->type##_delay; \ d->type##_delay_total = (tmp < d->type##_delay_total) ? 0 : tmp; \ d->type##_count += tsk->delays->type##_count; \ @@ -175,7 +176,8 @@ int delayacct_add_tsk(struct taskstats *d, struct task_struct *tsk) d->cpu_delay_max = tsk->sched_info.max_run_delay; d->cpu_delay_min = tsk->sched_info.min_run_delay; - d->cpu_delay_max_ts = tsk->sched_info.max_run_delay_ts; + d->cpu_delay_max_ts.tv_sec = tsk->sched_info.max_run_delay_ts.tv_sec; + d->cpu_delay_max_ts.tv_nsec = tsk->sched_info.max_run_delay_ts.tv_nsec; tmp = (s64)d->cpu_delay_total + t2; d->cpu_delay_total = (tmp < (s64)d->cpu_delay_total) ? 0 : tmp; tmp = (s64)d->cpu_run_virtual_total + t3; -- cgit v1.2.3 From ebcff9dacaf2c1418f8bc927388186d7d3674603 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Mon, 2 Feb 2026 23:48:07 +0100 Subject: vduse: avoid adding implicit padding The vduse_iova_range_v2 and vduse_iotlb_entry_v2 structures are both defined in a way that adds implicit padding and is incompatible between i386 and x86_64 userspace because of the different structure alignment requirements. Building the header with -Wpadded shows these new warnings: vduse.h:305:1: error: padding struct size to alignment boundary with 4 bytes [-Werror=padded] vduse.h:374:1: error: padding struct size to alignment boundary with 4 bytes [-Werror=padded] Change the amount of padding in these two structures to align them to 64 bit words and avoid those problems. Since the v1 vduse_iotlb_entry already has an inconsistent size, do not attempt to reuse the structure but rather list the members indiviudally, with a fixed amount of padding. Fixes: 079212f6877e ("vduse: add vq group asid support") Signed-off-by: Arnd Bergmann Signed-off-by: Michael S. Tsirkin Message-Id: <20260202224835.559538-1-arnd@kernel.org> --- drivers/vdpa/vdpa_user/vduse_dev.c | 40 +++++++++++++------------------------- include/uapi/linux/vduse.h | 9 +++++++-- 2 files changed, 21 insertions(+), 28 deletions(-) (limited to 'include/uapi/linux') diff --git a/drivers/vdpa/vdpa_user/vduse_dev.c b/drivers/vdpa/vdpa_user/vduse_dev.c index 73d1d517dc6c..405d59610f76 100644 --- a/drivers/vdpa/vdpa_user/vduse_dev.c +++ b/drivers/vdpa/vdpa_user/vduse_dev.c @@ -1301,7 +1301,7 @@ static int vduse_dev_iotlb_entry(struct vduse_dev *dev, int r = -EINVAL; struct vhost_iotlb_map *map; - if (entry->v1.start > entry->v1.last || entry->asid >= dev->nas) + if (entry->start > entry->last || entry->asid >= dev->nas) return -EINVAL; asid = array_index_nospec(entry->asid, dev->nas); @@ -1312,18 +1312,18 @@ static int vduse_dev_iotlb_entry(struct vduse_dev *dev, spin_lock(&dev->as[asid].domain->iotlb_lock); map = vhost_iotlb_itree_first(dev->as[asid].domain->iotlb, - entry->v1.start, entry->v1.last); + entry->start, entry->last); if (map) { if (f) { const struct vdpa_map_file *map_file; map_file = (struct vdpa_map_file *)map->opaque; - entry->v1.offset = map_file->offset; + entry->offset = map_file->offset; *f = get_file(map_file->file); } - entry->v1.start = map->start; - entry->v1.last = map->last; - entry->v1.perm = map->perm; + entry->start = map->start; + entry->last = map->last; + entry->perm = map->perm; if (capability) { *capability = 0; @@ -1363,14 +1363,8 @@ static long vduse_dev_ioctl(struct file *file, unsigned int cmd, break; ret = -EFAULT; - if (cmd == VDUSE_IOTLB_GET_FD2) { - if (copy_from_user(&entry, argp, sizeof(entry))) - break; - } else { - if (copy_from_user(&entry.v1, argp, - sizeof(entry.v1))) - break; - } + if (copy_from_user(&entry, argp, _IOC_SIZE(cmd))) + break; ret = -EINVAL; if (!is_mem_zero((const char *)entry.reserved, @@ -1385,19 +1379,13 @@ static long vduse_dev_ioctl(struct file *file, unsigned int cmd, if (!f) break; - if (cmd == VDUSE_IOTLB_GET_FD2) - ret = copy_to_user(argp, &entry, - sizeof(entry)); - else - ret = copy_to_user(argp, &entry.v1, - sizeof(entry.v1)); - + ret = copy_to_user(argp, &entry, _IOC_SIZE(cmd)); if (ret) { ret = -EFAULT; fput(f); break; } - ret = receive_fd(f, NULL, perm_to_file_flags(entry.v1.perm)); + ret = receive_fd(f, NULL, perm_to_file_flags(entry.perm)); fput(f); break; } @@ -1603,16 +1591,16 @@ static long vduse_dev_ioctl(struct file *file, unsigned int cmd, } else if (info.asid >= dev->nas) break; - entry.v1.start = info.start; - entry.v1.last = info.last; + entry.start = info.start; + entry.last = info.last; entry.asid = info.asid; ret = vduse_dev_iotlb_entry(dev, &entry, NULL, &info.capability); if (ret < 0) break; - info.start = entry.v1.start; - info.last = entry.v1.last; + info.start = entry.start; + info.last = entry.last; info.asid = entry.asid; ret = -EFAULT; diff --git a/include/uapi/linux/vduse.h b/include/uapi/linux/vduse.h index 68b4287f9fac..361eea511c21 100644 --- a/include/uapi/linux/vduse.h +++ b/include/uapi/linux/vduse.h @@ -293,9 +293,13 @@ struct vduse_iova_info { * Structure used by VDUSE_IOTLB_GET_FD2 ioctl to find an overlapped IOVA region. */ struct vduse_iotlb_entry_v2 { - struct vduse_iotlb_entry v1; + __u64 offset; + __u64 start; + __u64 last; + __u8 perm; + __u8 padding[7]; __u32 asid; - __u32 reserved[12]; + __u32 reserved[11]; }; /* @@ -365,6 +369,7 @@ struct vduse_iova_range_v2 { __u64 start; __u64 last; __u32 asid; + __u32 padding; }; /** -- cgit v1.2.3 From c29214677a9fc1a3a4ee65e189afeb5fd10d676f Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Sun, 15 Feb 2026 21:34:28 +0000 Subject: io_uring/query: return support for custom rx page size Add an ability to query if the zcrx rx page size setting is available. Note, even when the API is supported by io_uring, the registration can still get rejected for various reasons, e.g. when the NIC or the driver doesn't support it, when the particular specified size is unsupported, when the memory area doesn't satisfy all requirements, etc. Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- include/uapi/linux/io_uring.h | 8 ++++++++ include/uapi/linux/io_uring/query.h | 3 ++- io_uring/query.c | 2 +- 3 files changed, 11 insertions(+), 2 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h index da5156954731..c462bdf3c42c 100644 --- a/include/uapi/linux/io_uring.h +++ b/include/uapi/linux/io_uring.h @@ -1090,6 +1090,14 @@ enum zcrx_reg_flags { ZCRX_REG_IMPORT = 1, }; +enum zcrx_features { + /* + * The user can ask for the desired rx page size by passing the + * value in struct io_uring_zcrx_ifq_reg::rx_buf_len. + */ + ZCRX_FEATURE_RX_PAGE_SIZE = 1 << 0, +}; + /* * Argument for IORING_REGISTER_ZCRX_IFQ */ diff --git a/include/uapi/linux/io_uring/query.h b/include/uapi/linux/io_uring/query.h index 2456e6c5ebb5..0b6248175e26 100644 --- a/include/uapi/linux/io_uring/query.h +++ b/include/uapi/linux/io_uring/query.h @@ -50,7 +50,8 @@ struct io_uring_query_zcrx { __u64 area_flags; /* The number of supported ZCRX_CTRL_* opcodes */ __u32 nr_ctrl_opcodes; - __u32 __resv1; + /* Bitmask of ZCRX_FEATURE_* indicating which features are available */ + __u32 features; /* The refill ring header size */ __u32 rq_hdr_size; /* The alignment for the header */ diff --git a/io_uring/query.c b/io_uring/query.c index abdd6f3e1223..63cc30c9803d 100644 --- a/io_uring/query.c +++ b/io_uring/query.c @@ -39,7 +39,7 @@ static ssize_t io_query_zcrx(union io_query_data *data) e->nr_ctrl_opcodes = __ZCRX_CTRL_LAST; e->rq_hdr_size = sizeof(struct io_uring); e->rq_hdr_alignment = L1_CACHE_BYTES; - e->__resv1 = 0; + e->features = ZCRX_FEATURE_RX_PAGE_SIZE; e->__resv2 = 0; return sizeof(*e); } -- cgit v1.2.3 From 6b34f8edf8b807b7f87901623aa52dfa1b29ef93 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Sun, 15 Feb 2026 21:38:09 +0000 Subject: io_uring/query: add query.h copyright notice Add a copyright notice to io_uring's query uapi header. Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- include/uapi/linux/io_uring/query.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/io_uring/query.h b/include/uapi/linux/io_uring/query.h index 0b6248175e26..95500759cc13 100644 --- a/include/uapi/linux/io_uring/query.h +++ b/include/uapi/linux/io_uring/query.h @@ -1,6 +1,9 @@ /* SPDX-License-Identifier: (GPL-2.0 WITH Linux-syscall-note) OR MIT */ /* * Header file for the io_uring query interface. + * + * Copyright (C) 2026 Pavel Begunkov + * Copyright (C) Meta Platforms, Inc. */ #ifndef LINUX_IO_URING_QUERY_H #define LINUX_IO_URING_QUERY_H -- cgit v1.2.3 From be3573124e630736d2d39650b12f5ef220b47ac1 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 10 Feb 2026 10:00:44 -0700 Subject: io_uring/bpf_filter: pass in expected filter payload size It's quite possible that opcodes that have payloads attached to them, like IORING_OP_OPENAT/OPENAT2 or IORING_OP_SOCKET, that these paylods can change over time. For example, on the openat/openat2 side, the struct open_how argument is extensible, and could be extended in the future to allow further arguments to be passed in. Allow registration of a cBPF filter to give the size of the filter as seen by userspace. If that filter is for an opcode that takes extra payload data, allow it if the application payload expectation is the same size than the kernels. If that is the case, the kernel supports filtering on the payload that the application expects. If the size differs, the behavior depends on the IO_URING_BPF_FILTER_SZ_STRICT flag: 1) If IO_URING_BPF_FILTER_SZ_STRICT is set and the size expectation differs, fail the attempt to load the filter. 2) If IO_URING_BPF_FILTER_SZ_STRICT isn't set, allow the filter if the userspace pdu size is smaller than what the kernel offers. 3) Regardless if IO_URING_BPF_FILTER_SZ_STRICT, fail loading the filter if the userspace pdu size is bigger than what the kernel supports. An attempt to load a filter due to sizing will error with -EMSGSIZE. For that error, the registration struct will have filter->pdu_size populated with the pdu size that the kernel uses. Reported-by: Christian Brauner Signed-off-by: Jens Axboe --- include/uapi/linux/io_uring/bpf_filter.h | 8 +++- io_uring/bpf_filter.c | 65 ++++++++++++++++++++++++-------- 2 files changed, 56 insertions(+), 17 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/io_uring/bpf_filter.h b/include/uapi/linux/io_uring/bpf_filter.h index 220351b81bc0..1b461d792a7b 100644 --- a/include/uapi/linux/io_uring/bpf_filter.h +++ b/include/uapi/linux/io_uring/bpf_filter.h @@ -35,13 +35,19 @@ enum { * If set, any currently unset opcode will have a deny filter attached */ IO_URING_BPF_FILTER_DENY_REST = 1, + /* + * If set, if kernel and application don't agree on pdu_size for + * the given opcode, fail the registration of the filter. + */ + IO_URING_BPF_FILTER_SZ_STRICT = 2, }; struct io_uring_bpf_filter { __u32 opcode; /* io_uring opcode to filter */ __u32 flags; __u32 filter_len; /* number of BPF instructions */ - __u32 resv; + __u8 pdu_size; /* expected pdu size for opcode */ + __u8 resv[3]; __u64 filter_ptr; /* pointer to BPF filter */ __u64 resv2[5]; }; diff --git a/io_uring/bpf_filter.c b/io_uring/bpf_filter.c index 8ac7d06de122..28a23e92ee81 100644 --- a/io_uring/bpf_filter.c +++ b/io_uring/bpf_filter.c @@ -308,36 +308,69 @@ err: return ERR_PTR(-EBUSY); } -#define IO_URING_BPF_FILTER_FLAGS IO_URING_BPF_FILTER_DENY_REST +#define IO_URING_BPF_FILTER_FLAGS (IO_URING_BPF_FILTER_DENY_REST | \ + IO_URING_BPF_FILTER_SZ_STRICT) -int io_register_bpf_filter(struct io_restriction *res, - struct io_uring_bpf __user *arg) +static int io_bpf_filter_import(struct io_uring_bpf *reg, + struct io_uring_bpf __user *arg) { - struct io_bpf_filters *filters, *old_filters = NULL; - struct io_bpf_filter *filter, *old_filter; - struct io_uring_bpf reg; - struct bpf_prog *prog; - struct sock_fprog fprog; + const struct io_issue_def *def; int ret; - if (copy_from_user(®, arg, sizeof(reg))) + if (copy_from_user(reg, arg, sizeof(*reg))) return -EFAULT; - if (reg.cmd_type != IO_URING_BPF_CMD_FILTER) + if (reg->cmd_type != IO_URING_BPF_CMD_FILTER) return -EINVAL; - if (reg.cmd_flags || reg.resv) + if (reg->cmd_flags || reg->resv) return -EINVAL; - if (reg.filter.opcode >= IORING_OP_LAST) + if (reg->filter.opcode >= IORING_OP_LAST) return -EINVAL; - if (reg.filter.flags & ~IO_URING_BPF_FILTER_FLAGS) + if (reg->filter.flags & ~IO_URING_BPF_FILTER_FLAGS) return -EINVAL; - if (reg.filter.resv) + if (!mem_is_zero(reg->filter.resv, sizeof(reg->filter.resv))) return -EINVAL; - if (!mem_is_zero(reg.filter.resv2, sizeof(reg.filter.resv2))) + if (!mem_is_zero(reg->filter.resv2, sizeof(reg->filter.resv2))) return -EINVAL; - if (!reg.filter.filter_len || reg.filter.filter_len > BPF_MAXINSNS) + if (!reg->filter.filter_len || reg->filter.filter_len > BPF_MAXINSNS) return -EINVAL; + /* Verify filter size */ + def = &io_issue_defs[array_index_nospec(reg->filter.opcode, IORING_OP_LAST)]; + + /* same size, always ok */ + ret = 0; + if (reg->filter.pdu_size == def->filter_pdu_size) + ; + /* size differs, fail in strict mode */ + else if (reg->filter.flags & IO_URING_BPF_FILTER_SZ_STRICT) + ret = -EMSGSIZE; + /* userspace filter is bigger, always disallow */ + else if (reg->filter.pdu_size > def->filter_pdu_size) + ret = -EMSGSIZE; + + /* copy back kernel filter size */ + reg->filter.pdu_size = def->filter_pdu_size; + if (copy_to_user(&arg->filter, ®->filter, sizeof(reg->filter))) + return -EFAULT; + + return ret; +} + +int io_register_bpf_filter(struct io_restriction *res, + struct io_uring_bpf __user *arg) +{ + struct io_bpf_filters *filters, *old_filters = NULL; + struct io_bpf_filter *filter, *old_filter; + struct io_uring_bpf reg; + struct bpf_prog *prog; + struct sock_fprog fprog; + int ret; + + ret = io_bpf_filter_import(®, arg); + if (ret) + return ret; + fprog.len = reg.filter.filter_len; fprog.filter = u64_to_user_ptr(reg.filter.filter_ptr); -- cgit v1.2.3 From 4edd4ba71ce0df015303dba75ea9d20d1a217546 Mon Sep 17 00:00:00 2001 From: Phil Sutter Date: Sat, 14 Feb 2026 15:54:06 +0100 Subject: include: uapi: netfilter_bridge.h: Cover for musl libc Musl defines its own struct ethhdr and thus defines __UAPI_DEF_ETHHDR to zero. To avoid struct redefinition errors, user space is therefore supposed to include netinet/if_ether.h before (or instead of) linux/if_ether.h. To relieve them from this burden, include the libc header here if not building for kernel space. Reported-by: Alyssa Ross Suggested-by: Florian Westphal Signed-off-by: Phil Sutter Signed-off-by: Florian Westphal --- include/uapi/linux/netfilter_bridge.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/netfilter_bridge.h b/include/uapi/linux/netfilter_bridge.h index f6e8d1e05c97..758de72b2764 100644 --- a/include/uapi/linux/netfilter_bridge.h +++ b/include/uapi/linux/netfilter_bridge.h @@ -5,6 +5,10 @@ /* bridge-specific defines for netfilter. */ +#ifndef __KERNEL__ +#include /* for __UAPI_DEF_ETHHDR if defined */ +#endif + #include #include #include -- cgit v1.2.3 From a284dbc96a47891a7a595a1c81b1e2da4d309cf6 Mon Sep 17 00:00:00 2001 From: Muminul Islam Date: Wed, 18 Feb 2026 14:47:59 +0000 Subject: mshv: Add nested virtualization creation flag Introduce HV_PARTITION_CREATION_FLAG_NESTED_VIRTUALIZATION_CAPABLE to indicate support for nested virtualization during partition creation. This enables clearer configuration and capability checks for nested virtualization scenarios. Signed-off-by: Stanislav Kinsburskii Signed-off-by: Muminul Islam Signed-off-by: Wei Liu --- drivers/hv/mshv_root_main.c | 2 ++ include/hyperv/hvhdk.h | 1 + include/uapi/linux/mshv.h | 1 + 3 files changed, 4 insertions(+) (limited to 'include/uapi/linux') diff --git a/drivers/hv/mshv_root_main.c b/drivers/hv/mshv_root_main.c index e5d94398528e..e490f8e5a8a5 100644 --- a/drivers/hv/mshv_root_main.c +++ b/drivers/hv/mshv_root_main.c @@ -1947,6 +1947,8 @@ static long mshv_ioctl_process_pt_flags(void __user *user_arg, u64 *pt_flags, *pt_flags |= HV_PARTITION_CREATION_FLAG_X2APIC_CAPABLE; if (args.pt_flags & BIT_ULL(MSHV_PT_BIT_GPA_SUPER_PAGES)) *pt_flags |= HV_PARTITION_CREATION_FLAG_GPA_SUPER_PAGES_ENABLED; + if (args.pt_flags & BIT(MSHV_PT_BIT_NESTED_VIRTUALIZATION)) + *pt_flags |= HV_PARTITION_CREATION_FLAG_NESTED_VIRTUALIZATION_CAPABLE; isol_props->as_uint64 = 0; diff --git a/include/hyperv/hvhdk.h b/include/hyperv/hvhdk.h index 79d1f16a850a..f139c7c5bb2d 100644 --- a/include/hyperv/hvhdk.h +++ b/include/hyperv/hvhdk.h @@ -335,6 +335,7 @@ union hv_partition_isolation_properties { #define HV_PARTITION_ISOLATION_HOST_TYPE_RESERVED 0x2 /* Note: Exo partition is enabled by default */ +#define HV_PARTITION_CREATION_FLAG_NESTED_VIRTUALIZATION_CAPABLE BIT(1) #define HV_PARTITION_CREATION_FLAG_GPA_SUPER_PAGES_ENABLED BIT(4) #define HV_PARTITION_CREATION_FLAG_EXO_PARTITION BIT(8) #define HV_PARTITION_CREATION_FLAG_LAPIC_ENABLED BIT(13) diff --git a/include/uapi/linux/mshv.h b/include/uapi/linux/mshv.h index dee3ece28ce5..7ef5dd67a232 100644 --- a/include/uapi/linux/mshv.h +++ b/include/uapi/linux/mshv.h @@ -27,6 +27,7 @@ enum { MSHV_PT_BIT_X2APIC, MSHV_PT_BIT_GPA_SUPER_PAGES, MSHV_PT_BIT_CPU_AND_XSAVE_FEATURES, + MSHV_PT_BIT_NESTED_VIRTUALIZATION, MSHV_PT_BIT_COUNT, }; -- cgit v1.2.3 From 8927a108a7662eb83eb667bc0c5a0633397122b1 Mon Sep 17 00:00:00 2001 From: Anatol Belski Date: Wed, 18 Feb 2026 14:48:02 +0000 Subject: mshv: Add SMT_ENABLED_GUEST partition creation flag Add support for HV_PARTITION_CREATION_FLAG_SMT_ENABLED_GUEST to allow userspace VMMs to enable SMT for guest partitions. Expose this via new MSHV_PT_BIT_SMT_ENABLED_GUEST flag in the UAPI. Without this flag, the hypervisor schedules guest VPs incorrectly, causing SMT unusable. Signed-off-by: Anatol Belski Signed-off-by: Wei Liu --- drivers/hv/mshv_root_main.c | 2 ++ include/hyperv/hvhdk.h | 1 + include/uapi/linux/mshv.h | 1 + 3 files changed, 4 insertions(+) (limited to 'include/uapi/linux') diff --git a/drivers/hv/mshv_root_main.c b/drivers/hv/mshv_root_main.c index e490f8e5a8a5..192467a25f66 100644 --- a/drivers/hv/mshv_root_main.c +++ b/drivers/hv/mshv_root_main.c @@ -1949,6 +1949,8 @@ static long mshv_ioctl_process_pt_flags(void __user *user_arg, u64 *pt_flags, *pt_flags |= HV_PARTITION_CREATION_FLAG_GPA_SUPER_PAGES_ENABLED; if (args.pt_flags & BIT(MSHV_PT_BIT_NESTED_VIRTUALIZATION)) *pt_flags |= HV_PARTITION_CREATION_FLAG_NESTED_VIRTUALIZATION_CAPABLE; + if (args.pt_flags & BIT(MSHV_PT_BIT_SMT_ENABLED_GUEST)) + *pt_flags |= HV_PARTITION_CREATION_FLAG_SMT_ENABLED_GUEST; isol_props->as_uint64 = 0; diff --git a/include/hyperv/hvhdk.h b/include/hyperv/hvhdk.h index f139c7c5bb2d..245f3db53bf1 100644 --- a/include/hyperv/hvhdk.h +++ b/include/hyperv/hvhdk.h @@ -335,6 +335,7 @@ union hv_partition_isolation_properties { #define HV_PARTITION_ISOLATION_HOST_TYPE_RESERVED 0x2 /* Note: Exo partition is enabled by default */ +#define HV_PARTITION_CREATION_FLAG_SMT_ENABLED_GUEST BIT(0) #define HV_PARTITION_CREATION_FLAG_NESTED_VIRTUALIZATION_CAPABLE BIT(1) #define HV_PARTITION_CREATION_FLAG_GPA_SUPER_PAGES_ENABLED BIT(4) #define HV_PARTITION_CREATION_FLAG_EXO_PARTITION BIT(8) diff --git a/include/uapi/linux/mshv.h b/include/uapi/linux/mshv.h index 7ef5dd67a232..e0645a34b55b 100644 --- a/include/uapi/linux/mshv.h +++ b/include/uapi/linux/mshv.h @@ -28,6 +28,7 @@ enum { MSHV_PT_BIT_GPA_SUPER_PAGES, MSHV_PT_BIT_CPU_AND_XSAVE_FEATURES, MSHV_PT_BIT_NESTED_VIRTUALIZATION, + MSHV_PT_BIT_SMT_ENABLED_GUEST, MSHV_PT_BIT_COUNT, }; -- cgit v1.2.3